"""
Contains RCE metrics.
"""
import copy
from functools import partial
from typing import Union

from tml.metrics import aggregation

import torch
import torchmetrics


def _smooth(
  value: torch.Tensor, label_smoothing: Union[float, torch.Tensor]
) -> Union[float, torch.Tensor]:
  """
  Smooth given values.
  Args:
    value: Value to smooth.
    label_smoothing: smoothing constant.
  Returns: Smoothed values.
  """
  return value * (1.0 - label_smoothing) + 0.5 * label_smoothing


def _binary_cross_entropy_with_clipping(
  predictions: torch.Tensor,
  target: torch.Tensor,
  epsilon: Union[float, torch.Tensor],
  reduction: str = "none",
) -> torch.Tensor:
  """
  Clip Predictions and apply binary cross entropy.
  This is done to match the implementation in keras at
  https://github.com/keras-team/keras/blob/r2.9/keras/backend.py#L5294-L5300
  Args:
    predictions: Predicted probabilities.
    target: Ground truth.
    epsilon: Epsilon fuzz factor used to clip the predictions.
    reduction: The reduction method to use.

  Returns: Binary cross entropy on the clipped predictions.

  """
  predictions = torch.clamp(predictions, epsilon, 1.0 - epsilon)
  bce = -target * torch.log(predictions + epsilon)
  bce -= (1.0 - target) * torch.log(1.0 - predictions + epsilon)
  if reduction == "mean":
    return torch.mean(bce)
  return bce


class RCE(torchmetrics.Metric):
  """
  Compute the relative cross entropy (`RCE <http://go/rce>`_).

  RCE is metric used for models predicting probability of success (p), i.e. pCTR.
  RCE represents the binary `cross entropy <https://en.wikipedia.org/wiki/Cross_entropy>` of
  the model compared to a reference straw man model.

  Binary cross entropy is defined as:

  y = label; p = prediction;
  binary cross entropy(example) = - y * log(p) - (1-y) * log(1-p)

  Where y in {0, 1}

  Cross entropy of a model is defined as:

  CE(model) = average(binary cross entropy(example))

  Over all the examples we aggregate on.

  The straw man model is quite simple, it is a constant predictor, always predicting the average
  over the labels.

  RCE of a model is defined as:

  RCE(model) = 100 * (CE(reference model) - CE(model)) / CE(reference model)

  .. note:: Maximizing the likelihood is the same as minimizing the cross entropy or maximizing
            the RCE. Since cross entropy is the average minus likelihood for the binary case.

  .. note:: Binary cross entropy of an example is non negative, and equal to the
            `KL divergence <(https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
            #Properties>`
            since p is constant, and its entropy is equal to zero.

  .. note:: 0% RCE means as good as the straw man model.
            100% means always predicts exactly the label. Namely, cross entropy of the model is
                always zero. In practice 100% is impossible to achieve due to clipping.
            Negative RCE means that the model is doing worse than the straw man.
            This usually means an un-calibrated model, namely, the average prediction
            is "far" from the average label. Examining NRCE might help identifying if that is
            the case.

  .. note:: RCE is not a "ratio" in the statistical
            `level of measurement sense <https://en.wikipedia.org/wiki/Level_of_measurement>`.
            The higher the model's RCE is the harder it is to improve it by an extra point.

            For example:
            Let CE(model) = 0.5 CE(reference model), then the RCE(model) = 50.
            Now take a "twice as good" model:
            Let CE(better model) = 0.5 CE(model) = 0.25 CE(reference model),
            then the RCE(better model) = 75 and not 100.

  .. note:: In order to keep the log function stable, typically p is limited to
            lie in [CLAMP_EPSILON, 1-CLAMP_EPSILON],
            where CLAMP_EPSILON is some small constant like: 1e-7.
            Old implementation used 1e-5 clipping by default, current uses
            tf.keras.backend.epsilon()
            whose default is 1e-7.

  .. note:: Since the reference model prediction is constant (probability),
            CE(reference model) = H(average(label))

            Where H is the standard
            `entropy <https://en.wikipedia.org/wiki/Entropy_(information_theory)>` function.

  .. note:: Must have at least 1 positive and 1 negative sample accumulated,
            or RCE will come out as NaN.
  """

  def __init__(
    self, from_logits: bool = False, label_smoothing: float = 0, epsilon: float = 1e-7, **kwargs
  ):
    """
    Args:
      from_logits: whether or not predictions are logits or probabilities.
      label_smoothing: label smoothing constant.
      epsilon: Epsilon fuzz factor used on the predictions probabilities when from_logits is False.
      **kwargs: Additional parameters supported by all torchmetrics.Metric.
    """
    super().__init__(**kwargs)
    self.from_logits = from_logits
    self.label_smoothing = label_smoothing
    self.epsilon = epsilon
    self.kwargs = kwargs

    self.mean_label = aggregation.StableMean(**kwargs)
    self.binary_cross_entropy = aggregation.StableMean(**kwargs)

    if self.from_logits:
      self.bce_loss_fn = torch.nn.functional.binary_cross_entropy_with_logits
    else:
      self.bce_loss_fn = partial(_binary_cross_entropy_with_clipping, epsilon=self.epsilon)

    # Used to compute non-accumulated batch metric if `forward` or `__call__` functions are used.
    self.batch_metric = copy.deepcopy(self)

  def update(
    self, predictions: torch.Tensor, target: torch.Tensor, weight: float = 1.0
  ) -> torch.Tensor:
    """
    Update the current rce.
    Args:
      predictions: Predicted values.
      target: Ground truth. Should have same shape as predictions.
      weight: The weight to use for the predicted values. Shape should be broadcastable to that of
       predictions.
    """
    target = _smooth(target, self.label_smoothing)
    self.mean_label.update(target, weight)
    self.binary_cross_entropy.update(
      self.bce_loss_fn(predictions, target, reduction="none"), weight
    )

  def compute(self) -> torch.Tensor:
    """
    Compute and return the accumulated rce.
    """
    baseline_mean = self.mean_label.compute()

    baseline_ce = _binary_cross_entropy_with_clipping(
      baseline_mean, baseline_mean, reduction="mean", epsilon=self.epsilon
    )

    pred_ce = self.binary_cross_entropy.compute()

    rce = (1.0 - (pred_ce / baseline_ce)) * 100

    return rce

  def reset(self):
    """
    Reset the metric to its initial state.
    """
    super().reset()
    self.mean_label.reset()
    self.binary_cross_entropy.reset()

  def forward(self, *args, **kwargs):
    """
    Serves the dual purpose of both computing the metric on the current batch of inputs but also
        add the batch statistics to the overall accumulating metric state.
    Input arguments are the exact same as corresponding ``update`` method.
    The returned output is the exact same as the output of ``compute``.
    """
    self.update(*args, **kwargs)
    self.batch_metric.update(*args, **kwargs)
    batch_result = self.batch_metric.compute()
    self.batch_metric.reset()
    return batch_result


class NRCE(RCE):
  """
  Calculate the RCE of the normalizes model.
  Where the normalized model prediction average is normalized to the average label seen so far.
  Namely, the the normalized model prediction:

  normalized model prediction(example) = (model prediction(example) * average(label)) /
  average(model prediction)

  Where the average is over all previously seen examples.

  .. note:: average(normalized model prediction) = average(label)

  .. note:: NRCE can be misleading since it is oblivious to mis-calibrations.
            The common interpretation of NRCE is to measure how good your model could potentially
            perform if it was well calibrated.

  .. note:: A big gap between NRCE and RCE might indicate a badly calibrated model,

  """

  def __init__(
    self, from_logits: bool = False, label_smoothing: float = 0, epsilon: float = 1e-7, **kwargs
  ):
    """

    Args:
      from_logits: whether or not predictions are logits or probabilities.
      label_smoothing: label smoothing constant.
      epsilon: Epsilon fuzz factor used on the predictions probabilities when from_logits is False.
               It only used when computing the cross entropy but not when normalizing.
      **kwargs: Additional parameters supported by all torchmetrics.Metric.
    """
    super().__init__(from_logits=False, label_smoothing=0, epsilon=epsilon, **kwargs)
    self.nrce_from_logits = from_logits
    self.nrce_label_smoothing = label_smoothing
    self.mean_prediction = aggregation.StableMean()

    # Used to compute non-accumulated batch metric if `forward` or `__call__` functions are used.
    self.batch_metric = copy.deepcopy(self)

  def update(
    self,
    predictions: torch.Tensor,
    target: torch.Tensor,
    weight: Union[float, torch.Tensor] = 1.0,
  ):
    """
    Update the current nrce.
    Args:
      predictions: Predicted values.
      target: Ground truth. Should have same shape as predictions.
      weight: The weight to use for the predicted values. Shape should be broadcastable to that of
       predictions.
    """
    predictions = torch.sigmoid(predictions) if self.nrce_from_logits else predictions

    target = _smooth(target, self.nrce_label_smoothing)
    self.mean_label.update(target, weight)

    self.mean_prediction.update(predictions, weight)

    normalizer = self.mean_label.compute() / self.mean_prediction.compute()

    predictions = predictions * normalizer

    self.binary_cross_entropy.update(
      self.bce_loss_fn(predictions, target, reduction="none"), weight
    )

  def reset(self):
    """
    Reset the metric to its initial state.
    """
    super().reset()
    self.mean_prediction.reset()