"""Optimization configurations for models."""

import typing

import tml.core.config as base_config

import pydantic


class PiecewiseConstant(base_config.BaseConfig):
  """
    Configuration for a piecewise constant learning rate schedule.

    This configuration class allows you to specify a piecewise constant learning rate schedule
    by defining boundaries and corresponding learning rate values.

    Attributes:
        learning_rate_boundaries (List[int], optional): List of step boundaries at which
            the learning rate will change. If None, no boundaries are defined.
        learning_rate_values (List[float], optional): List of learning rate values
            corresponding to the boundaries. If None, no values are defined.

    Example:
        To configure a piecewise constant learning rate schedule, create an instance of this class
        and set the attributes accordingly. For example:

        ```python
        piecewise_lr = PiecewiseConstant(
            learning_rate_boundaries=[1000, 2000, 3000],
            learning_rate_values=[0.1, 0.05, 0.01, 0.001]
        )
        ```

    Note:
        The number of learning rate values should be one more than the number of boundaries.

    """
  learning_rate_boundaries: typing.List[int] = pydantic.Field(None)
  learning_rate_values: typing.List[float] = pydantic.Field(None)


class LinearRampToConstant(base_config.BaseConfig):
  """
    Configuration for a linear ramp-up to constant learning rate schedule.

    This configuration class allows you to specify a learning rate schedule that ramps up linearly
    from zero to a constant value over a specified number of steps.

    Attributes:
        learning_rate (float): The final constant learning rate.
        num_ramp_steps (PositiveInt): Number of steps to ramp up the learning rate from zero.

    Example:
        To configure a linear ramp-up to a constant learning rate, create an instance of this class
        and set the attributes accordingly. For example:

        ```python
        linear_ramp_lr = LinearRampToConstant(
            learning_rate=0.1,
            num_ramp_steps=1000
        )
        ```

    """
  learning_rate: float
  num_ramp_steps: pydantic.PositiveInt = pydantic.Field(
    description="Number of steps to ramp this up from zero."
  )


class LinearRampToCosine(base_config.BaseConfig):
  """
    Configuration for a linear ramp-up to cosine decay learning rate schedule.

    This configuration class allows you to specify a learning rate schedule that ramps up linearly
    from zero, then decays following a cosine schedule to a final constant learning rate.

    Attributes:
        learning_rate (float): The initial learning rate at the start of ramp-up.
        final_learning_rate (float): The final constant learning rate after decay.
        num_ramp_steps (PositiveInt): Number of steps to ramp up the learning rate from zero.
        final_num_steps (PositiveInt): Final number of steps where decay stops.

    Example:
        To configure a linear ramp-up to cosine decay learning rate, create an instance of this
        class and set the attributes accordingly. For example:

        ```python
        ramp_to_cosine_lr = LinearRampToCosine(
            learning_rate=0.01,
            final_learning_rate=0.001,
            num_ramp_steps=1000,
            final_num_steps=5000
        )
        ```

    """
  learning_rate: float
  final_learning_rate: float
  num_ramp_steps: pydantic.PositiveInt = pydantic.Field(
    description="Number of steps to ramp this up from zero."
  )
  final_num_steps: pydantic.PositiveInt = pydantic.Field(
    description="Final number of steps where decay stops."
  )


class LearningRate(base_config.BaseConfig):
  """
    Learning rate configuration for training.

    This configuration class allows you to specify different learning rate schedules
    for your training process.

    Attributes:
        constant (float, optional): Constant learning rate to be used throughout training.
        linear_ramp_to_cosine (LinearRampToCosine, optional): Learning rate that ramps up linearly
            and then decays following a cosine schedule.
        linear_ramp_to_constant (LinearRampToConstant, optional): Learning rate that ramps up
            linearly and then remains constant.
        piecewise_constant (PiecewiseConstant, optional): Learning rate that changes at specified
            boundaries with corresponding values.

    Example:
        To configure a learning rate schedule, create an instance of this class and set the
        attributes accordingly. For example:

        ```python
        learning_rate = LearningRate(
            constant=0.01,
            linear_ramp_to_cosine=LinearRampToCosine(
                learning_rate=0.1,
                final_learning_rate=0.001,
                num_ramp_steps=1000,
                final_num_steps=5000
            )
        )
        ```

    Note:
        Each learning rate schedule attribute can be set to `None` if not needed.

    """
  constant: float = pydantic.Field(None, one_of="lr")
  linear_ramp_to_cosine: LinearRampToCosine = pydantic.Field(None, one_of="lr")
  linear_ramp_to_constant: LinearRampToConstant = pydantic.Field(None, one_of="lr")
  piecewise_constant: PiecewiseConstant = pydantic.Field(None, one_of="lr")


class OptimizerAlgorithmConfig(base_config.BaseConfig):
  """
    Base class for optimizer configurations.

    This base configuration class provides a structure for specifying various optimizer-related
    settings, including the learning rate and different learning rate schedules.

    Attributes:
        lr (float): The base learning rate used by the optimizer.

    Subclasses should inherit from this base class and define additional attributes specific to
    the optimizer algorithm they represent.

    Example:
        To create a custom optimizer configuration, create a subclass of this base class and
        define the necessary attributes. For example:

        ```python
        class MyOptimizerConfig(OptimizerAlgorithmConfig):
            momentum: float = pydantic.Field(0.9, description="Momentum value for SGD.")
        ```

    Note:
        This base class does not include specific optimizer settings. Subclasses should define
        the optimizer-specific attributes as needed.

    """

  lr: float
  ...


class AdamConfig(OptimizerAlgorithmConfig):
  """
    Configuration for the Adam optimizer.

    This configuration class allows you to specify the hyperparameters for the Adam optimizer.

    Attributes:
        lr (float): The learning rate for optimization.
        betas (Tuple[float, float], optional): Coefficients used for computing running averages
            of gradient and squared gradient. Defaults to (0.9, 0.999).
        eps (float, optional): A small constant added to the denominator for numerical stability.
            Defaults to 1e-7.

    Example:
        To configure the Adam optimizer, create an instance of this class and set the attributes
        accordingly. For example:

        ```python
        adam_optimizer = AdamConfig(
            lr=0.001,
            betas=(0.9, 0.999),
            eps=1e-8
        )
        ```

    See Also:
        [PyTorch Adam Documentation](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam)

    """
  lr: float
  betas: typing.Tuple[float, float] = [0.9, 0.999]
  eps: float = 1e-7  # Numerical stability in denominator.


class SgdConfig(OptimizerAlgorithmConfig):
  """
    Configuration for the Stochastic Gradient Descent (SGD) optimizer.

    This configuration class allows you to specify the hyperparameters for the SGD optimizer.

    Attributes:
        lr (float): The learning rate for optimization.
        momentum (float, optional): The momentum factor for SGD. Defaults to 0.0.

    Example:
        To configure the SGD optimizer, create an instance of this class and set the attributes
        accordingly. For example:

        ```python
        sgd_optimizer = SgdConfig(
            lr=0.01,
            momentum=0.9
        )
        ```

    """
  lr: float
  momentum: float = 0.0


class AdagradConfig(OptimizerAlgorithmConfig):
  """
    Configuration for the optimizer used during training.

    This configuration class allows you to specify the optimizer for training, including
    options for various optimizer algorithms.

    Attributes:
        learning_rate (LearningRate, optional): Learning rate configuration. Defaults to None.
        adam (AdamConfig, optional): Configuration for the Adam optimizer. Defaults to None.
        sgd (SgdConfig, optional): Configuration for the Stochastic Gradient Descent (SGD) optimizer.
            Defaults to None.
        adagrad (AdagradConfig, optional): Configuration for the Adagrad optimizer. Defaults to None.

    Example:
        To configure the optimizer for training, create an instance of this class and set the
        attributes accordingly. For example:

        ```python
        optimizer_config = OptimizerConfig(
            learning_rate=LearningRate(constant=0.001),
            adam=AdamConfig(lr=0.001, betas=(0.9, 0.999), eps=1e-8)
        )
        ```

    """
  lr: float
  eps: float = 0


class OptimizerConfig(base_config.BaseConfig):
  """
    Configuration for defining different optimizer algorithms and their parameters.

    This class allows you to configure various optimizer algorithms such as Adam, SGD, and Adagrad,
    along with their respective hyperparameters.

    Args:
        learning_rate (LearningRate): The learning rate configuration, which can include
            constant learning rates or other learning rate schedules.
        adam (AdamConfig): Configuration for the Adam optimizer.
        sgd (SgdConfig): Configuration for the Stochastic Gradient Descent (SGD) optimizer.
        adagrad (AdagradConfig): Configuration for the Adagrad optimizer.

    Example:
        ```python
        optimizer_config = OptimizerConfig(
            learning_rate=LearningRate(constant=0.001),
            adam=AdamConfig(lr=0.001, betas=(0.9, 0.999), eps=1e-8),
        )
        ```

    Attributes:
        learning_rate (LearningRate): The learning rate configuration.
        adam (AdamConfig): Configuration for the Adam optimizer.
        sgd (SgdConfig): Configuration for the Stochastic Gradient Descent (SGD) optimizer.
        adagrad (AdagradConfig): Configuration for the Adagrad optimizer.

    Note:
        You can specify only one of the optimizer configurations (adam, sgd, or adagrad) in an
        `OptimizerConfig` instance.

    See Also:
        - `LearningRate`: Configuration for specifying learning rates.
        - `AdamConfig`: Configuration for the Adam optimizer.
        - `SgdConfig`: Configuration for the Stochastic Gradient Descent (SGD) optimizer.
        - `AdagradConfig`: Configuration for the Adagrad optimizer.

    """
  learning_rate: LearningRate = pydantic.Field(
    None,
    description="Constant learning rates",
  )
  adam: AdamConfig = pydantic.Field(None, one_of="optimizer")
  sgd: SgdConfig = pydantic.Field(None, one_of="optimizer")
  adagrad: AdagradConfig = pydantic.Field(None, one_of="optimizer")


def get_optimizer_algorithm_config(optimizer_config: OptimizerConfig):
  """
    Get the optimizer algorithm configuration from the given `OptimizerConfig`.

    This function extracts and returns the specific optimizer algorithm configuration
    (e.g., Adam, SGD, or Adagrad) from the provided `OptimizerConfig`.

    Args:
        optimizer_config (OptimizerConfig): The optimizer configuration object containing
            one of the optimizer algorithm configurations.

    Returns:
        Union[AdamConfig, SgdConfig, AdagradConfig]: The specific optimizer algorithm
        configuration extracted from `optimizer_config`.

    Raises:
        ValueError: If no optimizer algorithm is selected in `optimizer_config`.

    Example:
        ```python
        optimizer_config = OptimizerConfig(
            adam=AdamConfig(lr=0.001, betas=(0.9, 0.999), eps=1e-8)
        )
        algorithm_config = get_optimizer_algorithm_config(optimizer_config)
        # `algorithm_config` will be an instance of `AdamConfig`.
        ```

    """
  if optimizer_config.adam is not None:
    return optimizer_config.adam
  elif optimizer_config.sgd is not None:
    return optimizer_config.sgd
  elif optimizer_config.adagrad is not None:
    return optimizer_config.adagrad
  else:
    raise ValueError(f"No optimizer selected in optimizer_config, passed {optimizer_config}")