"""Optimization configurations for models."""

import typing

import tml.core.config as base_config

import pydantic


class PiecewiseConstant(base_config.BaseConfig):
  """
  Configuration for a piecewise constant learning rate schedule.

  This configuration class allows you to specify a piecewise constant learning rate schedule
  by defining boundaries and corresponding learning rate values.

  Attributes:
    learning_rate_boundaries (List[int], optional): List of step boundaries at which
      the learning rate will change. If None, no boundaries are defined.
    learning_rate_values (List[float], optional): List of learning rate values
      corresponding to the boundaries. If None, no values are defined.

  Example:
    To configure a piecewise constant learning rate schedule, create an instance of this class
    and set the attributes accordingly. For example:

    ```python
    piecewise_lr = PiecewiseConstant(
      learning_rate_boundaries=[1000, 2000, 3000],
      learning_rate_values=[0.1, 0.05, 0.01, 0.001]
    )
    ```

  Note:
    The number of learning rate values should be one more than the number of boundaries.
  """

  learning_rate_boundaries: typing.List[int] = pydantic.Field(None)
  learning_rate_values: typing.List[float] = pydantic.Field(None)


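# Illustrative sketch, not part of the original module: one way a PiecewiseConstant schedule
# could be evaluated at a given step, assuming the usual convention that values[i] applies
# before boundaries[i] and the last value applies after the final boundary (hence
# len(values) == len(boundaries) + 1). The helper name is hypothetical.
def _example_piecewise_constant_lr(config: PiecewiseConstant, step: int) -> float:
  import bisect

  # bisect_right counts how many boundaries `step` has passed, which indexes
  # directly into the list of learning rate values.
  index = bisect.bisect_right(config.learning_rate_boundaries, step)
  return config.learning_rate_values[index]

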
class LinearRampToConstant(base_config.BaseConfig):
  """
  Configuration for a linear ramp-up to constant learning rate schedule.

  This configuration class allows you to specify a learning rate schedule that ramps up linearly
  from zero to a constant value over a specified number of steps.

  Attributes:
    learning_rate (float): The final constant learning rate.
    num_ramp_steps (PositiveInt): Number of steps to ramp up the learning rate from zero.

  Example:
    To configure a linear ramp-up to a constant learning rate, create an instance of this class
    and set the attributes accordingly. For example:

    ```python
    linear_ramp_lr = LinearRampToConstant(
      learning_rate=0.1,
      num_ramp_steps=1000
    )
    ```
  """

  learning_rate: float
  num_ramp_steps: pydantic.PositiveInt = pydantic.Field(
    description="Number of steps to ramp this up from zero."
  )


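# Illustrative sketch, not part of the original module: a plausible evaluation of the
# ramp-to-constant schedule above, assuming a linear ramp from zero that is capped at
# `learning_rate` once `num_ramp_steps` is reached. The helper name is hypothetical.
def _example_linear_ramp_to_constant_lr(config: LinearRampToConstant, step: int) -> float:
  # Slope of the ramp; past `num_ramp_steps` the cap takes over.
  slope = config.learning_rate / config.num_ramp_steps
  return min(config.learning_rate, slope * step)

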
class LinearRampToCosine(base_config.BaseConfig):
  """
  Configuration for a linear ramp-up to cosine decay learning rate schedule.

  This configuration class allows you to specify a learning rate schedule that ramps up linearly
  from zero, then decays following a cosine schedule down to a final constant learning rate.

  Attributes:
    learning_rate (float): The peak learning rate reached at the end of the linear ramp-up,
      from which the cosine decay starts.
    final_learning_rate (float): The final constant learning rate after the decay.
    num_ramp_steps (PositiveInt): Number of steps to ramp up the learning rate from zero.
    final_num_steps (PositiveInt): Step at which the decay stops; beyond this point the
      learning rate stays at `final_learning_rate`.

  Example:
    To configure a linear ramp-up to cosine decay learning rate, create an instance of this
    class and set the attributes accordingly. For example:

    ```python
    ramp_to_cosine_lr = LinearRampToCosine(
      learning_rate=0.01,
      final_learning_rate=0.001,
      num_ramp_steps=1000,
      final_num_steps=5000
    )
    ```
  """

  learning_rate: float
  final_learning_rate: float
  num_ramp_steps: pydantic.PositiveInt = pydantic.Field(
    description="Number of steps to ramp this up from zero."
  )
  final_num_steps: pydantic.PositiveInt = pydantic.Field(
    description="Final number of steps where decay stops."
  )


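# Illustrative sketch, not part of the original module: how the ramp-then-cosine schedule
# above could be evaluated at a given step. The helper name is hypothetical and the exact
# decay formula is an assumption about the intended semantics.
def _example_linear_ramp_to_cosine_lr(config: LinearRampToCosine, step: int) -> float:
  import math

  if step < config.num_ramp_steps:
    # Linear ramp from 0 up to `learning_rate`.
    return config.learning_rate * step / config.num_ramp_steps
  if step <= config.final_num_steps:
    # Cosine decay from `learning_rate` down to `final_learning_rate`.
    progress = (step - config.num_ramp_steps) / (config.final_num_steps - config.num_ramp_steps)
    return config.final_learning_rate + 0.5 * (
      config.learning_rate - config.final_learning_rate
    ) * (1.0 + math.cos(math.pi * progress))
  # Past `final_num_steps` the learning rate stays constant.
  return config.final_learning_rate

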
class LearningRate(base_config.BaseConfig):
  """
  Learning rate configuration for training.

  This configuration class allows you to specify one of several learning rate schedules
  for your training process.

  Attributes:
    constant (float, optional): Constant learning rate to be used throughout training.
    linear_ramp_to_cosine (LinearRampToCosine, optional): Learning rate that ramps up linearly
      and then decays following a cosine schedule.
    linear_ramp_to_constant (LinearRampToConstant, optional): Learning rate that ramps up
      linearly and then remains constant.
    piecewise_constant (PiecewiseConstant, optional): Learning rate that changes at specified
      boundaries with corresponding values.

  Example:
    To configure a learning rate schedule, create an instance of this class and set exactly
    one of the attributes. For example:

    ```python
    learning_rate = LearningRate(
      linear_ramp_to_cosine=LinearRampToCosine(
        learning_rate=0.1,
        final_learning_rate=0.001,
        num_ramp_steps=1000,
        final_num_steps=5000
      )
    )
    ```

  Note:
    The schedule attributes are mutually exclusive: only one of them should be set, as they
    share the `one_of="lr"` constraint.
  """

  constant: float = pydantic.Field(None, one_of="lr")
  linear_ramp_to_cosine: LinearRampToCosine = pydantic.Field(None, one_of="lr")
  linear_ramp_to_constant: LinearRampToConstant = pydantic.Field(None, one_of="lr")
  piecewise_constant: PiecewiseConstant = pydantic.Field(None, one_of="lr")


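# Illustrative sketch, not part of the original module: resolving a LearningRate config to a
# value at a given step by dispatching on whichever schedule is set. It reuses the hypothetical
# example helpers sketched above.
def _example_compute_lr(config: LearningRate, step: int) -> float:
  if config.constant is not None:
    return config.constant
  if config.piecewise_constant is not None:
    return _example_piecewise_constant_lr(config.piecewise_constant, step)
  if config.linear_ramp_to_constant is not None:
    return _example_linear_ramp_to_constant_lr(config.linear_ramp_to_constant, step)
  if config.linear_ramp_to_cosine is not None:
    return _example_linear_ramp_to_cosine_lr(config.linear_ramp_to_cosine, step)
  raise ValueError("No learning rate schedule set in this LearningRate config.")

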
class OptimizerAlgorithmConfig(base_config.BaseConfig):
  """
  Base class for optimizer configurations.

  This base configuration class provides a structure for optimizer-related settings,
  in particular the base learning rate shared by all optimizer algorithms.

  Attributes:
    lr (float): The base learning rate used by the optimizer.

  Subclasses should inherit from this base class and define additional attributes specific to
  the optimizer algorithm they represent.

  Example:
    To create a custom optimizer configuration, create a subclass of this base class and
    define the necessary attributes. For example:

    ```python
    class MyOptimizerConfig(OptimizerAlgorithmConfig):
      momentum: float = pydantic.Field(0.9, description="Momentum value for SGD.")
    ```

  Note:
    This base class does not include specific optimizer settings. Subclasses should define
    the optimizer-specific attributes as needed.
  """

  lr: float
  ...


class AdamConfig(OptimizerAlgorithmConfig):
  """
  Configuration for the Adam optimizer.

  This configuration class allows you to specify the hyperparameters for the Adam optimizer.

  Attributes:
    lr (float): The learning rate for optimization.
    betas (Tuple[float, float], optional): Coefficients used for computing running averages
      of the gradient and its square. Defaults to (0.9, 0.999).
    eps (float, optional): A small constant added to the denominator for numerical stability.
      Defaults to 1e-7.

  Example:
    To configure the Adam optimizer, create an instance of this class and set the attributes
    accordingly. For example:

    ```python
    adam_optimizer = AdamConfig(
      lr=0.001,
      betas=(0.9, 0.999),
      eps=1e-8
    )
    ```

  See Also:
    [PyTorch Adam documentation](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam)
  """

  lr: float
  betas: typing.Tuple[float, float] = (0.9, 0.999)
  eps: float = 1e-7  # Numerical stability in denominator.


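# Illustrative sketch, not part of the original module: how an AdamConfig could be handed to
# torch.optim.Adam. torch is assumed to be available in the surrounding project but is not
# imported by this config module, hence the local import; the helper name is hypothetical.
def _example_build_adam(config: AdamConfig, params):
  import torch

  # AdamConfig fields map one-to-one onto torch.optim.Adam keyword arguments.
  return torch.optim.Adam(params, lr=config.lr, betas=config.betas, eps=config.eps)

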
class SgdConfig(OptimizerAlgorithmConfig):
  """
  Configuration for the Stochastic Gradient Descent (SGD) optimizer.

  This configuration class allows you to specify the hyperparameters for the SGD optimizer.

  Attributes:
    lr (float): The learning rate for optimization.
    momentum (float, optional): The momentum factor for SGD. Defaults to 0.0.

  Example:
    To configure the SGD optimizer, create an instance of this class and set the attributes
    accordingly. For example:

    ```python
    sgd_optimizer = SgdConfig(
      lr=0.01,
      momentum=0.9
    )
    ```
  """

  lr: float
  momentum: float = 0.0


class AdagradConfig(OptimizerAlgorithmConfig):
  """
  Configuration for the Adagrad optimizer.

  This configuration class allows you to specify the hyperparameters for the Adagrad optimizer.

  Attributes:
    lr (float): The learning rate for optimization.
    eps (float, optional): Term added to the denominator for numerical stability. Defaults to 0.

  Example:
    To configure the Adagrad optimizer, create an instance of this class and set the attributes
    accordingly. For example:

    ```python
    adagrad_optimizer = AdagradConfig(
      lr=0.01,
      eps=1e-10
    )
    ```
  """

  lr: float
  eps: float = 0


class OptimizerConfig(base_config.BaseConfig):
  """
  Configuration for defining different optimizer algorithms and their parameters.

  This class allows you to configure various optimizer algorithms such as Adam, SGD, and Adagrad,
  along with their respective hyperparameters.

  Attributes:
    learning_rate (LearningRate): The learning rate configuration, which can be a constant
      learning rate or one of the other learning rate schedules.
    adam (AdamConfig): Configuration for the Adam optimizer.
    sgd (SgdConfig): Configuration for the Stochastic Gradient Descent (SGD) optimizer.
    adagrad (AdagradConfig): Configuration for the Adagrad optimizer.

  Example:
    ```python
    optimizer_config = OptimizerConfig(
      learning_rate=LearningRate(constant=0.001),
      adam=AdamConfig(lr=0.001, betas=(0.9, 0.999), eps=1e-8),
    )
    ```

  Note:
    You can specify only one of the optimizer configurations (adam, sgd, or adagrad) in an
    `OptimizerConfig` instance.

  See Also:
    - `LearningRate`: Configuration for specifying learning rates.
    - `AdamConfig`: Configuration for the Adam optimizer.
    - `SgdConfig`: Configuration for the Stochastic Gradient Descent (SGD) optimizer.
    - `AdagradConfig`: Configuration for the Adagrad optimizer.
  """

  learning_rate: LearningRate = pydantic.Field(
    None,
    description="Constant learning rates",
  )
  adam: AdamConfig = pydantic.Field(None, one_of="optimizer")
  sgd: SgdConfig = pydantic.Field(None, one_of="optimizer")
  adagrad: AdagradConfig = pydantic.Field(None, one_of="optimizer")


def get_optimizer_algorithm_config(optimizer_config: OptimizerConfig):
  """
  Get the optimizer algorithm configuration from the given `OptimizerConfig`.

  This function extracts and returns the specific optimizer algorithm configuration
  (e.g., Adam, SGD, or Adagrad) from the provided `OptimizerConfig`.

  Args:
    optimizer_config (OptimizerConfig): The optimizer configuration object containing
      one of the optimizer algorithm configurations.

  Returns:
    Union[AdamConfig, SgdConfig, AdagradConfig]: The specific optimizer algorithm
      configuration extracted from `optimizer_config`.

  Raises:
    ValueError: If no optimizer algorithm is selected in `optimizer_config`.

  Example:
    ```python
    optimizer_config = OptimizerConfig(
      adam=AdamConfig(lr=0.001, betas=(0.9, 0.999), eps=1e-8)
    )
    algorithm_config = get_optimizer_algorithm_config(optimizer_config)
    # `algorithm_config` will be an instance of `AdamConfig`.
    ```
  """
  if optimizer_config.adam is not None:
    return optimizer_config.adam
  elif optimizer_config.sgd is not None:
    return optimizer_config.sgd
  elif optimizer_config.adagrad is not None:
    return optimizer_config.adagrad
  else:
    raise ValueError(f"No optimizer selected in optimizer_config, passed {optimizer_config}")
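

# Illustrative sketch, not part of the original module: dispatching an OptimizerConfig to a
# concrete torch optimizer. torch is assumed to be available in the surrounding project; the
# helper name and the mapping below are assumptions, not the repository's actual plumbing.
def _example_build_optimizer(optimizer_config: OptimizerConfig, params):
  import torch

  algorithm_config = get_optimizer_algorithm_config(optimizer_config)
  if isinstance(algorithm_config, AdamConfig):
    return torch.optim.Adam(
      params, lr=algorithm_config.lr, betas=algorithm_config.betas, eps=algorithm_config.eps
    )
  if isinstance(algorithm_config, SgdConfig):
    return torch.optim.SGD(params, lr=algorithm_config.lr, momentum=algorithm_config.momentum)
  # AdagradConfig is the only other value `get_optimizer_algorithm_config` can return.
  return torch.optim.Adagrad(params, lr=algorithm_config.lr, eps=algorithm_config.eps)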