core update

remaining train_pipeline.py
This commit is contained in:
rajveer43 2023-09-11 16:26:29 +05:30
parent b85210863f
commit 799254345f
7 changed files with 366 additions and 29 deletions

View File

@@ -46,7 +46,24 @@ def get_new_iterator(iterable: Iterable):
def _get_step_fn(pipeline, data_iterator, training: bool):
"""
Returns a function that performs a single training or evaluation step.
Args:
pipeline (Pipeline): The pipeline object containing the model.
data_iterator (Iterator): The data iterator providing input batches.
training (bool): Flag indicating if the model should be in training mode.
Returns:
function: A function that performs a single step.
"""
def step_fn():
"""
Perform a single training or evaluation step.
Returns:
Any: The results of the step.
"""
# It turns out that model.train() and model.eval() simply switch a single field inside the model
# class, so it's somewhat safer to wrap in here.
if training:
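Note: a minimal self-contained sketch of the wrapping pattern documented here, using generic names rather than this module's actual Pipeline API:

import torch

def get_step_fn_sketch(model: torch.nn.Module, batches, training: bool):
    # Returns a closure that runs one step with the model in a known mode.
    def step_fn():
        if training:
            model.train()  # flips the module's internal training flag
            return model(next(batches))
        model.eval()
        with torch.no_grad():  # no gradients needed for evaluation
            return model(next(batches))
    return step_fn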
@@ -69,7 +86,21 @@ def _run_evaluation(
eval_batch_size: int,
logger=None,
):
"""
Run the evaluation loop over all evaluation iterators.
Args:
pipeline (Pipeline): The pipeline object containing the model.
dataset (Dataset): The dataset to evaluate.
eval_steps (int): The number of evaluation steps to perform.
metrics (tm.MetricCollection): A collection of evaluation metrics.
eval_batch_size (int): Batch size for evaluation.
logger (Optional[Logger]): A logger for recording evaluation progress (default: None).
Returns:
dict: A dictionary containing the computed evaluation metrics.
"""
dataset = get_new_iterator(dataset)
step_fn = _get_step_fn(pipeline, dataset, training=False)
last_time = datetime.datetime.now()
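For illustration, a hedged sketch of the loop this function documents, assuming step_fn() returns an outputs dict that the metrics accept:

import torchmetrics as tm

def run_evaluation_sketch(step_fn, eval_steps: int, metrics: tm.MetricCollection):
    for _ in range(eval_steps):
        outputs = step_fn()      # one forward pass over an evaluation batch
        metrics.update(outputs)  # accumulate metric state
    results = metrics.compute()  # reduce accumulated state to final values
    metrics.reset()              # clear state before the next evaluation round
    return results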
@@ -109,14 +140,28 @@ def train(
parameters_to_log: Optional[Dict[str, Callable]] = None,
tables_to_log: Optional[List[str]] = None,
) -> None:
"""Runs training and eval on the given TrainPipeline """
Runs training and evaluation on the given TrainPipeline.
Args: Args:
dataset: data iterator for the training set model (torch.nn.Module): The neural network model to train.
evaluation_iterators: data iterators for the different evaluation sets optimizer (torch.optim.Optimizer): The optimizer for model optimization.
scheduler: optional learning rate scheduler device (str): The target device for model training (e.g., 'cuda' or 'cpu').
output_transform_for_metrics: optional transformation functions to transorm the model save_dir (str): The directory to save model checkpoints and logs.
output and labels into a format the metrics can understand logging_interval (int): Interval for logging training progress.
train_steps (int): The number of training steps to perform.
checkpoint_frequency (int): Frequency of saving model checkpoints.
dataset (Iterable): Data iterator for the training set.
worker_batch_size (int): Batch size for data loading workers.
num_workers (Optional[int]): Number of data loading workers (default: 0).
enable_amp (bool): Flag to enable Automatic Mixed Precision (AMP) training (default: False).
initial_checkpoint_dir (Optional[str]): Directory to initialize training from (default: None).
gradient_accumulation (Optional[int]): Number of gradient accumulation steps (default: None).
logger_initializer (Optional[Callable]): A logger initializer function (default: None).
scheduler (_LRScheduler): Optional learning rate scheduler (default: None).
metrics (Optional[tm.MetricCollection]): A collection of evaluation metrics (default: None).
parameters_to_log (Optional[Dict[str, Callable]]): Dictionary of parameters to log (default: None).
tables_to_log (Optional[List[str]]): List of tables to log (default: None).
""" """
train_pipeline = TrainPipelineSparseDist( train_pipeline = TrainPipelineSparseDist(
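A hypothetical invocation matching the documented signature; model, optimizer, and train_iterator are placeholders assumed to be constructed elsewhere, and none of the values are recommendations:

train(
    model=model,
    optimizer=optimizer,
    device="cuda",
    save_dir="/tmp/checkpoints",
    logging_interval=100,
    train_steps=10_000,
    checkpoint_frequency=1_000,
    dataset=train_iterator,
    worker_batch_size=512,
    enable_amp=False,
)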
@@ -262,6 +307,15 @@ def log_eval_results(
partition_name: str,
step: int,
):
"""
Logs evaluation results and optionally records them using a provided logger.
Args:
results (Any): The evaluation results to log.
eval_logger (Callable): A logger for recording evaluation results.
partition_name (str): The name of the evaluation partition.
step (int): The current step in the evaluation.
"""
results = tree.map_structure(lambda elem: torch.as_tensor(elem).cpu(), results)
logging.info(f"Step: {step}, evaluation ({partition_name}).")
for metric_name, metric_value in results.items():
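tree.map_structure (from the dm-tree package) applies a function to every leaf of a nested structure; a small self-contained illustration of the to-CPU pattern used here:

import torch
import tree  # dm-tree

results = {"auc": torch.tensor(0.91), "counts": {"pos": torch.tensor(5)}}
# Convert every leaf to a CPU tensor while preserving the nesting.
cpu_results = tree.map_structure(lambda elem: torch.as_tensor(elem).cpu(), results)
print(cpu_results["counts"]["pos"])  # tensor(5)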
@@ -285,6 +339,23 @@ def only_evaluate(
partition_name: str,
metrics: Optional[tm.MetricCollection] = None,
):
"""
Performs evaluation on a given dataset partition.
Args:
model (torch.nn.Module): The neural network model for evaluation.
optimizer (torch.optim.Optimizer): The optimizer used during evaluation.
device (str): The target device for evaluation (e.g., 'cuda' or 'cpu').
save_dir (str): The directory containing model checkpoints.
num_train_steps (int): The total number of training steps.
dataset (Iterable): Data iterator for evaluation.
eval_batch_size (int): Batch size for evaluation.
num_eval_steps (int): The number of evaluation steps to perform.
eval_timeout_in_s (int): Timeout for evaluating checkpoints in seconds.
eval_logger (Callable): A logger for recording evaluation results.
partition_name (str): The name of the evaluation partition.
metrics (Optional[tm.MetricCollection]): A collection of evaluation metrics (default: None).
"""
logging.info(f"Evaluating on partition {partition_name}.") logging.info(f"Evaluating on partition {partition_name}.")
logging.info("Computing metrics:") logging.info("Computing metrics:")
logging.info(metrics) logging.info(metrics)

View File

@@ -28,6 +28,18 @@ def train(
*args,
**kwargs,
) -> None:
"""
Debugging training loop. Do not use for actual model training.
Args:
model (torch.nn.Module): The neural network model.
optimizer (torch.optim.Optimizer): The optimizer for model optimization.
train_steps (int): The number of training steps to perform.
dataset (Iterable): Data iterator for training data.
scheduler (_LRScheduler, optional): Learning rate scheduler (default: None).
*args: Additional arguments (ignored).
**kwargs: Additional keyword arguments (ignored).
"""
logging.warning("Running debug training loop, don't use for model training.") logging.warning("Running debug training loop, don't use for model training.")

View File

@@ -10,7 +10,10 @@ import torch
def _maybe_warn(reduction: str):
"""
Emit a warning if the reduction method is different from 'mean'.
Args:
reduction (str): The reduction method being used.
""" """
if reduction != "mean": if reduction != "mean":
logging.warn( logging.warn(
@@ -24,6 +27,16 @@ def build_loss(
loss_type: LossType,
reduction="mean",
):
"""
Build a loss function based on the specified loss type and reduction method.
Args:
loss_type (LossType): The type of loss to build.
reduction (str): The reduction method for the loss (default: 'mean').
Returns:
Callable: A loss function that takes logits and labels as input.
"""
_maybe_warn(reduction)
f = _LOSS_TYPE_TO_FUNCTION[loss_type]
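A hedged sketch of the lookup-and-close-over pattern this function documents; the mapping below is illustrative, not the module's actual _LOSS_TYPE_TO_FUNCTION:

import torch.nn.functional as F

LOSS_FUNCTIONS_SKETCH = {"bce": F.binary_cross_entropy_with_logits}

def build_loss_sketch(loss_type: str, reduction: str = "mean"):
    f = LOSS_FUNCTIONS_SKETCH[loss_type]
    def loss_fn(logits, labels):
        # Close over the chosen function and reduction.
        return f(logits, labels.float(), reduction=reduction)
    return loss_fn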
@@ -36,9 +49,13 @@ def build_loss(
def get_global_loss_detached(local_loss, reduction="mean"):
"""
Perform all_reduce to obtain the global loss using the provided reduction.
Args:
local_loss (torch.Tensor): The local loss of the current rank.
reduction (str): The reduction to use for all_reduce. Should match the reduction used by DDP.
Returns:
torch.Tensor: The reduced and detached global loss.
"""
if reduction != "mean": if reduction != "mean":
logging.warn( logging.warn(
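A minimal sketch of the all_reduce pattern for a mean-reduced loss, assuming torch.distributed is already initialized:

import torch
import torch.distributed as dist

def global_mean_loss_sketch(local_loss: torch.Tensor) -> torch.Tensor:
    loss = local_loss.detach().clone()           # leave the autograd graph untouched
    dist.all_reduce(loss, op=dist.ReduceOp.SUM)  # sum the loss across all ranks
    return loss / dist.get_world_size()          # sum / world_size == global mean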
@@ -66,6 +83,19 @@ def build_multi_task_loss(
global_reduction="mean",
pos_weights=None,
):
"""
Build a multi-task loss function based on the specified loss type and configurations.
Args:
loss_type (LossType): The type of loss to build.
tasks (typing.List[str]): List of task names.
task_loss_reduction (str): Reduction method for task-specific losses (default: 'mean').
global_reduction (str): Reduction method for the global loss (default: 'mean').
pos_weights (Optional): Positive class weights for tasks (default: None).
Returns:
Callable: A multi-task loss function that takes logits, labels, and weights as input.
"""
_maybe_warn(global_reduction)
_maybe_warn(task_loss_reduction)
f = _LOSS_TYPE_TO_FUNCTION[loss_type]
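A hedged sketch of the per-task-then-global reduction idea; the shapes and the BCE choice are assumptions for illustration:

import torch
import torch.nn.functional as F

def multi_task_loss_sketch(logits, labels, tasks):
    # logits/labels: [batch, num_tasks]; compute one scalar loss per task.
    per_task = {
        task: F.binary_cross_entropy_with_logits(logits[:, i], labels[:, i].float())
        for i, task in enumerate(tasks)
    }
    global_loss = torch.stack(list(per_task.values())).mean()  # global reduction
    return global_loss, per_task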

View File

@@ -36,9 +36,24 @@ import torchmetrics
class MetricMixin:
@abstractmethod
def transform(self, outputs: Dict[str, torch.Tensor]) -> Dict:
"""
Abstract method to transform model outputs into the inputs for a metric update.
Args:
outputs (Dict[str, torch.Tensor]): Model outputs.
Returns:
Dict: A dictionary of values to pass to the underlying metric's update.
"""
...
def update(self, outputs: Dict[str, torch.Tensor]):
"""
Update the metrics based on model outputs.
Args:
outputs (Dict[str, torch.Tensor]): Model outputs.
"""
results = self.transform(outputs)
# Do not try to update if any tensor is empty as a result of stratification.
for value in results.values():
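A self-contained sketch of the mixin idea: transform() reshapes raw model outputs into the keyword arguments the underlying torchmetrics update() expects (the MeanLabel name is invented for illustration):

import torch
import torchmetrics as tm

class MeanLabel(tm.MeanMetric):
    def transform(self, outputs):
        return {"value": outputs["labels"]}

    def update(self, outputs):
        super().update(**self.transform(outputs))

m = MeanLabel()
m.update({"labels": torch.tensor([0.0, 1.0, 1.0])})
print(m.compute())  # tensor(0.6667)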
@@ -49,6 +64,13 @@ class MetricMixin:
class TaskMixin:
def __init__(self, task_idx: int = -1, **kwargs):
"""
Initialize a TaskMixin instance.
Args:
task_idx (int): Index of the task associated with this mixin (default: -1).
**kwargs: Additional keyword arguments.
"""
super().__init__(**kwargs)
self._task_idx = task_idx
@@ -59,13 +81,31 @@ class StratifyMixin:
stratifier=None,
**kwargs,
):
"""
Initialize a StratifyMixin instance.
Args:
stratifier: A stratifier for filtering outputs (default: None).
**kwargs: Additional keyword arguments.
"""
super().__init__(**kwargs)
self._stratifier = stratifier
def maybe_apply_stratification(
self, outputs: Dict[str, torch.Tensor], value_names: List[str]
) -> Dict[str, torch.Tensor]:
"""
Apply stratification to filter examples in the outputs.
Pick out examples with values for which the stratifier feature is equal to a specific stratifier indicator value.
Args:
outputs (Dict[str, torch.Tensor]): Model outputs.
value_names (List[str]): Names of values to filter.
Returns:
Dict[str, torch.Tensor]: Filtered outputs.
"""
outputs = outputs.copy()
if not self._stratifier:
return outputs
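A self-contained illustration of the masking idea: keep only the rows where the stratifier feature equals the indicator value (the column layout is an assumption):

import torch

def stratify_sketch(outputs, column: int, indicator: int):
    mask = outputs["features"][:, column] == indicator
    return {name: value[mask] for name, value in outputs.items()}

batch = {
    "features": torch.tensor([[1], [2], [1]]),
    "labels": torch.tensor([0.0, 1.0, 1.0]),
}
print(stratify_sketch(batch, column=0, indicator=1)["labels"])  # tensor([0., 1.])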
@@ -84,11 +124,19 @@ class StratifyMixin:
def prepend_transform(base_metric: torchmetrics.Metric, transform: Callable):
"""
Returns a new class using MetricMixin and the given base_metric.
Functionally the same as using inheritance, but it saves some lines of code
if there's no need for class attributes.
Args:
base_metric (torchmetrics.Metric): The base metric class to prepend the transform to.
transform (Callable): The transformation function to prepend to the metric.
Returns:
Type: A new class that includes MetricMixin and the provided base_metric
with the specified transformation method.
""" """
def transform_method(_self, *args, **kwargs): def transform_method(_self, *args, **kwargs):
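Usage, as exercised in the metrics tests later in this commit, builds a MaxMetric subclass whose transform() pulls the "value" key out of the outputs dict:

Max = prepend_transform(MaxMetric, lambda outputs: {"value": outputs["value"]})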

View File

@@ -15,6 +15,16 @@ def probs_and_labels(
outputs: Dict[str, torch.Tensor],
task_idx: int,
) -> Dict[str, torch.Tensor]:
"""
Extract probabilities and labels from model outputs.
Args:
outputs (Dict[str, torch.Tensor]): Model outputs.
task_idx (int): Index of the task.
Returns:
Dict[str, torch.Tensor]: Dictionary containing 'preds' and 'target' tensors.
"""
preds = outputs["probabilities"] preds = outputs["probabilities"]
target = outputs["labels"] target = outputs["labels"]
if task_idx >= 0: if task_idx >= 0:
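A self-contained illustration of the slice this helper performs for multi-task outputs ([batch, num_tasks] tensors; the values are invented):

import torch

outputs = {
    "probabilities": torch.tensor([[0.2, 0.9], [0.7, 0.1]]),
    "labels": torch.tensor([[0, 1], [1, 0]]),
}
task_idx = 1
preds = outputs["probabilities"][:, task_idx]  # tensor([0.9000, 0.1000])
target = outputs["labels"][:, task_idx]        # tensor([1, 0])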
@@ -28,6 +38,11 @@ def probs_and_labels(
class Count(StratifyMixin, TaskMixin, MetricMixin, tm.SumMetric):
"""
Count metric class that inherits from StratifyMixin, TaskMixin, MetricMixin, and SumMetric.
This metric counts values after potential stratification and task selection.
"""
def transform(self, outputs):
outputs = self.maybe_apply_stratification(outputs, ["labels"])
value = outputs["labels"]
if self._task_idx >= 0:
@@ -36,6 +51,12 @@ class Count(StratifyMixin, TaskMixin, MetricMixin, tm.SumMetric):
class Ctr(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
"""
Ctr (Click-Through Rate) metric class that inherits from StratifyMixin, TaskMixin, MetricMixin, and MeanMetric.
This metric calculates the mean metric value after potential stratification and task selection.
"""
def transform(self, outputs):
outputs = self.maybe_apply_stratification(outputs, ["labels"])
value = outputs["labels"]
@@ -45,6 +66,11 @@ class Ctr(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
class Pctr(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
"""
Pctr (Predicted Click-Through Rate) metric class that inherits from StratifyMixin, TaskMixin, MetricMixin, and MeanMetric.
This metric calculates the mean metric value using probabilities after potential stratification and task selection.
"""
def transform(self, outputs):
outputs = self.maybe_apply_stratification(outputs, ["probabilities"])
value = outputs["probabilities"]
@@ -54,12 +80,22 @@ class Pctr(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
class Precision(StratifyMixin, TaskMixin, MetricMixin, tm.Precision):
"""
Precision metric class that inherits from StratifyMixin, TaskMixin, MetricMixin, and Precision.
This metric computes precision after potential stratification and task selection.
"""
def transform(self, outputs):
outputs = self.maybe_apply_stratification(outputs, ["probabilities", "labels"])
return probs_and_labels(outputs, self._task_idx)
class Recall(StratifyMixin, TaskMixin, MetricMixin, tm.Recall):
"""
Recall metric class that inherits from StratifyMixin, TaskMixin, MetricMixin, and Recall.
This metric computes recall after potential stratification and task selection.
"""
def transform(self, outputs):
outputs = self.maybe_apply_stratification(outputs, ["probabilities", "labels"])
return probs_and_labels(outputs, self._task_idx)
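A hedged usage sketch for the CTR-style metrics above, assuming default construction without a stratifier works as shown:

import torch

ctr = Ctr()  # hypothetical default construction
ctr.update({"labels": torch.tensor([0.0, 1.0, 1.0, 1.0])})
print(ctr.compute())  # mean label, i.e. empirical CTR: tensor(0.75)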
@@ -73,6 +109,14 @@ class TorchMetricsRocauc(StratifyMixin, TaskMixin, MetricMixin, tm.AUROC):
class Auc(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
"""
AUC (Area Under the ROC Curve) metric class.
This metric computes the AUC metric based on the logits and labels in the model outputs.
Args:
num_samples (int): The number of samples used to compute AUC.
**kwargs: Additional keyword arguments.
Based on:
https://github.com/facebookresearch/PyTorch-BigGraph/blob/a11ff0eb644b7e4cb569067c280112b47f40ef62/torchbiggraph/util.py#L420
"""
@@ -94,8 +138,14 @@ class Auc(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
class PosRanks(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
"""
PosRanks metric class.
This metric computes the ranks of all positive examples based on the logits and labels
in the model outputs.
Args:
**kwargs: Additional keyword arguments.
Based on:
https://github.com/facebookresearch/PyTorch-BigGraph/blob/a11ff0eb644b7e4cb569067c280112b47f40ef62/torchbiggraph/eval.py#L73
"""
@@ -112,8 +162,13 @@ class PosRanks(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
class ReciprocalRank(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
"""
ReciprocalRank metric class.
This metric computes the reciprocal of the ranks of all positive examples based on the logits and labels
in the model outputs.
Args:
**kwargs: Additional keyword arguments.
Based on:
https://github.com/facebookresearch/PyTorch-BigGraph/blob/a11ff0eb644b7e4cb569067c280112b47f40ef62/torchbiggraph/eval.py#L74
"""
@@ -130,9 +185,14 @@ class ReciprocalRank(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
class HitAtK(StratifyMixin, TaskMixin, MetricMixin, tm.MeanMetric):
"""
HitAtK metric class.
This metric computes the fraction of positive examples that rank in the top K among their negatives,
which is equivalent to precision@K.
Args:
k (int): The value of K.
**kwargs: Additional keyword arguments.
Based on:
https://github.com/facebookresearch/PyTorch-BigGraph/blob/a11ff0eb644b7e4cb569067c280112b47f40ef62/torchbiggraph/eval.py#L75
"""

View File

@@ -9,12 +9,26 @@ from torchmetrics import MaxMetric, MetricCollection, SumMetric
@dataclass
class MockStratifierConfig:
"""
Configuration dataclass for mocking a stratifier.
Args:
name (str): The name of the stratifier.
index (int): The index of the stratifier.
value (int): The value of the stratifier.
"""
name: str
index: int
value: int
class Count(MetricMixin, SumMetric):
"""
Count metric class that inherits from MetricMixin and SumMetric.
This metric counts occurrences.
"""
def transform(self, outputs):
return {"value": 1}
@@ -23,6 +37,12 @@ Max = prepend_transform(MaxMetric, lambda outputs: {"value": outputs["value"]})
def test_count_metric():
"""
Test function for the Count metric.
It checks if the Count metric correctly counts the number of examples.
"""
num_examples = 123
examples = [
{"stuff": 0},
@@ -36,6 +56,12 @@ def test_count_metric():
def test_collections():
"""
Test function for metric collections.
It tests if metric collections correctly aggregate metrics.
"""
max_metric = Max()
count_metric = Count()
metric = MetricCollection([max_metric, count_metric])
@@ -51,6 +77,12 @@ def test_collections():
def test_task_dependent_ctr():
"""
Test function for task-dependent Ctr (Click-Through Rate) metric.
It checks if the Ctr metric computes the correct value for different tasks.
"""
num_examples = 144
batch_size = 1024
outputs = [
@@ -69,6 +101,13 @@ def test_task_dependent_ctr():
def test_stratified_ctr():
"""
Test function for the Stratified Ctr (Click-Through Rate) metric.
It checks if the Stratified Ctr metric computes the correct value for different tasks
and stratified samples.
"""
outputs = [
{
"stuff": 0,
@@ -114,6 +153,12 @@ def test_stratified_ctr():
def test_auc():
"""
Test function for the AUC (Area Under the Curve) metric.
It checks if the AUC metric correctly computes the Area Under the ROC Curve.
"""
num_samples = 10000
metric = core_metrics.Auc(num_samples)
target = torch.tensor([0, 0, 1, 1, 1])
@@ -131,6 +176,12 @@ def test_auc():
def test_pos_rank():
"""
Test function for the PosRanks metric.
It checks if the PosRanks metric correctly computes the ranks of positive samples.
"""
metric = core_metrics.PosRanks()
target = torch.tensor([0, 0, 1, 1, 1])
preds_correct = torch.tensor([-1.0, -1.0, 0.5, 1.0, 1.5])
@@ -147,6 +198,12 @@ def test_pos_rank():
def test_reciprocal_rank():
"""
Test function for the Reciprocal Rank metric.
It checks if the Reciprocal Rank metric correctly computes the reciprocal of ranks.
"""
metric = core_metrics.ReciprocalRank()
target = torch.tensor([0, 0, 1, 1, 1])
preds_correct = torch.tensor([-1.0, -1.0, 0.5, 1.0, 1.5])
@@ -163,6 +220,12 @@ def test_reciprocal_rank():
def test_hit_k():
"""
Test function for the Hit@K metric.
It checks if the Hit@K metric correctly computes the fraction of positives that rank in the top K among their negatives.
"""
hit1_metric = core_metrics.HitAtK(1)
target = torch.tensor([0, 0, 1, 1, 1])
preds_correct = torch.tensor([-1.0, 1.0, 0.5, -0.1, 1.5])

View File

@@ -11,23 +11,60 @@ from torchrec.distributed import DistributedModelParallel
@dataclass
class MockDataclassBatch(DataclassBatch):
"""
Mock data class batch for testing purposes.
This class represents a batch of data with continuous features and labels.
Attributes:
continuous_features (torch.Tensor): Tensor containing continuous feature data.
labels (torch.Tensor): Tensor containing label data.
"""
continuous_features: torch.Tensor
labels: torch.Tensor
class MockModule(torch.nn.Module):
"""
Mock PyTorch module for testing purposes.
This module defines a simple neural network model with a linear layer
followed by a BCEWithLogitsLoss loss function.
Attributes:
model (torch.nn.Linear): The linear model layer.
loss_fn (torch.nn.BCEWithLogitsLoss): Binary cross-entropy loss function.
"""
def __init__(self) -> None:
super().__init__()
self.model = torch.nn.Linear(10, 1)
self.loss_fn = torch.nn.BCEWithLogitsLoss()
def forward(self, batch: MockDataclassBatch) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Forward pass of the mock module.
Args:
batch (MockDataclassBatch): Input data batch with continuous features and labels.
Returns:
Tuple[torch.Tensor, torch.Tensor]: A tuple containing the loss and predictions.
"""
pred = self.model(batch.continuous_features)
loss = self.loss_fn(pred, batch.labels)
return (loss, pred)
def create_batch(bsz: int):
"""
Create a mock data batch with random continuous features and labels.
Args:
bsz (int): Batch size.
Returns:
MockDataclassBatch: A batch of data with continuous features and labels.
"""
return MockDataclassBatch(
continuous_features=torch.rand(bsz, 10).float(),
labels=torch.bernoulli(torch.empty(bsz, 1).uniform_(0, 1)).float(),
@@ -35,6 +72,13 @@ def create_batch(bsz: int):
def test_sparse_pipeline():
"""
Test function for the sparse pipeline with distributed model parallelism.
This function tests the behavior of the sparse training pipeline using
a mock module and data.
"""
device = torch.device("cpu")
model = MockModule().to(device)
@@ -65,6 +109,15 @@ def test_sparse_pipeline():
def test_amp():
"""
Test automatic mixed-precision (AMP) training with the sparse pipeline.
This function tests the behavior of the sparse training pipeline with
automatic mixed-precision (AMP) enabled, using a mock module and data.
AMP allows for faster training by using lower-precision data types, such as
torch.bfloat16, while maintaining model accuracy.
"""
device = torch.device("cpu")
model = MockModule().to(device)