Update speed_check(), add type hints.

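- Print batch performance metrics only when idx is greater than zero, computing them from the current batch instead of accumulating and resetting an example count across batches. - Raise a ValueError instead of a generic Exception when a problem occurs (for example, when imputation for a type is not supported).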
2024-12-23 06:41:49 +01:00 · 2023-04-02 13:11:15 -03:00 · 2023-04-02 13:11:15 -03:00 · 9029e9ee2e
commit 9029e9ee2e
parent 78c3235eee
1 changed file with 56 additions and 65 deletions
--- a/reader/utils.py
+++ b/reader/utils.py
@@ -1,20 +1,17 @@
-"""Reader utilities."""
 import itertools
 import time
 from typing import Optional

-from tml.common.batch import DataclassBatch
-from tml.ml_logging.torch_logging import logging
-
 import pyarrow as pa
 import torch

+from tml.common.batch import DataclassBatch
+from tml.ml_logging.torch_logging import logging
+

 def roundrobin(*iterables):
    """Round robin through provided iterables, useful for simple load balancing.
-
    Adapted from https://docs.python.org/3/library/itertools.html.
-
    """
    num_active = len(iterables)
    nexts = itertools.cycle(iter(it).__next__ for it in iterables)
@@ -24,36 +21,30 @@ def roundrobin(*iterables):
                result = _next()
                yield result
        except StopIteration:
-      # Remove the iterator we just exhausted from the cycle.
            num_active -= 1
            nexts = itertools.cycle(itertools.islice(nexts, num_active))
            logging.warning(f"Iterable exhausted, {num_active} iterables left.")
        except Exception as exc:
            logging.warning(f"Iterable raised exception {exc}, ignoring.")
-      # continue
            raise


 def speed_check(data_loader, max_steps: int, frequency: int, peek: Optional[int]):
-  num_examples = 0
    prev = time.perf_counter()
    for idx, batch in enumerate(data_loader):
        if idx > max_steps:
            break
        if peek and idx % peek == 0:
            logging.info(f"Batch: {batch}")
-    num_examples += batch.batch_size
-    if idx % frequency == 0:
+        if idx % frequency == 0 and idx > 0:
            now = time.perf_counter()
            elapsed = now - prev
+            examples_per_second = batch.batch_size / elapsed
            logging.info(
-        f"step: {idx}, "
-        f"elapsed(s): {elapsed}, "
-        f"examples: {num_examples}, "
-        f"ex/s: {num_examples / elapsed}, "
+                f"step: {idx}, elapsed(s): {elapsed:.2f}, examples: {batch.batch_size}, "
+                f"ex/s: {examples_per_second:.2f}"
            )
            prev = now
-      num_examples = 0


 def pa_to_torch(array: pa.array) -> torch.Tensor:
@@ -71,7 +62,7 @@ def create_default_pa_to_batch(schema) -> DataclassBatch:
            pa.string(): pa.scalar("", type=pa.string()),
        }
        if pa_type not in type_map:
-      raise Exception(f"Imputation for type {pa_type} not supported.")
+            raise ValueError(f"Imputation for type {pa_type} not supported.")
        return type_map[pa_type]

    def _impute(array: pa.array) -> pa.array: