diff --git a/common/checkpointing/snapshot.py b/common/checkpointing/snapshot.py index 2703efd..c2b88c6 100644 --- a/common/checkpointing/snapshot.py +++ b/common/checkpointing/snapshot.py @@ -101,7 +101,7 @@ class Snapshot: weight_tensor, ) -> None: """Loads pretrained embedding from the snapshot to the model. - Utilise partial lodaing meachanism from torchsnapshot. + Utilise partial loading mechanism from torchsnapshot. Args: embedding_snapshot: Path to the snapshot containing pretrained embeddings (EBC). snapshot_emb_name: Name of the layer in the *snapshot* model, containing the EBC. diff --git a/core/config/config_load.py b/core/config/config_load.py index 709da41..e2fac34 100644 --- a/core/config/config_load.py +++ b/core/config/config_load.py @@ -11,7 +11,7 @@ def load_config_from_yaml(config_type: Type[BaseConfig], yaml_path: str): """Recommend method to load a config file (a yaml file) and parse it. Because we have a shared filesystem the recommended route to running jobs it put modified config - files with the desired parameters somewhere on the filesytem and run jobs pointing to them. + files with the desired parameters somewhere on the filesystem and run jobs pointing to them. """ def _substitute(s): diff --git a/core/custom_training_loop.py b/core/custom_training_loop.py index 0241145..b4d240a 100644 --- a/core/custom_training_loop.py +++ b/core/custom_training_loop.py @@ -28,7 +28,7 @@ import torchmetrics as tm def get_new_iterator(iterable: Iterable): """ - This obtain a new iterator from the iterable. If the iterable uses tf.data.Dataset internally, + This obtains a new iterator from the iterable. If the iterable uses tf.data.Dataset internally, getting a new iterator each N steps will avoid memory leak. To avoid the memory leak calling iter(iterable) should return a "fresh" iterator using a fresh (new instance of) tf.data.Iterator. 
@@ -115,7 +115,7 @@ def train( dataset: data iterator for the training set evaluation_iterators: data iterators for the different evaluation sets scheduler: optional learning rate scheduler - output_transform_for_metrics: optional transformation functions to transorm the model + output_transform_for_metrics: optional transformation functions to transform the model output and labels into a format the metrics can understand """ diff --git a/core/debug_training_loop.py b/core/debug_training_loop.py index 610eea9..c7a1129 100644 --- a/core/debug_training_loop.py +++ b/core/debug_training_loop.py @@ -1,6 +1,6 @@ """This is a very limited feature training loop useful for interactive debugging. -It is not intended for actual model tranining (it is not fast, doesn't compile the model). +It is not intended for actual model training (it is not fast, doesn't compile the model). It does not support checkpointing. suggested use: diff --git a/projects/home/recap/FEATURES.md b/projects/home/recap/FEATURES.md index 2fa54ac..595908c 100644 --- a/projects/home/recap/FEATURES.md +++ b/projects/home/recap/FEATURES.md @@ -73,7 +73,7 @@ author (real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
timelines.engagement.is_clicked
timelines.engagement.is_dont_like
timelines.engagement.is_dwelled
@@ -112,7 +112,7 @@ original_author (real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
timelines.engagement.is_clicked
timelines.engagement.is_dont_like
timelines.engagement.is_dwelled
@@ -544,7 +544,7 @@ user (real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
timelines.engagement.is_clicked
timelines.engagement.is_dont_like
timelines.engagement.is_dwelled
@@ -585,7 +585,7 @@ user (48h_real_time_v5)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
timelines.engagement.is_clicked
timelines.engagement.is_dont_like
timelines.engagement.is_dwelled
@@ -1422,7 +1422,7 @@ topic (real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
timelines.engagement.is_clicked
timelines.engagement.is_dont_like
timelines.engagement.is_dwelled
@@ -1460,7 +1460,7 @@ topic (24_hour_real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
timelines.engagement.is_block_clicked
timelines.engagement.is_clicked
timelines.engagement.is_dont_like
@@ -1552,7 +1552,7 @@ These features aggregate values corresponding to a tweet.
tweet (real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
timelines.engagement.is_clicked
timelines.engagement.is_dont_like
timelines.engagement.is_dwelled
@@ -1954,7 +1954,7 @@ recap.tweetfeature.match_ui_lang
recap.tweetfeature.mention_searcher
recap.tweetfeature.num_hashtags
recap.tweetfeature.num_mentions
-recap.tweetfeature.prev_user_tweet_enagagement
+recap.tweetfeature.prev_user_tweet_engagement
recap.tweetfeature.reply_other
recap.tweetfeature.reply_searcher
recap.tweetfeature.retweet_other
@@ -2081,7 +2081,7 @@ in_reply_to_tweet.recap.tweetfeature.is_offensive
in_reply_to_tweet.recap.tweetfeature.is_reply
in_reply_to_tweet.recap.tweetfeature.is_sensitive
in_reply_to_tweet.recap.tweetfeature.num_mentions
-in_reply_to_tweet.recap.tweetfeature.prev_user_tweet_enagagement
+in_reply_to_tweet.recap.tweetfeature.prev_user_tweet_engagement
in_reply_to_tweet.recap.tweetfeature.unidirectiona_fav_count
in_reply_to_tweet.recap.tweetfeature.unidirectional_reply_count
in_reply_to_tweet.recap.tweetfeature.unidirectional_retweet_count
diff --git a/projects/home/recap/data/config.py b/projects/home/recap/data/config.py
index 27ef3ed..81079a9 100644
--- a/projects/home/recap/data/config.py
+++ b/projects/home/recap/data/config.py
@@ -50,7 +50,7 @@ class DatasetConfig(base_config.BaseConfig):
None, description="Number of shards to keep."
)
repeat_files: bool = pydantic.Field(
- True, description="DEPRICATED. Files are repeated no matter what this is set to."
+ True, description="Deprecated. Files are repeated no matter what this is set to."
)
file_batch_size: pydantic.PositiveInt = pydantic.Field(16, description="File batch size")
diff --git a/projects/home/recap/data/dataset.py b/projects/home/recap/data/dataset.py
index 3478c68..16d9d25 100644
--- a/projects/home/recap/data/dataset.py
+++ b/projects/home/recap/data/dataset.py
@@ -47,7 +47,7 @@ def to_batch(x, sparse_feature_names: Optional[List[str]] = None) -> RecapBatch:
try:
features_in, labels = x
except ValueError:
- # For Mode.INFERENCE, we do not expect to recieve labels as part of the input tuple
+ # For Mode.INFERENCE, we do not expect to receive labels as part of the input tuple
features_in, labels = x, None
sparse_features = keyed_jagged_tensor_from_tensors_dict({})
@@ -398,7 +398,7 @@ class RecapDataset(torch.utils.data.IterableDataset):
)
else:
raise ValueError(
- "Must specifiy either `inputs`, `explicit_datetime_inputs`, or `explicit_date_inputs` in data_config"
+ "Must specify either `inputs`, `explicit_datetime_inputs`, or `explicit_date_inputs` in data_config"
)
num_files = len(filenames)
diff --git a/projects/home/recap/data/tfe_parsing.py b/projects/home/recap/data/tfe_parsing.py
index f597746..07770fd 100644
--- a/projects/home/recap/data/tfe_parsing.py
+++ b/projects/home/recap/data/tfe_parsing.py
@@ -15,7 +15,7 @@ def create_tf_example_schema(
data_config: recap_data_config.SegDenseSchema,
segdense_schema,
):
- """Generate schema for deseralizing tf.Example.
+ """Generate schema for deserializing tf.Example.
Args:
segdense_schema: List of dicts of segdense features (includes feature_name, dtype, length).
@@ -58,12 +58,12 @@ def create_tf_example_schema(
@functools.lru_cache(1)
def make_mantissa_mask(mask_length: int) -> tf.Tensor:
- """For experimentating with emulating bfloat16 or less precise types."""
+ """For experimenting with emulating bfloat16 or less precise types."""
return tf.constant((1 << 32) - (1 << mask_length), dtype=tf.int32)
def mask_mantissa(tensor: tf.Tensor, mask_length: int) -> tf.Tensor:
- """For experimentating with emulating bfloat16 or less precise types."""
+ """For experimenting with emulating bfloat16 or less precise types."""
mask: tf.Tensor = make_mantissa_mask(mask_length)
return tf.bitcast(tf.bitwise.bitwise_and(tf.bitcast(tensor, tf.int32), mask), tensor.dtype)
diff --git a/projects/home/recap/model/config.py b/projects/home/recap/model/config.py
index 47d0640..020f737 100644
--- a/projects/home/recap/model/config.py
+++ b/projects/home/recap/model/config.py
@@ -18,7 +18,7 @@ class DropoutConfig(base_config.BaseConfig):
class LayerNormConfig(base_config.BaseConfig):
- """Configruation for the layer normalization."""
+ """Configuration for the layer normalization."""
epsilon: float = pydantic.Field(
1e-3, description="Small float added to variance to avoid dividing by zero."
diff --git a/projects/twhin/data/edges.py b/projects/twhin/data/edges.py
index f7864b1..42aebfb 100644
--- a/projects/twhin/data/edges.py
+++ b/projects/twhin/data/edges.py
@@ -96,7 +96,7 @@ class EdgesDataset(Dataset):
Returns a KeyedJaggedTensor used to look up all embeddings.
- Note: We treat the lhs and rhs as though they're separate lookups: `len(lenghts) == 2 * bsz * len(tables)`.
+ Note: We treat the lhs and rhs as though they're separate lookups: `len(lengths) == 2 * bsz * len(tables)`.
This differs from the DLRM pattern where we have `len(lengths) = bsz * len(tables)`.
For the example above: