diff --git a/common/checkpointing/snapshot.py b/common/checkpointing/snapshot.py
index 2703efd..c2b88c6 100644
--- a/common/checkpointing/snapshot.py
+++ b/common/checkpointing/snapshot.py
@@ -101,7 +101,7 @@ class Snapshot:
     weight_tensor,
   ) -> None:
     """Loads pretrained embedding from the snapshot to the model.
-    Utilise partial lodaing meachanism from torchsnapshot.
+    Utilise partial loading mechanism from torchsnapshot.
     Args:
       embedding_snapshot: Path to the snapshot containing pretrained embeddings (EBC).
       snapshot_emb_name: Name of the layer in the *snapshot* model, containing the EBC.
diff --git a/core/config/config_load.py b/core/config/config_load.py
index 709da41..e2fac34 100644
--- a/core/config/config_load.py
+++ b/core/config/config_load.py
@@ -11,7 +11,7 @@ def load_config_from_yaml(config_type: Type[BaseConfig], yaml_path: str):
   """Recommend method to load a config file (a yaml file) and parse it.
 
   Because we have a shared filesystem the recommended route to running jobs it put modified config
-  files with the desired parameters somewhere on the filesytem and run jobs pointing to them.
+  files with the desired parameters somewhere on the filesystem and run jobs pointing to them.
   """
 
   def _substitute(s):
diff --git a/core/custom_training_loop.py b/core/custom_training_loop.py
index 0241145..b4d240a 100644
--- a/core/custom_training_loop.py
+++ b/core/custom_training_loop.py
@@ -28,7 +28,7 @@ import torchmetrics as tm
 def get_new_iterator(iterable: Iterable):
   """
-  This obtain a new iterator from the iterable. If the iterable uses tf.data.Dataset internally,
+  This obtains a new iterator from the iterable. If the iterable uses tf.data.Dataset internally,
   getting a new iterator each N steps will avoid memory leak.
   To avoid the memory leak calling iter(iterable) should return a "fresh" iterator using a fresh
   (new instance of) tf.data.Iterator.
@@ -115,7 +115,7 @@ def train(
     dataset: data iterator for the training set
     evaluation_iterators: data iterators for the different evaluation sets
     scheduler: optional learning rate scheduler
-    output_transform_for_metrics: optional transformation functions to transorm the model
+    output_transform_for_metrics: optional transformation functions to transform the model
       output and labels into a format the metrics can understand
   """
diff --git a/core/debug_training_loop.py b/core/debug_training_loop.py
index 610eea9..c7a1129 100644
--- a/core/debug_training_loop.py
+++ b/core/debug_training_loop.py
@@ -1,6 +1,6 @@
 """This is a very limited feature training loop useful for interactive debugging.
 
-It is not intended for actual model tranining (it is not fast, doesn't compile the model).
+It is not intended for actual model training (it is not fast, doesn't compile the model).
 It does not support checkpointing.
 
 suggested use:
diff --git a/projects/home/recap/FEATURES.md b/projects/home/recap/FEATURES.md
index 2fa54ac..595908c 100644
--- a/projects/home/recap/FEATURES.md
+++ b/projects/home/recap/FEATURES.md
@@ -73,7 +73,7 @@ author (real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
 timelines.engagement.is_clicked
 timelines.engagement.is_dont_like
 timelines.engagement.is_dwelled
@@ -112,7 +112,7 @@ original_author (real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
 timelines.engagement.is_clicked
 timelines.engagement.is_dont_like
 timelines.engagement.is_dwelled
@@ -544,7 +544,7 @@ user (real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
 timelines.engagement.is_clicked
 timelines.engagement.is_dont_like
 timelines.engagement.is_dwelled
@@ -585,7 +585,7 @@ user (48h_real_time_v5)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
 timelines.engagement.is_clicked
 timelines.engagement.is_dont_like
 timelines.engagement.is_dwelled
@@ -1422,7 +1422,7 @@ topic (real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
 timelines.engagement.is_clicked
 timelines.engagement.is_dont_like
 timelines.engagement.is_dwelled
@@ -1460,7 +1460,7 @@ topic (24_hour_real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
 timelines.engagement.is_block_clicked
 timelines.engagement.is_clicked
 timelines.engagement.is_dont_like
@@ -1552,7 +1552,7 @@ These features aggregate values corresponding to a tweet.
 tweet (real_time)
-timelines.enagagement.is_retweeted_without_quote
+timelines.engagement.is_retweeted_without_quote
 timelines.engagement.is_clicked
 timelines.engagement.is_dont_like
 timelines.engagement.is_dwelled
@@ -1954,7 +1954,7 @@ recap.tweetfeature.match_ui_lang
 recap.tweetfeature.mention_searcher
 recap.tweetfeature.num_hashtags
 recap.tweetfeature.num_mentions
-recap.tweetfeature.prev_user_tweet_enagagement
+recap.tweetfeature.prev_user_tweet_engagement
 recap.tweetfeature.reply_other
 recap.tweetfeature.reply_searcher
 recap.tweetfeature.retweet_other
@@ -2081,7 +2081,7 @@ in_reply_to_tweet.recap.tweetfeature.is_offensive
 in_reply_to_tweet.recap.tweetfeature.is_reply
 in_reply_to_tweet.recap.tweetfeature.is_sensitive
 in_reply_to_tweet.recap.tweetfeature.num_mentions
-in_reply_to_tweet.recap.tweetfeature.prev_user_tweet_enagagement
+in_reply_to_tweet.recap.tweetfeature.prev_user_tweet_engagement
 in_reply_to_tweet.recap.tweetfeature.unidirectiona_fav_count
 in_reply_to_tweet.recap.tweetfeature.unidirectional_reply_count
 in_reply_to_tweet.recap.tweetfeature.unidirectional_retweet_count
diff --git a/projects/home/recap/data/config.py b/projects/home/recap/data/config.py
index 27ef3ed..81079a9 100644
--- a/projects/home/recap/data/config.py
+++ b/projects/home/recap/data/config.py
@@ -50,7 +50,7 @@ class DatasetConfig(base_config.BaseConfig):
     None, description="Number of shards to keep."
   )
   repeat_files: bool = pydantic.Field(
-    True, description="DEPRICATED. Files are repeated no matter what this is set to."
+    True, description="Deprecated. Files are repeated no matter what this is set to."
   )
   file_batch_size: pydantic.PositiveInt = pydantic.Field(16, description="File batch size")
diff --git a/projects/home/recap/data/dataset.py b/projects/home/recap/data/dataset.py
index 3478c68..16d9d25 100644
--- a/projects/home/recap/data/dataset.py
+++ b/projects/home/recap/data/dataset.py
@@ -47,7 +47,7 @@ def to_batch(x, sparse_feature_names: Optional[List[str]] = None) -> RecapBatch:
   try:
     features_in, labels = x
   except ValueError:
-    # For Mode.INFERENCE, we do not expect to recieve labels as part of the input tuple
+    # For Mode.INFERENCE, we do not expect to receive labels as part of the input tuple
     features_in, labels = x, None
 
   sparse_features = keyed_jagged_tensor_from_tensors_dict({})
@@ -398,7 +398,7 @@ class RecapDataset(torch.utils.data.IterableDataset):
       )
     else:
       raise ValueError(
-        "Must specifiy either `inputs`, `explicit_datetime_inputs`, or `explicit_date_inputs` in data_config"
+        "Must specify either `inputs`, `explicit_datetime_inputs`, or `explicit_date_inputs` in data_config"
       )
 
     num_files = len(filenames)
diff --git a/projects/home/recap/data/tfe_parsing.py b/projects/home/recap/data/tfe_parsing.py
index f597746..07770fd 100644
--- a/projects/home/recap/data/tfe_parsing.py
+++ b/projects/home/recap/data/tfe_parsing.py
@@ -15,7 +15,7 @@ def create_tf_example_schema(
   data_config: recap_data_config.SegDenseSchema,
   segdense_schema,
 ):
-  """Generate schema for deseralizing tf.Example.
+  """Generate schema for deserializing tf.Example.
 
   Args:
     segdense_schema: List of dicts of segdense features (includes feature_name, dtype, length).
@@ -58,12 +58,12 @@ create_tf_example_schema(
 @functools.lru_cache(1)
 def make_mantissa_mask(mask_length: int) -> tf.Tensor:
-  """For experimentating with emulating bfloat16 or less precise types."""
+  """For experimenting with emulating bfloat16 or less precise types."""
   return tf.constant((1 << 32) - (1 << mask_length), dtype=tf.int32)
 
 
 def mask_mantissa(tensor: tf.Tensor, mask_length: int) -> tf.Tensor:
-  """For experimentating with emulating bfloat16 or less precise types."""
+  """For experimenting with emulating bfloat16 or less precise types."""
   mask: tf.Tensor = make_mantissa_mask(mask_length)
   return tf.bitcast(tf.bitwise.bitwise_and(tf.bitcast(tensor, tf.int32), mask), tensor.dtype)
diff --git a/projects/home/recap/model/config.py b/projects/home/recap/model/config.py
index 47d0640..020f737 100644
--- a/projects/home/recap/model/config.py
+++ b/projects/home/recap/model/config.py
@@ -18,7 +18,7 @@ class DropoutConfig(base_config.BaseConfig):
 
 class LayerNormConfig(base_config.BaseConfig):
-  """Configruation for the layer normalization."""
+  """Configuration for the layer normalization."""
 
   epsilon: float = pydantic.Field(
     1e-3, description="Small float added to variance to avoid dividing by zero."
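
Note on the tfe_parsing.py hunk above: the two helpers it touches emulate lower-precision floats by zeroing low-order mantissa bits. A minimal sketch of the trick, assuming mask_length=16 (which keeps the sign bit, the 8 exponent bits, and the top 7 mantissa bits that bfloat16 retains); the input value is illustrative, not from the repo:

import tensorflow as tf

mask_length = 16
# Same bit pattern as the repo's (1 << 32) - (1 << mask_length), i.e. 0xFFFF0000,
# written in signed form so the Python int fits in an int32.
mask = tf.constant(-(1 << mask_length), dtype=tf.int32)

x = tf.constant([3.1415927], dtype=tf.float32)
# Reinterpret the float bits as int32, clear the low mantissa bits, reinterpret back.
truncated = tf.bitcast(tf.bitwise.bitwise_and(tf.bitcast(x, tf.int32), mask), tf.float32)
print(truncated.numpy()[0])  # 3.140625: bfloat16-level precision, dtype still float32

The tensor stays float32 end to end; only its effective precision drops, which is why the docstrings frame these helpers as experimentation tools.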
diff --git a/projects/twhin/data/edges.py b/projects/twhin/data/edges.py
index f7864b1..42aebfb 100644
--- a/projects/twhin/data/edges.py
+++ b/projects/twhin/data/edges.py
@@ -96,7 +96,7 @@ class EdgesDataset(Dataset):
 
     Returns a KeyedJaggedTensor used to look up all embeddings.
 
-    Note: We treat the lhs and rhs as though they're separate lookups: `len(lenghts) == 2 * bsz * len(tables)`.
+    Note: We treat the lhs and rhs as though they're separate lookups: `len(lengths) == 2 * bsz * len(tables)`.
     This differs from the DLRM pattern where we have `len(lengths) = bsz * len(tables)`.
 
     For the example above:
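
The note in this last hunk is easiest to see with a toy KeyedJaggedTensor. A hypothetical sketch (the table names, ids, and batch size below are invented for illustration, not taken from EdgesDataset):

import torch
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

bsz = 2
tables = ["user", "author"]

# lhs and rhs are treated as separate lookups, so the effective batch doubles:
# lengths holds 2 * bsz * len(tables) = 8 entries instead of DLRM's bsz * len(tables) = 4.
kjt = KeyedJaggedTensor(
  keys=tables,
  values=torch.tensor([10, 11, 20, 21]),  # illustrative embedding ids, one per nonzero length
  lengths=torch.tensor([1, 0, 1, 0, 0, 1, 0, 1]),
)
assert kjt.lengths().numel() == 2 * bsz * len(tables)

Under this toy layout each table's slice of lengths covers 2 * bsz slots rather than bsz, which is exactly where the extra factor of 2 in the docstring comes from.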