mirror of
https://github.com/twitter/the-algorithm-ml.git
synced 2025-01-09 22:39:22 +01:00
Fix additional typos in various comments/docs
This commit is contained in:
parent
974d6458af
commit
cb1ff279f2
@ -101,7 +101,7 @@ class Snapshot:
|
|||||||
weight_tensor,
|
weight_tensor,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Loads pretrained embedding from the snapshot to the model.
|
"""Loads pretrained embedding from the snapshot to the model.
|
||||||
Utilise partial lodaing meachanism from torchsnapshot.
|
Utilise partial loading mechanism from torchsnapshot.
|
||||||
Args:
|
Args:
|
||||||
embedding_snapshot: Path to the snapshot containing pretrained embeddings (EBC).
|
embedding_snapshot: Path to the snapshot containing pretrained embeddings (EBC).
|
||||||
snapshot_emb_name: Name of the layer in the *snapshot* model, containing the EBC.
|
snapshot_emb_name: Name of the layer in the *snapshot* model, containing the EBC.
|
||||||
|
@ -11,7 +11,7 @@ def load_config_from_yaml(config_type: Type[BaseConfig], yaml_path: str):
|
|||||||
"""Recommend method to load a config file (a yaml file) and parse it.
|
"""Recommend method to load a config file (a yaml file) and parse it.
|
||||||
|
|
||||||
Because we have a shared filesystem the recommended route to running jobs it put modified config
|
Because we have a shared filesystem the recommended route to running jobs is to put modified config
|
||||||
files with the desired parameters somewhere on the filesytem and run jobs pointing to them.
|
files with the desired parameters somewhere on the filesystem and run jobs pointing to them.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _substitute(s):
|
def _substitute(s):
|
||||||
|
@ -28,7 +28,7 @@ import torchmetrics as tm
|
|||||||
|
|
||||||
def get_new_iterator(iterable: Iterable):
|
def get_new_iterator(iterable: Iterable):
|
||||||
"""
|
"""
|
||||||
This obtain a new iterator from the iterable. If the iterable uses tf.data.Dataset internally,
|
This obtains a new iterator from the iterable. If the iterable uses tf.data.Dataset internally,
|
||||||
getting a new iterator each N steps will avoid memory leak. To avoid the memory leak
|
getting a new iterator each N steps will avoid a memory leak. To avoid the memory leak
|
||||||
calling iter(iterable) should return a "fresh" iterator using a fresh
|
calling iter(iterable) should return a "fresh" iterator using a fresh
|
||||||
(new instance of) tf.data.Iterator.
|
(new instance of) tf.data.Iterator.
|
||||||
@ -115,7 +115,7 @@ def train(
|
|||||||
dataset: data iterator for the training set
|
dataset: data iterator for the training set
|
||||||
evaluation_iterators: data iterators for the different evaluation sets
|
evaluation_iterators: data iterators for the different evaluation sets
|
||||||
scheduler: optional learning rate scheduler
|
scheduler: optional learning rate scheduler
|
||||||
output_transform_for_metrics: optional transformation functions to transorm the model
|
output_transform_for_metrics: optional transformation functions to transform the model
|
||||||
output and labels into a format the metrics can understand
|
output and labels into a format the metrics can understand
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
"""This is a very limited feature training loop useful for interactive debugging.
|
"""This is a very limited feature training loop useful for interactive debugging.
|
||||||
|
|
||||||
It is not intended for actual model tranining (it is not fast, doesn't compile the model).
|
It is not intended for actual model training (it is not fast, doesn't compile the model).
|
||||||
It does not support checkpointing.
|
It does not support checkpointing.
|
||||||
|
|
||||||
suggested use:
|
suggested use:
|
||||||
|
@ -73,7 +73,7 @@ author (real_time)
|
|||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
<code>
|
<code>
|
||||||
timelines.enagagement.is_retweeted_without_quote <br>
|
timelines.engagement.is_retweeted_without_quote <br>
|
||||||
timelines.engagement.is_clicked <br>
|
timelines.engagement.is_clicked <br>
|
||||||
timelines.engagement.is_dont_like <br>
|
timelines.engagement.is_dont_like <br>
|
||||||
timelines.engagement.is_dwelled <br>
|
timelines.engagement.is_dwelled <br>
|
||||||
@ -112,7 +112,7 @@ original_author (real_time)
|
|||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
<code>
|
<code>
|
||||||
timelines.enagagement.is_retweeted_without_quote <br>
|
timelines.engagement.is_retweeted_without_quote <br>
|
||||||
timelines.engagement.is_clicked <br>
|
timelines.engagement.is_clicked <br>
|
||||||
timelines.engagement.is_dont_like <br>
|
timelines.engagement.is_dont_like <br>
|
||||||
timelines.engagement.is_dwelled <br>
|
timelines.engagement.is_dwelled <br>
|
||||||
@ -544,7 +544,7 @@ user (real_time)
|
|||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
<code>
|
<code>
|
||||||
timelines.enagagement.is_retweeted_without_quote<br>
|
timelines.engagement.is_retweeted_without_quote<br>
|
||||||
timelines.engagement.is_clicked<br>
|
timelines.engagement.is_clicked<br>
|
||||||
timelines.engagement.is_dont_like<br>
|
timelines.engagement.is_dont_like<br>
|
||||||
timelines.engagement.is_dwelled<br>
|
timelines.engagement.is_dwelled<br>
|
||||||
@ -585,7 +585,7 @@ user (48h_real_time_v5)
|
|||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
<code>
|
<code>
|
||||||
timelines.enagagement.is_retweeted_without_quote<br>
|
timelines.engagement.is_retweeted_without_quote<br>
|
||||||
timelines.engagement.is_clicked<br>
|
timelines.engagement.is_clicked<br>
|
||||||
timelines.engagement.is_dont_like<br>
|
timelines.engagement.is_dont_like<br>
|
||||||
timelines.engagement.is_dwelled<br>
|
timelines.engagement.is_dwelled<br>
|
||||||
@ -1422,7 +1422,7 @@ topic (real_time)
|
|||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
<code>
|
<code>
|
||||||
timelines.enagagement.is_retweeted_without_quote <br>
|
timelines.engagement.is_retweeted_without_quote <br>
|
||||||
timelines.engagement.is_clicked <br>
|
timelines.engagement.is_clicked <br>
|
||||||
timelines.engagement.is_dont_like <br>
|
timelines.engagement.is_dont_like <br>
|
||||||
timelines.engagement.is_dwelled <br>
|
timelines.engagement.is_dwelled <br>
|
||||||
@ -1460,7 +1460,7 @@ topic (24_hour_real_time)
|
|||||||
</code>
|
</code>
|
||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
<code>timelines.enagagement.is_retweeted_without_quote<br>
|
<code>timelines.engagement.is_retweeted_without_quote<br>
|
||||||
timelines.engagement.is_block_clicked<br>
|
timelines.engagement.is_block_clicked<br>
|
||||||
timelines.engagement.is_clicked<br>
|
timelines.engagement.is_clicked<br>
|
||||||
timelines.engagement.is_dont_like<br>
|
timelines.engagement.is_dont_like<br>
|
||||||
@ -1552,7 +1552,7 @@ These features aggregate values corresponding to a tweet.
|
|||||||
<tr>
|
<tr>
|
||||||
<td><code>tweet (real_time)</code></td>
|
<td><code>tweet (real_time)</code></td>
|
||||||
<td><code>
|
<td><code>
|
||||||
timelines.enagagement.is_retweeted_without_quote<br>
|
timelines.engagement.is_retweeted_without_quote<br>
|
||||||
timelines.engagement.is_clicked<br>
|
timelines.engagement.is_clicked<br>
|
||||||
timelines.engagement.is_dont_like<br>
|
timelines.engagement.is_dont_like<br>
|
||||||
timelines.engagement.is_dwelled<br>
|
timelines.engagement.is_dwelled<br>
|
||||||
@ -1954,7 +1954,7 @@ recap.tweetfeature.match_ui_lang <br>
|
|||||||
recap.tweetfeature.mention_searcher <br>
|
recap.tweetfeature.mention_searcher <br>
|
||||||
recap.tweetfeature.num_hashtags <br>
|
recap.tweetfeature.num_hashtags <br>
|
||||||
recap.tweetfeature.num_mentions <br>
|
recap.tweetfeature.num_mentions <br>
|
||||||
recap.tweetfeature.prev_user_tweet_enagagement <br>
|
recap.tweetfeature.prev_user_tweet_engagement <br>
|
||||||
recap.tweetfeature.reply_other <br>
|
recap.tweetfeature.reply_other <br>
|
||||||
recap.tweetfeature.reply_searcher <br>
|
recap.tweetfeature.reply_searcher <br>
|
||||||
recap.tweetfeature.retweet_other <br>
|
recap.tweetfeature.retweet_other <br>
|
||||||
@ -2081,7 +2081,7 @@ in_reply_to_tweet.recap.tweetfeature.is_offensive <br>
|
|||||||
in_reply_to_tweet.recap.tweetfeature.is_reply <br>
|
in_reply_to_tweet.recap.tweetfeature.is_reply <br>
|
||||||
in_reply_to_tweet.recap.tweetfeature.is_sensitive <br>
|
in_reply_to_tweet.recap.tweetfeature.is_sensitive <br>
|
||||||
in_reply_to_tweet.recap.tweetfeature.num_mentions <br>
|
in_reply_to_tweet.recap.tweetfeature.num_mentions <br>
|
||||||
in_reply_to_tweet.recap.tweetfeature.prev_user_tweet_enagagement <br>
|
in_reply_to_tweet.recap.tweetfeature.prev_user_tweet_engagement <br>
|
||||||
in_reply_to_tweet.recap.tweetfeature.unidirectiona_fav_count <br>
|
in_reply_to_tweet.recap.tweetfeature.unidirectiona_fav_count <br>
|
||||||
in_reply_to_tweet.recap.tweetfeature.unidirectional_reply_count <br>
|
in_reply_to_tweet.recap.tweetfeature.unidirectional_reply_count <br>
|
||||||
in_reply_to_tweet.recap.tweetfeature.unidirectional_retweet_count <br>
|
in_reply_to_tweet.recap.tweetfeature.unidirectional_retweet_count <br>
|
||||||
|
@ -50,7 +50,7 @@ class DatasetConfig(base_config.BaseConfig):
|
|||||||
None, description="Number of shards to keep."
|
None, description="Number of shards to keep."
|
||||||
)
|
)
|
||||||
repeat_files: bool = pydantic.Field(
|
repeat_files: bool = pydantic.Field(
|
||||||
True, description="DEPRICATED. Files are repeated no matter what this is set to."
|
True, description="Deprecated. Files are repeated no matter what this is set to."
|
||||||
)
|
)
|
||||||
file_batch_size: pydantic.PositiveInt = pydantic.Field(16, description="File batch size")
|
file_batch_size: pydantic.PositiveInt = pydantic.Field(16, description="File batch size")
|
||||||
|
|
||||||
|
@ -47,7 +47,7 @@ def to_batch(x, sparse_feature_names: Optional[List[str]] = None) -> RecapBatch:
|
|||||||
try:
|
try:
|
||||||
features_in, labels = x
|
features_in, labels = x
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# For Mode.INFERENCE, we do not expect to recieve labels as part of the input tuple
|
# For Mode.INFERENCE, we do not expect to receive labels as part of the input tuple
|
||||||
features_in, labels = x, None
|
features_in, labels = x, None
|
||||||
|
|
||||||
sparse_features = keyed_jagged_tensor_from_tensors_dict({})
|
sparse_features = keyed_jagged_tensor_from_tensors_dict({})
|
||||||
@ -398,7 +398,7 @@ class RecapDataset(torch.utils.data.IterableDataset):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Must specifiy either `inputs`, `explicit_datetime_inputs`, or `explicit_date_inputs` in data_config"
|
"Must specify either `inputs`, `explicit_datetime_inputs`, or `explicit_date_inputs` in data_config"
|
||||||
)
|
)
|
||||||
|
|
||||||
num_files = len(filenames)
|
num_files = len(filenames)
|
||||||
|
@ -15,7 +15,7 @@ def create_tf_example_schema(
|
|||||||
data_config: recap_data_config.SegDenseSchema,
|
data_config: recap_data_config.SegDenseSchema,
|
||||||
segdense_schema,
|
segdense_schema,
|
||||||
):
|
):
|
||||||
"""Generate schema for deseralizing tf.Example.
|
"""Generate schema for deserializing tf.Example.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
segdense_schema: List of dicts of segdense features (includes feature_name, dtype, length).
|
segdense_schema: List of dicts of segdense features (includes feature_name, dtype, length).
|
||||||
@ -58,12 +58,12 @@ def create_tf_example_schema(
|
|||||||
|
|
||||||
@functools.lru_cache(1)
|
@functools.lru_cache(1)
|
||||||
def make_mantissa_mask(mask_length: int) -> tf.Tensor:
|
def make_mantissa_mask(mask_length: int) -> tf.Tensor:
|
||||||
"""For experimentating with emulating bfloat16 or less precise types."""
|
"""For experimenting with emulating bfloat16 or less precise types."""
|
||||||
return tf.constant((1 << 32) - (1 << mask_length), dtype=tf.int32)
|
return tf.constant((1 << 32) - (1 << mask_length), dtype=tf.int32)
|
||||||
|
|
||||||
|
|
||||||
def mask_mantissa(tensor: tf.Tensor, mask_length: int) -> tf.Tensor:
|
def mask_mantissa(tensor: tf.Tensor, mask_length: int) -> tf.Tensor:
|
||||||
"""For experimentating with emulating bfloat16 or less precise types."""
|
"""For experimenting with emulating bfloat16 or less precise types."""
|
||||||
mask: tf.Tensor = make_mantissa_mask(mask_length)
|
mask: tf.Tensor = make_mantissa_mask(mask_length)
|
||||||
return tf.bitcast(tf.bitwise.bitwise_and(tf.bitcast(tensor, tf.int32), mask), tensor.dtype)
|
return tf.bitcast(tf.bitwise.bitwise_and(tf.bitcast(tensor, tf.int32), mask), tensor.dtype)
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ class DropoutConfig(base_config.BaseConfig):
|
|||||||
|
|
||||||
|
|
||||||
class LayerNormConfig(base_config.BaseConfig):
|
class LayerNormConfig(base_config.BaseConfig):
|
||||||
"""Configruation for the layer normalization."""
|
"""Configuration for the layer normalization."""
|
||||||
|
|
||||||
epsilon: float = pydantic.Field(
|
epsilon: float = pydantic.Field(
|
||||||
1e-3, description="Small float added to variance to avoid dividing by zero."
|
1e-3, description="Small float added to variance to avoid dividing by zero."
|
||||||
|
@ -96,7 +96,7 @@ class EdgesDataset(Dataset):
|
|||||||
|
|
||||||
Returns a KeyedJaggedTensor used to look up all embeddings.
|
Returns a KeyedJaggedTensor used to look up all embeddings.
|
||||||
|
|
||||||
Note: We treat the lhs and rhs as though they're separate lookups: `len(lenghts) == 2 * bsz * len(tables)`.
|
Note: We treat the lhs and rhs as though they're separate lookups: `len(lengths) == 2 * bsz * len(tables)`.
|
||||||
This differs from the DLRM pattern where we have `len(lengths) = bsz * len(tables)`.
|
This differs from the DLRM pattern where we have `len(lengths) == bsz * len(tables)`.
|
||||||
|
|
||||||
For the example above:
|
For the example above:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user