from typing import List, Optional

import tml.core.config as base_config
from tml.optimizers import config as optimizer_config

import pydantic


class EmbeddingSnapshot(base_config.BaseConfig):
  """Configuration for an embedding snapshot."""

  emb_name: str = pydantic.Field(
    ..., description="Name of the embedding table from the loaded snapshot"
  )
  embedding_snapshot_uri: str = pydantic.Field(
    ..., description="Path to torchsnapshot of the embedding"
  )


# https://pytorch.org/torchrec/torchrec.modules.html#torchrec.modules.embedding_configs.EmbeddingBagConfig
class EmbeddingBagConfig(base_config.BaseConfig):
  """Configuration for EmbeddingBag."""

  name: str = pydantic.Field(..., description="name of embedding bag")
  num_embeddings: int = pydantic.Field(..., description="size of embedding dictionary")
  embedding_dim: int = pydantic.Field(..., description="size of each embedding vector")
  pretrained: Optional[EmbeddingSnapshot] = pydantic.Field(
    None, description="Snapshot properties"
  )
  vocab: Optional[str] = pydantic.Field(
    None, description="Directory of parquet files mapping entity ID to table index."
  )
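

# Example: a minimal sketch of a single table declaration, including its pretrained
# snapshot. The table name, sizes, and snapshot URI are hypothetical placeholders.
_EXAMPLE_TABLE = EmbeddingBagConfig(
  name="user_id",
  num_embeddings=1_000_000,
  embedding_dim=128,
  pretrained=EmbeddingSnapshot(
    emb_name="user_id",
    embedding_snapshot_uri="hdfs://some/path/to/snapshot",  # hypothetical path
  ),
)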


class EmbeddingOptimizerConfig(base_config.BaseConfig):
  """Configuration for the optimizer used for embedding tables."""

  learning_rate: Optional[optimizer_config.LearningRate] = pydantic.Field(
    None, description="learning rate scheduler for the EBC"
  )
  init_learning_rate: float = pydantic.Field(..., description="initial learning rate for the EBC")
  # NB: Only SGD is supported right now, and only implicitly: FBGemm supports just the
  # simple exact_sgd, which takes only the learning rate as an argument.


class LargeEmbeddingsConfig(base_config.BaseConfig):
  """Configuration for EmbeddingBagCollection.

  The tables listed in this config are gathered into a single torchrec EmbeddingBagCollection.
  """

  tables: List[EmbeddingBagConfig] = pydantic.Field(..., description="list of embedding tables")
  optimizer: EmbeddingOptimizerConfig
  tables_to_log: Optional[List[str]] = pydantic.Field(
    None, description="list of embedding table names to log during training"
  )
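

# Example: a minimal sketch of a complete large-embeddings config, reusing the table
# sketched above. The learning rate and logged table names are hypothetical.
_EXAMPLE_LARGE_EMBEDDINGS = LargeEmbeddingsConfig(
  tables=[_EXAMPLE_TABLE],
  optimizer=EmbeddingOptimizerConfig(init_learning_rate=0.01),
  tables_to_log=["user_id"],
)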


class StratifierConfig(base_config.BaseConfig):
  """Configuration for Stratifier."""

  name: str
  index: int
  value: int


class SmallEmbeddingBagConfig(base_config.BaseConfig):
  """Configuration for SmallEmbeddingBag."""

  name: str = pydantic.Field(..., description="name of embedding bag")
  num_embeddings: int = pydantic.Field(..., description="size of embedding dictionary")
  embedding_dim: int = pydantic.Field(..., description="size of each embedding vector")
  index: int = pydantic.Field(..., description="index in the discrete tensor to look up")


class SmallEmbeddingsConfig(base_config.BaseConfig):
  """Configuration for a collection of small embedding tables.

  These tables embed discrete features that are already present in the TFRecords produced by
  segdense conversion (such as "home_recap_2022_discrete__segdense_vals"), which are available
  in the model as "discrete_features". A user-defined subset of them can be embedded with
  configurable dimensions and vocabulary sizes.

  Compared with LargeEmbeddingsConfig, this config is for small embedding tables that fit
  inside the model, whereas large embeddings are usually meant to be hydrated outside the
  model at serving time due to their size (>>1 GB).

  These small embedding tables use the same optimizer as the rest of the model.
  """

  tables: List[SmallEmbeddingBagConfig] = pydantic.Field(
    ..., description="list of embedding tables"
  )
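

# Example: a minimal sketch of a small-embeddings config. The feature name, sizes,
# and index position are hypothetical placeholders.
_EXAMPLE_SMALL_EMBEDDINGS = SmallEmbeddingsConfig(
  tables=[
    SmallEmbeddingBagConfig(
      name="device_type",
      num_embeddings=64,
      embedding_dim=8,
      index=0,  # position of this feature in the discrete tensor
    )
  ]
)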