# Mirror of https://github.com/twitter/the-algorithm-ml.git
import functools
import json

from tml.projects.home.recap.data import config as recap_data_config

from absl import logging
import tensorflow as tf


DEFAULTS_MAP = {"int64_list": 0, "float_list": 0.0, "bytes_list": ""}
DTYPE_MAP = {"int64_list": tf.int64, "float_list": tf.float32, "bytes_list": tf.string}

def create_tf_example_schema(
  data_config: recap_data_config.RecapDataConfig,
  segdense_schema,
):
  """Generate a schema for deserializing tf.Example.

  Args:
    data_config: Recap data config; its seg dense schema and task names determine which features are kept.
    segdense_schema: List of dicts of segdense features (includes feature_name, dtype, length).

  Returns:
    A dictionary schema suitable for deserializing tf.Example.
  """
  segdense_config = data_config.seg_dense_schema
  labels = list(data_config.tasks.keys())
  used_features = (
    segdense_config.features + list(segdense_config.renamed_features.values()) + labels
  )
  logging.info(used_features)

  tfe_schema = {}
  for entry in segdense_schema:
    feature_name = entry["feature_name"]

    if feature_name in used_features:
      length = entry["length"]
      dtype = entry["dtype"]

      if feature_name in labels:
        logging.info(f"Label: feature name is {feature_name} type is {dtype}")
        tfe_schema[feature_name] = tf.io.FixedLenFeature(
          length, DTYPE_MAP[dtype], DEFAULTS_MAP[dtype]
        )
      elif length == -1:
        tfe_schema[feature_name] = tf.io.VarLenFeature(DTYPE_MAP[dtype])
      else:
        tfe_schema[feature_name] = tf.io.FixedLenFeature(
          length, DTYPE_MAP[dtype], [DEFAULTS_MAP[dtype]] * length
        )
  for feature_name in used_features:
    if feature_name not in tfe_schema:
      raise ValueError(f"{feature_name} missing from schema: {segdense_config.schema_path}.")
  return tfe_schema

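# Illustrative sketch (not part of the original module; the feature names and
# entries are hypothetical): given a segdense entry list such as
#   [{"feature_name": "user.age", "dtype": "int64_list", "length": 1},
#    {"feature_name": "tweet.embedding", "dtype": "float_list", "length": -1}]
# where both names appear in the config's used features and neither is a task
# label, create_tf_example_schema would map "user.age" to
# tf.io.FixedLenFeature(1, tf.int64, [0]) and "tweet.embedding" to
# tf.io.VarLenFeature(tf.float32), since length == -1 selects the
# variable-length branch above.
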
@functools.lru_cache(1)
def make_mantissa_mask(mask_length: int) -> tf.Tensor:
  """For experimenting with emulating bfloat16 or less precise types."""
  # int32 bit pattern with the low `mask_length` bits cleared; -(1 << mask_length)
  # has the same 32-bit pattern as (1 << 32) - (1 << mask_length)
  # (e.g. 0xFFFF0000 for mask_length=16) while staying within the int32 range.
  return tf.constant(-(1 << mask_length), dtype=tf.int32)


def mask_mantissa(tensor: tf.Tensor, mask_length: int) -> tf.Tensor:
  """For experimenting with emulating bfloat16 or less precise types."""
  # Reinterpret the float bits as int32, zero the low `mask_length` mantissa bits,
  # and reinterpret back to the original dtype.
  mask: tf.Tensor = make_mantissa_mask(mask_length)
  return tf.bitcast(tf.bitwise.bitwise_and(tf.bitcast(tensor, tf.int32), mask), tensor.dtype)

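# Illustrative sketch (not part of the original module): masking the low 16 bits
# of a float32 keeps 7 mantissa bits, roughly emulating bfloat16 precision, e.g.
#   x = tf.constant([1.2345678], tf.float32)
#   mask_mantissa(x, 16)  # -> approximately [1.234375]
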
def parse_tf_example(
  serialized_example,
  tfe_schema,
  seg_dense_schema_config,
):
  """Parse a serialized tf.Example into a dict of tensors.

  Args:
    serialized_example: Serialized tf.Example to be parsed.
    tfe_schema: Dictionary schema suitable for deserializing tf.Example.
    seg_dense_schema_config: Seg dense schema config providing renamed_features and, optionally, mask_mantissa_features.

  Returns:
    Dictionary of tensors to be used as model input.
  """
  inputs = tf.io.parse_example(serialized=serialized_example, features=tfe_schema)

  for new_feature_name, old_feature_name in seg_dense_schema_config.renamed_features.items():
    inputs[new_feature_name] = inputs.pop(old_feature_name)

  # This should not actually be used except for experimentation with low precision floats.
  if "mask_mantissa_features" in seg_dense_schema_config:
    for feature_name, mask_length in seg_dense_schema_config.mask_mantissa_features.items():
      inputs[feature_name] = mask_mantissa(inputs[feature_name], mask_length)

  # DANGER DANGER: This default seems really scary, and it's only here because it
  # has to be visible at the TF level.
  # We should not return empty tensors if we don't use embeddings.
  # Otherwise, it breaks the numpy -> pt conversion.
  renamed_keys = list(seg_dense_schema_config.renamed_features.keys())
  for renamed_key in renamed_keys:
    if "embedding" in renamed_key and (renamed_key not in inputs):
      inputs[renamed_key] = tf.zeros([], tf.float32)

  logging.info(f"parsed example and inputs are {inputs}")
  return inputs

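# Illustrative sketch (not part of the original module; the feature names are
# hypothetical): with renamed_features = {"author_embedding": "original__author_embedding"},
# a parsed feature stored as "original__author_embedding" is returned under the
# key "author_embedding", and any renamed key containing "embedding" that is
# still missing after parsing is filled with a scalar tf.zeros([], tf.float32)
# placeholder so the downstream numpy -> torch conversion sees a tensor for it.
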
def get_seg_dense_parse_fn(data_config: recap_data_config.RecapDataConfig):
  """Build the parse function for seg dense data.

  In the future, when we use more seg dense variations, we can change this.
  """
  with tf.io.gfile.GFile(data_config.seg_dense_schema.schema_path, "r") as f:
    seg_dense_schema = json.load(f)["schema"]

  tf_example_schema = create_tf_example_schema(
    data_config,
    seg_dense_schema,
  )

  logging.info("***** TF Example Schema *****")
  logging.info(tf_example_schema)

  parse = functools.partial(
    parse_tf_example,
    tfe_schema=tf_example_schema,
    seg_dense_schema_config=data_config.seg_dense_schema,
  )
  return parse

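# Illustrative usage sketch (not part of the original module; the TFRecord path
# and data_config are hypothetical):
#   parse_fn = get_seg_dense_parse_fn(data_config)
#   dataset = (
#     tf.data.TFRecordDataset(["/path/to/examples.tfrecord"])
#     .batch(1024)
#     .map(parse_fn, num_parallel_calls=tf.data.AUTOTUNE)
#   )
#   for batch in dataset.take(1):
#     print({name: tensor.shape for name, tensor in batch.items()})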