Merge d2da7e56ab into 72eda9a24f

2025-01-02 23:51:53 +01:00 · 2023-07-17 21:38:12 -05:00 · 2023-07-17 21:38:12 -05:00 · b4f0518ee5
commit b4f0518ee5
parent 72eda9a24f d2da7e56ab
1 changed file with 68 additions and 67 deletions
--- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py
+++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py
@@ -3,89 +3,90 @@ from .parsers import LollyModelFeaturesParser

 class TFModelInitializerBuilder:

-  def __init__(self, model_features_parser=LollyModelFeaturesParser()):
-    self._model_features_parser = model_features_parser
+    def __init__(self, model_features_parser=LollyModelFeaturesParser()):
+        self._model_features_parser = model_features_parser

-  def build(self, lolly_model_reader):
-    '''
-    :param lolly_model_reader: LollyModelReader instance
-    :return: tf_model_initializer dictionary of the following format:
-      {
-        "features": {
-          "bias": 0.0,
-          "binary": {
-            # (feature name : feature weight) pairs
-            "feature_name_1": 0.0,
-            ...
-            "feature_nameN": 0.0
-          },
-          "discretized": {
-            # (feature name : index aligned lists of bin_boundaries and weights
-            "feature_name_1": {
-              "bin_boundaries": [1, ..., inf],
-              "weights": [0.0, ..., 0.0]
-            }
-            ...
-            "feature_name_K": {
-              "bin_boundaries": [1, ..., inf],
-              "weights": [0.0, ..., 0.0]
+    def build(self, lolly_model_reader):
+        '''
+        :param lolly_model_reader: LollyModelReader instance
+        :return: tf_model_initializer dictionary of the following format:
+          {
+            "features": {
+              "bias": 0.0,
+              "binary": {
+                # (feature name : feature weight) pairs
+                "feature_name_1": 0.0,
+                ...
+                "feature_nameN": 0.0
+              },
+              "discretized": {
+                # (feature name : index aligned lists of bin_boundaries and weights
+                "feature_name_1": {
+                  "bin_boundaries": [1, ..., inf],
+                  "weights": [0.0, ..., 0.0]
+                }
+                ...
+                "feature_name_K": {
+                  "bin_boundaries": [1, ..., inf],
+                  "weights": [0.0, ..., 0.0]
+                }
+              }
            }
          }
+        '''
+        tf_model_initializer = {
+            "features": {}
        }
-      }
-    '''
-    tf_model_initializer = {
-      "features": {}
-    }

-    features = self._model_features_parser.parse(lolly_model_reader)
-    tf_model_initializer["features"]["bias"] = features["bias"]
-    self._set_discretized_features(features["discretized"], tf_model_initializer)
+        features = self._model_features_parser.parse(lolly_model_reader)
+        tf_model_initializer["features"]["bias"] = features["bias"]
+        self._set_discretized_features(features["discretized"], tf_model_initializer)

-    self._dedup_binary_features(features["binary"], features["discretized"])
-    tf_model_initializer["features"]["binary"] = features["binary"]
+        self._dedup_binary_features(features["binary"], features["discretized"])
+        tf_model_initializer["features"]["binary"] = features["binary"]

-    return tf_model_initializer
+        return tf_model_initializer

-  def _set_discretized_features(self, discretized_features, tf_model_initializer):
-    if len(discretized_features) == 0:
-      return
+    def _set_discretized_features(self, discretized_features, tf_model_initializer):
+        if len(discretized_features) == 0:
+            return

-    num_bins = max([len(bins) for bins in discretized_features.values()])
+        num_bins = max(len(bins) for bins in discretized_features.values())

-    bin_boundaries_and_weights = {}
-    for feature_name in discretized_features:
-      bin_boundaries_and_weights[feature_name] = self._extract_bin_boundaries_and_weights(
-        discretized_features[feature_name], num_bins)
+        bin_boundaries_and_weights = {
+            feature_name: self._extract_bin_boundaries_and_weights(
+                discretized_features[feature_name], num_bins)
+            for feature_name in discretized_features
+        }

-    tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights
+        tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights

-  def _dedup_binary_features(self, binary_features, discretized_features):
-    [binary_features.pop(feature_name) for feature_name in discretized_features]
+    def _dedup_binary_features(self, binary_features, discretized_features):
+        [binary_features.pop(feature_name) for feature_name in discretized_features]

-  def _extract_bin_boundaries_and_weights(self, discretized_feature_buckets, num_bins):
-    bin_boundary_weight_pairs = []
+    def _extract_bin_boundaries_and_weights(self, discretized_feature_buckets, num_bins):
+        bin_boundary_weight_pairs = []

-    for bucket in discretized_feature_buckets:
-      bin_boundary_weight_pairs.append([bucket[0], bucket[2]])
+        for bucket in discretized_feature_buckets:
+            bin_boundary_weight_pairs.append([bucket[0], bucket[2]])

-    # The default DBv2 HashingDiscretizer bin membership interval is (a, b]
-    #
-    # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
-    #
-    # Thus, convert (a, b] to [a, b) by inverting the bin boundaries.
-    for bin_boundary_weight_pair in bin_boundary_weight_pairs:
-      if bin_boundary_weight_pair[0] < float("inf"):
-        bin_boundary_weight_pair[0] *= -1
+        # The default DBv2 HashingDiscretizer bin membership interval is (a, b]
+        #
+        # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
+        #
+        # Thus, convert (a, b] to [a, b) by inverting the bin boundaries.
+        for bin_boundary_weight_pair in bin_boundary_weight_pairs:
+            if bin_boundary_weight_pair[0] < float("inf"):
+                bin_boundary_weight_pair[0] *= -1

-    while len(bin_boundary_weight_pairs) < num_bins:
-      bin_boundary_weight_pairs.append([float("inf"), float(0)])
+        while len(bin_boundary_weight_pairs) < num_bins:
+            bin_boundary_weight_pairs.append([float("inf"), float(0)])

-    bin_boundary_weight_pairs.sort(key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0])
+        bin_boundary_weight_pairs.sort(key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0])

-    bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs))
+        bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs))

-    return {
-      "bin_boundaries": bin_boundaries,
-      "weights": weights
-    }
+        return {
+            "bin_boundaries": bin_boundaries,
+            "weights": weights
+        }