the-algorithm/twml/twml/layers/percentile_discretizer.py
# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes
"""
Implementing PercentileDiscretizer Layer
"""
import libtwml
import numpy as np
import tensorflow.compat.v1 as tf
import twml
from twml.layers import Layer
class PercentileDiscretizer(Layer):
"""
PercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator after
accumulating data and performing percentile bucket calibration.
  PercentileDiscretizer takes sparse continuous features and converts them to sparse
  binary features. Each binary output feature is associated with a PercentileDiscretizer bin.
  Each PercentileDiscretizer input feature is converted to n_bin bins.
  Each PercentileDiscretizer calibration tries to find bin delimiters such
  that the number of feature values per bin is roughly equal (for
  each given PercentileDiscretizer feature). In other words, bins are calibrated to be
  approximately equiprobable, according to the given calibration data.
  Note that if an input feature is rarely used, so will be its associated output bins/features.
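
  For example (illustrative numbers only): suppose a single calibrated feature
  saw the values [0.1, 0.5, 1.2, 3.4] during calibration, with n_bin=2.
  Calibration would place one delimiter near the median (~0.85), giving each
  bin roughly half of the observed values. At inference time, an input value
  of 0.7 for that feature would then activate the binary output feature
  associated with that feature's lower bin.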
"""
def __init__(
self,
n_feature, n_bin, out_bits,
bin_values=None, hash_keys=None, hash_values=None,
bin_ids=None, feature_offsets=None, num_parts=1, cost_per_unit=100, **kwargs):
"""
Creates a non-initialized `PercentileDiscretizer` object.
Before using the table you will have to initialize it. After initialization
the table will be immutable.
If there are no calibrated features, then the discretizer will only apply
    twml.util.limit_bits to the feature keys (aka "feature_ids"). Essentially,
    the discretizer will be a "no-operation", other than obeying `out_bits`.
Parent class args:
see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer)
for documentation of parent class arguments.
Required args:
n_feature:
number of unique features accumulated during PercentileDiscretizer calibration.
This is the number of features in the hash map.
Used to initialize bin_values, hash_keys, hash_values,
bin_ids, bin_values and feature_offsets.
n_bin:
number of PercentileDiscretizer bins used for PercentileDiscretizer calibration.
Used to initialize bin_values, hash_keys, hash_values,
bin_ids, bin_values and feature_offsets.
out_bits:
Determines the maximum value for output feature IDs.
The dense_shape of the SparseTensor returned by lookup(x)
        will be [x.shape[0], 1 << out_bits].
Optional args:
hash_keys:
contains the features ID that PercentileDiscretizer discretizes and knows about.
The hash map (hash_keys->hash_values) is used for two reasons:
1. divide inputs into two feature spaces:
PercentileDiscretizer vs non-PercentileDiscretizer
2. transate the PercentileDiscretizer features into a hash_feature ID that
PercentileDiscretizer understands.
The hash_map is expected to contain n_feature items.
hash_values:
translates the feature IDs into hash_feature IDs for PercentileDiscretizer.
bin_ids:
        a 1D Tensor of size n_feature * (n_bin + 1) which contains the
        unique IDs to which the PercentileDiscretizer features will be translated.
        For example, tf.Tensor(np.arange(n_feature * (n_bin + 1))) would produce
        the most efficient output space.
bin_values:
a 1D Tensor aligned with bin_ids.
        For a given hash_feature ID j, its value bins are indexed between
        `j*(n_bin+1)` and `j*(n_bin+1) + n_bin`.
        As such, bin_ids[j*(n_bin+1)+i] is translated from a hash_feature ID of j
        and an input value between
        `bin_values[j*(n_bin+1) + i]` and `bin_values[j*(n_bin+1)+i+1]`.
feature_offsets:
a 1D Tensor specifying the starting location of bins for a given feature id.
        For example, tf.Tensor(np.arange(0, bin_values.size, n_bin + 1, dtype='int64')).
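
    Example (an illustrative sketch of the expected shapes; the ids and
    boundaries below are made up, normally they come out of calibration):

      import numpy as np
      n_feature, n_bin = 2, 2
      hash_keys = np.array([1001, 1002], dtype=np.int64)  # raw feature ids
      hash_values = np.array([0, 1], dtype=np.int64)      # hash_feature ids
      # (n_bin + 1) bin boundaries per feature, aligned with bin_ids
      bin_ids = np.arange(n_feature * (n_bin + 1), dtype=np.int64)
      bin_values = np.array([0.0, 0.5, 1.0,   # boundaries for feature 0
                             0.0, 2.0, 4.0],  # boundaries for feature 1
                            dtype=np.float32)
      feature_offsets = np.arange(0, bin_values.size, n_bin + 1, dtype=np.int64)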
"""
super(PercentileDiscretizer, self).__init__(**kwargs)
if not self.built:
self.build(input_shape=None)
max_discretizer_feature = n_feature * (n_bin + 1)
self._n_feature = n_feature
self._n_bin = n_bin
# build variables
self._out_bits = out_bits
self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64)
self._hash_keys = (hash_keys if hash_keys is not None else
np.empty(n_feature, dtype=np.int64))
self._hash_values = (hash_values if hash_values is not None else
np.empty(n_feature, dtype=np.int64))
self._bin_ids = (bin_ids if bin_ids is not None else
np.empty(max_discretizer_feature, dtype=np.int64))
self._bin_values = (bin_values if bin_values is not None else
np.empty(max_discretizer_feature, dtype=np.float32))
self._feature_offsets = (feature_offsets if feature_offsets is not None else
np.empty(n_feature, dtype=np.int64))
self.num_parts = num_parts
self.cost_per_unit = cost_per_unit
def build(self, input_shape): # pylint: disable=unused-argument
"""
Creates the variables of the layer
"""
self.built = True
def call(self, inputs, keep_inputs=False, **kwargs):
"""Looks up `keys` in a table, outputs the corresponding values.
Implements PercentileDiscretizer inference where inputs are intersected with a hash_map.
    Input features that were not calibrated have their feature IDs truncated, so as
    to be less than 1 << out_bits, but their values remain untouched (not discretized).
    If there are no calibrated features, then the discretizer will only apply
    twml.util.limit_bits to the feature keys (aka "feature_ids"). Essentially,
    the discretizer will be a "no-operation", other than obeying `out_bits`.
Args:
inputs: A 2D SparseTensor that is input to PercentileDiscretizer for discretization.
It has a dense_shape of [batch_size, input_size]
keep_inputs:
Include the original inputs in the output.
Note - if True, undiscretized features will be passed through, but will have
their values doubled (unless there are no calibrated features to discretize).
name: A name for the operation (optional).
Returns:
A `SparseTensor` of the same type as `inputs`.
      Its dense_shape is [inputs.dense_shape[0], 1 << out_bits].
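
    Example (a minimal usage sketch; assumes `discretizer` is a calibrated
    PercentileDiscretizer instance and 1001, 1002 are among its hash_keys):

      sp_input = tf.sparse.SparseTensor(
          indices=[[0, 1001], [1, 1002]],  # [sample id, feature id] pairs
          values=[0.7, 3.0],
          dense_shape=[2, 1 << 22])
      sp_output = discretizer(sp_input)
      # sp_output has dense_shape [2, 1 << out_bits]; calibrated features
      # are replaced by the binary features of their matching bins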
"""
if isinstance(inputs, tf.SparseTensor):
inputs = twml.SparseTensor.from_tf(inputs)
    assert isinstance(inputs, twml.SparseTensor)
    # sparse row indices (sample ids within the batch)
    ids = inputs.ids
    # sparse column indices (feature ids/keys)
    keys = inputs.indices
    # sparse values
    vals = inputs.values
if self._n_feature > 0:
discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2(
input_ids=keys, # inc key assigned to feature_id, or -1
input_vals=vals, # the observed feature values
bin_ids=self._bin_ids, # n_feat X (n_bin+1) 2D arange
bin_vals=self._bin_values, # bin boundaries
        feature_offsets=self._feature_offsets,  # bin start offsets: arange(0, max_feat, n_bin + 1)
output_bits=self._out_bits,
feature_ids=tf.make_tensor_proto(self._hash_keys), # feature ids to build internal hash map
feature_indices=tf.make_tensor_proto(self._hash_values), # keys associated w/ feat. indices
start_compute=tf.constant(0, shape=[], dtype=tf.int64),
end_compute=tf.constant(-1, shape=[], dtype=tf.int64),
cost_per_unit=self.cost_per_unit
)
else:
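      # limit_bits truncates the feature keys so they stay below
      # 1 << out_bits; the values pass through untouched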
discretizer_keys = twml.util.limit_bits(keys, self._out_bits)
discretizer_vals = vals
      # nothing was discretized, so keeping the original inputs
      # would double every value; force keep_inputs off
      keep_inputs = False
    batch_size = tf.cast(inputs.dense_shape[0], tf.int64)
output_shape = [batch_size, self._output_size]
output = twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf()
if keep_inputs:
# Note the non-discretized features will end up doubled,
# since these are already in `output`
# handle output ID conflicts
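      # assuming bin_ids were assigned in [0, mdl_size) as the constructor
      # docstring recommends, remapping the raw keys into
      # [mdl_size, output_size) keeps the two ID ranges disjoint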
mdl_size = self._n_feature * (self._n_bin + 1)
non_mdl_size = tf.subtract(self._output_size, mdl_size)
input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size)
new_input = twml.SparseTensor(
ids=ids, indices=input_keys, values=vals, dense_shape=output_shape).to_tf()
# concatenate discretizer output with original input
      sparse_add = tf.sparse.add(new_input, output)
output = tf.SparseTensor(sparse_add.indices, sparse_add.values, output_shape)
return output
def compute_output_shape(self, input_shape):
"""Computes the output shape of the layer given the input shape.
Args:
input_shape: A (possibly nested tuple of) `TensorShape`. It need not
be fully defined (e.g. the batch size may be unknown).
Raises NotImplementedError.
"""
raise NotImplementedError
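
# Usage sketch (illustrative only; in practice the constructor arrays are
# produced by PercentileDiscretizerCalibrator rather than written by hand):
#
#   discretizer = PercentileDiscretizer(
#       n_feature=hash_keys.size, n_bin=n_bin, out_bits=22,
#       bin_values=bin_values, hash_keys=hash_keys, hash_values=hash_values,
#       bin_ids=bin_ids, feature_offsets=feature_offsets)
#   sp_output = discretizer(sp_input)  # dense_shape [batch_size, 1 << 22]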