# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes
"""
Implementing PercentileDiscretizer Layer
"""

import libtwml
import numpy as np
import tensorflow.compat.v1 as tf
import twml
from twml.layers import Layer

class PercentileDiscretizer(Layer):
  """
  PercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator after
  accumulating data and performing percentile bucket calibration.

  PercentileDiscretizer takes sparse continuous features and converts them to sparse
  binary features. Each binary output feature is associated with a PercentileDiscretizer bin.
  Each PercentileDiscretizer input feature is converted to n_bin bins.
  Each PercentileDiscretizer calibration tries to find bin delimiters such
  that the number of feature values per bin is roughly equal (for
  each given PercentileDiscretizer feature). In other words, bins are calibrated to be
  approximately equiprobable, according to the given calibration data.

  Note that if an input feature occurs rarely, its associated output bins/features
  will be used rarely as well.
  """
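
  # A minimal sketch of the calibration idea (for intuition only; the real
  # boundaries come from PercentileDiscretizerCalibrator): approximately
  # equiprobable boundaries for one feature are the percentiles of its
  # observed values, e.g.
  #   boundaries = np.percentile(observed_values, np.linspace(0, 100, n_bin + 1))
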
  def __init__(
          self,
          n_feature, n_bin, out_bits,
          bin_values=None, hash_keys=None, hash_values=None,
          bin_ids=None, feature_offsets=None, num_parts=1, cost_per_unit=100, **kwargs):
    """
    Creates a non-initialized `PercentileDiscretizer` object.
    Before using the table you will have to initialize it. After initialization
    the table will be immutable.

    If there are no calibrated features, then the discretizer will only apply
    twml.util.limit_bits to the feature keys (aka "feature_ids"). Essentially,
    the discretizer will be a "no-operation", other than obeying `out_bits`.

    Parent class args:
      see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer)
      for documentation of parent class arguments.

    Required args:
      n_feature:
        number of unique features accumulated during PercentileDiscretizer calibration.
        This is the number of features in the hash map.
        Used to initialize bin_values, hash_keys, hash_values,
        bin_ids and feature_offsets.
      n_bin:
        number of PercentileDiscretizer bins used for PercentileDiscretizer calibration.
        Used to initialize bin_values, hash_keys, hash_values,
        bin_ids and feature_offsets.
      out_bits:
        Determines the maximum value for output feature IDs.
        The dense_shape of the SparseTensor returned by lookup(x)
        will be [x.shape[0], 1 << out_bits].

    Optional args:
      hash_keys:
        contains the feature IDs that PercentileDiscretizer discretizes and knows about.
        The hash map (hash_keys->hash_values) is used for two reasons:
        1. divide inputs into two feature spaces:
           PercentileDiscretizer vs non-PercentileDiscretizer
        2. translate the PercentileDiscretizer features into a hash_feature ID that
           PercentileDiscretizer understands.
        The hash_map is expected to contain n_feature items.
      hash_values:
        translates the feature IDs into hash_feature IDs for PercentileDiscretizer.
      bin_ids:
        a 1D Tensor of size n_feature * (n_bin + 1) which contains
        unique IDs to which the PercentileDiscretizer features will be translated.
        For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce
        the most efficient output space.
      bin_values:
        a 1D Tensor aligned with bin_ids.
        For a given hash_feature ID j, its value bins are indexed between
        `j*n_bin` and `j*n_bin + n_bin-1`.
        As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j
        and an input value between
        `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`.
      feature_offsets:
        a 1D Tensor specifying the starting location of bins for a given feature id.
        For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')).
    """

    super(PercentileDiscretizer, self).__init__(**kwargs)

    if not self.built:
      self.build(input_shape=None)

    max_discretizer_feature = n_feature * (n_bin + 1)
    self._n_feature = n_feature
    self._n_bin = n_bin

    # build variables
    self._out_bits = out_bits
    self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64)
    self._hash_keys = (hash_keys if hash_keys is not None else
                       np.empty(n_feature, dtype=np.int64))
    self._hash_values = (hash_values if hash_values is not None else
                         np.empty(n_feature, dtype=np.int64))
    self._bin_ids = (bin_ids if bin_ids is not None else
                     np.empty(max_discretizer_feature, dtype=np.int64))
    self._bin_values = (bin_values if bin_values is not None else
                        np.empty(max_discretizer_feature, dtype=np.float32))
    self._feature_offsets = (feature_offsets if feature_offsets is not None else
                             np.empty(n_feature, dtype=np.int64))
    self.num_parts = num_parts
    self.cost_per_unit = cost_per_unit

  def build(self, input_shape):  # pylint: disable=unused-argument
    """
    Creates the variables of the layer.
    """
    self.built = True

  def call(self, inputs, keep_inputs=False, **kwargs):
    """Looks up `keys` in a table, outputs the corresponding values.

    Implements PercentileDiscretizer inference where inputs are intersected with a hash_map.
    Input features that were not calibrated have their feature IDs truncated, so as
    to be less than 1 << out_bits, but their values remain untouched (not discretized).

    If there are no calibrated features, then the discretizer will only apply
    twml.util.limit_bits to the feature keys (aka "feature_ids"). Essentially,
    the discretizer will be a "no-operation", other than obeying `out_bits`.

    Args:
      inputs: A 2D SparseTensor that is input to PercentileDiscretizer for discretization.
        It has a dense_shape of [batch_size, input_size].
      keep_inputs:
        Include the original inputs in the output.
        Note - if True, undiscretized features will be passed through, but will have
        their values doubled (unless there are no calibrated features to discretize).
      name: A name for the operation (optional).

    Returns:
      A `SparseTensor` of the same type as `inputs`.
      Its dense_shape is [inputs.dense_shape[0], 1 << out_bits].
    """

    if isinstance(inputs, tf.SparseTensor):
      inputs = twml.SparseTensor.from_tf(inputs)

    assert isinstance(inputs, twml.SparseTensor)

    # sparse row indices (sample/batch positions)
    ids = inputs.ids
    # sparse column indices (the raw feature ids)
    keys = inputs.indices
    # sparse values
    vals = inputs.values

    if self._n_feature > 0:
      discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2(
        input_ids=keys,  # inc key assigned to feature_id, or -1
        input_vals=vals,  # the observed feature values
        bin_ids=self._bin_ids,  # n_feat X (n_bin+1) 2D arange
        bin_vals=self._bin_values,  # bin boundaries
        feature_offsets=self._feature_offsets,  # 0 : nbin_1 : max_feat
        output_bits=self._out_bits,
        feature_ids=tf.make_tensor_proto(self._hash_keys),  # feature ids to build internal hash map
        feature_indices=tf.make_tensor_proto(self._hash_values),  # keys associated w/ feat. indices
        start_compute=tf.constant(0, shape=[], dtype=tf.int64),
        end_compute=tf.constant(-1, shape=[], dtype=tf.int64),
        cost_per_unit=self.cost_per_unit
      )
    else:
      discretizer_keys = twml.util.limit_bits(keys, self._out_bits)
      discretizer_vals = vals
      # don't 2x the input.
      keep_inputs = False

    batch_size = tf.to_int64(inputs.dense_shape[0])
    output_shape = [batch_size, self._output_size]

    output = twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf()

    if keep_inputs:
      # Note the non-discretized features will end up doubled,
      # since these are already in `output`
      # handle output ID conflicts
      mdl_size = self._n_feature * (self._n_bin + 1)
      non_mdl_size = tf.subtract(self._output_size, mdl_size)
      input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size)
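      # Collision-avoidance rationale (assuming bin_ids enumerate ids below
      # mdl_size, as in the arange example in __init__): floormod folds the
      # raw keys into [0, non_mdl_size) and the add shifts them into
      # [mdl_size, 1 << out_bits), keeping pass-through features disjoint
      # from the discretizer's output id range.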

      new_input = twml.SparseTensor(
        ids=ids, indices=input_keys, values=vals, dense_shape=output_shape).to_tf()

      # concatenate discretizer output with original input
      sparse_add = tf.sparse_add(new_input, output)
      output = tf.SparseTensor(sparse_add.indices, sparse_add.values, output_shape)

    return output

  def compute_output_shape(self, input_shape):
    """Computes the output shape of the layer given the input shape.

    Args:
      input_shape: A (possibly nested tuple of) `TensorShape`. It need not
        be fully defined (e.g. the batch size may be unknown).

    Raises NotImplementedError.
    """
    raise NotImplementedError
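

# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption for intuition, not part of the twml API):
# the core semantics of bucketizing one calibrated feature can be reproduced
# with plain numpy. `boundaries` plays the role of a single feature's slice of
# `bin_values`; the fused libtwml op additionally handles the hash map,
# feature offsets, and output-bit limiting.
# ---------------------------------------------------------------------------
def _illustrative_bucketize(values, boundaries):
  """Returns the bin index for each value, given sorted bin boundaries.

  Values below the first boundary map to bin 0 and values above the last
  boundary map to the final bin, mirroring how equiprobable bins cover the
  calibrated value range.
  """
  values = np.asarray(values)
  boundaries = np.asarray(boundaries)
  # searchsorted finds where each value would be inserted to keep `boundaries`
  # sorted; clipping keeps out-of-range values in the edge bins.
  return np.clip(np.searchsorted(boundaries, values, side='right') - 1,
                 0, len(boundaries) - 2)


# Example: with boundaries [0., 1., 2., 3.] (n_bin = 3),
#   _illustrative_bucketize([-5., 0.5, 2.7, 99.], [0., 1., 2., 3.])
# returns array([0, 0, 2, 2]).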