# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes
"""
Implementing PercentileDiscretizer Layer
"""


import libtwml
import numpy as np
import tensorflow.compat.v1 as tf
import twml
from twml.layers import Layer


class PercentileDiscretizer(Layer):
  """
  PercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator
  after accumulating data and performing percentile bucket calibration.

  PercentileDiscretizer takes sparse continuous features and converts them to sparse
  binary features. Each binary output feature is associated with a PercentileDiscretizer
  bin. Each PercentileDiscretizer input feature is converted to n_bin bins.
  Each PercentileDiscretizer calibration tries to find bin delimiters such
  that the number of feature values per bin is roughly equal (for
  each given PercentileDiscretizer feature). In other words, bins are calibrated to be
  approx. equiprobable, according to the given calibration data.

  Note that if an input feature is rarely used, its associated output
  bins/features will be rarely used as well.
  """

  def __init__(
      self,
      n_feature, n_bin, out_bits,
      bin_values=None, hash_keys=None, hash_values=None,
      bin_ids=None, feature_offsets=None, num_parts=1, cost_per_unit=100, **kwargs):
    """
    Creates a non-initialized `PercentileDiscretizer` object.
    Before using the table you will have to initialize it. After initialization
    the table will be immutable.

    If there are no calibrated features, then the discretizer will only apply
    twml.util.limit_bits to the feature keys (aka "feature_ids"). Essentially,
    the discretizer will be a "no-operation", other than obeying `out_bits`.

    Parent class args:
      see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer)
      for documentation of parent class arguments.

    Required args:
      n_feature:
        number of unique features accumulated during PercentileDiscretizer calibration.
        This is the number of features in the hash map.
        Used to initialize hash_keys, hash_values, bin_ids, bin_values
        and feature_offsets.
      n_bin:
        number of PercentileDiscretizer bins used for PercentileDiscretizer calibration.
        Used to initialize hash_keys, hash_values, bin_ids, bin_values
        and feature_offsets.
      out_bits:
        Determines the maximum value for output feature IDs.
        The dense_shape of the SparseTensor returned by lookup(x)
        will be [x.shape[0], 1 << out_bits].

    Optional args:
      hash_keys:
        contains the feature IDs that PercentileDiscretizer discretizes and knows about.
        The hash map (hash_keys->hash_values) is used for two reasons:
        1. divide inputs into two feature spaces:
           PercentileDiscretizer vs non-PercentileDiscretizer
        2. translate the PercentileDiscretizer features into a hash_feature ID that
           PercentileDiscretizer understands.
        The hash_map is expected to contain n_feature items.
      hash_values:
        translates the feature IDs into hash_feature IDs for PercentileDiscretizer.
      bin_ids:
        a 1D Tensor of size n_feature * (n_bin + 1) which contains
        unique IDs to which the PercentileDiscretizer features will be translated.
        For example, tf.Tensor(np.arange(n_feature * (n_bin + 1))) would produce
        the most efficient output space.
      bin_values:
        a 1D Tensor aligned with bin_ids.
        For a given hash_feature ID j, its value bins are indexed between
        `j*(n_bin+1)` and `j*(n_bin+1) + n_bin`.
        As such, bin_ids[j*(n_bin+1)+i] is translated from a hash_feature ID of j
        and an input value between
        `bin_values[j*(n_bin+1) + i]` and `bin_values[j*(n_bin+1) + i + 1]`.
      feature_offsets:
        a 1D Tensor specifying the starting location of bins for a given feature id.
        For example, tf.Tensor(np.arange(0, bin_values.size, n_bin + 1, dtype='int64')).
    """

    super(PercentileDiscretizer, self).__init__(**kwargs)

    if not self.built:
      self.build(input_shape=None)

    max_discretizer_feature = n_feature * (n_bin + 1)
    self._n_feature = n_feature
    self._n_bin = n_bin

    # build variables
    self._out_bits = out_bits
    self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64)
    self._hash_keys = (hash_keys if hash_keys is not None else
                       np.empty(n_feature, dtype=np.int64))
    self._hash_values = (hash_values if hash_values is not None else
                         np.empty(n_feature, dtype=np.int64))
    self._bin_ids = (bin_ids if bin_ids is not None else
                     np.empty(max_discretizer_feature, dtype=np.int64))
    self._bin_values = (bin_values if bin_values is not None else
                        np.empty(max_discretizer_feature, dtype=np.float32))
    self._feature_offsets = (feature_offsets if feature_offsets is not None else
                             np.empty(n_feature, dtype=np.int64))
    self.num_parts = num_parts
    self.cost_per_unit = cost_per_unit

  def build(self, input_shape):  # pylint: disable=unused-argument
    """
    Creates the variables of the layer
    """
    self.built = True
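
  # Illustrative sketch of the calibrated-table layout (added for clarity;
  # the numbers below are hypothetical, assuming n_feature=1 and n_bin=2,
  # so each feature owns n_bin + 1 = 3 slots):
  #
  #   hash_keys       = [1001]             # raw feature ID known to the discretizer
  #   hash_values     = [0]                # its hash_feature ID j
  #   feature_offsets = [0]                # start of feature j's slots in bin_values
  #   bin_values      = [0.0, 0.5, 1.0]    # calibrated percentile boundaries
  #   bin_ids         = [0, 1, 2]          # output IDs, one per slot
  #
  # An input (feature_id=1001, value=0.7) hashes to j=0, falls between
  # bin_values[1] and bin_values[2], and is emitted as output ID bin_ids[1].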
  def call(self, inputs, keep_inputs=False, **kwargs):
    """Looks up `keys` in a table, outputs the corresponding values.

    Implements PercentileDiscretizer inference where inputs are intersected
    with a hash_map.
    Input features that were not calibrated have their feature IDs truncated,
    so as to be less than 1 << out_bits, but their values remain untouched
    (not discretized).

    If there are no calibrated features, then the discretizer will only apply
    twml.util.limit_bits to the feature keys (aka "feature_ids"). Essentially,
    the discretizer will be a "no-operation", other than obeying `out_bits`.

    Args:
      inputs: A 2D SparseTensor that is input to PercentileDiscretizer for
        discretization. It has a dense_shape of [batch_size, input_size].
      keep_inputs:
        Include the original inputs in the output.
        Note - if True, undiscretized features will be passed through, but will
        have their values doubled (due to having both discretized and
        undiscretized versions).

    Returns:
      A `SparseTensor` of the same type as `inputs`.
      Its dense_shape is [inputs.dense_shape[0], 1 << out_bits].
    """
    if isinstance(inputs, tf.SparseTensor):
      inputs = twml.SparseTensor.from_tf(inputs)

    assert isinstance(inputs, twml.SparseTensor)

    # sparse row (sample) indices of the input matrix
    ids = inputs.ids
    # sparse column (feature) indices of the input matrix
    keys = inputs.indices
    # sparse values of the input matrix
    vals = inputs.values

    if self._n_feature > 0:
      discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2(
          input_ids=keys,  # inc key assigned to feature_id, or -1
          input_vals=vals,  # the observed feature values
          bin_ids=self._bin_ids,  # n_feat X (n_bin+1) 2D arange
          bin_vals=self._bin_values,  # bin boundaries
          feature_offsets=self._feature_offsets,  # 0 : n_bin+1 : max_feat
          output_bits=self._out_bits,
          feature_ids=tf.make_tensor_proto(self._hash_keys),  # feature ids to build internal hash map
          feature_indices=tf.make_tensor_proto(self._hash_values),  # keys associated w/ feat. indices
          start_compute=tf.constant(0, shape=[], dtype=tf.int64),
          end_compute=tf.constant(-1, shape=[], dtype=tf.int64),
          cost_per_unit=self.cost_per_unit
      )
    else:
      discretizer_keys = twml.util.limit_bits(keys, self._out_bits)
      discretizer_vals = vals
      # don't 2x the input.
      keep_inputs = False

    batch_size = tf.to_int64(inputs.dense_shape[0])
    output_shape = [batch_size, self._output_size]

    output = twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf()

    if keep_inputs:
      # Note the non-discretized features will end up doubled,
      # since these are already in `output`.
      # Handle output ID conflicts by remapping the original keys
      # above the discretizer's own output ID range.
      mdl_size = self._n_feature * (self._n_bin + 1)
      non_mdl_size = tf.subtract(self._output_size, mdl_size)

      input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size)

      new_input = twml.SparseTensor(
          ids=ids, indices=input_keys, values=vals, dense_shape=output_shape).to_tf()

      # concatenate discretizer output with original input
      sparse_add = tf.sparse_add(new_input, output)
      output = tf.SparseTensor(sparse_add.indices, sparse_add.values, output_shape)

    return output

  def compute_output_shape(self, input_shape):
    """Computes the output shape of the layer given the input shape.

    Args:
      input_shape: A (possibly nested tuple of) `TensorShape`. It need not
        be fully defined (e.g. the batch size may be unknown).

    Raises NotImplementedError.
    """
    raise NotImplementedError
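

if __name__ == "__main__":
  # Minimal usage sketch, not part of the original module. It assumes a working
  # twml/libtwml installation and TF1 graph mode, and exercises only the
  # uncalibrated path (n_feature=0), where call() reduces to
  # twml.util.limit_bits on the feature keys. All IDs and values are made up.
  tf.disable_v2_behavior()

  # One sparse entry per (sample id, feature id, feature value) triple.
  sample_ids = tf.constant([0, 0, 1], dtype=tf.int64)
  feature_keys = tf.constant([1001, 2002, 1001], dtype=tf.int64)
  feature_vals = tf.constant([0.7, -1.2, 0.3], dtype=tf.float32)

  sparse_input = twml.SparseTensor(
      ids=sample_ids, indices=feature_keys, values=feature_vals,
      dense_shape=[2, 1 << 22])

  discretizer = PercentileDiscretizer(n_feature=0, n_bin=0, out_bits=22)
  # call() is invoked directly here to sidestep the tf.layers.Layer machinery.
  output = discretizer.call(sparse_input)  # dense_shape [2, 1 << 22]

  with tf.Session() as sess:
    print(sess.run(output))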