// the-algorithm/twml/libtwml/src/lib/hashing_discretizer_impl.cpp

#include "internal/linear_search.h"
#include "internal/error.h"
#include <twml/hashing_discretizer_impl.h>
#include <twml/optim.h>
#include <algorithm>

namespace twml {

template<typename Tx>
static int64_t lower_bound_search(const Tx *data, const Tx val, const int64_t buf_size) {
  auto index_temp = std::lower_bound(data, data + buf_size, val);
  return static_cast<int64_t>(index_temp - data);
}

template<typename Tx>
static int64_t upper_bound_search(const Tx *data, const Tx val, const int64_t buf_size) {
  auto index_temp = std::upper_bound(data, data + buf_size, val);
  return static_cast<int64_t>(index_temp - data);
}
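
// Illustrative example (added; not part of the original file): for sorted
// bin boundaries, the two searches differ only on values that equal a
// boundary.
//
//   const float bins[3] = {1.0f, 3.0f, 5.0f};
//   lower_bound_search<float>(bins, 3.0f, 3);  // == 1, first element >= val
//   upper_bound_search<float>(bins, 3.0f, 3);  // == 2, first element >  val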

template<typename Tx>
using search_method = int64_t (*)(const Tx *, const Tx, const int64_t);

typedef uint64_t (*hash_signature)(uint64_t, int64_t, uint64_t);

// uint64_t integer_multiplicative_hashing()
//
// A function to hash discretized feature_ids into one of 2**output_bits buckets.
// This function hashes the feature_ids to achieve a uniform distribution of
// IDs, so the hashed IDs are, with high probability, far apart.
// Then, bucket_indices can simply be added, resulting in unique new IDs with
// high probability. We integer-hash again to spread out the new IDs once more.
// Finally, we take the upper output_bits bits of the lower N bits as the result.
//
// Required args:
//   feature_id:
//     The feature id of the feature to be hashed.
//   bucket_index:
//     The bucket index of the discretized feature value.
//   output_bits:
//     The number of bits of output space for the features to be hashed into.
//
// Note - feature_ids may have an arbitrary distribution within int32s;
//        e.g. all feature ids 0 through 255 exist in movie-lens.
// Note - 64-bit feature_ids can be processed with this, but the upper
//        32 bits have no effect on the output.
// The hashing constant below is good for the 32 LSBs, so we use N = 32
// (N < 32 can also be used). The constant is co-prime with 2**32, so
//   a != b, with a and b in [0, 2**32)
// implies
//   f(a) != f(b), where f(x) = (hashing_constant * x) % (2**32)
// Note that we are mostly ignoring the upper 32 bits, using modulo-2**32
// arithmetic.
uint64_t integer_multiplicative_hashing(uint64_t feature_id,
                                        int64_t bucket_index,
                                        uint64_t output_bits) {
  // possibly use 14695981039346656037 for 64-bit unsigned??
  // = 20921 * 465383 * 1509404459
  // alternatively, 14695981039346656039 is prime
  // We would also need to use N = 64
  const uint64_t hashing_constant = 2654435761;
  const uint64_t N = 32;
  // hash once to prevent problems from anomalous input id distributions
  feature_id *= hashing_constant;
  feature_id += bucket_index;
  // this hash enables the following right shift operation
  // without losing the bucket information (lower bits)
  feature_id *= hashing_constant;
  // output size is a power of 2
  feature_id >>= N - output_bits;
  // a 64-bit literal keeps the shift well defined for large output_bits
  uint64_t mask = (1ULL << output_bits) - 1;
  return mask & feature_id;
}
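
// Illustrative example (hypothetical values; not called by the library):
// with output_bits = 22, every returned key lies in [0, 2**22).
//
//   uint64_t k0 = integer_multiplicative_hashing(1234, /*bucket_index=*/0, 22);
//   uint64_t k1 = integer_multiplicative_hashing(1234, /*bucket_index=*/1, 22);
//   // k0 != k1 with high probability; both are < (1ULL << 22)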

uint64_t integer64_multiplicative_hashing(uint64_t feature_id,
                                          int64_t bucket_index,
                                          uint64_t output_bits) {
  // the ULL suffix makes the 64-bit width of the constant explicit
  const uint64_t hashing_constant = 14695981039346656039ULL;
  const uint64_t N = 64;
  // hash once to prevent problems from anomalous input id distributions
  feature_id *= hashing_constant;
  feature_id += bucket_index;
  // this hash enables the following right shift operation
  // without losing the bucket information (lower bits)
  feature_id *= hashing_constant;
  // output size is a power of 2
  feature_id >>= N - output_bits;
  // 1ULL (not the int literal 1) keeps the shift well defined for
  // output_bits >= 32
  uint64_t mask = (1ULL << output_bits) - 1;
  return mask & feature_id;
}
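
// Design note (added for clarity): both hashing constants are odd, hence
// co-prime with 2**64, so x -> x * hashing_constant is a bijection on
// uint64_t and the multiplications themselves never introduce collisions.
// For a fixed feature_id, distinct bucket indices thus stay distinct until
// the final truncation to output_bits bits; across feature ids, the first
// multiplication spreads ids far apart, so adding small bucket indices
// collides only with low probability.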

int64_t option_bits(int64_t options, int64_t high, int64_t low) {
  options >>= low;
  options &= (1 << (high - low + 1)) - 1;
  return options;
}
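
// Worked example (added for clarity): option_bits extracts the inclusive
// bit range [high, low] of options, shifted down to bit 0.
//
//   option_bits(0b10110, 4, 2);  // == 0b101 == 5
//   option_bits(0b10110, 1, 0);  // == 0b10  == 2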

// it is assumed that start_compute and end_compute are valid
template<typename T>
void hashDiscretizerInfer(Tensor &output_keys,
                          Tensor &output_vals,
                          const Tensor &input_ids,
                          const Tensor &input_vals,
                          const Tensor &bin_vals,
                          int output_bits,
                          const Map<int64_t, int64_t> &ID_to_index,
                          int64_t start_compute,
                          int64_t end_compute,
                          int64_t n_bin,
                          int64_t options) {
  auto output_keys_data = output_keys.getData<int64_t>();
  auto output_vals_data = output_vals.getData<T>();
  auto input_ids_data = input_ids.getData<int64_t>();
  auto input_vals_data = input_vals.getData<T>();
  auto bin_vals_data = bin_vals.getData<T>();

  // The function pointer implementation removes the option_bits
  // function call (which might otherwise be inlined) and the corresponding
  // branch from the hot loop, but it prevents inlining the selected
  // functions, so there will be function call overhead. It is uncertain
  // which would be faster; testing is needed. Also, code optimizers do
  // weird things...
  hash_signature hash_fn = integer_multiplicative_hashing;
  switch (option_bits(options, 4, 2)) {
    case 0:
      hash_fn = integer_multiplicative_hashing;
      break;
    case 1:
      hash_fn = integer64_multiplicative_hashing;
      break;
    default:
      hash_fn = integer_multiplicative_hashing;
  }

  search_method<T> search_fn = lower_bound_search<T>;
  switch (option_bits(options, 1, 0)) {
    case 0:
      search_fn = lower_bound_search<T>;
      break;
    case 1:
      search_fn = linear_search<T>;
      break;
    case 2:
      search_fn = upper_bound_search<T>;
      break;
    default:
      search_fn = lower_bound_search<T>;
  }
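
  // Reading of the bit layout above (an interpretation of the two switches,
  // not a documented contract): bits [1:0] of options select the search
  // method and bits [4:2] select the hash function. For example:
  //
  //   int64_t options = (1 << 2) | 2;  // 64-bit hash + upper_bound_search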

  // use a signed loop index to match the signed start/end bounds
  for (int64_t i = start_compute; i < end_compute; i++) {
    int64_t id = input_ids_data[i];
    T val = input_vals_data[i];
    auto iter = ID_to_index.find(id);
    if (iter != ID_to_index.end()) {
      // calibrated feature: locate its row of bin boundaries, find the
      // bucket for val, and hash (id, bucket) into the output space
      int64_t feature_idx = iter->second;
      const T *bin_vals_start = bin_vals_data + feature_idx * n_bin;
      int64_t out_bin_idx = search_fn(bin_vals_start, val, n_bin);
      output_keys_data[i] = hash_fn(id, out_bin_idx, output_bits);
      output_vals_data[i] = 1;
    } else {
      // feature not calibrated: pass the value through unchanged and fold
      // the raw id into the output space (1LL keeps the shift well defined)
      output_keys_data[i] = id & ((1LL << output_bits) - 1);
      output_vals_data[i] = val;
    }
  }
}

void hashDiscretizerInfer(Tensor &output_keys,
                          Tensor &output_vals,
                          const Tensor &input_ids,
                          const Tensor &input_vals,
                          int n_bin,
                          const Tensor &bin_vals,
                          int output_bits,
                          const Map<int64_t, int64_t> &ID_to_index,
                          int start_compute,
                          int end_compute,
                          int64_t options) {
  if (input_ids.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "input_ids must be a Long Tensor");
  }
  if (output_keys.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor");
  }
  if (input_vals.getType() != bin_vals.getType()) {
    throw twml::Error(TWML_ERR_TYPE,
                      "Data type of input_vals does not match type of bin_vals");
  }
  if (bin_vals.getNumDims() != 1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "bin_vals must be 1 Dimensional");
  }

  uint64_t size = input_ids.getDim(0);
  if (end_compute == -1) {
    end_compute = size;
  }
  if (start_compute < 0 || start_compute >= size) {
    throw twml::Error(TWML_ERR_SIZE,
                      "start_compute out of range");
  }
  if (end_compute < -1 || end_compute > size) {
    throw twml::Error(TWML_ERR_SIZE,
                      "end_compute out of range");
  }
  if (start_compute > end_compute && end_compute != -1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "must have start_compute <= end_compute, or end_compute == -1");
  }
  if (output_keys.getStride(0) != 1 || output_vals.getStride(0) != 1 ||
      input_ids.getStride(0) != 1 || input_vals.getStride(0) != 1 ||
      bin_vals.getStride(0) != 1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "All Strides must be 1.");
  }

  switch (input_vals.getType()) {
    case TWML_TYPE_FLOAT:
      twml::hashDiscretizerInfer<float>(output_keys, output_vals,
                                        input_ids, input_vals,
                                        bin_vals, output_bits, ID_to_index,
                                        start_compute, end_compute, n_bin, options);
      break;
    case TWML_TYPE_DOUBLE:
      twml::hashDiscretizerInfer<double>(output_keys, output_vals,
                                         input_ids, input_vals,
                                         bin_vals, output_bits, ID_to_index,
                                         start_compute, end_compute, n_bin, options);
      break;
    default:
      throw twml::Error(TWML_ERR_TYPE,
                        "Unsupported datatype for hashDiscretizerInfer");
  }
}
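
// Hypothetical call sketch (added; tensor construction elided and names are
// illustrative, not from this file). Assuming contiguous 1-D tensors, with
// bin_vals flattened to n_features * n_bin boundaries, this processes the
// whole input with the default lower-bound search and the 32-bit hash:
//
//   twml::hashDiscretizerInfer(out_keys, out_vals, ids, vals,
//                              /*n_bin=*/50, bins, /*output_bits=*/22,
//                              id_to_index, /*start_compute=*/0,
//                              /*end_compute=*/-1, /*options=*/0);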
} // namespace twml