mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-01 08:48:46 +02:00
ef4c5eb65e
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
242 lines
9.1 KiB
C++
242 lines
9.1 KiB
C++
#include "internal/linear_search.h"
|
|
#include "internal/error.h"
|
|
#include <twml/hashing_discretizer_impl.h>
|
|
#include <twml/optim.h>
|
|
#include <algorithm>
|
|
|
|
namespace twml {
|
|
// Binary search over the first `buf_size` elements of `data`, which must be
// sorted in ascending order.
//
// Returns the index of the first element that is not less than `val`
// (i.e. the first element >= val), or `buf_size` when every element is
// smaller than `val`.
template<typename Tx>
static int64_t lower_bound_search(const Tx *data, const Tx val, const int64_t buf_size) {
  const Tx *const first = data;
  const Tx *const last = data + buf_size;
  const Tx *const pos = std::lower_bound(first, last, val);
  return static_cast<int64_t>(pos - first);
}
|
|
|
|
// Binary search over the first `buf_size` elements of `data`, which must be
// sorted in ascending order.
//
// Returns the index of the first element that is strictly greater than
// `val`, or `buf_size` when no element is greater than `val`.
template<typename Tx>
static int64_t upper_bound_search(const Tx *data, const Tx val, const int64_t buf_size) {
  const Tx *const first = data;
  const Tx *const last = data + buf_size;
  const Tx *const pos = std::upper_bound(first, last, val);
  return static_cast<int64_t>(pos - first);
}
|
|
|
|
// Pointer to a bin-search routine: takes (sorted bin boundaries, value to
// place, number of boundaries) and returns the resulting bin index.
template<typename Tx>
using search_method = int64_t (*)(const Tx *, const Tx, const int64_t);
|
|
|
|
// Pointer to a hashing routine taking (feature_id, bucket_index, output_bits).
using hash_signature = uint64_t (*)(uint64_t, int64_t, uint64_t);
|
|
|
|
// uint64_t integer_multiplicative_hashing()
//
// A function to hash discretized feature_ids into one of 2**output_bits buckets.
// This function hashes the feature_ids to achieve a uniform distribution of
//   IDs, so the hashed IDs are with high probability far apart.
// Then, bucket_indices can simply be added, resulting in unique new IDs with
//   high probability.
// We integer hash again to again spread out the new IDs.
// Finally we take the upper output_bits of the low N = 32 bits of the result
//   (shift right by N - output_bits, then keep the low output_bits).
//
// Required args:
//   feature_id:
//     The feature id of the feature to be hashed.
//   bucket_index:
//     The bucket index of the discretized feature value.
//   output_bits:
//     The number of bits of output space for the features to be hashed into.
//     Only N = 32 bits of the hash are used, so values above 32 are treated
//     as 32 (previously they caused an undefined shift).
//
// Returns:
//   A value in [0, 2**min(output_bits, 32)).
//
// Note - feature_ids may have arbitrary distribution within int32s
// Note - 64 bit feature_ids can be processed with this, but the upper
//          32 bits have no effect on the output
// e.g. all feature ids 0 through 255 exist in movie-lens.
// this hashing constant is good for 32 LSBs. will use N=32. (can use N<32 also)
// this hashing constant is co-prime with 2**32, therefore we have that
//   a != b, a and b in [0,2**32)
//    implies
//   f(a) != f(b) where f(x) = (hashing_constant * x) % (2**32)
// note that we are mostly ignoring the upper 32 bits, using modulo 2**32 arithmetic
uint64_t integer_multiplicative_hashing(uint64_t feature_id,
                                        int64_t bucket_index,
                                        uint64_t output_bits) {
  // possibly use 14695981039346656037 for 64 bit unsigned??
  //  = 20921 * 465383 * 1509404459
  // alternatively, 14695981039346656039 is prime
  // We would also need to use N = 64
  const uint64_t hashing_constant = 2654435761ULL;
  const uint64_t N = 32;

  // Clamp so that `N - bits` below can never wrap around.
  const uint64_t bits = output_bits < N ? output_bits : N;

  // hash once to prevent problems from anomalous input id distributions
  feature_id *= hashing_constant;
  feature_id += bucket_index;
  // this hash enables the following right shift operation
  //  without losing the bucket information (lower bits)
  feature_id *= hashing_constant;
  // output size is a power of 2
  feature_id >>= N - bits;
  // Build the mask in 64-bit unsigned arithmetic: a plain `1 << bits` is an
  // int and overflows once bits reaches 31.
  const uint64_t mask = (uint64_t{1} << bits) - 1;
  return mask & feature_id;
}
|
|
|
|
// 64-bit variant of integer_multiplicative_hashing(): same
// multiply / add bucket_index / multiply scheme, but with a 64-bit hashing
// constant and N = 64, so the top `output_bits` bits of the full 64-bit
// product are returned.
//
// Required args:
//   feature_id:
//     The feature id of the feature to be hashed.
//   bucket_index:
//     The bucket index of the discretized feature value.
//   output_bits:
//     The number of bits of output space. 0 yields the single bucket 0;
//     values of 64 or more return the full 64-bit hash.
//
// Returns:
//   A value in [0, 2**min(output_bits, 64)).
uint64_t integer64_multiplicative_hashing(uint64_t feature_id,
                                          int64_t bucket_index,
                                          uint64_t output_bits) {
  const uint64_t hashing_constant = 14695981039346656039ULL;
  const uint64_t N = 64;

  // A single bucket: also avoids an undefined shift by the full word width.
  if (output_bits == 0) {
    return 0;
  }

  // hash once to prevent problems from anomalous input id distributions
  feature_id *= hashing_constant;
  feature_id += bucket_index;
  // this hash enables the following right shift operation
  //  without losing the bucket information (lower bits)
  feature_id *= hashing_constant;

  // Every bit is requested: nothing to shift away or mask off.
  if (output_bits >= N) {
    return feature_id;
  }

  // output size is a power of 2
  feature_id >>= N - output_bits;
  // The mask must be 64 bits wide; an int `1 << output_bits` breaks as soon
  // as output_bits reaches 31, which is the range this variant exists for.
  const uint64_t mask = (uint64_t{1} << output_bits) - 1;
  return mask & feature_id;
}
|
|
|
|
// Extracts the bit field [low, high] (both inclusive, bit 0 = least
// significant) from `options` and returns it right-aligned.
//
// Expects 0 <= low <= 63. An empty field (high < low) yields 0, and a field
// of 64 or more bits returns everything above `low`.
int64_t option_bits(int64_t options, int64_t high, int64_t low) {
  const int64_t width = high - low + 1;
  if (width <= 0) {
    return 0;
  }
  // Work on the unsigned representation: shifting is then well defined for
  // negative `options`, and the mask cannot overflow the way an int
  // `1 << width` does for fields of 31 bits or more.
  const uint64_t shifted = static_cast<uint64_t>(options) >> low;
  const uint64_t mask = width >= 64 ? ~uint64_t{0} : (uint64_t{1} << width) - 1;
  return static_cast<int64_t>(shifted & mask);
}
|
|
|
|
// it is assumed that start_compute and end_compute are valid
|
|
template<typename T>
|
|
void hashDiscretizerInfer(Tensor &output_keys,
|
|
Tensor &output_vals,
|
|
const Tensor &input_ids,
|
|
const Tensor &input_vals,
|
|
const Tensor &bin_vals,
|
|
int output_bits,
|
|
const Map<int64_t, int64_t> &ID_to_index,
|
|
int64_t start_compute,
|
|
int64_t end_compute,
|
|
int64_t n_bin,
|
|
int64_t options) {
|
|
auto output_keys_data = output_keys.getData<int64_t>();
|
|
auto output_vals_data = output_vals.getData<T>();
|
|
|
|
auto input_ids_data = input_ids.getData<int64_t>();
|
|
auto input_vals_data = input_vals.getData<T>();
|
|
|
|
auto bin_vals_data = bin_vals.getData<T>();
|
|
|
|
// The function pointer implementation removes the option_bits
|
|
// function call (might be inlined) and corresponding branch from
|
|
// the hot loop, but it prevents inlining these functions, so
|
|
// there will be function call overhead. Uncertain which would
|
|
// be faster, testing needed. Also, code optimizers do weird things...
|
|
hash_signature hash_fn = integer_multiplicative_hashing;
|
|
switch (option_bits(options, 4, 2)) {
|
|
case 0:
|
|
hash_fn = integer_multiplicative_hashing;
|
|
break;
|
|
case 1:
|
|
hash_fn = integer64_multiplicative_hashing;
|
|
break;
|
|
default:
|
|
hash_fn = integer_multiplicative_hashing;
|
|
}
|
|
|
|
search_method<T> search_fn = lower_bound_search;
|
|
switch (option_bits(options, 1, 0)) {
|
|
case 0:
|
|
search_fn = lower_bound_search<T>;
|
|
break;
|
|
case 1:
|
|
search_fn = linear_search<T>;
|
|
break;
|
|
case 2:
|
|
search_fn = upper_bound_search<T>;
|
|
break;
|
|
default:
|
|
search_fn = lower_bound_search<T>;
|
|
}
|
|
|
|
for (uint64_t i = start_compute; i < end_compute; i++) {
|
|
int64_t id = input_ids_data[i];
|
|
T val = input_vals_data[i];
|
|
|
|
auto iter = ID_to_index.find(id);
|
|
if (iter != ID_to_index.end()) {
|
|
int64_t feature_idx = iter->second;
|
|
const T *bin_vals_start = bin_vals_data + feature_idx * n_bin;
|
|
int64_t out_bin_idx = search_fn(bin_vals_start, val, n_bin);
|
|
output_keys_data[i] = hash_fn(id, out_bin_idx, output_bits);
|
|
output_vals_data[i] = 1;
|
|
} else {
|
|
// feature not calibrated
|
|
output_keys_data[i] = id & ((1 << output_bits) - 1);
|
|
output_vals_data[i] = val;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Public entry point: validates the arguments, resolves the compute range
// and dispatches to the typed hashDiscretizerInfer<T> worker according to the
// element type of input_vals (float or double).
//
// Args:
//   output_keys / output_vals: destination tensors, written at the indices
//     in the compute range. output_keys must be an int64 tensor.
//   input_ids / input_vals: feature ids (int64 tensor) and feature values.
//   n_bin: number of bin boundaries stored per calibrated feature.
//   bin_vals: 1-dimensional tensor of bin boundaries, same type as input_vals.
//   output_bits: number of bits of the output key space.
//   ID_to_index: maps a calibrated feature id to its index into bin_vals.
//   start_compute / end_compute: half-open range of entries to process;
//     end_compute == -1 means "up to the end of input_ids".
//   options: bit field selecting the search and hash algorithms
//     (forwarded unchanged to the typed worker).
//
// Throws:
//   twml::Error(TWML_ERR_TYPE) on a tensor type mismatch or an unsupported
//     value type.
//   twml::Error(TWML_ERR_SIZE) when bin_vals is not 1-dimensional, the range
//     is invalid, or any tensor has a stride other than 1.
void hashDiscretizerInfer(Tensor &output_keys,
                          Tensor &output_vals,
                          const Tensor &input_ids,
                          const Tensor &input_vals,
                          int n_bin,
                          const Tensor &bin_vals,
                          int output_bits,
                          const Map<int64_t, int64_t> &ID_to_index,
                          int start_compute,
                          int end_compute,
                          int64_t options) {
  if (input_ids.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "input_ids must be a Long Tensor");
  }

  if (output_keys.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor");
  }

  if (input_vals.getType() != bin_vals.getType()) {
    throw twml::Error(TWML_ERR_TYPE,
                      "Data type of input_vals does not match type of bin_vals");
  }

  if (bin_vals.getNumDims() != 1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "bin_vals must be 1 Dimensional");
  }

  // Keep the size and the resolved end index in signed 64-bit variables:
  // the comparisons below then never mix signed and unsigned operands, and
  // an input longer than INT_MAX is not truncated when end_compute == -1.
  const int64_t size = static_cast<int64_t>(input_ids.getDim(0));
  const int64_t range_end = (end_compute == -1) ? size : static_cast<int64_t>(end_compute);

  if (start_compute < 0 || start_compute >= size) {
    throw twml::Error(TWML_ERR_SIZE,
                      "start_compute out of range");
  }

  if (range_end < 0 || range_end > size) {
    throw twml::Error(TWML_ERR_SIZE,
                      "end_compute out of range");
  }

  if (start_compute > range_end) {
    throw twml::Error(TWML_ERR_SIZE,
                      "must have start_compute <= end_compute, or end_compute==-1");
  }

  // The typed worker indexes the raw buffers directly, so every tensor has
  // to be densely packed.
  if (output_keys.getStride(0) != 1 || output_vals.getStride(0) != 1 ||
      input_ids.getStride(0) != 1 || input_vals.getStride(0) != 1 ||
      bin_vals.getStride(0) != 1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "All Strides must be 1.");
  }

  switch (input_vals.getType()) {
    case TWML_TYPE_FLOAT:
      twml::hashDiscretizerInfer<float>(output_keys, output_vals,
                                        input_ids, input_vals,
                                        bin_vals, output_bits, ID_to_index,
                                        start_compute, range_end, n_bin, options);
      break;
    case TWML_TYPE_DOUBLE:
      twml::hashDiscretizerInfer<double>(output_keys, output_vals,
                                         input_ids, input_vals,
                                         bin_vals, output_bits, ID_to_index,
                                         start_compute, range_end, n_bin, options);
      break;
    default:
      throw twml::Error(TWML_ERR_TYPE,
                        "Unsupported datatype for hashDiscretizerInfer");
  }
}
|
|
} // namespace twml
|