the-algorithm/twml/libtwml/src/lib/functions.cpp
twitter-team ef4c5eb65e Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
2023-03-31 17:36:31 -05:00

159 lines
5.0 KiB
C++

#include "internal/error.h"
#include "internal/murmur_hash3.h"
#include "internal/utf_converter.h"
#include <twml/functions.h>
#include <cstring>
#include <algorithm>
namespace twml {
template<typename T>
void add1(Tensor &output, const Tensor input) {
T *odata = output.getData<T>();
const T *idata = input.getData<T>();
const uint64_t num_elements = input.getNumElements();
for (uint64_t i = 0; i < num_elements; i++) {
odata[i] = idata[i] + 1;
}
}
template<typename T>
void copy(Tensor &output, const Tensor input) {
T *odata = output.getData<T>();
const T *idata = input.getData<T>();
const uint64_t num_elements = input.getNumElements();
for (uint64_t i = 0; i < num_elements; i++) {
odata[i] = idata[i];
}
}
void add1(Tensor &output, const Tensor input) {
auto type = input.getType();
if (output.getType() != type) {
throw twml::Error(TWML_ERR_TYPE, "Output type does not match input type");
}
if (output.getNumElements() != input.getNumElements()) {
throw twml::Error(TWML_ERR_SIZE, "Output size does not match input size");
}
// TODO: Implement an easier dispatch function
switch (type) {
case TWML_TYPE_FLOAT:
twml::add1<float>(output, input);
break;
case TWML_TYPE_DOUBLE:
twml::add1<double>(output, input);
break;
default:
throw twml::Error(TWML_ERR_TYPE, "add1 only supports float and double tensors");
}
}
void copy(Tensor &output, const Tensor input) {
auto type = input.getType();
if (output.getType() != type) {
throw twml::Error(TWML_ERR_TYPE, "Output type does not match input type");
}
if (output.getNumElements() != input.getNumElements()) {
throw twml::Error(TWML_ERR_SIZE, "Output size does not match input size");
}
// TODO: Implement an easier dispatch function
switch (type) {
case TWML_TYPE_FLOAT:
twml::copy<float>(output, input);
break;
case TWML_TYPE_DOUBLE:
twml::copy<double>(output, input);
break;
default:
throw twml::Error(TWML_ERR_TYPE, "copy only supports float and double tensors");
}
}
int64_t featureId(const std::string &feature) {
const char *str = feature.c_str();
uint64_t len = feature.size();
int64_t id = 0;
TWML_CHECK(twml_get_feature_id(&id, len, str), "Error getting featureId");
return id;
}
} // namespace twml
twml_err twml_add1(twml_tensor output, const twml_tensor input) {
HANDLE_EXCEPTIONS(
auto out = twml::getTensor(output);
auto in = twml::getConstTensor(input);
twml::add1(*out, *in););
return TWML_ERR_NONE;
}
twml_err twml_copy(twml_tensor output, const twml_tensor input) {
HANDLE_EXCEPTIONS(
auto out = twml::getTensor(output);
auto in = twml::getConstTensor(input);
twml::copy(*out, *in););
return TWML_ERR_NONE;
}
inline twml_err twml_get_feature_id_internal(int64_t *result,
uint64_t out_size, uint16_t *out,
uint64_t out2_size, uint16_t *out2,
const uint64_t len, const char *str) {
uint64_t k = 0;
for (uint64_t i = 0; i < len; i++) {
if (str[i] == '#') {
k = i;
break;
}
}
uint8_t hash[16];
if (k != 0) {
ssize_t n = utf8_to_utf16((const uint8_t *) str, k, out, out_size);
if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16");
MurmurHash3_x64_128(out, n * sizeof(uint16_t), 0, out2);
n = utf8_to_utf16((const uint8_t *) (str + k + 1), len - k - 1, &out2[4], out2_size - 8);
if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16");
MurmurHash3_x64_128(out2, (n * sizeof(uint16_t)) + 8, 0, hash);
} else {
ssize_t n = utf8_to_utf16((const uint8_t *)str, len, out, out_size);
if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16");
MurmurHash3_x64_128(out, n * sizeof(uint16_t), 0, hash);
}
int64_t id;
memcpy(&id, hash, sizeof(int64_t));
*result = id;
return TWML_ERR_NONE;
}
static const int UTF16_STR_MAX_SIZE = 1024;
twml_err twml_get_feature_id(int64_t *result, const uint64_t len, const char *str) {
try {
uint16_t out[UTF16_STR_MAX_SIZE];
uint16_t out2[UTF16_STR_MAX_SIZE];
return twml_get_feature_id_internal(result,
UTF16_STR_MAX_SIZE, out,
UTF16_STR_MAX_SIZE, out2,
len, str);
} catch(const std::invalid_argument &ex) {
// If the space on the stack is not enough, try using the heap.
// len + 1 is needed because a null terminating character is added at the end.
std::vector<uint16_t> out(len + 1);
std::vector<uint16_t> out2(len + 1);
return twml_get_feature_id_internal(result,
len + 1, out.data(),
len + 1, out2.data(),
len, str);
}
}