the-algorithm/twml/libtwml/src/ops/data_record.cpp
twitter-team ef4c5eb65e Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
2023-03-31 17:36:31 -05:00

1892 lines
71 KiB
C++

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"
#include <twml.h>
#include <twml/functions.h>
#include <twml/utilities.h>
#include "tensorflow_utils.h"
#include "resource_utils.h"
#include <algorithm>
using std::string;
// Op registration for DecodeDataRecord.
// Declares one input (`input_bytes`, uint8 or string as selected by the
// InputType attr), four list(int) attrs (weight_features defaults to an empty
// list) and one resource output whose shape is inferred as a scalar.
// NOTE(review): the Doc text below describes `shared_name` and `container`, but
// no such attrs are declared in this registration -- confirm whether that part
// of the documentation is stale.
REGISTER_OP("DecodeDataRecord")
.Attr("InputType: {uint8, string}")
.Attr("keep_features: list(int)")
.Attr("keep_codes: list(int)")
.Attr("label_features: list(int)")
.Attr("weight_features: list(int) = []")
.Input("input_bytes: InputType")
.Output("data_record_handle: resource")
.SetShapeFn(shape_inference::ScalarShape)
.Doc(R"doc(
A tensorflow OP that creates a handle for the datarecord.
Attr
keep_features: a list of int ids to keep.
keep_codes: their corresponding code.
label_features: list of feature ids representing the labels.
weight_features: list of feature ids representing the weights. Defaults to empty list.
shared_name: name used by the resource handle inside the resource manager.
container: name used by the container of the resources.
shared_name and container are required when inheriting from ResourceOpKernel.
Input
input_bytes: Input tensor containing the serialized batch of HashedDataRecords.
Outputs
data_record_handle: A resource handle to the DataRecord struct.
)doc");
template<typename InputType>
class DecodeDataRecord : public OpKernel {
public:
explicit DecodeDataRecord(OpKernelConstruction* context)
: OpKernel(context) {
std::vector<int64> keep_features;
std::vector<int64> keep_codes;
std::vector<int64> label_features;
std::vector<int64> weight_features;
OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features));
OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes));
OP_REQUIRES_OK(context, context->GetAttr("label_features", &label_features));
OP_REQUIRES_OK(context, context->GetAttr("weight_features", &weight_features));
OP_REQUIRES(context, keep_features.size() == keep_codes.size(),
errors::InvalidArgument("keep keys and values must have same size."));
#ifdef USE_DENSE_HASH
m_keep_map.set_empty_key(0);
m_labels_map.set_empty_key(0);
m_weights_map.set_empty_key(0);
#endif // USE_DENSE_HASH
for (uint64_t i = 0; i < keep_features.size(); i++) {
m_keep_map[keep_features[i]] = keep_codes[i];
}
for (uint64_t i = 0; i < label_features.size(); i++) {
m_labels_map[label_features[i]] = i;
}
for (uint64_t i = 0; i < weight_features.size(); i++) {
m_weights_map[weight_features[i]] = i;
}
}
private:
twml::Map<int64_t, int64_t> m_keep_map;
twml::Map<int64_t, int64_t> m_labels_map;
twml::Map<int64_t, int64_t> m_weights_map;
void Compute(OpKernelContext* context) override {
try {
DataRecordResource *resource = nullptr;
OP_REQUIRES_OK(context, makeResourceHandle<DataRecordResource>(context, 0, &resource));
// Store the input bytes in the resource so it isnt freed before the resource.
// This is necessary because we are not copying the contents for tensors.
resource->input = context->input(0);
int batch_size = getBatchSize<InputType>(resource->input);
int num_labels = static_cast<int>(m_labels_map.size());
int num_weights = static_cast<int>(m_weights_map.size());
twml::DataRecordReader reader;
reader.setKeepMap(&m_keep_map);
reader.setLabelsMap(&m_labels_map);
// Do not set weight map if it is empty. This will take a faster path.
if (num_weights != 0) {
reader.setWeightsMap(&m_weights_map);
}
resource->records.clear();
resource->records.reserve(batch_size);
for (int i = 0; i < batch_size; i++) {
resource->records.emplace_back(num_labels, num_weights);
}
for (int64 id = 0; id < batch_size; id++) {
const uint8_t *input_bytes = getInputBytes<InputType>(resource->input, id);
reader.setBuffer(input_bytes);
// decode the reader
resource->records[id].decode(reader);
}
// This should be fine because m_keep_map should never go out of scope.
resource->keep_map = &m_keep_map;
resource->num_weights = num_weights;
resource->num_labels = num_labels;
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Returns how many feature ids in `set` appear as keys of `keep_map`.
int64_t count_if_exists(const twml::DataRecord::BinaryFeatures &set,
                        const twml::Map<int64_t, int64_t> *const keep_map) {
  const auto not_found = keep_map->end();
  int64_t kept = 0;
  for (const auto &feature_id : set) {
    if (keep_map->find(feature_id) != not_found) {
      ++kept;
    }
  }
  return kept;
}
// Returns how many entries of `map` have a key that is also a key of `keep_map`.
// This works for continuous, discrete, and string features.
template<typename V>
int64_t count_if_exists(const twml::Map<int64_t, V> &map,
                        const twml::Map<int64_t, int64_t> *const keep_map) {
  const auto not_found = keep_map->end();
  int64_t kept = 0;
  for (const auto &entry : map) {
    if (keep_map->find(entry.first) != not_found) {
      ++kept;
    }
  }
  return kept;
}
// Sums the sizes of the inner collections of every entry in `map` whose key is
// also a key of `keep_map`.
int64_t count_if_exists(const twml::DataRecord::SparseBinaryFeatures &map,
                        const twml::Map<int64_t, int64_t> *const keep_map) {
  const auto not_found = keep_map->end();
  int64_t kept = 0;
  for (const auto &entry : map) {
    if (keep_map->find(entry.first) != not_found) {
      kept += entry.second.size();
    }
  }
  return kept;
}
// Sums the sizes of the inner collections of every entry in `map` whose key is
// also a key of `keep_map`.
int64_t count_if_exists(const twml::DataRecord::SparseContinuousFeatures &map,
                        const twml::Map<int64_t, int64_t> *const keep_map) {
  const auto not_found = keep_map->end();
  int64_t kept = 0;
  for (const auto &entry : map) {
    if (keep_map->find(entry.first) != not_found) {
      kept += entry.second.size();
    }
  }
  return kept;
}
// Op registration for GetBinaryFeatures: one resource input and three outputs
// (int64 ids, int64 keys, float values).
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetBinaryFeatures")
.Input("data_record_handle: resource")
.Output("ids: int64")
.Output("keys: int64")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that reads binary features
Input
data_record_handle: Resource handle to DataRecord
Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
values: always set to 1 (float)
)doc");
class GetBinaryFeatures : public OpKernel {
public:
explicit GetBinaryFeatures(OpKernelConstruction* context)
: OpKernel(context) {}
void Compute(OpKernelContext* context) override {
try {
auto handle = getHandle<DataRecordResource>(context, 0);
const auto &records = handle->records;
const auto &common = handle->common;
int64 common_binary_size = count_if_exists(common.getBinary(), handle->keep_map);
int64 total_binary_size = records.size() * common_binary_size;
for (int id = 0; id < records.size(); id++) {
total_binary_size += count_if_exists(handle->records[id].getBinary(), handle->keep_map);
}
const int total_size = static_cast<int>(total_binary_size);
TensorShape shape = {total_size};
Tensor* keys = nullptr;
Tensor* ids = nullptr;
Tensor* values = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values));
uint64_t offset = 0;
auto keys_flat = keys->flat<int64>();
auto ids_flat = ids->flat<int64>();
auto values_flat = values->flat<float>();
for (int64 id = 0; id < records.size(); id++) {
for (const auto &it : common.getBinary()) {
if (handle->keep_map->find(it) == handle->keep_map->end()) continue;
ids_flat(offset) = id;
keys_flat(offset) = it;
offset++;
}
for (const auto &it : records[id].getBinary()) {
if (handle->keep_map->find(it) == handle->keep_map->end()) continue;
ids_flat(offset) = id;
keys_flat(offset) = it;
offset++;
}
}
// All the values for binary features are 1.
std::fill(values_flat.data(), values_flat.data() + total_size, 1);
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetContinuousFeatures: one resource input and three
// outputs (int64 ids, int64 keys, float values).
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetContinuousFeatures")
.Input("data_record_handle: resource")
.Output("ids: int64")
.Output("keys: int64")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that reads continuous features
Input
data_record_handle: Resource handle to DataRecord
Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: Datarecord keys (int64)
values: Datarecord values(float)
)doc");
class GetContinuousFeatures : public OpKernel {
public:
explicit GetContinuousFeatures(OpKernelConstruction* context)
: OpKernel(context) {}
void Compute(OpKernelContext* context) override {
try {
auto handle = getHandle<DataRecordResource>(context, 0);
const auto &records = handle->records;
const auto &common = handle->common;
int64 common_continuous_size = count_if_exists(common.getContinuous(), handle->keep_map);
int64 total_continuous_size = records.size() * common_continuous_size;
for (int id = 0; id < records.size(); id++) {
total_continuous_size += count_if_exists(handle->records[id].getContinuous(),
handle->keep_map);
}
const int total_size = static_cast<int>(total_continuous_size);
TensorShape shape = {total_size};
Tensor* keys = nullptr;
Tensor* values = nullptr;
Tensor* ids = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values));
uint64_t offset = 0;
auto keys_flat = keys->flat<int64>();
auto values_flat = values->flat<float>();
auto ids_flat = ids->flat<int64>();
for (int64 id = 0; id < records.size(); id++) {
for (const auto &it : common.getContinuous()) {
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
ids_flat(offset) = id;
keys_flat(offset) = it.first;
values_flat(offset) = it.second;
offset++;
}
for (const auto &it : records[id].getContinuous()) {
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
ids_flat(offset) = id;
keys_flat(offset) = it.first;
values_flat(offset) = it.second;
offset++;
}
}
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetDiscreteFeatures: one resource input and three
// outputs (int64 ids, int64 keys, int64 values).
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetDiscreteFeatures")
.Input("data_record_handle: resource")
.Output("ids: int64")
.Output("keys: int64")
.Output("values: int64")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that reads discrete features
Input
data_record_handle: Resource handle to DataRecord
Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
values: DataRecord values(int64)
)doc");
class GetDiscreteFeatures : public OpKernel {
public:
explicit GetDiscreteFeatures(OpKernelConstruction* context)
: OpKernel(context) {}
void Compute(OpKernelContext* context) override {
try {
auto handle = getHandle<DataRecordResource>(context, 0);
const auto &records = handle->records;
const auto &common = handle->common;
int64 common_discrete_size = count_if_exists(common.getDiscrete(), handle->keep_map);
int64 total_discrete_size = records.size() * common_discrete_size;
for (int id = 0; id < records.size(); id++) {
total_discrete_size += count_if_exists(handle->records[id].getDiscrete(),
handle->keep_map);
}
const int total_size = static_cast<int>(total_discrete_size);
TensorShape shape = {total_size};
Tensor* keys = nullptr;
Tensor* values = nullptr;
Tensor* ids = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values));
uint64_t offset = 0;
auto keys_flat = keys->flat<int64>();
auto values_flat = values->flat<int64>();
auto ids_flat = ids->flat<int64>();
for (int64 id = 0; id < records.size(); id++) {
for (const auto &it : common.getDiscrete()) {
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
ids_flat(offset) = id;
keys_flat(offset) = it.first;
values_flat(offset) = it.second;
offset++;
}
for (const auto &it : records[id].getDiscrete()) {
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
ids_flat(offset) = id;
keys_flat(offset) = it.first;
values_flat(offset) = it.second;
offset++;
}
}
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetStringFeatures: one resource input and four outputs
// (int64 ids, int64 keys, string names, float values), in that order.
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetStringFeatures")
.Input("data_record_handle: resource")
.Output("ids: int64")
.Output("keys: int64")
.Output("names: string")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that reads string features
Input
data_record_handle: Resource handle to DataRecord
Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
names: DataRecord values(string)
values: always set to 1 (float)
)doc");
class GetStringFeatures : public OpKernel {
public:
explicit GetStringFeatures(OpKernelConstruction* context)
: OpKernel(context) {}
void Compute(OpKernelContext* context) override {
try {
auto handle = getHandle<DataRecordResource>(context, 0);
const auto &records = handle->records;
const auto &common = handle->common;
int64 common_string_size = count_if_exists(common.getString(), handle->keep_map);
int64 total_string_size = records.size() * common_string_size;
for (int id = 0; id < records.size(); id++) {
total_string_size += count_if_exists(handle->records[id].getString(),
handle->keep_map);
}
const int total_size = static_cast<int>(total_string_size);
TensorShape shape = {total_size};
Tensor* keys = nullptr;
Tensor* names = nullptr;
Tensor* ids = nullptr;
Tensor*values = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names));
OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values));
uint64_t offset = 0;
auto keys_flat = keys->flat<int64>();
auto names_flat = names->flat<string>();
auto ids_flat = ids->flat<int64>();
auto values_flat = values->flat<float>();
std::fill(values_flat.data(), values_flat.data() + total_size, 1);
for (int64 id = 0; id < records.size(); id++) {
for (const auto &it : common.getString()) {
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
ids_flat(offset) = id;
keys_flat(offset) = it.first;
names_flat(offset) = it.second;
offset++;
}
for (const auto &it : records[id].getString()) {
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
ids_flat(offset) = id;
keys_flat(offset) = it.first;
names_flat(offset) = it.second;
offset++;
}
}
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetSparseBinaryFeatures: one resource input and four
// outputs (int64 ids, int64 keys, string names, float values), in that order.
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetSparseBinaryFeatures")
.Input("data_record_handle: resource")
.Output("ids: int64")
.Output("keys: int64")
.Output("names: string")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that reads sparse binary features
Input
data_record_handle: Resource handle to DataRecord
Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
names: DataRecord values(string)
values: always set to 1 (float)
)doc");
class GetSparseBinaryFeatures : public OpKernel {
public:
explicit GetSparseBinaryFeatures(OpKernelConstruction* context)
: OpKernel(context) {}
void Compute(OpKernelContext* context) override {
try {
auto handle = getHandle<DataRecordResource>(context, 0);
const auto &records = handle->records;
const auto &common = handle->common;
int64 common_sparse_binary_size = count_if_exists(common.getSparseBinary(), handle->keep_map);
int64 total_sparse_binary_size = records.size() * common_sparse_binary_size;
for (int id = 0; id < records.size(); id++) {
total_sparse_binary_size += count_if_exists(handle->records[id].getSparseBinary(),
handle->keep_map);
}
const int total_size = static_cast<int>(total_sparse_binary_size);
TensorShape shape = {total_size};
Tensor* keys = nullptr;
Tensor* names = nullptr;
Tensor* ids = nullptr;
Tensor* values = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names));
OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values));
uint64_t offset = 0;
auto keys_flat = keys->flat<int64>();
auto names_flat = names->flat<string>();
auto ids_flat = ids->flat<int64>();
auto values_flat = values->flat<float>();
// All the values for sparse binary features are 1.
std::fill(values_flat.data(), values_flat.data() + total_size, 1);
for (int64 id = 0; id < records.size(); id++) {
for (const auto &it : common.getSparseBinary()) {
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
for (const auto &it_inner : it.second) {
ids_flat(offset) = id;
keys_flat(offset) = it.first;
names_flat(offset) = it_inner;
offset++;
}
}
for (const auto &it : records[id].getSparseBinary()) {
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
for (const auto &it_inner : it.second) {
ids_flat(offset) = id;
keys_flat(offset) = it.first;
names_flat(offset) = it_inner;
offset++;
}
}
}
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetSparseContinuousFeatures: one resource input and four
// outputs (int64 ids, int64 keys, float values, string names).
// Note the output order: `values` comes before `names` here, whereas the
// GetSparseBinaryFeatures registration lists `names` before `values`.
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetSparseContinuousFeatures")
.Input("data_record_handle: resource")
.Output("ids: int64")
.Output("keys: int64")
.Output("values: float")
.Output("names: string")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that reads sparse continuous features
Input
data_record_handle: Resource handle to DataRecord
Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
values: DataRecord values(float)
names: DataRecord values(string)
)doc");
class GetSparseContinuousFeatures : public OpKernel {
public:
explicit GetSparseContinuousFeatures(OpKernelConstruction* context)
: OpKernel(context) {}
void Compute(OpKernelContext* context) override {
try {
auto handle = getHandle<DataRecordResource>(context, 0);
const auto &records = handle->records;
const auto &common = handle->common;
int64 common_sparse_continuous_size = count_if_exists(common.getSparseContinuous(),
handle->keep_map);
int64 total_sparse_continuous_size = records.size() * common_sparse_continuous_size;
for (int id = 0; id < records.size(); id++) {
total_sparse_continuous_size += count_if_exists(handle->records[id].getSparseContinuous(),
handle->keep_map);
}
const int total_size = static_cast<int>(total_sparse_continuous_size);
TensorShape shape = {total_size};
Tensor* keys = nullptr;
Tensor* values = nullptr;
Tensor* names = nullptr;
Tensor* ids = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values));
OP_REQUIRES_OK(context, context->allocate_output(3, shape, &names));
uint64_t offset = 0;
auto keys_flat = keys->flat<int64>();
auto values_flat = values->flat<float>();
auto names_flat = names->flat<string>();
auto ids_flat = ids->flat<int64>();
for (int64 id = 0; id < records.size(); id++) {
// copying the contents of the maps of maps
for (const auto &it : common.getSparseContinuous()) {
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
// for each id; iterate through the number of maps corresponding to that id
for (const auto &it_inner : it.second) {
ids_flat(offset) = id;
keys_flat(offset) = it.first;
names_flat(offset) = it_inner.first;
values_flat(offset) = it_inner.second;
offset++;
}
}
// copying the contents of the maps of maps
for (const auto &it : records[id].getSparseContinuous()) {
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
// for each id; iterate through the number of maps corresponding to that id
for (const auto &it_inner : it.second) {
ids_flat(offset) = id;
keys_flat(offset) = it.first;
names_flat(offset) = it_inner.first;
values_flat(offset) = it_inner.second;
offset++;
}
}
}
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetBatchSizeFromDataRecord: one resource input and one
// int64 output whose shape is inferred as a scalar.
REGISTER_OP("GetBatchSizeFromDataRecord")
.Input("data_record_handle: resource")
.Output("batch_size: int64")
.SetShapeFn(shape_inference::ScalarShape)
.Doc(R"doc(
A tensorflow OP that returns batch size from the data record.
Input
data_record_handle: Resource handle to DataRecord
Outputs
batch_size: Number of records held in the handle.
)doc");
class GetBatchSizeFromDataRecord : public OpKernel {
public:
explicit GetBatchSizeFromDataRecord(OpKernelConstruction* context)
: OpKernel(context) {}
void Compute(OpKernelContext* context) override {
try {
auto handle = getHandle<DataRecordResource>(context, 0);
Tensor *output;
OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output));
output->scalar<int64>()() = handle->records.size();
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetLabelsFromDataRecord: one resource input, one float
// output (`labels`) and a required float attr `default_label`.
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetLabelsFromDataRecord")
.Input("data_record_handle: resource")
.Output("labels: float")
.Attr("default_label: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns labels from the data record.
Attr
default_label: The value used when a label is absent in a data record.
Input
data_record_handle: Resource handle to DataRecord
Outputs
labels: A 2D tensor of size [batch_size, num_labels] containing the label values.
)doc");
// Kernel behind the GetLabelsFromDataRecord op.
//
// Produces a [number of records, handle->num_labels] float output holding each
// record's labels back to back, with NaN labels replaced by the
// `default_label` attr.
class GetLabelsFromDataRecord : public OpKernel {
 private:
  float default_label;

 public:
  explicit GetLabelsFromDataRecord(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("default_label", &default_label));
  }

  // Any std::exception raised here is reported on the context as an
  // InvalidArgument failure.
  void Compute(OpKernelContext* context) override {
    try {
      auto handle = getHandle<DataRecordResource>(context, 0);
      const auto &records = handle->records;
      const int num_labels = static_cast<int>(handle->num_labels);

      TensorShape shape = {static_cast<int64>(handle->records.size()), num_labels};
      Tensor *labels = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, shape, &labels));

      // A label that is not present in a data record is represented as NaN;
      // swap those for default_label and pass every other value through.
      auto replace_missing = [this](float label) -> float {
        if (std::isnan(label)) {
          return default_label;
        }
        return label;
      };

      // Write cursor into the flat output; std::transform returns the position
      // just past the last label it wrote, which is where the next record starts.
      auto cursor = labels->flat<float>().data();
      for (const auto &record : records) {
        const auto &record_labels = record.labels();
        cursor = std::transform(record_labels.begin(), record_labels.end(), cursor,
                                replace_missing);
      }
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};
// Op registration for GetWeightsFromDataRecord: one resource input and one
// float output (`weights`).
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetWeightsFromDataRecord")
.Input("data_record_handle: resource")
.Output("weights: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns weights from the data record.
Input
data_record_handle: Resource handle to DataRecord
Outputs
weights: A 2D tensor of size [batch_size, num_weights] containing the weight values.
)doc");
// Kernel behind the GetWeightsFromDataRecord op.
//
// Produces a [number of records, handle->num_weights] float output holding each
// record's weights copied back to back.
class GetWeightsFromDataRecord : public OpKernel {
 public:
  explicit GetWeightsFromDataRecord(OpKernelConstruction* context)
      : OpKernel(context) {}

  // Any std::exception raised here is reported on the context as an
  // InvalidArgument failure.
  void Compute(OpKernelContext* context) override {
    try {
      auto handle = getHandle<DataRecordResource>(context, 0);
      const auto &records = handle->records;
      const int num_weights = static_cast<int>(handle->num_weights);

      TensorShape shape = {static_cast<int64>(handle->records.size()), num_weights};
      Tensor *weights = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, shape, &weights));

      // Write cursor into the flat output; std::copy returns the position just
      // past the last weight it wrote, which is where the next record starts.
      auto cursor = weights->flat<float>().data();
      for (const auto &record : records) {
        const auto &record_weights = record.weights();
        cursor = std::copy(record_weights.begin(), record_weights.end(), cursor);
      }
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};
// Writes one cell of a flat output tensor for map-like feature containers.
//
// Looks `feature_id` up in `type`; stores the mapped value at position `id` of
// `values_flat` when it is found and `default_value` otherwise.
template<typename ValueType, typename FeatureType, typename TensorType>
void SetValueGroup(
    const FeatureType& type,
    const int64& feature_id,
    const int64& id,
    const ValueType& default_value,
    TensorType values_flat) {
  const auto found = type.find(feature_id);
  if (found == type.end()) {
    values_flat(id) = default_value;
  } else {
    values_flat(id) = found->second;
  }
}
// Overload for BinaryFeatures, which carries no mapped value: position `id` of
// `values_flat` is set to 1 when `feature_id` is present in `type`, and to
// `default_value` otherwise.
template<typename ValueType, typename TensorType>
void SetValueGroup(
    const twml::DataRecord::BinaryFeatures& type,
    const int64& feature_id,
    const int64& id,
    const ValueType& default_value,
    TensorType values_flat) {
  const bool is_present = type.find(feature_id) != type.end();
  if (is_present) {
    values_flat(id) = 1;
  } else {
    values_flat(id) = default_value;
  }
}
// Helper for Group Extraction of Dense Features.
//
// Allocates output 0 as a [number of records, feature_ids.size()] tensor of
// ValueType and fills it row by row: cell (record, k) receives the value of
// feature_ids[k] in that record, or `default_value` when the record does not
// contain it (see SetValueGroup).
//
// context:       kernel context; input 0 is the DataRecord resource handle.
// feature_ids:   feature ids to extract, one output column each.
// default_value: value written for features missing from a record.
// f:             accessor returning the feature container of a record.
template<typename ValueType, typename FeatureType>
void ComputeHelperGroupFeaturesAsTensors(
    OpKernelContext* context,
    const std::vector<int64>& feature_ids,
    ValueType& default_value,
    std::function<const FeatureType&(const twml::DataRecord&)> f) {
  auto handle = getHandle<DataRecordResource>(context, 0);
  const auto &records = handle->records;

  // Both dimensions are kept as int64 (the dimension type used for the shape)
  // so the sizes are never narrowed to int on the way.
  const int64 batch_size = static_cast<int64>(records.size());
  const int64 num_feature_ids = static_cast<int64>(feature_ids.size());
  TensorShape shape = {batch_size, num_feature_ids};

  // Define the output
  Tensor* values = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(0, shape, &values));
  auto values_flat = values->flat<ValueType>();

  for (int64 id = 0; id < batch_size; id++) {
    const auto &type = f(records[id]);
    // Row-major layout: record `id` starts at id * num_feature_ids.
    const int64 id_offset = id * num_feature_ids;
    for (int64 fid = 0; fid < num_feature_ids; fid++) {
      // The value is set to default if it does not exist in the current DataRecord
      SetValueGroup(type, feature_ids[fid], id_offset + fid, default_value, values_flat);
    }
  }
}
// Helper for Single Extraction of Dense Features.
//
// Allocates output 0 as a 1-D tensor of ValueType with one element per record
// and fills element `id` with the value of `feature_id` in record `id`, or
// `default_value` when the record does not contain it (see SetValueGroup).
//
// context:       kernel context; input 0 is the DataRecord resource handle.
// default_value: value written for records missing the feature.
// feature_id:    id of the feature to extract.
// f:             accessor returning the feature container of a record.
template<typename ValueType, typename FeatureType>
void ComputeHelperFeaturesAsTensors(
    OpKernelContext* context,
    ValueType& default_value,
    int64 feature_id,
    std::function<const FeatureType&(const twml::DataRecord&)> f) {
  auto handle = getHandle<DataRecordResource>(context, 0);
  const auto &records = handle->records;

  // Output shape is 1-D with one entry per record. The size is kept as int64
  // (the dimension type used for the shape) so it is never narrowed to int.
  const int64 total_size = static_cast<int64>(records.size());
  TensorShape shape = {total_size};

  // Define the output
  Tensor* values = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(0, shape, &values));
  auto values_flat = values->flat<ValueType>();

  for (int64 id = 0; id < total_size; id++) {
    const auto &type = f(records[id]);
    SetValueGroup(type, feature_id, id, default_value, values_flat);
  }
}
// Op registration for GetBinaryAsTensor: one resource input, attrs
// `feature_id` (int) and `default_value` (float), and one float output.
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetBinaryAsTensor")
.Input("data_record_handle: resource")
.Attr("feature_id: int")
.Attr("default_value: float")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_id: Id representing the feature whose values will be extracted.
default_value: default_value to be inputted if the values are missing from the current DataRecord.
Outputs
values: A Tensor corresponding to the value of the feature_id across multiple DataRecords
)doc");
class GetBinaryAsTensor : public OpKernel {
private:
int64 feature_id;
float default_value;
public:
explicit GetBinaryAsTensor(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
}
void Compute(OpKernelContext* context) override {
try {
std::function<const twml::DataRecord::BinaryFeatures &(const twml::DataRecord &)> f =
[](const twml::DataRecord& record) ->const twml::DataRecord::BinaryFeatures& { return record.getBinary(); };
ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f);
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetContinuousAsTensor: one resource input, attrs
// `feature_id` (int) and `default_value` (float), and one float output.
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetContinuousAsTensor")
.Input("data_record_handle: resource")
.Attr("feature_id: int")
.Attr("default_value: float")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_id: Id representing the feature whose values will be extracted.
default_value: default_value to be inputted if the values are missing from the current DataRecord.
Outputs
values: A Tensor corresponding to the value of the feature_id across multiple DataRecords
)doc");
class GetContinuousAsTensor : public OpKernel {
private:
int64 feature_id;
float default_value;
public:
explicit GetContinuousAsTensor(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
}
void Compute(OpKernelContext* context) override {
try {
std::function<const twml::DataRecord::ContinuousFeatures &(const twml::DataRecord &)> f =
[](const twml::DataRecord& record) ->const twml::DataRecord::ContinuousFeatures& { return record.getContinuous(); };
ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f);
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetDiscreteAsTensor: one resource input, attrs
// `feature_id` (int) and `default_value` (int), and one int64 output.
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetDiscreteAsTensor")
.Input("data_record_handle: resource")
.Attr("feature_id: int")
.Attr("default_value: int")
.Output("values: int64")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_id: Id representing the feature whose values will be extracted.
default_value: default_value to be inputted if the values are missing from the current DataRecord.
Outputs
values: A Tensor corresponding to the value of the feature_id across multiple DataRecords
)doc");
class GetDiscreteAsTensor : public OpKernel {
private:
int64 feature_id;
int64 default_value;
public:
explicit GetDiscreteAsTensor(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
}
void Compute(OpKernelContext* context) override {
try {
std::function<const twml::DataRecord::DiscreteFeatures &(const twml::DataRecord &)> f =
[](const twml::DataRecord& record) ->const twml::DataRecord::DiscreteFeatures& { return record.getDiscrete(); };
ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f);
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetStringAsTensor: one resource input, attrs
// `feature_id` (int) and `default_value` (string), and one string output
// named `names`.
// The shape function performs no inference; it only returns OK.
REGISTER_OP("GetStringAsTensor")
.Input("data_record_handle: resource")
.Attr("feature_id: int")
.Attr("default_value: string")
.Output("names: string")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_id: Id representing the feature whose values will be extracted.
default_value: default_value to be inputted if the values are missing from the current DataRecord.
Outputs
names: A Tensor corresponding to the value of the feature_id across multiple DataRecords
)doc");
class GetStringAsTensor : public OpKernel {
private:
int64 feature_id;
string default_value;
public:
explicit GetStringAsTensor(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
}
void Compute(OpKernelContext* context) override {
try {
std::function<const twml::DataRecord::StringFeatures &(const twml::DataRecord &)> f =
[](const twml::DataRecord& record) ->const twml::DataRecord::StringFeatures& { return record.getString(); };
ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f);
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetBinaryGroupAsTensor.
// Takes a resource handle input plus the `feature_ids` (list of int) and
// `default_value` (float) attrs and declares one float output, `values`.
// The shape function returns OK without setting an output shape.
REGISTER_OP("GetBinaryGroupAsTensor")
.Input("data_record_handle: resource")
.Attr("feature_ids: list(int)")
.Attr("default_value: float")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_ids: List of ids representing the features whose values will be extracted.
default_value: default_value to be inputted if the values are missing from the current DataRecord.
Outputs
values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords
)doc");
class GetBinaryGroupAsTensor : public OpKernel {
private:
float default_value;
std::vector<int64> feature_ids;
public:
explicit GetBinaryGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids));
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
}
void Compute(OpKernelContext* context) override {
try {
std::function<const twml::DataRecord::BinaryFeatures &(const twml::DataRecord &)> f =
[](const twml::DataRecord& record) ->const twml::DataRecord::BinaryFeatures& { return record.getBinary(); };
ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f);
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetContinuousGroupAsTensor.
// Takes a resource handle input plus the `feature_ids` (list of int) and
// `default_value` (float) attrs and declares one float output, `values`.
// The shape function returns OK without setting an output shape.
REGISTER_OP("GetContinuousGroupAsTensor")
.Input("data_record_handle: resource")
.Attr("feature_ids: list(int)")
.Attr("default_value: float")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_ids: List of ids representing the features whose values will be extracted.
default_value: default_value to be inputted if the values are missing from the current DataRecord.
Outputs
values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords
)doc");
class GetContinuousGroupAsTensor : public OpKernel {
private:
float default_value;
std::vector<int64> feature_ids;
public:
explicit GetContinuousGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids));
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
}
void Compute(OpKernelContext* context) override {
try {
std::function<const twml::DataRecord::ContinuousFeatures &(const twml::DataRecord &)> f =
[](const twml::DataRecord& record) ->const twml::DataRecord::ContinuousFeatures& { return record.getContinuous(); };
ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f);
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetDiscreteGroupAsTensor.
// Takes a resource handle input plus the `feature_ids` (list of int) and
// `default_value` (int) attrs and declares one int64 output, `values`.
// The shape function returns OK without setting an output shape.
REGISTER_OP("GetDiscreteGroupAsTensor")
.Input("data_record_handle: resource")
.Attr("feature_ids: list(int)")
.Attr("default_value: int")
.Output("values: int64")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_ids: List of ids representing the features whose values will be extracted.
default_value: default_value to be inputted if the values are missing from the current DataRecord.
Outputs
values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords
)doc");
class GetDiscreteGroupAsTensor : public OpKernel {
private:
std::vector<int64> feature_ids;
int64 default_value;
public:
explicit GetDiscreteGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids));
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
}
void Compute(OpKernelContext* context) override {
try {
std::function<const twml::DataRecord::DiscreteFeatures &(const twml::DataRecord &)> f =
[](const twml::DataRecord& record) ->const twml::DataRecord::DiscreteFeatures& { return record.getDiscrete(); };
ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f);
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetStringGroupAsTensor.
// Takes a resource handle input plus the `feature_ids` (list of int) and
// `default_value` (string) attrs and declares one string output, `names`.
// The shape function returns OK without setting an output shape.
REGISTER_OP("GetStringGroupAsTensor")
.Input("data_record_handle: resource")
.Attr("feature_ids: list(int)")
.Attr("default_value: string")
.Output("names: string")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_ids: List of ids representing the features whose values will be extracted.
default_value: default_value to be inputted if the values are missing from the current DataRecord.
Outputs
names: A Tensor corresponding to the values of the feature_ids across multiple DataRecords
)doc");
class GetStringGroupAsTensor : public OpKernel {
private:
std::vector<int64> feature_ids;
string default_value;
public:
explicit GetStringGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids));
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
}
void Compute(OpKernelContext* context) override {
try {
std::function<const twml::DataRecord::StringFeatures &(const twml::DataRecord &)> f =
[](const twml::DataRecord& record) ->const twml::DataRecord::StringFeatures& { return record.getString(); };
ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f);
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetSparseBinaryAsTensor.
// Takes a resource handle input plus the `feature_id` (int) attr and declares
// three outputs: `ids` (int64), `keys` (int64) and `names` (string).
// The shape function returns OK without setting any output shape.
REGISTER_OP("GetSparseBinaryAsTensor")
.Input("data_record_handle: resource")
.Attr("feature_id: int")
.Output("ids: int64")
.Output("keys: int64")
.Output("names: string")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns tensors corresponding to the ids, keys and names of a particular
feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_id: Id representing the feature whose values will be extracted.
Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
names: DataRecord values(string)
)doc");
class GetSparseBinaryAsTensor : public OpKernel {
private:
int64 feature_id;
public:
explicit GetSparseBinaryAsTensor(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
}
void Compute(OpKernelContext* context) override {
try {
// We need two passes to the data:
// 1 to compute the output size of the tensor
// 2 to copy the values to the tensor
auto handle = getHandle<DataRecordResource>(context, 0);
const auto &records = handle->records;
// Creating a vector we increment every time a key is found
std::vector<std::string> temp_names;
std::vector<int64> temp_ids;
for (int64 id = 0; id < records.size(); id++) {
const auto &sparse_binary = records[id].getSparseBinary();
auto it = sparse_binary.find(feature_id);
// Find all instances of key in DataRecord
if (it != sparse_binary.end()) {
// insert to temp_names all the values in the dictionary value
temp_names.insert(temp_names.end(), it->second.begin(), it->second.end());
temp_ids.insert(temp_ids.end(), it->second.size(), id);
}
}
// The total_size will be the that of the saved vector
const int total_size = static_cast<int64>(temp_names.size());
TensorShape shape = {total_size};
Tensor* ids = nullptr;
Tensor* keys = nullptr;
Tensor* names = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names));
auto keys_flat = keys->flat<int64>();
auto names_flat = names->flat<string>();
auto ids_flat = ids->flat<int64>();
// The feature id value will always be the same
std::fill(keys_flat.data(), keys_flat.data() + total_size, feature_id);
std::copy(temp_names.begin(), temp_names.end(), names_flat.data());
std::copy(temp_ids.begin(), temp_ids.end(), ids_flat.data());
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetSparseContinuousAsTensor.
// Takes a resource handle input plus the `feature_id` (int) attr and declares
// four outputs: `ids` (int64), `keys` (int64), `names` (string) and `values` (float).
// The shape function returns OK without setting any output shape.
REGISTER_OP("GetSparseContinuousAsTensor")
.Input("data_record_handle: resource")
.Attr("feature_id: int")
.Output("ids: int64")
.Output("keys: int64")
.Output("names: string")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP that returns tensors corresponding to the ids, keys, names and values of a particular
feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_id: Id representing the feature whose values will be extracted.
Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
names: DataRecord values(string)
values: DataRecord values(float)
)doc");
class GetSparseContinuousAsTensor : public OpKernel {
private:
int64 feature_id;
public:
explicit GetSparseContinuousAsTensor(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
}
void Compute(OpKernelContext* context) override {
try {
// We need two passes to the data:
// 1 to compute the output size of the tensor
// 2 to copy the values to the tensor
auto handle = getHandle<DataRecordResource>(context, 0);
const auto &records = handle->records;
// Creating a vector we increment every time a key is found
std::vector<std::string> temp_names;
std::vector<float> temp_values;
std::vector<int64> temp_ids;
for (int64 id = 0; id < records.size(); id++) {
const auto &sparse_continuous = records[id].getSparseContinuous();
auto it = sparse_continuous.find(feature_id);
// Find all instances of key in DataRecord
if (it != sparse_continuous.end()) {
// insert to temp_names all the values in the dictionary value
auto value_map = it->second;
for (auto& elem : value_map) {
temp_names.push_back(elem.first);
temp_values.push_back(elem.second);
temp_ids.push_back(id);
}
}
}
// The total_size will be the that of the saved vector
const int total_size = static_cast<int64>(temp_names.size());
TensorShape shape = {total_size};
Tensor* ids = nullptr;
Tensor* keys = nullptr;
Tensor* names = nullptr;
Tensor* values = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names));
OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values));
auto keys_flat = keys->flat<int64>();
auto names_flat = names->flat<string>();
auto ids_flat = ids->flat<int64>();
auto values_flat = values->flat<float>();
// The feature id value will always be the same
std::fill(keys_flat.data(), keys_flat.data() + total_size, feature_id);
std::copy(temp_names.begin(), temp_names.end(), names_flat.data());
std::copy(temp_ids.begin(), temp_ids.end(), ids_flat.data());
std::copy(temp_values.begin(), temp_values.end(), values_flat.data());
} catch (const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Helper function to add ids, keys and values to common vector
inline void addIdsKeysValuesToVectors(
const int64 id,
const int64 key,
const double value,
std::vector<int64>& ids,
std::vector<int64>& keys,
std::vector<float>& values) {
ids.push_back(id);
keys.push_back(key);
values.push_back(value);
}
struct KeepFeatures {
KeepFeatures() : vec(), set() {}
template<typename ContainerType>
KeepFeatures(const std::vector<int64> &keep_features,
const ContainerType *const container) {
vec.reserve(keep_features.size());
#ifdef USE_DENSE_HASH
set.resize(keep_features.size());
set.set_empty_key(0);
#else
set.reserve(keep_features.size());
#endif // USE_DENSE_HASH
set.max_load_factor(0.5);
for (const auto &elem : keep_features) {
if (container->find(elem) == container->end()) continue;
vec.push_back(elem);
set.insert(elem);
}
}
size_t size() const {
return vec.size();
}
std::vector<int64> vec;
twml::Set<int64> set;
};
// Binary features: emits (current_id, feature id, 1) for every feature of the
// record that is also in keep_features.
// When the keep list is smaller than twice the record's feature count, each kept
// id is looked up in the record; otherwise the record is scanned and each
// feature is tested against the keep set.
void filterAndHashFeature(
    const twml::DataRecord::BinaryFeatures& features,
    const int64 current_id,
    const KeepFeatures &keep_features,
    std::vector<int64>& ids,
    std::vector<int64>& keys,
    std::vector<float>& values) {
  const bool probe_record = keep_features.size() < 2 * features.size();
  if (probe_record) {
    for (const int64 wanted : keep_features.vec) {
      const auto found = features.find(wanted);
      if (found != features.end()) {
        addIdsKeysValuesToVectors(current_id, *found, 1, ids, keys, values);
      }
    }
    return;
  }

  for (const auto &feature : features) {
    if (keep_features.set.find(feature) != keep_features.set.end()) {
      addIdsKeysValuesToVectors(current_id, feature, 1, ids, keys, values);
    }
  }
}
// Continuous features: emits (current_id, feature id, feature value) for every
// feature of the record that is also in keep_features.
// When the keep list is smaller than twice the record's feature count, each kept
// id is looked up in the record; otherwise the record is scanned and each
// feature is tested against the keep set.
void filterAndHashFeature(
    const twml::DataRecord::ContinuousFeatures& features,
    const int64 current_id,
    const KeepFeatures &keep_features,
    std::vector<int64>& ids,
    std::vector<int64>& keys,
    std::vector<float>& values) {
  const bool probe_record = keep_features.size() < 2 * features.size();
  if (probe_record) {
    for (const int64 wanted : keep_features.vec) {
      const auto found = features.find(wanted);
      if (found != features.end()) {
        addIdsKeysValuesToVectors(current_id, found->first, found->second, ids, keys, values);
      }
    }
    return;
  }

  for (const auto &entry : features) {
    if (keep_features.set.find(entry.first) != keep_features.set.end()) {
      addIdsKeysValuesToVectors(current_id, entry.first, entry.second, ids, keys, values);
    }
  }
}
// Discrete features: for every kept feature of the record, emits
// (current_id, twml::mixDiscreteIdAndValue(feature id, feature value), 1).
// When the keep list is smaller than twice the record's feature count, each kept
// id is looked up in the record; otherwise the record is scanned and each
// feature is tested against the keep set.
void filterAndHashFeature(
    const twml::DataRecord::DiscreteFeatures& features,
    const int64 current_id,
    const KeepFeatures &keep_features,
    std::vector<int64>& ids,
    std::vector<int64>& keys,
    std::vector<float>& values) {
  const bool probe_record = keep_features.size() < 2 * features.size();
  if (probe_record) {
    for (const int64 wanted : keep_features.vec) {
      const auto found = features.find(wanted);
      if (found != features.end()) {
        const int64_t mixed_key = twml::mixDiscreteIdAndValue(found->first, found->second);
        addIdsKeysValuesToVectors(current_id, mixed_key, 1, ids, keys, values);
      }
    }
    return;
  }

  for (const auto &entry : features) {
    if (keep_features.set.find(entry.first) != keep_features.set.end()) {
      const int64_t mixed_key = twml::mixDiscreteIdAndValue(entry.first, entry.second);
      addIdsKeysValuesToVectors(current_id, mixed_key, 1, ids, keys, values);
    }
  }
}
// String features: for every kept feature of the record, emits
// (current_id, twml::mixStringIdAndValue(feature id, string length, string bytes), 1).
// When the keep list is smaller than twice the record's feature count, each kept
// id is looked up in the record; otherwise the record is scanned and each
// feature is tested against the keep set.
void filterAndHashFeature(
    const twml::DataRecord::StringFeatures& features,
    const int64 current_id,
    const KeepFeatures &keep_features,
    std::vector<int64>& ids,
    std::vector<int64>& keys,
    std::vector<float>& values) {
  const bool probe_record = keep_features.size() < 2 * features.size();
  if (probe_record) {
    for (const int64 wanted : keep_features.vec) {
      const auto found = features.find(wanted);
      if (found != features.end()) {
        const auto &text = found->second;
        const int64_t mixed_key = twml::mixStringIdAndValue(
            found->first,
            text.size(),
            reinterpret_cast<const uint8_t*>(text.c_str()));
        addIdsKeysValuesToVectors(current_id, mixed_key, 1, ids, keys, values);
      }
    }
    return;
  }

  for (const auto &entry : features) {
    if (keep_features.set.find(entry.first) != keep_features.set.end()) {
      const auto &text = entry.second;
      const int64_t mixed_key = twml::mixStringIdAndValue(
          entry.first,
          text.size(),
          reinterpret_cast<const uint8_t*>(text.c_str()));
      addIdsKeysValuesToVectors(current_id, mixed_key, 1, ids, keys, values);
    }
  }
}
// Sparse binary features: for every kept feature of the record and every name
// stored under it, emits
// (current_id, twml::mixStringIdAndValue(feature id, name length, name bytes), 1).
// When the keep list is smaller than twice the record's feature count, each kept
// id is looked up in the record; otherwise the record is scanned and each
// feature is tested against the keep set.
void filterAndHashFeature(
    const twml::DataRecord::SparseBinaryFeatures& features,
    const int64 current_id,
    const KeepFeatures &keep_features,
    std::vector<int64>& ids,
    std::vector<int64>& keys,
    std::vector<float>& values) {
  const bool probe_record = keep_features.size() < 2 * features.size();
  if (probe_record) {
    for (const int64 wanted : keep_features.vec) {
      const auto found = features.find(wanted);
      if (found == features.end()) continue;
      for (const auto &name : found->second) {
        const int64_t mixed_key = twml::mixStringIdAndValue(
            found->first, name.size(), reinterpret_cast<const uint8_t*>(name.c_str()));
        addIdsKeysValuesToVectors(current_id, mixed_key, 1, ids, keys, values);
      }
    }
    return;
  }

  for (const auto &entry : features) {
    if (keep_features.set.find(entry.first) == keep_features.set.end()) continue;
    for (const auto &name : entry.second) {
      const int64_t mixed_key = twml::mixStringIdAndValue(
          entry.first, name.size(), reinterpret_cast<const uint8_t*>(name.c_str()));
      addIdsKeysValuesToVectors(current_id, mixed_key, 1, ids, keys, values);
    }
  }
}
// Sparse continuous features: for every kept feature of the record and every
// (name, value) pair stored under it, emits
// (current_id, twml::mixStringIdAndValue(feature id, name length, name bytes), value).
// When the keep list is smaller than twice the record's feature count, each kept
// id is looked up in the record; otherwise the record is scanned and each
// feature is tested against the keep set.
void filterAndHashFeature(
    const twml::DataRecord::SparseContinuousFeatures& features,
    const int64 current_id,
    const KeepFeatures &keep_features,
    std::vector<int64>& ids,
    std::vector<int64>& keys,
    std::vector<float>& values) {
  const bool probe_record = keep_features.size() < 2 * features.size();
  if (probe_record) {
    for (const int64 wanted : keep_features.vec) {
      const auto found = features.find(wanted);
      if (found == features.end()) continue;
      for (const auto &named_value : found->second) {
        const int64_t mixed_key = twml::mixStringIdAndValue(
            found->first,
            named_value.first.size(),
            reinterpret_cast<const uint8_t*>(named_value.first.c_str()));
        addIdsKeysValuesToVectors(current_id, mixed_key, named_value.second, ids, keys, values);
      }
    }
    return;
  }

  for (const auto &entry : features) {
    if (keep_features.set.find(entry.first) == keep_features.set.end()) continue;
    for (const auto &named_value : entry.second) {
      const int64_t mixed_key = twml::mixStringIdAndValue(
          entry.first,
          named_value.first.size(),
          reinterpret_cast<const uint8_t*>(named_value.first.c_str()));
      addIdsKeysValuesToVectors(current_id, mixed_key, named_value.second, ids, keys, values);
    }
  }
}
// Compatibility variant for sparse continuous features: the emitted key is
// twml::featureId(name) for each (name, value) pair, i.e. the enclosing feature
// id is not mixed into the key. Emits (current_id, key, value).
// When the keep list is smaller than twice the record's feature count, each kept
// id is looked up in the record; otherwise the record is scanned and each
// feature is tested against the keep set.
void filterAndHashFeatureCompat(
    const twml::DataRecord::SparseContinuousFeatures& features,
    const int64 current_id,
    const KeepFeatures &keep_features,
    std::vector<int64>& ids,
    std::vector<int64>& keys,
    std::vector<float>& values) {
  const bool probe_record = keep_features.size() < 2 * features.size();
  if (probe_record) {
    for (const int64 wanted : keep_features.vec) {
      const auto found = features.find(wanted);
      if (found == features.end()) continue;
      for (const auto &named_value : found->second) {
        const int64_t name_key = twml::featureId(named_value.first);
        addIdsKeysValuesToVectors(current_id, name_key, named_value.second, ids, keys, values);
      }
    }
    return;
  }

  for (const auto &entry : features) {
    if (keep_features.set.find(entry.first) == keep_features.set.end()) continue;
    for (const auto &named_value : entry.second) {
      const int64_t name_key = twml::featureId(named_value.first);
      addIdsKeysValuesToVectors(current_id, name_key, named_value.second, ids, keys, values);
    }
  }
}
// Appends to `out`, in order, every element of `in` that is a key of `map`.
void copy_if_exists(std::vector<int64>& out,
                    const std::vector<int64>& in,
                    const twml::Map<int64_t, int64_t> *const map) {
  out.reserve(in.size());
  const auto not_found = map->end();
  for (const int64 candidate : in) {
    if (map->find(candidate) != not_found) {
      out.push_back(candidate);
    }
  }
}
// Fills outputs 0/1/2 (ids, keys, values) of `context` with the kept features of
// every record in `handle`, all flattened into parallel 1-D tensors.
// The features of `handle->common` are filtered once and their keys/values are
// repeated at the start of every record's entries, tagged with that record's index.
// Per record, features are appended in this order: binary, continuous, discrete,
// string, sparse binary, sparse continuous.
// sparse_continuous_compatibility selects filterAndHashFeatureCompat instead of
// filterAndHashFeature for the sparse continuous features.
void ComputeHashedFeaturesAsTensor(OpKernelContext* context,
                                   const DataRecordResource *const handle,
                                   const KeepFeatures &binary_keep_features,
                                   const KeepFeatures &continuous_keep_features,
                                   const KeepFeatures &discrete_keep_features,
                                   const KeepFeatures &string_keep_features,
                                   const KeepFeatures &sparse_binary_keep_features,
                                   const KeepFeatures &sparse_continuous_keep_features,
                                   bool sparse_continuous_compatibility) {
  const auto &records = handle->records;
  const auto &common = handle->common;

  // Initial capacity guess: one entry per kept feature id.
  uint64_t estimated_size = (binary_keep_features.size() + continuous_keep_features.size() +
                             discrete_keep_features.size() + string_keep_features.size() +
                             sparse_binary_keep_features.size() +
                             sparse_continuous_keep_features.size());

  // --- Features stored in handle->common ---
  std::vector<int64> common_ids, common_keys;
  std::vector<float> common_values;
  common_ids.reserve(estimated_size);
  common_keys.reserve(estimated_size);
  common_values.reserve(estimated_size);

  filterAndHashFeature(common.getBinary(), 0, binary_keep_features,
                       common_ids, common_keys, common_values);
  filterAndHashFeature(common.getContinuous(), 0, continuous_keep_features,
                       common_ids, common_keys, common_values);
  filterAndHashFeature(common.getDiscrete(), 0, discrete_keep_features,
                       common_ids, common_keys, common_values);
  filterAndHashFeature(common.getString(), 0, string_keep_features,
                       common_ids, common_keys, common_values);
  filterAndHashFeature(common.getSparseBinary(), 0, sparse_binary_keep_features,
                       common_ids, common_keys, common_values);
  if (sparse_continuous_compatibility) {
    filterAndHashFeatureCompat(common.getSparseContinuous(), 0, sparse_continuous_keep_features,
                               common_ids, common_keys, common_values);
  } else {
    filterAndHashFeature(common.getSparseContinuous(), 0, sparse_continuous_keep_features,
                         common_ids, common_keys, common_values);
  }
  // The ids gathered for the common features are placeholders; the real record
  // index is written per record below.
  common_ids.clear();

  // --- Per-record features, each prefixed with the common ones ---
  estimated_size = (estimated_size + common_keys.size()) * records.size();
  std::vector<int64> all_ids, all_keys;
  std::vector<float> all_values;
  all_ids.reserve(estimated_size);
  all_keys.reserve(estimated_size);
  all_values.reserve(estimated_size);

  for (int64 id = 0; id < records.size(); id++) {
    const auto &record = records[id];

    all_ids.insert(all_ids.end(), common_keys.size(), id);
    all_keys.insert(all_keys.end(), common_keys.begin(), common_keys.end());
    all_values.insert(all_values.end(), common_values.begin(), common_values.end());

    filterAndHashFeature(record.getBinary(), id, binary_keep_features,
                         all_ids, all_keys, all_values);
    filterAndHashFeature(record.getContinuous(), id, continuous_keep_features,
                         all_ids, all_keys, all_values);
    filterAndHashFeature(record.getDiscrete(), id, discrete_keep_features,
                         all_ids, all_keys, all_values);
    filterAndHashFeature(record.getString(), id, string_keep_features,
                         all_ids, all_keys, all_values);
    filterAndHashFeature(record.getSparseBinary(), id, sparse_binary_keep_features,
                         all_ids, all_keys, all_values);
    if (sparse_continuous_compatibility) {
      filterAndHashFeatureCompat(record.getSparseContinuous(), id, sparse_continuous_keep_features,
                                 all_ids, all_keys, all_values);
    } else {
      filterAndHashFeature(record.getSparseContinuous(), id, sparse_continuous_keep_features,
                           all_ids, all_keys, all_values);
    }
  }

  // --- Copy the collected entries into the output tensors ---
  TensorShape out_shape = {static_cast<int64>(all_ids.size())};
  Tensor* ids = nullptr;
  Tensor* keys = nullptr;
  Tensor* values = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &ids));
  OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &keys));
  OP_REQUIRES_OK(context, context->allocate_output(2, out_shape, &values));

  auto ids_flat = ids->flat<int64>();
  auto keys_flat = keys->flat<int64>();
  auto values_flat = values->flat<float>();
  std::copy(all_ids.begin(), all_ids.end(), ids_flat.data());
  std::copy(all_keys.begin(), all_keys.end(), keys_flat.data());
  std::copy(all_values.begin(), all_values.end(), values_flat.data());
}
// Op registration for GetHashedFeaturesAsSparseTensor.
// Takes a resource handle input and one list(int) attr of feature ids to keep per
// feature type (binary, continuous, discrete, string, sparse binary, sparse
// continuous). Declares three outputs: `ids` (int64), `keys` (int64), `values` (float).
// The shape function returns OK without setting any output shape.
REGISTER_OP("GetHashedFeaturesAsSparseTensor")
.Input("data_record_handle: resource")
.Attr("binary_keep_features: list(int)")
.Attr("continuous_keep_features: list(int)")
.Attr("discrete_keep_features: list(int)")
.Attr("string_keep_features: list(int)")
.Attr("sparse_binary_keep_features: list(int)")
.Attr("sparse_continuous_keep_features: list(int)")
.Output("ids: int64")
.Output("keys: int64")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP for returning required features of different type as
a single sparse tensor. Hashing trick is applied.
Input
data_record_handle: Resource handle to DataRecord
Outputs
ids: ids specifies the index of the records in the batch (int64)
keys: DataRecord keys (int64)
values: DataRecord values (float)
)doc");
class GetHashedFeaturesAsSparseTensor: public OpKernel {
public:
explicit GetHashedFeaturesAsSparseTensor(OpKernelConstruction* context): OpKernel(context) {
// Get the list of features to keep for each feature type
OP_REQUIRES_OK(context, context->GetAttr("binary_keep_features", &binary_keep_features_));
OP_REQUIRES_OK(context, context->GetAttr("continuous_keep_features", &continuous_keep_features_));
OP_REQUIRES_OK(context, context->GetAttr("discrete_keep_features", &discrete_keep_features_));
OP_REQUIRES_OK(context, context->GetAttr("string_keep_features", &string_keep_features_));
OP_REQUIRES_OK(context, context->GetAttr("sparse_binary_keep_features", &sparse_binary_keep_features_));
OP_REQUIRES_OK(context, context->GetAttr("sparse_continuous_keep_features", &sparse_continuous_keep_features_));
}
private:
std::vector<int64> binary_keep_features_, continuous_keep_features_, discrete_keep_features_;
std::vector<int64> string_keep_features_, sparse_binary_keep_features_, sparse_continuous_keep_features_;
void Compute(OpKernelContext* context) override {
try {
auto handle = getHandle<DataRecordResource>(context, 0);
// Create a new list of keep features based on the original keep_set.
// This is to ensure compatibility with existing behavior such as:
// - Ensure no new features are decoded in this op.
// - Ensure labels or weights dont get included here.
// TODO: Should we return features requested by user here even if they are labels / weights?
KeepFeatures binary_keep_features(binary_keep_features_, handle->keep_map);
KeepFeatures continuous_keep_features(continuous_keep_features_, handle->keep_map);
KeepFeatures discrete_keep_features(discrete_keep_features_, handle->keep_map);
KeepFeatures string_keep_features(string_keep_features_, handle->keep_map);
KeepFeatures sparse_binary_keep_features(sparse_binary_keep_features_, handle->keep_map);
KeepFeatures sparse_continuous_keep_features(sparse_continuous_keep_features_, handle->keep_map);
ComputeHashedFeaturesAsTensor(context, handle.get(),
binary_keep_features,
continuous_keep_features,
discrete_keep_features,
string_keep_features,
sparse_binary_keep_features,
sparse_continuous_keep_features,
false);
} catch(const std::exception &e) {
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
}
}
};
// Op registration for GetHashedFeaturesAsSparseTensorV2.
// Compared with GetHashedFeaturesAsSparseTensor it additionally declares the
// `keep_features` / `keep_codes` list(int) attrs and a `decode_mode` int attr
// that defaults to 0. Declares three outputs: `ids` (int64), `keys` (int64),
// `values` (float).
// The shape function returns OK without setting any output shape.
REGISTER_OP("GetHashedFeaturesAsSparseTensorV2")
.Input("data_record_handle: resource")
.Attr("binary_keep_features: list(int)")
.Attr("continuous_keep_features: list(int)")
.Attr("discrete_keep_features: list(int)")
.Attr("string_keep_features: list(int)")
.Attr("sparse_binary_keep_features: list(int)")
.Attr("sparse_continuous_keep_features: list(int)")
.Attr("keep_features: list(int)")
.Attr("keep_codes: list(int)")
.Attr("decode_mode: int = 0")
.Output("ids: int64")
.Output("keys: int64")
.Output("values: float")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
return Status::OK();
}).Doc(R"doc(
A tensorflow OP for returning required features of different type as
a single sparse tensor. Hashing trick is applied.
Input
data_record_handle: Resource handle to DataRecord
Outputs
ids: ids specifies the index of the records in the batch (int64)
keys: DataRecord keys (int64)
values: DataRecord values (float)
)doc");
// Kernel that emits the kept features of every record as (ids, keys, values)
// through ComputeHashedFeaturesAsTensor. Unlike GetHashedFeaturesAsSparseTensor,
// the keep lists are filtered once at construction time against a map built from
// the `keep_features` / `keep_codes` attrs, not against the handle on each call.
class GetHashedFeaturesAsSparseTensorV2: public OpKernel {
 public:
  explicit GetHashedFeaturesAsSparseTensorV2(OpKernelConstruction* context): OpKernel(context) {
    std::vector<int64> keep_features;
    std::vector<int64> keep_codes;
    std::vector<int64> binary_ids, continuous_ids, discrete_ids;
    std::vector<int64> string_ids, sparse_binary_ids, sparse_continuous_ids;

    // Get the list of features to keep for each feature type
    OP_REQUIRES_OK(context, context->GetAttr("binary_keep_features", &binary_ids));
    OP_REQUIRES_OK(context, context->GetAttr("continuous_keep_features", &continuous_ids));
    OP_REQUIRES_OK(context, context->GetAttr("discrete_keep_features", &discrete_ids));
    OP_REQUIRES_OK(context, context->GetAttr("string_keep_features", &string_ids));
    OP_REQUIRES_OK(context, context->GetAttr("sparse_binary_keep_features", &sparse_binary_ids));
    OP_REQUIRES_OK(context, context->GetAttr("sparse_continuous_keep_features", &sparse_continuous_ids));
    OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features));
    OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes));
    OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode));

    // keep_codes is indexed in lock-step with keep_features below; reject
    // mismatched lists instead of reading past the end of keep_codes.
    OP_REQUIRES(context, keep_features.size() == keep_codes.size(),
                errors::InvalidArgument(
                    "keep_features and keep_codes must have the same number of elements"));

    twml::Map<int64_t, int64_t> keep_map;
#ifdef USE_DENSE_HASH
    keep_map.set_empty_key(0);
#endif  // USE_DENSE_HASH
    for (uint64_t i = 0; i < keep_features.size(); i++) {
      keep_map[keep_features[i]] = keep_codes[i];
    }

    // Only ids that are keys of keep_map survive in the per-type keep lists.
    binary_keep_features = KeepFeatures(binary_ids, &keep_map);
    continuous_keep_features = KeepFeatures(continuous_ids, &keep_map);
    discrete_keep_features = KeepFeatures(discrete_ids, &keep_map);
    string_keep_features = KeepFeatures(string_ids, &keep_map);
    sparse_binary_keep_features = KeepFeatures(sparse_binary_ids, &keep_map);
    sparse_continuous_keep_features = KeepFeatures(sparse_continuous_ids, &keep_map);
  }

 private:
  KeepFeatures binary_keep_features, continuous_keep_features, discrete_keep_features;
  KeepFeatures string_keep_features, sparse_binary_keep_features, sparse_continuous_keep_features;
  int64 m_decode_mode;

  void Compute(OpKernelContext* context) override {
    try {
      auto handle = getHandle<DataRecordResource>(context, 0);
      // The keep lists were already filtered in the constructor, so they are
      // passed through unchanged. decode_mode == 0 selects the compat key
      // computation for sparse continuous features.
      ComputeHashedFeaturesAsTensor(context, handle.get(),
                                    binary_keep_features,
                                    continuous_keep_features,
                                    discrete_keep_features,
                                    string_keep_features,
                                    sparse_binary_keep_features,
                                    sparse_continuous_keep_features,
                                    m_decode_mode == 0);
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};
// Registers the CPU DecodeDataRecord kernel for one InputType.
// The last line of the macro body must not end with a line continuation:
// a trailing backslash there makes the next source line part of the macro.
#define REGISTER_DECODE_DATA_RECORD(InputType)  \
  REGISTER_KERNEL_BUILDER(                      \
    Name("DecodeDataRecord")                    \
    .Device(DEVICE_CPU)                         \
    .TypeConstraint<InputType>("InputType"),    \
    DecodeDataRecord<InputType>);

REGISTER_DECODE_DATA_RECORD(uint8)
REGISTER_DECODE_DATA_RECORD(string)
// Kernel-registration helper macros. Each one registers, on CPU, the kernel class
// whose name is built from FIELD under the op of the same name:
//   REGISTER_GETTER(X)                 -> GetXFeatures
//   REGISTER_GETTER_FROM_DR(X)         -> GetXFromDataRecord
//   REGISTER_GETTER_AS_TENSOR(X)       -> GetXAsTensor
//   REGISTER_GETTER_GROUP_AS_TENSOR(X) -> GetXGroupAsTensor
// The last line of each macro body must not end with a line continuation:
// a trailing backslash there makes the next source line part of the macro.
#define REGISTER_GETTER(FIELD)          \
  REGISTER_KERNEL_BUILDER(              \
    Name("Get" #FIELD "Features")       \
    .Device(DEVICE_CPU),                \
    Get##FIELD##Features);

#define REGISTER_GETTER_FROM_DR(FIELD)  \
  REGISTER_KERNEL_BUILDER(              \
    Name("Get" #FIELD "FromDataRecord") \
    .Device(DEVICE_CPU),                \
    Get##FIELD##FromDataRecord);

#define REGISTER_GETTER_AS_TENSOR(FIELD)  \
  REGISTER_KERNEL_BUILDER(                \
    Name("Get" #FIELD "AsTensor")         \
    .Device(DEVICE_CPU),                  \
    Get##FIELD##AsTensor);

#define REGISTER_GETTER_GROUP_AS_TENSOR(FIELD)  \
  REGISTER_KERNEL_BUILDER(                      \
    Name("Get" #FIELD "GroupAsTensor")          \
    .Device(DEVICE_CPU),                        \
    Get##FIELD##GroupAsTensor);
// Kernel registrations for the per-type getter ops (Get<FIELD>Features).
REGISTER_GETTER(Binary)
REGISTER_GETTER(Continuous)
REGISTER_GETTER(Discrete)
REGISTER_GETTER(String)
REGISTER_GETTER(SparseBinary)
REGISTER_GETTER(SparseContinuous)
// Kernel registrations for the Get<FIELD>FromDataRecord ops.
REGISTER_GETTER_FROM_DR(BatchSize)
REGISTER_GETTER_FROM_DR(Labels)
REGISTER_GETTER_FROM_DR(Weights)
// Kernel registrations for the single-feature Get<FIELD>AsTensor ops.
REGISTER_GETTER_AS_TENSOR(Binary)
REGISTER_GETTER_AS_TENSOR(Continuous)
REGISTER_GETTER_AS_TENSOR(Discrete)
REGISTER_GETTER_AS_TENSOR(String)
REGISTER_GETTER_AS_TENSOR(SparseBinary)
REGISTER_GETTER_AS_TENSOR(SparseContinuous)
// Kernel registrations for the multi-feature Get<FIELD>GroupAsTensor ops.
REGISTER_GETTER_GROUP_AS_TENSOR(Binary)
REGISTER_GETTER_GROUP_AS_TENSOR(Continuous)
REGISTER_GETTER_GROUP_AS_TENSOR(Discrete)
REGISTER_GETTER_GROUP_AS_TENSOR(String)
// The hashed-feature ops are registered directly, without a helper macro.
REGISTER_KERNEL_BUILDER(
Name("GetHashedFeaturesAsSparseTensor")
.Device(DEVICE_CPU),
GetHashedFeaturesAsSparseTensor);
REGISTER_KERNEL_BUILDER(
Name("GetHashedFeaturesAsSparseTensorV2")
.Device(DEVICE_CPU),
GetHashedFeaturesAsSparseTensorV2);