mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-01 00:38:46 +02:00
ef4c5eb65e
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
1892 lines
71 KiB
C++
1892 lines
71 KiB
C++
#include "tensorflow/core/framework/op.h"
|
|
#include "tensorflow/core/framework/shape_inference.h"
|
|
#include "tensorflow/core/framework/op_kernel.h"
|
|
|
|
#include <twml.h>
|
|
#include <twml/functions.h>
|
|
#include <twml/utilities.h>
|
|
#include "tensorflow_utils.h"
|
|
#include "resource_utils.h"
|
|
|
|
#include <algorithm>
|
|
|
|
using std::string;
|
|
|
|
// Registers the "DecodeDataRecord" op: one serialized input tensor (uint8 or
// string) in, one scalar resource handle out. The feature-selection lists are
// passed as attrs.
REGISTER_OP("DecodeDataRecord")
    .Attr("InputType: {uint8, string}")
    .Attr("keep_features: list(int)")
    .Attr("keep_codes: list(int)")
    .Attr("label_features: list(int)")
    .Attr("weight_features: list(int) = []")
    .Input("input_bytes: InputType")
    .Output("data_record_handle: resource")
    .SetShapeFn(shape_inference::ScalarShape)
    .Doc(R"doc(
A tensorflow OP that creates a handle for the datarecord.

Attr
keep_features: a list of int ids to keep.
keep_codes: their corresponding code.
label_features: list of feature ids representing the labels.
weight_features: list of feature ids representing the weights. Defaults to empty list.
shared_name: name used by the resource handle inside the resource manager.
container: name used by the container of the resources.

shared_name and container are required when inheriting from ResourceOpKernel.

Input
input_bytes: Input tensor containing the serialized batch of HashedDataRecords.

Outputs
data_record_handle: A resource handle to the DataRecord struct.
)doc");
|
|
|
|
template<typename InputType>
|
|
class DecodeDataRecord : public OpKernel {
|
|
public:
|
|
explicit DecodeDataRecord(OpKernelConstruction* context)
|
|
: OpKernel(context) {
|
|
std::vector<int64> keep_features;
|
|
std::vector<int64> keep_codes;
|
|
|
|
std::vector<int64> label_features;
|
|
std::vector<int64> weight_features;
|
|
|
|
OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features));
|
|
OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes));
|
|
OP_REQUIRES_OK(context, context->GetAttr("label_features", &label_features));
|
|
OP_REQUIRES_OK(context, context->GetAttr("weight_features", &weight_features));
|
|
|
|
OP_REQUIRES(context, keep_features.size() == keep_codes.size(),
|
|
errors::InvalidArgument("keep keys and values must have same size."));
|
|
|
|
#ifdef USE_DENSE_HASH
|
|
m_keep_map.set_empty_key(0);
|
|
m_labels_map.set_empty_key(0);
|
|
m_weights_map.set_empty_key(0);
|
|
#endif // USE_DENSE_HASH
|
|
|
|
for (uint64_t i = 0; i < keep_features.size(); i++) {
|
|
m_keep_map[keep_features[i]] = keep_codes[i];
|
|
}
|
|
|
|
for (uint64_t i = 0; i < label_features.size(); i++) {
|
|
m_labels_map[label_features[i]] = i;
|
|
}
|
|
|
|
for (uint64_t i = 0; i < weight_features.size(); i++) {
|
|
m_weights_map[weight_features[i]] = i;
|
|
}
|
|
}
|
|
|
|
private:
|
|
twml::Map<int64_t, int64_t> m_keep_map;
|
|
twml::Map<int64_t, int64_t> m_labels_map;
|
|
twml::Map<int64_t, int64_t> m_weights_map;
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
DataRecordResource *resource = nullptr;
|
|
OP_REQUIRES_OK(context, makeResourceHandle<DataRecordResource>(context, 0, &resource));
|
|
|
|
// Store the input bytes in the resource so it isnt freed before the resource.
|
|
// This is necessary because we are not copying the contents for tensors.
|
|
resource->input = context->input(0);
|
|
int batch_size = getBatchSize<InputType>(resource->input);
|
|
int num_labels = static_cast<int>(m_labels_map.size());
|
|
int num_weights = static_cast<int>(m_weights_map.size());
|
|
|
|
twml::DataRecordReader reader;
|
|
reader.setKeepMap(&m_keep_map);
|
|
reader.setLabelsMap(&m_labels_map);
|
|
|
|
// Do not set weight map if it is empty. This will take a faster path.
|
|
if (num_weights != 0) {
|
|
reader.setWeightsMap(&m_weights_map);
|
|
}
|
|
|
|
resource->records.clear();
|
|
resource->records.reserve(batch_size);
|
|
for (int i = 0; i < batch_size; i++) {
|
|
resource->records.emplace_back(num_labels, num_weights);
|
|
}
|
|
|
|
for (int64 id = 0; id < batch_size; id++) {
|
|
const uint8_t *input_bytes = getInputBytes<InputType>(resource->input, id);
|
|
reader.setBuffer(input_bytes);
|
|
// decode the reader
|
|
resource->records[id].decode(reader);
|
|
}
|
|
// This should be fine because m_keep_map should never go out of scope.
|
|
resource->keep_map = &m_keep_map;
|
|
resource->num_weights = num_weights;
|
|
resource->num_labels = num_labels;
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Returns how many ids in the binary feature set are present in keep_map.
int64_t count_if_exists(const twml::DataRecord::BinaryFeatures &set,
                        const twml::Map<int64_t, int64_t> *const keep_map) {
  int64_t kept = 0;
  for (const auto &feature_id : set) {
    if (keep_map->find(feature_id) != keep_map->end()) {
      ++kept;
    }
  }
  return kept;
}
|
|
|
|
// This works for continuous, discrete, and string features
|
|
template<typename V>
|
|
int64_t count_if_exists(const twml::Map<int64_t, V> &map,
|
|
const twml::Map<int64_t, int64_t> *const keep_map) {
|
|
int64_t count = 0;
|
|
for (const auto &elem : map) {
|
|
if (keep_map->find(elem.first) == keep_map->end()) continue;
|
|
count++;
|
|
}
|
|
return count;
|
|
}
|
|
|
|
// Sparse-binary overload: for every feature whose id is present in keep_map,
// adds the size of its value collection to the total.
int64_t count_if_exists(const twml::DataRecord::SparseBinaryFeatures &map,
                        const twml::Map<int64_t, int64_t> *const keep_map) {
  int64_t total = 0;
  for (const auto &entry : map) {
    if (keep_map->find(entry.first) != keep_map->end()) {
      total += entry.second.size();
    }
  }
  return total;
}
|
|
|
|
// Sparse-continuous overload: for every feature whose id is present in
// keep_map, adds the size of its inner collection to the total.
int64_t count_if_exists(const twml::DataRecord::SparseContinuousFeatures &map,
                        const twml::Map<int64_t, int64_t> *const keep_map) {
  int64_t total = 0;
  for (const auto &entry : map) {
    if (keep_map->find(entry.first) != keep_map->end()) {
      total += entry.second.size();
    }
  }
  return total;
}
|
|
|
|
// Registers the "GetBinaryFeatures" op: a DataRecord resource handle in,
// three outputs (ids, keys, values) out.
REGISTER_OP("GetBinaryFeatures")
    .Input("data_record_handle: resource")
    .Output("ids: int64")
    .Output("keys: int64")
    .Output("values: float")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* /*c*/) {
      // Performs no shape inference; always reports success.
      return Status::OK();
    })
    .Doc(R"doc(
A tensorflow OP that reads binary features
Input
data_record_handle: Resource handle to DataRecord

Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
values: always set to 1 (float)
)doc");
|
|
|
|
class GetBinaryFeatures : public OpKernel {
|
|
public:
|
|
explicit GetBinaryFeatures(OpKernelConstruction* context)
|
|
: OpKernel(context) {}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
const auto &common = handle->common;
|
|
|
|
int64 common_binary_size = count_if_exists(common.getBinary(), handle->keep_map);
|
|
int64 total_binary_size = records.size() * common_binary_size;
|
|
for (int id = 0; id < records.size(); id++) {
|
|
total_binary_size += count_if_exists(handle->records[id].getBinary(), handle->keep_map);
|
|
}
|
|
const int total_size = static_cast<int>(total_binary_size);
|
|
|
|
TensorShape shape = {total_size};
|
|
Tensor* keys = nullptr;
|
|
Tensor* ids = nullptr;
|
|
Tensor* values = nullptr;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
|
|
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
|
|
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values));
|
|
|
|
uint64_t offset = 0;
|
|
auto keys_flat = keys->flat<int64>();
|
|
auto ids_flat = ids->flat<int64>();
|
|
auto values_flat = values->flat<float>();
|
|
|
|
for (int64 id = 0; id < records.size(); id++) {
|
|
for (const auto &it : common.getBinary()) {
|
|
if (handle->keep_map->find(it) == handle->keep_map->end()) continue;
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it;
|
|
offset++;
|
|
}
|
|
for (const auto &it : records[id].getBinary()) {
|
|
if (handle->keep_map->find(it) == handle->keep_map->end()) continue;
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it;
|
|
offset++;
|
|
}
|
|
}
|
|
// All the values for binary features are 1.
|
|
std::fill(values_flat.data(), values_flat.data() + total_size, 1);
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Registers the "GetContinuousFeatures" op: a DataRecord resource handle in,
// three outputs (ids, keys, values) out.
REGISTER_OP("GetContinuousFeatures")
    .Input("data_record_handle: resource")
    .Output("ids: int64")
    .Output("keys: int64")
    .Output("values: float")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* /*c*/) {
      // Performs no shape inference; always reports success.
      return Status::OK();
    })
    .Doc(R"doc(
A tensorflow OP that reads continuous features
Input
data_record_handle: Resource handle to DataRecord

Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: Datarecord keys (int64)
values: Datarecord values(float)
)doc");
|
|
|
|
class GetContinuousFeatures : public OpKernel {
|
|
public:
|
|
explicit GetContinuousFeatures(OpKernelConstruction* context)
|
|
: OpKernel(context) {}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
const auto &common = handle->common;
|
|
|
|
int64 common_continuous_size = count_if_exists(common.getContinuous(), handle->keep_map);
|
|
int64 total_continuous_size = records.size() * common_continuous_size;
|
|
for (int id = 0; id < records.size(); id++) {
|
|
total_continuous_size += count_if_exists(handle->records[id].getContinuous(),
|
|
handle->keep_map);
|
|
}
|
|
const int total_size = static_cast<int>(total_continuous_size);
|
|
|
|
TensorShape shape = {total_size};
|
|
Tensor* keys = nullptr;
|
|
Tensor* values = nullptr;
|
|
Tensor* ids = nullptr;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
|
|
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
|
|
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values));
|
|
|
|
uint64_t offset = 0;
|
|
auto keys_flat = keys->flat<int64>();
|
|
auto values_flat = values->flat<float>();
|
|
auto ids_flat = ids->flat<int64>();
|
|
|
|
for (int64 id = 0; id < records.size(); id++) {
|
|
for (const auto &it : common.getContinuous()) {
|
|
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it.first;
|
|
values_flat(offset) = it.second;
|
|
offset++;
|
|
}
|
|
for (const auto &it : records[id].getContinuous()) {
|
|
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it.first;
|
|
values_flat(offset) = it.second;
|
|
offset++;
|
|
}
|
|
}
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Registers the "GetDiscreteFeatures" op: a DataRecord resource handle in,
// three int64 outputs (ids, keys, values) out.
REGISTER_OP("GetDiscreteFeatures")
    .Input("data_record_handle: resource")
    .Output("ids: int64")
    .Output("keys: int64")
    .Output("values: int64")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* /*c*/) {
      // Performs no shape inference; always reports success.
      return Status::OK();
    })
    .Doc(R"doc(
A tensorflow OP that reads discrete features
Input
data_record_handle: Resource handle to DataRecord

Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
values: DataRecord values(int64)
)doc");
|
|
|
|
class GetDiscreteFeatures : public OpKernel {
|
|
public:
|
|
explicit GetDiscreteFeatures(OpKernelConstruction* context)
|
|
: OpKernel(context) {}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
const auto &common = handle->common;
|
|
|
|
int64 common_discrete_size = count_if_exists(common.getDiscrete(), handle->keep_map);
|
|
int64 total_discrete_size = records.size() * common_discrete_size;
|
|
for (int id = 0; id < records.size(); id++) {
|
|
total_discrete_size += count_if_exists(handle->records[id].getDiscrete(),
|
|
handle->keep_map);
|
|
}
|
|
const int total_size = static_cast<int>(total_discrete_size);
|
|
|
|
TensorShape shape = {total_size};
|
|
Tensor* keys = nullptr;
|
|
Tensor* values = nullptr;
|
|
Tensor* ids = nullptr;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
|
|
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
|
|
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values));
|
|
|
|
uint64_t offset = 0;
|
|
auto keys_flat = keys->flat<int64>();
|
|
auto values_flat = values->flat<int64>();
|
|
auto ids_flat = ids->flat<int64>();
|
|
|
|
for (int64 id = 0; id < records.size(); id++) {
|
|
for (const auto &it : common.getDiscrete()) {
|
|
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it.first;
|
|
values_flat(offset) = it.second;
|
|
offset++;
|
|
}
|
|
for (const auto &it : records[id].getDiscrete()) {
|
|
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it.first;
|
|
values_flat(offset) = it.second;
|
|
offset++;
|
|
}
|
|
}
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Registers the "GetStringFeatures" op: a DataRecord resource handle in,
// four outputs (ids, keys, names, values) out.
REGISTER_OP("GetStringFeatures")
    .Input("data_record_handle: resource")
    .Output("ids: int64")
    .Output("keys: int64")
    .Output("names: string")
    .Output("values: float")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* /*c*/) {
      // Performs no shape inference; always reports success.
      return Status::OK();
    })
    .Doc(R"doc(
A tensorflow OP that reads string features
Input
data_record_handle: Resource handle to DataRecord

Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
names: DataRecord values(string)
values: always set to 1 (float)
)doc");
|
|
|
|
class GetStringFeatures : public OpKernel {
|
|
public:
|
|
explicit GetStringFeatures(OpKernelConstruction* context)
|
|
: OpKernel(context) {}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
const auto &common = handle->common;
|
|
|
|
int64 common_string_size = count_if_exists(common.getString(), handle->keep_map);
|
|
int64 total_string_size = records.size() * common_string_size;
|
|
for (int id = 0; id < records.size(); id++) {
|
|
total_string_size += count_if_exists(handle->records[id].getString(),
|
|
handle->keep_map);
|
|
}
|
|
const int total_size = static_cast<int>(total_string_size);
|
|
|
|
TensorShape shape = {total_size};
|
|
Tensor* keys = nullptr;
|
|
Tensor* names = nullptr;
|
|
Tensor* ids = nullptr;
|
|
Tensor*values = nullptr;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
|
|
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
|
|
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names));
|
|
OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values));
|
|
|
|
uint64_t offset = 0;
|
|
auto keys_flat = keys->flat<int64>();
|
|
auto names_flat = names->flat<string>();
|
|
auto ids_flat = ids->flat<int64>();
|
|
auto values_flat = values->flat<float>();
|
|
|
|
std::fill(values_flat.data(), values_flat.data() + total_size, 1);
|
|
for (int64 id = 0; id < records.size(); id++) {
|
|
for (const auto &it : common.getString()) {
|
|
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it.first;
|
|
names_flat(offset) = it.second;
|
|
offset++;
|
|
}
|
|
for (const auto &it : records[id].getString()) {
|
|
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it.first;
|
|
names_flat(offset) = it.second;
|
|
offset++;
|
|
}
|
|
}
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Registers the "GetSparseBinaryFeatures" op: a DataRecord resource handle
// in, four outputs (ids, keys, names, values) out.
REGISTER_OP("GetSparseBinaryFeatures")
    .Input("data_record_handle: resource")
    .Output("ids: int64")
    .Output("keys: int64")
    .Output("names: string")
    .Output("values: float")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* /*c*/) {
      // Performs no shape inference; always reports success.
      return Status::OK();
    })
    .Doc(R"doc(
A tensorflow OP that reads sparse binary features
Input
data_record_handle: Resource handle to DataRecord

Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
names: DataRecord values(string)
values: always set to 1 (float)
)doc");
|
|
|
|
class GetSparseBinaryFeatures : public OpKernel {
|
|
public:
|
|
explicit GetSparseBinaryFeatures(OpKernelConstruction* context)
|
|
: OpKernel(context) {}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
const auto &common = handle->common;
|
|
|
|
int64 common_sparse_binary_size = count_if_exists(common.getSparseBinary(), handle->keep_map);
|
|
int64 total_sparse_binary_size = records.size() * common_sparse_binary_size;
|
|
for (int id = 0; id < records.size(); id++) {
|
|
total_sparse_binary_size += count_if_exists(handle->records[id].getSparseBinary(),
|
|
handle->keep_map);
|
|
}
|
|
const int total_size = static_cast<int>(total_sparse_binary_size);
|
|
|
|
TensorShape shape = {total_size};
|
|
Tensor* keys = nullptr;
|
|
Tensor* names = nullptr;
|
|
Tensor* ids = nullptr;
|
|
Tensor* values = nullptr;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
|
|
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
|
|
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names));
|
|
OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values));
|
|
|
|
uint64_t offset = 0;
|
|
auto keys_flat = keys->flat<int64>();
|
|
auto names_flat = names->flat<string>();
|
|
auto ids_flat = ids->flat<int64>();
|
|
auto values_flat = values->flat<float>();
|
|
|
|
// All the values for sparse binary features are 1.
|
|
std::fill(values_flat.data(), values_flat.data() + total_size, 1);
|
|
for (int64 id = 0; id < records.size(); id++) {
|
|
for (const auto &it : common.getSparseBinary()) {
|
|
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
|
|
for (const auto &it_inner : it.second) {
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it.first;
|
|
names_flat(offset) = it_inner;
|
|
offset++;
|
|
}
|
|
}
|
|
for (const auto &it : records[id].getSparseBinary()) {
|
|
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
|
|
for (const auto &it_inner : it.second) {
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it.first;
|
|
names_flat(offset) = it_inner;
|
|
offset++;
|
|
}
|
|
}
|
|
}
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Registers the "GetSparseContinuousFeatures" op: a DataRecord resource
// handle in, four outputs (ids, keys, values, names) out.
REGISTER_OP("GetSparseContinuousFeatures")
    .Input("data_record_handle: resource")
    .Output("ids: int64")
    .Output("keys: int64")
    .Output("values: float")
    .Output("names: string")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* /*c*/) {
      // Performs no shape inference; always reports success.
      return Status::OK();
    })
    .Doc(R"doc(
A tensorflow OP that reads sparse continuous features
Input
data_record_handle: Resource handle to DataRecord

Outputs
ids: ids specifies the index of the records[id] in the batch (int64)
keys: DataRecord keys (int64)
values: DataRecord values(float)
names: DataRecord values(string)
)doc");
|
|
|
|
class GetSparseContinuousFeatures : public OpKernel {
|
|
public:
|
|
explicit GetSparseContinuousFeatures(OpKernelConstruction* context)
|
|
: OpKernel(context) {}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
const auto &common = handle->common;
|
|
|
|
int64 common_sparse_continuous_size = count_if_exists(common.getSparseContinuous(),
|
|
handle->keep_map);
|
|
int64 total_sparse_continuous_size = records.size() * common_sparse_continuous_size;
|
|
for (int id = 0; id < records.size(); id++) {
|
|
total_sparse_continuous_size += count_if_exists(handle->records[id].getSparseContinuous(),
|
|
handle->keep_map);
|
|
}
|
|
const int total_size = static_cast<int>(total_sparse_continuous_size);
|
|
|
|
TensorShape shape = {total_size};
|
|
Tensor* keys = nullptr;
|
|
Tensor* values = nullptr;
|
|
Tensor* names = nullptr;
|
|
Tensor* ids = nullptr;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
|
|
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
|
|
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values));
|
|
OP_REQUIRES_OK(context, context->allocate_output(3, shape, &names));
|
|
|
|
uint64_t offset = 0;
|
|
auto keys_flat = keys->flat<int64>();
|
|
auto values_flat = values->flat<float>();
|
|
auto names_flat = names->flat<string>();
|
|
auto ids_flat = ids->flat<int64>();
|
|
|
|
for (int64 id = 0; id < records.size(); id++) {
|
|
// copying the contents of the maps of maps
|
|
for (const auto &it : common.getSparseContinuous()) {
|
|
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
|
|
// for each id; iterate through the number of maps corresponding to that id
|
|
for (const auto &it_inner : it.second) {
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it.first;
|
|
names_flat(offset) = it_inner.first;
|
|
values_flat(offset) = it_inner.second;
|
|
offset++;
|
|
}
|
|
}
|
|
// copying the contents of the maps of maps
|
|
for (const auto &it : records[id].getSparseContinuous()) {
|
|
if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue;
|
|
// for each id; iterate through the number of maps corresponding to that id
|
|
for (const auto &it_inner : it.second) {
|
|
ids_flat(offset) = id;
|
|
keys_flat(offset) = it.first;
|
|
names_flat(offset) = it_inner.first;
|
|
values_flat(offset) = it_inner.second;
|
|
offset++;
|
|
}
|
|
}
|
|
}
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Registers the "GetBatchSizeFromDataRecord" op: a DataRecord resource handle
// in, a scalar int64 out.
REGISTER_OP("GetBatchSizeFromDataRecord")
    .Input("data_record_handle: resource")
    .Output("batch_size: int64")
    .SetShapeFn(shape_inference::ScalarShape)
    .Doc(R"doc(
A tensorflow OP that returns batch size from the data record.
Input
data_record_handle: Resource handle to DataRecord

Outputs
batch_size: Number of records held in the handle.
)doc");
|
|
|
|
class GetBatchSizeFromDataRecord : public OpKernel {
|
|
public:
|
|
explicit GetBatchSizeFromDataRecord(OpKernelConstruction* context)
|
|
: OpKernel(context) {}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
Tensor *output;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output));
|
|
output->scalar<int64>()() = handle->records.size();
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Registers the "GetLabelsFromDataRecord" op: a DataRecord resource handle
// in, a float tensor of labels out; default_label is a required float attr.
REGISTER_OP("GetLabelsFromDataRecord")
    .Input("data_record_handle: resource")
    .Output("labels: float")
    .Attr("default_label: float")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* /*c*/) {
      // Performs no shape inference; always reports success.
      return Status::OK();
    })
    .Doc(R"doc(
A tensorflow OP that returns labels from the data record.

Attr
default_label: The value used when a label is absent in a data record.

Input
data_record_handle: Resource handle to DataRecord

Outputs
labels: A 2D tensor of size [batch_size, num_labels] containing the label values.
)doc");
|
|
|
|
class GetLabelsFromDataRecord : public OpKernel {
|
|
private:
|
|
float default_label;
|
|
|
|
public:
|
|
explicit GetLabelsFromDataRecord(OpKernelConstruction* context)
|
|
: OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("default_label", &default_label));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
const int num_labels = static_cast<int>(handle->num_labels);
|
|
TensorShape shape = {static_cast<int64>(handle->records.size()), num_labels};
|
|
|
|
Tensor *labels;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &labels));
|
|
|
|
// The default value of label is not present in data record is std::nanf
|
|
// For continuous labels, change that to a default_label or label.
|
|
auto func = [this](float label) -> float {
|
|
return std::isnan(label) ? default_label : label;
|
|
};
|
|
|
|
auto labels_data = labels->flat<float>().data();
|
|
for (const auto &record : records) {
|
|
const auto& rec_labels = record.labels();
|
|
labels_data = std::transform(rec_labels.begin(), rec_labels.end(), labels_data, func);
|
|
}
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Registers the "GetWeightsFromDataRecord" op: a DataRecord resource handle
// in, a float tensor of weights out.
REGISTER_OP("GetWeightsFromDataRecord")
    .Input("data_record_handle: resource")
    .Output("weights: float")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* /*c*/) {
      // Performs no shape inference; always reports success.
      return Status::OK();
    })
    .Doc(R"doc(
A tensorflow OP that returns weights from the data record.
Input
data_record_handle: Resource handle to DataRecord

Outputs
weights: A 2D tensor of size [batch_size, num_weights] containing the weight values.
)doc");
|
|
|
|
class GetWeightsFromDataRecord : public OpKernel {
|
|
public:
|
|
explicit GetWeightsFromDataRecord(OpKernelConstruction* context)
|
|
: OpKernel(context) {}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
const int num_weights = static_cast<int>(handle->num_weights);
|
|
TensorShape shape = {static_cast<int64>(handle->records.size()), num_weights};
|
|
|
|
Tensor *weights;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &weights));
|
|
|
|
auto weights_data = weights->flat<float>().data();
|
|
for (const auto &record : records) {
|
|
const auto& rec_weights = record.weights();
|
|
weights_data = std::copy(rec_weights.begin(), rec_weights.end(), weights_data);
|
|
}
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
template<typename ValueType, typename FeatureType, typename TensorType>
|
|
void SetValueGroup(
|
|
const FeatureType& type,
|
|
const int64& feature_id,
|
|
const int64& id,
|
|
const ValueType& default_value,
|
|
TensorType values_flat) {
|
|
auto it = type.find(feature_id);
|
|
values_flat(id) = (it == type.end()) ? default_value : it->second;
|
|
}
|
|
|
|
template<typename ValueType, typename TensorType>
|
|
// overloading for BinaryFeatures; as it needs to set a value of 1
|
|
void SetValueGroup(
|
|
const twml::DataRecord::BinaryFeatures& type,
|
|
const int64& feature_id,
|
|
const int64& id,
|
|
const ValueType& default_value,
|
|
TensorType values_flat) {
|
|
auto it = type.find(feature_id);
|
|
values_flat(id) = (it == type.end()) ? default_value : 1;
|
|
}
|
|
|
|
// Helper for Group Extraction of Dense Features
|
|
template<typename ValueType, typename FeatureType>
|
|
void ComputeHelperGroupFeaturesAsTensors(
|
|
OpKernelContext* context,
|
|
const std::vector<int64>& feature_ids,
|
|
ValueType& default_value,
|
|
std::function<const FeatureType&(const twml::DataRecord&)> f) {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
// Output shape is 2D; where the first dimension corresponds to the batch_size
|
|
// and the second corresponds to the number of features passed to the TF Op.
|
|
const int batch_size = static_cast<int64>(handle->records.size());
|
|
const int num_feature_ids = static_cast<int>(feature_ids.size());
|
|
TensorShape shape = {batch_size, num_feature_ids};
|
|
|
|
// Define the output
|
|
Tensor* values = nullptr;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &values));
|
|
auto values_flat = values->flat<ValueType>();
|
|
|
|
for (int64 id = 0; id < records.size(); id++) {
|
|
const auto &type = f(records[id]);
|
|
const auto id_offset = id * feature_ids.size();
|
|
for (int64 fid = 0; fid < feature_ids.size(); fid++) {
|
|
auto feature_id = feature_ids[fid];
|
|
// The value is set to default if it does not exist in the current DataRecord
|
|
SetValueGroup(type, feature_id, id_offset + fid, default_value, values_flat);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Helper for Single Extraction of Dense Features
|
|
template<typename ValueType, typename FeatureType>
|
|
void ComputeHelperFeaturesAsTensors(
|
|
OpKernelContext* context,
|
|
ValueType& default_value,
|
|
int64 feature_id,
|
|
std::function<const FeatureType&(const twml::DataRecord&)> f) {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
// Output shape is 2D; where the first dimension corresponds to the batch_size
|
|
// and the second corresponds to the number of features passed to the TF Op.
|
|
const int total_size = static_cast<int64>(handle->records.size());
|
|
TensorShape shape = {total_size};
|
|
|
|
// Define the output
|
|
Tensor* values = nullptr;
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &values));
|
|
auto values_flat = values->flat<ValueType>();
|
|
for (int64 id = 0; id < records.size(); id++) {
|
|
const auto &type = f(records[id]);
|
|
SetValueGroup(type, feature_id, id, default_value, values_flat);
|
|
}
|
|
}
|
|
|
|
// Registers the "GetBinaryAsTensor" op: a DataRecord resource handle in, a
// float tensor out; feature_id and default_value are required attrs.
REGISTER_OP("GetBinaryAsTensor")
    .Input("data_record_handle: resource")
    .Attr("feature_id: int")
    .Attr("default_value: float")
    .Output("values: float")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* /*c*/) {
      // Performs no shape inference; always reports success.
      return Status::OK();
    })
    .Doc(R"doc(
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
Input
data_record_handle: Resource handle to DataRecord
Attr
feature_id: Id representing the feature whose values will be extracted.
default_value: default_value to be inputted if the values are missing from the current DataRecord.
Outputs
values: A Tensor corresponding to the value of the feature_id across multiple DataRecords
)doc");
|
|
|
|
class GetBinaryAsTensor : public OpKernel {
|
|
private:
|
|
int64 feature_id;
|
|
float default_value;
|
|
|
|
public:
|
|
explicit GetBinaryAsTensor(OpKernelConstruction* context) : OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
|
|
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
std::function<const twml::DataRecord::BinaryFeatures &(const twml::DataRecord &)> f =
|
|
[](const twml::DataRecord& record) ->const twml::DataRecord::BinaryFeatures& { return record.getBinary(); };
|
|
ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f);
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Op registration for GetContinuousAsTensor.
// Interface: one resource input (data_record_handle), an int attr `feature_id`,
// a float attr `default_value`, and a single float output `values`.
// The shape function returns OK without setting any output shape.
REGISTER_OP("GetContinuousAsTensor")
|
|
.Input("data_record_handle: resource")
|
|
.Attr("feature_id: int")
|
|
.Attr("default_value: float")
|
|
.Output("values: float")
|
|
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
|
|
return Status::OK();
|
|
}).Doc(R"doc(
|
|
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
|
|
Input
|
|
data_record_handle: Resource handle to DataRecord
|
|
Attr
|
|
feature_id: Id representing the feature whose values will be extracted.
|
|
default_value: default_value to be inputted if the values are missing from the current DataRecord.
|
|
Outputs
|
|
values: A Tensor corresponding to the value of the feature_id across multiple DataRecords
|
|
)doc");
|
|
|
|
class GetContinuousAsTensor : public OpKernel {
|
|
private:
|
|
int64 feature_id;
|
|
float default_value;
|
|
|
|
public:
|
|
explicit GetContinuousAsTensor(OpKernelConstruction* context) : OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
|
|
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
std::function<const twml::DataRecord::ContinuousFeatures &(const twml::DataRecord &)> f =
|
|
[](const twml::DataRecord& record) ->const twml::DataRecord::ContinuousFeatures& { return record.getContinuous(); };
|
|
ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f);
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Op registration for GetDiscreteAsTensor.
// Interface: one resource input (data_record_handle), an int attr `feature_id`,
// an int attr `default_value`, and a single int64 output `values`.
// The shape function returns OK without setting any output shape.
REGISTER_OP("GetDiscreteAsTensor")
|
|
.Input("data_record_handle: resource")
|
|
.Attr("feature_id: int")
|
|
.Attr("default_value: int")
|
|
.Output("values: int64")
|
|
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
|
|
return Status::OK();
|
|
}).Doc(R"doc(
|
|
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
|
|
Input
|
|
data_record_handle: Resource handle to DataRecord
|
|
Attr
|
|
feature_id: Id representing the feature whose values will be extracted.
|
|
default_value: default_value to be inputted if the values are missing from the current DataRecord.
|
|
Outputs
|
|
values: A Tensor corresponding to the value of the feature_id across multiple DataRecords
|
|
)doc");
|
|
|
|
class GetDiscreteAsTensor : public OpKernel {
|
|
private:
|
|
int64 feature_id;
|
|
int64 default_value;
|
|
|
|
public:
|
|
explicit GetDiscreteAsTensor(OpKernelConstruction* context) : OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
|
|
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
std::function<const twml::DataRecord::DiscreteFeatures &(const twml::DataRecord &)> f =
|
|
[](const twml::DataRecord& record) ->const twml::DataRecord::DiscreteFeatures& { return record.getDiscrete(); };
|
|
ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f);
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Op registration for GetStringAsTensor.
// Interface: one resource input (data_record_handle), an int attr `feature_id`,
// a string attr `default_value`, and a single string output `names`.
// The shape function returns OK without setting any output shape.
REGISTER_OP("GetStringAsTensor")
|
|
.Input("data_record_handle: resource")
|
|
.Attr("feature_id: int")
|
|
.Attr("default_value: string")
|
|
.Output("names: string")
|
|
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
|
|
return Status::OK();
|
|
}).Doc(R"doc(
|
|
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
|
|
Input
|
|
data_record_handle: Resource handle to DataRecord
|
|
Attr
|
|
feature_id: Id representing the feature whose values will be extracted.
|
|
default_value: default_value to be inputted if the values are missing from the current DataRecord.
|
|
Outputs
|
|
names: A Tensor corresponding to the value of the feature_id across multiple DataRecords
|
|
)doc");
|
|
|
|
class GetStringAsTensor : public OpKernel {
|
|
private:
|
|
int64 feature_id;
|
|
string default_value;
|
|
|
|
public:
|
|
explicit GetStringAsTensor(OpKernelConstruction* context) : OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
|
|
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
std::function<const twml::DataRecord::StringFeatures &(const twml::DataRecord &)> f =
|
|
[](const twml::DataRecord& record) ->const twml::DataRecord::StringFeatures& { return record.getString(); };
|
|
ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f);
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
// Op registration for GetBinaryGroupAsTensor.
// Interface: one resource input (data_record_handle), a list(int) attr
// `feature_ids`, a float attr `default_value`, and a single float output `values`.
// The shape function returns OK without setting any output shape.
// NOTE(review): the Doc summary below says "a particular feature_id" although
// this op takes the list attr `feature_ids` — confirm and reword.
REGISTER_OP("GetBinaryGroupAsTensor")
|
|
.Input("data_record_handle: resource")
|
|
.Attr("feature_ids: list(int)")
|
|
.Attr("default_value: float")
|
|
.Output("values: float")
|
|
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
|
|
return Status::OK();
|
|
}).Doc(R"doc(
|
|
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
|
|
Input
|
|
data_record_handle: Resource handle to DataRecord
|
|
Attr
|
|
feature_ids: List of ids representing the features whose values will be extracted.
|
|
default_value: default_value to be inputted if the values are missing from the current DataRecord.
|
|
Outputs
|
|
values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords
|
|
)doc");
|
|
|
|
|
|
class GetBinaryGroupAsTensor : public OpKernel {
|
|
private:
|
|
float default_value;
|
|
std::vector<int64> feature_ids;
|
|
|
|
public:
|
|
explicit GetBinaryGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids));
|
|
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
std::function<const twml::DataRecord::BinaryFeatures &(const twml::DataRecord &)> f =
|
|
[](const twml::DataRecord& record) ->const twml::DataRecord::BinaryFeatures& { return record.getBinary(); };
|
|
ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f);
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
// Op registration for GetContinuousGroupAsTensor.
// Interface: one resource input (data_record_handle), a list(int) attr
// `feature_ids`, a float attr `default_value`, and a single float output `values`.
// The shape function returns OK without setting any output shape.
// NOTE(review): the Doc summary below says "a particular feature_id" although
// this op takes the list attr `feature_ids` — confirm and reword.
REGISTER_OP("GetContinuousGroupAsTensor")
|
|
.Input("data_record_handle: resource")
|
|
.Attr("feature_ids: list(int)")
|
|
.Attr("default_value: float")
|
|
.Output("values: float")
|
|
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
|
|
return Status::OK();
|
|
}).Doc(R"doc(
|
|
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
|
|
Input
|
|
data_record_handle: Resource handle to DataRecord
|
|
Attr
|
|
feature_ids: List of ids representing the features whose values will be extracted.
|
|
default_value: default_value to be inputted if the values are missing from the current DataRecord.
|
|
Outputs
|
|
values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords
|
|
)doc");
|
|
|
|
class GetContinuousGroupAsTensor : public OpKernel {
|
|
private:
|
|
float default_value;
|
|
std::vector<int64> feature_ids;
|
|
|
|
public:
|
|
explicit GetContinuousGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids));
|
|
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
std::function<const twml::DataRecord::ContinuousFeatures &(const twml::DataRecord &)> f =
|
|
[](const twml::DataRecord& record) ->const twml::DataRecord::ContinuousFeatures& { return record.getContinuous(); };
|
|
ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f);
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Op registration for GetDiscreteGroupAsTensor.
// Interface: one resource input (data_record_handle), a list(int) attr
// `feature_ids`, an int attr `default_value`, and a single int64 output `values`.
// The shape function returns OK without setting any output shape.
// NOTE(review): the Doc summary below says "a particular feature_id" although
// this op takes the list attr `feature_ids` — confirm and reword.
REGISTER_OP("GetDiscreteGroupAsTensor")
|
|
.Input("data_record_handle: resource")
|
|
.Attr("feature_ids: list(int)")
|
|
.Attr("default_value: int")
|
|
.Output("values: int64")
|
|
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
|
|
return Status::OK();
|
|
}).Doc(R"doc(
|
|
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
|
|
Input
|
|
data_record_handle: Resource handle to DataRecord
|
|
Attr
|
|
feature_ids: List of ids representing the features whose values will be extracted.
|
|
default_value: default_value to be inputted if the values are missing from the current DataRecord.
|
|
Outputs
|
|
values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords
|
|
)doc");
|
|
|
|
class GetDiscreteGroupAsTensor : public OpKernel {
|
|
private:
|
|
std::vector<int64> feature_ids;
|
|
int64 default_value;
|
|
|
|
public:
|
|
explicit GetDiscreteGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids));
|
|
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
std::function<const twml::DataRecord::DiscreteFeatures &(const twml::DataRecord &)> f =
|
|
[](const twml::DataRecord& record) ->const twml::DataRecord::DiscreteFeatures& { return record.getDiscrete(); };
|
|
ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f);
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Op registration for GetStringGroupAsTensor.
// Interface: one resource input (data_record_handle), a list(int) attr
// `feature_ids`, a string attr `default_value`, and a single string output `names`.
// The shape function returns OK without setting any output shape.
// NOTE(review): the Doc summary below says "a particular feature_id" although
// this op takes the list attr `feature_ids` — confirm and reword.
REGISTER_OP("GetStringGroupAsTensor")
|
|
.Input("data_record_handle: resource")
|
|
.Attr("feature_ids: list(int)")
|
|
.Attr("default_value: string")
|
|
.Output("names: string")
|
|
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
|
|
return Status::OK();
|
|
}).Doc(R"doc(
|
|
A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id.
|
|
Input
|
|
data_record_handle: Resource handle to DataRecord
|
|
Attr
|
|
feature_ids: List of ids representing the features whose values will be extracted.
|
|
default_value: default_value to be inputted if the values are missing from the current DataRecord.
|
|
Outputs
|
|
names: A Tensor corresponding to the values of the feature_ids across multiple DataRecords
|
|
)doc");
|
|
|
|
class GetStringGroupAsTensor : public OpKernel {
|
|
private:
|
|
std::vector<int64> feature_ids;
|
|
string default_value;
|
|
|
|
public:
|
|
explicit GetStringGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids));
|
|
OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
std::function<const twml::DataRecord::StringFeatures &(const twml::DataRecord &)> f =
|
|
[](const twml::DataRecord& record) ->const twml::DataRecord::StringFeatures& { return record.getString(); };
|
|
ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f);
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Op registration for GetSparseBinaryAsTensor.
// Interface: one resource input (data_record_handle), an int attr `feature_id`,
// and three outputs: `ids` (int64), `keys` (int64) and `names` (string).
// The shape function returns OK without setting any output shape.
REGISTER_OP("GetSparseBinaryAsTensor")
|
|
.Input("data_record_handle: resource")
|
|
.Attr("feature_id: int")
|
|
.Output("ids: int64")
|
|
.Output("keys: int64")
|
|
.Output("names: string")
|
|
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
|
|
return Status::OK();
|
|
}).Doc(R"doc(
|
|
A tensorflow OP that returns tensors corresponding to the ids, keys and names of a particular
|
|
feature_id.
|
|
Input
|
|
data_record_handle: Resource handle to DataRecord
|
|
Attr
|
|
feature_id: Id representing the feature whose values will be extracted.
|
|
Outputs
|
|
ids: ids specifies the index of the records[id] in the batch (int64)
|
|
keys: DataRecord keys (int64)
|
|
names: DataRecord values(string)
|
|
)doc");
|
|
class GetSparseBinaryAsTensor : public OpKernel {
|
|
private:
|
|
int64 feature_id;
|
|
|
|
public:
|
|
explicit GetSparseBinaryAsTensor(OpKernelConstruction* context) : OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
// We need two passes to the data:
|
|
// 1 to compute the output size of the tensor
|
|
// 2 to copy the values to the tensor
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
|
|
// Creating a vector we increment every time a key is found
|
|
std::vector<std::string> temp_names;
|
|
std::vector<int64> temp_ids;
|
|
|
|
for (int64 id = 0; id < records.size(); id++) {
|
|
const auto &sparse_binary = records[id].getSparseBinary();
|
|
auto it = sparse_binary.find(feature_id);
|
|
// Find all instances of key in DataRecord
|
|
if (it != sparse_binary.end()) {
|
|
// insert to temp_names all the values in the dictionary value
|
|
temp_names.insert(temp_names.end(), it->second.begin(), it->second.end());
|
|
temp_ids.insert(temp_ids.end(), it->second.size(), id);
|
|
}
|
|
}
|
|
|
|
// The total_size will be the that of the saved vector
|
|
const int total_size = static_cast<int64>(temp_names.size());
|
|
TensorShape shape = {total_size};
|
|
Tensor* ids = nullptr;
|
|
Tensor* keys = nullptr;
|
|
Tensor* names = nullptr;
|
|
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
|
|
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
|
|
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names));
|
|
|
|
auto keys_flat = keys->flat<int64>();
|
|
auto names_flat = names->flat<string>();
|
|
auto ids_flat = ids->flat<int64>();
|
|
|
|
// The feature id value will always be the same
|
|
std::fill(keys_flat.data(), keys_flat.data() + total_size, feature_id);
|
|
std::copy(temp_names.begin(), temp_names.end(), names_flat.data());
|
|
std::copy(temp_ids.begin(), temp_ids.end(), ids_flat.data());
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Op registration for GetSparseContinuousAsTensor.
// Interface: one resource input (data_record_handle), an int attr `feature_id`,
// and four outputs: `ids` (int64), `keys` (int64), `names` (string) and
// `values` (float).
// The shape function returns OK without setting any output shape.
REGISTER_OP("GetSparseContinuousAsTensor")
|
|
.Input("data_record_handle: resource")
|
|
.Attr("feature_id: int")
|
|
.Output("ids: int64")
|
|
.Output("keys: int64")
|
|
.Output("names: string")
|
|
.Output("values: float")
|
|
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
|
|
return Status::OK();
|
|
}).Doc(R"doc(
|
|
A tensorflow OP that returns tensors corresponding to the ids, keys, names and values of a particular
|
|
feature_id.
|
|
Input
|
|
data_record_handle: Resource handle to DataRecord
|
|
Attr
|
|
feature_id: Id representing the feature whose values will be extracted.
|
|
Outputs
|
|
ids: ids specifies the index of the records[id] in the batch (int64)
|
|
keys: DataRecord keys (int64)
|
|
names: DataRecord values(string)
|
|
values: DataRecord values(float)
|
|
)doc");
|
|
class GetSparseContinuousAsTensor : public OpKernel {
|
|
private:
|
|
int64 feature_id;
|
|
|
|
public:
|
|
explicit GetSparseContinuousAsTensor(OpKernelConstruction* context) : OpKernel(context) {
|
|
OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id));
|
|
}
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
// We need two passes to the data:
|
|
// 1 to compute the output size of the tensor
|
|
// 2 to copy the values to the tensor
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
const auto &records = handle->records;
|
|
|
|
// Creating a vector we increment every time a key is found
|
|
std::vector<std::string> temp_names;
|
|
std::vector<float> temp_values;
|
|
std::vector<int64> temp_ids;
|
|
|
|
for (int64 id = 0; id < records.size(); id++) {
|
|
const auto &sparse_continuous = records[id].getSparseContinuous();
|
|
auto it = sparse_continuous.find(feature_id);
|
|
// Find all instances of key in DataRecord
|
|
if (it != sparse_continuous.end()) {
|
|
// insert to temp_names all the values in the dictionary value
|
|
auto value_map = it->second;
|
|
for (auto& elem : value_map) {
|
|
temp_names.push_back(elem.first);
|
|
temp_values.push_back(elem.second);
|
|
temp_ids.push_back(id);
|
|
}
|
|
}
|
|
}
|
|
|
|
// The total_size will be the that of the saved vector
|
|
const int total_size = static_cast<int64>(temp_names.size());
|
|
TensorShape shape = {total_size};
|
|
Tensor* ids = nullptr;
|
|
Tensor* keys = nullptr;
|
|
Tensor* names = nullptr;
|
|
Tensor* values = nullptr;
|
|
|
|
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
|
|
OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
|
|
OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names));
|
|
OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values));
|
|
|
|
auto keys_flat = keys->flat<int64>();
|
|
auto names_flat = names->flat<string>();
|
|
auto ids_flat = ids->flat<int64>();
|
|
auto values_flat = values->flat<float>();
|
|
|
|
// The feature id value will always be the same
|
|
std::fill(keys_flat.data(), keys_flat.data() + total_size, feature_id);
|
|
std::copy(temp_names.begin(), temp_names.end(), names_flat.data());
|
|
std::copy(temp_ids.begin(), temp_ids.end(), ids_flat.data());
|
|
std::copy(temp_values.begin(), temp_values.end(), values_flat.data());
|
|
} catch (const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Helper function to add ids, keys and values to common vector
|
|
inline void addIdsKeysValuesToVectors(
|
|
const int64 id,
|
|
const int64 key,
|
|
const double value,
|
|
std::vector<int64>& ids,
|
|
std::vector<int64>& keys,
|
|
std::vector<float>& values) {
|
|
ids.push_back(id);
|
|
keys.push_back(key);
|
|
values.push_back(value);
|
|
}
|
|
|
|
struct KeepFeatures {
|
|
KeepFeatures() : vec(), set() {}
|
|
template<typename ContainerType>
|
|
KeepFeatures(const std::vector<int64> &keep_features,
|
|
const ContainerType *const container) {
|
|
vec.reserve(keep_features.size());
|
|
#ifdef USE_DENSE_HASH
|
|
set.resize(keep_features.size());
|
|
set.set_empty_key(0);
|
|
#else
|
|
set.reserve(keep_features.size());
|
|
#endif // USE_DENSE_HASH
|
|
set.max_load_factor(0.5);
|
|
for (const auto &elem : keep_features) {
|
|
if (container->find(elem) == container->end()) continue;
|
|
vec.push_back(elem);
|
|
set.insert(elem);
|
|
}
|
|
}
|
|
size_t size() const {
|
|
return vec.size();
|
|
}
|
|
std::vector<int64> vec;
|
|
twml::Set<int64> set;
|
|
};
|
|
|
|
// Helper Function to Filter and Hash Feature for Binary Features
|
|
void filterAndHashFeature(
|
|
const twml::DataRecord::BinaryFeatures& features,
|
|
const int64 current_id,
|
|
const KeepFeatures &keep_features,
|
|
std::vector<int64>& ids,
|
|
std::vector<int64>& keys,
|
|
std::vector<float>& values) {
|
|
if (keep_features.size() < 2 * features.size()) {
|
|
for (const auto &f : keep_features.vec) {
|
|
const auto &iter = features.find(f);
|
|
if (iter == features.end()) continue;
|
|
addIdsKeysValuesToVectors(current_id, *iter, 1, ids, keys, values);
|
|
}
|
|
} else {
|
|
for (const auto &elem : features) {
|
|
if (keep_features.set.find(elem) == keep_features.set.end()) continue;
|
|
addIdsKeysValuesToVectors(current_id, elem, 1, ids, keys, values);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Helper Function to Filter and Hash Feature for Continuous Features
|
|
void filterAndHashFeature(
|
|
const twml::DataRecord::ContinuousFeatures& features,
|
|
const int64 current_id,
|
|
const KeepFeatures &keep_features,
|
|
std::vector<int64>& ids,
|
|
std::vector<int64>& keys,
|
|
std::vector<float>& values) {
|
|
if (keep_features.size() < 2 * features.size()) {
|
|
for (const auto &f : keep_features.vec) {
|
|
const auto &iter = features.find(f);
|
|
if (iter == features.end()) continue;
|
|
addIdsKeysValuesToVectors(current_id, iter->first, iter->second, ids, keys, values);
|
|
}
|
|
} else {
|
|
for (const auto &elem : features) {
|
|
if (keep_features.set.find(elem.first) == keep_features.set.end()) continue;
|
|
addIdsKeysValuesToVectors(current_id, elem.first, elem.second, ids, keys, values);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Helper Function to Filter and Hash Feature for Discrete Features
|
|
void filterAndHashFeature(
|
|
const twml::DataRecord::DiscreteFeatures& features,
|
|
const int64 current_id,
|
|
const KeepFeatures &keep_features,
|
|
std::vector<int64>& ids,
|
|
std::vector<int64>& keys,
|
|
std::vector<float>& values) {
|
|
if (keep_features.size() < 2 * features.size()) {
|
|
for (const auto &f : keep_features.vec) {
|
|
const auto &iter = features.find(f);
|
|
if (iter == features.end()) continue;
|
|
int64_t key = twml::mixDiscreteIdAndValue(iter->first, iter->second);
|
|
addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values);
|
|
}
|
|
} else {
|
|
for (const auto &elem : features) {
|
|
if (keep_features.set.find(elem.first) == keep_features.set.end()) continue;
|
|
int64_t key = twml::mixDiscreteIdAndValue(elem.first, elem.second);
|
|
addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Helper Function to Filter and Hash Feature for String Features
|
|
void filterAndHashFeature(
|
|
const twml::DataRecord::StringFeatures& features,
|
|
const int64 current_id,
|
|
const KeepFeatures &keep_features,
|
|
std::vector<int64>& ids,
|
|
std::vector<int64>& keys,
|
|
std::vector<float>& values) {
|
|
if (keep_features.size() < 2 * features.size()) {
|
|
for (const auto &f : keep_features.vec) {
|
|
const auto &iter = features.find(f);
|
|
if (iter == features.end()) continue;
|
|
int64_t key = twml::mixStringIdAndValue(
|
|
iter->first,
|
|
iter->second.size(),
|
|
reinterpret_cast<const uint8_t*>(iter->second.c_str()));
|
|
addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values);
|
|
}
|
|
} else {
|
|
for (const auto &elem : features) {
|
|
if (keep_features.set.find(elem.first) == keep_features.set.end()) continue;
|
|
int64_t key = twml::mixStringIdAndValue(
|
|
elem.first,
|
|
elem.second.size(),
|
|
reinterpret_cast<const uint8_t*>(elem.second.c_str()));
|
|
addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Helper Function to Filter and Hash Feature for Sparse Binary Features
|
|
void filterAndHashFeature(
|
|
const twml::DataRecord::SparseBinaryFeatures& features,
|
|
const int64 current_id,
|
|
const KeepFeatures &keep_features,
|
|
std::vector<int64>& ids,
|
|
std::vector<int64>& keys,
|
|
std::vector<float>& values) {
|
|
if (keep_features.size() < 2 * features.size()) {
|
|
for (const auto &f : keep_features.vec) {
|
|
const auto &iter = features.find(f);
|
|
if (iter == features.end()) continue;
|
|
for (const auto &name : iter->second) {
|
|
int64_t key = twml::mixStringIdAndValue(iter->first, name.size(),
|
|
reinterpret_cast<const uint8_t*>(name.c_str()));
|
|
addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values);
|
|
}
|
|
}
|
|
} else {
|
|
for (const auto &elem : features) {
|
|
if (keep_features.set.find(elem.first) == keep_features.set.end()) continue;
|
|
for (const auto &name : elem.second) {
|
|
int64_t key = twml::mixStringIdAndValue(elem.first, name.size(),
|
|
reinterpret_cast<const uint8_t*>(name.c_str()));
|
|
addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Helper Function to Filter and Hash Feature for Sparse Continuous Features
|
|
void filterAndHashFeature(
|
|
const twml::DataRecord::SparseContinuousFeatures& features,
|
|
const int64 current_id,
|
|
const KeepFeatures &keep_features,
|
|
std::vector<int64>& ids,
|
|
std::vector<int64>& keys,
|
|
std::vector<float>& values) {
|
|
if (keep_features.size() < 2 * features.size()) {
|
|
for (const auto &f : keep_features.vec) {
|
|
const auto &iter = features.find(f);
|
|
if (iter == features.end()) continue;
|
|
for (const auto &map : iter->second) {
|
|
int64_t key = twml::mixStringIdAndValue(
|
|
iter->first,
|
|
map.first.size(),
|
|
reinterpret_cast<const uint8_t*>(map.first.c_str()));
|
|
addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values);
|
|
}
|
|
}
|
|
} else {
|
|
for (const auto &elem : features) {
|
|
if (keep_features.set.find(elem.first) == keep_features.set.end()) continue;
|
|
for (const auto &map : elem.second) {
|
|
int64_t key = twml::mixStringIdAndValue(
|
|
elem.first,
|
|
map.first.size(),
|
|
reinterpret_cast<const uint8_t*>(map.first.c_str()));
|
|
addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Helper Function to Filter and Hash Feature for Sparse Continuous Features
|
|
void filterAndHashFeatureCompat(
|
|
const twml::DataRecord::SparseContinuousFeatures& features,
|
|
const int64 current_id,
|
|
const KeepFeatures &keep_features,
|
|
std::vector<int64>& ids,
|
|
std::vector<int64>& keys,
|
|
std::vector<float>& values) {
|
|
if (keep_features.size() < 2 * features.size()) {
|
|
for (const auto &f : keep_features.vec) {
|
|
const auto &iter = features.find(f);
|
|
if (iter == features.end()) continue;
|
|
for (const auto &map : iter->second) {
|
|
int64_t key = twml::featureId(map.first);
|
|
addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values);
|
|
}
|
|
}
|
|
} else {
|
|
for (const auto &elem : features) {
|
|
if (keep_features.set.find(elem.first) == keep_features.set.end()) continue;
|
|
for (const auto &map : elem.second) {
|
|
int64_t key = twml::featureId(map.first);
|
|
addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void copy_if_exists(std::vector<int64>& out,
|
|
const std::vector<int64>& in,
|
|
const twml::Map<int64_t, int64_t> *const map) {
|
|
out.reserve(in.size());
|
|
for (const auto &elem : in) {
|
|
if (map->find(elem) == map->end()) continue;
|
|
out.push_back(elem);
|
|
}
|
|
}
|
|
|
|
// Flattens the kept features of every record in `handle` into three parallel
// 1-D outputs: ids (output 0, record index), keys (output 1) and values
// (output 2).
// The kept features of handle->common are hashed once and then repeated at
// the start of every record's section, followed by that record's own
// binary, continuous, discrete, string, sparse binary and sparse continuous
// features, in that order. `sparse_continuous_compatibility` selects
// filterAndHashFeatureCompat for the sparse continuous features.
void ComputeHashedFeaturesAsTensor(OpKernelContext* context,
                                   const DataRecordResource *const handle,
                                   const KeepFeatures &binary_keep_features,
                                   const KeepFeatures &continuous_keep_features,
                                   const KeepFeatures &discrete_keep_features,
                                   const KeepFeatures &string_keep_features,
                                   const KeepFeatures &sparse_binary_keep_features,
                                   const KeepFeatures &sparse_continuous_keep_features,
                                   bool sparse_continuous_compatibility) {
  const auto &records = handle->records;
  const auto &shared = handle->common;

  // Capacity hint only: one slot per kept feature id.
  uint64_t estimated_size = (binary_keep_features.size() + continuous_keep_features.size() +
                             discrete_keep_features.size() + string_keep_features.size() +
                             sparse_binary_keep_features.size() +
                             sparse_continuous_keep_features.size());

  // --- Step 1: hash the features shared by all records, once. ---
  std::vector<int64> shared_ids;
  std::vector<int64> shared_keys;
  std::vector<float> shared_values;
  shared_ids.reserve(estimated_size);
  shared_keys.reserve(estimated_size);
  shared_values.reserve(estimated_size);

  filterAndHashFeature(shared.getBinary(), 0, binary_keep_features,
                       shared_ids, shared_keys, shared_values);
  filterAndHashFeature(shared.getContinuous(), 0, continuous_keep_features,
                       shared_ids, shared_keys, shared_values);
  filterAndHashFeature(shared.getDiscrete(), 0, discrete_keep_features,
                       shared_ids, shared_keys, shared_values);
  filterAndHashFeature(shared.getString(), 0, string_keep_features,
                       shared_ids, shared_keys, shared_values);
  filterAndHashFeature(shared.getSparseBinary(), 0, sparse_binary_keep_features,
                       shared_ids, shared_keys, shared_values);
  if (sparse_continuous_compatibility) {
    filterAndHashFeatureCompat(shared.getSparseContinuous(), 0, sparse_continuous_keep_features,
                               shared_ids, shared_keys, shared_values);
  } else {
    filterAndHashFeature(shared.getSparseContinuous(), 0, sparse_continuous_keep_features,
                         shared_ids, shared_keys, shared_values);
  }
  // The ids recorded for the shared pass are placeholders; the real record
  // index is written per record below.
  shared_ids.clear();

  // --- Step 2: build the flattened per-record rows. ---
  estimated_size = (estimated_size + shared_keys.size()) * records.size();
  std::vector<int64> row_ids;
  std::vector<int64> row_keys;
  std::vector<float> row_values;
  row_ids.reserve(estimated_size);
  row_keys.reserve(estimated_size);
  row_values.reserve(estimated_size);

  int64 row = 0;
  for (const auto &record : records) {
    // Shared features first, tagged with this record's index.
    row_ids.insert(row_ids.end(), shared_keys.size(), row);
    row_keys.insert(row_keys.end(), shared_keys.begin(), shared_keys.end());
    row_values.insert(row_values.end(), shared_values.begin(), shared_values.end());

    // Then the record's own features, one feature type at a time.
    filterAndHashFeature(record.getBinary(), row, binary_keep_features,
                         row_ids, row_keys, row_values);
    filterAndHashFeature(record.getContinuous(), row, continuous_keep_features,
                         row_ids, row_keys, row_values);
    filterAndHashFeature(record.getDiscrete(), row, discrete_keep_features,
                         row_ids, row_keys, row_values);
    filterAndHashFeature(record.getString(), row, string_keep_features,
                         row_ids, row_keys, row_values);
    filterAndHashFeature(record.getSparseBinary(), row, sparse_binary_keep_features,
                         row_ids, row_keys, row_values);
    if (sparse_continuous_compatibility) {
      filterAndHashFeatureCompat(record.getSparseContinuous(), row,
                                 sparse_continuous_keep_features,
                                 row_ids, row_keys, row_values);
    } else {
      filterAndHashFeature(record.getSparseContinuous(), row,
                           sparse_continuous_keep_features,
                           row_ids, row_keys, row_values);
    }
    ++row;
  }

  // --- Step 3: allocate the outputs and copy the rows in. ---
  const int64 num_entries = static_cast<int64>(row_ids.size());
  TensorShape shape = {num_entries};
  Tensor* ids = nullptr;
  Tensor* keys = nullptr;
  Tensor* values = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));
  OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys));
  OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values));

  std::copy(row_ids.begin(), row_ids.end(), ids->flat<int64>().data());
  std::copy(row_keys.begin(), row_keys.end(), keys->flat<int64>().data());
  std::copy(row_values.begin(), row_values.end(), values->flat<float>().data());
}
|
|
|
|
// Op definition for "GetHashedFeaturesAsSparseTensor".
//
// Interface declared below:
//   - one input:   data_record_handle (resource)
//   - six attrs:   one list(int) of feature ids to keep per feature type
//                  (binary, continuous, discrete, string, sparse_binary,
//                  sparse_continuous)
//   - three outputs: ids (int64), keys (int64), values (float)
//
// NOTE(review): the shape function only returns Status::OK() and never calls
// set_output, so no static output shapes are declared for this op — confirm
// that leaving them unset is intentional.
REGISTER_OP("GetHashedFeaturesAsSparseTensor")
  .Input("data_record_handle: resource")
  .Attr("binary_keep_features: list(int)")
  .Attr("continuous_keep_features: list(int)")
  .Attr("discrete_keep_features: list(int)")
  .Attr("string_keep_features: list(int)")
  .Attr("sparse_binary_keep_features: list(int)")
  .Attr("sparse_continuous_keep_features: list(int)")
  .Output("ids: int64")
  .Output("keys: int64")
  .Output("values: float")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(
A tensorflow OP for returning required features of different type as
a single sparse tensor. Hashing trick is applied.

Input
data_record_handle: Resource handle to DataRecord

Outputs
ids: ids specifies the index of the records in the batch (int64)
keys: DataRecord keys (int64)
values: DataRecord values (float)
)doc");
|
|
|
|
class GetHashedFeaturesAsSparseTensor: public OpKernel {
|
|
public:
|
|
explicit GetHashedFeaturesAsSparseTensor(OpKernelConstruction* context): OpKernel(context) {
|
|
// Get the list of features to keep for each feature type
|
|
OP_REQUIRES_OK(context, context->GetAttr("binary_keep_features", &binary_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("continuous_keep_features", &continuous_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("discrete_keep_features", &discrete_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("string_keep_features", &string_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("sparse_binary_keep_features", &sparse_binary_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("sparse_continuous_keep_features", &sparse_continuous_keep_features_));
|
|
}
|
|
|
|
private:
|
|
std::vector<int64> binary_keep_features_, continuous_keep_features_, discrete_keep_features_;
|
|
std::vector<int64> string_keep_features_, sparse_binary_keep_features_, sparse_continuous_keep_features_;
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
// Create a new list of keep features based on the original keep_set.
|
|
// This is to ensure compatibility with existing behavior such as:
|
|
// - Ensure no new features are decoded in this op.
|
|
// - Ensure labels or weights dont get included here.
|
|
// TODO: Should we return features requested by user here even if they are labels / weights?
|
|
KeepFeatures binary_keep_features(binary_keep_features_, handle->keep_map);
|
|
KeepFeatures continuous_keep_features(continuous_keep_features_, handle->keep_map);
|
|
KeepFeatures discrete_keep_features(discrete_keep_features_, handle->keep_map);
|
|
KeepFeatures string_keep_features(string_keep_features_, handle->keep_map);
|
|
KeepFeatures sparse_binary_keep_features(sparse_binary_keep_features_, handle->keep_map);
|
|
KeepFeatures sparse_continuous_keep_features(sparse_continuous_keep_features_, handle->keep_map);
|
|
ComputeHashedFeaturesAsTensor(context, handle.get(),
|
|
binary_keep_features,
|
|
continuous_keep_features,
|
|
discrete_keep_features,
|
|
string_keep_features,
|
|
sparse_binary_keep_features,
|
|
sparse_continuous_keep_features,
|
|
false);
|
|
} catch(const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Op definition for "GetHashedFeaturesAsSparseTensorV2".
//
// Interface declared below:
//   - one input:   data_record_handle (resource)
//   - six attrs:   one list(int) of feature ids to keep per feature type
//                  (binary, continuous, discrete, string, sparse_binary,
//                  sparse_continuous)
//   - keep_features / keep_codes: two further list(int) attrs
//   - decode_mode: int attr, defaults to 0
//   - three outputs: ids (int64), keys (int64), values (float)
//
// NOTE(review): the Doc string below does not describe keep_features,
// keep_codes or decode_mode.
// NOTE(review): the shape function only returns Status::OK() and never calls
// set_output, so no static output shapes are declared for this op — confirm
// that leaving them unset is intentional.
REGISTER_OP("GetHashedFeaturesAsSparseTensorV2")
  .Input("data_record_handle: resource")
  .Attr("binary_keep_features: list(int)")
  .Attr("continuous_keep_features: list(int)")
  .Attr("discrete_keep_features: list(int)")
  .Attr("string_keep_features: list(int)")
  .Attr("sparse_binary_keep_features: list(int)")
  .Attr("sparse_continuous_keep_features: list(int)")
  .Attr("keep_features: list(int)")
  .Attr("keep_codes: list(int)")
  .Attr("decode_mode: int = 0")
  .Output("ids: int64")
  .Output("keys: int64")
  .Output("values: float")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(
A tensorflow OP for returning required features of different type as
a single sparse tensor. Hashing trick is applied.

Input
data_record_handle: Resource handle to DataRecord

Outputs
ids: ids specifies the index of the records in the batch (int64)
keys: DataRecord keys (int64)
values: DataRecord values (float)
)doc");
|
|
|
|
class GetHashedFeaturesAsSparseTensorV2: public OpKernel {
|
|
public:
|
|
explicit GetHashedFeaturesAsSparseTensorV2(OpKernelConstruction* context): OpKernel(context) {
|
|
std::vector<int64> keep_features;
|
|
std::vector<int64> keep_codes;
|
|
std::vector<int64> binary_keep_features_, continuous_keep_features_, discrete_keep_features_;
|
|
std::vector<int64> string_keep_features_, sparse_binary_keep_features_, sparse_continuous_keep_features_;
|
|
|
|
// Get the list of features to keep for each feature type
|
|
OP_REQUIRES_OK(context, context->GetAttr("binary_keep_features", &binary_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("continuous_keep_features", &continuous_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("discrete_keep_features", &discrete_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("string_keep_features", &string_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("sparse_binary_keep_features", &sparse_binary_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("sparse_continuous_keep_features", &sparse_continuous_keep_features_));
|
|
OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features));
|
|
OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes));
|
|
OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode));
|
|
|
|
twml::Map<int64_t, int64_t> keep_map;
|
|
#ifdef USE_DENSE_HASH
|
|
keep_map.set_empty_key(0);
|
|
#endif // USE_DENSE_HASH
|
|
for (uint64_t i = 0; i < keep_features.size(); i++) {
|
|
keep_map[keep_features[i]] = keep_codes[i];
|
|
}
|
|
|
|
|
|
binary_keep_features = KeepFeatures(binary_keep_features_, &keep_map);
|
|
continuous_keep_features = KeepFeatures(continuous_keep_features_, &keep_map);
|
|
discrete_keep_features = KeepFeatures(discrete_keep_features_, &keep_map);
|
|
string_keep_features = KeepFeatures(string_keep_features_, &keep_map);
|
|
sparse_binary_keep_features = KeepFeatures(sparse_binary_keep_features_, &keep_map);
|
|
sparse_continuous_keep_features = KeepFeatures(sparse_continuous_keep_features_, &keep_map);
|
|
|
|
}
|
|
|
|
private:
|
|
KeepFeatures binary_keep_features, continuous_keep_features, discrete_keep_features;
|
|
KeepFeatures string_keep_features, sparse_binary_keep_features, sparse_continuous_keep_features;
|
|
int64 m_decode_mode;
|
|
|
|
void Compute(OpKernelContext* context) override {
|
|
try {
|
|
auto handle = getHandle<DataRecordResource>(context, 0);
|
|
// Create a new list of keep features based on the original keep_set.
|
|
// This is to ensure compatibility with existing behavior such as:
|
|
// - Ensure no new features are decoded in this op.
|
|
// - Ensure labels or weights dont get included here.
|
|
// TODO: Should we return features requested by user here even if they are labels / weights?
|
|
ComputeHashedFeaturesAsTensor(context, handle.get(),
|
|
binary_keep_features,
|
|
continuous_keep_features,
|
|
discrete_keep_features,
|
|
string_keep_features,
|
|
sparse_binary_keep_features,
|
|
sparse_continuous_keep_features,
|
|
m_decode_mode == 0);
|
|
} catch(const std::exception &e) {
|
|
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
#define REGISTER_DECODE_DATA_RECORD(InputType) \
|
|
REGISTER_KERNEL_BUILDER( \
|
|
Name("DecodeDataRecord") \
|
|
.Device(DEVICE_CPU) \
|
|
.TypeConstraint<InputType>("InputType"), \
|
|
DecodeDataRecord<InputType>); \
|
|
|
|
REGISTER_DECODE_DATA_RECORD(uint8)
|
|
REGISTER_DECODE_DATA_RECORD(string)
|
|
|
|
// Each helper below registers a CPU kernel whose op name and kernel class
// name are both derived from FIELD by token pasting / stringizing.

// "Get<FIELD>Features" -> class Get<FIELD>Features
#define REGISTER_GETTER(FIELD)                       \
  REGISTER_KERNEL_BUILDER(                           \
    Name("Get" #FIELD "Features")                    \
      .Device(DEVICE_CPU),                           \
    Get##FIELD##Features);

// "Get<FIELD>FromDataRecord" -> class Get<FIELD>FromDataRecord
#define REGISTER_GETTER_FROM_DR(FIELD)               \
  REGISTER_KERNEL_BUILDER(                           \
    Name("Get" #FIELD "FromDataRecord")              \
      .Device(DEVICE_CPU),                           \
    Get##FIELD##FromDataRecord);

// "Get<FIELD>AsTensor" -> class Get<FIELD>AsTensor
#define REGISTER_GETTER_AS_TENSOR(FIELD)             \
  REGISTER_KERNEL_BUILDER(                           \
    Name("Get" #FIELD "AsTensor")                    \
      .Device(DEVICE_CPU),                           \
    Get##FIELD##AsTensor);

// "Get<FIELD>GroupAsTensor" -> class Get<FIELD>GroupAsTensor
#define REGISTER_GETTER_GROUP_AS_TENSOR(FIELD)       \
  REGISTER_KERNEL_BUILDER(                           \
    Name("Get" #FIELD "GroupAsTensor")               \
      .Device(DEVICE_CPU),                           \
    Get##FIELD##GroupAsTensor);
|
|
|
|
// Kernel registrations. All of these bind to DEVICE_CPU.

// Get<Type>Features kernels.
REGISTER_GETTER(Binary)
REGISTER_GETTER(Continuous)
REGISTER_GETTER(Discrete)
REGISTER_GETTER(String)
REGISTER_GETTER(SparseBinary)
REGISTER_GETTER(SparseContinuous)

// Get<Field>FromDataRecord kernels.
REGISTER_GETTER_FROM_DR(BatchSize)
REGISTER_GETTER_FROM_DR(Labels)
REGISTER_GETTER_FROM_DR(Weights)

// Get<Type>AsTensor kernels.
REGISTER_GETTER_AS_TENSOR(Binary)
REGISTER_GETTER_AS_TENSOR(Continuous)
REGISTER_GETTER_AS_TENSOR(Discrete)
REGISTER_GETTER_AS_TENSOR(String)
REGISTER_GETTER_AS_TENSOR(SparseBinary)
REGISTER_GETTER_AS_TENSOR(SparseContinuous)

// Get<Type>GroupAsTensor kernels (no sparse variants are registered here).
REGISTER_GETTER_GROUP_AS_TENSOR(Binary)
REGISTER_GETTER_GROUP_AS_TENSOR(Continuous)
REGISTER_GETTER_GROUP_AS_TENSOR(Discrete)
REGISTER_GETTER_GROUP_AS_TENSOR(String)

// Hashed-feature kernels are registered directly rather than through a helper
// macro, since their names do not follow the Get<FIELD>... patterns above.
REGISTER_KERNEL_BUILDER(
  Name("GetHashedFeaturesAsSparseTensor")
    .Device(DEVICE_CPU),
  GetHashedFeaturesAsSparseTensor);
REGISTER_KERNEL_BUILDER(
  Name("GetHashedFeaturesAsSparseTensorV2")
    .Device(DEVICE_CPU),
  GetHashedFeaturesAsSparseTensorV2);
|