mirror of https://github.com/twitter/the-algorithm.git
synced 2024-11-16 08:29:21 +01:00
[docx] split commit for file 6400
Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
This commit is contained in:
parent 4e32fcb29f
commit ac0fb2a2f2
@@ -1,162 +0,0 @@
#include "internal/error.h"
#include "internal/thrift.h"

#include <map>
#include <twml/ThriftWriter.h>
#include <twml/TensorRecordWriter.h>
#include <twml/io/IOError.h>

using namespace twml::io;

namespace twml {

static int32_t getRawThriftType(twml_type dtype) {
  // convert twml enum to tensor.thrift enum
  switch (dtype) {
    case TWML_TYPE_FLOAT:
      return DATA_TYPE_FLOAT;
    case TWML_TYPE_DOUBLE:
      return DATA_TYPE_DOUBLE;
    case TWML_TYPE_INT64:
      return DATA_TYPE_INT64;
    case TWML_TYPE_INT32:
      return DATA_TYPE_INT32;
    case TWML_TYPE_UINT8:
      return DATA_TYPE_UINT8;
    case TWML_TYPE_STRING:
      return DATA_TYPE_STRING;
    case TWML_TYPE_BOOL:
      return DATA_TYPE_BOOL;
    default:
      throw IOError(IOError::UNSUPPORTED_OUTPUT_TYPE);
  }
}

void TensorRecordWriter::writeTensor(const RawTensor &tensor) {
  if (tensor.getType() == TWML_TYPE_INT32) {
    m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_INT32);
    m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1);
    m_thrift_writer.writeListHeader(TTYPE_I32, tensor.getNumElements());

    const int32_t *data = tensor.getData<int32_t>();

    for (uint64_t i = 0; i < tensor.getNumElements(); i++)
      m_thrift_writer.writeInt32(data[i]);

  } else if (tensor.getType() == TWML_TYPE_INT64) {
    m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_INT64);
    m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1);
    m_thrift_writer.writeListHeader(TTYPE_I64, tensor.getNumElements());

    const int64_t *data = tensor.getData<int64_t>();

    for (uint64_t i = 0; i < tensor.getNumElements(); i++)
      m_thrift_writer.writeInt64(data[i]);

  } else if (tensor.getType() == TWML_TYPE_FLOAT) {
    m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_FLOAT);
    m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1);
    m_thrift_writer.writeListHeader(TTYPE_DOUBLE, tensor.getNumElements());

    const float *data = tensor.getData<float>();

    for (uint64_t i = 0; i < tensor.getNumElements(); i++)
      m_thrift_writer.writeDouble(static_cast<double>(data[i]));

  } else if (tensor.getType() == TWML_TYPE_DOUBLE) {
    m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_DOUBLE);
    m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1);
    m_thrift_writer.writeListHeader(TTYPE_DOUBLE, tensor.getNumElements());

    const double *data = tensor.getData<double>();

    for (uint64_t i = 0; i < tensor.getNumElements(); i++)
      m_thrift_writer.writeDouble(data[i]);

  } else if (tensor.getType() == TWML_TYPE_STRING) {
    m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_STRING);
    m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1);
    m_thrift_writer.writeListHeader(TTYPE_STRING, tensor.getNumElements());

    const std::string *data = tensor.getData<std::string>();

    for (uint64_t i = 0; i < tensor.getNumElements(); i++)
      m_thrift_writer.writeString(data[i]);

  } else if (tensor.getType() == TWML_TYPE_BOOL) {
    m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_BOOL);
    m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1);
    m_thrift_writer.writeListHeader(TTYPE_BOOL, tensor.getNumElements());

    const bool *data = tensor.getData<bool>();

    for (uint64_t i = 0; i < tensor.getNumElements(); i++)
      m_thrift_writer.writeBool(data[i]);

  } else {
    throw IOError(IOError::UNSUPPORTED_OUTPUT_TYPE);
  }

  // write tensor shape field
  m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 2);
  m_thrift_writer.writeListHeader(TTYPE_I64, tensor.getNumDims());

  for (uint64_t i = 0; i < tensor.getNumDims(); i++)
    m_thrift_writer.writeInt64(tensor.getDim(i));

  m_thrift_writer.writeStructStop();
  m_thrift_writer.writeStructStop();
}

void TensorRecordWriter::writeRawTensor(const RawTensor &tensor) {
  m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_RAW);

  // dataType field
  m_thrift_writer.writeStructFieldHeader(TTYPE_I32, 1);
  m_thrift_writer.writeInt32(getRawThriftType(tensor.getType()));

  // content field
  uint64_t type_size = getSizeOf(tensor.getType());
  m_thrift_writer.writeStructFieldHeader(TTYPE_STRING, 2);
  const uint8_t *data = reinterpret_cast<const uint8_t *>(tensor.getData<void>());
  m_thrift_writer.writeBinary(data, tensor.getNumElements() * type_size);

  // shape field
  m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 3);
  m_thrift_writer.writeListHeader(TTYPE_I64, tensor.getNumDims());

  for (uint64_t i = 0; i < tensor.getNumDims(); i++)
    m_thrift_writer.writeInt64(tensor.getDim(i));

  m_thrift_writer.writeStructStop();
  m_thrift_writer.writeStructStop();
}

TWMLAPI uint32_t TensorRecordWriter::getRecordsWritten() {
  return m_records_written;
}

// Caller (usually DataRecordWriter) must precede with struct header field
// like thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_GENERAL_TENSOR)
TWMLAPI uint64_t TensorRecordWriter::write(twml::TensorRecord &record) {
  uint64_t bytes_written_before = m_thrift_writer.getBytesWritten();

  m_thrift_writer.writeMapHeader(TTYPE_I64, TTYPE_STRUCT, record.getRawTensors().size());

  for (auto id_tensor_pairs : record.getRawTensors()) {
    m_thrift_writer.writeInt64(id_tensor_pairs.first);

    // all tensors written as RawTensor Thrift except for StringTensors
    // this avoids the overhead of converting little endian to big endian
    if (id_tensor_pairs.second.getType() == TWML_TYPE_STRING)
      writeTensor(id_tensor_pairs.second);
    else
      writeRawTensor(id_tensor_pairs.second);
  }

  m_records_written++;

  return m_thrift_writer.getBytesWritten() - bytes_written_before;
}

} // namespace twml
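The split above between writeTensor() and writeRawTensor() is an endianness trade-off: Thrift list elements are big-endian on the wire, so the typed path must byte-swap every element, while the RawTensor content field is an opaque binary blob that can be copied in host order. A minimal standalone illustration, not part of this commit (__builtin_bswap32 is a gcc/clang builtin standing in for the host-to-big-endian conversion on a little-endian machine):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  int32_t host_val = 1;
  uint8_t raw[4], wire[4];
  // RawTensor path: bulk copy, bytes stay in host (little-endian) order
  memcpy(raw, &host_val, sizeof(raw));
  // Typed list<i32> path: each element converted to big-endian first
  uint32_t be = __builtin_bswap32((uint32_t)host_val);
  memcpy(wire, &be, sizeof(wire));
  printf("raw:  %02x %02x %02x %02x\n", raw[0], raw[1], raw[2], raw[3]);     // 01 00 00 00
  printf("wire: %02x %02x %02x %02x\n", wire[0], wire[1], wire[2], wire[3]); // 00 00 00 01
  return 0;
}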
BIN twml/libtwml/src/lib/TensorRecordWriter.docx Normal file
Binary file not shown.
@@ -1,33 +0,0 @@
#include "internal/endianutils.h"

#include <twml/ThriftReader.h>
#include <twml/Error.h>

#include <cstring>

namespace twml {

uint8_t ThriftReader::readByte() {
  return readDirect<uint8_t>();
}

int16_t ThriftReader::readInt16() {
  return betoh16(readDirect<int16_t>());
}

int32_t ThriftReader::readInt32() {
  return betoh32(readDirect<int32_t>());
}

int64_t ThriftReader::readInt64() {
  return betoh64(readDirect<int64_t>());
}

double ThriftReader::readDouble() {
  double val;
  int64_t *val_proxy = reinterpret_cast<int64_t*>(&val);
  *val_proxy = readInt64();
  return val;
}

} // namespace twml
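readDouble() above reinterprets the byte-swapped int64 as a double through a pointer cast. A sketch of the same decode using memcpy instead, which sidesteps the strict-aliasing question raised by the int64_t* proxy (assumes a little-endian host; __builtin_bswap64 stands in for betoh64):

#include <cstdint>
#include <cstring>

double decode_be_double(const uint8_t bytes[8]) {
  uint64_t n;
  memcpy(&n, bytes, sizeof(n));  // raw big-endian payload from the stream
  n = __builtin_bswap64(n);      // betoh64 equivalent on little-endian hosts
  double d;
  memcpy(&d, &n, sizeof(d));     // bit-for-bit reinterpretation, no aliasing UB
  return d;
}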
BIN twml/libtwml/src/lib/ThriftReader.docx Normal file
Binary file not shown.
@@ -1,91 +0,0 @@
#include "internal/endianutils.h"
#include "internal/error.h"
#include "internal/thrift.h"

#include <twml/ThriftWriter.h>
#include <twml/Error.h>
#include <twml/io/IOError.h>

#include <cstring>

using namespace twml::io;

namespace twml {

template <typename T> inline
uint64_t ThriftWriter::write(T val) {
  if (!m_dry_run) {
    if (m_bytes_written + sizeof(T) > m_buffer_size)
      throw IOError(IOError::DESTINATION_LARGER_THAN_CAPACITY);
    memcpy(m_buffer, &val, sizeof(T));
    m_buffer += sizeof(T);
  }
  m_bytes_written += sizeof(T);
  return sizeof(T);
}

TWMLAPI uint64_t ThriftWriter::getBytesWritten() {
  return m_bytes_written;
}

TWMLAPI uint64_t ThriftWriter::writeStructFieldHeader(int8_t field_type, int16_t field_id) {
  return writeInt8(field_type) + writeInt16(field_id);
}

TWMLAPI uint64_t ThriftWriter::writeStructStop() {
  return writeInt8(static_cast<int8_t>(TTYPE_STOP));
}

TWMLAPI uint64_t ThriftWriter::writeListHeader(int8_t element_type, int32_t num_elems) {
  return writeInt8(element_type) + writeInt32(num_elems);
}

TWMLAPI uint64_t ThriftWriter::writeMapHeader(int8_t key_type, int8_t val_type, int32_t num_elems) {
  return writeInt8(key_type) + writeInt8(val_type) + writeInt32(num_elems);
}

TWMLAPI uint64_t ThriftWriter::writeDouble(double val) {
  int64_t bin_value;
  memcpy(&bin_value, &val, sizeof(int64_t));
  return writeInt64(bin_value);
}

TWMLAPI uint64_t ThriftWriter::writeInt8(int8_t val) {
  return write(val);
}

TWMLAPI uint64_t ThriftWriter::writeInt16(int16_t val) {
  return write(betoh16(val));
}

TWMLAPI uint64_t ThriftWriter::writeInt32(int32_t val) {
  return write(betoh32(val));
}

TWMLAPI uint64_t ThriftWriter::writeInt64(int64_t val) {
  return write(betoh64(val));
}

TWMLAPI uint64_t ThriftWriter::writeBinary(const uint8_t *bytes, int32_t num_bytes) {
  writeInt32(num_bytes);

  if (!m_dry_run) {
    if (m_bytes_written + num_bytes > m_buffer_size)
      throw IOError(IOError::DESTINATION_LARGER_THAN_CAPACITY);
    memcpy(m_buffer, bytes, num_bytes);
    m_buffer += num_bytes;
  }
  m_bytes_written += num_bytes;

  return 4 + num_bytes;
}

TWMLAPI uint64_t ThriftWriter::writeString(std::string str) {
  return writeBinary(reinterpret_cast<const uint8_t *>(str.data()), str.length());
}

TWMLAPI uint64_t ThriftWriter::writeBool(bool val) {
  return write(val);
}

} // namespace twml
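The m_dry_run branch in write() and writeBinary() suggests a two-pass usage: run the identical sequence of write calls once with no destination to learn the encoded size, then allocate and write for real. A hypothetical sketch of that pattern (SizingWriter is invented for illustration; the actual ThriftWriter constructor is not shown in this diff):

#include <cstdint>
#include <vector>

// Invented stand-in for a ThriftWriter in dry-run mode: it only
// accumulates byte counts, exactly like write() when m_dry_run is set.
struct SizingWriter {
  uint64_t bytes = 0;
  uint64_t writeInt32(int32_t) { bytes += sizeof(int32_t); return sizeof(int32_t); }
  uint64_t writeInt64(int64_t) { bytes += sizeof(int64_t); return sizeof(int64_t); }
};

int main() {
  SizingWriter sizer;
  sizer.writeInt32(7);   // pass 1: measure the encoded size
  sizer.writeInt64(42);
  std::vector<uint8_t> buf(sizer.bytes);  // 12 bytes; pass 2 would fill buf
  return buf.size() == 12 ? 0 : 1;
}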
BIN twml/libtwml/src/lib/ThriftWriter.docx Normal file
Binary file not shown.
@@ -1,167 +0,0 @@
#include "internal/interpolate.h"
#include "internal/error.h"
#include <twml/discretizer_impl.h>
#include <twml/optim.h>

namespace twml {
// it is assumed that start_compute and end_compute are valid
template<typename T>
void discretizerInfer(Tensor &output_keys,
                      Tensor &output_vals,
                      const Tensor &input_ids,
                      const Tensor &input_vals,
                      const Tensor &bin_ids,
                      const Tensor &bin_vals,
                      const Tensor &feature_offsets,
                      int output_bits,
                      const Map<int64_t, int64_t> &ID_to_index,
                      int64_t start_compute,
                      int64_t end_compute,
                      int64_t output_start) {
  auto out_keysData = output_keys.getData<int64_t>();
  auto out_valsData = output_vals.getData<T>();
  uint64_t out_keysStride = output_keys.getStride(0);
  uint64_t out_valsStride = output_vals.getStride(0);

  auto in_idsData = input_ids.getData<int64_t>();
  auto in_valsData = input_vals.getData<T>();
  uint64_t in_idsStride = input_ids.getStride(0);
  uint64_t in_valsStride = input_vals.getStride(0);

  auto xsData = bin_vals.getData<T>();
  auto ysData = bin_ids.getData<int64_t>();
  uint64_t xsStride = bin_vals.getStride(0);
  uint64_t ysStride = bin_ids.getStride(0);

  auto offsetData = feature_offsets.getData<int64_t>();

  uint64_t total_bins = bin_ids.getNumElements();
  uint64_t fsize = feature_offsets.getNumElements();

  uint64_t output_size = (1 << output_bits);

  for (uint64_t i = start_compute; i < end_compute; i++) {
    int64_t feature_ID = in_idsData[i * in_idsStride];
    T val = in_valsData[i * in_valsStride];

    auto iter = ID_to_index.find(feature_ID);
    if (iter == ID_to_index.end()) {
      // feature not calibrated
      // modulo add operation for new key from feature ID
      int64_t ikey = feature_ID % (output_size - total_bins) + total_bins;
      out_keysData[(i + output_start - start_compute) * out_keysStride] = ikey;
      out_valsData[(i + output_start - start_compute) * out_valsStride] = val;
      continue;
    }

    int64_t ikey = iter->second;

    // Perform interpolation
    uint64_t offset = offsetData[ikey];
    uint64_t next_offset = (ikey == (int64_t)(fsize - 1)) ? total_bins : offsetData[ikey + 1];
    uint64_t mainSize = next_offset - offset;

    const T *lxsData = xsData + offset;
    const int64_t *lysData = ysData + offset;
    int64_t okey;
    okey = interpolation<T, int64_t>(lxsData, xsStride,
                                     lysData, ysStride,
                                     val, mainSize,
                                     NEAREST, 0);
    out_keysData[(i + output_start - start_compute) * out_keysStride] = okey;
    out_valsData[(i + output_start - start_compute) * out_valsStride] = 1;
  }
}

void discretizerInfer(Tensor &output_keys,
                      Tensor &output_vals,
                      const Tensor &input_ids,
                      const Tensor &input_vals,
                      const Tensor &bin_ids,
                      const Tensor &bin_vals,
                      const Tensor &feature_offsets,
                      int output_bits,
                      const Map<int64_t, int64_t> &ID_to_index,
                      int start_compute,
                      int end_compute,
                      int output_start) {
  if (input_ids.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "input_ids must be a Long Tensor");
  }

  if (output_keys.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor");
  }

  if (bin_ids.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "bin_ids must be a Long Tensor");
  }

  if (feature_offsets.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "feature_offsets must be a Long Tensor");
  }

  if (input_vals.getType() != bin_vals.getType()) {
    throw twml::Error(TWML_ERR_TYPE,
                      "Data type of input_vals does not match type of bin_vals");
  }

  if (bin_vals.getNumDims() != 1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "bin_vals must be 1 Dimensional");
  }

  if (bin_ids.getNumDims() != 1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "bin_ids must be 1 Dimensional");
  }

  if (bin_vals.getNumElements() != bin_ids.getNumElements()) {
    throw twml::Error(TWML_ERR_SIZE,
                      "Dimensions of bin_vals and bin_ids do not match");
  }

  if (feature_offsets.getStride(0) != 1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "feature_offsets must be contiguous");
  }

  uint64_t size = input_ids.getDim(0);
  if (end_compute == -1) {
    end_compute = size;
  }

  if (start_compute < 0 || start_compute >= size) {
    throw twml::Error(TWML_ERR_SIZE,
                      "start_compute out of range");
  }

  if (end_compute < -1 || end_compute > size) {
    throw twml::Error(TWML_ERR_SIZE,
                      "end_compute out of range");
  }

  if (start_compute > end_compute && end_compute != -1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "must have start_compute <= end_compute, or end_compute==-1");
  }

  switch (input_vals.getType()) {
    case TWML_TYPE_FLOAT:
      twml::discretizerInfer<float>(output_keys, output_vals,
                                    input_ids, input_vals,
                                    bin_ids, bin_vals, feature_offsets, output_bits, ID_to_index,
                                    start_compute, end_compute, output_start);
      break;
    case TWML_TYPE_DOUBLE:
      twml::discretizerInfer<double>(output_keys, output_vals,
                                     input_ids, input_vals,
                                     bin_ids, bin_vals, feature_offsets, output_bits, ID_to_index,
                                     start_compute, end_compute, output_start);
      break;
    default:
      throw twml::Error(TWML_ERR_TYPE,
                        "Unsupported datatype for discretizerInfer");
  }
}
} // namespace twml
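For uncalibrated features, the modulo-add fallback above folds the raw feature ID into the part of the key space that sits above the calibrated bins, so fallback keys cannot collide with bin IDs (for non-negative feature IDs). A worked example with invented values:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t output_size = 1 << 4;  // output_bits = 4, so 16 keys total
  const uint64_t total_bins = 6;        // calibrated bins own keys 0..5
  const int64_t feature_ID = 123;       // some uncalibrated feature (invented)
  int64_t ikey = feature_ID % (output_size - total_bins) + total_bins;
  // 123 % 10 + 6 = 9, which lands in the reserved range 6..15
  assert(ikey >= (int64_t)total_bins && ikey < (int64_t)output_size);
  return 0;
}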
BIN twml/libtwml/src/lib/discretizer_impl.docx Normal file
Binary file not shown.
@@ -1,158 +0,0 @@
#include "internal/error.h"
#include "internal/murmur_hash3.h"
#include "internal/utf_converter.h"
#include <twml/functions.h>
#include <cstring>
#include <algorithm>
#include <stdexcept>  // std::invalid_argument, thrown below
#include <vector>     // heap fallback in twml_get_feature_id

namespace twml {

template<typename T>
void add1(Tensor &output, const Tensor input) {
  T *odata = output.getData<T>();
  const T *idata = input.getData<T>();
  const uint64_t num_elements = input.getNumElements();

  for (uint64_t i = 0; i < num_elements; i++) {
    odata[i] = idata[i] + 1;
  }
}

template<typename T>
void copy(Tensor &output, const Tensor input) {
  T *odata = output.getData<T>();
  const T *idata = input.getData<T>();
  const uint64_t num_elements = input.getNumElements();

  for (uint64_t i = 0; i < num_elements; i++) {
    odata[i] = idata[i];
  }
}

void add1(Tensor &output, const Tensor input) {
  auto type = input.getType();
  if (output.getType() != type) {
    throw twml::Error(TWML_ERR_TYPE, "Output type does not match input type");
  }

  if (output.getNumElements() != input.getNumElements()) {
    throw twml::Error(TWML_ERR_SIZE, "Output size does not match input size");
  }

  // TODO: Implement an easier dispatch function
  switch (type) {
    case TWML_TYPE_FLOAT:
      twml::add1<float>(output, input);
      break;
    case TWML_TYPE_DOUBLE:
      twml::add1<double>(output, input);
      break;
    default:
      throw twml::Error(TWML_ERR_TYPE, "add1 only supports float and double tensors");
  }
}

void copy(Tensor &output, const Tensor input) {
  auto type = input.getType();
  if (output.getType() != type) {
    throw twml::Error(TWML_ERR_TYPE, "Output type does not match input type");
  }

  if (output.getNumElements() != input.getNumElements()) {
    throw twml::Error(TWML_ERR_SIZE, "Output size does not match input size");
  }

  // TODO: Implement an easier dispatch function
  switch (type) {
    case TWML_TYPE_FLOAT:
      twml::copy<float>(output, input);
      break;
    case TWML_TYPE_DOUBLE:
      twml::copy<double>(output, input);
      break;
    default:
      throw twml::Error(TWML_ERR_TYPE, "copy only supports float and double tensors");
  }
}

int64_t featureId(const std::string &feature) {
  const char *str = feature.c_str();
  uint64_t len = feature.size();
  int64_t id = 0;
  TWML_CHECK(twml_get_feature_id(&id, len, str), "Error getting featureId");
  return id;
}
} // namespace twml

twml_err twml_add1(twml_tensor output, const twml_tensor input) {
  HANDLE_EXCEPTIONS(
    auto out = twml::getTensor(output);
    auto in = twml::getConstTensor(input);
    twml::add1(*out, *in););
  return TWML_ERR_NONE;
}

twml_err twml_copy(twml_tensor output, const twml_tensor input) {
  HANDLE_EXCEPTIONS(
    auto out = twml::getTensor(output);
    auto in = twml::getConstTensor(input);
    twml::copy(*out, *in););
  return TWML_ERR_NONE;
}

inline twml_err twml_get_feature_id_internal(int64_t *result,
                                             uint64_t out_size, uint16_t *out,
                                             uint64_t out2_size, uint16_t *out2,
                                             const uint64_t len, const char *str) {
  uint64_t k = 0;
  for (uint64_t i = 0; i < len; i++) {
    if (str[i] == '#') {
      k = i;
      break;
    }
  }

  uint8_t hash[16];
  if (k != 0) {
    ssize_t n = utf8_to_utf16((const uint8_t *) str, k, out, out_size);
    if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16");

    MurmurHash3_x64_128(out, n * sizeof(uint16_t), 0, out2);
    n = utf8_to_utf16((const uint8_t *) (str + k + 1), len - k - 1, &out2[4], out2_size - 8);
    if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16");

    MurmurHash3_x64_128(out2, (n * sizeof(uint16_t)) + 8, 0, hash);
  } else {
    ssize_t n = utf8_to_utf16((const uint8_t *)str, len, out, out_size);
    if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16");
    MurmurHash3_x64_128(out, n * sizeof(uint16_t), 0, hash);
  }
  int64_t id;
  memcpy(&id, hash, sizeof(int64_t));
  *result = id;

  return TWML_ERR_NONE;
}

static const int UTF16_STR_MAX_SIZE = 1024;

twml_err twml_get_feature_id(int64_t *result, const uint64_t len, const char *str) {
  try {
    uint16_t out[UTF16_STR_MAX_SIZE];
    uint16_t out2[UTF16_STR_MAX_SIZE];
    return twml_get_feature_id_internal(result,
                                        UTF16_STR_MAX_SIZE, out,
                                        UTF16_STR_MAX_SIZE, out2,
                                        len, str);
  } catch(const std::invalid_argument &ex) {
    // If the space on the stack is not enough, try using the heap.
    // len + 1 is needed because a null terminating character is added at the end.
    std::vector<uint16_t> out(len + 1);
    std::vector<uint16_t> out2(len + 1);
    return twml_get_feature_id_internal(result,
                                        len + 1, out.data(),
                                        len + 1, out2.data(),
                                        len, str);

  }
}
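A usage sketch for the feature-ID hash above: the function splits the string at the first '#' ("name#value" features), converts each part from UTF-8 to UTF-16, and runs MurmurHash3 x64-128, keeping the first 8 bytes of the digest as the ID. It assumes the twml headers are on the include path and that <twml/functions.h>, included above, declares the C entry point:

#include <twml/functions.h>  // assumed to declare twml_get_feature_id
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const char *feature = "user.age#23";  // invented "name#value" feature string
  int64_t id = 0;
  if (twml_get_feature_id(&id, strlen(feature), feature) == TWML_ERR_NONE)
    printf("feature id: %lld\n", (long long)id);
  return 0;
}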
BIN twml/libtwml/src/lib/functions.docx Normal file
Binary file not shown.
@@ -1,241 +0,0 @@
#include "internal/linear_search.h"
#include "internal/error.h"
#include <twml/hashing_discretizer_impl.h>
#include <twml/optim.h>
#include <algorithm>

namespace twml {
template<typename Tx>
static int64_t lower_bound_search(const Tx *data, const Tx val, const int64_t buf_size) {
  auto index_temp = std::lower_bound(data, data + buf_size, val);
  return static_cast<int64_t>(index_temp - data);
}

template<typename Tx>
static int64_t upper_bound_search(const Tx *data, const Tx val, const int64_t buf_size) {
  auto index_temp = std::upper_bound(data, data + buf_size, val);
  return static_cast<int64_t>(index_temp - data);
}

template<typename Tx>
using search_method = int64_t (*)(const Tx *, const Tx, const int64_t);

typedef uint64_t (*hash_signature)(uint64_t, int64_t, uint64_t);

// uint64_t integer_multiplicative_hashing()
//
// A function to hash discretized feature_ids into one of 2**output_bits buckets.
// This function hashes the feature_ids to achieve a uniform distribution of
// IDs, so the hashed IDs are with high probability far apart.
// Then, bucket_indices can simply be added, resulting in unique new IDs with high probability.
// We integer hash again to again spread out the new IDs.
// Finally we take the upper output_bits bits as the output key.
// Required args:
//   feature_id:
//     The feature id of the feature to be hashed.
//   bucket_index:
//     The bucket index of the discretized feature value.
//   output_bits:
//     The number of bits of output space for the features to be hashed into.
//
// Note - feature_ids may have arbitrary distribution within int32s.
// Note - 64 bit feature_ids can be processed with this, but the upper
//        32 bits have no effect on the output.
// e.g. all feature ids 0 through 255 exist in movie-lens.
// this hashing constant is good for 32 LSBs. will use N=32. (can use N<32 also)
// this hashing constant is co-prime with 2**32, therefore we have that
//   a != b, a and b in [0,2**32)
// implies
//   f(a) != f(b) where f(x) = (hashing_constant * x) % (2**32)
// note that we are mostly ignoring the upper 32 bits, using modulo 2**32 arithmetic
uint64_t integer_multiplicative_hashing(uint64_t feature_id,
                                        int64_t bucket_index,
                                        uint64_t output_bits) {
  // possibly use 14695981039346656037 for 64 bit unsigned??
  // = 20921 * 465383 * 1509404459
  // alternatively, 14695981039346656039 is prime
  // We would also need to use N = 64
  const uint64_t hashing_constant = 2654435761;
  const uint64_t N = 32;
  // hash once to prevent problems from anomalous input id distributions
  feature_id *= hashing_constant;
  feature_id += bucket_index;
  // this hash enables the following right shift operation
  // without losing the bucket information (lower bits)
  feature_id *= hashing_constant;
  // output size is a power of 2
  feature_id >>= N - output_bits;
  uint64_t mask = (1UL << output_bits) - 1;  // 1UL: avoid 32-bit shift overflow for large output_bits
  return mask & feature_id;
}

uint64_t integer64_multiplicative_hashing(uint64_t feature_id,
                                          int64_t bucket_index,
                                          uint64_t output_bits) {
  const uint64_t hashing_constant = 14695981039346656039UL;
  const uint64_t N = 64;
  // hash once to prevent problems from anomalous input id distributions
  feature_id *= hashing_constant;
  feature_id += bucket_index;
  // this hash enables the following right shift operation
  // without losing the bucket information (lower bits)
  feature_id *= hashing_constant;
  // output size is a power of 2
  feature_id >>= N - output_bits;
  uint64_t mask = (1UL << output_bits) - 1;
  return mask & feature_id;
}

int64_t option_bits(int64_t options, int64_t high, int64_t low) {
  options >>= low;
  options &= (1 << (high - low + 1)) - 1;
  return options;
}

// it is assumed that start_compute and end_compute are valid
template<typename T>
void hashDiscretizerInfer(Tensor &output_keys,
                          Tensor &output_vals,
                          const Tensor &input_ids,
                          const Tensor &input_vals,
                          const Tensor &bin_vals,
                          int output_bits,
                          const Map<int64_t, int64_t> &ID_to_index,
                          int64_t start_compute,
                          int64_t end_compute,
                          int64_t n_bin,
                          int64_t options) {
  auto output_keys_data = output_keys.getData<int64_t>();
  auto output_vals_data = output_vals.getData<T>();

  auto input_ids_data = input_ids.getData<int64_t>();
  auto input_vals_data = input_vals.getData<T>();

  auto bin_vals_data = bin_vals.getData<T>();

  // The function pointer implementation removes the option_bits
  // function call (might be inlined) and corresponding branch from
  // the hot loop, but it prevents inlining these functions, so
  // there will be function call overhead. Uncertain which would
  // be faster, testing needed. Also, code optimizers do weird things...
  hash_signature hash_fn = integer_multiplicative_hashing;
  switch (option_bits(options, 4, 2)) {
    case 0:
      hash_fn = integer_multiplicative_hashing;
      break;
    case 1:
      hash_fn = integer64_multiplicative_hashing;
      break;
    default:
      hash_fn = integer_multiplicative_hashing;
  }

  search_method<T> search_fn = lower_bound_search;
  switch (option_bits(options, 1, 0)) {
    case 0:
      search_fn = lower_bound_search<T>;
      break;
    case 1:
      search_fn = linear_search<T>;
      break;
    case 2:
      search_fn = upper_bound_search<T>;
      break;
    default:
      search_fn = lower_bound_search<T>;
  }

  for (uint64_t i = start_compute; i < end_compute; i++) {
    int64_t id = input_ids_data[i];
    T val = input_vals_data[i];

    auto iter = ID_to_index.find(id);
    if (iter != ID_to_index.end()) {
      int64_t feature_idx = iter->second;
      const T *bin_vals_start = bin_vals_data + feature_idx * n_bin;
      int64_t out_bin_idx = search_fn(bin_vals_start, val, n_bin);
      output_keys_data[i] = hash_fn(id, out_bin_idx, output_bits);
      output_vals_data[i] = 1;
    } else {
      // feature not calibrated
      output_keys_data[i] = id & ((1UL << output_bits) - 1);
      output_vals_data[i] = val;
    }
  }
}

void hashDiscretizerInfer(Tensor &output_keys,
                          Tensor &output_vals,
                          const Tensor &input_ids,
                          const Tensor &input_vals,
                          int n_bin,
                          const Tensor &bin_vals,
                          int output_bits,
                          const Map<int64_t, int64_t> &ID_to_index,
                          int start_compute,
                          int end_compute,
                          int64_t options) {
  if (input_ids.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "input_ids must be a Long Tensor");
  }

  if (output_keys.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor");
  }

  if (input_vals.getType() != bin_vals.getType()) {
    throw twml::Error(TWML_ERR_TYPE,
                      "Data type of input_vals does not match type of bin_vals");
  }

  if (bin_vals.getNumDims() != 1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "bin_vals must be 1 Dimensional");
  }

  uint64_t size = input_ids.getDim(0);
  if (end_compute == -1) {
    end_compute = size;
  }

  if (start_compute < 0 || start_compute >= size) {
    throw twml::Error(TWML_ERR_SIZE,
                      "start_compute out of range");
  }

  if (end_compute < -1 || end_compute > size) {
    throw twml::Error(TWML_ERR_SIZE,
                      "end_compute out of range");
  }

  if (start_compute > end_compute && end_compute != -1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "must have start_compute <= end_compute, or end_compute==-1");
  }

  if (output_keys.getStride(0) != 1 || output_vals.getStride(0) != 1 ||
      input_ids.getStride(0) != 1 || input_vals.getStride(0) != 1 ||
      bin_vals.getStride(0) != 1) {
    throw twml::Error(TWML_ERR_SIZE,
                      "All Strides must be 1.");
  }

  switch (input_vals.getType()) {
    case TWML_TYPE_FLOAT:
      twml::hashDiscretizerInfer<float>(output_keys, output_vals,
                                        input_ids, input_vals,
                                        bin_vals, output_bits, ID_to_index,
                                        start_compute, end_compute, n_bin, options);
      break;
    case TWML_TYPE_DOUBLE:
      twml::hashDiscretizerInfer<double>(output_keys, output_vals,
                                         input_ids, input_vals,
                                         bin_vals, output_bits, ID_to_index,
                                         start_compute, end_compute, n_bin, options);
      break;
    default:
      throw twml::Error(TWML_ERR_TYPE,
                        "Unsupported datatype for hashDiscretizerInfer");
  }
}
} // namespace twml
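A standalone demonstration of the multiplicative hashing described above: sequential feature IDs, which a plain modulo would bucket predictably, get spread across the 2**output_bits key space. This mirrors the 32-bit variant with Knuth's constant (illustration only, not part of this commit):

#include <cstdint>
#include <cstdio>

static uint64_t mult_hash(uint64_t feature_id, int64_t bucket_index, uint64_t output_bits) {
  const uint64_t c = 2654435761;  // co-prime with 2**32, as noted above
  feature_id *= c;                // spread anomalous input distributions
  feature_id += bucket_index;     // mix in the discretizer bucket
  feature_id *= c;                // protect the bucket bits before the shift
  feature_id >>= 32 - output_bits;
  return feature_id & ((1UL << output_bits) - 1);
}

int main() {
  for (uint64_t id = 0; id < 4; id++)
    printf("id %llu -> key %llu\n", (unsigned long long)id,
           (unsigned long long)mult_hash(id, 0, 8));
  return 0;
}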
BIN twml/libtwml/src/lib/hashing_discretizer_impl.docx Normal file
Binary file not shown.
BIN twml/libtwml/src/lib/internal/endianutils.docx Normal file
Binary file not shown.
@@ -1,137 +0,0 @@
//
// endian_fix.h
// ImageCore
//
// For OSes that use glibc < 2.9 (like RHEL5)
//
#pragma once

#ifdef __APPLE__
#include <libkern/OSByteOrder.h>
#define htobe16(x) OSSwapHostToBigInt16(x)
#define htole16(x) OSSwapHostToLittleInt16(x)
#define betoh16(x) OSSwapBigToHostInt16(x)
#define letoh16(x) OSSwapLittleToHostInt16(x)
#define htobe32(x) OSSwapHostToBigInt32(x)
#define htole32(x) OSSwapHostToLittleInt32(x)
#define betoh32(x) OSSwapBigToHostInt32(x)
#define letoh32(x) OSSwapLittleToHostInt32(x)
#define htobe64(x) OSSwapHostToBigInt64(x)
#define htole64(x) OSSwapHostToLittleInt64(x)
#define betoh64(x) OSSwapBigToHostInt64(x)
#define letoh64(x) OSSwapLittleToHostInt64(x)
#else
#include <endian.h>
#ifdef __USE_BSD
/* Conversion interfaces. */
#include <byteswap.h>

#if __BYTE_ORDER == __LITTLE_ENDIAN
#ifndef htobe16
#define htobe16(x) __bswap_16(x)
#endif
#ifndef htole16
#define htole16(x) (x)
#endif
#ifndef betoh16
#define betoh16(x) __bswap_16(x)
#endif
#ifndef letoh16
#define letoh16(x) (x)
#endif

#ifndef htobe32
#define htobe32(x) __bswap_32(x)
#endif
#ifndef htole32
#define htole32(x) (x)
#endif
#ifndef betoh32
#define betoh32(x) __bswap_32(x)
#endif
#ifndef letoh32
#define letoh32(x) (x)
#endif

#ifndef htobe64
#define htobe64(x) __bswap_64(x)
#endif
#ifndef htole64
#define htole64(x) (x)
#endif
#ifndef betoh64
#define betoh64(x) __bswap_64(x)
#endif
#ifndef letoh64
#define letoh64(x) (x)
#endif

#else /* __BYTE_ORDER == __LITTLE_ENDIAN */
#ifndef htobe16
#define htobe16(x) (x)
#endif
#ifndef htole16
#define htole16(x) __bswap_16(x)
#endif
#ifndef be16toh
#define be16toh(x) (x)
#endif
#ifndef le16toh
#define le16toh(x) __bswap_16(x)
#endif

#ifndef htobe32
#define htobe32(x) (x)
#endif
#ifndef htole32
#define htole32(x) __bswap_32(x)
#endif
#ifndef betoh32
#define betoh32(x) (x)
#endif
#ifndef letoh32
#define letoh32(x) __bswap_32(x)
#endif

#ifndef htobe64
#define htobe64(x) (x)
#endif
#ifndef htole64
#define htole64(x) __bswap_64(x)
#endif
#ifndef betoh64
#define betoh64(x) (x)
#endif
#ifndef letoh64
#define letoh64(x) __bswap_64(x)
#endif

#endif /* __BYTE_ORDER == __LITTLE_ENDIAN */

#else /* __USE_BSD */
#ifndef betoh16
#define betoh16 be16toh
#endif

#ifndef betoh32
#define betoh32 be32toh
#endif

#ifndef betoh64
#define betoh64 be64toh
#endif

#ifndef letoh16
#define letoh16 le16toh
#endif

#ifndef letoh32
#define letoh32 le32toh
#endif

#ifndef letoh64
#define letoh64 le64toh
#endif

#endif /* __USE_BSD */
#endif /* __APPLE__ */
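Usage sketch for the macros above: betoh32 is a byte swap on little-endian hosts and the identity on big-endian ones, so callers like ThriftReader stay platform-agnostic (illustration only, assuming the twml include paths):

#include <cstdint>
#include <cstdio>
#include "internal/endianutils.h"

int main() {
  uint32_t n = 0x01020304;
  printf("0x%08x\n", (uint32_t)betoh32(n));  // prints 0x04030201 on x86
  return 0;
}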
BIN twml/libtwml/src/lib/internal/error.docx Normal file
Binary file not shown.
@@ -1,29 +0,0 @@
#pragma once
#include <twml/Error.h>
#include <iostream>

#define HANDLE_EXCEPTIONS(fn) do { \
    try { \
      fn \
    } catch(const twml::Error &e) { \
      std::cerr << e.what() << std::endl; \
      return e.err(); \
    } catch(...) { \
      std::cerr << "Unknown error\n"; \
      return TWML_ERR_UNKNOWN; \
    } \
  } while(0)

#define TWML_CHECK(fn, msg) do { \
    twml_err err = fn; \
    if (err == TWML_ERR_NONE) break; \
    throw twml::Error(err, msg); \
  } while(0)


#define CHECK_THRIFT_TYPE(real_type, expected_type, type) do { \
    int real_type_val = real_type; \
    if (real_type_val != expected_type) { \
      throw twml::ThriftInvalidType(real_type_val, __func__, type); \
    } \
  } while(0)
BIN twml/libtwml/src/lib/internal/interpolate.docx Normal file
Binary file not shown.
@@ -1,74 +0,0 @@
#pragma once

#ifdef __cplusplus
#include <twml/optim.h>
namespace twml {

enum InterpolationMode {LINEAR, NEAREST};

template<typename Tx, typename Ty>
static Tx interpolation(const Tx *xsData, const int64_t xsStride,
                        const Ty *ysData, const int64_t ysStride,
                        const Tx val, const int64_t mainSize,
                        const InterpolationMode mode,
                        const int64_t lowest,
                        const bool return_local_index = false) {
  int64_t left = 0;
  int64_t right = mainSize - 1;

  if (val <= xsData[0]) {
    right = 0;
  } else if (val >= xsData[right * xsStride]) {
    left = right;
  } else {
    while (left < right) {
      int64_t middle = (left + right) / 2;

      if (middle < mainSize - 1 &&
          val >= xsData[middle * xsStride] &&
          val <= xsData[(middle + 1) * xsStride]) {
        left = middle;
        right = middle + 1;
        break;
      } else if (val > xsData[middle * xsStride]) {
        left = middle;
      } else {
        right = middle;
      }
    }
    if (lowest) {
      while (left > 0 &&
             val >= xsData[(left - 1) * xsStride] &&
             val == xsData[left * xsStride]) {
        left--;
        right--;
      }
    }
  }

  Ty out = 0;
  if (return_local_index) {
    out = left;
  } else if (mode == NEAREST) {
    out = ysData[left * ysStride];
  } else {
    int64_t leftys = left * ysStride;
    int64_t rightys = right * ysStride;
    int64_t leftxs = left * xsStride;
    int64_t rightxs = right * xsStride;
    if (right != left + 1 ||
        xsData[leftxs] == xsData[rightxs]) {
      out = ysData[leftys];
    } else {
      Tx xLeft = xsData[leftxs];
      Tx xRight = xsData[rightxs];
      Tx yLeft = ysData[leftys];
      Tx ratio = (val - xLeft) / (xRight - xLeft);
      out = ratio * (ysData[rightys] - yLeft) + yLeft;
    }
  }
  return out;
}

} // namespace twml
#endif
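A worked example of the search above (values invented; assumes the twml include paths, since interpolate.h pulls in <twml/optim.h>). discretizerInfer calls this with NEAREST, which returns the y of the left neighbor found by the binary search; LINEAR would blend both neighbors, giving 20 + 0.4 * (30 - 20) = 24 here:

#include <cstdint>
#include <cstdio>
#include "internal/interpolate.h"

int main() {
  double xs[] = {0.0, 1.0, 2.0};
  int64_t ys[] = {10, 20, 30};
  // val = 1.4 brackets to left = 1, right = 2
  double nearest = twml::interpolation<double, int64_t>(xs, 1, ys, 1, 1.4, 3,
                                                        twml::NEAREST, 0);
  printf("NEAREST -> %g\n", nearest);  // 20, i.e. ys[left]
  return 0;
}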
BIN twml/libtwml/src/lib/internal/khash.docx Normal file
Binary file not shown.
@@ -1,627 +0,0 @@
/* The MIT License

   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/*
  An example:

#include "khash.h"
KHASH_MAP_INIT_INT(32, char)
int main() {
  int ret, is_missing;
  khiter_t k;
  khash_t(32) *h = kh_init(32);
  k = kh_put(32, h, 5, &ret);
  kh_value(h, k) = 10;
  k = kh_get(32, h, 10);
  is_missing = (k == kh_end(h));
  k = kh_get(32, h, 5);
  kh_del(32, h, k);
  for (k = kh_begin(h); k != kh_end(h); ++k)
    if (kh_exist(h, k)) kh_value(h, k) = 1;
  kh_destroy(32, h);
  return 0;
}
*/

/*
  2013-05-02 (0.2.8):

  * Use quadratic probing. When the capacity is power of 2, stepping function
    i*(i+1)/2 guarantees to traverse each bucket. It is better than double
    hashing on cache performance and is more robust than linear probing.

    In theory, double hashing should be more robust than quadratic probing.
    However, my implementation is probably not for large hash tables, because
    the second hash function is closely tied to the first hash function,
    which reduce the effectiveness of double hashing.

  Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php

  2011-12-29 (0.2.7):

  * Minor code clean up; no actual effect.

  2011-09-16 (0.2.6):

  * The capacity is a power of 2. This seems to dramatically improve the
    speed for simple keys. Thank Zilong Tan for the suggestion. Reference:

    - http://code.google.com/p/ulib/
    - http://nothings.org/computer/judy/

  * Allow to optionally use linear probing which usually has better
    performance for random input. Double hashing is still the default as it
    is more robust to certain non-random input.

  * Added Wang's integer hash function (not used by default). This hash
    function is more robust to certain non-random input.

  2011-02-14 (0.2.5):

  * Allow to declare global functions.

  2009-09-26 (0.2.4):

  * Improve portability

  2008-09-19 (0.2.3):

  * Corrected the example
  * Improved interfaces

  2008-09-11 (0.2.2):

  * Improved speed a little in kh_put()

  2008-09-10 (0.2.1):

  * Added kh_clear()
  * Fixed a compiling error

  2008-09-02 (0.2.0):

  * Changed to token concatenation which increases flexibility.

  2008-08-31 (0.1.2):

  * Fixed a bug in kh_get(), which has not been tested previously.

  2008-08-31 (0.1.1):

  * Added destructor
*/


#ifndef __AC_KHASH_H
#define __AC_KHASH_H

/*!
  @header

  Generic hash table library.
*/

#define AC_VERSION_KHASH_H "0.2.8"

#include <stdlib.h>
#include <string.h>
#include <limits.h>

/* compiler specific configuration */

#if UINT_MAX == 0xffffffffu
typedef unsigned int khint32_t;
#elif ULONG_MAX == 0xffffffffu
typedef unsigned long khint32_t;
#endif

#if ULONG_MAX == ULLONG_MAX
typedef unsigned long khint64_t;
#else
typedef uint64_t khint64_t;
#endif

#ifndef kh_inline
#ifdef _MSC_VER
#define kh_inline __inline
#else
#define kh_inline inline
#endif
#endif /* kh_inline */

#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */

typedef khint32_t khint_t;
typedef khint_t khiter_t;

#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))

#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)

#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

#ifndef kcalloc
#define kcalloc(N,Z) calloc(N,Z)
#endif
#ifndef kmalloc
#define kmalloc(Z) malloc(Z)
#endif
#ifndef krealloc
#define krealloc(P,Z) realloc(P,Z)
#endif
#ifndef kfree
#define kfree(P) free(P)
#endif

static const double __ac_HASH_UPPER = 0.77;

#define __KHASH_TYPE(name, khkey_t, khval_t) \
  typedef struct kh_##name##_s { \
    khint_t n_buckets, size, n_occupied, upper_bound; \
    khint32_t *flags; \
    khkey_t *keys; \
    khval_t *vals; \
  } kh_##name##_t;

#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
  extern kh_##name##_t *kh_init_##name(void); \
  extern void kh_destroy_##name(kh_##name##_t *h); \
  extern void kh_clear_##name(kh_##name##_t *h); \
  extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
  extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
  extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
  extern void kh_del_##name(kh_##name##_t *h, khint_t x);

#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
  SCOPE kh_##name##_t *kh_init_##name(void) { \
    return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
  } \
  SCOPE void kh_destroy_##name(kh_##name##_t *h) \
  { \
    if (h) { \
      kfree((void *)h->keys); kfree(h->flags); \
      kfree((void *)h->vals); \
      kfree(h); \
    } \
  } \
  SCOPE void kh_clear_##name(kh_##name##_t *h) \
  { \
    if (h && h->flags) { \
      memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
      h->size = h->n_occupied = 0; \
    } \
  } \
  SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
  { \
    if (h->n_buckets) { \
      khint_t k, i, last, mask, step = 0; \
      mask = h->n_buckets - 1; \
      k = __hash_func(key); i = k & mask; \
      last = i; \
      while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
        i = (i + (++step)) & mask; \
        if (i == last) return h->n_buckets; \
      } \
      return __ac_iseither(h->flags, i)? h->n_buckets : i; \
    } else return 0; \
  } \
  SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
  { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
    khint32_t *new_flags = 0; \
    khint_t j = 1; \
    { \
      kroundup32(new_n_buckets); \
      if (new_n_buckets < 4) new_n_buckets = 4; \
      if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
      else { /* hash table size to be changed (shrink or expand); rehash */ \
        new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
        if (!new_flags) return -1; \
        memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
        if (h->n_buckets < new_n_buckets) { /* expand */ \
          khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
          if (!new_keys) { kfree(new_flags); return -1; } \
          h->keys = new_keys; \
          if (kh_is_map) { \
            khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
            if (!new_vals) { kfree(new_flags); return -1; } \
            h->vals = new_vals; \
          } \
        } /* otherwise shrink */ \
      } \
    } \
    if (j) { /* rehashing is needed */ \
      for (j = 0; j != h->n_buckets; ++j) { \
        if (__ac_iseither(h->flags, j) == 0) { \
          khkey_t key = h->keys[j]; \
          khval_t val; \
          khint_t new_mask; \
          new_mask = new_n_buckets - 1; \
          if (kh_is_map) val = h->vals[j]; \
          __ac_set_isdel_true(h->flags, j); \
          while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
            khint_t k, i, step = 0; \
            k = __hash_func(key); \
            i = k & new_mask; \
            while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
            __ac_set_isempty_false(new_flags, i); \
            if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
              { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
              if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
              __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
            } else { /* write the element and jump out of the loop */ \
              h->keys[i] = key; \
              if (kh_is_map) h->vals[i] = val; \
              break; \
            } \
          } \
        } \
      } \
      if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
        h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
        if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
      } \
      kfree(h->flags); /* free the working space */ \
      h->flags = new_flags; \
      h->n_buckets = new_n_buckets; \
      h->n_occupied = h->size; \
      h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
    } \
    return 0; \
  } \
  SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
  { \
    khint_t x; \
    if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
      if (h->n_buckets > (h->size<<1)) { \
        if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
          *ret = -1; return h->n_buckets; \
        } \
      } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
        *ret = -1; return h->n_buckets; \
      } \
    } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
    { \
      khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
      x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
      if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
      else { \
        last = i; \
        while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
          if (__ac_isdel(h->flags, i)) site = i; \
          i = (i + (++step)) & mask; \
          if (i == last) { x = site; break; } \
        } \
        if (x == h->n_buckets) { \
          if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
          else x = i; \
        } \
      } \
    } \
    if (__ac_isempty(h->flags, x)) { /* not present at all */ \
      h->keys[x] = key; \
      __ac_set_isboth_false(h->flags, x); \
      ++h->size; ++h->n_occupied; \
      *ret = 1; \
    } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
      h->keys[x] = key; \
      __ac_set_isboth_false(h->flags, x); \
      ++h->size; \
      *ret = 2; \
    } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
    return x; \
  } \
  SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
  { \
    if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
      __ac_set_isdel_true(h->flags, x); \
      --h->size; \
    } \
  }

#define KHASH_DECLARE(name, khkey_t, khval_t) \
  __KHASH_TYPE(name, khkey_t, khval_t) \
  __KHASH_PROTOTYPES(name, khkey_t, khval_t)

#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
  __KHASH_TYPE(name, khkey_t, khval_t) \
  __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)

#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
  KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)

/* --- BEGIN OF HASH FUNCTIONS --- */

/*! @function
  @abstract     Integer hash function
  @param  key   The integer [khint32_t]
  @return       The hash value [khint_t]
*/
#define kh_int_hash_func(key) (khint32_t)(key)
/*! @function
  @abstract     Integer comparison function
*/
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     64-bit integer hash function
  @param  key   The integer [khint64_t]
  @return       The hash value [khint_t]
*/
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
/*! @function
  @abstract     64-bit integer comparison function
*/
#define kh_int64_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     const char* hash function
  @param  s     Pointer to a null terminated string
  @return       The hash value
*/
static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
  khint_t h = (khint_t)*s;
  if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
  return h;
}
/*! @function
  @abstract     Another interface to const char* hash function
  @param  key   Pointer to a null terminated string [const char*]
  @return       The hash value [khint_t]
*/
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
/*! @function
  @abstract     Const char* comparison function
*/
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)

static kh_inline khint_t __ac_Wang_hash(khint_t key)
{
  key += ~(key << 15);
  key ^= (key >> 10);
  key += (key << 3);
  key ^= (key >> 6);
  key += ~(key << 11);
  key ^= (key >> 16);
  return key;
}
#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)

/* --- END OF HASH FUNCTIONS --- */

/* Other convenient macros... */

/*!
  @abstract Type of the hash table.
  @param  name  Name of the hash table [symbol]
*/
#define khash_t(name) kh_##name##_t

/*! @function
  @abstract     Initiate a hash table.
  @param  name  Name of the hash table [symbol]
  @return       Pointer to the hash table [khash_t(name)*]
*/
#define kh_init(name) kh_init_##name()

/*! @function
  @abstract     Destroy a hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
*/
#define kh_destroy(name, h) kh_destroy_##name(h)

/*! @function
  @abstract     Reset a hash table without deallocating memory.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
*/
#define kh_clear(name, h) kh_clear_##name(h)

/*! @function
  @abstract     Resize a hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  s     New size [khint_t]
*/
#define kh_resize(name, h, s) kh_resize_##name(h, s)

/*! @function
  @abstract     Insert a key to the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @param  r     Extra return code: -1 if the operation failed;
                0 if the key is present in the hash table;
                1 if the bucket is empty (never used); 2 if the element in
                the bucket has been deleted [int*]
  @return       Iterator to the inserted element [khint_t]
*/
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)

/*! @function
  @abstract     Retrieve a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
*/
#define kh_get(name, h, k) kh_get_##name(h, k)

/*! @function
  @abstract     Remove a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Iterator to the element to be deleted [khint_t]
*/
#define kh_del(name, h, k) kh_del_##name(h, k)

/*! @function
  @abstract     Test whether a bucket contains data.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       1 if containing data; 0 otherwise [int]
*/
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))

/*! @function
  @abstract     Get key given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Key [type of keys]
*/
#define kh_key(h, x) ((h)->keys[x])

/*! @function
  @abstract     Get value given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Value [type of values]
  @discussion   For hash sets, calling this results in segfault.
*/
#define kh_val(h, x) ((h)->vals[x])

/*! @function
  @abstract     Alias of kh_val()
*/
#define kh_value(h, x) ((h)->vals[x])

/*! @function
  @abstract     Get the start iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The start iterator [khint_t]
*/
#define kh_begin(h) (khint_t)(0)

/*! @function
  @abstract     Get the end iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The end iterator [khint_t]
*/
#define kh_end(h) ((h)->n_buckets)

/*! @function
  @abstract     Get the number of elements in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of elements in the hash table [khint_t]
*/
#define kh_size(h) ((h)->size)

/*! @function
  @abstract     Get the number of buckets in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of buckets in the hash table [khint_t]
*/
#define kh_n_buckets(h) ((h)->n_buckets)

/*! @function
  @abstract     Iterate over the entries in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  kvar  Variable to which key will be assigned
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
*/
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
  for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
    if (!kh_exist(h,__i)) continue; \
    (kvar) = kh_key(h,__i); \
    (vvar) = kh_val(h,__i); \
    code; \
  } }

/*! @function
  @abstract     Iterate over the values in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
*/
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
  for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
    if (!kh_exist(h,__i)) continue; \
    (vvar) = kh_val(h,__i); \
    code; \
  } }

/* More convenient interfaces */

/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
|
||||
*/
|
||||
#define KHASH_SET_INIT_INT(name) \
|
||||
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_INT(name, khval_t) \
|
||||
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing 64-bit integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define KHASH_SET_INIT_INT64(name) \
|
||||
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing 64-bit integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_INT64(name, khval_t) \
|
||||
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
typedef const char *kh_cstr_t;
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing const char* keys
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define KHASH_SET_INIT_STR(name) \
|
||||
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing const char* keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_STR(name, khval_t) \
|
||||
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
|
||||
|
||||
#endif /* __AC_KHASH_H */
|
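The macros above are the whole public surface of khash. As a quick, hedged illustration (not part of the original header), a minimal program that builds an int-to-int map with them:

/* Hedged usage sketch, assuming khash.h is on the include path. */
#include <stdio.h>
#include "khash.h"

KHASH_MAP_INIT_INT(i2i, int)  /* instantiates khash_t(i2i) and the kh_*_i2i() functions */

int main(void) {
  int ret;
  khash_t(i2i) *h = kh_init(i2i);
  khint_t it = kh_put(i2i, h, 42, &ret);   /* ret == 1: bucket was empty */
  kh_val(h, it) = 7;
  it = kh_get(i2i, h, 42);
  if (it != kh_end(h)) printf("42 -> %d\n", kh_val(h, it));
  int k, v;
  kh_foreach(h, k, v, printf("%d -> %d\n", k, v));
  kh_destroy(i2i, h);
  return 0;
}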
BIN twml/libtwml/src/lib/internal/linear_search.docx (Normal file, binary not shown)
@ -1,17 +0,0 @@
#pragma once

#ifdef __cplusplus
#include <twml/optim.h>
namespace twml {

// Scans the sorted array xsData from the left and returns the index of the
// first element that is >= val (or mainSize if every element is smaller).
template<typename Tx>
static int64_t linear_search(const Tx *xsData, const Tx val, const int64_t mainSize) {
  int64_t left = 0;
  int64_t right = mainSize-1;
  while(left <= right && val > xsData[left])
    left++;
  return left;
}

} // namespace twml
#endif
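Because linear_search returns the first index whose element is >= val, it matches std::lower_bound on sorted input. A small hypothetical check (assumes the header is reachable as internal/linear_search.h):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include "internal/linear_search.h"

int main() {
  const double xs[] = {0.1, 0.5, 0.9};
  for (double v : {0.0, 0.5, 1.0}) {
    const int64_t idx = twml::linear_search(xs, v, 3);
    // linear_search agrees with std::lower_bound on sorted data.
    assert(idx == std::lower_bound(xs, xs + 3, v) - xs);
  }
  return 0;
}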
BIN twml/libtwml/src/lib/internal/murmur_hash3.docx (Normal file, binary not shown)
@ -1,37 +0,0 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

#if defined(_MSC_VER) && (_MSC_VER < 1600)

typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned __int64 uint64_t;

// Other compilers

#else // defined(_MSC_VER)

#include <stdint.h>

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------

void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );

void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );

void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );

//-----------------------------------------------------------------------------

#endif // _MURMURHASH3_H_
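A hedged usage sketch for these declarations (hash values are printed rather than asserted, since they depend on the implementation further below):

#include <cstdio>
#include <cstring>
#include "internal/murmur_hash3.h"

int main() {
  const char *msg = "hello";
  uint32_t h32 = 0;
  MurmurHash3_x86_32(msg, (int)strlen(msg), /*seed=*/42, &h32);
  uint64_t h128[2] = {0, 0};
  MurmurHash3_x64_128(msg, (int)strlen(msg), /*seed=*/42, h128);
  printf("x86_32: %u, x64_128: %llu %llu\n", h32,
         (unsigned long long)h128[0], (unsigned long long)h128[1]);
  return 0;
}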
BIN twml/libtwml/src/lib/internal/thrift.docx (Normal file, binary not shown)
@ -1,69 +0,0 @@
// For details of how to encode and decode thrift, check
// https://github.com/apache/thrift/blob/master/doc/specs/thrift-binary-protocol.md

// Definitions of the thrift binary format
typedef enum {
  TTYPE_STOP   = 0,
  TTYPE_VOID   = 1,
  TTYPE_BOOL   = 2,
  TTYPE_BYTE   = 3,
  TTYPE_DOUBLE = 4,
  TTYPE_I16    = 6,
  TTYPE_I32    = 8,
  TTYPE_I64    = 10,
  TTYPE_STRING = 11,
  TTYPE_STRUCT = 12,
  TTYPE_MAP    = 13,
  TTYPE_SET    = 14,
  TTYPE_LIST   = 15,
  TTYPE_ENUM   = 16,
} TTYPES;

// Fields of a batch prediction response
typedef enum {
  BPR_DUMMY       ,
  BPR_PREDICTIONS ,
} BPR_FIELDS;

// Fields of a datarecord
typedef enum {
  DR_CROSS             , // fake field for crosses
  DR_BINARY            ,
  DR_CONTINUOUS        ,
  DR_DISCRETE          ,
  DR_STRING            ,
  DR_SPARSE_BINARY     ,
  DR_SPARSE_CONTINUOUS ,
  DR_BLOB              ,
  DR_GENERAL_TENSOR    ,
  DR_SPARSE_TENSOR     ,
} DR_FIELDS;

// Fields for General tensor
typedef enum {
  GT_DUMMY  , // dummy field
  GT_RAW    ,
  GT_STRING ,
  GT_INT32  ,
  GT_INT64  ,
  GT_FLOAT  ,
  GT_DOUBLE ,
  GT_BOOL   ,
} GT_FIELDS;

typedef enum {
  SP_DUMMY , // dummy field
  SP_COO   ,
} SP_FIELDS;

// Enum values from tensor.thrift
typedef enum {
  DATA_TYPE_FLOAT  ,
  DATA_TYPE_DOUBLE ,
  DATA_TYPE_INT32  ,
  DATA_TYPE_INT64  ,
  DATA_TYPE_UINT8  ,
  DATA_TYPE_STRING ,
  DATA_TYPE_BYTE   ,
  DATA_TYPE_BOOL   ,
} DATA_TYPES;
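Per the thrift-binary-protocol spec linked above, a list header is one element-type byte followed by a big-endian int32 element count. A standalone encoder for illustration only (writeListHeader is an invented helper, not the twml ThriftWriter API):

#include <cstddef>
#include <cstdint>

// Hypothetical sketch: serialize a binary-protocol list header
// (1 element-type byte + big-endian int32 count) into buf; returns bytes written.
static size_t writeListHeader(uint8_t *buf, uint8_t elemType, int32_t count) {
  buf[0] = elemType;  // e.g. TTYPE_I64 for a list of 64-bit ints
  buf[1] = (uint8_t)((count >> 24) & 0xFF);
  buf[2] = (uint8_t)((count >> 16) & 0xFF);
  buf[3] = (uint8_t)((count >> 8) & 0xFF);
  buf[4] = (uint8_t)(count & 0xFF);
  return 5;
}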
BIN twml/libtwml/src/lib/internal/utf_converter.docx (Normal file, binary not shown)
@ -1,10 +0,0 @@
#ifndef _UTF_CONVERTER_H_
#define _UTF_CONVERTER_H_

#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>

ssize_t utf8_to_utf16(const uint8_t *in, uint64_t in_len, uint16_t *out, uint64_t max_out);

#endif
@ -1,61 +0,0 @@
#include <twml/io/IOError.h>

namespace twml {
namespace io {

namespace {
std::string messageFromStatus(IOError::Status status) {
  switch (status) {
    case IOError::OUT_OF_RANGE:
      return "failed to read enough input";
    case IOError::WRONG_MAGIC:
      return "wrong magic in stream";
    case IOError::WRONG_HEADER:
      return "wrong header in stream";
    case IOError::ERROR_HEADER_CHECKSUM:
      return "header checksum doesn't match";
    case IOError::INVALID_METHOD:
      return "using invalid method";
    case IOError::USING_RESERVED:
      return "using reserved flag";
    case IOError::ERROR_HEADER_EXTRA_FIELD_CHECKSUM:
      return "extra header field checksum doesn't match";
    case IOError::CANT_FIT_OUTPUT:
      return "can't fit output in the given space";
    case IOError::SPLIT_FILE:
      return "split files aren't supported";
    case IOError::BLOCK_SIZE_TOO_LARGE:
      return "block size is too large";
    case IOError::SOURCE_LARGER_THAN_DESTINATION:
      return "source is larger than destination";
    case IOError::DESTINATION_LARGER_THAN_CAPACITY:
      return "destination buffer is too small to fit uncompressed result";
    case IOError::HEADER_FLAG_MISMATCH:
      return "failed to match flags for compressed and decompressed data";
    case IOError::NOT_ENOUGH_INPUT:
      return "not enough input to proceed with decompression";
    case IOError::ERROR_SOURCE_BLOCK_CHECKSUM:
      return "source block checksum doesn't match";
    case IOError::COMPRESSED_DATA_VIOLATION:
      return "error occurred while decompressing the data";
    case IOError::ERROR_DESTINATION_BLOCK_CHECKSUM:
      return "destination block checksum doesn't match";
    case IOError::EMPTY_RECORD:
      return "can't write an empty record";
    case IOError::MALFORMED_MEMORY_RECORD:
      return "can't write malformed record";
    case IOError::UNSUPPORTED_OUTPUT_TYPE:
      return "output data type is not supported";
    case IOError::OTHER_ERROR:
    default:
      return "unknown error occurred";
  }
}
}  // namespace

IOError::IOError(Status status): twml::Error(TWML_ERR_IO, "Found error while processing stream: " +
                                             messageFromStatus(status)), m_status(status) {}

}  // namespace io
}  // namespace twml
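A hedged sketch of a call site (hypothetical; the ops later in this commit catch these as std::exception and surface e.what(), which carries the messageFromStatus() text):

#include <iostream>
#include <twml/io/IOError.h>

void readSomething() {
  try {
    // Stand-in for a twml I/O call that fails on a bad stream.
    throw twml::io::IOError(twml::io::IOError::WRONG_MAGIC);
  } catch (const std::exception &e) {
    // Prints: "Found error while processing stream: wrong magic in stream"
    std::cerr << e.what() << std::endl;
  }
}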
BIN twml/libtwml/src/lib/io/IOError.docx (Normal file, binary not shown)
@ -1,335 +0,0 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.

#include "internal/murmur_hash3.h"

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

#if defined(_MSC_VER)

#define FORCE_INLINE __forceinline

#include <stdlib.h>

#define ROTL32(x,y) _rotl(x,y)
#define ROTL64(x,y) _rotl64(x,y)

#define BIG_CONSTANT(x) (x)

// Other compilers

#else // defined(_MSC_VER)

#define FORCE_INLINE inline __attribute__((always_inline))

FORCE_INLINE uint32_t rotl32 ( uint32_t x, int8_t r )
{
  return (x << r) | (x >> (32 - r));
}

FORCE_INLINE uint64_t rotl64 ( uint64_t x, int8_t r )
{
  return (x << r) | (x >> (64 - r));
}

#define ROTL32(x,y) rotl32(x,y)
#define ROTL64(x,y) rotl64(x,y)

#define BIG_CONSTANT(x) (x##LLU)

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here

FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i )
{
  return p[i];
}

FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
{
  return p[i];
}

//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche

FORCE_INLINE uint32_t fmix32 ( uint32_t h )
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;

  return h;
}

//----------

FORCE_INLINE uint64_t fmix64 ( uint64_t k )
{
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
  k ^= k >> 33;

  return k;
}

//-----------------------------------------------------------------------------

void MurmurHash3_x86_32 ( const void * key, int len,
                          uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 4;

  uint32_t h1 = seed;

  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;

  //----------
  // body

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);

  for(int i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock32(blocks,i);

    k1 *= c1;
    k1 = ROTL32(k1,15);
    k1 *= c2;

    h1 ^= k1;
    h1 = ROTL32(h1,13);
    h1 = h1*5+0xe6546b64;
  }

  //----------
  // tail

  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);

  uint32_t k1 = 0;

  // Intentional fall-through: each case consumes one remaining tail byte.
  switch(len & 3)
  {
  case 3: k1 ^= tail[2] << 16;
  case 2: k1 ^= tail[1] << 8;
  case 1: k1 ^= tail[0];
          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len;

  h1 = fmix32(h1);

  *(uint32_t*)out = h1;
}

//-----------------------------------------------------------------------------

void MurmurHash3_x86_128 ( const void * key, const int len,
                           uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;

  uint32_t h1 = seed;
  uint32_t h2 = seed;
  uint32_t h3 = seed;
  uint32_t h4 = seed;

  const uint32_t c1 = 0x239b961b;
  const uint32_t c2 = 0xab0e9789;
  const uint32_t c3 = 0x38b34ae5;
  const uint32_t c4 = 0xa1e38b93;

  //----------
  // body

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);

  for(int i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock32(blocks,i*4+0);
    uint32_t k2 = getblock32(blocks,i*4+1);
    uint32_t k3 = getblock32(blocks,i*4+2);
    uint32_t k4 = getblock32(blocks,i*4+3);

    k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;

    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;

    k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;

    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;

    k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;

    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;

    k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;

    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
  }

  //----------
  // tail

  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

  uint32_t k1 = 0;
  uint32_t k2 = 0;
  uint32_t k3 = 0;
  uint32_t k4 = 0;

  // Intentional fall-through: each case consumes one remaining tail byte.
  switch(len & 15)
  {
  case 15: k4 ^= tail[14] << 16;
  case 14: k4 ^= tail[13] << 8;
  case 13: k4 ^= tail[12] << 0;
           k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;

  case 12: k3 ^= tail[11] << 24;
  case 11: k3 ^= tail[10] << 16;
  case 10: k3 ^= tail[ 9] << 8;
  case  9: k3 ^= tail[ 8] << 0;
           k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;

  case  8: k2 ^= tail[ 7] << 24;
  case  7: k2 ^= tail[ 6] << 16;
  case  6: k2 ^= tail[ 5] << 8;
  case  5: k2 ^= tail[ 4] << 0;
           k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;

  case  4: k1 ^= tail[ 3] << 24;
  case  3: k1 ^= tail[ 2] << 16;
  case  2: k1 ^= tail[ 1] << 8;
  case  1: k1 ^= tail[ 0] << 0;
           k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  h1 = fmix32(h1);
  h2 = fmix32(h2);
  h3 = fmix32(h3);
  h4 = fmix32(h4);

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  ((uint32_t*)out)[0] = h1;
  ((uint32_t*)out)[1] = h2;
  ((uint32_t*)out)[2] = h3;
  ((uint32_t*)out)[3] = h4;
}

//-----------------------------------------------------------------------------

void MurmurHash3_x64_128 ( const void * key, const int len,
                           const uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;

  uint64_t h1 = seed;
  uint64_t h2 = seed;

  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

  //----------
  // body

  const uint64_t * blocks = (const uint64_t *)(data);

  for(int i = 0; i < nblocks; i++)
  {
    uint64_t k1 = getblock64(blocks,i*2+0);
    uint64_t k2 = getblock64(blocks,i*2+1);

    k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;

    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;

    k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
  }

  //----------
  // tail

  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

  uint64_t k1 = 0;
  uint64_t k2 = 0;

  // Intentional fall-through: each case consumes one remaining tail byte.
  switch(len & 15)
  {
  case 15: k2 ^= ((uint64_t)tail[14]) << 48;
  case 14: k2 ^= ((uint64_t)tail[13]) << 40;
  case 13: k2 ^= ((uint64_t)tail[12]) << 32;
  case 12: k2 ^= ((uint64_t)tail[11]) << 24;
  case 11: k2 ^= ((uint64_t)tail[10]) << 16;
  case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
  case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
           k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

  case  8: k1 ^= ((uint64_t)tail[ 7]) << 56;
  case  7: k1 ^= ((uint64_t)tail[ 6]) << 48;
  case  6: k1 ^= ((uint64_t)tail[ 5]) << 40;
  case  5: k1 ^= ((uint64_t)tail[ 4]) << 32;
  case  4: k1 ^= ((uint64_t)tail[ 3]) << 24;
  case  3: k1 ^= ((uint64_t)tail[ 2]) << 16;
  case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;
  case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
           k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len; h2 ^= len;

  h1 += h2;
  h2 += h1;

  h1 = fmix64(h1);
  h2 = fmix64(h2);

  h1 += h2;
  h2 += h1;

  ((uint64_t*)out)[0] = h1;
  ((uint64_t*)out)[1] = h2;
}

//-----------------------------------------------------------------------------
BIN twml/libtwml/src/lib/murmur_hash3.docx (Normal file, binary not shown)
@ -1,274 +0,0 @@
#include "internal/interpolate.h"
#include "internal/error.h"
#include <twml/optim.h>

namespace twml {
template<typename T>
void mdlInfer(Tensor &output_keys, Tensor &output_vals,
              const Tensor &input_keys, const Tensor &input_vals,
              const Tensor &bin_ids,
              const Tensor &bin_vals,
              const Tensor &feature_offsets,
              bool return_bin_indices) {
  auto okeysData = output_keys.getData<int64_t>();
  auto ovalsData = output_vals.getData<T>();
  uint64_t okeysStride = output_keys.getStride(0);
  uint64_t ovaluesStride = output_vals.getStride(0);

  auto ikeysData = input_keys.getData<int64_t>();
  auto ivalsData = input_vals.getData<T>();
  uint64_t ikeysStride = input_keys.getStride(0);
  uint64_t ivaluesStride = input_vals.getStride(0);

  auto xsData = bin_vals.getData<T>();
  auto ysData = bin_ids.getData<int64_t>();
  uint64_t xsStride = bin_vals.getStride(0);
  uint64_t ysStride = bin_ids.getStride(0);

  auto offsetData = feature_offsets.getData<int64_t>();

  uint64_t size = input_keys.getDim(0);
  uint64_t total_bins = bin_ids.getNumElements();
  uint64_t fsize = feature_offsets.getNumElements();

  for (uint64_t i = 0; i < size; i++) {
    int64_t ikey = ikeysData[i * ikeysStride] - TWML_INDEX_BASE;
    T val = ivalsData[i * ivaluesStride];
    if (ikey == -1) {
      ovalsData[i * ovaluesStride] = val;
      continue;
    }

    // Perform interpolation
    uint64_t offset = offsetData[ikey];
    uint64_t next_offset = (ikey == (int64_t)(fsize - 1)) ? total_bins : offsetData[ikey + 1];
    uint64_t mainSize = next_offset - offset;

    const T *lxsData = xsData + offset;
    const int64_t *lysData = ysData + offset;
    int64_t okey = interpolation<T, int64_t>(lxsData, xsStride,
                                             lysData, ysStride,
                                             val, mainSize, NEAREST, 0,
                                             return_bin_indices);
    okeysData[i * okeysStride] = okey + TWML_INDEX_BASE;
    ovalsData[i * ovaluesStride] = 1;
  }
}

void mdlInfer(Tensor &output_keys, Tensor &output_vals,
              const Tensor &input_keys, const Tensor &input_vals,
              const Tensor &bin_ids,
              const Tensor &bin_vals,
              const Tensor &feature_offsets,
              bool return_bin_indices) {
  if (input_keys.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "input_keys must be a Long Tensor");
  }

  if (output_keys.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor");
  }

  if (bin_ids.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "bin_ids must be a Long Tensor");
  }

  if (feature_offsets.getType() != TWML_TYPE_INT64) {
    throw twml::Error(TWML_ERR_TYPE, "feature_offsets must be a Long Tensor");
  }

  if (input_vals.getType() != bin_vals.getType()) {
    throw twml::Error(TWML_ERR_TYPE,
      "Data type of input_vals does not match type of bin_vals");
  }

  if (bin_vals.getNumDims() != 1) {
    throw twml::Error(TWML_ERR_SIZE,
      "bin_vals must be 1 Dimensional");
  }

  if (bin_ids.getNumDims() != 1) {
    throw twml::Error(TWML_ERR_SIZE,
      "bin_ids must be 1 Dimensional");
  }

  if (bin_vals.getNumElements() != bin_ids.getNumElements()) {
    throw twml::Error(TWML_ERR_SIZE,
      "Dimensions of bin_vals and bin_ids do not match");
  }

  if (feature_offsets.getStride(0) != 1) {
    throw twml::Error(TWML_ERR_SIZE,
      "feature_offsets must be contiguous");
  }

  switch (input_vals.getType()) {
    case TWML_TYPE_FLOAT:
      twml::mdlInfer<float>(output_keys, output_vals,
                            input_keys, input_vals,
                            bin_ids, bin_vals, feature_offsets,
                            return_bin_indices);
      break;
    case TWML_TYPE_DOUBLE:
      twml::mdlInfer<double>(output_keys, output_vals,
                             input_keys, input_vals,
                             bin_ids, bin_vals, feature_offsets,
                             return_bin_indices);
      break;
    default:
      throw twml::Error(TWML_ERR_TYPE,
        "Unsupported datatype for mdlInfer");
  }
}

const int DEFAULT_INTERPOLATION_LOWEST = 0;
/**
 * @param output tensor to hold linear or nearest interpolation output.
 *               This function does not allocate space.
 *               The output tensor must have space allocated.
 * @param input input tensor; size must match output.
 *              input is assumed to have size [batch_size, number_of_labels].
 * @param xs the bins.
 * @param ys the values for the bins.
 * @param mode: linear or nearest InterpolationMode.
 *              linear is used for isotonic calibration.
 *              nearest is used for MDL calibration and MDL inference.
 *
 * @return Returns nothing. Output is stored into the output tensor.
 *
 * This is used by IsotonicCalibration inference.
 */
template <typename T>
void interpolation(
    Tensor output,
    const Tensor input,
    const Tensor xs,
    const Tensor ys,
    const InterpolationMode mode) {
  // Sanity check: input and output should have two dims.
  if (input.getNumDims() != 2 || output.getNumDims() != 2) {
    throw twml::Error(TWML_ERR_TYPE,
      "input and output should have 2 dimensions.");
  }

  // Sanity check: input and output size should match.
  for (int i = 0; i < input.getNumDims(); i++) {
    if (input.getDim(i) != output.getDim(i)) {
      throw twml::Error(TWML_ERR_TYPE,
        "input and output mismatch in size.");
    }
  }

  // Sanity check: number of labels in input should match
  // number of labels in xs / ys.
  if (input.getDim(1) != xs.getDim(0)
      || input.getDim(1) != ys.getDim(0)) {
    throw twml::Error(TWML_ERR_TYPE,
      "input, xs, ys should have the same number of labels.");
  }

  const uint64_t inputStride0 = input.getStride(0);
  const uint64_t inputStride1 = input.getStride(1);
  const uint64_t outputStride0 = output.getStride(0);
  const uint64_t outputStride1 = output.getStride(1);
  const uint64_t xsStride0 = xs.getStride(0);
  const uint64_t xsStride1 = xs.getStride(1);
  const uint64_t ysStride0 = ys.getStride(0);
  const uint64_t ysStride1 = ys.getStride(1);
  const uint64_t mainSize = xs.getDim(1);

  // for each value in the input matrix, compute output value by
  // calling interpolation.
  auto inputData = input.getData<T>();
  auto outputData = output.getData<T>();
  auto xsData = xs.getData<T>();
  auto ysData = ys.getData<T>();

  for (uint64_t i = 0; i < input.getDim(0); i++) {
    for (uint64_t j = 0; j < input.getDim(1); j++) {
      const T val = inputData[i * inputStride0 + j * inputStride1];
      const T *lxsData = xsData + j * xsStride0;
      const T *lysData = ysData + j * ysStride0;
      const T res = interpolation(
          lxsData, xsStride1,
          lysData, ysStride1,
          val,
          mainSize,
          mode,
          DEFAULT_INTERPOLATION_LOWEST);
      outputData[i * outputStride0 + j * outputStride1] = res;
    }
  }
}

void linearInterpolation(
    Tensor output,
    const Tensor input,
    const Tensor xs,
    const Tensor ys) {
  switch (input.getType()) {
    case TWML_TYPE_FLOAT:
      twml::interpolation<float>(output, input, xs, ys, LINEAR);
      break;
    case TWML_TYPE_DOUBLE:
      twml::interpolation<double>(output, input, xs, ys, LINEAR);
      break;
    default:
      throw twml::Error(TWML_ERR_TYPE,
        "Unsupported datatype for linearInterpolation.");
  }
}

void nearestInterpolation(
    Tensor output,
    const Tensor input,
    const Tensor xs,
    const Tensor ys) {
  switch (input.getType()) {
    case TWML_TYPE_FLOAT:
      twml::interpolation<float>(output, input, xs, ys, NEAREST);
      break;
    case TWML_TYPE_DOUBLE:
      twml::interpolation<double>(output, input, xs, ys, NEAREST);
      break;
    default:
      throw twml::Error(TWML_ERR_TYPE,
        "Unsupported datatype for nearestInterpolation.");
  }
}
}  // namespace twml

twml_err twml_optim_mdl_infer(twml_tensor output_keys,
                              twml_tensor output_vals,
                              const twml_tensor input_keys,
                              const twml_tensor input_vals,
                              const twml_tensor bin_ids,
                              const twml_tensor bin_vals,
                              const twml_tensor feature_offsets,
                              bool return_bin_indices) {
  HANDLE_EXCEPTIONS(
    using namespace twml;
    mdlInfer(*getTensor(output_keys),
             *getTensor(output_vals),
             *getConstTensor(input_keys),
             *getConstTensor(input_vals),
             *getConstTensor(bin_ids),
             *getConstTensor(bin_vals),
             *getConstTensor(feature_offsets),
             return_bin_indices););
  return TWML_ERR_NONE;
}

twml_err twml_optim_nearest_interpolation(
    twml_tensor output,
    const twml_tensor input,
    const twml_tensor xs,
    const twml_tensor ys) {
  HANDLE_EXCEPTIONS(
    using namespace twml;
    nearestInterpolation(*getTensor(output),
                         *getConstTensor(input),
                         *getConstTensor(xs),
                         *getConstTensor(ys)););
  return TWML_ERR_NONE;
}
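To make the linear-versus-nearest distinction in the doc comment above concrete, a standalone conceptual sketch over plain arrays (not the twml kernel, which additionally handles tensors, strides, and the lowest-bin parameter):

#include <cstdio>

// Conceptual sketch of the two interpolation modes along one bin axis.
// xs must be sorted; values outside [xs[0], xs[n-1]] are clamped here.
static double interpolate(const double *xs, const double *ys, int n,
                          double v, bool linear) {
  if (v <= xs[0]) return ys[0];
  if (v >= xs[n - 1]) return ys[n - 1];
  int i = 1;
  while (xs[i] < v) i++;             // first bin edge >= v
  if (!linear)                       // nearest: pick the closer edge
    return (v - xs[i - 1] < xs[i] - v) ? ys[i - 1] : ys[i];
  const double t = (v - xs[i - 1]) / (xs[i] - xs[i - 1]);
  return ys[i - 1] + t * (ys[i] - ys[i - 1]);
}

int main() {
  const double xs[] = {0.0, 1.0}, ys[] = {0.0, 10.0};
  printf("linear(0.25)  = %g\n", interpolate(xs, ys, 2, 0.25, true));   // 2.5
  printf("nearest(0.25) = %g\n", interpolate(xs, ys, 2, 0.25, false));  // 0
  return 0;
}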
BIN twml/libtwml/src/lib/optim.docx (Normal file, binary not shown)
@ -1,53 +0,0 @@
#include "internal/utf_converter.h"

ssize_t utf8_to_utf16(const uint8_t *in, uint64_t in_len, uint16_t *out, uint64_t max_out) {
  uint64_t num_out = 0;
  uint64_t num_in = 0;
  while (num_in < in_len) {
    uint32_t uni;
    uint64_t todo;
    uint8_t ch = in[num_in];
    num_in++;
    if (ch <= 0x7F) {
      uni = ch;
      todo = 0;
    } else if (ch <= 0xBF) {
      return -1;
    } else if (ch <= 0xDF) {
      uni = ch & 0x1F;
      todo = 1;
    } else if (ch <= 0xEF) {
      uni = ch & 0x0F;
      todo = 2;
    } else if (ch <= 0xF7) {
      uni = ch & 0x07;
      todo = 3;
    } else {
      return -1;
    }
    for (uint64_t j = 0; j < todo; ++j) {
      if (num_in == in_len) return -1;
      uint8_t ch = in[num_in];
      num_in++;
      if (ch < 0x80 || ch > 0xBF) return -1;
      uni <<= 6;
      uni += ch & 0x3F;
    }
    if (uni >= 0xD800 && uni <= 0xDFFF) return -1;
    if (uni > 0x10FFFF) return -1;
    if (uni <= 0xFFFF) {
      if (num_out == max_out) return -1;
      out[num_out] = uni;
      num_out++;
    } else {
      uni -= 0x10000;
      if (num_out + 1 >= max_out) return -1;
      out[num_out] = (uni >> 10) + 0xD800;
      out[num_out + 1] = (uni & 0x3FF) + 0xDC00;
      num_out += 2;
    }
  }
  if (num_out == max_out) return -1;
  out[num_out] = 0;
  return num_out;
}
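A hedged usage sketch for utf8_to_utf16 (the byte literals spell "héllo"; the function returns the number of UTF-16 units written, excluding the terminating zero, or -1 on malformed input or insufficient output space):

#include <cstdio>
#include "internal/utf_converter.h"

int main() {
  // "héllo": 'é' is the two-byte UTF-8 sequence 0xC3 0xA9.
  const uint8_t utf8[] = {0x68, 0xC3, 0xA9, 0x6C, 0x6C, 0x6F};
  uint16_t utf16[16];
  ssize_t n = utf8_to_utf16(utf8, sizeof(utf8), utf16, 16);
  printf("wrote %zd UTF-16 units\n", n);  // expected: 5
  return 0;
}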
BIN twml/libtwml/src/lib/utf_converter.docx (Normal file, binary not shown)
BIN twml/libtwml/src/ops/CMakeLists.docx (Normal file, binary not shown)
@ -1,79 +0,0 @@
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
cmake_policy(VERSION 2.8)
set(CMAKE_MACOSX_RPATH 1)

file(GLOB_RECURSE sources *.cpp)

set (CMAKE_CXX_FLAGS "-Wall -std=c++11 -fno-stack-protector ${CMAKE_CXX_FLAGS}")

execute_process(
  COMMAND
  $ENV{LIBTWML_HOME}/src/ops/scripts/get_inc.sh
  RESULT_VARIABLE
  TF_RES
  OUTPUT_VARIABLE
  TF_INC)

if (NOT (${TF_RES} EQUAL "0"))
  message(${TF_RES})
  message(FATAL_ERROR "Failed to get include path for tensorflow")
endif()

execute_process(
  COMMAND
  $ENV{LIBTWML_HOME}/src/ops/scripts/get_lib.sh
  RESULT_VARIABLE
  TF_RES
  OUTPUT_VARIABLE
  TF_LIB)

if (NOT (${TF_RES} EQUAL "0"))
  message(${TF_RES})
  message(FATAL_ERROR "Failed to get lib path for tensorflow")
endif()

find_path(
  TWML_INC
  NAMES "twml.h"
  PATHS $ENV{LIBTWML_HOME}/include)

add_library(twml_tf MODULE ${sources})

set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "$ENV{LIBTWML_HOME}/cmake")

if (UNIX)
  if (APPLE)
    set (CMAKE_CXX_FLAGS "-undefined dynamic_lookup -stdlib=libc++ ${CMAKE_CXX_FLAGS}")
    # -Wl,-all_load ensures symbols not used by twml_tf are also included.
    # -Wl,-noall_load limits the scope of the previous flag.
    set (LINK_ALL_OPTION "-Wl,-all_load")
    set (NO_LINK_ALL_OPTION "-Wl,-noall_load")
    set(TF_FRAMEWORK_LIB ${TF_LIB}/libtensorflow_framework.1.dylib)
  else()
    # -Wl,--whole-archive ensures symbols not used by twml_tf are also included.
    # -Wl,--no-whole-archive limits the scope of the previous flag.
    set (LINK_ALL_OPTION "-Wl,--whole-archive")
    set (NO_LINK_ALL_OPTION "-Wl,--no-whole-archive")
    set(TF_FRAMEWORK_LIB ${TF_LIB}/libtensorflow_framework.so.1)
  endif()
endif()


target_include_directories(
  twml_tf
  PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${TWML_INC}
  # TF_INC needs to be the last to avoid some weird white-spacing issues with generated Makefile.
  ${TF_INC}  # Needed because of some header files auto-generated during build time.
  ${TF_INC}/external/nsync/public/
)

target_link_libraries(twml_tf
  PUBLIC
  # Since we are using twml_tf as the "one" dynamic library,
  # we want it to have the C function symbols needed for other functions as well.
  ${LINK_ALL_OPTION} twml ${NO_LINK_ALL_OPTION}
  ${TF_FRAMEWORK_LIB}
)
@ -1,92 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

using namespace tensorflow;

REGISTER_OP("Add1")
.Attr("T: {float, double, int32}")
.Input("input1: T")
.Output("output: T")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
  c->set_output(0, c->input(0));
  return Status::OK();
});


template<typename T>
class Add1 : public OpKernel {
 public:
  explicit Add1(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // Grab the input tensor
    const Tensor& input_tensor = context->input(0);
    auto input = input_tensor.flat<T>();

    // Create an output tensor
    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output_flat = output_tensor->flat<T>();

    // Add 1 to input and assign to output
    const int N = input.size();
    for (int i = 0; i < N; i++) {
      output_flat(i) = input(i) + 1;
    }
  }
};


REGISTER_OP("Add1Grad")
.Attr("T: {float, double, int32}")
.Input("grad_output: T")
.Output("grad_input: T")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
  c->set_output(0, c->input(0));
  return Status::OK();
});

template<typename T>
class Add1Grad : public OpKernel {
 public:
  explicit Add1Grad(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // Grab the input tensor
    const Tensor& grad_output_tensor = context->input(0);
    auto grad_output = grad_output_tensor.flat<T>();

    // Create a grad_input tensor
    Tensor* grad_input_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, grad_output_tensor.shape(),
                                                     &grad_input_tensor));

    auto grad_input_flat = grad_input_tensor->flat<T>();

    // Copy from grad_output to grad_input: d(x + 1)/dx == 1, so the upstream
    // gradient passes through unchanged.
    const int N = grad_output.size();
    for (int i = 0; i < N; i++) {
      grad_input_flat(i) = grad_output(i);
    }
  }
};

#define REGISTER(Type)                        \
                                              \
  REGISTER_KERNEL_BUILDER(                    \
      Name("Add1")                            \
      .Device(DEVICE_CPU)                     \
      .TypeConstraint<Type>("T"),             \
      Add1<Type>);                            \
                                              \
  REGISTER_KERNEL_BUILDER(                    \
      Name("Add1Grad")                        \
      .Device(DEVICE_CPU)                     \
      .TypeConstraint<Type>("T"),             \
      Add1Grad<Type>);                        \

REGISTER(float);
REGISTER(double);
REGISTER(int32);
BIN twml/libtwml/src/ops/add1.docx (Normal file, binary not shown)
@ -1,183 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"
#include "resource_utils.h"

REGISTER_OP("DecodeAndHashBatchPredictionRequest")
.Input("input_bytes: uint8")
.Attr("keep_features: list(int)")
.Attr("keep_codes: list(int)")
.Attr("decode_mode: int = 0")
.Output("hashed_data_record_handle: resource")
.SetShapeFn(shape_inference::ScalarShape)
.Doc(R"doc(
A tensorflow OP that decodes a batch prediction request and creates a handle to the batch of hashed data records.

Attr
  keep_features: a list of int ids to keep.
  keep_codes: their corresponding code.
  decode_mode: integer, indicates which decoding method to use. Let a sparse continuous
  have a feature_name and a dict of {name: value}. 0 indicates feature_ids are computed
  as hash(name). 1 indicates feature_ids are computed as hash(feature_name, name)
  shared_name: name used by the resource handle inside the resource manager.
  container: name used by the container of the resources.

shared_name and container are required when inheriting from ResourceOpKernel.

Input
  input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest.

Outputs
  hashed_data_record_handle: A resource handle to the HashedDataRecordResource containing the batch of HashedDataRecords.
)doc");

class DecodeAndHashBatchPredictionRequest : public OpKernel {
 public:
  explicit DecodeAndHashBatchPredictionRequest(OpKernelConstruction* context)
    : OpKernel(context) {
    std::vector<int64> keep_features;
    std::vector<int64> keep_codes;

    OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features));
    OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes));
    OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode));

    OP_REQUIRES(context, keep_features.size() == keep_codes.size(),
                errors::InvalidArgument("keep keys and values must have same size."));

#ifdef USE_DENSE_HASH
    m_keep_map.set_empty_key(0);
#endif  // USE_DENSE_HASH

    for (uint64_t i = 0; i < keep_features.size(); i++) {
      m_keep_map[keep_features[i]] = keep_codes[i];
    }
  }

 private:
  twml::Map<int64_t, int64_t> m_keep_map;
  int64 m_decode_mode;

  void Compute(OpKernelContext* context) override {
    try {
      HashedDataRecordResource *resource = nullptr;
      OP_REQUIRES_OK(context, makeResourceHandle<HashedDataRecordResource>(context, 0, &resource));

      // Store the input bytes in the resource so it isn't freed before the resource.
      // This is necessary because we are not copying the contents for tensors.
      resource->input = context->input(0);
      const uint8_t *input_bytes = resource->input.flat<uint8>().data();
      twml::HashedDataRecordReader reader;
      twml::HashedBatchPredictionRequest bpr;
      reader.setKeepMap(&m_keep_map);
      reader.setBuffer(input_bytes);
      reader.setDecodeMode(m_decode_mode);
      bpr.decode(reader);

      resource->common = std::move(bpr.common());
      resource->records = std::move(bpr.requests());

      // Each datarecord has a copy of common features.
      // Initialize total_size by common_size * num_records
      int64 common_size = static_cast<int64>(resource->common.totalSize());
      int64 num_records = static_cast<int64>(resource->records.size());
      int64 total_size = common_size * num_records;
      for (const auto &record : resource->records) {
        total_size += static_cast<int64>(record.totalSize());
      }

      resource->total_size = total_size;
      resource->num_labels = 0;
      resource->num_weights = 0;
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

REGISTER_KERNEL_BUILDER(
    Name("DecodeAndHashBatchPredictionRequest").Device(DEVICE_CPU),
    DecodeAndHashBatchPredictionRequest);

REGISTER_OP("DecodeBatchPredictionRequest")
.Input("input_bytes: uint8")
.Attr("keep_features: list(int)")
.Attr("keep_codes: list(int)")
.Output("data_record_handle: resource")
.SetShapeFn(shape_inference::ScalarShape)
.Doc(R"doc(
A tensorflow OP that decodes a batch prediction request and creates a handle to the batch of data records.

Attr
  keep_features: a list of int ids to keep.
  keep_codes: their corresponding code.
  shared_name: name used by the resource handle inside the resource manager.
  container: name used by the container of the resources.

shared_name and container are required when inheriting from ResourceOpKernel.

Input
  input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest.

Outputs
  data_record_handle: A resource handle to the DataRecordResource containing the batch of DataRecords.
)doc");

class DecodeBatchPredictionRequest : public OpKernel {
 public:
  explicit DecodeBatchPredictionRequest(OpKernelConstruction* context)
    : OpKernel(context) {
    std::vector<int64> keep_features;
    std::vector<int64> keep_codes;

    OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features));
    OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes));

    OP_REQUIRES(context, keep_features.size() == keep_codes.size(),
                errors::InvalidArgument("keep keys and values must have same size."));

#ifdef USE_DENSE_HASH
    m_keep_map.set_empty_key(0);
#endif  // USE_DENSE_HASH

    for (uint64_t i = 0; i < keep_features.size(); i++) {
      m_keep_map[keep_features[i]] = keep_codes[i];
    }
  }

 private:
  twml::Map<int64_t, int64_t> m_keep_map;

  void Compute(OpKernelContext* context) override {
    try {
      DataRecordResource *resource = nullptr;
      OP_REQUIRES_OK(context, makeResourceHandle<DataRecordResource>(context, 0, &resource));

      // Store the input bytes in the resource so it isn't freed before the resource.
      // This is necessary because we are not copying the contents for tensors.
      resource->input = context->input(0);
      const uint8_t *input_bytes = resource->input.flat<uint8>().data();
      twml::DataRecordReader reader;
      twml::BatchPredictionRequest bpr;
      reader.setKeepMap(&m_keep_map);
      reader.setBuffer(input_bytes);
      bpr.decode(reader);

      resource->common = std::move(bpr.common());
      resource->records = std::move(bpr.requests());

      resource->num_weights = 0;
      resource->num_labels = 0;
      resource->keep_map = &m_keep_map;
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

REGISTER_KERNEL_BUILDER(
    Name("DecodeBatchPredictionRequest").Device(DEVICE_CPU),
    DecodeBatchPredictionRequest);
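To make the decode_mode attribute documented above concrete, a purely hypothetical sketch of the two feature-id schemes; MurmurHash3 and the name-combination rule here are invented stand-ins for whatever hashing twml's readers actually apply:

#include <cstdint>
#include <string>
#include "internal/murmur_hash3.h"

// Hypothetical illustration of the documented modes:
// mode 0: feature_id = hash(name); mode 1: feature_id = hash(feature_name, name).
static int64_t hashName(const std::string &s) {
  uint64_t out[2];
  MurmurHash3_x64_128(s.data(), (int)s.size(), 0, out);
  return (int64_t)out[0];
}

static int64_t featureId(int decode_mode, const std::string &feature_name,
                         const std::string &name) {
  if (decode_mode == 0) return hashName(name);
  return hashName(feature_name + '\0' + name);  // mode 1: combine both names
}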
BIN twml/libtwml/src/ops/batch_prediction_request.docx (Normal file, binary not shown)
@ -1,224 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <cstdint>
#include <twml.h>
#include "tensorflow_utils.h"
#include "resource_utils.h"

#include <iterator>

template<typename InputType, typename RecordType>
class DecodeBatchPredictionRequestKernel : public OpKernel {
 public:
  explicit DecodeBatchPredictionRequestKernel(OpKernelConstruction* context)
    : OpKernel(context) {
    std::vector<int64> keep_features;
    std::vector<int64> keep_codes;

    std::vector<int64> label_features;
    std::vector<int64> weight_features;

    OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features));
    OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes));

    OP_REQUIRES_OK(context, context->GetAttr("label_features", &label_features));
    OP_REQUIRES_OK(context, context->GetAttr("weight_features", &weight_features));
    OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode));

    OP_REQUIRES(context, keep_features.size() == keep_codes.size(),
                errors::InvalidArgument("keep keys and values must have same size."));

#ifdef USE_DENSE_HASH
    m_keep_map.set_empty_key(0);
    m_labels_map.set_empty_key(0);
    m_weights_map.set_empty_key(0);
#endif  // USE_DENSE_HASH

    for (uint64_t i = 0; i < keep_features.size(); i++) {
      m_keep_map[keep_features[i]] = keep_codes[i];
    }

    for (uint64_t i = 0; i < label_features.size(); i++) {
      m_labels_map[label_features[i]] = i;
    }

    for (uint64_t i = 0; i < weight_features.size(); i++) {
      m_weights_map[weight_features[i]] = i;
    }
  }

 protected:
  twml::Map<int64_t, int64_t> m_keep_map;
  twml::Map<int64_t, int64_t> m_labels_map;
  twml::Map<int64_t, int64_t> m_weights_map;
  int64 m_decode_mode;

  template<typename ResourceType>
  void Decode(OpKernelContext* context, ResourceType *resource) {
    resource->input = context->input(0);
    const uint8_t *input_bytes = getInputBytes<InputType>(resource->input, 0);
    int num_labels = static_cast<int>(m_labels_map.size());
    int num_weights = static_cast<int>(m_weights_map.size());

    typename RecordType::Reader reader;
    twml::GenericBatchPredictionRequest<RecordType> bpr(num_labels, num_weights);

    reader.setKeepMap(&m_keep_map);
    reader.setLabelsMap(&m_labels_map);
    reader.setBuffer(input_bytes);
    reader.setDecodeMode(m_decode_mode);
    // Do not set weight map if it is empty. This will take a faster path.
    if (num_weights != 0) {
      reader.setWeightsMap(&m_weights_map);
    }
    bpr.decode(reader);

    resource->common = std::move(bpr.common());
    resource->records = std::move(bpr.requests());

    resource->num_labels = num_labels;
    resource->num_weights = num_weights;
  }
};


REGISTER_OP("DecodeAndHashBatchPredictionRequestV2")
.Attr("InputType: {uint8, string}")
.Input("input_bytes: InputType")
.Attr("keep_features: list(int)")
.Attr("keep_codes: list(int)")
.Attr("label_features: list(int)")
.Attr("weight_features: list(int) = []")
.Attr("decode_mode: int = 0")
.Output("hashed_data_record_handle: resource")
.SetShapeFn(shape_inference::ScalarShape)
.Doc(R"doc(
A tensorflow OP that decodes a list/batch of data records and creates a handle to the batch of hashed data records.

Compared to DecodeAndHashBatchPredictionRequest, DecodeAndHashBatchPredictionRequestV2 is used for training instead
of serving. Thus label_features and weight_features (optional) must be passed, and labels and weights are extracted in
the output.
DecodeAndHashBatchPredictionRequestV2 controls what DataRecords we want to process together in a batch in training.
For instance, we can put all instances for a query in the same batch when training a ranking model.
Notice that this OP was added separately to make sure we would not break the API for DecodeAndHashBatchPredictionRequest.
Merging the two ops into a single .cpp file would require further discussion in a future API revision.

Attr
  keep_features: a list of int ids to keep.
  keep_codes: their corresponding code.
  label_features: list of feature ids representing the labels.
  weight_features: list of feature ids representing the weights. Defaults to empty list.
  decode_mode: integer, indicates which decoding method to use. Let a sparse continuous
  have a feature_name and a dict of {name: value}. 0 indicates feature_ids are computed
  as hash(name). 1 indicates feature_ids are computed as hash(feature_name, name)

Input
  input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest.

Outputs
  hashed_data_record_handle: A resource handle to the HashedDataRecordResource containing the batch of HashedDataRecords.
)doc");

template<typename InputType>
class DecodeAndHashBatchPredictionRequestV2 :
  public DecodeBatchPredictionRequestKernel<InputType, twml::HashedDataRecord> {

 public:
  DecodeAndHashBatchPredictionRequestV2(OpKernelConstruction *context)
    : DecodeBatchPredictionRequestKernel<InputType, twml::HashedDataRecord>(context) {
  }

 private:
  void Compute(OpKernelContext* context) override {
    try {
      HashedDataRecordResource *resource = nullptr;
      OP_REQUIRES_OK(
        context,
        makeResourceHandle<HashedDataRecordResource>(context, 0, &resource));

      this->Decode(context, resource);

      // Each datarecord has a copy of common features.
      // Initialize total_size by common_size * num_records
      int64 common_size = static_cast<int64>(resource->common.totalSize());
      int64 num_records = static_cast<int64>(resource->records.size());
      int64 total_size = common_size * num_records;
      for (const auto &record : resource->records) {
        total_size += static_cast<int64>(record.totalSize());
      }

      resource->total_size = total_size;
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

REGISTER_OP("DecodeBatchPredictionRequestV2")
.Attr("InputType: {uint8, string}")
.Input("input_bytes: InputType")
.Attr("keep_features: list(int)")
.Attr("keep_codes: list(int)")
.Attr("label_features: list(int)")
.Attr("weight_features: list(int) = []")
.Attr("decode_mode: int = 0")
.Output("data_record_handle: resource")
.SetShapeFn(shape_inference::ScalarShape)
.Doc(R"doc(
A tensorflow OP that decodes a batch prediction request and creates a handle to the batch of data records.

Attr
  keep_features: a list of int ids to keep.
  keep_codes: their corresponding code.
  shared_name: name used by the resource handle inside the resource manager.
  label_features: list of feature ids representing the labels.
  weight_features: list of feature ids representing the weights. Defaults to empty list.
  decode_mode: reserved, do not use.

Input
  input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest.

Outputs
  data_record_handle: A resource handle to the DataRecordResource containing the batch of DataRecords.
)doc");


template<typename InputType>
class DecodeBatchPredictionRequestV2 :
  public DecodeBatchPredictionRequestKernel<InputType, twml::DataRecord> {
 public:
  DecodeBatchPredictionRequestV2(OpKernelConstruction *context)
    : DecodeBatchPredictionRequestKernel<InputType, twml::DataRecord>(context) {
  }

 private:
  void Compute(OpKernelContext* context) override {
    try {
      DataRecordResource *resource = nullptr;
      OP_REQUIRES_OK(
        context,
        makeResourceHandle<DataRecordResource>(context, 0, &resource));
      this->Decode(context, resource);
      resource->keep_map = &(this->m_keep_map);
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

#define REGISTER_DECODE_OPS(InputType)                          \
  REGISTER_KERNEL_BUILDER(                                      \
      Name("DecodeAndHashBatchPredictionRequestV2")             \
      .Device(DEVICE_CPU)                                       \
      .TypeConstraint<InputType>("InputType"),                  \
      DecodeAndHashBatchPredictionRequestV2<InputType>);        \
  REGISTER_KERNEL_BUILDER(                                      \
      Name("DecodeBatchPredictionRequestV2")                    \
      .Device(DEVICE_CPU)                                       \
      .TypeConstraint<InputType>("InputType"),                  \
      DecodeBatchPredictionRequestV2<InputType>);               \

REGISTER_DECODE_OPS(uint8)
REGISTER_DECODE_OPS(string)
BIN twml/libtwml/src/ops/batch_prediction_request_v2.docx (Normal file, binary not shown)
@ -1,82 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"

using namespace tensorflow;

REGISTER_OP("BatchPredictionResponseWriter")
.Attr("T: {float, double}")
.Input("keys: int64")
.Input("values: T")
.Output("result: uint8")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
  return Status::OK();
}).Doc(R"doc(

A tensorflow OP that packages keys and values into a BatchPredictionResponse.

values: input feature values. (float/double)
keys: feature ids from the original BatchPredictionRequest. (int64)

Outputs
  result: output BatchPredictionResponse serialized using Thrift into a uint8 tensor.
)doc");

template<typename T>
class BatchPredictionResponseWriter : public OpKernel {
 public:
  explicit BatchPredictionResponseWriter(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& keys = context->input(0);
    const Tensor& values = context->input(1);

    try {
      // Ensure the inner dimensions match.
      if (values.dim_size(values.dims() - 1) != keys.dim_size(keys.dims() - 1)) {
        throw std::runtime_error("The sizes of keys and values need to match");
      }

      // Set inputs as twml::Tensor.
      const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys);
      const twml::Tensor in_values_ = TFTensor_to_twml_tensor(values);
      // No dense tensors in this op.
      const twml::Tensor dummy_dense_keys_;
      const std::vector<twml::RawTensor> dummy_dense_values_;

      // Construct the BatchPredictionResponse.
      twml::BatchPredictionResponse tempResult(
          in_keys_, in_values_, dummy_dense_keys_, dummy_dense_values_);

      // Determine the length of the result.
      int len = tempResult.encodedSize();
      TensorShape result_shape = {1, len};

      // Create an output tensor; its size is determined by the content of the input.
      Tensor* result = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, result_shape,
                                                       &result));
      twml::Tensor out_result = TFTensor_to_twml_tensor(*result);

      // Serialize the BatchPredictionResponse into the output tensor.
      tempResult.write(out_result);
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

#define REGISTER(Type)                            \
                                                  \
  REGISTER_KERNEL_BUILDER(                        \
      Name("BatchPredictionResponseWriter")       \
          .Device(DEVICE_CPU)                     \
          .TypeConstraint<Type>("T"),             \
      BatchPredictionResponseWriter<Type>);       \

REGISTER(float);
REGISTER(double);
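The writer above sizes its output with encodedSize() before serializing with write(). A minimal standalone sketch of that two-phase pattern, with FakeResponse standing in for twml::BatchPredictionResponse (only the encodedSize/write pairing mirrors the API used above):

#include <cstdint>
#include <cstring>
#include <vector>

// Stand-in for twml::BatchPredictionResponse: it can report its encoded
// size up front, then serialize into a caller-provided buffer.
struct FakeResponse {
  std::vector<uint8_t> payload;
  int encodedSize() const { return static_cast<int>(payload.size()); }
  void write(uint8_t* out) const {
    std::memcpy(out, payload.data(), payload.size());
  }
};

int main() {
  FakeResponse r{{1, 2, 3, 4}};
  std::vector<uint8_t> out(r.encodedSize());  // allocate exactly once
  r.write(out.data());                        // then serialize in place
  return 0;
}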
BIN twml/libtwml/src/ops/batch_prediction_response_writer.docx (binary file not shown)
@ -1,81 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"

using namespace tensorflow;

REGISTER_OP("BatchPredictionTensorResponseWriter")
.Attr("T: list({string, int32, int64, float, double})")
.Input("keys: int64")
.Input("values: T")
.Output("result: uint8")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
  return Status::OK();
}).Doc(R"doc(

A tensorflow OP that packages keys and dense tensors into a BatchPredictionResponse.

values: list of tensors
keys: feature ids from the original BatchPredictionRequest. (int64)

Outputs
  result: output BatchPredictionResponse serialized using Thrift into a uint8 tensor.
)doc");

class BatchPredictionTensorResponseWriter : public OpKernel {
 public:
  explicit BatchPredictionTensorResponseWriter(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& keys = context->input(0);

    try {
      // Set keys as twml::Tensor.
      const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys);

      // Check sizes.
      uint64_t num_keys = in_keys_.getNumElements();
      uint64_t num_values = context->num_inputs() - 1;

      OP_REQUIRES(context, num_values % num_keys == 0,
                  errors::InvalidArgument("Number of dense tensors is not a multiple of the number of dense keys"));

      // Set dense tensor values.
      std::vector<twml::RawTensor> in_values_;
      for (int i = 1; i < context->num_inputs(); i++) {
        in_values_.push_back(TFTensor_to_twml_raw_tensor(context->input(i)));
      }

      // No continuous predictions in this op, only tensors.
      const twml::Tensor dummy_cont_keys_;
      const twml::Tensor dummy_cont_values_;

      // Construct the BatchPredictionResponse.
      twml::BatchPredictionResponse tempResult(
          dummy_cont_keys_, dummy_cont_values_, in_keys_, in_values_);

      // Determine the length of the result.
      int len = tempResult.encodedSize();
      TensorShape result_shape = {1, len};

      // Create an output tensor; its size is determined by the content of the input.
      Tensor* result = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, result_shape,
                                                       &result));
      twml::Tensor out_result = TFTensor_to_twml_tensor(*result);

      // Serialize the BatchPredictionResponse into the output tensor.
      tempResult.write(out_result);
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

REGISTER_KERNEL_BUILDER(
    Name("BatchPredictionTensorResponseWriter").Device(DEVICE_CPU),
    BatchPredictionTensorResponseWriter);
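The num_values % num_keys == 0 check above allows several dense tensors per key; a tiny sketch of the grouping that the check implies (how consumers pair keys with tensors is an assumption here):

#include <cstdio>

int main() {
  const int num_keys = 2;
  const int num_values = 6;  // six tensor inputs after the keys input
  if (num_values % num_keys == 0) {
    const int tensors_per_key = num_values / num_keys;
    std::printf("%d tensors per key\n", tensors_per_key);  // prints: 3
  }
  return 0;
}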
@ -1,330 +0,0 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// TWML modified to optimize binary features:
//  - Sparse tensor values are assumed to be binary, so only an add operation
//    is done rather than a mul-add;
//  - An in-house version of vectorization is used instead of Eigen;
//  - Sharding and multithreading are enabled.

#define EIGEN_USE_THREADS

#include "binary_sparse_dense_matmul.h"
#include "binary_sparse_dense_matmul_impl.h"

#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/shape_inference.h"

namespace tensorflow {

namespace shape_inference {
// TODO: The `a_values` are supposed to be all ones.
// Users should not call this op directly but use it from the `sparse_op` python library.
// To keep it consistent with the original op, the signature currently remains the same;
// we will think of a better way to constrain correct use of this op.
// CX-18174
REGISTER_OP("BinarySparseTensorDenseMatMul")
    .Input("a_indices: Tindices")
    .Input("a_values: T")
    .Input("a_shape: int64")
    .Input("b: T")
    .Output("product: T")
    .Attr("T: type")
    .Attr("Tindices: {int32,int64} = DT_INT64")
    .Attr("adjoint_a: bool = false")
    .Attr("adjoint_b: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      DimensionHandle unused_dim;
      ShapeHandle unused;
      ShapeHandle b;
      ShapeHandle a_shape;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));  // a_indices
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));  // a_values
      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &a_shape));
      TF_RETURN_IF_ERROR(c->WithRank(a_shape, 2, &a_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &b));

      bool adjoint_a;
      bool adjoint_b;
      TF_RETURN_IF_ERROR(c->GetAttr("adjoint_a", &adjoint_a));
      TF_RETURN_IF_ERROR(c->GetAttr("adjoint_b", &adjoint_b));

      DimensionHandle output_right = c->Dim(b, adjoint_b ? 0 : 1);
      DimensionHandle output_left = c->Dim(a_shape, adjoint_a ? 1 : 0);
      DimensionHandle inner_left = c->Dim(a_shape, adjoint_a ? 0 : 1);
      DimensionHandle inner_right = c->Dim(b, adjoint_b ? 1 : 0);
      TF_RETURN_IF_ERROR(c->Merge(inner_left, inner_right, &unused_dim));
      c->set_output(0, c->Matrix(output_left, output_right));
      return Status::OK();
    });
}  // namespace shape_inference

typedef Eigen::ThreadPoolDevice CPUDevice;

template <typename Device, typename T, typename Tindices>
class BinarySparseTensorDenseMatMulOp : public OpKernel {
 public:
  explicit BinarySparseTensorDenseMatMulOp(OpKernelConstruction* ctx)
      : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint_a", &adjoint_a_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint_b", &adjoint_b_));
  }

  void Compute(OpKernelContext* ctx) override {
    const Tensor* a_indices;
    const Tensor* a_values;
    const Tensor* a_shape;
    const Tensor* b;
    OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices));
    OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values));
    OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape));
    OP_REQUIRES_OK(ctx, ctx->input("b", &b));

    // Check that the dimensions of the two matrices are valid.
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b->shape()),
                errors::InvalidArgument("Tensor 'b' is not a matrix"));

    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()),
                errors::InvalidArgument("Tensor 'a_shape' is not a vector"));

    OP_REQUIRES(
        ctx, a_shape->NumElements() == 2,
        errors::InvalidArgument("Tensor 'a_shape' must have 2 elements"));

    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values->shape()),
                errors::InvalidArgument("Tensor 'a_values' is not a vector"));

    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()),
                errors::InvalidArgument("Tensor 'a_indices' is not a matrix"));

    const int64 nnz = a_indices->shape().dim_size(0);
    OP_REQUIRES(ctx, nnz == a_values->NumElements(),
                errors::InvalidArgument("Number of rows of a_indices does not "
                                        "match number of entries in a_values"));

    OP_REQUIRES(
        ctx, a_indices->shape().dim_size(1) == a_shape->NumElements(),
        errors::InvalidArgument("Number of columns of a_indices does not match "
                                "number of entries in a_shape"));

    auto a_shape_t = a_shape->vec<int64>();
    const int64 outer_left = (adjoint_a_) ? a_shape_t(1) : a_shape_t(0);
    const int64 outer_right =
        (adjoint_b_) ? b->shape().dim_size(0) : b->shape().dim_size(1);
    const int64 inner_left = (adjoint_a_) ? a_shape_t(0) : a_shape_t(1);
    const int64 inner_right =
        (adjoint_b_) ? b->shape().dim_size(1) : b->shape().dim_size(0);

    OP_REQUIRES(
        ctx, inner_right == inner_left,
        errors::InvalidArgument(
            "Cannot multiply A and B because inner dimension does not match: ",
            inner_left, " vs. ", inner_right,
            ". Did you forget a transpose? "
            "Dimensions of A: [",
            a_shape_t(0), ", ", a_shape_t(1),
            "). Dimensions of B: ", b->shape().DebugString()));

    TensorShape out_shape({outer_left, outer_right});
    Tensor* out = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));

    if (out->NumElements() == 0) {
      // If a has shape [0, x] or b has shape [x, 0], the output shape
      // is a 0-element matrix, so there is nothing to do.
      return;
    }

    if (a_values->NumElements() == 0 || b->NumElements() == 0) {
      // If a has shape [x, 0] and b has shape [0, y], the
      // output shape is [x, y] where x and y are non-zero, so we fill
      // the output with zeros.
      out->flat<T>().device(ctx->eigen_device<Device>()) =
          out->flat<T>().constant(T(0));
      return;
    }

#define MAYBE_ADJOINT(ADJ_A, ADJ_B)                                   \
  if (adjoint_a_ == ADJ_A && adjoint_b_ == ADJ_B) {                   \
    Status functor_status = functor::SparseTensorDenseMatMulFunctor<  \
        Device, T, Tindices, ADJ_A,                                   \
        ADJ_B>::Compute(ctx, a_indices, a_values, a_shape, b, out);   \
    OP_REQUIRES_OK(ctx, functor_status);                              \
  }

    MAYBE_ADJOINT(false, false);
    MAYBE_ADJOINT(false, true);
    MAYBE_ADJOINT(true, false);
    MAYBE_ADJOINT(true, true);

#undef MAYBE_ADJOINT
  }

 private:
  bool adjoint_a_;
  bool adjoint_b_;
};

#define REGISTER_CPU(TypeT, TypeIndex)           \
  REGISTER_KERNEL_BUILDER(                       \
      Name("BinarySparseTensorDenseMatMul")      \
          .Device(DEVICE_CPU)                    \
          .TypeConstraint<TypeT>("T")            \
          .TypeConstraint<TypeIndex>("Tindices") \
          .HostMemory("a_shape"),                \
      BinarySparseTensorDenseMatMulOp<CPUDevice, TypeT, TypeIndex>);

#define REGISTER_KERNELS_CPU(T) \
  REGISTER_CPU(T, int64);       \
  REGISTER_CPU(T, int32)

REGISTER_KERNELS_CPU(float);
REGISTER_KERNELS_CPU(double);
REGISTER_KERNELS_CPU(int32);
REGISTER_KERNELS_CPU(complex64);
REGISTER_KERNELS_CPU(complex128);

namespace functor {

namespace {
Status KOutOfBoundsError(int64 k, std::size_t i, int rhs_index_a,
                         std::size_t lhs_right) {
  return errors::InvalidArgument("k (", k, ") from index[", i, ",", rhs_index_a,
                                 "] out of bounds (>=", lhs_right, ")");
}

Status MOutOfBoundsError(int64 m, std::size_t i, int lhs_index_a,
                         int64 out_dim0) {
  return errors::InvalidArgument("m (", m, ") from index[", i, ",", lhs_index_a,
                                 "] out of bounds (>=", out_dim0, ")");
}

}  // namespace

// The general functor just borrows the code from tf, except that an add is
// computed instead of a mul-add.
template <typename T, typename Tindices, bool ADJ_A, bool ADJ_B>
struct SparseTensorDenseMatMulFunctor<CPUDevice, T, Tindices, ADJ_A, ADJ_B> {
  // Vectorize certain operations above this size.
  static const std::size_t kNumVectorize = 32;

  static Status Compute(OpKernelContext* ctx,
                        const Tensor *a_indices,
                        const Tensor *a_values,
                        const Tensor *a_shape,
                        const Tensor *b,
                        Tensor *out) {
    return EigenCompute(ctx->eigen_device<CPUDevice>(), out->matrix<T>(),
                        a_indices->matrix<Tindices>(), a_values->vec<T>(),
                        b->matrix<T>());
  }

  static Status EigenCompute(const CPUDevice& d, typename TTypes<T>::Matrix out,
                             typename TTypes<Tindices>::ConstMatrix a_indices,
                             typename TTypes<T>::ConstVec a_values,
                             typename TTypes<T>::ConstMatrix b) {
    const std::size_t nnz = a_values.size();
    const std::size_t rhs_right = (ADJ_B ? b.dimension(0) : b.dimension(1));
    const std::size_t lhs_right = (ADJ_B ? b.dimension(1) : b.dimension(0));
    const int lhs_index_a = ADJ_A ? 1 : 0;
    const int rhs_index_a = ADJ_A ? 0 : 1;

    out.setZero();

    if (rhs_right < kNumVectorize) {
      // Disable vectorization if the RHS of the output is too small.
      auto maybe_adjoint_b = MaybeAdjoint<decltype(b), ADJ_B>(b);

      for (std::size_t i = 0; i < nnz; ++i) {
        const Tindices m = internal::SubtleMustCopy(a_indices(i, lhs_index_a));
        const Tindices k = internal::SubtleMustCopy(a_indices(i, rhs_index_a));
        if (!FastBoundsCheck(k, lhs_right)) {
          return KOutOfBoundsError(k, i, rhs_index_a, lhs_right);
        }
        if (!FastBoundsCheck(m, out.dimension(0))) {
          return MOutOfBoundsError(m, i, lhs_index_a, out.dimension(0));
        }
        for (std::size_t n = 0; n < rhs_right; ++n) {
          const T b_value = maybe_adjoint_b(k, n);
          out(m, n) += b_value;
        }
      }
    } else {
      // Vectorization via Eigen.
      const int b_chip_index = ADJ_B ? 1 : 0;

#define LOOP_NNZ(b_passed)                                                  \
  for (std::size_t i = 0; i < nnz; ++i) {                                   \
    const Tindices m = internal::SubtleMustCopy(a_indices(i, lhs_index_a)); \
    const Tindices k = internal::SubtleMustCopy(a_indices(i, rhs_index_a)); \
    if (!FastBoundsCheck(k, lhs_right)) {                                   \
      return KOutOfBoundsError(k, i, rhs_index_a, lhs_right);               \
    }                                                                       \
    if (!FastBoundsCheck(m, out.dimension(0))) {                            \
      return MOutOfBoundsError(m, i, lhs_index_a, out.dimension(0));        \
    }                                                                       \
    out.template chip<0>(m) += b_passed.template chip<b_chip_index>(k);     \
  }

      if (ADJ_B) {
        // Perform transpose and conjugation on B once, since we chip out B's
        // columns in the nnz loop.
        Eigen::array<int, 2> shuffle;  // preserve dimension order
        shuffle[0] = 1; shuffle[1] = 0;
        Eigen::Tensor<T, 2, Eigen::ColMajor> col_major_conj_b =
            b.swap_layout().shuffle(shuffle).conjugate();
        LOOP_NNZ(col_major_conj_b);
      } else {
        LOOP_NNZ(b);
      }
#undef LOOP_NNZ
    }
    return Status::OK();
  }
};

// We have only specialized and optimized the case with no matrix transpose,
// since it is the most typical usage in production.
template <typename Tindices>
struct SparseTensorDenseMatMulFunctor<CPUDevice,
                                      float, Tindices, false, false> {
  static Status Compute(OpKernelContext* ctx,
                        const Tensor *a_indices,
                        const Tensor *a_values,
                        const Tensor *a_shape,
                        const Tensor *b,
                        Tensor *out) {
    auto a_indices_ptr = a_indices->flat<Tindices>().data();
    auto b_ptr = b->flat<float>().data();
    auto out_ptr = out->flat<float>().data();
    const int64 nnz = a_indices->shape().dim_size(0);
    const int64 outer_left = a_shape->vec<int64>()(0);
    const int64 outer_right = b->shape().dim_size(1);
    ParallelLookupAndSegmentSum<Tindices>(ctx, a_indices_ptr, b_ptr, nnz,
                                          outer_left, outer_right, out_ptr);
    return Status::OK();
  }
};

}  // namespace functor

}  // namespace tensorflow
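For reference, a minimal plain-array sketch of the optimization this kernel implements (assuming row-major inputs and no adjoints): since every a_value is assumed to be 1, each nonzero (m, k) adds row b[k, :] into out[m, :] with no multiply:

#include <cstddef>
#include <vector>

// out[M x N] += sum over nonzeros (m, k) of b[k, :], since a_values are all 1.
// a_indices holds nnz (row, col) pairs flattened as in the kernel above.
void BinarySparseDenseMatMul(const std::vector<long long>& a_indices,  // 2*nnz
                             const std::vector<float>& b,              // K x N
                             std::size_t N,
                             std::vector<float>* out) {                // M x N, zeroed
  const std::size_t nnz = a_indices.size() / 2;
  for (std::size_t i = 0; i < nnz; ++i) {
    const long long m = a_indices[2 * i];
    const long long k = a_indices[2 * i + 1];
    for (std::size_t n = 0; n < N; ++n) {
      (*out)[m * N + n] += b[k * N + n];  // add only: no multiply by a_value
    }
  }
}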
BIN twml/libtwml/src/ops/binary_sparse_dense_matmul.docx (binary file not shown)
@ -1,75 +0,0 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// TWML modified to optimize binary features
#ifndef TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_OP_H_
#define TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_OP_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"

namespace tensorflow {

namespace functor {

template <typename Device, typename T, typename Tindices, bool ADJ_A,
          bool ADJ_B>
struct SparseTensorDenseMatMulFunctor {
  static EIGEN_ALWAYS_INLINE Status Compute(
      const Device& d, typename TTypes<T>::Matrix out,
      typename TTypes<Tindices>::ConstMatrix a_indices,
      typename TTypes<T>::ConstVec a_values, typename TTypes<T>::ConstMatrix b);
};

template <typename MATRIX, bool ADJ>
class MaybeAdjoint;

template <typename MATRIX>
class MaybeAdjoint<MATRIX, false> {
 public:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaybeAdjoint(MATRIX m) : m_(m) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename MATRIX::Scalar operator()(
      const typename MATRIX::Index i, const typename MATRIX::Index j) const {
    return m_(i, j);
  }

 private:
  const MATRIX m_;
};

template <typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T MaybeConj(T v) {
  return v;
}

template <typename MATRIX>
class MaybeAdjoint<MATRIX, true> {
 public:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaybeAdjoint(MATRIX m) : m_(m) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename MATRIX::Scalar operator()(
      const typename MATRIX::Index i, const typename MATRIX::Index j) const {
    return Eigen::numext::conj(m_(j, i));
  }

 private:
  const MATRIX m_;
};

}  // end namespace functor
}  // end namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_OP_H_
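A small standalone sketch of what the two MaybeAdjoint specializations provide, using lambdas over a plain array instead of an Eigen matrix type: the adjoint case reads conj(m(j, i)), which for real scalars is just the transpose:

#include <cassert>

int main() {
  // 2x3 row-major matrix stand-in.
  const float m[2][3] = {{1, 2, 3}, {4, 5, 6}};

  // MaybeAdjoint<..., false>(i, j) reads m(i, j);
  // MaybeAdjoint<..., true>(i, j) reads conj(m(j, i)) -- for float, conj is
  // the identity, so this is plain transposed access. The same (i, j) call
  // site works either way.
  auto plain   = [&](int i, int j) { return m[i][j]; };
  auto adjoint = [&](int i, int j) { return m[j][i]; };

  assert(plain(0, 2) == 3.0f);
  assert(adjoint(2, 0) == 3.0f);  // element (0, 2) seen through the adjoint
  return 0;
}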
BIN twml/libtwml/src/ops/binary_sparse_dense_matmul_impl.docx (binary file not shown)
@ -1,145 +0,0 @@
#ifndef TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_IMPL_H_
#define TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_IMPL_H_

#include <atomic>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/blocking_counter.h"
#include "tensorflow/core/lib/core/threadpool.h"

namespace tensorflow {
namespace functor {

// `ConservativeShard` is adopted rather than tensorflow's `Shard` because the
// original `Shard` may generate more shards than the number of threads, which
// is not ideal for this case, as it may cause too much overhead.
static void ConservativeShard(int max_parallelism, thread::ThreadPool *workers,
                              int64 total, int64 cost_per_unit,
                              std::function<void(int64, int64)> work) {
  if (total == 0) {
    return;
  }
  max_parallelism = std::min(max_parallelism, workers->NumThreads());
  if (max_parallelism <= 1) {
    // Just inline the whole work since we only have 1 thread (core).
    work(0, total);
    return;
  }
  cost_per_unit = std::max(1LL, cost_per_unit);
  // We shard [0, total) into "num_shards" shards.
  //   1 <= num_shards <= num worker threads
  //
  // If total * cost_per_unit is small, it is not worth sharding too
  // much. Let us assume each cost unit is 1ns; kMinCostPerShard=10000
  // is 10us.
  static const int64 kMinCostPerShard = 10000;
  const int num_shards =
      std::max<int>(1, std::min(static_cast<int64>(max_parallelism),
                                total * cost_per_unit / kMinCostPerShard));

  // Each shard contains up to "block_size" units. [0, total) is sharded
  // into:
  //   [0, block_size), [block_size, 2*block_size), ...
  // The 1st shard is done by the caller thread and the other shards
  // are dispatched to the worker threads. The last shard may be smaller than
  // block_size.
  const int64 block_size = (total + num_shards - 1) / num_shards;
  if (block_size >= total) {
    work(0, total);
    return;
  }
  const int num_shards_used = (total + block_size - 1) / block_size;
  BlockingCounter counter(num_shards_used - 1);
  for (int64 start = block_size; start < total; start += block_size) {
    auto limit = std::min(start + block_size, total);
    workers->Schedule([&work, &counter, start, limit]() {
      work(start, limit);        // Compute the shard.
      counter.DecrementCount();  // The shard is done.
    });
  }

  // Inline execute the 1st shard.
  work(0, std::min(block_size, total));
  counter.Wait();
}

static inline void VectorSum(float *a, const float *b, int n) {
  for (int i = 0; i < n; ++i) {
    a[i] += b[i];
  }
}

// This function vectorizes the computation of the segment sum.
template<typename Tindices>
static void LookupAndSegmentSum(const Tindices *a_indices, const float *b,
                                int nnz, int outer_right, float *output) {
  for (int i = 0; i < nnz; ++i) {
    const Tindices m = a_indices[i * 2];
    const Tindices k = a_indices[i * 2 + 1];
    auto output_row_m = output + m * outer_right;
    auto b_row_k = b + k * outer_right;
    VectorSum(output_row_m, b_row_k, outer_right);
  }
}

// This function enables sharding and multithreading. It comes with the
// overhead of duplicating the output buffer to achieve lock-free output,
// so there should not be too many threads.
template<typename Tindices>
static void ParallelLookupAndSegmentSum(OpKernelContext *ctx,
                                        const Tindices *a_indices,
                                        const float *b, int nnz, int outer_left,
                                        int outer_right, float *output) {
  auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
  int out_size = outer_left * outer_right;
  if (worker_threads.num_threads <= 1) {
    memset(output, 0, out_size * sizeof(float));
    LookupAndSegmentSum<Tindices>(a_indices, b,
                                  nnz, outer_right,
                                  output);
    return;
  }

  // This is to make the buffer align with kAllocatorAlignment.
  int padded_out_size = (out_size + (Allocator::kAllocatorAlignment - 1)) &
                        ~(Allocator::kAllocatorAlignment - 1);
  std::size_t num_bytes =
      (worker_threads.num_threads - 1) * padded_out_size * sizeof(float);
  // Use a custom deleter so the aligned allocation is released with
  // port::AlignedFree; a plain std::unique_ptr<float> would call delete,
  // which does not match port::AlignedMalloc.
  std::unique_ptr<float, void (*)(void *)> buffer(
      reinterpret_cast<float *>(
          port::AlignedMalloc(num_bytes, Allocator::kAllocatorAlignment)),
      port::AlignedFree);
  float *temp_out = buffer.get();

  std::atomic<int> thread_index(0);

  auto task = [&](int64 start, int64 limit) {
    int local_thread_index = thread_index++;
    float *buf_ptr = nullptr;
    if (local_thread_index == 0) {
      buf_ptr = output;
    } else {
      buf_ptr = temp_out + (local_thread_index - 1) * padded_out_size;
    }
    memset(buf_ptr, 0, out_size * sizeof(float));

    LookupAndSegmentSum<Tindices>(a_indices + start * 2, b,
                                  limit - start, outer_right,
                                  buf_ptr);
  };

  int cost_per_unit = outer_right;

  // We don't use the tensorflow shard function, as tf may create more shards
  // than the number of threads.
  ConservativeShard(worker_threads.num_threads, worker_threads.workers, nnz,
                    static_cast<int64>(cost_per_unit), task);

  for (int i = 1; i < thread_index; ++i) {
    VectorSum(output, temp_out + (i - 1) * padded_out_size, out_size);
  }
}

}  // namespace functor

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_IMPL_H_
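A worked sketch of the shard-count arithmetic in ConservativeShard, with illustrative numbers: 8 threads, total = 1000 units at cost_per_unit = 100 gives total * cost / kMinCostPerShard = 10, so num_shards is capped at 8 and block_size = 125:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t kMinCostPerShard = 10000;  // same constant as above
  const int max_parallelism = 8;
  const int64_t total = 1000;
  const int64_t cost_per_unit = 100;

  const int num_shards = std::max<int>(
      1, std::min<int64_t>(max_parallelism,
                           total * cost_per_unit / kMinCostPerShard));
  const int64_t block_size = (total + num_shards - 1) / num_shards;
  // Prints: num_shards=8 block_size=125
  std::printf("num_shards=%d block_size=%lld\n", num_shards,
              static_cast<long long>(block_size));
  return 0;
}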
@ -1,243 +0,0 @@
#include "block_format_reader.h"

#include "tensorflow/core/framework/dataset.h"
#include "tensorflow/core/framework/partial_tensor_shape.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/io/random_inputstream.h"

#if !defined(DISABLE_ZLIB)
#include "tensorflow/core/lib/io/zlib_inputstream.h"
#endif

#include <twml.h>

#include <cstdio>
#include <algorithm>
#include <iterator>

using namespace tensorflow;

inline std::string stripPath(std::string const &file_name) {
  const auto pos = file_name.find_last_of("/");
  if (pos == std::string::npos) return file_name;
  return file_name.substr(pos + 1);
}

inline std::string getExtension(std::string const &file_name) {
  const auto stripped_file_name = stripPath(file_name);
  const auto pos = stripPath(stripped_file_name).find_last_of(".");
  if (pos == std::string::npos) return "";
  return stripped_file_name.substr(pos + 1);
}

REGISTER_OP("BlockFormatDatasetV2")
.Input("filenames: string")
.Input("compression_type: string")
.Input("buffer_size: int64")
.Output("handle: variant")
.SetIsStateful()
.SetShapeFn(shape_inference::ScalarShape)
.Doc(R"doc(

Creates a dataset for streaming BlockFormat data in compressed (e.g. gzip) or uncompressed form.
This op can also stream a dataset containing files in a mix of the formats mentioned above.

filenames: A scalar or vector containing the name(s) of the file(s) to be read.
compression_type: A scalar string denoting the compression type. Can be 'none', 'zlib', 'auto'.
buffer_size: A scalar denoting the buffer size to use during decompression.

Outputs
handle: A handle to the dataset. This handle is later used to create an iterator to stream the data from the dataset.

)doc");

class BlockFormatDatasetV2 : public DatasetOpKernel {
 public:
  using DatasetOpKernel::DatasetOpKernel;

  void MakeDataset(OpKernelContext* ctx, DatasetBase **output) override {
    const Tensor* filenames_tensor;
    OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor));
    OP_REQUIRES(
        ctx, filenames_tensor->dims() <= 1,
        errors::InvalidArgument("`filenames` must be a scalar or a vector."));

    const auto filenames_flat = filenames_tensor->flat<string>();
    const int64 num_files = filenames_tensor->NumElements();
    std::vector<string> filenames;
    filenames.reserve(num_files);
    std::copy(filenames_flat.data(),
              filenames_flat.data() + num_files,
              std::back_inserter(filenames));

    string compression_type;
    OP_REQUIRES_OK(
        ctx, tensorflow::data::ParseScalarArgument<string>(
            ctx, "compression_type", &compression_type));

    int64 buffer_size = -1;
    OP_REQUIRES_OK(
        ctx, tensorflow::data::ParseScalarArgument<int64>(
            ctx, "buffer_size", &buffer_size));

    OP_REQUIRES(ctx, buffer_size >= 0,
                errors::InvalidArgument(
                    "`buffer_size` must be >= 0 (0 == no buffering)"));

    OP_REQUIRES(ctx,
                compression_type == "auto" ||
                compression_type == "gz" ||
                compression_type == "",
                errors::InvalidArgument("Unknown extension: ", compression_type));

    *output = new Dataset(ctx, std::move(filenames), compression_type, buffer_size);
  }

 private:
  class Dataset : public DatasetBase {
   public:
    Dataset(OpKernelContext* ctx,
            std::vector<string> filenames,
            std::string compression_type,
            int64 buffer_size)
        : DatasetBase(DatasetContext(ctx)),
          compression_type_(compression_type),
          buffer_size_(buffer_size),
          filenames_(std::move(filenames))
    {}

    const DataTypeVector& output_dtypes() const override {
      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
      return *dtypes;
    }

    const std::vector<PartialTensorShape>& output_shapes() const override {
      static std::vector<PartialTensorShape>* shapes =
          new std::vector<PartialTensorShape>({{}});
      return *shapes;
    }

    string DebugString() const override { return "BlockFormatDatasetV2::Dataset"; }

   protected:
    Status AsGraphDefInternal(SerializationContext* ctx,
                              DatasetGraphDefBuilder* b,
                              Node** output) const override {
      Node* filenames = nullptr;
      Node* compression_type = nullptr;
      Node* buffer_size = nullptr;
      TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames));
      TF_RETURN_IF_ERROR(b->AddScalar(compression_type_, &compression_type));
      TF_RETURN_IF_ERROR(
          b->AddScalar(buffer_size_, &buffer_size));
      TF_RETURN_IF_ERROR(b->AddDataset(
          this, {filenames, compression_type, buffer_size}, output));
      return Status::OK();
    }

   private:
    std::unique_ptr<IteratorBase> MakeIteratorInternal(
        const string& prefix) const override {
      return std::unique_ptr<IteratorBase>(
          new Iterator({this, strings::StrCat(prefix, "::BlockFormat")}));
    }

    class Iterator : public DatasetIterator<Dataset> {
     public:
      explicit Iterator(const Params &params)
          : DatasetIterator<Dataset>(params) {}

      Status GetNextInternal(IteratorContext* ctx,
                             std::vector<Tensor>* out_tensors,
                             bool* end_of_sequence) override {
        mutex_lock l(mu_);
        do {
          // We are currently processing a file, so try to read the next record.
          if (reader_) {
            Tensor result_tensor(cpu_allocator(), DT_STRING, {});
            Status s = reader_->ReadNext(&result_tensor.scalar<string>()());
            if (s.ok()) {
              out_tensors->emplace_back(std::move(result_tensor));
              *end_of_sequence = false;
              return Status::OK();
            } else if (!errors::IsOutOfRange(s)) {
              return s;
            }

            // We have reached the end of the current file, so maybe
            // move on to the next file.
            reader_.reset();
            ++current_file_index_;
          }

          // Iteration ends when there are no more files to process.
          if (current_file_index_ == dataset()->filenames_.size()) {
            *end_of_sequence = true;
            return Status::OK();
          }

          // Actually move on to the next file.
          const string& next_filename =
              dataset()->filenames_[current_file_index_];

          auto compression_type = dataset()->compression_type_;
          int64 buffer_size = dataset()->buffer_size_;

          if (compression_type == "auto") {
            compression_type = getExtension(next_filename);
          }

          if (compression_type != "gz" && compression_type != "") {
            return errors::InvalidArgument("Unknown extension: ", compression_type);
          }

          tensorflow::Env* env = tensorflow::Env::Default();
          TF_CHECK_OK(env->NewRandomAccessFile(next_filename, &file_));

          // RandomAccessInputStream defaults its second parameter to "false".
          // That second parameter is the key issue: "false" assumes ownership
          // of the file is elsewhere. Making it "true" causes segfaults down
          // the line, so keep the ownership of "file_" in this class and
          // clean up properly.
          file_stream_.reset(new tensorflow::io::RandomAccessInputStream(file_.get(), false));

          if (compression_type == "gz") {
            // unpack_stream does not take ownership of file_stream_.
#if !defined(DISABLE_ZLIB)
            unpack_stream_.reset(new tensorflow::io::ZlibInputStream(
                file_stream_.get(),
                buffer_size,
                buffer_size,
                tensorflow::io::ZlibCompressionOptions::GZIP()));
            reader_.reset(new BlockFormatReader(unpack_stream_.get()));
#else
            return errors::InvalidArgument("libtwml compiled without zlib support");
#endif
          } else {
            unpack_stream_.reset(nullptr);
            reader_.reset(new BlockFormatReader(file_stream_.get()));
          }
        } while (true);
      }

     private:
      mutex mu_;
      uint64_t current_file_index_ GUARDED_BY(mu_) = 0;
      std::unique_ptr<tensorflow::RandomAccessFile> file_;
      std::unique_ptr<tensorflow::io::InputStreamInterface> file_stream_;
      std::unique_ptr<tensorflow::io::InputStreamInterface> unpack_stream_;
      std::unique_ptr<BlockFormatReader> reader_ GUARDED_BY(mu_);
    };

    const std::string compression_type_;
    const int64 buffer_size_;
    const std::vector<string> filenames_;
  };
};

REGISTER_KERNEL_BUILDER(
    Name("BlockFormatDatasetV2")
        .Device(DEVICE_CPU),
    BlockFormatDatasetV2);
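A small standalone sketch of the 'auto' compression resolution performed in GetNextInternal, using copies of the stripPath/getExtension helpers above:

#include <cassert>
#include <string>

static std::string stripPath(const std::string& f) {
  const auto pos = f.find_last_of('/');
  return pos == std::string::npos ? f : f.substr(pos + 1);
}

static std::string getExtension(const std::string& f) {
  const auto name = stripPath(f);
  const auto pos = name.find_last_of('.');
  return pos == std::string::npos ? "" : name.substr(pos + 1);
}

int main() {
  // With compression_type == "auto", the file extension decides the codec:
  assert(getExtension("/data/part-00000.gz") == "gz");  // -> ZlibInputStream
  assert(getExtension("/data/part-00000") == "");       // -> raw stream
  return 0;
}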
BIN twml/libtwml/src/ops/block_format_dataset.docx (binary file not shown)
BIN twml/libtwml/src/ops/block_format_reader.docx (binary file not shown)
@ -1,50 +0,0 @@
#pragma once

#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/lib/io/random_inputstream.h"

#include <twml.h>

#include <string>

using tensorflow::int64;
using tensorflow::Status;
using std::string;

class BlockFormatReader : twml::BlockFormatReader {
 public:
  explicit BlockFormatReader(tensorflow::io::InputStreamInterface *stream)
      : twml::BlockFormatReader(), stream_(stream) {
  }

  // Read the next record.
  // Returns OK on success, OUT_OF_RANGE at end of file,
  // or something else on error.
  Status ReadNext(string* record) {
    if (this->next()) {
      return stream_->ReadNBytes(this->current_size(), record);
    }
    return tensorflow::errors::OutOfRange("eof");
  }

  uint64_t read_bytes(void *dest, int size, int count) {
    uint64_t bytesToRead = size * count;
    std::string current;
    // TODO: Try to merge ReadNBytes and the memcpy below;
    // ReadNBytes already performs a memory copy.
    Status status = stream_->ReadNBytes(bytesToRead, &current);
    if (!status.ok()) {
      return 0;
    }
    memcpy(dest, current.c_str(), bytesToRead);
    return count;
  }

 private:
  tensorflow::io::InputStreamInterface *stream_;
  TF_DISALLOW_COPY_AND_ASSIGN(BlockFormatReader);
};
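A minimal sketch of how a caller can drive this reader under the OK / OUT_OF_RANGE contract documented above (the consume callback is an assumption of this sketch):

#include <string>
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"

// Drain a BlockFormatReader: OK yields a record, OUT_OF_RANGE is a clean
// end of file, anything else is a real error to propagate.
template <typename Reader, typename Consume>
tensorflow::Status DrainReader(Reader* reader, Consume consume) {
  std::string record;
  while (true) {
    tensorflow::Status s = reader->ReadNext(&record);
    if (s.ok()) {
      consume(record);
      continue;
    }
    if (tensorflow::errors::IsOutOfRange(s)) {
      return tensorflow::Status::OK();  // eof reached cleanly
    }
    return s;  // propagate the error
  }
}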
@ -1,138 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <algorithm>  // std::fill_n

using namespace tensorflow;

REGISTER_OP("CompressSampleIds")
.Attr("T: {int32}")
.Input("input: T")
.Output("output: T")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
  c->set_output(0, c->Vector(c->kUnknownDim));
  return Status::OK();
});

template<typename T>
class CompressSampleIds : public OpKernel {
 public:
  explicit CompressSampleIds(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // Grab the input tensor.
    const Tensor& input_tensor = context->input(0);
    auto input = input_tensor.flat<T>();
    const int N = input.size();

    // Check for improper input.
    bool error = (N > 0 && input(0) < 0);
    for (int i = 1; !error && i < N; i++) {
      error = input(i - 1) > input(i);
    }

    OP_REQUIRES(
        context, !error,
        errors::InvalidArgument(
            "Error in CompressSampleIds. SampleIds must be non-negative and non-decreasing"
        )
    );

    // Choose the output size: either last input element + 1, or 0.
    int output_size = 0;
    if (N > 0) {
      output_size = input(N - 1) + 1;
    }

    // Create an output tensor.
    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(
        context,
        context->allocate_output(0, TensorShape({output_size}), &output_tensor)
    );
    auto output_flat = output_tensor->flat<T>();

    // Zero-initialize the output.
    for (int i = 0; i < output_size; i++) {
      output_flat(i) = 0;
    }

    // Count the occurrences of each input element.
    for (int i = 0; i < N; i++) {
      output_flat(input(i))++;
    }
  }
};

REGISTER_OP("DecompressSampleIds")
.Attr("T: {int32}")
.Input("input: T")
.Output("output: T")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
  c->set_output(0, c->Vector(c->kUnknownDim));
  return Status::OK();
});

template<typename T>
class DecompressSampleIds : public OpKernel {
 public:
  explicit DecompressSampleIds(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // Grab the input tensor.
    const Tensor& input_tensor = context->input(0);
    auto input = input_tensor.flat<T>();
    const int N = input.size();

    // Check for improper input.
    bool error = false;
    int output_size = 0;
    for (int i = 0; !error && i < N; i++) {
      error = input(i) < 0;
      output_size += input(i);
    }

    OP_REQUIRES(
        context, !error,
        errors::InvalidArgument(
            "Error in DecompressSampleIds. Inputs must be non-negative."
        )
    );

    // Create an output tensor.
    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(
        context,
        context->allocate_output(0, TensorShape({output_size}), &output_tensor)
    );
    auto output_flat = output_tensor->flat<T>();

    T *output_data = output_flat.data();
    for (int current_sample = 0; current_sample < N; current_sample++) {
      std::fill_n(output_data, input(current_sample), current_sample);
      output_data += input(current_sample);
    }
  }
};

#define REGISTER(Type)                      \
                                            \
  REGISTER_KERNEL_BUILDER(                  \
      Name("CompressSampleIds")             \
          .Device(DEVICE_CPU)               \
          .TypeConstraint<Type>("T"),       \
      CompressSampleIds<Type>);             \
                                            \
  REGISTER_KERNEL_BUILDER(                  \
      Name("DecompressSampleIds")           \
          .Device(DEVICE_CPU)               \
          .TypeConstraint<Type>("T"),       \
      DecompressSampleIds<Type>);           \

REGISTER(int32);
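A worked sketch of the two ops' semantics on plain vectors (not the kernels themselves): compressing the sample ids [0, 0, 1, 3] counts occurrences per id, giving [2, 1, 0, 1], and decompressing inverts that:

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<int> Compress(const std::vector<int>& ids) {  // non-decreasing, >= 0
  std::vector<int> counts(ids.empty() ? 0 : ids.back() + 1, 0);
  for (int id : ids) counts[id]++;
  return counts;
}

std::vector<int> Decompress(const std::vector<int>& counts) {
  std::vector<int> ids;
  for (int sample = 0; sample < static_cast<int>(counts.size()); ++sample) {
    ids.insert(ids.end(), static_cast<size_t>(counts[sample]), sample);
  }
  return ids;
}

int main() {
  const std::vector<int> ids = {0, 0, 1, 3};
  assert(Compress(ids) == std::vector<int>({2, 1, 0, 1}));
  assert(Decompress(Compress(ids)) == ids);  // round-trips exactly
  return 0;
}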
BIN twml/libtwml/src/ops/compress_sample_ids.docx (binary file not shown)
@ -1,116 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "../tensorflow_utils.h"
#include "../resource_utils.h"

#include <string>
#include <set>

using std::string;

void join(const std::set<string>& v, char c, string& s) {
  s.clear();
  std::set<std::string>::iterator it = v.begin();
  while (it != v.end()) {
    s += *it;
    it++;
    if (it != v.end()) s += c;
  }
}

// C++ function that computes the substrings of a given word.
std::string computeSubwords(std::string word, int32_t minn, int32_t maxn) {
  std::string word2 = "<" + word + ">";
  std::set<string> ngrams;
  std::string s;
  ngrams.insert(word);
  ngrams.insert(word2);
  for (size_t i = 0; i < word2.size(); i++) {
    if ((word2[i] & 0xC0) == 0x80) continue;  // skip UTF-8 continuation bytes
    for (size_t j = minn; i + j <= word2.size() && j <= maxn; j++) {
      ngrams.insert(word2.substr(i, j));
    }
  }
  join(ngrams, ';', s);
  ngrams.clear();
  return s;
}

// tf-op function that computes substrings for a given tensor of words.
template<typename ValueType>
void ComputeSubStringsTensor(OpKernelContext *context, int32 min_n, int32 max_n) {
  try {
    const Tensor& values = context->input(0);

    auto values_flat = values.flat<ValueType>();

    // batch_size from input size:
    const int batch_size = values_flat.size();

    // Define the output tensor.
    Tensor* substrings = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, values.shape(), &substrings));

    auto substrings_flat = substrings->flat<ValueType>();
    // Compute substrings for the given tensor values.
    for (int64 i = 0; i < batch_size; i++) {
      substrings_flat(i) = computeSubwords(values_flat(i), min_n, max_n);
    }
  }
  catch (const std::exception &err) {
    context->CtxFailureWithWarning(errors::InvalidArgument(err.what()));
  }
}

REGISTER_OP("GetSubstrings")
.Attr("ValueType: {string}")
.Attr("min_n: int")
.Attr("max_n: int")
.Input("values: ValueType")
.Output("substrings: ValueType")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
  c->set_output(0, c->input(0));
  return Status::OK();
}).Doc(R"doc(

A tensorflow OP to convert a word to its substrings of length between min_n and max_n.

Attr
  min_n, max_n: The size range of the substrings.

Input
  values: 1D input tensor containing the values.

Outputs
  substrings: A string tensor where substrings are joined by ";".
)doc");

template<typename ValueType>
class GetSubstrings : public OpKernel {
 public:
  explicit GetSubstrings(OpKernelConstruction *context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("min_n", &min_n));
    OP_REQUIRES_OK(context, context->GetAttr("max_n", &max_n));
  }

 private:
  int32 min_n;
  int32 max_n;
  void Compute(OpKernelContext *context) override {
    ComputeSubStringsTensor<ValueType>(context, min_n, max_n);
  }
};

#define REGISTER_SUBSTRINGS(ValueType)              \
  REGISTER_KERNEL_BUILDER(                          \
      Name("GetSubstrings")                         \
          .Device(DEVICE_CPU)                       \
          .TypeConstraint<ValueType>("ValueType"),  \
      GetSubstrings<ValueType>);                    \

REGISTER_SUBSTRINGS(string)
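A worked standalone sketch of computeSubwords' behavior for ASCII input (the UTF-8 continuation-byte check is omitted here): for the word 'cat' with min_n = 2 and max_n = 3, the set holds 'cat', '<cat>' and every 2-3 byte window of '<cat>', joined by ';':

#include <iostream>
#include <set>
#include <string>

int main() {
  const std::string word = "cat";
  const size_t minn = 2, maxn = 3;

  std::string word2 = "<" + word + ">";
  std::set<std::string> ngrams = {word, word2};
  for (size_t i = 0; i < word2.size(); i++) {
    for (size_t j = minn; i + j <= word2.size() && j <= maxn; j++) {
      ngrams.insert(word2.substr(i, j));
    }
  }

  std::string joined;
  for (const auto& g : ngrams) joined += (joined.empty() ? "" : ";") + g;
  std::cout << joined << "\n";  // prints: <c;<ca;<cat>;at;at>;ca;cat;t>
  return 0;
}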
BIN twml/libtwml/src/ops/contrib/get_substrings.docx (binary file not shown)
File diff suppressed because it is too large.
BIN twml/libtwml/src/ops/data_record.docx (binary file not shown)
@ -1,81 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"

using namespace tensorflow;

REGISTER_OP("DataRecordTensorWriter")
.Attr("T: list({string, int32, int64, float, double, bool})")
.Input("keys: int64")
.Input("values: T")
.Output("result: uint8")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
  return Status::OK();
}).Doc(R"doc(

A tensorflow OP that packages keys and dense tensors into a DataRecord.

values: list of tensors
keys: feature ids from the original DataRecord (int64)

Outputs
  result: output DataRecord serialized using Thrift into a uint8 tensor.
)doc");

class DataRecordTensorWriter : public OpKernel {
 public:
  explicit DataRecordTensorWriter(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& keys = context->input(0);

    try {
      // Set keys as twml::Tensor.
      const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys);

      // Check sizes.
      uint64_t num_keys = in_keys_.getNumElements();
      uint64_t num_values = context->num_inputs() - 1;

      OP_REQUIRES(context, num_keys == num_values,
                  errors::InvalidArgument("Number of dense keys and dense tensors do not match"));

      // Populate the DataRecord object.
      const int64_t *key_data = in_keys_.getData<int64_t>();
      twml::DataRecord record = twml::DataRecord();

      for (int i = 1; i < context->num_inputs(); i++) {
        const twml::RawTensor& value = TFTensor_to_twml_raw_tensor(context->input(i));
        record.addRawTensor(key_data[i - 1], value);
      }

      // Determine the length of the encoded result (no memory is copied).
      twml::ThriftWriter thrift_dry_writer = twml::ThriftWriter(nullptr, 0, true);
      twml::DataRecordWriter record_dry_writer = twml::DataRecordWriter(thrift_dry_writer);
      record_dry_writer.write(record);
      int len = thrift_dry_writer.getBytesWritten();
      TensorShape result_shape = {1, len};

      // Allocate the output tensor.
      Tensor* result = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, result_shape, &result));
      twml::Tensor out_result = TFTensor_to_twml_tensor(*result);

      // Write to the output tensor.
      uint8_t *buffer = out_result.getData<uint8_t>();
      twml::ThriftWriter thrift_writer = twml::ThriftWriter(buffer, len, false);
      twml::DataRecordWriter record_writer = twml::DataRecordWriter(thrift_writer);
      record_writer.write(record);
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

REGISTER_KERNEL_BUILDER(
    Name("DataRecordTensorWriter").Device(DEVICE_CPU),
    DataRecordTensorWriter);
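The dry-run pass above sizes the record without copying memory. A standalone sketch of that dry-writer idea (CountingWriter is an illustrative stand-in; only the buffer/size/dry constructor shape mirrors the twml::ThriftWriter usage above):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Stand-in writer: in dry mode it only counts bytes, so the same
// serialization code can size a buffer before filling it.
class CountingWriter {
 public:
  CountingWriter(uint8_t* buffer, size_t size, bool dry)
      : buffer_(buffer), size_(size), dry_(dry) {}

  void writeBytes(const uint8_t* data, size_t n) {
    if (!dry_ && written_ + n <= size_) {
      std::memcpy(buffer_ + written_, data, n);
    }
    written_ += n;  // counted in both modes
  }

  size_t getBytesWritten() const { return written_; }

 private:
  uint8_t* buffer_;
  size_t size_;
  bool dry_;
  size_t written_ = 0;
};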
BIN twml/libtwml/src/ops/data_record_tensor_writer.docx (binary file not shown)
@ -1,293 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"

using namespace tensorflow;

void ComputeDiscretizers(OpKernelContext* context, const bool return_bin_indices = false) {
  const Tensor& keys = context->input(0);
  const Tensor& vals = context->input(1);
  const Tensor& bin_ids = context->input(2);
  const Tensor& bin_vals = context->input(3);
  const Tensor& feature_offsets = context->input(4);

  Tensor* new_keys = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(0, keys.shape(),
                                                   &new_keys));
  Tensor* new_vals = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(1, keys.shape(),
                                                   &new_vals));

  try {
    twml::Tensor out_keys_ = TFTensor_to_twml_tensor(*new_keys);
    twml::Tensor out_vals_ = TFTensor_to_twml_tensor(*new_vals);

    const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys);
    const twml::Tensor in_vals_ = TFTensor_to_twml_tensor(vals);
    const twml::Tensor bin_ids_ = TFTensor_to_twml_tensor(bin_ids);
    const twml::Tensor bin_vals_ = TFTensor_to_twml_tensor(bin_vals);
    const twml::Tensor feature_offsets_ = TFTensor_to_twml_tensor(feature_offsets);
    twml::mdlInfer(out_keys_, out_vals_,
                   in_keys_, in_vals_,
                   bin_ids_, bin_vals_,
                   feature_offsets_,
                   return_bin_indices);
  } catch (const std::exception &e) {
    context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
  }
}

REGISTER_OP("MDL")
.Attr("T: {float, double}")
.Input("keys: int64")
.Input("vals: T")
.Input("bin_ids: int64")
.Input("bin_vals: T")
.Input("feature_offsets: int64")
.Output("new_keys: int64")
.Output("new_vals: T")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
  // TODO: check sizes
  c->set_output(0, c->input(0));
  c->set_output(1, c->input(0));
  return Status::OK();
}).Doc(R"doc(

This operation discretizes a tensor containing continuous features.

Input
  keys: A tensor containing feature ids.
  vals: A tensor containing values at corresponding feature ids.
  bin_ids: A tensor containing the discretized feature id for a given bin.
  bin_vals: A tensor containing the bin boundaries for the value at a given feature id.
  feature_offsets: Specifies the starting location of bins for a given feature id.

Expected Sizes:
  keys, vals: [N].
  bin_ids, bin_vals: [sum_{n=1}^{n=num_classes} num_bins(n)]

  where
  - N is the number of sparse features in the current batch.
  - [0, num_classes) represents the range each feature id can take.
  - num_bins(n) is the number of bins for a given feature id.
  - If num_bins is fixed, then xs, ys are of size [num_classes * num_bins].

Expected Types:
  keys, bin_ids: int64.
  vals: float or double.
  bin_vals: same as vals.

Before using MDL, you should use a hashmap to get the intersection of
input `keys` with the features that MDL knows about:
::
  keys, vals  # keys can be in range [0, 1 << 63)
  mdl_keys = hashmap.find(keys)  # mdl_keys are now in range [0, num_classes_from_calibration)
  mdl_keys = where(mdl_keys != -1)  # Ignore keys not found


Inside MDL, the following is happening:
::
  start = offsets[key[i]]
  end = offsets[key[i] + 1]
  idx = binary_search for val[i] in [bin_vals[start], bin_vals[end]]

  result_keys[i] = bin_ids[idx]
  val[i] = 1  # binary feature value

Outputs
  new_keys: The discretized feature ids with same shape and size as keys.
  new_vals: The discretized values with the same shape and size as vals.

)doc");

template<typename T>
class MDL : public OpKernel {
 public:
  explicit MDL(OpKernelConstruction* context) : OpKernel(context) {
  }

  void Compute(OpKernelContext* context) override {
    ComputeDiscretizers(context);
  }
};
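A sketch of the per-feature bin lookup the doc describes, with std::upper_bound standing in for the binary search (twml::mdlInfer is the actual implementation; the boundary conventions below are illustrative):

#include <algorithm>
#include <cstdint>
#include <vector>

// For feature `key` with continuous value `val`, find its bin inside
// [offsets[key], offsets[key + 1]) and emit the discretized id, as the
// doc above outlines. The emitted value itself becomes 1 (binary feature).
int64_t DiscretizeOne(int64_t key, double val,
                      const std::vector<int64_t>& bin_ids,
                      const std::vector<double>& bin_vals,
                      const std::vector<int64_t>& offsets) {
  const int64_t start = offsets[key];
  const int64_t end = offsets[key + 1];
  const auto it = std::upper_bound(bin_vals.begin() + start,
                                   bin_vals.begin() + end, val);
  const int64_t idx = std::max<int64_t>(start, (it - bin_vals.begin()) - 1);
  return bin_ids[idx];
}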
REGISTER_OP("PercentileDiscretizer")
|
||||
.Attr("T: {float, double}")
|
||||
.Input("keys: int64")
|
||||
.Input("vals: T")
|
||||
.Input("bin_ids: int64")
|
||||
.Input("bin_vals: T")
|
||||
.Input("feature_offsets: int64")
|
||||
.Output("new_keys: int64")
|
||||
.Output("new_vals: T")
|
||||
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
|
||||
// TODO: check sizes
|
||||
c->set_output(0, c->input(0));
|
||||
c->set_output(1, c->input(0));
|
||||
return Status::OK();
|
||||
}).Doc(R"doc(
|
||||
|
||||
This operation discretizes a tensor containing continuous features.
|
||||
|
||||
Input
|
||||
keys: A tensor containing feature ids.
|
||||
vals: A tensor containing values at corresponding feature ids.
|
||||
bin_ids: A tensor containing the discretized feature id for a given bin.
|
||||
bin_vals: A tensor containing the bin boundaries for value at a given feature id.
|
||||
feature_offsets: Specifies the starting location of bins for a given feature id.
|
||||
|
||||
Expected Sizes:
|
||||
keys, vals: [N].
|
||||
bin_ids, bin_vals: [sum_{n=1}^{n=num_classes} num_bins(n)]
|
||||
|
||||
where
|
||||
- N is the number of sparse features in the current batch.
|
||||
- [0, num_classes) represents the range each feature id can take.
|
||||
- num_bins(n) is the number of bins for a given feature id.
|
||||
- If num_bins is fixed, then xs, ys are of size [num_classes * num_bins].
|
||||
|
||||
Expected Types:
|
||||
keys, bin_ids: int64.
|
||||
vals: float or double.
|
||||
bin_vals: same as vals.
|
||||
|
||||
Before using PercentileDiscretizer, you should use a hashmap to get the intersection of
|
||||
input `keys` with the features that PercentileDiscretizer knows about:
|
||||
::
|
||||
keys, vals # keys can be in range [0, 1 << 63)
|
||||
percentile_discretizer_keys = hashmap.find(keys) # percentile_discretizer_keys are now in range [0, num_classes_from_calibration)
|
||||
percentile_discretizer_keys = where (percentile_discretizer_keys != -1) # Ignore keys not found
|
||||
|
||||
|
||||
Inside PercentileDiscretizer, the following is happening:
|
||||
::
|
||||
start = offsets[key[i]]
|
||||
end = offsets[key[i] + 1]
|
||||
idx = binary_search for val[i] in [bin_vals[start], bin_vals[end]]
|
||||
|
||||
result_keys[i] = bin_ids[idx]
|
||||
val[i] = 1 # binary feature value
|
||||
|
||||
Outputs
|
||||
new_keys: The discretized feature ids with same shape and size as keys.
|
||||
new_vals: The discretized values with the same shape and size as vals.
|
||||
|
||||
)doc");

template<typename T>
class PercentileDiscretizer : public OpKernel {
 public:
  explicit PercentileDiscretizer(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    ComputeDiscretizers(context);
  }
};


REGISTER_OP("PercentileDiscretizerBinIndices")
  .Attr("T: {float, double}")
  .Input("keys: int64")
  .Input("vals: T")
  .Input("bin_ids: int64")
  .Input("bin_vals: T")
  .Input("feature_offsets: int64")
  .Output("new_keys: int64")
  .Output("new_vals: T")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    // TODO: check sizes
    c->set_output(0, c->input(0));
    c->set_output(1, c->input(0));
    return Status::OK();
  }).Doc(R"doc(

This operation discretizes a tensor containing continuous features.
If the feature id and bin index of a discretized value are the same across
multiple runs, it will always be assigned the same output key and value,
regardless of the bin_id assigned during calibration.

Input
  keys: A tensor containing feature ids.
  vals: A tensor containing values at corresponding feature ids.
  bin_ids: A tensor containing the discretized feature id for a given bin.
  bin_vals: A tensor containing the bin boundaries for values at a given feature id.
  feature_offsets: Specifies the starting location of bins for a given feature id.

Expected Sizes:
  keys, vals: [N].
  bin_ids, bin_vals: [sum_{n=1}^{n=num_classes} num_bins(n)]

  where
    - N is the number of sparse features in the current batch.
    - [0, num_classes) represents the range each feature id can take.
    - num_bins(n) is the number of bins for a given feature id.
    - If num_bins is fixed, then xs, ys are of size [num_classes * num_bins].

Expected Types:
  keys, bin_ids: int64.
  vals: float or double.
  bin_vals: same as vals.

Before using PercentileDiscretizerBinIndices, you should use a hashmap to get the intersection of
input `keys` with the features that PercentileDiscretizerBinIndices knows about:
::

  keys, vals  # keys can be in range [0, 1 << 63)
  percentile_discretizer_keys = hashmap.find(keys)  # percentile_discretizer_keys are now in range [0, num_classes_from_calibration)
  percentile_discretizer_keys = where(percentile_discretizer_keys != -1)  # Ignore keys not found

Inside PercentileDiscretizerBinIndices, the following is happening:
::

  start = offsets[key[i]]
  end = offsets[key[i] + 1]
  idx = binary_search for val[i] in [bin_vals[start], bin_vals[end]]

  result_keys[i] = bin_ids[idx]
  val[i] = 1  # binary feature value

Outputs
  new_keys: The discretized feature ids, with the same shape and size as keys.
  new_vals: The discretized values, with the same shape and size as vals.

)doc");

template<typename T>
class PercentileDiscretizerBinIndices : public OpKernel {
 public:
  explicit PercentileDiscretizerBinIndices(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    ComputeDiscretizers(context, true);
  }
};


#define REGISTER(Type)                              \
                                                    \
  REGISTER_KERNEL_BUILDER(                          \
      Name("PercentileDiscretizerBinIndices")       \
          .Device(DEVICE_CPU)                       \
          .TypeConstraint<Type>("T"),               \
      PercentileDiscretizerBinIndices<Type>);       \
                                                    \
  REGISTER_KERNEL_BUILDER(                          \
      Name("PercentileDiscretizer")                 \
          .Device(DEVICE_CPU)                       \
          .TypeConstraint<Type>("T"),               \
      PercentileDiscretizer<Type>);                 \
                                                    \
  REGISTER_KERNEL_BUILDER(                          \
      Name("MDL")                                   \
          .Device(DEVICE_CPU)                       \
          .TypeConstraint<Type>("T"),               \
      MDL<Type>);

REGISTER(float);
REGISTER(double);
BIN twml/libtwml/src/ops/discretizer.docx (new file, binary not shown)
@ -1,134 +0,0 @@

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"
#include <map>
#include <vector>

REGISTER_OP("FeatureExtractor")
  .Attr("T: {float, double} = DT_FLOAT")
  .Input("mask_in: bool")
  .Input("ids_in: int64")
  .Input("keys_in: int64")
  .Input("values_in: T")
  .Input("codes_in: int64")
  .Input("types_in: int8")
  .Output("ids_out: int64")
  .Output("keys_out: int64")
  .Output("values_out: T")
  .Output("codes_out: int64")
  .Output("types_out: int8")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(

A tensorflow OP that extracts the desired indices of a Tensor based on a mask.

Input
  mask_in: boolean Tensor that determines which indices are kept (bool)
  ids_in: input indices Tensor (int64)
  keys_in: input keys Tensor (int64)
  values_in: input values Tensor (float/double)
  codes_in: input codes Tensor (int64)
  types_in: input types Tensor (int8)

Outputs
  ids_out: output indices Tensor (int64)
  keys_out: output keys Tensor (int64)
  values_out: output values Tensor (float/double)
  codes_out: output codes Tensor (int64)
  types_out: output types Tensor (int8)

)doc");
template <typename T>
class FeatureExtractor : public OpKernel {
 public:
  explicit FeatureExtractor(OpKernelConstruction* context)
    : OpKernel(context) {}

  template <typename A, typename U>
  bool allequal(const A &t, const U &u) {
    return t == u;
  }

  template <typename A, typename U, typename... Others>
  bool allequal(const A &t, const U &u, Others const &... args) {
    return (t == u) && allequal(u, args...);
  }
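
  // allequal above folds pairwise comparisons left to right, so
  // allequal(a, b, c, d) evaluates (a == b) && (b == c) && (c == d).
  // Compute uses it to require that all six inputs have equal length.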

  void Compute(OpKernelContext* context) override {
    // Get input tensors
    const Tensor& input_mask = context->input(0);
    const Tensor& input_ids = context->input(1);
    const Tensor& input_keys = context->input(2);
    const Tensor& input_values = context->input(3);
    const Tensor& input_codes = context->input(4);
    const Tensor& input_types = context->input(5);

    auto mask = input_mask.flat<bool>();
    auto ids = input_ids.flat<int64>();
    auto keys = input_keys.flat<int64>();
    auto codes = input_codes.flat<int64>();
    auto values = input_values.flat<T>();
    auto types = input_types.flat<int8>();

    // Verify that all input tensors have the same size.
    OP_REQUIRES(context, allequal(mask.size(), ids.size(), keys.size(), codes.size(), values.size(), types.size()),
                errors::InvalidArgument("all input vectors must be the same size."));

    // Get the size of the output vectors by counting the trues in the mask.
    int total_size = 0;
    for (int i = 0; i < mask.size(); i++) {
      if (mask(i))
        total_size += 1;
    }

    // The output shape is the number of trues in the mask.
    TensorShape shape_out = {total_size};

    // Create the output tensors
    Tensor* output_codes = nullptr;
    Tensor* output_ids = nullptr;
    Tensor* output_values = nullptr;
    Tensor* output_types = nullptr;
    Tensor* output_keys = nullptr;

    OP_REQUIRES_OK(context, context->allocate_output(0, shape_out, &output_ids));
    OP_REQUIRES_OK(context, context->allocate_output(1, shape_out, &output_keys));
    OP_REQUIRES_OK(context, context->allocate_output(2, shape_out, &output_values));
    OP_REQUIRES_OK(context, context->allocate_output(3, shape_out, &output_codes));
    OP_REQUIRES_OK(context, context->allocate_output(4, shape_out, &output_types));

    auto output_ids_ = output_ids->flat<int64>();
    auto output_keys_ = output_keys->flat<int64>();
    auto output_codes_ = output_codes->flat<int64>();
    auto output_values_ = output_values->flat<T>();
    auto output_types_ = output_types->flat<int8>();

    // Iterate through the mask and copy kept entries to the output tensors.
    int j = 0;
    for (int i = 0; i < mask.size(); i++) {
      if (mask(i)) {
        output_ids_(j) = ids(i);
        output_keys_(j) = keys(i);
        output_values_(j) = values(i);
        output_codes_(j) = codes(i);
        output_types_(j) = types(i);
        ++j;
      }
    }
  }
};

#define REGISTER(Type)                    \
                                          \
  REGISTER_KERNEL_BUILDER(                \
      Name("FeatureExtractor")            \
          .Device(DEVICE_CPU)             \
          .TypeConstraint<Type>("T"),     \
      FeatureExtractor<Type>);

REGISTER(float);
REGISTER(double);
BIN twml/libtwml/src/ops/feature_extractor.docx (new file, binary not shown)
@ -1,58 +0,0 @@

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"

using namespace tensorflow;

REGISTER_OP("FeatureId")
  .Attr("feature_names: list(string)")
  .Output("output: int64")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(

A tensorflow OP that hashes a list of strings into int64. This is used for feature name hashing.

Attr
  feature_names: a list of string feature names (list(string)).

Outputs
  output: hashes corresponding to the string feature names (int64).
)doc");


class FeatureId : public OpKernel {
 private:
  std::vector<string> input_vector;

 public:
  explicit FeatureId(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("feature_names", &input_vector));
  }

  void Compute(OpKernelContext* context) override {
    // Get the size of input_vector and create the TensorShape
    const int total_size = static_cast<int>(input_vector.size());
    TensorShape shape = {total_size};

    // Create an output tensor
    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, shape,
                                                     &output_tensor));
    auto output_flat = output_tensor->flat<int64>();

    // Hash each feature name into an int64
    for (int i = 0; i < total_size; i++) {
      output_flat(i) = twml::featureId(input_vector[i]);
    }
  }
};

REGISTER_KERNEL_BUILDER(
    Name("FeatureId")
        .Device(DEVICE_CPU),
    FeatureId);
BIN twml/libtwml/src/ops/feature_id.docx (new file, binary not shown)
@ -1,83 +0,0 @@

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"
#include <map>
#include <vector>
#include <set>

REGISTER_OP("FeatureMask")
  .Attr("T: {int64, int8}")
  .Input("keep: T")
  .Attr("list_keep: list(int)")
  .Output("mask: bool")

  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(

A tensorflow OP that creates a mask of the indices that should be kept.

Attribute
  list_keep: list of values which should be kept (list(int))

Input
  keep: Tensor for which we will apply the mask (int64, int8)

Outputs
  mask: boolean Tensor (bool)

)doc");
template <typename T>
class FeatureMask : public OpKernel {
 private:
  std::set<int64> feature_set_keep;

 public:
  explicit FeatureMask(OpKernelConstruction* context)
    : OpKernel(context) {
    std::vector<int64> feature_list_keep;
    OP_REQUIRES_OK(context, context->GetAttr("list_keep", &feature_list_keep));
    // Copy the contents of list_keep into a set, since tensorflow does not
    // allow reading the attribute directly into a set.
    feature_set_keep = std::set<int64>(feature_list_keep.begin(), feature_list_keep.end());
  }

  void Compute(OpKernelContext* context) override {
    // Get the input tensor and flatten it
    const Tensor& input = context->input(0);

    auto keep = input.flat<T>();

    // Create an output tensor
    Tensor* output_mask = nullptr;

    // The output shape matches the input, so its contents can be filled in directly.
    const int total_size_out = static_cast<int>(keep.size());

    TensorShape shape_out = {total_size_out};

    OP_REQUIRES_OK(context, context->allocate_output(0, shape_out, &output_mask));

    auto output_mask_ = output_mask->flat<bool>();

    // Check whether each value is in the set; the output is boolean.
    for (int j = 0; j < keep.size(); j++) {
      output_mask_(j) = (feature_set_keep.count(keep(j)));
    }
  }
};

#define REGISTER(Type)                    \
                                          \
  REGISTER_KERNEL_BUILDER(                \
      Name("FeatureMask")                 \
          .Device(DEVICE_CPU)             \
          .TypeConstraint<Type>("T"),     \
      FeatureMask<Type>);

REGISTER(int64);
REGISTER(int8);
BIN twml/libtwml/src/ops/feature_mask.docx (new file, binary not shown)
@ -1,190 +0,0 @@

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"
#include "resource_utils.h"

#include <algorithm>
using std::string;

template<typename IndexType, typename ValueType, bool calc_batch_size>
void ComputeFixedLengthTensor(OpKernelContext *context, int64 max_length_) {
  try {
    const Tensor& segment_ids = context->input(0);
    const Tensor& values = context->input(1);
    const Tensor& pad_value = context->input(2);

    auto indices_flat = segment_ids.flat<IndexType>();
    auto values_flat = values.flat<ValueType>();

    auto pad_value_scalar = pad_value.scalar<ValueType>()();

    // Get the maximum length from the batch if the user hasn't specified it.
    int64 max_length = max_length_;
    if (max_length < 0 && indices_flat.size() > 0) {
      int64 current_id = indices_flat(0);
      int64 current_length = 1;

      for (int64 i = 1; i < indices_flat.size(); i++) {
        if (current_id == indices_flat(i)) {
          current_length++;
        } else {
          current_id = indices_flat(i);
          max_length = std::max(max_length, current_length);
          current_length = 1;
        }
      }
      // This is needed if the last segment is the longest sequence.
      max_length = std::max(max_length, current_length);
    }

    int64 batch_size = 0;
    if (calc_batch_size) {
      if (indices_flat.size() > 0) {
        // The last value of segment_ids will be batch_size - 1.
        batch_size = 1 + indices_flat(indices_flat.size() - 1);
      } else {
        batch_size = 0;
      }
    } else {
      const Tensor& batch_size_tensor = context->input(3);
      batch_size = batch_size_tensor.flat<int64>()(0);
    }

    TensorShape output_shape = {batch_size, max_length};
    Tensor* fixed_length = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &fixed_length));

    auto fixed_length_flat = fixed_length->flat<ValueType>();

    int64 n = 0;
    int64 offset = 0;
    for (int64 i = 0; i < batch_size; i++) {
      for (int64 j = 0; j < max_length; j++) {
        if (n < indices_flat.size() && indices_flat(n) == i) {
          // Copy from the variable-length tensor.
          fixed_length_flat(offset + j) = values_flat(n);
          n++;
        } else {
          // Pad to fixed length.
          fixed_length_flat(offset + j) = pad_value_scalar;
        }
      }
      // Corner case: truncate to max_length if the user specified max_length < current length.
      while (n < indices_flat.size() && i == indices_flat(n)) n++;

      // Update the output offset.
      offset += max_length;
    }
  } catch (const std::exception &err) {
    context->CtxFailureWithWarning(errors::InvalidArgument(err.what()));
  }
}

REGISTER_OP("FixedLengthTensor")
  .Attr("IndexType: {int64, int32}")
  .Attr("ValueType: {int64, int32, string}")
  .Attr("max_length: int")
  .Input("segment_ids: IndexType")
  .Input("values: ValueType")
  .Input("pad_value: ValueType")
  .Output("fixed_length: ValueType")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(

A tensorflow OP to convert variable-length segments into a fixed-length tensor.

Attr
  max_length: The size of the innermost (i.e. last) dimension.

Input
  segment_ids: 1D input tensor containing the sorted segment_ids.
  values: 1D input tensor containing the values.
  pad_value: The value used for padding the fixed-length tensor.

Outputs
  fixed_length: A fixed-length tensor of size [batch_size, max_length].
)doc");

template<typename IndexType, typename ValueType>
class FixedLengthTensor: public OpKernel {
 public:
  explicit FixedLengthTensor(OpKernelConstruction *context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("max_length", &max_length_));
  }

 private:
  int64 max_length_;

  void Compute(OpKernelContext *context) override {
    ComputeFixedLengthTensor<IndexType, ValueType, true>(context, max_length_);
  }
};

REGISTER_OP("FixedLengthTensorV2")
  .Attr("IndexType: {int64, int32}")
  .Attr("ValueType: {int64, int32, string}")
  .Attr("max_length: int")
  .Input("segment_ids: IndexType")
  .Input("values: ValueType")
  .Input("pad_value: ValueType")
  .Input("batch_size: int64")
  .Output("fixed_length: ValueType")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(

A tensorflow OP to convert variable-length segments into a fixed-length tensor.

Attr
  max_length: The size of the innermost (i.e. last) dimension.

Input
  segment_ids: 1D input tensor containing the sorted segment_ids.
  values: 1D input tensor containing the values.
  pad_value: The value used for padding the fixed-length tensor.
  batch_size: The batch size to use.

Outputs
  fixed_length: A fixed-length tensor of size [batch_size, max_length].
)doc");

template<typename IndexType, typename ValueType>
class FixedLengthTensorV2: public OpKernel {
 public:
  explicit FixedLengthTensorV2(OpKernelConstruction *context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("max_length", &max_length_));
  }

 private:
  int64 max_length_;

  void Compute(OpKernelContext *context) override {
    ComputeFixedLengthTensor<IndexType, ValueType, false>(context, max_length_);
  }
};

#define REGISTER_SPARSE_TO_FIXED_LENGTH(IndexType, ValueType)  \
  REGISTER_KERNEL_BUILDER(                                     \
      Name("FixedLengthTensor")                                \
          .Device(DEVICE_CPU)                                  \
          .TypeConstraint<IndexType>("IndexType")              \
          .TypeConstraint<ValueType>("ValueType"),             \
      FixedLengthTensor<IndexType, ValueType>);                \
                                                               \
  REGISTER_KERNEL_BUILDER(                                     \
      Name("FixedLengthTensorV2")                              \
          .Device(DEVICE_CPU)                                  \
          .TypeConstraint<IndexType>("IndexType")              \
          .TypeConstraint<ValueType>("ValueType"),             \
      FixedLengthTensorV2<IndexType, ValueType>);

REGISTER_SPARSE_TO_FIXED_LENGTH(int64, int64)
REGISTER_SPARSE_TO_FIXED_LENGTH(int64, int32)
REGISTER_SPARSE_TO_FIXED_LENGTH(int64, string)
REGISTER_SPARSE_TO_FIXED_LENGTH(int32, int64)
REGISTER_SPARSE_TO_FIXED_LENGTH(int32, int32)
REGISTER_SPARSE_TO_FIXED_LENGTH(int32, string)
BIN twml/libtwml/src/ops/fixed_length_tensor.docx (new file, binary not shown)
@ -1,520 +0,0 @@

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"
#include "resource_utils.h"

#include <functional>

REGISTER_OP("DecodeAndHashDataRecord")
  .Attr("InputType: {uint8, string}")
  .Input("input_bytes: InputType")
  .Attr("keep_features: list(int)")
  .Attr("keep_codes: list(int)")
  .Attr("label_features: list(int)")
  .Attr("weight_features: list(int) = []")
  .Attr("decode_mode: int = 0")
  .Output("hashed_data_record_handle: resource")
  .SetShapeFn(shape_inference::ScalarShape)
  .Doc(R"doc(
A tensorflow OP that creates a handle for the hashed data record.

Attr
  keep_features: a list of int ids to keep.
  keep_codes: their corresponding codes.
  label_features: list of feature ids representing the labels.
  weight_features: list of feature ids representing the weights. Defaults to an empty list.
  decode_mode: integer indicating which decoding method to use. Let a sparse continuous
    feature have a feature_name and a dict of {name: value}. 0 indicates feature_ids are
    computed as hash(name). 1 indicates feature_ids are computed as hash(feature_name, name).
  shared_name: name used by the resource handle inside the resource manager.
  container: name used by the container of the resources.

Input
  input_bytes: Input tensor containing the serialized batch of HashedDataRecords.

Outputs
  hashed_data_record_handle: A resource handle to the batch of HashedDataRecords.
)doc");

template<typename InputType>
class DecodeAndHashDataRecord : public OpKernel {
 public:
  explicit DecodeAndHashDataRecord(OpKernelConstruction* context)
    : OpKernel(context) {
    std::vector<int64> keep_features;
    std::vector<int64> keep_codes;

    std::vector<int64> label_features;
    std::vector<int64> weight_features;

    OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features));
    OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes));
    OP_REQUIRES_OK(context, context->GetAttr("label_features", &label_features));
    OP_REQUIRES_OK(context, context->GetAttr("weight_features", &weight_features));
    OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode));

    OP_REQUIRES(context, keep_features.size() == keep_codes.size(),
                errors::InvalidArgument("keep keys and values must have the same size."));

#ifdef USE_DENSE_HASH
    m_keep_map.set_empty_key(0);
    m_labels_map.set_empty_key(0);
    m_weights_map.set_empty_key(0);
#endif  // USE_DENSE_HASH

    for (uint64_t i = 0; i < keep_features.size(); i++) {
      m_keep_map[keep_features[i]] = keep_codes[i];
    }

    for (uint64_t i = 0; i < label_features.size(); i++) {
      m_labels_map[label_features[i]] = i;
    }

    for (uint64_t i = 0; i < weight_features.size(); i++) {
      m_weights_map[weight_features[i]] = i;
    }
  }

 private:
  twml::Map<int64_t, int64_t> m_keep_map;
  twml::Map<int64_t, int64_t> m_labels_map;
  twml::Map<int64_t, int64_t> m_weights_map;
  int64 m_decode_mode;

  void Compute(OpKernelContext* context) override {
    try {
      HashedDataRecordResource *resource = nullptr;
      OP_REQUIRES_OK(context, makeResourceHandle<HashedDataRecordResource>(context, 0, &resource));

      // Store the input bytes in the resource so they aren't freed before the resource.
      // This is necessary because we are not copying the contents of the tensors.
      resource->input = context->input(0);
      int batch_size = getBatchSize<InputType>(resource->input);
      int num_labels = static_cast<int>(m_labels_map.size());
      int num_weights = static_cast<int>(m_weights_map.size());

      twml::HashedDataRecordReader reader;
      reader.setKeepMap(&m_keep_map);
      reader.setLabelsMap(&m_labels_map);
      reader.setDecodeMode(m_decode_mode);

      // Do not set the weights map if it is empty. This will take a faster path.
      if (num_weights != 0) {
        reader.setWeightsMap(&m_weights_map);
      }

      resource->records.clear();
      resource->records.reserve(batch_size);

      int64 total_size = 0;

      for (int id = 0; id < batch_size; id++) {
        const uint8_t *input_bytes = getInputBytes<InputType>(resource->input, id);
        reader.setBuffer(input_bytes);
        resource->records.emplace_back(num_labels, num_weights);
        resource->records[id].decode(reader);
        total_size += static_cast<int64>(resource->records[id].totalSize());
      }

      resource->total_size = total_size;
      resource->num_labels = num_labels;
      resource->num_weights = num_weights;
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

REGISTER_OP("GetIdsFromHashedDataRecord")
  .Input("hashed_data_record_handle: resource")
  .Output("ids: int64")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(
A tensorflow OP that returns unhashed ids from the hashed data record.
Input
  hashed_data_record_handle: Resource handle to DataRecord

Outputs
  ids: the index of records[id] in the batch for each key/value slot (int64)
)doc");

// This kernel is used for both training and serving once the resource is created.
class GetIdsFromHashedDataRecord : public OpKernel {
 public:
  explicit GetIdsFromHashedDataRecord(OpKernelConstruction* context)
    : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    try {
      auto handle = getHandle<HashedDataRecordResource>(context, 0);
      const auto &records = handle->records;
      const auto &common = handle->common;
      const int64 common_size = static_cast<int64>(common.totalSize());
      const int64 total_size = handle->total_size;
      TensorShape shape = {total_size};

      Tensor *ids;
      OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids));

      int id = 0;
      int64 offset = 0;
      auto ids_flat = ids->flat<int64>();
      for (const auto &record : records) {
        // Since common features are added to each input, add the common_size to the current size.
        // For training common_size == 0; for serving it can be a non-zero value.
        int64 curr_size = static_cast<int64>(record.totalSize()) + common_size;
        std::fill(ids_flat.data() + offset, ids_flat.data() + offset + curr_size, id);
        offset += curr_size;
        id++;
      }
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};


// OutType: output tensor type. FieldType: the storage type used inside HashedDataRecord.
template<typename OutType, typename FieldType>
class GetOutputFromHashedDataRecord : public OpKernel {
 protected:
  using Getter = std::function<const std::vector<FieldType>&(const twml::HashedDataRecord &)>;
  Getter getter;

 public:
  explicit GetOutputFromHashedDataRecord(OpKernelConstruction* context)
    : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    try {
      auto handle = getHandle<HashedDataRecordResource>(context, 0);
      const auto &records = handle->records;
      const auto &common = handle->common;
      const int64 total_size = handle->total_size;
      TensorShape shape = {total_size};

      Tensor *output;
      OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output));

      const auto &common_output = getter(common);

      auto output_data = output->flat<OutType>().data();
      for (const auto &record : records) {
        // This does not copy anything during training, as common_size == 0.
        // It will copy the relevant common features coming from a batch prediction request.
        output_data = std::copy(common_output.begin(), common_output.end(), output_data);

        // Copy the current record to the output.
        const auto& rec_output = getter(record);
        output_data = std::copy(rec_output.begin(), rec_output.end(), output_data);
      }
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

REGISTER_OP("GetUKeysFromHashedDataRecord")
  .Input("hashed_data_record_handle: resource")
  .Output("ukeys: int64")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(
A tensorflow OP that returns unhashed keys from the hashed data record.
Input
  hashed_data_record_handle: Resource handle to DataRecord

Outputs
  ukeys: unhashed keys / raw feature ids from the original request.
)doc");

class GetUKeysFromHashedDataRecord : public GetOutputFromHashedDataRecord<int64, int64_t> {
 public:
  explicit GetUKeysFromHashedDataRecord(OpKernelConstruction* context)
    : GetOutputFromHashedDataRecord<int64, int64_t>(context) {
    getter = [](const twml::HashedDataRecord &record) -> const std::vector<int64_t> & {
      return record.keys();
    };
  }
};

REGISTER_OP("GetKeysFromHashedDataRecord")
  .Input("hashed_data_record_handle: resource")
  .Output("keys: int64")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(
A tensorflow OP that returns keys from the hashed data record.
Input
  hashed_data_record_handle: Resource handle to DataRecord

Outputs
  keys: keys after raw feature ids are hashed with values (int64)
)doc");

class GetKeysFromHashedDataRecord : public GetOutputFromHashedDataRecord<int64, int64_t> {
 public:
  explicit GetKeysFromHashedDataRecord(OpKernelConstruction* context)
    : GetOutputFromHashedDataRecord<int64, int64_t>(context) {
    getter = [](const twml::HashedDataRecord &record) -> const std::vector<int64_t> & {
      return record.transformed_keys();
    };
  }
};

REGISTER_OP("GetValuesFromHashedDataRecord")
  .Input("hashed_data_record_handle: resource")
  .Output("values: float")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(
A tensorflow OP that returns values from the hashed data record.
Input
  hashed_data_record_handle: Resource handle to DataRecord

Outputs
  values: feature values.
)doc");

class GetValuesFromHashedDataRecord : public GetOutputFromHashedDataRecord<float, double> {
 public:
  explicit GetValuesFromHashedDataRecord(OpKernelConstruction* context)
    : GetOutputFromHashedDataRecord<float, double>(context) {
    getter = [](const twml::HashedDataRecord &record) -> const std::vector<double> & {
      return record.values();
    };
  }
};

REGISTER_OP("GetCodesFromHashedDataRecord")
  .Input("hashed_data_record_handle: resource")
  .Output("codes: int64")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(
A tensorflow OP that returns codes from the hashed data record.
Input
  hashed_data_record_handle: Resource handle to DataRecord

Outputs
  codes: deepbird feature codes, usually from A, B, C, D, ... in the config.
)doc");

class GetCodesFromHashedDataRecord : public GetOutputFromHashedDataRecord<int64, int64_t> {
 public:
  explicit GetCodesFromHashedDataRecord(OpKernelConstruction* context)
    : GetOutputFromHashedDataRecord<int64, int64_t>(context) {
    getter = [](const twml::HashedDataRecord &record) -> const std::vector<int64_t> & {
      return record.codes();
    };
  }
};

REGISTER_OP("GetTypesFromHashedDataRecord")
  .Input("hashed_data_record_handle: resource")
  .Output("types: int8")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(
A tensorflow OP that returns types from the hashed data record.
Input
  hashed_data_record_handle: Resource handle to DataRecord

Outputs
  types: feature types corresponding to BINARY, DISCRETE, etc.
)doc");

class GetTypesFromHashedDataRecord : public GetOutputFromHashedDataRecord<int8, uint8_t> {
 public:
  explicit GetTypesFromHashedDataRecord(OpKernelConstruction* context)
    : GetOutputFromHashedDataRecord<int8, uint8_t>(context) {
    getter = [](const twml::HashedDataRecord &record) -> const std::vector<uint8_t> & {
      return record.types();
    };
  }
};

REGISTER_OP("GetBatchSizeFromHashedDataRecord")
  .Input("hashed_data_record_handle: resource")
  .Output("batch_size: int64")
  .SetShapeFn(shape_inference::ScalarShape)
  .Doc(R"doc(
A tensorflow OP that returns the batch size from the hashed data record.
Input
  hashed_data_record_handle: Resource handle to DataRecord

Outputs
  batch_size: Number of records held in the handle.
)doc");

class GetBatchSizeFromHashedDataRecord : public OpKernel {
 public:
  explicit GetBatchSizeFromHashedDataRecord(OpKernelConstruction* context)
    : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    try {
      auto handle = getHandle<HashedDataRecordResource>(context, 0);
      Tensor *output;
      OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output));
      output->scalar<int64>()() = handle->records.size();
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

REGISTER_OP("GetTotalSizeFromHashedDataRecord")
  .Input("hashed_data_record_handle: resource")
  .Output("total_size: int64")
  .SetShapeFn(shape_inference::ScalarShape)
  .Doc(R"doc(
A tensorflow OP that returns the total size from the hashed data record.
Input
  hashed_data_record_handle: Resource handle to DataRecord

Outputs
  total_size: Total number of keys / values in the batch.
)doc");

class GetTotalSizeFromHashedDataRecord : public OpKernel {
 public:
  explicit GetTotalSizeFromHashedDataRecord(OpKernelConstruction* context)
    : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    try {
      auto handle = getHandle<HashedDataRecordResource>(context, 0);

      Tensor *output;
      OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output));
      output->scalar<int64>()() = handle->total_size;
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

REGISTER_OP("GetLabelsFromHashedDataRecord")
  .Input("hashed_data_record_handle: resource")
  .Output("labels: float")
  .Attr("default_label: float")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(
A tensorflow OP that returns labels from the hashed data record.
Input
  hashed_data_record_handle: Resource handle to DataRecord

Outputs
  labels: A 2D tensor of size [batch_size, num_labels] containing the label values.
)doc");

class GetLabelsFromHashedDataRecord : public OpKernel {
 private:
  float default_label;

 public:
  explicit GetLabelsFromHashedDataRecord(OpKernelConstruction* context)
    : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("default_label", &default_label));
  }

  void Compute(OpKernelContext* context) override {
    try {
      auto handle = getHandle<HashedDataRecordResource>(context, 0);
      const auto &records = handle->records;
      const int num_labels = static_cast<int>(handle->num_labels);
      TensorShape shape = {static_cast<int64>(handle->records.size()), num_labels};

      Tensor *labels;
      OP_REQUIRES_OK(context, context->allocate_output(0, shape, &labels));

      // A label that is not present in the data record is stored as NaN.
      // For those labels, substitute default_label; otherwise keep the label.
      auto func = [this](float label) -> float {
        return std::isnan(label) ? default_label : label;
      };

      auto labels_data = labels->flat<float>().data();
      for (const auto &record : records) {
        const auto& rec_labels = record.labels();
        labels_data = std::transform(rec_labels.begin(), rec_labels.end(), labels_data, func);
      }
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

REGISTER_OP("GetWeightsFromHashedDataRecord")
  .Input("hashed_data_record_handle: resource")
  .Output("weights: float")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    return Status::OK();
  }).Doc(R"doc(
A tensorflow OP that returns weights from the hashed data record.
Input
  hashed_data_record_handle: Resource handle to DataRecord

Outputs
  weights: A 2D tensor of size [batch_size, num_weights] containing the weight values.
)doc");

class GetWeightsFromHashedDataRecord : public OpKernel {
 public:
  explicit GetWeightsFromHashedDataRecord(OpKernelConstruction* context)
    : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    try {
      auto handle = getHandle<HashedDataRecordResource>(context, 0);
      const auto &records = handle->records;
      const int num_weights = static_cast<int>(handle->num_weights);
      TensorShape shape = {static_cast<int64>(handle->records.size()), num_weights};

      Tensor *weights;
      OP_REQUIRES_OK(context, context->allocate_output(0, shape, &weights));

      auto weights_data = weights->flat<float>().data();
      for (const auto &record : records) {
        const auto& rec_weights = record.weights();
        weights_data = std::copy(rec_weights.begin(), rec_weights.end(), weights_data);
      }
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};


#define REGISTER_DECODE_AND_HASH(InputType)        \
  REGISTER_KERNEL_BUILDER(                         \
      Name("DecodeAndHashDataRecord")              \
          .Device(DEVICE_CPU)                      \
          .TypeConstraint<InputType>("InputType"), \
      DecodeAndHashDataRecord<InputType>);

REGISTER_DECODE_AND_HASH(uint8)
REGISTER_DECODE_AND_HASH(string)

#define REGISTER_GETTER(FIELD)                    \
  REGISTER_KERNEL_BUILDER(                        \
      Name("Get" #FIELD "FromHashedDataRecord")   \
          .Device(DEVICE_CPU),                    \
      Get##FIELD##FromHashedDataRecord);
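
// REGISTER_GETTER builds both names by token pasting, e.g.
// REGISTER_GETTER(Labels) registers the op "GetLabelsFromHashedDataRecord"
// with the kernel class GetLabelsFromHashedDataRecord.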

REGISTER_GETTER(Ids)
REGISTER_GETTER(UKeys)
REGISTER_GETTER(Keys)
REGISTER_GETTER(Values)
REGISTER_GETTER(Codes)
REGISTER_GETTER(Types)
REGISTER_GETTER(BatchSize)
REGISTER_GETTER(TotalSize)
REGISTER_GETTER(Labels)
REGISTER_GETTER(Weights)
BIN twml/libtwml/src/ops/hashed_data_record.docx (new file, binary not shown)
@ -1,260 +0,0 @@

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/work_sharder.h"

#include <twml.h>
#include "tensorflow_utils.h"

using namespace tensorflow;

void ComputeHashingDiscretizer(
    OpKernelContext*,
    int64_t,
    const twml::Map<int64_t, int64_t> &,
    int64_t,
    int64_t,
    int64_t);

REGISTER_OP("HashingDiscretizer")
  .Attr("T: {float, double}")
  .Input("input_ids: int64")
  .Input("input_vals: T")
  .Input("bin_vals: T")
  .Attr("feature_ids: tensor = { dtype: DT_INT64 }")
  .Attr("n_bin: int")
  .Attr("output_bits: int")
  .Attr("cost_per_unit: int")
  .Attr("options: int")
  .Output("new_keys: int64")
  .Output("new_vals: T")
  .SetShapeFn(
    [](::tensorflow::shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));
      c->set_output(1, c->input(1));
      return Status::OK();
    }
  )
  .Doc(R"doc(

This operation discretizes a tensor containing continuous features (if calibrated).
- note - the choice of float or double should be consistent among inputs/output

Input
  input_ids(int64): A tensor containing input feature ids (direct from the data record).
  input_vals(float/double): A tensor containing input values at the corresponding feature ids.
    - i.e. input_ids[i] <-> input_vals[i] for each i
  bin_vals(float/double): A tensor containing the bin boundaries for values of a given feature.
    - float or double, matching input_vals
  feature_ids(int64 attr): 1D TensorProto of feature IDs seen during calibration
    -> hint: look up make_tensor_proto:
         proto_init = np.array(values, dtype=np.int64)
         tensor_attr = tf.make_tensor_proto(proto_init)
  n_bin(int): The number of bin boundary values per feature
    -> hence, n_bin + 1 buckets for each feature
  output_bits(int): The maximum number of bits to use for the output IDs.
  cost_per_unit(int): An estimate of the number of CPU cycles (or nanoseconds
    if not CPU-bound) to complete a unit of work. Overestimating creates too
    many shards and CPU time will be dominated by per-shard overhead, such as
    Context creation. Underestimating may not fully make use of the specified
    parallelism.
  options(int): selects the behavior of the op.
    0x00 in bits{1:0} for std::lower_bound bucket search.
    0x01 in bits{1:0} for linear bucket search
    0x02 in bits{1:0} for std::upper_bound bucket search
    0x00 in bits{4:2} for integer_multiplicative_hashing
    0x01 in bits{4:2} for integer64_multiplicative_hashing
    higher bits/other values are reserved for future extensions

Outputs
  new_keys(int64): The discretized feature ids, with the same shape and size as input_ids.
  new_vals(float or double): The discretized values, with the same shape and size as input_vals.

Operation
  Note that the discretization operation maps observation vectors to higher-dimensional
  observation vectors. Here, we describe this mapping.

  Let a calibrated feature observation be given by (F,x), where F is the ID of the
  feature, and x is some real value (i.e., continuous feature). This kind of
  representation is useful for sparse vectors, where there are many zeros.

  For example, for a dense feature vector [1.2, 2.4, 3.6], we might have
  (0, 1.2), (1, 2.4) and (2, 3.6), with feature IDs indicating the 0th, 1st, and 2nd
  elements of the vector.

  The discretizer performs the following operation:
    (F,x) -> (map(x|F),1).
  Hence, we have that map(x|F) is a new feature ID, and the value observed for that
  feature is 1. We might read map(x|F) as 'the map of x for feature F'.

  For each feature F, we associate a (discrete, finite) set of new feature IDs, newIDs(F).
  We will then have that map(x|F) is in the set newIDs(F) for any value of x. Each
  set member of newIDs(F) is associated with a 'bin', as defined by the bin
  boundaries given in the bin_vals input array. For any two different feature IDs F
  and G, we would ideally have that INTERSECT(newIDs(F),newIDs(G)) is the empty set.
  However, this is not guaranteed for this discretizer.

  In the case of this hashing discretizer, map(x|F) can actually be written as follows:
    let bucket = bucket(x|F) be the bucket index for x, according to the
    calibration on F. (This is an integer value in [0,n_bin], inclusive.)
    F is an integer ID. Here, we have that map(x|F) = hash_fn(F,bucket). This has
    the desirable property that the new ID depends only on the calibration data
    supplied for feature F, and not on any other features in the dataset (e.g.,
    the number of other features present in the calibration data, or the order of
    features in the dataset). Note that PercentileDiscretizer does NOT have this
    property. This comes at the expense of the possibility of output ID collisions,
    which we try to minimize through the design of hash_fn.

  Example - consider an input vector with a single element, i.e. [x].
  Let's discretize to one of 2 values, as follows:
    Let F=0 for the ID of the single feature in the vector.
    Let the bin boundary of feature F=0 be BNDRY(F) = BNDRY(0) since F=0
    bucket = bucket(x|F=0) = 0 if x<=BNDRY(0) else 1
    Let map(x|F) = hash_fn(F=0,bucket=0) if x<=BNDRY(0) else hash_fn(F=0,bucket=1)
  If we had another element y in the vector, i.e. [x, y], then we might additionally
    Let F=1 for element y.
    Let the bin boundary be BNDRY(F) = BNDRY(1) since F=1
    bucket = bucket(y|F=1) = 0 if y<=BNDRY(1) else 1
    Let map(y|F) = hash_fn(F=1,bucket=0) if y<=BNDRY(1) else hash_fn(F=1,bucket=1)
  Note how the construction of map(y|F=1) does not depend on whether map(x|F=0)
  was constructed.
)doc");

template<typename T>
class HashingDiscretizer : public OpKernel {
 public:
  explicit HashingDiscretizer(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("n_bin", &n_bin_));
    OP_REQUIRES(context,
                n_bin_ > 0,
                errors::InvalidArgument("Must have n_bin_ > 0."));

    OP_REQUIRES_OK(context,
                   context->GetAttr("output_bits", &output_bits_));
    OP_REQUIRES(context,
                output_bits_ > 0,
                errors::InvalidArgument("Must have output_bits_ > 0."));

    OP_REQUIRES_OK(context,
                   context->GetAttr("cost_per_unit", &cost_per_unit_));
    OP_REQUIRES(context,
                cost_per_unit_ >= 0,
                errors::InvalidArgument("Must have cost_per_unit >= 0."));

    OP_REQUIRES_OK(context,
                   context->GetAttr("options", &options_));

    // construct the ID_to_index hash map
    Tensor feature_IDs;

    // extract the tensor
    OP_REQUIRES_OK(context,
                   context->GetAttr("feature_ids", &feature_IDs));

    // for access to the data
    // the int64_t data type is set in the to_layer function of the calibrator objects in Python
    auto feature_IDs_flat = feature_IDs.flat<int64>();

    // verify proper dimension constraints
    OP_REQUIRES(context,
                feature_IDs.shape().dims() == 1,
                errors::InvalidArgument("feature_ids must be 1D."));

    // reserve space in the hash map and fill in the values
    int64_t num_features = feature_IDs.shape().dim_size(0);
#ifdef USE_DENSE_HASH
    ID_to_index_.set_empty_key(0);
    ID_to_index_.resize(num_features);
#else
    ID_to_index_.reserve(num_features);
#endif  // USE_DENSE_HASH
    for (int64_t i = 0; i < num_features; i++) {
      ID_to_index_[feature_IDs_flat(i)] = i;
    }
  }

  void Compute(OpKernelContext* context) override {
    ComputeHashingDiscretizer(
        context,
        output_bits_,
        ID_to_index_,
        n_bin_,
        cost_per_unit_,
        options_);
  }

 private:
  twml::Map<int64_t, int64_t> ID_to_index_;
  int n_bin_;
  int output_bits_;
  int cost_per_unit_;
  int options_;
};

#define REGISTER(Type)                    \
  REGISTER_KERNEL_BUILDER(                \
      Name("HashingDiscretizer")          \
          .Device(DEVICE_CPU)             \
          .TypeConstraint<Type>("T"),     \
      HashingDiscretizer<Type>);

REGISTER(float);
REGISTER(double);

void ComputeHashingDiscretizer(
    OpKernelContext* context,
    int64_t output_bits,
    const twml::Map<int64_t, int64_t> &ID_to_index,
    int64_t n_bin,
    int64_t cost_per_unit,
    int64_t options) {
  const Tensor& keys = context->input(0);
  const Tensor& vals = context->input(1);
  const Tensor& bin_vals = context->input(2);

  const int64 output_size = keys.dim_size(0);

  TensorShape output_shape;
  OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(&output_size, 1, &output_shape));

  Tensor* new_keys = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &new_keys));
  Tensor* new_vals = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, &new_vals));

  try {
    twml::Tensor out_keys_ = TFTensor_to_twml_tensor(*new_keys);
    twml::Tensor out_vals_ = TFTensor_to_twml_tensor(*new_vals);

    const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys);
    const twml::Tensor in_vals_ = TFTensor_to_twml_tensor(vals);
    const twml::Tensor bin_vals_ = TFTensor_to_twml_tensor(bin_vals);

    // retrieve the thread pool from the op context
    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());

    // Definition of the computation thread
    auto task = [&](int64 start, int64 limit) {
      twml::hashDiscretizerInfer(out_keys_, out_vals_,
                                 in_keys_, in_vals_,
                                 n_bin,
                                 bin_vals_,
                                 output_bits,
                                 ID_to_index,
                                 start, limit,
                                 options);
    };

    // let Tensorflow split up the work as it sees fit
    Shard(worker_threads.num_threads,
          worker_threads.workers,
          output_size,
          static_cast<int64>(cost_per_unit),
          task);
  } catch (const std::exception &e) {
    context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
  }
}
BIN twml/libtwml/src/ops/hashing_discretizer.docx (new file, binary not shown)
@ -1,84 +0,0 @@

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>

#include <mutex>

using namespace tensorflow;

REGISTER_OP("Hashmap")
  .Input("keys: int64")
  .Input("hash_keys: int64")
  .Input("hash_values: int64")
  .Output("values: int64")
  .Output("mask: int8")
  .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
    // TODO: check if the sizes are different in the input
    c->set_output(0, c->input(0));
    c->set_output(1, c->input(0));
    return Status::OK();
  });
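
// The kernel below builds its lookup table from hash_keys/hash_values only
// once (std::call_once); later calls reuse the same twml::HashMap. For each
// input key, mask appears to flag whether the key was found via
// twml::HashMap::get, with `values` then holding the mapped value.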
|
||||
|
||||
|
||||
class Hashmap : public OpKernel {
|
||||
private:
|
||||
twml::HashMap hmap;
|
||||
std::once_flag flag;
|
||||
|
||||
public:
|
||||
explicit Hashmap(OpKernelConstruction* context) : OpKernel(context) {}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
try {
|
||||
// Quick hack
|
||||
const Tensor& keys = context->input(0);
|
||||
|
||||
std::call_once(this->flag, [this, context](){
|
||||
const Tensor& hash_keys = context->input(1);
|
||||
const Tensor& hash_values = context->input(2);
|
||||
const auto hash_keys_flat = hash_keys.flat<int64>();
|
||||
const auto hash_values_flat = hash_values.flat<int64>();
|
||||
const int64 N = hash_keys_flat.size();
|
||||
|
||||
for (int64 i = 0; i < N; i++) {
|
||||
hmap.insert(hash_keys_flat(i), hash_values_flat(i));
|
||||
}
|
||||
});
|
||||
|
||||
Tensor* values = nullptr;
|
||||
OP_REQUIRES_OK(context, context->allocate_output(0, keys.shape(),
|
||||
&values));
|
||||
|
||||
Tensor* mask = nullptr;
|
||||
OP_REQUIRES_OK(context, context->allocate_output(1, keys.shape(),
|
||||
&mask));
|
||||
|
||||
// copy the values without sharing a storage
|
||||
values->flat<int64>() = keys.flat<int64>();
|
||||
|
||||
auto keys_flat = keys.flat<int64>();
|
||||
auto values_flat = values->flat<int64>();
|
||||
auto mask_flat = mask->flat<int8>();
|
||||
|
||||
// TODO: use twml tensor
|
||||
const int64 N = keys_flat.size();
|
||||
for (int64 i = 0; i < N; i++) {
|
||||
// values_flat(i), keys_flat(i) return references to tensorflow::int64.
|
||||
// Using them in hmap.get() was causing issues because of automatic casting.
|
||||
int64_t val = values_flat(i);
|
||||
int64_t key = keys_flat(i);
|
||||
mask_flat(i) = hmap.get(val, key);
|
||||
values_flat(i) = val;
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL_BUILDER(
|
||||
Name("Hashmap")
|
||||
.Device(DEVICE_CPU),
|
||||
Hashmap);
|
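For context, a hedged sketch of driving this op from Python: the op name and signature follow REGISTER_OP("Hashmap") above, but the shared-library path is an assumption.

import tensorflow.compat.v1 as tf

twml_ops = tf.load_op_library("libtwml_tf.so")  # assumed library path

keys = tf.constant([10, 20, 30], dtype=tf.int64)
hash_keys = tf.constant([10, 30], dtype=tf.int64)
hash_values = tf.constant([1, 3], dtype=tf.int64)

# values holds the mapped value where found (else the original key);
# mask is 1 where the key was found in the map, 0 otherwise.
values, mask = twml_ops.hashmap(keys, hash_keys, hash_values)
with tf.Session() as sess:
    v, m = sess.run([values, mask])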
BIN
twml/libtwml/src/ops/hashmap.docx
Normal file
Binary file not shown.
@ -1,81 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"

using namespace tensorflow;

REGISTER_OP("IsotonicCalibration")
    .Attr("T: {float, double}")
    .Input("input: T")
    .Input("xs: T")
    .Input("ys: T")
    .Output("output: T")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
      // output shape should be the same as input shape.
      c->set_output(0, c->input(0));
      return Status::OK();
    }).Doc(R"doc(

This operation calibrates probabilities by fitting them to a piece-wise non-decreasing function.

Input
  input: A tensor containing uncalibrated probabilities.
  xs: A tensor containing the boundaries of the bins.
  ys: A tensor containing calibrated values for the corresponding bins.

Expected Sizes:
  input: [batch_size, num_labels].
  xs, ys: [num_labels, num_bins].

Expected Types:
  input: float or double.
  xs, ys: same as input.

Outputs
  output: A tensor containing calibrated probabilities with the same shape and size as input.

)doc");

template<typename T>
class IsotonicCalibration : public OpKernel {
 public:
  explicit IsotonicCalibration(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& xs = context->input(1);
    const Tensor& ys = context->input(2);

    Tensor* output = nullptr;
    OP_REQUIRES_OK(
        context,
        context->allocate_output(0, input.shape(), &output));

    try {
      const twml::Tensor twml_input = TFTensor_to_twml_tensor(input);
      const twml::Tensor twml_xs = TFTensor_to_twml_tensor(xs);
      const twml::Tensor twml_ys = TFTensor_to_twml_tensor(ys);
      twml::Tensor twml_output = TFTensor_to_twml_tensor(*output);

      twml::linearInterpolation(twml_output, twml_input, twml_xs, twml_ys);
    } catch (const std::exception &e) {
      context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
    }
  }
};

#define REGISTER(Type)                      \
                                            \
  REGISTER_KERNEL_BUILDER(                  \
      Name("IsotonicCalibration")           \
          .Device(DEVICE_CPU)               \
          .TypeConstraint<Type>("T"),       \
      IsotonicCalibration<Type>);           \

REGISTER(float);
REGISTER(double);
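The kernel delegates to twml::linearInterpolation, which for each label looks up the input value against that label's bin boundaries xs and linearly interpolates the calibrated ys. A hedged NumPy sketch of that per-label mapping (names are illustrative; this is not the twml implementation, and clamping behavior at the ends is an assumption):

import numpy as np

def calibrate(inputs, xs, ys):
    # inputs: [batch, num_labels]; xs, ys: [num_labels, num_bins].
    # Piecewise-linearly interpolate each label's column over (xs, ys),
    # clamping to the first/last calibrated value outside the bin range.
    out = np.empty_like(inputs)
    for j in range(inputs.shape[1]):
        out[:, j] = np.interp(inputs[:, j], xs[j], ys[j])
    return out

probs = np.array([[0.05, 0.9], [0.5, 0.2]])
xs = np.array([[0.0, 0.5, 1.0], [0.0, 0.5, 1.0]])
ys = np.array([[0.1, 0.4, 0.8], [0.2, 0.5, 0.9]])
print(calibrate(probs, xs, ys))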
BIN
twml/libtwml/src/ops/isotonic_calibration.docx
Normal file
Binary file not shown.
@ -1,39 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/common_shape_fns.h"

using namespace tensorflow;

REGISTER_OP("NumIntraOpThreads")
    .Input("x: float32")
    .Output("num_intra_op_threads: int32")
    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
    .Doc(R"doc(
A TensorFlow OP that returns the number of threads in the intra_op_parallelism pool.
This is not exposed by the TensorFlow API as of this writing, so a custom
operation is the best recourse.
Input
  x: Dummy placeholder so that constant folding is not done by the TF GraphOptimizer.
     See https://github.com/tensorflow/tensorflow/issues/22546 for more details.
Output
  num_intra_op_threads: A scalar tensor holding the number of threads in
  the intra_op_parallelism pool.
)doc");

class NumIntraOpThreads : public OpKernel {
 public:
  explicit NumIntraOpThreads(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    int num_intra_op_threads = context->device()->tensorflow_cpu_worker_threads()->num_threads;
    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output_tensor));
    auto output_flat = output_tensor->flat<int32>();
    output_flat(0) = num_intra_op_threads;
  }
};

REGISTER_KERNEL_BUILDER(Name("NumIntraOpThreads").Device(DEVICE_CPU), NumIntraOpThreads);
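A hedged usage sketch (library path assumed; op name follows REGISTER_OP): feed any float tensor as the dummy input and read back the scalar thread count.

import tensorflow.compat.v1 as tf

twml_ops = tf.load_op_library("libtwml_tf.so")  # assumed library path

# The float input exists only to defeat constant folding (see the doc above).
dummy = tf.placeholder(tf.float32, shape=[1])
n = twml_ops.num_intra_op_threads(dummy)

with tf.Session() as sess:
    print(sess.run(n, feed_dict={dummy: [0.0]}))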
BIN
twml/libtwml/src/ops/num_intra_op_threads.docx
Normal file
Binary file not shown.
@ -1,75 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/work_sharder.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/logging.h"
#include <iostream>

#include <vector>

using namespace tensorflow;

REGISTER_OP("ParAdd")
    .Input("input_a: float")
    .Input("input_b: float")
    .Output("a_plus_b: float")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
      c->set_output(0, c->input(0));
      return Status::OK();
    });

class ParAddOp : public OpKernel {
 public:
  explicit ParAddOp(OpKernelConstruction* context) : OpKernel(context) {
  }

  void Compute(OpKernelContext* context) override {
    // grab the input tensors
    const Tensor& input_tensor0 = context->input(0);
    auto input_flat0 = input_tensor0.flat<float>();
    const Tensor& input_tensor1 = context->input(1);
    auto input_flat1 = input_tensor1.flat<float>();

    OP_REQUIRES(context, input_tensor0.shape() == input_tensor1.shape(),
                errors::InvalidArgument("Input tensors must have identical shapes."));

    // create an output tensor
    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0,
                                            input_tensor0.shape(),
                                            &output_tensor));
    auto output_flat = output_tensor->flat<float>();

    // PARALLEL ADD
    const int N = input_flat0.size();

    // retrieve the thread pool from the op context
    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());

    // Definition of the computation thread
    auto task = [=, &input_flat0, &input_flat1, &output_flat](int64 start, int64 limit) {
      for (; start < limit; ++start) {
        output_flat(start) = input_flat0(start) + input_flat1(start);
      }
    };

    // cost_per_unit is a heuristic: a high value makes the work likely
    // to be sharded into smaller pieces
    int64 cost_per_unit = 1;

    // let Tensorflow split up the work as it sees fit
    Shard(worker_threads.num_threads,
          worker_threads.workers,
          N,
          cost_per_unit,
          task);
  }
};

REGISTER_KERNEL_BUILDER(Name("ParAdd").Device(DEVICE_CPU), ParAddOp);
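A hedged sketch of exercising ParAdd from Python (the op name follows REGISTER_OP above; the library path is an assumption):

import tensorflow.compat.v1 as tf

twml_ops = tf.load_op_library("libtwml_tf.so")  # assumed library path

a = tf.constant([1.0, 2.0, 3.0])
b = tf.constant([10.0, 20.0, 30.0])
c = twml_ops.par_add(a, b)  # element-wise sum, sharded across the pool

with tf.Session() as sess:
    print(sess.run(c))  # [11. 22. 33.]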
BIN
twml/libtwml/src/ops/par_add.docx
Normal file
Binary file not shown.
@ -1,125 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"

#include <twml.h>
#include "tensorflow_utils.h"

using namespace tensorflow;

REGISTER_OP("PartitionSparseTensorMod")
    .Attr("T: {float, double}")
    .Input("indices: int64")
    .Input("values: T")
    .Output("result: output_types")
    .Attr("num_partitions: int")
    .Attr("output_types: list({int64, float, double})")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
      return Status::OK();
    }).Doc(R"doc(

A TensorFlow OP that partitions an input batch represented as a sparse tensor
(indices are [ids, keys]) into separate sparse tensors, to more optimally place
sparse computations in distributed training.

Inputs
  indices: Indices from the sparse tensor ([ids, keys] from the batch).
  values: Batch values from the original features dict.

Attr
  num_partitions: Number of partitions to generate.
  output_types: A list of types for the output tensors, like
    [tf.int64, tf.float32, tf.int64, tf.float32, ...]
    The length must be 2 * num_partitions (see Outputs below).

Outputs
  List of dense tensors containing, for each partition:
    - the partitioned indices tensor ([ids, keys] from the partitioned batch)
    - the partitioned values tensor
  The list length is 2 * num_partitions. Example:
    [ [ids_1, keys_1], values_1, [ids_2, keys_2], values_2, ... ]
)doc");

template<typename T>
class PartitionSparseTensorMod : public OpKernel {
 private:
  int64 num_partitions;

 public:
  explicit PartitionSparseTensorMod(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("num_partitions", &num_partitions));
    OP_REQUIRES(context, num_partitions > 0,
                errors::InvalidArgument("Number of partitions must be positive"));
  }

  void Compute(OpKernelContext* context) override {
    // grab input tensors
    const Tensor& indices_tensor = context->input(0);  // (ids, keys)
    const Tensor& values_tensor = context->input(1);

    // check sizes
    int64 num_keys = indices_tensor.shape().dim_size(0);
    OP_REQUIRES(context, indices_tensor.dims() == 2,
                errors::InvalidArgument("Indices tensor must be 2D [ids, keys]"));
    OP_REQUIRES(context, indices_tensor.shape().dim_size(1) == 2,
                errors::InvalidArgument("Indices tensor must have 2 cols [ids, keys]"));
    OP_REQUIRES(context, values_tensor.shape().dim_size(0) == num_keys,
                errors::InvalidArgument("Number of values must match number of keys"));

    // grab input vectors
    auto indices = indices_tensor.flat<int64>();
    auto values = values_tensor.flat<T>();

    // count the number of features that fall in each partition
    std::vector<int64> partition_counts(num_partitions);

    for (int i = 0; i < num_keys; i++) {
      int64 key = indices(2 * i + 1);
      int64 partition_id = key % num_partitions;
      partition_counts[partition_id]++;
    }

    // allocate outputs for each partition and keep references
    std::vector<int64*> output_indices_partitions;
    std::vector<T*> output_values_partitions;
    output_indices_partitions.reserve(num_partitions);
    output_values_partitions.reserve(num_partitions);

    for (int i = 0; i < num_partitions; i++) {
      Tensor *output_indices = nullptr, *output_values = nullptr;
      TensorShape shape_indices = TensorShape({partition_counts[i], 2});
      TensorShape shape_values = TensorShape({partition_counts[i]});

      OP_REQUIRES_OK(context, context->allocate_output(2 * i, shape_indices, &output_indices));
      OP_REQUIRES_OK(context, context->allocate_output(2 * i + 1, shape_values, &output_values));

      output_indices_partitions.push_back(output_indices->flat<int64>().data());
      output_values_partitions.push_back(output_values->flat<T>().data());
    }

    // assign a partition id to each feature and
    // populate the tensors for each partition
    std::vector<int64> partition_indices(num_partitions);

    for (int i = 0; i < num_keys; i++) {
      int64 key = indices(2 * i + 1);
      int64 pid = key % num_partitions;  // partition id
      int64 idx = partition_indices[pid]++;

      output_indices_partitions[pid][2 * idx] = indices(2 * i);
      output_indices_partitions[pid][2 * idx + 1] = key / num_partitions;
      output_values_partitions[pid][idx] = values(i);
    }
  }
};

#define REGISTER(Type)                          \
                                                \
  REGISTER_KERNEL_BUILDER(                      \
      Name("PartitionSparseTensorMod")          \
          .Device(DEVICE_CPU)                   \
          .TypeConstraint<Type>("T"),           \
      PartitionSparseTensorMod<Type>);          \

REGISTER(float);
REGISTER(double);
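The partitioning rule is easy to state in NumPy: a key goes to partition key % num_partitions and is renumbered within that partition as key // num_partitions. A hedged sketch of just that indexing logic (illustrative helper, not the op itself):

import numpy as np

def partition_mod(indices, values, num_partitions):
    # indices: [n, 2] array of (id, key); values: [n]. Returns a list of
    # (indices, values) pairs, one per partition, mirroring the op's outputs.
    keys = indices[:, 1]
    out = []
    for pid in range(num_partitions):
        sel = keys % num_partitions == pid
        part = indices[sel].copy()
        part[:, 1] //= num_partitions  # renumber keys within the partition
        out.append((part, values[sel]))
    return out

idx = np.array([[0, 5], [0, 6], [1, 7]])
vals = np.array([0.5, 0.6, 0.7])
for part_idx, part_vals in partition_mod(idx, vals, 2):
    print(part_idx.tolist(), part_vals.tolist())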
BIN
twml/libtwml/src/ops/partition_sparse_tensor.docx
Normal file
Binary file not shown.
@ -1,241 +0,0 @@
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/work_sharder.h"

#include <twml.h>
#include "tensorflow_utils.h"

using namespace tensorflow;

void CombinedComputeDiscretizers(
    OpKernelContext*,
    int64_t,
    const twml::Map<int64_t, int64_t>&,
    int64_t);

REGISTER_OP("PercentileDiscretizerV2")
    .Attr("T: {float, double}")
    .Input("input_ids: int64")
    .Input("input_vals: T")
    .Input("bin_ids: int64")
    .Input("bin_vals: T")
    .Input("feature_offsets: int64")
    .Input("start_compute: int64")
    .Input("end_compute: int64")
    .Attr("output_bits: int")
    .Attr("feature_ids: tensor = { dtype: DT_INT64 }")
    .Attr("feature_indices: tensor = { dtype: DT_INT64 }")
    .Attr("cost_per_unit: int")
    .Output("new_keys: int64")
    .Output("new_vals: T")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
      // TODO: check sizes
      c->set_output(0, c->input(0));
      c->set_output(1, c->input(0));
      return Status::OK();
    }).Doc(R"doc(

This operation discretizes a tensor containing continuous features (if calibrated).
- note - the choice of float or double should be consistent among inputs/output

Input
  input_ids(int64): A tensor containing input feature ids (direct from the data record).
  input_vals: A tensor containing input values at the corresponding feature ids.
    - i.e. input_ids[i] <-> input_vals[i] for each i
    - float or double
  bin_ids(int64): A tensor containing the discretized feature id for each bin.
  bin_vals: A tensor containing the bin boundaries for values of a given feature.
    - float or double
  feature_offsets(int64): Specifies the starting location of bins for a given feature id.
  start_compute(int64 scalar tensor): the index at which to start the computation
  end_compute(int64 scalar tensor): the index right before which to end the computation
    -> for example, (start_compute, end_compute) = (0, 10) would compute on 0 through 9
  output_bits(int): The maximum number of bits to use for the output IDs.
    -> 2**output_bits must be greater than bin_ids.size
  feature_ids(int64): 1D TensorProto of feature IDs seen during calibration
  feature_indices(int64): 1D TensorProto of feature indices corresponding with feature_ids
    -> hint: look up make_tensor_proto:
       proto_init = np.array(values, dtype=np.int64)
       tensor_attr = tf.make_tensor_proto(proto_init)
  cost_per_unit(int): An estimate of the number of CPU cycles (or nanoseconds
    if not CPU-bound) to complete a unit of work. Overestimating creates too
    many shards and CPU time will be dominated by per-shard overhead, such as
    Context creation. Underestimating may not fully make use of the specified
    parallelism.

Outputs
  new_keys(int64): The discretized feature ids, with the same shape and size as keys.
  new_vals(float or double): The discretized values, with the same shape and size as vals.

Operation
  Note that the discretization operation maps observation vectors to higher-dimensional
  observation vectors. Here, we describe this mapping.

  Let a calibrated feature observation be given by (F, x), where F is the ID of the
  feature and x is some real value (i.e., a continuous feature). This kind of
  representation is useful for sparse vectors, where there are many zeros.

  For example, for a dense feature vector [1.2, 2.4, 3.6], we might have
  (0, 1.2), (1, 2.4) and (2, 3.6), with feature IDs indicating the 0th, 1st, and 2nd
  elements of the vector.

  The discretizer performs the following operation:
    (F, x) -> (map(x|F), 1).
  Hence, map(x|F) is a new feature ID, and the value observed for that
  feature is 1. We might read map(x|F) as 'the map of x for feature F'.

  For each feature F, we associate a (discrete, finite) set of new feature IDs, newIDs(F).
  map(x|F) is in the set newIDs(F) for any value of x. Each member
  of newIDs(F) is associated with a 'bin', as defined by the bin boundaries given in
  the bin_vals input array. For any two different feature IDs F and G,
  INTERSECT(newIDs(F), newIDs(G)) is the empty set.

  Example - consider an input vector with a single element, i.e. [x].
  Let's discretize to one of 2 values, as follows:
    Let F=0 for the ID of the single feature in the vector.
    Let the bin boundary of feature F=0 be BNDRY(F) = BNDRY(0) since F=0.
    Let newIDs(F) = newIDs(0) = {0,1}.
    Let map(x|F) = map(x|0) = 0 if x<=BNDRY else 1.
  If we had another element y in the vector, i.e. [x, y], then we might additionally
    Let F=1 for element y.
    Let the bin boundary be BNDRY(F) = BNDRY(1) since F=1.
    Let newIDs(F) = newIDs(1) = {2,3} (so as to have an empty intersection with newIDs(0)).
    Let map(x|F) = map(x|1) = 2 if x<=BNDRY else 3.
  Consider the vector observation [-0.1, 0.2]. We then represent this as [(0, -0.1), (1, 0.2)].
  Let BNDRY(0) = BNDRY(1) = 0. When we discretize the vector, we get:
    (0, -0.1) -> (map(-0.1|0), 1) = (0, 1)
    (1,  0.2) -> (map( 0.2|1), 1) = (3, 1)
  Our output vector is then represented sparsely as [(0, 1), (3, 1)], and the dense
  representation of this could be [1, 0, 0, 1].

)doc");

template<typename T>
class PercentileDiscretizerV2 : public OpKernel {
 public:
  explicit PercentileDiscretizerV2(OpKernelConstruction* context) : OpKernel(context) {
    // get the number of output bits,
    // for use with features that have not been calibrated
    OP_REQUIRES_OK(context,
                   context->GetAttr("output_bits", &output_bits_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("cost_per_unit", &cost_per_unit_));
    OP_REQUIRES(context, cost_per_unit_ >= 0,
                errors::InvalidArgument("Must have cost_per_unit >= 0."));

    // construct the ID_to_index hash map
    Tensor feature_IDs;
    Tensor feature_indices;

    // extract the tensors
    OP_REQUIRES_OK(context,
                   context->GetAttr("feature_ids", &feature_IDs));
    OP_REQUIRES_OK(context,
                   context->GetAttr("feature_indices", &feature_indices));

    // for access to the data;
    // the int64_t data type is set in the to_layer function of the calibrator objects in Python
    auto feature_IDs_flat = feature_IDs.flat<int64>();
    auto feature_indices_flat = feature_indices.flat<int64>();

    // verify proper dimension constraints
    OP_REQUIRES(context, feature_IDs.shape() == feature_indices.shape(),
                errors::InvalidArgument("feature_ids and feature_indices must be identical shape."));
    OP_REQUIRES(context, feature_IDs.shape().dims() == 1,
                errors::InvalidArgument("feature_ids and feature_indices must be 1D."));

    // reserve space in the hash map and fill in the values
    int num_features = feature_IDs.shape().dim_size(0);

#ifdef USE_DENSE_HASH
    ID_to_index_.set_empty_key(0);
    ID_to_index_.resize(num_features);
#else
    ID_to_index_.reserve(num_features);
#endif  // USE_DENSE_HASH
    for (int i = 0; i < num_features; i++) {
      ID_to_index_[feature_IDs_flat(i)] = feature_indices_flat(i);
    }
  }

  void Compute(OpKernelContext* context) override {
    CombinedComputeDiscretizers(
        context,
        output_bits_,
        ID_to_index_,
        cost_per_unit_);
  }

 private:
  twml::Map<int64_t, int64_t> ID_to_index_;
  int output_bits_;
  int cost_per_unit_;
};

#define REGISTER(Type)                        \
  REGISTER_KERNEL_BUILDER(                    \
      Name("PercentileDiscretizerV2")         \
          .Device(DEVICE_CPU)                 \
          .TypeConstraint<Type>("T"),         \
      PercentileDiscretizerV2<Type>);         \

REGISTER(float);
REGISTER(double);

void CombinedComputeDiscretizers(
    OpKernelContext* context,
    int64_t output_bits,
    const twml::Map<int64_t, int64_t> &ID_to_index,
    int64_t cost_per_unit) {
  const Tensor& keys = context->input(0);
  const Tensor& vals = context->input(1);
  const Tensor& bin_ids = context->input(2);
  const Tensor& bin_vals = context->input(3);
  const Tensor& feature_offsets = context->input(4);

  uint64 full_size = keys.dim_size(0);
  const int64 total_size = static_cast<int64>(full_size);
  TensorShape output_shape = {total_size};

  Tensor* new_keys = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &new_keys));
  Tensor* new_vals = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, &new_vals));

  try {
    twml::Tensor out_keys_ = TFTensor_to_twml_tensor(*new_keys);
    twml::Tensor out_vals_ = TFTensor_to_twml_tensor(*new_vals);

    const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys);
    const twml::Tensor in_vals_ = TFTensor_to_twml_tensor(vals);
    const twml::Tensor bin_ids_ = TFTensor_to_twml_tensor(bin_ids);
    const twml::Tensor bin_vals_ = TFTensor_to_twml_tensor(bin_vals);
    const twml::Tensor feature_offsets_ = TFTensor_to_twml_tensor(feature_offsets);

    // retrieve the thread pool from the op context
    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());

    // Definition of the computation thread
    auto task = [&](int64 start, int64 limit) {
      twml::discretizerInfer(out_keys_, out_vals_,
                             in_keys_, in_vals_,
                             bin_ids_, bin_vals_,
                             feature_offsets_, output_bits,
                             ID_to_index,
                             start, limit,
                             start);
    };

    // let Tensorflow split up the work as it sees fit
    Shard(worker_threads.num_threads,
          worker_threads.workers,
          full_size,
          static_cast<int64>(cost_per_unit),
          task);
  } catch (const std::exception &e) {
    context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
  }
}
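The worked example in the doc string is easy to check numerically. A hedged NumPy sketch of map(x|F) for the two-feature case above (illustrative only; the real op also handles calibration tables, feature offsets, and output_bits):

import numpy as np

def discretize(ids, vals, bndry, bins_per_feature=2):
    # ids, vals: parallel lists of (feature id, value).
    # Each feature F owns new IDs {F*bins_per_feature, F*bins_per_feature+1};
    # map(x|F) picks the bin of x against that feature's boundary.
    new_keys = []
    for f, x in zip(ids, vals):
        base = f * bins_per_feature
        new_keys.append(base if x <= bndry[f] else base + 1)
    return np.array(new_keys), np.ones(len(ids))

keys, ones = discretize([0, 1], [-0.1, 0.2], bndry={0: 0.0, 1: 0.0})
print(list(zip(keys.tolist(), ones.tolist())))  # [(0, 1.0), (3, 1.0)]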
BIN
twml/libtwml/src/ops/percentile_discretizer_v2.docx
Normal file
Binary file not shown.
BIN
twml/libtwml/src/ops/resource_utils.docx
Normal file
Binary file not shown.
@ -1,126 +0,0 @@
#pragma once

#include <twml.h>

#include <atomic>
#include <string>
#include <vector>

// Add these to make gcc ignore the warnings from tensorflow.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-compare"

#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/resource_op_kernel.h"

#pragma GCC diagnostic pop

#include <memory>
#include <functional>

template<typename T>
void unrefHandle(T *handle) {
  handle->Unref();
}

template <typename T>
using unique_handle = std::unique_ptr<T, std::function<void(T *)> >;

// As std::type_index is not ABI compatible, we bypass the hash_code checks.
// https://github.com/tensorflow/tensorflow/commit/15275d3a14c77e2244ae1155f93243256f08e3ed
#ifdef __APPLE__
template <typename T>
Status CreateTwmlResource(OpKernelContext* ctx, const ResourceHandle& p, T* value) {
  return ctx->resource_manager()->Create(p.container(), p.name(), value);
}

template <typename T>
Status LookupTwmlResource(OpKernelContext* ctx, const ResourceHandle& p,
                          T** value) {
  return ctx->resource_manager()->Lookup(p.container(), p.name(), value);
}
#endif  // __APPLE__

template<typename T>
unique_handle<T> getHandle(tensorflow::OpKernelContext* context, int input_idx) {
  using namespace tensorflow;
  T *ptr = nullptr;
#ifdef __APPLE__
  auto s = LookupTwmlResource(context, HandleFromInput(context, input_idx), &ptr);
#else
  auto s = LookupResource(context, HandleFromInput(context, input_idx), &ptr);
#endif  // __APPLE__

  if (!s.ok()) {
    throw std::runtime_error("Failed to get resource handle");
  }
  return unique_handle<T>(ptr, unrefHandle<T>);
}

template<typename InputType>
const uint8_t *getInputBytes(const Tensor &input, int id) {
  return reinterpret_cast<const uint8_t *>(input.flat<InputType>().data());
}

template<>
inline const uint8_t *getInputBytes<string>(const Tensor &input, int id) {
  return reinterpret_cast<const uint8_t *>(input.flat<string>()(id).c_str());
}

template<typename InputType>
const int getBatchSize(const Tensor &input) {
  return 1;
}

template<>
inline const int getBatchSize<string>(const Tensor &input) {
  return static_cast<int>(input.NumElements());
}

class DataRecordResource : public ResourceBase {
 public:
  Tensor input;
  int64 num_labels;
  int64 num_weights;
  twml::DataRecord common;
  std::vector<twml::DataRecord> records;
  twml::Map<int64_t, int64_t> *keep_map;
  string DebugString() const override { return "DataRecords resource"; }
};

// A thin layer around a batch of HashedDataRecords
class HashedDataRecordResource : public ResourceBase {
 public:
  Tensor input;
  int64 total_size;
  int64 num_labels;
  int64 num_weights;
  twml::HashedDataRecord common;
  std::vector<twml::HashedDataRecord> records;
  string DebugString() const override { return "HashedDataRecord Resource"; }
};

#define TF_CHECK_STATUS(fn) do {  \
  Status s = fn;                  \
  if (!s.ok()) return s;          \
} while (0)

template<typename ResourceType>
Status makeResourceHandle(OpKernelContext* context, int out_idx, ResourceType **resource_) {
  static std::atomic<int64> id;
  Tensor* handle_tensor;
  TF_CHECK_STATUS(context->allocate_output(out_idx, TensorShape({}), &handle_tensor));

  ResourceType *resource = new ResourceType();
  const auto resource_name = typeid(ResourceType).name() + std::to_string(id++);
  ResourceHandle handle = MakePerStepResourceHandle<ResourceType>(context, resource_name);
#ifdef __APPLE__
  TF_CHECK_STATUS(CreateTwmlResource(context, handle, resource));
#else
  TF_CHECK_STATUS(CreateResource(context, handle, resource));
#endif  // __APPLE__
  handle_tensor->scalar<ResourceHandle>()() = handle;

  *resource_ = resource;
  return Status::OK();
}
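unique_handle pairs each looked-up resource with an Unref deleter, so the reference count drops as soon as the handle leaves scope. A hedged Python analogue of that RAII pattern (illustrative only; TensorFlow's Python API manages resource lifetimes differently):

import contextlib

class RefCounted:
    def __init__(self, name):
        self.name, self.refs = name, 1
    def unref(self):
        self.refs -= 1
        print(f"{self.name}: refs={self.refs}")

@contextlib.contextmanager
def unique_handle(resource):
    # Yield the resource and unref it on scope exit, mirroring
    # unique_handle<T>'s custom deleter above.
    try:
        yield resource
    finally:
        resource.unref()

with unique_handle(RefCounted("DataRecordResource")) as r:
    print("using", r.name)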
BIN
twml/libtwml/src/ops/scripts/get_inc.docx
Normal file
Binary file not shown.
@ -1,5 +0,0 @@
"""Gets the path of headers for the current Tensorflow library"""

import tensorflow.compat.v1 as tf

print(tf.sysconfig.get_include(), end='')
@ -1,2 +0,0 @@
#!/bin/sh
PEX_INTERPRETER=1 "$PYTHON_ENV" "$LIBTWML_HOME"/src/ops/scripts/get_inc.py
BIN
twml/libtwml/src/ops/scripts/get_lib.docx
Normal file
Binary file not shown.
@ -1,5 +0,0 @@
"""Gets the path of the current Tensorflow library"""

import tensorflow.compat.v1 as tf

print(tf.sysconfig.get_lib(), end='')
@ -1,2 +0,0 @@
#!/bin/sh
PEX_INTERPRETER=1 "$PYTHON_ENV" "$LIBTWML_HOME"/src/ops/scripts/get_lib.py
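These two script pairs exist so the build can locate TensorFlow's headers and shared library. A hedged sketch of how such paths are typically turned into compile flags for a custom op (an assumed usage pattern, not taken from this diff; tf.sysconfig also exposes ready-made flag lists):

import tensorflow.compat.v1 as tf

compile_flags = tf.sysconfig.get_compile_flags()  # includes -I<get_include()>
link_flags = tf.sysconfig.get_link_flags()        # includes -L<get_lib()>

# illustrative build command for a hypothetical my_op.cc
print("g++ -std=c++11 -shared my_op.cc -o my_op.so -fPIC",
      " ".join(compile_flags), " ".join(link_flags))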
BIN
twml/libtwml/src/ops/scripts/symlink.docx
Normal file
Binary file not shown.
Some files were not shown because too many files have changed in this diff.