diff --git a/twml/libtwml/src/lib/TensorRecordWriter.cpp b/twml/libtwml/src/lib/TensorRecordWriter.cpp deleted file mode 100644 index b1fe98e64..000000000 --- a/twml/libtwml/src/lib/TensorRecordWriter.cpp +++ /dev/null @@ -1,162 +0,0 @@ -#include "internal/error.h" -#include "internal/thrift.h" - -#include -#include -#include -#include - -using namespace twml::io; - -namespace twml { - -static int32_t getRawThriftType(twml_type dtype) { - // convert twml enum to tensor.thrift enum - switch (dtype) { - case TWML_TYPE_FLOAT: - return DATA_TYPE_FLOAT; - case TWML_TYPE_DOUBLE: - return DATA_TYPE_DOUBLE; - case TWML_TYPE_INT64: - return DATA_TYPE_INT64; - case TWML_TYPE_INT32: - return DATA_TYPE_INT32; - case TWML_TYPE_UINT8: - return DATA_TYPE_UINT8; - case TWML_TYPE_STRING: - return DATA_TYPE_STRING; - case TWML_TYPE_BOOL: - return DATA_TYPE_BOOL; - default: - throw IOError(IOError::UNSUPPORTED_OUTPUT_TYPE); - } -} - -void TensorRecordWriter::writeTensor(const RawTensor &tensor) { - if (tensor.getType() == TWML_TYPE_INT32) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_INT32); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_I32, tensor.getNumElements()); - - const int32_t *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeInt32(data[i]); - - } else if (tensor.getType() == TWML_TYPE_INT64) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_INT64); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_I64, tensor.getNumElements()); - - const int64_t *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeInt64(data[i]); - - } else if (tensor.getType() == TWML_TYPE_FLOAT) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_FLOAT); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_DOUBLE, tensor.getNumElements()); - - const float *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeDouble(static_cast(data[i])); - - } else if (tensor.getType() == TWML_TYPE_DOUBLE) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_DOUBLE); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_DOUBLE, tensor.getNumElements()); - - const double *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeDouble(data[i]); - - } else if (tensor.getType() == TWML_TYPE_STRING) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_STRING); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_STRING, tensor.getNumElements()); - - const std::string *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeString(data[i]); - - } else if (tensor.getType() == TWML_TYPE_BOOL) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_BOOL); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_BOOL, tensor.getNumElements()); - - const bool *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeBool(data[i]); - - } else { - throw IOError(IOError::UNSUPPORTED_OUTPUT_TYPE); - } - - // write tensor shape field - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 2); - m_thrift_writer.writeListHeader(TTYPE_I64, 
tensor.getNumDims()); - - for (uint64_t i = 0; i < tensor.getNumDims(); i++) - m_thrift_writer.writeInt64(tensor.getDim(i)); - - m_thrift_writer.writeStructStop(); - m_thrift_writer.writeStructStop(); -} - -void TensorRecordWriter::writeRawTensor(const RawTensor &tensor) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_RAW); - - // dataType field - m_thrift_writer.writeStructFieldHeader(TTYPE_I32, 1); - m_thrift_writer.writeInt32(getRawThriftType(tensor.getType())); - - // content field - uint64_t type_size = getSizeOf(tensor.getType()); - m_thrift_writer.writeStructFieldHeader(TTYPE_STRING, 2); - const uint8_t *data = reinterpret_cast(tensor.getData()); - m_thrift_writer.writeBinary(data, tensor.getNumElements() * type_size); - - // shape field - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 3); - m_thrift_writer.writeListHeader(TTYPE_I64, tensor.getNumDims()); - - for (uint64_t i = 0; i < tensor.getNumDims(); i++) - m_thrift_writer.writeInt64(tensor.getDim(i)); - - m_thrift_writer.writeStructStop(); - m_thrift_writer.writeStructStop(); -} - -TWMLAPI uint32_t TensorRecordWriter::getRecordsWritten() { - return m_records_written; -} - -// Caller (usually DataRecordWriter) must precede with struct header field -// like thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_GENERAL_TENSOR) -TWMLAPI uint64_t TensorRecordWriter::write(twml::TensorRecord &record) { - uint64_t bytes_written_before = m_thrift_writer.getBytesWritten(); - - m_thrift_writer.writeMapHeader(TTYPE_I64, TTYPE_STRUCT, record.getRawTensors().size()); - - for (auto id_tensor_pairs : record.getRawTensors()) { - m_thrift_writer.writeInt64(id_tensor_pairs.first); - - // all tensors written as RawTensor Thrift except for StringTensors - // this avoids the overhead of converting little endian to big endian - if (id_tensor_pairs.second.getType() == TWML_TYPE_STRING) - writeTensor(id_tensor_pairs.second); - else - writeRawTensor(id_tensor_pairs.second); - } - - m_records_written++; - - return m_thrift_writer.getBytesWritten() - bytes_written_before; -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/TensorRecordWriter.docx b/twml/libtwml/src/lib/TensorRecordWriter.docx new file mode 100644 index 000000000..4b888c5f3 Binary files /dev/null and b/twml/libtwml/src/lib/TensorRecordWriter.docx differ diff --git a/twml/libtwml/src/lib/ThriftReader.cpp b/twml/libtwml/src/lib/ThriftReader.cpp deleted file mode 100644 index bceb74c13..000000000 --- a/twml/libtwml/src/lib/ThriftReader.cpp +++ /dev/null @@ -1,33 +0,0 @@ -#include "internal/endianutils.h" - -#include -#include - -#include - -namespace twml { - -uint8_t ThriftReader::readByte() { - return readDirect(); -} - -int16_t ThriftReader::readInt16() { - return betoh16(readDirect()); -} - -int32_t ThriftReader::readInt32() { - return betoh32(readDirect()); -} - -int64_t ThriftReader::readInt64() { - return betoh64(readDirect()); -} - -double ThriftReader::readDouble() { - double val; - int64_t *val_proxy = reinterpret_cast(&val); - *val_proxy = readInt64(); - return val; -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/ThriftReader.docx b/twml/libtwml/src/lib/ThriftReader.docx new file mode 100644 index 000000000..3cc7e8e20 Binary files /dev/null and b/twml/libtwml/src/lib/ThriftReader.docx differ diff --git a/twml/libtwml/src/lib/ThriftWriter.cpp b/twml/libtwml/src/lib/ThriftWriter.cpp deleted file mode 100644 index 4f298a154..000000000 --- a/twml/libtwml/src/lib/ThriftWriter.cpp +++ /dev/null @@ -1,91 +0,0 @@ -#include "internal/endianutils.h" 
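
Both the deleted ThriftReader above and the ThriftWriter that follows move doubles over the wire as their 8-byte big-endian integer bit pattern (readDouble reinterprets the result of readInt64; writeDouble does the inverse). A minimal standalone sketch of that round trip, with a hand-rolled bswap64 standing in for the betoh64/htobe64 helpers from endianutils.h (and assuming a little-endian host):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Stand-in for htobe64/betoh64 from internal/endianutils.h: on a
// little-endian host, host<->big-endian conversion is a byte swap.
static uint64_t bswap64(uint64_t v) {
  uint64_t r = 0;
  for (int i = 0; i < 8; ++i) r = (r << 8) | ((v >> (8 * i)) & 0xffu);
  return r;
}

int main() {
  const double in = 3.14159;
  uint64_t bits, wire, host;
  std::memcpy(&bits, &in, sizeof(bits));  // writer: reinterpret double bits as an integer
  wire = bswap64(bits);                   // writer: emit big-endian on the wire
  host = bswap64(wire);                   // reader: convert back to host order (readInt64)
  double out;
  std::memcpy(&out, &host, sizeof(out));  // reader: reinterpret as double (readDouble)
  assert(out == in);
  return 0;
}
```
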
-#include "internal/error.h" -#include "internal/thrift.h" - -#include -#include -#include - -#include - -using namespace twml::io; - -namespace twml { - -template inline -uint64_t ThriftWriter::write(T val) { - if (!m_dry_run) { - if (m_bytes_written + sizeof(T) > m_buffer_size) - throw IOError(IOError::DESTINATION_LARGER_THAN_CAPACITY); - memcpy(m_buffer, &val, sizeof(T)); - m_buffer += sizeof(T); - } - m_bytes_written += sizeof(T); - return sizeof(T); -} - -TWMLAPI uint64_t ThriftWriter::getBytesWritten() { - return m_bytes_written; -} - -TWMLAPI uint64_t ThriftWriter::writeStructFieldHeader(int8_t field_type, int16_t field_id) { - return writeInt8(field_type) + writeInt16(field_id); -} - -TWMLAPI uint64_t ThriftWriter::writeStructStop() { - return writeInt8(static_cast(TTYPE_STOP)); -} - -TWMLAPI uint64_t ThriftWriter::writeListHeader(int8_t element_type, int32_t num_elems) { - return writeInt8(element_type) + writeInt32(num_elems); -} - -TWMLAPI uint64_t ThriftWriter::writeMapHeader(int8_t key_type, int8_t val_type, int32_t num_elems) { - return writeInt8(key_type) + writeInt8(val_type) + writeInt32(num_elems); -} - -TWMLAPI uint64_t ThriftWriter::writeDouble(double val) { - int64_t bin_value; - memcpy(&bin_value, &val, sizeof(int64_t)); - return writeInt64(bin_value); -} - -TWMLAPI uint64_t ThriftWriter::writeInt8(int8_t val) { - return write(val); -} - -TWMLAPI uint64_t ThriftWriter::writeInt16(int16_t val) { - return write(betoh16(val)); -} - -TWMLAPI uint64_t ThriftWriter::writeInt32(int32_t val) { - return write(betoh32(val)); -} - -TWMLAPI uint64_t ThriftWriter::writeInt64(int64_t val) { - return write(betoh64(val)); -} - -TWMLAPI uint64_t ThriftWriter::writeBinary(const uint8_t *bytes, int32_t num_bytes) { - writeInt32(num_bytes); - - if (!m_dry_run) { - if (m_bytes_written + num_bytes > m_buffer_size) - throw IOError(IOError::DESTINATION_LARGER_THAN_CAPACITY); - memcpy(m_buffer, bytes, num_bytes); - m_buffer += num_bytes; - } - m_bytes_written += num_bytes; - - return 4 + num_bytes; -} - -TWMLAPI uint64_t ThriftWriter::writeString(std::string str) { - return writeBinary(reinterpret_cast(str.data()), str.length()); -} - -TWMLAPI uint64_t ThriftWriter::writeBool(bool val) { - return write(val); -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/ThriftWriter.docx b/twml/libtwml/src/lib/ThriftWriter.docx new file mode 100644 index 000000000..c696cf11c Binary files /dev/null and b/twml/libtwml/src/lib/ThriftWriter.docx differ diff --git a/twml/libtwml/src/lib/discretizer_impl.cpp b/twml/libtwml/src/lib/discretizer_impl.cpp deleted file mode 100644 index 3f161341e..000000000 --- a/twml/libtwml/src/lib/discretizer_impl.cpp +++ /dev/null @@ -1,167 +0,0 @@ -#include "internal/interpolate.h" -#include "internal/error.h" -#include -#include - -namespace twml { - // it is assumed that start_compute and end_compute are valid - template - void discretizerInfer(Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_ids, - const Tensor &input_vals, - const Tensor &bin_ids, - const Tensor &bin_vals, - const Tensor &feature_offsets, - int output_bits, - const Map &ID_to_index, - int64_t start_compute, - int64_t end_compute, - int64_t output_start) { - auto out_keysData = output_keys.getData(); - auto out_valsData = output_vals.getData(); - uint64_t out_keysStride = output_keys.getStride(0); - uint64_t out_valsStride = output_vals.getStride(0); - - auto in_idsData = input_ids.getData(); - auto in_valsData = input_vals.getData(); - uint64_t in_idsStride = 
input_ids.getStride(0); - uint64_t in_valsStride = input_vals.getStride(0); - - auto xsData = bin_vals.getData(); - auto ysData = bin_ids.getData(); - uint64_t xsStride = bin_vals.getStride(0); - uint64_t ysStride = bin_ids.getStride(0); - - auto offsetData = feature_offsets.getData(); - - uint64_t total_bins = bin_ids.getNumElements(); - uint64_t fsize = feature_offsets.getNumElements(); - - uint64_t output_size = (1 << output_bits); - - for (uint64_t i = start_compute; i < end_compute; i++) { - int64_t feature_ID = in_idsData[i * in_idsStride]; - T val = in_valsData[i * in_valsStride]; - - auto iter = ID_to_index.find(feature_ID); - if (iter == ID_to_index.end()) { - // feature not calibrated - // modulo add operation for new key from feature ID - int64_t ikey = feature_ID % (output_size - total_bins) + total_bins; - out_keysData[(i + output_start - start_compute) * out_keysStride] = ikey; - out_valsData[(i + output_start - start_compute) * out_valsStride] = val; - continue; - } - - int64_t ikey = iter->second; - - // Perform interpolation - uint64_t offset = offsetData[ikey]; - uint64_t next_offset = (ikey == (int64_t)(fsize - 1)) ? total_bins : offsetData[ikey + 1]; - uint64_t mainSize = next_offset - offset; - - const T *lxsData = xsData + offset; - const int64_t *lysData = ysData + offset; - int64_t okey; - okey = interpolation(lxsData, xsStride, - lysData, ysStride, - val, mainSize, - NEAREST, 0); - out_keysData[(i + output_start - start_compute) * out_keysStride] = okey; - out_valsData[(i + output_start - start_compute) * out_valsStride] = 1; - } - } - - void discretizerInfer(Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_ids, - const Tensor &input_vals, - const Tensor &bin_ids, - const Tensor &bin_vals, - const Tensor &feature_offsets, - int output_bits, - const Map &ID_to_index, - int start_compute, - int end_compute, - int output_start) { - if (input_ids.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "input_ids must be a Long Tensor"); - } - - if (output_keys.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor"); - } - - if (bin_ids.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "bin_ids must be a Long Tensor"); - } - - if (feature_offsets.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "bin_ids must be a Long Tensor"); - } - - if (input_vals.getType() != bin_vals.getType()) { - throw twml::Error(TWML_ERR_TYPE, - "Data type of input_vals does not match type of bin_vals"); - } - - if (bin_vals.getNumDims() != 1) { - throw twml::Error(TWML_ERR_SIZE, - "bin_vals must be 1 Dimensional"); - } - - if (bin_ids.getNumDims() != 1) { - throw twml::Error(TWML_ERR_SIZE, - "bin_ids must be 1 Dimensional"); - } - - if (bin_vals.getNumElements() != bin_ids.getNumElements()) { - throw twml::Error(TWML_ERR_SIZE, - "Dimensions of bin_vals and bin_ids do not match"); - } - - if (feature_offsets.getStride(0) != 1) { - throw twml::Error(TWML_ERR_SIZE, - "feature_offsets must be contiguous"); - } - - uint64_t size = input_ids.getDim(0); - if (end_compute == -1) { - end_compute = size; - } - - if (start_compute < 0 || start_compute >= size) { - throw twml::Error(TWML_ERR_SIZE, - "start_compute out of range"); - } - - if (end_compute < -1 || end_compute > size) { - throw twml::Error(TWML_ERR_SIZE, - "end_compute out of range"); - } - - if (start_compute > end_compute && end_compute != -1) { - throw twml::Error(TWML_ERR_SIZE, - "must have start_compute <= end_compute, or 
end_compute==-1"); - } - - switch (input_vals.getType()) { - case TWML_TYPE_FLOAT: - twml::discretizerInfer(output_keys, output_vals, - input_ids, input_vals, - bin_ids, bin_vals, feature_offsets, output_bits, ID_to_index, - start_compute, end_compute, output_start); - break; - case TWML_TYPE_DOUBLE: - twml::discretizerInfer(output_keys, output_vals, - input_ids, input_vals, - bin_ids, bin_vals, feature_offsets, output_bits, ID_to_index, - start_compute, end_compute, output_start); - break; - default: - throw twml::Error(TWML_ERR_TYPE, - "Unsupported datatype for discretizerInfer"); - } - } -} // namespace twml diff --git a/twml/libtwml/src/lib/discretizer_impl.docx b/twml/libtwml/src/lib/discretizer_impl.docx new file mode 100644 index 000000000..bf06ea763 Binary files /dev/null and b/twml/libtwml/src/lib/discretizer_impl.docx differ diff --git a/twml/libtwml/src/lib/functions.cpp b/twml/libtwml/src/lib/functions.cpp deleted file mode 100644 index b7af3c0ac..000000000 --- a/twml/libtwml/src/lib/functions.cpp +++ /dev/null @@ -1,158 +0,0 @@ -#include "internal/error.h" -#include "internal/murmur_hash3.h" -#include "internal/utf_converter.h" -#include -#include -#include - -namespace twml { - - template - void add1(Tensor &output, const Tensor input) { - T *odata = output.getData(); - const T *idata = input.getData(); - const uint64_t num_elements = input.getNumElements(); - - for (uint64_t i = 0; i < num_elements; i++) { - odata[i] = idata[i] + 1; - } - } - - template - void copy(Tensor &output, const Tensor input) { - T *odata = output.getData(); - const T *idata = input.getData(); - const uint64_t num_elements = input.getNumElements(); - - for (uint64_t i = 0; i < num_elements; i++) { - odata[i] = idata[i]; - } - } - - void add1(Tensor &output, const Tensor input) { - auto type = input.getType(); - if (output.getType() != type) { - throw twml::Error(TWML_ERR_TYPE, "Output type does not match input type"); - } - - if (output.getNumElements() != input.getNumElements()) { - throw twml::Error(TWML_ERR_SIZE, "Output size does not match input size"); - } - - // TODO: Implement an easier dispatch function - switch (type) { - case TWML_TYPE_FLOAT: - twml::add1(output, input); - break; - case TWML_TYPE_DOUBLE: - twml::add1(output, input); - break; - default: - throw twml::Error(TWML_ERR_TYPE, "add1 only supports float and double tensors"); - } - } - - void copy(Tensor &output, const Tensor input) { - auto type = input.getType(); - if (output.getType() != type) { - throw twml::Error(TWML_ERR_TYPE, "Output type does not match input type"); - } - - if (output.getNumElements() != input.getNumElements()) { - throw twml::Error(TWML_ERR_SIZE, "Output size does not match input size"); - } - - // TODO: Implement an easier dispatch function - switch (type) { - case TWML_TYPE_FLOAT: - twml::copy(output, input); - break; - case TWML_TYPE_DOUBLE: - twml::copy(output, input); - break; - default: - throw twml::Error(TWML_ERR_TYPE, "copy only supports float and double tensors"); - } - } - - int64_t featureId(const std::string &feature) { - const char *str = feature.c_str(); - uint64_t len = feature.size(); - int64_t id = 0; - TWML_CHECK(twml_get_feature_id(&id, len, str), "Error getting featureId"); - return id; - } -} // namespace twml - -twml_err twml_add1(twml_tensor output, const twml_tensor input) { - HANDLE_EXCEPTIONS( - auto out = twml::getTensor(output); - auto in = twml::getConstTensor(input); - twml::add1(*out, *in);); - return TWML_ERR_NONE; -} - -twml_err twml_copy(twml_tensor output, const 
twml_tensor input) { - HANDLE_EXCEPTIONS( - auto out = twml::getTensor(output); - auto in = twml::getConstTensor(input); - twml::copy(*out, *in);); - return TWML_ERR_NONE; -} - -inline twml_err twml_get_feature_id_internal(int64_t *result, - uint64_t out_size, uint16_t *out, - uint64_t out2_size, uint16_t *out2, - const uint64_t len, const char *str) { - uint64_t k = 0; - for (uint64_t i = 0; i < len; i++) { - if (str[i] == '#') { - k = i; - break; - } - } - - uint8_t hash[16]; - if (k != 0) { - ssize_t n = utf8_to_utf16((const uint8_t *) str, k, out, out_size); - if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16"); - - MurmurHash3_x64_128(out, n * sizeof(uint16_t), 0, out2); - n = utf8_to_utf16((const uint8_t *) (str + k + 1), len - k - 1, &out2[4], out2_size - 8); - if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16"); - - MurmurHash3_x64_128(out2, (n * sizeof(uint16_t)) + 8, 0, hash); - } else { - ssize_t n = utf8_to_utf16((const uint8_t *)str, len, out, out_size); - if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16"); - MurmurHash3_x64_128(out, n * sizeof(uint16_t), 0, hash); - } - int64_t id; - memcpy(&id, hash, sizeof(int64_t)); - *result = id; - - return TWML_ERR_NONE; -} - -static const int UTF16_STR_MAX_SIZE = 1024; - -twml_err twml_get_feature_id(int64_t *result, const uint64_t len, const char *str) { - try { - uint16_t out[UTF16_STR_MAX_SIZE]; - uint16_t out2[UTF16_STR_MAX_SIZE]; - return twml_get_feature_id_internal(result, - UTF16_STR_MAX_SIZE, out, - UTF16_STR_MAX_SIZE, out2, - len, str); - } catch(const std::invalid_argument &ex) { - // If the space on the stack is not enough, try using the heap. - // len + 1 is needed because a null terminating character is added at the end. - std::vector out(len + 1); - std::vector out2(len + 1); - return twml_get_feature_id_internal(result, - len + 1, out.data(), - len + 1, out2.data(), - len, str); - - } -} diff --git a/twml/libtwml/src/lib/functions.docx b/twml/libtwml/src/lib/functions.docx new file mode 100644 index 000000000..59d6e825e Binary files /dev/null and b/twml/libtwml/src/lib/functions.docx differ diff --git a/twml/libtwml/src/lib/hashing_discretizer_impl.cpp b/twml/libtwml/src/lib/hashing_discretizer_impl.cpp deleted file mode 100644 index 166242ffb..000000000 --- a/twml/libtwml/src/lib/hashing_discretizer_impl.cpp +++ /dev/null @@ -1,241 +0,0 @@ -#include "internal/linear_search.h" -#include "internal/error.h" -#include -#include -#include - -namespace twml { - template - static int64_t lower_bound_search(const Tx *data, const Tx val, const int64_t buf_size) { - auto index_temp = std::lower_bound(data, data + buf_size, val); - return static_cast(index_temp - data); - } - - template - static int64_t upper_bound_search(const Tx *data, const Tx val, const int64_t buf_size) { - auto index_temp = std::upper_bound(data, data + buf_size, val); - return static_cast(index_temp - data); - } - - template - using search_method = int64_t (*)(const Tx *, const Tx, const int64_t); - - typedef uint64_t (*hash_signature)(uint64_t, int64_t, uint64_t); - - // uint64_t integer_multiplicative_hashing() - // - // A function to hash discretized feature_ids into one of 2**output_bits buckets. 
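
Stepping back to twml_get_feature_id / twml::featureId deleted just above: a usage-level sketch of the '#' convention it implements. The include path is an assumption; only the featureId signature comes from the deleted source.

```cpp
#include <cstdint>
#include <cstdio>
#include <twml/functions.h>  // assumed location of the twml::featureId declaration

int main() {
  // Names containing '#' are hashed as two segments (group, then remainder),
  // as in the deleted twml_get_feature_id_internal(); plain names take the
  // single-segment path.
  int64_t grouped = twml::featureId("user.engagement#clicks");
  int64_t plain   = twml::featureId("user.engagement");
  std::printf("%lld %lld\n", (long long)grouped, (long long)plain);
  return 0;
}
```
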
- // This function hashes the feature_ids to achieve a uniform distribution of - // IDs, so the hashed IDs are with high probability far apart - // Then, bucket_indices can simply be added, resulting in unique new IDs with high probability - // We integer hash again to again spread out the new IDs - // Finally we take the upper - // Required args: - // feature_id: - // The feature id of the feature to be hashed. - // bucket_index: - // The bucket index of the discretized feature value - // output_bits: - // The number of bits of output space for the features to be hashed into. - // - // Note - feature_ids may have arbitrary distribution within int32s - // Note - 64 bit feature_ids can be processed with this, but the upper - // 32 bits have no effect on the output - // e.g. all feature ids 0 through 255 exist in movie-lens. - // this hashing constant is good for 32 LSBs. will use N=32. (can use N<32 also) - // this hashing constant is co-prime with 2**32, therefore we have that - // a != b, a and b in [0,2**32) - // implies - // f(a) != f(b) where f(x) = (hashing_constant * x) % (2**32) - // note that we are mostly ignoring the upper 32 bits, using modulo 2**32 arithmetic - uint64_t integer_multiplicative_hashing(uint64_t feature_id, - int64_t bucket_index, - uint64_t output_bits) { - // possibly use 14695981039346656037 for 64 bit unsigned?? - // = 20921 * 465383 * 1509404459 - // alternatively, 14695981039346656039 is prime - // We would also need to use N = 64 - const uint64_t hashing_constant = 2654435761; - const uint64_t N = 32; - // hash once to prevent problems from anomalous input id distributions - feature_id *= hashing_constant; - feature_id += bucket_index; - // this hash enables the following right shift operation - // without losing the bucket information (lower bits) - feature_id *= hashing_constant; - // output size is a power of 2 - feature_id >>= N - output_bits; - uint64_t mask = (1 << output_bits) - 1; - return mask & feature_id; - } - - uint64_t integer64_multiplicative_hashing(uint64_t feature_id, - int64_t bucket_index, - uint64_t output_bits) { - const uint64_t hashing_constant = 14695981039346656039UL; - const uint64_t N = 64; - // hash once to prevent problems from anomalous input id distributions - feature_id *= hashing_constant; - feature_id += bucket_index; - // this hash enables the following right shift operation - // without losing the bucket information (lower bits) - feature_id *= hashing_constant; - // output size is a power of 2 - feature_id >>= N - output_bits; - uint64_t mask = (1 << output_bits) - 1; - return mask & feature_id; - } - - int64_t option_bits(int64_t options, int64_t high, int64_t low) { - options >>= low; - options &= (1 << (high - low + 1)) - 1; - return options; - } - - // it is assumed that start_compute and end_compute are valid - template - void hashDiscretizerInfer(Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_ids, - const Tensor &input_vals, - const Tensor &bin_vals, - int output_bits, - const Map &ID_to_index, - int64_t start_compute, - int64_t end_compute, - int64_t n_bin, - int64_t options) { - auto output_keys_data = output_keys.getData(); - auto output_vals_data = output_vals.getData(); - - auto input_ids_data = input_ids.getData(); - auto input_vals_data = input_vals.getData(); - - auto bin_vals_data = bin_vals.getData(); - - // The function pointer implementation removes the option_bits - // function call (might be inlined) and corresponding branch from - // the hot loop, but it prevents inlining these 
functions, so - // there will be function call overhead. Uncertain which would - // be faster, testing needed. Also, code optimizers do weird things... - hash_signature hash_fn = integer_multiplicative_hashing; - switch (option_bits(options, 4, 2)) { - case 0: - hash_fn = integer_multiplicative_hashing; - break; - case 1: - hash_fn = integer64_multiplicative_hashing; - break; - default: - hash_fn = integer_multiplicative_hashing; - } - - search_method search_fn = lower_bound_search; - switch (option_bits(options, 1, 0)) { - case 0: - search_fn = lower_bound_search; - break; - case 1: - search_fn = linear_search; - break; - case 2: - search_fn = upper_bound_search; - break; - default: - search_fn = lower_bound_search; - } - - for (uint64_t i = start_compute; i < end_compute; i++) { - int64_t id = input_ids_data[i]; - T val = input_vals_data[i]; - - auto iter = ID_to_index.find(id); - if (iter != ID_to_index.end()) { - int64_t feature_idx = iter->second; - const T *bin_vals_start = bin_vals_data + feature_idx * n_bin; - int64_t out_bin_idx = search_fn(bin_vals_start, val, n_bin); - output_keys_data[i] = hash_fn(id, out_bin_idx, output_bits); - output_vals_data[i] = 1; - } else { - // feature not calibrated - output_keys_data[i] = id & ((1 << output_bits) - 1); - output_vals_data[i] = val; - } - } - } - - void hashDiscretizerInfer(Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_ids, - const Tensor &input_vals, - int n_bin, - const Tensor &bin_vals, - int output_bits, - const Map &ID_to_index, - int start_compute, - int end_compute, - int64_t options) { - if (input_ids.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "input_ids must be a Long Tensor"); - } - - if (output_keys.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor"); - } - - if (input_vals.getType() != bin_vals.getType()) { - throw twml::Error(TWML_ERR_TYPE, - "Data type of input_vals does not match type of bin_vals"); - } - - if (bin_vals.getNumDims() != 1) { - throw twml::Error(TWML_ERR_SIZE, - "bin_vals must be 1 Dimensional"); - } - - uint64_t size = input_ids.getDim(0); - if (end_compute == -1) { - end_compute = size; - } - - if (start_compute < 0 || start_compute >= size) { - throw twml::Error(TWML_ERR_SIZE, - "start_compute out of range"); - } - - if (end_compute < -1 || end_compute > size) { - throw twml::Error(TWML_ERR_SIZE, - "end_compute out of range"); - } - - if (start_compute > end_compute && end_compute != -1) { - throw twml::Error(TWML_ERR_SIZE, - "must have start_compute <= end_compute, or end_compute==-1"); - } - - if (output_keys.getStride(0) != 1 || output_vals.getStride(0) != 1 || - input_ids.getStride(0) != 1 || input_vals.getStride(0) != 1 || - bin_vals.getStride(0) != 1) { - throw twml::Error(TWML_ERR_SIZE, - "All Strides must be 1."); - } - - switch (input_vals.getType()) { - case TWML_TYPE_FLOAT: - twml::hashDiscretizerInfer(output_keys, output_vals, - input_ids, input_vals, - bin_vals, output_bits, ID_to_index, - start_compute, end_compute, n_bin, options); - break; - case TWML_TYPE_DOUBLE: - twml::hashDiscretizerInfer(output_keys, output_vals, - input_ids, input_vals, - bin_vals, output_bits, ID_to_index, - start_compute, end_compute, n_bin, options); - break; - default: - throw twml::Error(TWML_ERR_TYPE, - "Unsupported datatype for hashDiscretizerInfer"); - } - } -} // namespace twml diff --git a/twml/libtwml/src/lib/hashing_discretizer_impl.docx b/twml/libtwml/src/lib/hashing_discretizer_impl.docx new file mode 
100644 index 000000000..8bcfaa71d Binary files /dev/null and b/twml/libtwml/src/lib/hashing_discretizer_impl.docx differ diff --git a/twml/libtwml/src/lib/internal/endianutils.docx b/twml/libtwml/src/lib/internal/endianutils.docx new file mode 100644 index 000000000..954dd71c9 Binary files /dev/null and b/twml/libtwml/src/lib/internal/endianutils.docx differ diff --git a/twml/libtwml/src/lib/internal/endianutils.h b/twml/libtwml/src/lib/internal/endianutils.h deleted file mode 100644 index 3b27797d7..000000000 --- a/twml/libtwml/src/lib/internal/endianutils.h +++ /dev/null @@ -1,137 +0,0 @@ -// -// endian_fix.h -// ImageCore -// -// For OSes that use glibc < 2.9 (like RHEL5) -// -#pragma once - -#ifdef __APPLE__ -#include -#define htobe16(x) OSSwapHostToBigInt16(x) -#define htole16(x) OSSwapHostToLittleInt16(x) -#define betoh16(x) OSSwapBigToHostInt16(x) -#define letoh16(x) OSSwapLittleToHostInt16(x) -#define htobe32(x) OSSwapHostToBigInt32(x) -#define htole32(x) OSSwapHostToLittleInt32(x) -#define betoh32(x) OSSwapBigToHostInt32(x) -#define letoh32(x) OSSwapLittleToHostInt32(x) -#define htobe64(x) OSSwapHostToBigInt64(x) -#define htole64(x) OSSwapHostToLittleInt64(x) -#define betoh64(x) OSSwapBigToHostInt64(x) -#define letoh64(x) OSSwapLittleToHostInt64(x) -#else -#include -#ifdef __USE_BSD -/* Conversion interfaces. */ -#include - -#if __BYTE_ORDER == __LITTLE_ENDIAN -#ifndef htobe16 -#define htobe16(x) __bswap_16(x) -#endif -#ifndef htole16 -#define htole16(x) (x) -#endif -#ifndef betoh16 -#define betoh16(x) __bswap_16(x) -#endif -#ifndef letoh16 -#define letoh16(x) (x) -#endif - -#ifndef htobe32 -#define htobe32(x) __bswap_32(x) -#endif -#ifndef htole32 -#define htole32(x) (x) -#endif -#ifndef betoh32 -#define betoh32(x) __bswap_32(x) -#endif -#ifndef letoh32 -#define letoh32(x) (x) -#endif - -#ifndef htobe64 -#define htobe64(x) __bswap_64(x) -#endif -#ifndef htole64 -#define htole64(x) (x) -#endif -#ifndef betoh64 -#define betoh64(x) __bswap_64(x) -#endif -#ifndef letoh64 -#define letoh64(x) (x) -#endif - -#else /* __BYTE_ORDER == __LITTLE_ENDIAN */ -#ifndef htobe16 -#define htobe16(x) (x) -#endif -#ifndef htole16 -#define htole16(x) __bswap_16(x) -#endif -#ifndef be16toh -#define be16toh(x) (x) -#endif -#ifndef le16toh -#define le16toh(x) __bswap_16(x) -#endif - -#ifndef htobe32 -#define htobe32(x) (x) -#endif -#ifndef htole32 -#define htole32(x) __bswap_32(x) -#endif -#ifndef betoh32 -#define betoh32(x) (x) -#endif -#ifndef letoh32 -#define letoh32(x) __bswap_32(x) -#endif - -#ifndef htobe64 -#define htobe64(x) (x) -#endif -#ifndef htole64 -#define htole64(x) __bswap_64(x) -#endif -#ifndef betoh64 -#define betoh64(x) (x) -#endif -#ifndef letoh64 -#define letoh64(x) __bswap_64(x) -#endif - -#endif /* __BYTE_ORDER == __LITTLE_ENDIAN */ - -#else /* __USE_BSD */ -#ifndef betoh16 -#define betoh16 be16toh -#endif - -#ifndef betoh32 -#define betoh32 be32toh -#endif - -#ifndef betoh64 -#define betoh64 be64toh -#endif - -#ifndef letoh16 -#define letoh16 le16toh -#endif - -#ifndef letoh32 -#define letoh32 le32toh -#endif - -#ifndef letoh64 -#define letoh64 le64toh -#endif - -#endif /* __USE_BSD */ -#endif /* __APPLE__ */ diff --git a/twml/libtwml/src/lib/internal/error.docx b/twml/libtwml/src/lib/internal/error.docx new file mode 100644 index 000000000..b9ea1c028 Binary files /dev/null and b/twml/libtwml/src/lib/internal/error.docx differ diff --git a/twml/libtwml/src/lib/internal/error.h b/twml/libtwml/src/lib/internal/error.h deleted file mode 100644 index 3d1bc5441..000000000 --- 
a/twml/libtwml/src/lib/internal/error.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once -#include -#include - -#define HANDLE_EXCEPTIONS(fn) do { \ - try { \ - fn \ - } catch(const twml::Error &e) { \ - std::cerr << e.what() << std::endl; \ - return e.err(); \ - } catch(...) { \ - std::cerr << "Unknown error\n"; \ - return TWML_ERR_UNKNOWN; \ - } \ - } while(0) - -#define TWML_CHECK(fn, msg) do { \ - twml_err err = fn; \ - if (err == TWML_ERR_NONE) break; \ - throw twml::Error(err, msg); \ - } while(0) - - -#define CHECK_THRIFT_TYPE(real_type, expected_type, type) do { \ - int real_type_val = real_type; \ - if (real_type_val != expected_type) { \ - throw twml::ThriftInvalidType(real_type_val, __func__, type); \ - } \ - } while(0) diff --git a/twml/libtwml/src/lib/internal/interpolate.docx b/twml/libtwml/src/lib/internal/interpolate.docx new file mode 100644 index 000000000..71b2458ea Binary files /dev/null and b/twml/libtwml/src/lib/internal/interpolate.docx differ diff --git a/twml/libtwml/src/lib/internal/interpolate.h b/twml/libtwml/src/lib/internal/interpolate.h deleted file mode 100644 index 3e1daf53e..000000000 --- a/twml/libtwml/src/lib/internal/interpolate.h +++ /dev/null @@ -1,74 +0,0 @@ -#pragma once - -#ifdef __cplusplus -#include -namespace twml { - - enum InterpolationMode {LINEAR, NEAREST}; - - template - static Tx interpolation(const Tx *xsData, const int64_t xsStride, - const Ty *ysData, const int64_t ysStride, - const Tx val, const int64_t mainSize, - const InterpolationMode mode, - const int64_t lowest, - const bool return_local_index = false) { - int64_t left = 0; - int64_t right = mainSize-1; - - if (val <= xsData[0]) { - right = 0; - } else if (val >= xsData[right*xsStride]) { - left = right; - } else { - while (left < right) { - int64_t middle = (left+right)/2; - - if (middle < mainSize - 1 && - val >= xsData[middle*xsStride] && - val <= xsData[(middle+1)*xsStride]) { - left = middle; - right = middle + 1; - break; - } else if (val > xsData[middle*xsStride]) { - left = middle; - } else { - right = middle; - } - } - if (lowest) { - while (left > 0 && - val >= xsData[(left - 1) * xsStride] && - val == xsData[left * xsStride]) { - left--; - right--; - } - } - } - - Ty out = 0; - if (return_local_index) { - out = left; - } else if (mode == NEAREST) { - out = ysData[left*ysStride]; - } else { - int64_t leftys = left*ysStride; - int64_t rightys = right*ysStride; - int64_t leftxs = left*xsStride; - int64_t rightxs = right*xsStride; - if (right != left+1 || - xsData[leftxs] == xsData[rightxs]) { - out = ysData[leftys]; - } else { - Tx xLeft = xsData[leftxs]; - Tx xRight = xsData[rightxs]; - Tx yLeft = ysData[leftys]; - Tx ratio = (val - xLeft) / (xRight - xLeft); - out = ratio*(ysData[rightys] - yLeft) + yLeft; - } - } - return out; - } - -} // namespace twml -#endif diff --git a/twml/libtwml/src/lib/internal/khash.docx b/twml/libtwml/src/lib/internal/khash.docx new file mode 100644 index 000000000..20baeb8a5 Binary files /dev/null and b/twml/libtwml/src/lib/internal/khash.docx differ diff --git a/twml/libtwml/src/lib/internal/khash.h b/twml/libtwml/src/lib/internal/khash.h deleted file mode 100644 index c9075cbbc..000000000 --- a/twml/libtwml/src/lib/internal/khash.h +++ /dev/null @@ -1,627 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including 
- without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - An example: - -#include "khash.h" -KHASH_MAP_INIT_INT(32, char) -int main() { - int ret, is_missing; - khiter_t k; - khash_t(32) *h = kh_init(32); - k = kh_put(32, h, 5, &ret); - kh_value(h, k) = 10; - k = kh_get(32, h, 10); - is_missing = (k == kh_end(h)); - k = kh_get(32, h, 5); - kh_del(32, h, k); - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k)) kh_value(h, k) = 1; - kh_destroy(32, h); - return 0; -} -*/ - -/* - 2013-05-02 (0.2.8): - - * Use quadratic probing. When the capacity is power of 2, stepping function - i*(i+1)/2 guarantees to traverse each bucket. It is better than double - hashing on cache performance and is more robust than linear probing. - - In theory, double hashing should be more robust than quadratic probing. - However, my implementation is probably not for large hash tables, because - the second hash function is closely tied to the first hash function, - which reduce the effectiveness of double hashing. - - Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php - - 2011-12-29 (0.2.7): - - * Minor code clean up; no actual effect. - - 2011-09-16 (0.2.6): - - * The capacity is a power of 2. This seems to dramatically improve the - speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - - - http://code.google.com/p/ulib/ - - http://nothings.org/computer/judy/ - - * Allow to optionally use linear probing which usually has better - performance for random input. Double hashing is still the default as it - is more robust to certain non-random input. - - * Added Wang's integer hash function (not used by default). This hash - function is more robust to certain non-random input. - - 2011-02-14 (0.2.5): - - * Allow to declare global functions. - - 2009-09-26 (0.2.4): - - * Improve portability - - 2008-09-19 (0.2.3): - - * Corrected the example - * Improved interfaces - - 2008-09-11 (0.2.2): - - * Improved speed a little in kh_put() - - 2008-09-10 (0.2.1): - - * Added kh_clear() - * Fixed a compiling error - - 2008-09-02 (0.2.0): - - * Changed to token concatenation which increases flexibility. - - 2008-08-31 (0.1.2): - - * Fixed a bug in kh_get(), which has not been tested previously. - - 2008-08-31 (0.1.1): - - * Added destructor -*/ - - -#ifndef __AC_KHASH_H -#define __AC_KHASH_H - -/*! - @header - - Generic hash table library. 
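
To make the macro API concrete, a small usage sketch with 64-bit integer keys; pairing it with libtwml's id-to-index lookups is an assumption, not something this header states.

```cpp
#include <cstdint>
#include <cstdio>
#include "internal/khash.h"

// Instantiate a map from int64 feature ids to int64 indices.
KHASH_MAP_INIT_INT64(id2idx, int64_t)

int main() {
  khash_t(id2idx) *h = kh_init(id2idx);
  int ret;
  khiter_t k = kh_put(id2idx, h, 1729, &ret);  // insert key 1729
  kh_value(h, k) = 42;
  k = kh_get(id2idx, h, 1729);
  if (k != kh_end(h))
    std::printf("index = %lld\n", (long long)kh_value(h, k));
  kh_destroy(id2idx, h);
  return 0;
}
```
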
- */ - -#define AC_VERSION_KHASH_H "0.2.8" - -#include -#include -#include - -/* compiler specific configuration */ - -#if UINT_MAX == 0xffffffffu -typedef unsigned int khint32_t; -#elif ULONG_MAX == 0xffffffffu -typedef unsigned long khint32_t; -#endif - -#if ULONG_MAX == ULLONG_MAX -typedef unsigned long khint64_t; -#else -typedef uint64_t khint64_t; -#endif - -#ifndef kh_inline -#ifdef _MSC_VER -#define kh_inline __inline -#else -#define kh_inline inline -#endif -#endif /* kh_inline */ - -#ifndef klib_unused -#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) -#define klib_unused __attribute__ ((__unused__)) -#else -#define klib_unused -#endif -#endif /* klib_unused */ - -typedef khint32_t khint_t; -typedef khint_t khiter_t; - -#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) -#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) -#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) -#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) -#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) -#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) -#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) - -#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#ifndef kcalloc -#define kcalloc(N,Z) calloc(N,Z) -#endif -#ifndef kmalloc -#define kmalloc(Z) malloc(Z) -#endif -#ifndef krealloc -#define krealloc(P,Z) realloc(P,Z) -#endif -#ifndef kfree -#define kfree(P) free(P) -#endif - -static const double __ac_HASH_UPPER = 0.77; - -#define __KHASH_TYPE(name, khkey_t, khval_t) \ - typedef struct kh_##name##_s { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; - -#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ - extern kh_##name##_t *kh_init_##name(void); \ - extern void kh_destroy_##name(kh_##name##_t *h); \ - extern void kh_clear_##name(kh_##name##_t *h); \ - extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ - extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ - extern void kh_del_##name(kh_##name##_t *h, khint_t x); - -#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ - } \ - SCOPE void kh_destroy_##name(kh_##name##_t *h) \ - { \ - if (h) { \ - kfree((void *)h->keys); kfree(h->flags); \ - kfree((void *)h->vals); \ - kfree(h); \ - } \ - } \ - SCOPE void kh_clear_##name(kh_##name##_t *h) \ - { \ - if (h && h->flags) { \ - memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ - h->size = h->n_occupied = 0; \ - } \ - } \ - SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ - { \ - if (h->n_buckets) { \ - khint_t k, i, last, mask, step = 0; \ - mask = h->n_buckets - 1; \ - k = __hash_func(key); i = k & mask; \ - last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + (++step)) & mask; \ - if (i == last) return h->n_buckets; \ - } \ - return __ac_iseither(h->flags, i)? 
h->n_buckets : i; \ - } else return 0; \ - } \ - SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ - khint32_t *new_flags = 0; \ - khint_t j = 1; \ - { \ - kroundup32(new_n_buckets); \ - if (new_n_buckets < 4) new_n_buckets = 4; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ - else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (!new_flags) return -1; \ - memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { /* expand */ \ - khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (!new_keys) { kfree(new_flags); return -1; } \ - h->keys = new_keys; \ - if (kh_is_map) { \ - khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - if (!new_vals) { kfree(new_flags); return -1; } \ - h->vals = new_vals; \ - } \ - } /* otherwise shrink */ \ - } \ - } \ - if (j) { /* rehashing is needed */ \ - for (j = 0; j != h->n_buckets; ++j) { \ - if (__ac_iseither(h->flags, j) == 0) { \ - khkey_t key = h->keys[j]; \ - khval_t val; \ - khint_t new_mask; \ - new_mask = new_n_buckets - 1; \ - if (kh_is_map) val = h->vals[j]; \ - __ac_set_isdel_true(h->flags, j); \ - while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khint_t k, i, step = 0; \ - k = __hash_func(key); \ - i = k & new_mask; \ - while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ - __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ - { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ - if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ - } else { /* write the element and jump out of the loop */ \ - h->keys[i] = key; \ - if (kh_is_map) h->vals[i] = val; \ - break; \ - } \ - } \ - } \ - } \ - if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - kfree(h->flags); /* free the working space */ \ - h->flags = new_flags; \ - h->n_buckets = new_n_buckets; \ - h->n_occupied = h->size; \ - h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ - } \ - return 0; \ - } \ - SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ - { \ - khint_t x; \ - if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) { \ - if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ - *ret = -1; return h->n_buckets; \ - } \ - } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ - *ret = -1; return h->n_buckets; \ - } \ - } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ - { \ - khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ - x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ - if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ - else { \ - last = i; \ - while (!__ac_isempty(h->flags, i) && 
(__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + (++step)) & mask; \ - if (i == last) { x = site; break; } \ - } \ - if (x == h->n_buckets) { \ - if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ - else x = i; \ - } \ - } \ - } \ - if (__ac_isempty(h->flags, x)) { /* not present at all */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; ++h->n_occupied; \ - *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; \ - *ret = 2; \ - } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ - return x; \ - } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ - { \ - if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ - __ac_set_isdel_true(h->flags, x); \ - --h->size; \ - } \ - } - -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_PROTOTYPES(name, khkey_t, khval_t) - -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - -/* --- BEGIN OF HASH FUNCTIONS --- */ - -/*! @function - @abstract Integer hash function - @param key The integer [khint32_t] - @return The hash value [khint_t] - */ -#define kh_int_hash_func(key) (khint32_t)(key) -/*! @function - @abstract Integer comparison function - */ -#define kh_int_hash_equal(a, b) ((a) == (b)) -/*! @function - @abstract 64-bit integer hash function - @param key The integer [khint64_t] - @return The hash value [khint_t] - */ -#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) -/*! @function - @abstract 64-bit integer comparison function - */ -#define kh_int64_hash_equal(a, b) ((a) == (b)) -/*! @function - @abstract const char* hash function - @param s Pointer to a null terminated string - @return The hash value - */ -static kh_inline khint_t __ac_X31_hash_string(const char *s) -{ - khint_t h = (khint_t)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; - return h; -} -/*! @function - @abstract Another interface to const char* hash function - @param key Pointer to a null terminated string [const char*] - @return The hash value [khint_t] - */ -#define kh_str_hash_func(key) __ac_X31_hash_string(key) -/*! @function - @abstract Const char* comparison function - */ -#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) - -static kh_inline khint_t __ac_Wang_hash(khint_t key) -{ - key += ~(key << 15); - key ^= (key >> 10); - key += (key << 3); - key ^= (key >> 6); - key += ~(key << 11); - key ^= (key >> 16); - return key; -} -#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) - -/* --- END OF HASH FUNCTIONS --- */ - -/* Other convenient macros... */ - -/*! - @abstract Type of the hash table. - @param name Name of the hash table [symbol] - */ -#define khash_t(name) kh_##name##_t - -/*! @function - @abstract Initiate a hash table. - @param name Name of the hash table [symbol] - @return Pointer to the hash table [khash_t(name)*] - */ -#define kh_init(name) kh_init_##name() - -/*! @function - @abstract Destroy a hash table. 
- @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -#define kh_destroy(name, h) kh_destroy_##name(h) - -/*! @function - @abstract Reset a hash table without deallocating memory. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -#define kh_clear(name, h) kh_clear_##name(h) - -/*! @function - @abstract Resize a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param s New size [khint_t] - */ -#define kh_resize(name, h, s) kh_resize_##name(h, s) - -/*! @function - @abstract Insert a key to the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @param r Extra return code: -1 if the operation failed; - 0 if the key is present in the hash table; - 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] - @return Iterator to the inserted element [khint_t] - */ -#define kh_put(name, h, k, r) kh_put_##name(h, k, r) - -/*! @function - @abstract Retrieve a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] - */ -#define kh_get(name, h, k) kh_get_##name(h, k) - -/*! @function - @abstract Remove a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Iterator to the element to be deleted [khint_t] - */ -#define kh_del(name, h, k) kh_del_##name(h, k) - -/*! @function - @abstract Test whether a bucket contains data. - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return 1 if containing data; 0 otherwise [int] - */ -#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) - -/*! @function - @abstract Get key given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Key [type of keys] - */ -#define kh_key(h, x) ((h)->keys[x]) - -/*! @function - @abstract Get value given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Value [type of values] - @discussion For hash sets, calling this results in segfault. - */ -#define kh_val(h, x) ((h)->vals[x]) - -/*! @function - @abstract Alias of kh_val() - */ -#define kh_value(h, x) ((h)->vals[x]) - -/*! @function - @abstract Get the start iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The start iterator [khint_t] - */ -#define kh_begin(h) (khint_t)(0) - -/*! @function - @abstract Get the end iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The end iterator [khint_t] - */ -#define kh_end(h) ((h)->n_buckets) - -/*! @function - @abstract Get the number of elements in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of elements in the hash table [khint_t] - */ -#define kh_size(h) ((h)->size) - -/*! @function - @abstract Get the number of buckets in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of buckets in the hash table [khint_t] - */ -#define kh_n_buckets(h) ((h)->n_buckets) - -/*! 
@function - @abstract Iterate over the entries in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param kvar Variable to which key will be assigned - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (kvar) = kh_key(h,__i); \ - (vvar) = kh_val(h,__i); \ - code; \ - } } - -/*! @function - @abstract Iterate over the values in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -#define kh_foreach_value(h, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (vvar) = kh_val(h,__i); \ - code; \ - } } - -/* More conenient interfaces */ - -/*! @function - @abstract Instantiate a hash set containing integer keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 64-bit integer keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 64-bit integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - -typedef const char *kh_cstr_t; -/*! @function - @abstract Instantiate a hash map containing const char* keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_STR(name) \ - KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) - -/*! 
@function - @abstract Instantiate a hash map containing const char* keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_STR(name, khval_t) \ - KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) - -#endif /* __AC_KHASH_H */ diff --git a/twml/libtwml/src/lib/internal/linear_search.docx b/twml/libtwml/src/lib/internal/linear_search.docx new file mode 100644 index 000000000..bb9ab4648 Binary files /dev/null and b/twml/libtwml/src/lib/internal/linear_search.docx differ diff --git a/twml/libtwml/src/lib/internal/linear_search.h b/twml/libtwml/src/lib/internal/linear_search.h deleted file mode 100644 index a3d294853..000000000 --- a/twml/libtwml/src/lib/internal/linear_search.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#ifdef __cplusplus -#include -namespace twml { - - template - static int64_t linear_search(const Tx *xsData, const Tx val, const int64_t mainSize) { - int64_t left = 0; - int64_t right = mainSize-1; - while(left <= right && val > xsData[left]) - left++; - return left; - } - -} // namespace twml -#endif diff --git a/twml/libtwml/src/lib/internal/murmur_hash3.docx b/twml/libtwml/src/lib/internal/murmur_hash3.docx new file mode 100644 index 000000000..58ecc6e3a Binary files /dev/null and b/twml/libtwml/src/lib/internal/murmur_hash3.docx differ diff --git a/twml/libtwml/src/lib/internal/murmur_hash3.h b/twml/libtwml/src/lib/internal/murmur_hash3.h deleted file mode 100644 index 3bdfbe486..000000000 --- a/twml/libtwml/src/lib/internal/murmur_hash3.h +++ /dev/null @@ -1,37 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. 
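
For orientation, a sketch of how the 128-bit x64 variant declared in this header gets reduced to a 64-bit id, which is what the deleted featureId code above does with the first 8 bytes of the digest. hash_to_id is a hypothetical helper name and the relative include path is an assumption.

```cpp
#include <cstdint>
#include <cstring>
#include "internal/murmur_hash3.h"

// Hash a UTF-16 buffer (as featureId does after utf8_to_utf16) and keep the
// low 64 bits of the 128-bit digest as the feature id.
int64_t hash_to_id(const uint16_t *utf16, size_t n_units) {
  uint8_t digest[16];
  MurmurHash3_x64_128(utf16, static_cast<int>(n_units * sizeof(uint16_t)), 0, digest);
  int64_t id;
  std::memcpy(&id, digest, sizeof(id));  // first 8 bytes, same as the deleted code
  return id;
}
```
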
- -#ifndef _MURMURHASH3_H_ -#define _MURMURHASH3_H_ - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) && (_MSC_VER < 1600) - -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned __int64 uint64_t; - -// Other compilers - -#else // defined(_MSC_VER) - -#include - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH3_H_ diff --git a/twml/libtwml/src/lib/internal/thrift.docx b/twml/libtwml/src/lib/internal/thrift.docx new file mode 100644 index 000000000..a46b5ef4b Binary files /dev/null and b/twml/libtwml/src/lib/internal/thrift.docx differ diff --git a/twml/libtwml/src/lib/internal/thrift.h b/twml/libtwml/src/lib/internal/thrift.h deleted file mode 100644 index 4e4786219..000000000 --- a/twml/libtwml/src/lib/internal/thrift.h +++ /dev/null @@ -1,69 +0,0 @@ -// For details of how to encode and decode thrift, check -// https://github.com/apache/thrift/blob/master/doc/specs/thrift-binary-protocol.md - -// Definitions of the thrift binary format -typedef enum { - TTYPE_STOP = 0, - TTYPE_VOID = 1, - TTYPE_BOOL = 2, - TTYPE_BYTE = 3, - TTYPE_DOUBLE = 4, - TTYPE_I16 = 6, - TTYPE_I32 = 8, - TTYPE_I64 = 10, - TTYPE_STRING = 11, - TTYPE_STRUCT = 12, - TTYPE_MAP = 13, - TTYPE_SET = 14, - TTYPE_LIST = 15, - TTYPE_ENUM = 16, -} TTYPES; - -// Fields of a batch prediction response -typedef enum { - BPR_DUMMY , - BPR_PREDICTIONS, -} BPR_FIELDS; - -// Fields of a datarecord -typedef enum { - DR_CROSS , // fake field for crosses - DR_BINARY , - DR_CONTINUOUS , - DR_DISCRETE , - DR_STRING , - DR_SPARSE_BINARY , - DR_SPARSE_CONTINUOUS , - DR_BLOB , - DR_GENERAL_TENSOR , - DR_SPARSE_TENSOR , -} DR_FIELDS; - -// Fields for General tensor -typedef enum { - GT_DUMMY , // dummy field - GT_RAW , - GT_STRING , - GT_INT32 , - GT_INT64 , - GT_FLOAT , - GT_DOUBLE , - GT_BOOL , -} GT_FIELDS; - -typedef enum { - SP_DUMMY , // dummy field - SP_COO , -} SP_FIELDS; - -// Enum values from tensor.thrift -typedef enum { - DATA_TYPE_FLOAT , - DATA_TYPE_DOUBLE , - DATA_TYPE_INT32 , - DATA_TYPE_INT64 , - DATA_TYPE_UINT8 , - DATA_TYPE_STRING , - DATA_TYPE_BYTE , - DATA_TYPE_BOOL , -} DATA_TYPES; diff --git a/twml/libtwml/src/lib/internal/utf_converter.docx b/twml/libtwml/src/lib/internal/utf_converter.docx new file mode 100644 index 000000000..d3bb15cc7 Binary files /dev/null and b/twml/libtwml/src/lib/internal/utf_converter.docx differ diff --git a/twml/libtwml/src/lib/internal/utf_converter.h b/twml/libtwml/src/lib/internal/utf_converter.h deleted file mode 100644 index b0b38fb11..000000000 --- a/twml/libtwml/src/lib/internal/utf_converter.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _UTF_CONVERTER_H_ -#define _UTF_CONVERTER_H_ - -#include -#include -#include - -ssize_t utf8_to_utf16(const uint8_t *in, uint64_t in_len, uint16_t *out, uint64_t max_out); - -#endif diff --git a/twml/libtwml/src/lib/io/IOError.cpp b/twml/libtwml/src/lib/io/IOError.cpp deleted file mode 100644 index e0a661c13..000000000 --- a/twml/libtwml/src/lib/io/IOError.cpp +++ /dev/null @@ 
-1,61 +0,0 @@ -#include - - -namespace twml { -namespace io { - -namespace { - std::string messageFromStatus(IOError::Status status) { - switch (status) { - case IOError::OUT_OF_RANGE: - return "failed to read enough input"; - case IOError::WRONG_MAGIC: - return "wrong magic in stream"; - case IOError::WRONG_HEADER: - return "wrong header in stream"; - case IOError::ERROR_HEADER_CHECKSUM: - return "header checksum doesn't match"; - case IOError::INVALID_METHOD: - return "using invalid method"; - case IOError::USING_RESERVED: - return "using reserved flag"; - case IOError::ERROR_HEADER_EXTRA_FIELD_CHECKSUM: - return "extra header field checksum doesn't match"; - case IOError::CANT_FIT_OUTPUT: - return "can't fit output in the given space"; - case IOError::SPLIT_FILE: - return "split files aren't supported"; - case IOError::BLOCK_SIZE_TOO_LARGE: - return "block size is too large"; - case IOError::SOURCE_LARGER_THAN_DESTINATION: - return "source is larger than destination"; - case IOError::DESTINATION_LARGER_THAN_CAPACITY: - return "destination buffer is too small to fit uncompressed result"; - case IOError::HEADER_FLAG_MISMATCH: - return "failed to match flags for compressed and decompressed data"; - case IOError::NOT_ENOUGH_INPUT: - return "not enough input to proceed with decompression"; - case IOError::ERROR_SOURCE_BLOCK_CHECKSUM: - return "source block checksum doesn't match"; - case IOError::COMPRESSED_DATA_VIOLATION: - return "error occurred while decompressing the data"; - case IOError::ERROR_DESTINATION_BLOCK_CHECKSUM: - return "destination block checksum doesn't match"; - case IOError::EMPTY_RECORD: - return "can't write an empty record"; - case IOError::MALFORMED_MEMORY_RECORD: - return "can't write malformed record"; - case IOError::UNSUPPORTED_OUTPUT_TYPE: - return "output data type is not supported"; - case IOError::OTHER_ERROR: - default: - return "unknown error occurred"; - } - } -} // namespace - -IOError::IOError(Status status): twml::Error(TWML_ERR_IO, "Found error while processing stream: " + - messageFromStatus(status)), m_status(status) {} - -} // namespace io -} // namespace twml diff --git a/twml/libtwml/src/lib/io/IOError.docx b/twml/libtwml/src/lib/io/IOError.docx new file mode 100644 index 000000000..dcf9a59b9 Binary files /dev/null and b/twml/libtwml/src/lib/io/IOError.docx differ diff --git a/twml/libtwml/src/lib/murmur_hash3.cpp b/twml/libtwml/src/lib/murmur_hash3.cpp deleted file mode 100644 index 89c9c1fc1..000000000 --- a/twml/libtwml/src/lib/murmur_hash3.cpp +++ /dev/null @@ -1,335 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. 
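The MurmurHash3 entry points declared in the deleted header each take a key buffer, its length in bytes, a seed, and an output pointer sized for the variant (32 or 128 bits). A minimal usage sketch, assuming the deleted internal/murmur_hash3.h header is on the include path:

// Usage sketch; assumes "internal/murmur_hash3.h" is available as in the deleted sources.
#include <cstdint>
#include <cstring>
#include <iostream>
#include "internal/murmur_hash3.h"

int main() {
  const char *key = "user_id";
  const uint32_t seed = 42;

  uint32_t out32 = 0;
  MurmurHash3_x86_32(key, static_cast<int>(std::strlen(key)), seed, &out32);

  uint64_t out128[2] = {0, 0};  // 128-bit digest
  MurmurHash3_x64_128(key, static_cast<int>(std::strlen(key)), seed, out128);

  // As noted above, the x86 and x64 variants are not expected to agree.
  std::cout << out32 << " " << out128[0] << " " << out128[1] << std::endl;
  return 0;
}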
- -#include "internal/murmur_hash3.h" - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) - -#define FORCE_INLINE __forceinline - -#include - -#define ROTL32(x,y) _rotl(x,y) -#define ROTL64(x,y) _rotl64(x,y) - -#define BIG_CONSTANT(x) (x) - -// Other compilers - -#else // defined(_MSC_VER) - -#define FORCE_INLINE inline __attribute__((always_inline)) - -FORCE_INLINE uint32_t rotl32 ( uint32_t x, int8_t r ) -{ - return (x << r) | (x >> (32 - r)); -} - -FORCE_INLINE uint64_t rotl64 ( uint64_t x, int8_t r ) -{ - return (x << r) | (x >> (64 - r)); -} - -#define ROTL32(x,y) rotl32(x,y) -#define ROTL64(x,y) rotl64(x,y) - -#define BIG_CONSTANT(x) (x##LLU) - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- -// Block read - if your platform needs to do endian-swapping or can only -// handle aligned reads, do the conversion here - -FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) -{ - return p[i]; -} - -FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) -{ - return p[i]; -} - -//----------------------------------------------------------------------------- -// Finalization mix - force all bits of a hash block to avalanche - -FORCE_INLINE uint32_t fmix32 ( uint32_t h ) -{ - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - - return h; -} - -//---------- - -FORCE_INLINE uint64_t fmix64 ( uint64_t k ) -{ - k ^= k >> 33; - k *= BIG_CONSTANT(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - - return k; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 4; - - uint32_t h1 = seed; - - const uint32_t c1 = 0xcc9e2d51; - const uint32_t c2 = 0x1b873593; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); - - for(int i = -nblocks; i; i++) - { - uint32_t k1 = getblock32(blocks,i); - - k1 *= c1; - k1 = ROTL32(k1,15); - k1 *= c2; - - h1 ^= k1; - h1 = ROTL32(h1,13); - h1 = h1*5+0xe6546b64; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*4); - - uint32_t k1 = 0; - - switch(len & 3) - { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; - case 1: k1 ^= tail[0]; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; - - h1 = fmix32(h1); - - *(uint32_t*)out = h1; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_128 ( const void * key, const int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint32_t h1 = seed; - uint32_t h2 = seed; - uint32_t h3 = seed; - uint32_t h4 = seed; - - const uint32_t c1 = 0x239b961b; - const uint32_t c2 = 0xab0e9789; - const uint32_t c3 = 0x38b34ae5; - const uint32_t c4 = 0xa1e38b93; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); - - for(int i = -nblocks; i; i++) - { - uint32_t k1 = getblock32(blocks,i*4+0); - uint32_t k2 = getblock32(blocks,i*4+1); - uint32_t k3 = getblock32(blocks,i*4+2); - uint32_t k4 = getblock32(blocks,i*4+3); - - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= 
k1; - - h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; - - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; - - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; - - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint32_t k1 = 0; - uint32_t k2 = 0; - uint32_t k3 = 0; - uint32_t k4 = 0; - - switch(len & 15) - { - case 15: k4 ^= tail[14] << 16; - case 14: k4 ^= tail[13] << 8; - case 13: k4 ^= tail[12] << 0; - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - case 12: k3 ^= tail[11] << 24; - case 11: k3 ^= tail[10] << 16; - case 10: k3 ^= tail[ 9] << 8; - case 9: k3 ^= tail[ 8] << 0; - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - case 8: k2 ^= tail[ 7] << 24; - case 7: k2 ^= tail[ 6] << 16; - case 6: k2 ^= tail[ 5] << 8; - case 5: k2 ^= tail[ 4] << 0; - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - case 4: k1 ^= tail[ 3] << 24; - case 3: k1 ^= tail[ 2] << 16; - case 2: k1 ^= tail[ 1] << 8; - case 1: k1 ^= tail[ 0] << 0; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - h1 = fmix32(h1); - h2 = fmix32(h2); - h3 = fmix32(h3); - h4 = fmix32(h4); - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - ((uint32_t*)out)[0] = h1; - ((uint32_t*)out)[1] = h2; - ((uint32_t*)out)[2] = h3; - ((uint32_t*)out)[3] = h4; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x64_128 ( const void * key, const int len, - const uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint64_t h1 = seed; - uint64_t h2 = seed; - - const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); - const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); - - //---------- - // body - - const uint64_t * blocks = (const uint64_t *)(data); - - for(int i = 0; i < nblocks; i++) - { - uint64_t k1 = getblock64(blocks,i*2+0); - uint64_t k2 = getblock64(blocks,i*2+1); - - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - - h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; - - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint64_t k1 = 0; - uint64_t k2 = 0; - - switch(len & 15) - { - case 15: k2 ^= ((uint64_t)tail[14]) << 48; - case 14: k2 ^= ((uint64_t)tail[13]) << 40; - case 13: k2 ^= ((uint64_t)tail[12]) << 32; - case 12: k2 ^= ((uint64_t)tail[11]) << 24; - case 11: k2 ^= ((uint64_t)tail[10]) << 16; - case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; - case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; - case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; - case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; - case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; - case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; - case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; - case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; - case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; - - h1 += h2; 
- h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - ((uint64_t*)out)[0] = h1; - ((uint64_t*)out)[1] = h2; -} - -//----------------------------------------------------------------------------- - diff --git a/twml/libtwml/src/lib/murmur_hash3.docx b/twml/libtwml/src/lib/murmur_hash3.docx new file mode 100644 index 000000000..fef9c4ee2 Binary files /dev/null and b/twml/libtwml/src/lib/murmur_hash3.docx differ diff --git a/twml/libtwml/src/lib/optim.cpp b/twml/libtwml/src/lib/optim.cpp deleted file mode 100644 index 7db36c26d..000000000 --- a/twml/libtwml/src/lib/optim.cpp +++ /dev/null @@ -1,274 +0,0 @@ -#include "internal/interpolate.h" -#include "internal/error.h" -#include - -namespace twml { - template - void mdlInfer(Tensor &output_keys, Tensor &output_vals, - const Tensor &input_keys, const Tensor &input_vals, - const Tensor &bin_ids, - const Tensor &bin_vals, - const Tensor &feature_offsets, - bool return_bin_indices) { - auto okeysData = output_keys.getData(); - auto ovalsData = output_vals.getData(); - uint64_t okeysStride = output_keys.getStride(0); - uint64_t ovaluesStride = output_vals.getStride(0); - - auto ikeysData = input_keys.getData(); - auto ivalsData = input_vals.getData(); - uint64_t ikeysStride = input_keys.getStride(0); - uint64_t ivaluesStride = input_vals.getStride(0); - - auto xsData = bin_vals.getData(); - auto ysData = bin_ids.getData(); - uint64_t xsStride = bin_vals.getStride(0); - uint64_t ysStride = bin_ids.getStride(0); - - auto offsetData = feature_offsets.getData(); - - uint64_t size = input_keys.getDim(0); - uint64_t total_bins = bin_ids.getNumElements(); - uint64_t fsize = feature_offsets.getNumElements(); - - for (uint64_t i = 0; i < size; i++) { - int64_t ikey = ikeysData[i * ikeysStride] - TWML_INDEX_BASE; - T val = ivalsData[i * ivaluesStride]; - if (ikey == -1) { - ovalsData[i * ovaluesStride] = val; - continue; - } - - // Perform interpolation - uint64_t offset = offsetData[ikey]; - uint64_t next_offset = (ikey == (int64_t)(fsize - 1)) ? 
total_bins : offsetData[ikey + 1]; - uint64_t mainSize = next_offset - offset; - - const T *lxsData = xsData + offset; - const int64_t *lysData = ysData + offset; - int64_t okey = interpolation(lxsData, xsStride, - lysData, ysStride, - val, mainSize, NEAREST, 0, - return_bin_indices); - okeysData[i * okeysStride] = okey + TWML_INDEX_BASE; - ovalsData[i * ovaluesStride] = 1; - } - } - - void mdlInfer(Tensor &output_keys, Tensor &output_vals, - const Tensor &input_keys, const Tensor &input_vals, - const Tensor &bin_ids, - const Tensor &bin_vals, - const Tensor &feature_offsets, - bool return_bin_indices) { - if (input_keys.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "input_keys must be a Long Tensor"); - } - - if (output_keys.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor"); - } - - if (bin_ids.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "bin_ids must be a Long Tensor"); - } - - if (feature_offsets.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "bin_ids must be a Long Tensor"); - } - - if (input_vals.getType() != bin_vals.getType()) { - throw twml::Error(TWML_ERR_TYPE, - "Data type of input_vals does not match type of bin_vals"); - } - - if (bin_vals.getNumDims() != 1) { - throw twml::Error(TWML_ERR_SIZE, - "bin_vals must be 1 Dimensional"); - } - - if (bin_ids.getNumDims() != 1) { - throw twml::Error(TWML_ERR_SIZE, - "bin_ids must be 1 Dimensional"); - } - - if (bin_vals.getNumElements() != bin_ids.getNumElements()) { - throw twml::Error(TWML_ERR_SIZE, - "Dimensions of bin_vals and bin_ids do not match"); - } - - if (feature_offsets.getStride(0) != 1) { - throw twml::Error(TWML_ERR_SIZE, - "feature_offsets must be contiguous"); - } - - switch (input_vals.getType()) { - case TWML_TYPE_FLOAT: - twml::mdlInfer(output_keys, output_vals, - input_keys, input_vals, - bin_ids, bin_vals, feature_offsets, - return_bin_indices); - break; - case TWML_TYPE_DOUBLE: - twml::mdlInfer(output_keys, output_vals, - input_keys, input_vals, - bin_ids, bin_vals, feature_offsets, - return_bin_indices); - break; - default: - throw twml::Error(TWML_ERR_TYPE, - "Unsupported datatype for mdlInfer"); - } - } - - const int DEFAULT_INTERPOLATION_LOWEST = 0; - /** - * @param output tensor to hold linear or nearest interpolation output. - * This function does not allocate space. - * The output tensor must have space allcoated. - * @param input input tensor; size must match output. - * input is assumed to have size [batch_size, number_of_labels]. - * @param xs the bins. - * @param ys the values for the bins. - * @param mode: linear or nearest InterpolationMode. - * linear is used for isotonic calibration. - * nearest is used for MDL calibration and MDL inference. - * - * @return Returns nothing. Output is stored into the output tensor. - * - * This is used by IsotonicCalibration inference. - */ - template - void interpolation( - Tensor output, - const Tensor input, - const Tensor xs, - const Tensor ys, - const InterpolationMode mode) { - // Sanity check: input and output should have two dims. - if (input.getNumDims() != 2 || output.getNumDims() != 2) { - throw twml::Error(TWML_ERR_TYPE, - "input and output should have 2 dimensions."); - } - - // Sanity check: input and output size should match. 
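The linear and nearest modes referenced above differ only in how a value is mapped onto the sorted bin boundaries xs and their per-bin values ys. A generic, self-contained sketch of the two modes is given below; it is not the library's internal interpolate.h routine, which this diff does not show. The remaining sanity checks of the interpolation wrapper continue after it.

// Generic 1-D interpolation sketch, not the deleted internal implementation.
// xs must be sorted ascending; ys holds the value associated with each boundary.
#include <algorithm>
#include <iostream>
#include <vector>

enum Mode { LINEAR, NEAREST };

double interpolate1d(const std::vector<double>& xs, const std::vector<double>& ys,
                     double val, Mode mode) {
  if (val <= xs.front()) return ys.front();
  if (val >= xs.back()) return ys.back();
  // First boundary strictly greater than val.
  const size_t hi = std::upper_bound(xs.begin(), xs.end(), val) - xs.begin();
  const size_t lo = hi - 1;
  if (mode == NEAREST)
    return (val - xs[lo] <= xs[hi] - val) ? ys[lo] : ys[hi];
  const double t = (val - xs[lo]) / (xs[hi] - xs[lo]);
  return ys[lo] + t * (ys[hi] - ys[lo]);
}

int main() {
  std::vector<double> xs = {0.0, 1.0, 2.0};
  std::vector<double> ys = {0.0, 10.0, 20.0};
  std::cout << interpolate1d(xs, ys, 0.25, LINEAR) << std::endl;   // 2.5
  std::cout << interpolate1d(xs, ys, 0.25, NEAREST) << std::endl;  // 0
  return 0;
}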
- for (int i = 0; i < input.getNumDims(); i++) { - if (input.getDim(i) != output.getDim(i)) { - throw twml::Error(TWML_ERR_TYPE, - "input and output mismatch in size."); - } - } - - // Sanity check: number of labels in input should match - // number of labels in xs / ys. - if (input.getDim(1) != xs.getDim(0) - || input.getDim(1) != ys.getDim(0)) { - throw twml::Error(TWML_ERR_TYPE, - "input, xs, ys should have the same number of labels."); - } - - const uint64_t inputStride0 = input.getStride(0); - const uint64_t inputStride1 = input.getStride(1); - const uint64_t outputStride0 = output.getStride(0); - const uint64_t outputStride1 = output.getStride(1); - const uint64_t xsStride0 = xs.getStride(0); - const uint64_t xsStride1 = xs.getStride(1); - const uint64_t ysStride0 = ys.getStride(0); - const uint64_t ysStride1 = ys.getStride(1); - const uint64_t mainSize = xs.getDim(1); - - // for each value in the input matrix, compute output value by - // calling interpolation. - auto inputData = input.getData(); - auto outputData = output.getData(); - auto xsData = xs.getData(); - auto ysData = ys.getData(); - - for (uint64_t i = 0; i < input.getDim(0); i++) { - for (uint64_t j = 0; j < input.getDim(1); j++) { - const T val = inputData[i * inputStride0 + j * inputStride1]; - const T *lxsData = xsData + j * xsStride0; - const T *lysData = ysData + j * ysStride0; - const T res = interpolation( - lxsData, xsStride1, - lysData, ysStride1, - val, - mainSize, - mode, - DEFAULT_INTERPOLATION_LOWEST); - outputData[i * outputStride0 + j * outputStride1] = res; - } - } - } - - void linearInterpolation( - Tensor output, - const Tensor input, - const Tensor xs, - const Tensor ys) { - switch (input.getType()) { - case TWML_TYPE_FLOAT: - twml::interpolation(output, input, xs, ys, LINEAR); - break; - case TWML_TYPE_DOUBLE: - twml::interpolation(output, input, xs, ys, LINEAR); - break; - default: - throw twml::Error(TWML_ERR_TYPE, - "Unsupported datatype for linearInterpolation."); - } - } - - void nearestInterpolation( - Tensor output, - const Tensor input, - const Tensor xs, - const Tensor ys) { - switch (input.getType()) { - case TWML_TYPE_FLOAT: - twml::interpolation(output, input, xs, ys, NEAREST); - break; - case TWML_TYPE_DOUBLE: - twml::interpolation(output, input, xs, ys, NEAREST); - break; - default: - throw twml::Error(TWML_ERR_TYPE, - "Unsupported datatype for nearestInterpolation."); - } - } -} // namespace twml - -twml_err twml_optim_mdl_infer(twml_tensor output_keys, - twml_tensor output_vals, - const twml_tensor input_keys, - const twml_tensor input_vals, - const twml_tensor bin_ids, - const twml_tensor bin_vals, - const twml_tensor feature_offsets, - bool return_bin_indices) { - HANDLE_EXCEPTIONS( - using namespace twml; - mdlInfer(*getTensor(output_keys), - *getTensor(output_vals), - *getConstTensor(input_keys), - *getConstTensor(input_vals), - *getConstTensor(bin_ids), - *getConstTensor(bin_vals), - *getConstTensor(feature_offsets), - return_bin_indices);); - return TWML_ERR_NONE; -} - -twml_err twml_optim_nearest_interpolation( - twml_tensor output, - const twml_tensor input, - const twml_tensor xs, - const twml_tensor ys) { - HANDLE_EXCEPTIONS( - using namespace twml; - nearestInterpolation(*getTensor(output), - *getConstTensor(input), - *getConstTensor(xs), - *getConstTensor(ys));); - return TWML_ERR_NONE; -} diff --git a/twml/libtwml/src/lib/optim.docx b/twml/libtwml/src/lib/optim.docx new file mode 100644 index 000000000..b9bc8fce9 Binary files /dev/null and 
b/twml/libtwml/src/lib/optim.docx differ diff --git a/twml/libtwml/src/lib/utf_converter.cpp b/twml/libtwml/src/lib/utf_converter.cpp deleted file mode 100644 index 5c943f3e3..000000000 --- a/twml/libtwml/src/lib/utf_converter.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include "internal/utf_converter.h" - -ssize_t utf8_to_utf16(const uint8_t *in, uint64_t in_len, uint16_t *out, uint64_t max_out) { - uint64_t num_out = 0; - uint64_t num_in = 0; - while (num_in < in_len) { - uint32_t uni; - uint64_t todo; - uint8_t ch = in[num_in]; - num_in++; - if (ch <= 0x7F) { - uni = ch; - todo = 0; - } else if (ch <= 0xBF) { - return -1; - } else if (ch <= 0xDF) { - uni = ch & 0x1F; - todo = 1; - } else if (ch <= 0xEF) { - uni = ch & 0x0F; - todo = 2; - } else if (ch <= 0xF7) { - uni = ch & 0x07; - todo = 3; - } else { - return -1; - } - for (uint64_t j = 0; j < todo; ++j) { - if (num_in == in_len) return -1; - uint8_t ch = in[num_in]; - num_in++; - if (ch < 0x80 || ch > 0xBF) return -1; - uni <<= 6; - uni += ch & 0x3F; - } - if (uni >= 0xD800 && uni <= 0xDFFF) return -1; - if (uni > 0x10FFFF) return -1; - if (uni <= 0xFFFF) { - if (num_out == max_out) return -1; - out[num_out] = uni; - num_out++; - } else { - uni -= 0x10000; - if (num_out + 1 >= max_out) return -1; - out[num_out] = (uni >> 10) + 0xD800; - out[num_out + 1] = (uni & 0x3FF) + 0xDC00; - num_out += 2; - } - } - if (num_out == max_out) return -1; - out[num_out] = 0; - return num_out; -} diff --git a/twml/libtwml/src/lib/utf_converter.docx b/twml/libtwml/src/lib/utf_converter.docx new file mode 100644 index 000000000..4e378830f Binary files /dev/null and b/twml/libtwml/src/lib/utf_converter.docx differ diff --git a/twml/libtwml/src/ops/CMakeLists.docx b/twml/libtwml/src/ops/CMakeLists.docx new file mode 100644 index 000000000..e6f5b8f76 Binary files /dev/null and b/twml/libtwml/src/ops/CMakeLists.docx differ diff --git a/twml/libtwml/src/ops/CMakeLists.txt b/twml/libtwml/src/ops/CMakeLists.txt deleted file mode 100644 index e2feaff23..000000000 --- a/twml/libtwml/src/ops/CMakeLists.txt +++ /dev/null @@ -1,79 +0,0 @@ -set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}) -cmake_minimum_required(VERSION 2.8 FATAL_ERROR) -cmake_policy(VERSION 2.8) -set(CMAKE_MACOSX_RPATH 1) - -file(GLOB_RECURSE sources *.cpp) - -set (CMAKE_CXX_FLAGS "-Wall -std=c++11 -fno-stack-protector ${CMAKE_CXX_FLAGS}") - -execute_process( - COMMAND - $ENV{LIBTWML_HOME}/src/ops/scripts/get_inc.sh - RESULT_VARIABLE - TF_RES - OUTPUT_VARIABLE - TF_INC) - -if (NOT (${TF_RES} EQUAL "0")) - message(${TF_RES}) - message(FATAL_ERROR "Failed to get include path for tensorflow") -endif() - -execute_process( - COMMAND - $ENV{LIBTWML_HOME}/src/ops/scripts/get_lib.sh - RESULT_VARIABLE - TF_RES - OUTPUT_VARIABLE - TF_LIB) - -if (NOT (${TF_RES} EQUAL "0")) - message(${TF_RES}) - message(FATAL_ERROR "Failed to get lib path for tensorflow") -endif() - -find_path( - TWML_INC - NAMES "twml.h" - PATHS $ENV{LIBTWML_HOME}/include) - -add_library(twml_tf MODULE ${sources}) - -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "$ENV{LIBTWML_HOME}/cmake") - -if (UNIX) - if (APPLE) - set (CMAKE_CXX_FLAGS "-undefined dynamic_lookup -stdlib=libc++ ${CMAKE_CXX_FLAGS}") - # -Wl,-all_load ensures symbols not used by twml_tf are also included. - # -Wl,-noall_load limits the scope of the previous flag. 
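For reference, the UTF-8 to UTF-16 converter deleted above writes a trailing zero and returns the number of UTF-16 code units excluding that terminator, or -1 on malformed input or insufficient output space. A small usage sketch, assuming the deleted internal/utf_converter.h header is on the include path:

// Usage sketch for utf8_to_utf16 as implemented in the deleted source.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <sys/types.h>  // ssize_t
#include "internal/utf_converter.h"

int main() {
  const char *text = "caf\xC3\xA9";  // "café" encoded as UTF-8 (5 bytes)
  uint16_t out[16];
  const ssize_t n = utf8_to_utf16(reinterpret_cast<const uint8_t *>(text),
                                  std::strlen(text), out, 16);
  // 4 code units: 'c', 'a', 'f', U+00E9; returns -1 on invalid sequences.
  std::cout << n << std::endl;
  return 0;
}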
- set (LINK_ALL_OPTION "-Wl,-all_load") - set (NO_LINK_ALL_OPTION "-Wl,-noall_load") - set(TF_FRAMEWORK_LIB ${TF_LIB}/libtensorflow_framework.1.dylib) - else() - # -Wl,--whole-archive ensures symbols not used by twml_tf are also included. - # -Wl,--no-whole-archive limits the scope of the previous flag. - set (LINK_ALL_OPTION "-Wl,--whole-archive") - set (NO_LINK_ALL_OPTION "-Wl,--no-whole-archive") - set(TF_FRAMEWORK_LIB ${TF_LIB}/libtensorflow_framework.so.1) - endif() -endif() - - -target_include_directories( - twml_tf - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - ${TWML_INC} - # TF_INC needs to be the last to avoid some weird white-spacing issues with generated Makefile. - ${TF_INC} # Needed because of some header files auto-generated during build time. - ${TF_INC}/external/nsync/public/ - ) - -target_link_libraries(twml_tf - PUBLIC - # Since we are using twml_tf as the "one" dynamic library, - # we want it to have the C function symbols needed for other functions as well. - ${LINK_ALL_OPTION} twml ${NO_LINK_ALL_OPTION} - ${TF_FRAMEWORK_LIB} - ) diff --git a/twml/libtwml/src/ops/add1.cpp b/twml/libtwml/src/ops/add1.cpp deleted file mode 100644 index 66281841a..000000000 --- a/twml/libtwml/src/ops/add1.cpp +++ /dev/null @@ -1,92 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -using namespace tensorflow; - -REGISTER_OP("Add1") -.Attr("T: {float, double, int32}") -.Input("input1: T") -.Output("output: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }); - - -template -class Add1 : public OpKernel { - public: - explicit Add1(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - - // Create an output tensor - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(), - &output_tensor)); - auto output_flat = output_tensor->flat(); - - // Add 1 to input and assign to output - const int N = input.size(); - for (int i = 0; i < N; i++) { - output_flat(i) = input(i) + 1; - } - } -}; - - -REGISTER_OP("Add1Grad") -.Attr("T: {float, double, int32}") -.Input("grad_output: T") -.Output("grad_input: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }); - -template -class Add1Grad : public OpKernel { - public: - explicit Add1Grad(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& grad_output_tensor = context->input(0); - auto grad_output = grad_output_tensor.flat(); - - // Create an grad_input tensor - Tensor* grad_input_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, grad_output_tensor.shape(), - &grad_input_tensor)); - - auto grad_input_flat = grad_input_tensor->flat(); - - // Copy from grad_output to grad_input - const int N = grad_output.size(); - for (int i = 0; i < N; i++) { - grad_input_flat(i) = grad_output(i); - } - } -}; - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("Add1") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - Add1); \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("Add1Grad") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - Add1Grad); \ - 
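Since Add1 computes the elementwise map y = x + 1, its Jacobian is the identity, which is why the Add1Grad kernel above simply copies grad_output into grad_input. A standalone check of that relationship in plain C++, outside TensorFlow:

// Standalone illustration of the Add1 / Add1Grad pair: forward adds one,
// backward passes the incoming gradient through unchanged.
#include <iostream>
#include <vector>

int main() {
  std::vector<float> x = {1.0f, 2.0f, 3.0f};
  std::vector<float> y(x.size());
  std::vector<float> grad_out = {0.1f, 0.2f, 0.3f};
  std::vector<float> grad_in(x.size());
  for (size_t i = 0; i < x.size(); i++) y[i] = x[i] + 1.0f;        // forward, as in Add1
  for (size_t i = 0; i < x.size(); i++) grad_in[i] = grad_out[i];  // backward, as in Add1Grad
  std::cout << y[0] << " " << grad_in[0] << std::endl;  // 2 and 0.1
  return 0;
}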
-REGISTER(float); -REGISTER(double); -REGISTER(int32); diff --git a/twml/libtwml/src/ops/add1.docx b/twml/libtwml/src/ops/add1.docx new file mode 100644 index 000000000..b3de720ed Binary files /dev/null and b/twml/libtwml/src/ops/add1.docx differ diff --git a/twml/libtwml/src/ops/batch_prediction_request.cpp b/twml/libtwml/src/ops/batch_prediction_request.cpp deleted file mode 100644 index a83c3ebcf..000000000 --- a/twml/libtwml/src/ops/batch_prediction_request.cpp +++ /dev/null @@ -1,183 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -REGISTER_OP("DecodeAndHashBatchPredictionRequest") -.Input("input_bytes: uint8") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("decode_mode: int = 0") -.Output("hashed_data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that decodes batch prediction request and creates a handle to the batch of hashed data records. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - decode_mode: integer, indicates which decoding method to use. Let a sparse continuous - have a feature_name and a dict of {name: value}. 0 indicates feature_ids are computed - as hash(name). 1 indicates feature_ids are computed as hash(feature_name, name) - shared_name: name used by the resource handle inside the resource manager. - container: name used by the container of the resources. - -shared_name and container are required when inheriting from ResourceOpKernel. - -Input - input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest. - -Outputs - hashed_data_record_handle: A resource handle to the HashedDataRecordResource containing batch of HashedDataRecords. -)doc"); - -class DecodeAndHashBatchPredictionRequest : public OpKernel { - public: - explicit DecodeAndHashBatchPredictionRequest(OpKernelConstruction* context) - : OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode)); - - OP_REQUIRES(context, keep_features.size() == keep_codes.size(), - errors::InvalidArgument("keep keys and values must have same size.")); - -#ifdef USE_DENSE_HASH - m_keep_map.set_empty_key(0); -#endif // USE_DENSE_HASH - - for (uint64_t i = 0; i < keep_features.size(); i++) { - m_keep_map[keep_features[i]] = keep_codes[i]; - } - } - - private: - twml::Map m_keep_map; - int64 m_decode_mode; - - void Compute(OpKernelContext* context) override { - try { - HashedDataRecordResource *resource = nullptr; - OP_REQUIRES_OK(context, makeResourceHandle(context, 0, &resource)); - - // Store the input bytes in the resource so it isnt freed before the resource. - // This is necessary because we are not copying the contents for tensors. 
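The kernel constructor above wires the keep_features and keep_codes attributes into a lookup map after checking that they pair up one-to-one. An equivalent sketch using std::unordered_map; the deleted code's map type lost its template arguments in this diff, so the int64 key and value types here are assumptions:

// Illustrative only: std::unordered_map stands in for the map type whose
// template arguments were stripped from this diff.
#include <cstdint>
#include <stdexcept>
#include <unordered_map>
#include <vector>

std::unordered_map<int64_t, int64_t> build_keep_map(
    const std::vector<int64_t>& keep_features,
    const std::vector<int64_t>& keep_codes) {
  // Mirrors the constructor check: keys and codes must have the same size.
  if (keep_features.size() != keep_codes.size())
    throw std::invalid_argument("keep keys and values must have same size.");
  std::unordered_map<int64_t, int64_t> keep_map;
  for (size_t i = 0; i < keep_features.size(); i++)
    keep_map[keep_features[i]] = keep_codes[i];
  return keep_map;
}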
- resource->input = context->input(0); - const uint8_t *input_bytes = resource->input.flat().data(); - twml::HashedDataRecordReader reader; - twml::HashedBatchPredictionRequest bpr; - reader.setKeepMap(&m_keep_map); - reader.setBuffer(input_bytes); - reader.setDecodeMode(m_decode_mode); - bpr.decode(reader); - - resource->common = std::move(bpr.common()); - resource->records = std::move(bpr.requests()); - - // Each datarecord has a copy of common features. - // Initialize total_size by common_size * num_records - int64 common_size = static_cast(resource->common.totalSize()); - int64 num_records = static_cast(resource->records.size()); - int64 total_size = common_size * num_records; - for (const auto &record : resource->records) { - total_size += static_cast(record.totalSize()); - } - - resource->total_size = total_size; - resource->num_labels = 0; - resource->num_weights = 0; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("DecodeAndHashBatchPredictionRequest").Device(DEVICE_CPU), - DecodeAndHashBatchPredictionRequest); - -REGISTER_OP("DecodeBatchPredictionRequest") -.Input("input_bytes: uint8") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Output("data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that decodes batch prediction request and creates a handle to the batch of data records. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - shared_name: name used by the resource handle inside the resource manager. - container: name used by the container of the resources. - -shared_name and container are required when inheriting from ResourceOpKernel. - -Input - input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest. - -Outputs - data_record_handle: A resource handle to the DataRecordResource containing batch of DataRecords. -)doc"); - -class DecodeBatchPredictionRequest : public OpKernel { - public: - explicit DecodeBatchPredictionRequest(OpKernelConstruction* context) - : OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - - OP_REQUIRES(context, keep_features.size() == keep_codes.size(), - errors::InvalidArgument("keep keys and values must have same size.")); - -#ifdef USE_DENSE_HASH - m_keep_map.set_empty_key(0); -#endif // USE_DENSE_HASH - - for (uint64_t i = 0; i < keep_features.size(); i++) { - m_keep_map[keep_features[i]] = keep_codes[i]; - } - } - - private: - twml::Map m_keep_map; - - void Compute(OpKernelContext* context) override { - try { - DataRecordResource *resource = nullptr; - OP_REQUIRES_OK(context, makeResourceHandle(context, 0, &resource)); - - // Store the input bytes in the resource so it isnt freed before the resource. - // This is necessary because we are not copying the contents for tensors. 
- resource->input = context->input(0); - const uint8_t *input_bytes = resource->input.flat().data(); - twml::DataRecordReader reader; - twml::BatchPredictionRequest bpr; - reader.setKeepMap(&m_keep_map); - reader.setBuffer(input_bytes); - bpr.decode(reader); - - resource->common = std::move(bpr.common()); - resource->records = std::move(bpr.requests()); - - resource->num_weights = 0; - resource->num_labels = 0; - resource->keep_map = &m_keep_map; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("DecodeBatchPredictionRequest").Device(DEVICE_CPU), - DecodeBatchPredictionRequest); diff --git a/twml/libtwml/src/ops/batch_prediction_request.docx b/twml/libtwml/src/ops/batch_prediction_request.docx new file mode 100644 index 000000000..273428ba9 Binary files /dev/null and b/twml/libtwml/src/ops/batch_prediction_request.docx differ diff --git a/twml/libtwml/src/ops/batch_prediction_request_v2.cpp b/twml/libtwml/src/ops/batch_prediction_request_v2.cpp deleted file mode 100644 index 3e89c9a0a..000000000 --- a/twml/libtwml/src/ops/batch_prediction_request_v2.cpp +++ /dev/null @@ -1,224 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -#include - -template -class DecodeBatchPredictionRequestKernel : public OpKernel { - public: - explicit DecodeBatchPredictionRequestKernel(OpKernelConstruction* context) - : OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - - std::vector label_features; - std::vector weight_features; - - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - - OP_REQUIRES_OK(context, context->GetAttr("label_features", &label_features)); - OP_REQUIRES_OK(context, context->GetAttr("weight_features", &weight_features)); - OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode)); - - OP_REQUIRES(context, keep_features.size() == keep_codes.size(), - errors::InvalidArgument("keep keys and values must have same size.")); - -#ifdef USE_DENSE_HASH - m_keep_map.set_empty_key(0); - m_labels_map.set_empty_key(0); - m_weights_map.set_empty_key(0); -#endif // USE_DENSE_HASH - - for (uint64_t i = 0; i < keep_features.size(); i++) { - m_keep_map[keep_features[i]] = keep_codes[i]; - } - - for (uint64_t i = 0; i < label_features.size(); i++) { - m_labels_map[label_features[i]] = i; - } - - for (uint64_t i = 0; i < weight_features.size(); i++) { - m_weights_map[weight_features[i]] = i; - } - } - - protected: - twml::Map m_keep_map; - twml::Map m_labels_map; - twml::Map m_weights_map; - int64 m_decode_mode; - - template - void Decode(OpKernelContext* context, ResourceType *resource) { - resource->input = context->input(0); - const uint8_t *input_bytes = getInputBytes(resource->input, 0); - int num_labels = static_cast(m_labels_map.size()); - int num_weights = static_cast(m_weights_map.size()); - - typename RecordType::Reader reader; - twml::GenericBatchPredictionRequest bpr(num_labels, num_weights); - - reader.setKeepMap(&m_keep_map); - reader.setLabelsMap(&m_labels_map); - reader.setBuffer(input_bytes); - reader.setDecodeMode(m_decode_mode); - // Do not set weight map if it is empty. This will take a faster path. 
- if (num_weights != 0) { - reader.setWeightsMap(&m_weights_map); - } - bpr.decode(reader); - - resource->common = std::move(bpr.common()); - resource->records = std::move(bpr.requests()); - - resource->num_labels = num_labels; - resource->num_weights = num_weights; - } -}; - - -REGISTER_OP("DecodeAndHashBatchPredictionRequestV2") -.Attr("InputType: {uint8, string}") -.Input("input_bytes: InputType") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("label_features: list(int)") -.Attr("weight_features: list(int) = []") -.Attr("decode_mode: int = 0") -.Output("hashed_data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that decodes a list/batch of data records and creates a handle to the batch of hashed data records. - -Compared to DecodeAndHashBatchPredictionRequest, DecodeAndHashBatchPredictionRequestV2 is used for training instead -of serving. Thus label_features and weight_features[optional] must be passed, and labels and weights are extracted in -the output. -DecodeAndHashBatchPredictionRequestV2 controls what DataRecords we want to process together in a batch in training. -For instance, we can put all instances for a query in the same batch when training a ranking model. -Notice that this OP was added separately to make sure we would not break the API for DecodeAndHashBatchPredictionRequest. -It requires some discussions if we merge the two ops into a single .cpp file in a future API revision. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - label_features: list of feature ids representing the labels. - weight_features: list of feature ids representing the weights. Defaults to empty list. - decode_mode: integer, indicates which decoding method to use. Let a sparse continuous - have a feature_name and a dict of {name: value}. 0 indicates feature_ids are computed - as hash(name). 1 indicates feature_ids are computed as hash(feature_name, name) - -Input - input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest. - -Outputs - hashed_data_record_handle: A resource handle to the HashedDataRecordResource containing batch of HashedDataRecords. -)doc"); - -template -class DecodeAndHashBatchPredictionRequestV2 : - public DecodeBatchPredictionRequestKernel { - -public: - DecodeAndHashBatchPredictionRequestV2(OpKernelConstruction *context) - : DecodeBatchPredictionRequestKernel(context) { - } - - private: - void Compute(OpKernelContext* context) override { - try { - HashedDataRecordResource *resource = nullptr; - OP_REQUIRES_OK( - context, - makeResourceHandle(context, 0, &resource)); - - this->Decode(context, resource); - - // Each datarecord has a copy of common features. 
- // Initialize total_size by common_size * num_records - int64 common_size = static_cast(resource->common.totalSize()); - int64 num_records = static_cast(resource->records.size()); - int64 total_size = common_size * num_records; - for (const auto &record : resource->records) { - total_size += static_cast(record.totalSize()); - } - - resource->total_size = total_size; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("DecodeBatchPredictionRequestV2") -.Attr("InputType: {uint8, string}") -.Input("input_bytes: InputType") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("label_features: list(int)") -.Attr("weight_features: list(int) = []") -.Attr("decode_mode: int = 0") -.Output("data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that decodes batch prediction request and creates a handle to the batch of data records. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - shared_name: name used by the resource handle inside the resource manager. - label_features: list of feature ids representing the labels. - weight_features: list of feature ids representing the weights. Defaults to empty list. - decode_mode: reserved, do not use. - -Input - input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest. - -Outputs - data_record_handle: A resource handle to the DataRecordResource containing batch of DataRecords. -)doc"); - - -template -class DecodeBatchPredictionRequestV2 : - public DecodeBatchPredictionRequestKernel { -public: - DecodeBatchPredictionRequestV2(OpKernelConstruction *context) - : DecodeBatchPredictionRequestKernel(context) { - } - -private: - void Compute(OpKernelContext* context) override { - try { - DataRecordResource *resource = nullptr; - OP_REQUIRES_OK( - context, - makeResourceHandle(context, 0, &resource)); - this->Decode(context, resource); - resource->keep_map = &(this->m_keep_map); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -#define REGISTER_DECODE_OPS(InputType) \ - REGISTER_KERNEL_BUILDER( \ - Name("DecodeAndHashBatchPredictionRequestV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("InputType"), \ - DecodeAndHashBatchPredictionRequestV2); \ - REGISTER_KERNEL_BUILDER( \ - Name("DecodeBatchPredictionRequestV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("InputType"), \ - DecodeBatchPredictionRequestV2); \ - -REGISTER_DECODE_OPS(uint8) -REGISTER_DECODE_OPS(string) diff --git a/twml/libtwml/src/ops/batch_prediction_request_v2.docx b/twml/libtwml/src/ops/batch_prediction_request_v2.docx new file mode 100644 index 000000000..4cc88e1e2 Binary files /dev/null and b/twml/libtwml/src/ops/batch_prediction_request_v2.docx differ diff --git a/twml/libtwml/src/ops/batch_prediction_response_writer.cpp b/twml/libtwml/src/ops/batch_prediction_response_writer.cpp deleted file mode 100644 index 4876dd48a..000000000 --- a/twml/libtwml/src/ops/batch_prediction_response_writer.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("BatchPredictionResponseWriter") -.Attr("T: {float, double}") -.Input("keys: int64") -.Input("values: T") -.Output("result: uint8") 
-.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that packages keys and values into a BatchPredictionResponse. - -values: input feature value. (float/double) -keys: feature ids from the original BatchPredictionRequest. (int64) - -Outputs - bytes: output BatchPredictionRequest serialized using Thrift into a uint8 tensor. -)doc"); - -template -class BatchPredictionResponseWriter : public OpKernel { - public: - explicit BatchPredictionResponseWriter(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - const Tensor& keys = context->input(0); - const Tensor& values = context->input(1); - - try { - // Ensure the inner dimension matches. - if (values.dim_size(values.dims() - 1) != keys.dim_size(keys.dims() - 1)) { - throw std::runtime_error("The sizes of keys and values need to match"); - } - - // set inputs as twml::Tensor - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - const twml::Tensor in_values_ = TFTensor_to_twml_tensor(values); - // no tensors in this op - const twml::Tensor dummy_dense_keys_; - const std::vector dummy_dense_values_; - - // call constructor BatchPredictionResponse - twml::BatchPredictionResponse tempResult( - in_keys_, in_values_, dummy_dense_keys_, dummy_dense_values_); - - // determine the length of the result - int len = tempResult.encodedSize(); - TensorShape result_shape = {1, len}; - - // Create an output tensor, the size is determined by the content of input. - Tensor* result = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, result_shape, - &result)); - twml::Tensor out_result = TFTensor_to_twml_tensor(*result); - - // Call writer of BatchPredictionResponse - tempResult.write(out_result); - } catch(const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("BatchPredictionResponseWriter") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - BatchPredictionResponseWriter); \ - -REGISTER(float); -REGISTER(double); diff --git a/twml/libtwml/src/ops/batch_prediction_response_writer.docx b/twml/libtwml/src/ops/batch_prediction_response_writer.docx new file mode 100644 index 000000000..835e7744a Binary files /dev/null and b/twml/libtwml/src/ops/batch_prediction_response_writer.docx differ diff --git a/twml/libtwml/src/ops/batch_prediction_tensor_response_writer.cpp b/twml/libtwml/src/ops/batch_prediction_tensor_response_writer.cpp deleted file mode 100644 index b98d23206..000000000 --- a/twml/libtwml/src/ops/batch_prediction_tensor_response_writer.cpp +++ /dev/null @@ -1,81 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("BatchPredictionTensorResponseWriter") -.Attr("T: list({string, int32, int64, float, double})") -.Input("keys: int64") -.Input("values: T") -.Output("result: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that packages keys and dense tensors into a BatchPredictionResponse. - -values: list of tensors -keys: feature ids from the original BatchPredictionRequest. (int64) - -Outputs - bytes: output BatchPredictionRequest serialized using Thrift into a uint8 tensor. 
-)doc"); - -class BatchPredictionTensorResponseWriter : public OpKernel { - public: - explicit BatchPredictionTensorResponseWriter(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - const Tensor& keys = context->input(0); - - try { - // set keys as twml::Tensor - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - - // check sizes - uint64_t num_keys = in_keys_.getNumElements(); - uint64_t num_values = context->num_inputs() - 1; - - OP_REQUIRES(context, num_values % num_keys == 0, - errors::InvalidArgument("Number of dense tensors not multiple of dense keys")); - - // set dense tensor values - std::vector in_values_; - for (int i = 1; i < context->num_inputs(); i++) { - in_values_.push_back(TFTensor_to_twml_raw_tensor(context->input(i))); - } - - // no continuous predictions in this op, only tensors - const twml::Tensor dummy_cont_keys_; - const twml::Tensor dummy_cont_values_; - - // call constructor BatchPredictionResponse - twml::BatchPredictionResponse tempResult( - dummy_cont_keys_, dummy_cont_values_, in_keys_, in_values_); - - // determine the length of the result - int len = tempResult.encodedSize(); - TensorShape result_shape = {1, len}; - - // Create an output tensor, the size is determined by the content of input. - Tensor* result = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, result_shape, - &result)); - twml::Tensor out_result = TFTensor_to_twml_tensor(*result); - - // Call writer of BatchPredictionResponse - tempResult.write(out_result); - } catch(const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("BatchPredictionTensorResponseWriter").Device(DEVICE_CPU), - BatchPredictionTensorResponseWriter); diff --git a/twml/libtwml/src/ops/batch_prediction_tensor_response_writer.docx b/twml/libtwml/src/ops/batch_prediction_tensor_response_writer.docx new file mode 100644 index 000000000..57bc03ff9 Binary files /dev/null and b/twml/libtwml/src/ops/batch_prediction_tensor_response_writer.docx differ diff --git a/twml/libtwml/src/ops/binary_sparse_dense_matmul.cpp b/twml/libtwml/src/ops/binary_sparse_dense_matmul.cpp deleted file mode 100644 index 0a7f02af3..000000000 --- a/twml/libtwml/src/ops/binary_sparse_dense_matmul.cpp +++ /dev/null @@ -1,330 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// TWML modified to optimize binary features: -// - Sparse tensor values are assumed to be binary, so only add operation is done -// rather than mul-add; -// - In house version of vectorization is used instead of Eigen; -// - Enable sharding and multithreading. 
- -#define EIGEN_USE_THREADS - -#include "binary_sparse_dense_matmul.h" -#include "binary_sparse_dense_matmul_impl.h" - -#include "tensorflow/core/framework/bounds_check.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/shape_inference.h" - -namespace tensorflow { - -namespace shape_inference { -// TODO: The `a_value` is supposed to be all ones. -// Users should not call this op directly but to use it from `sparse_op` python library. -// To make it consistent with original op, the signature remains the same currently, -// we will think a better way to contrain correct use of this op. -// CX-18174 -REGISTER_OP("BinarySparseTensorDenseMatMul") - .Input("a_indices: Tindices") - .Input("a_values: T") - .Input("a_shape: int64") - .Input("b: T") - .Output("product: T") - .Attr("T: type") - .Attr("Tindices: {int32,int64} = DT_INT64") - .Attr("adjoint_a: bool = false") - .Attr("adjoint_b: bool = false") - .SetShapeFn([](InferenceContext* c) { - DimensionHandle unused_dim; - ShapeHandle unused; - ShapeHandle b; - ShapeHandle a_shape; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused)); // a_indices - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused)); // a_values - TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &a_shape)); - TF_RETURN_IF_ERROR(c->WithRank(a_shape, 2, &a_shape)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &b)); - - bool adjoint_a; - bool adjoint_b; - TF_RETURN_IF_ERROR(c->GetAttr("adjoint_a", &adjoint_a)); - TF_RETURN_IF_ERROR(c->GetAttr("adjoint_b", &adjoint_b)); - - DimensionHandle output_right = c->Dim(b, adjoint_b ? 0 : 1); - DimensionHandle output_left = c->Dim(a_shape, adjoint_a ? 1 : 0); - DimensionHandle inner_left = c->Dim(a_shape, adjoint_a ? 0 : 1); - DimensionHandle inner_right = c->Dim(b, adjoint_b ? 1 : 0); - TF_RETURN_IF_ERROR(c->Merge(inner_left, inner_right, &unused_dim)); - c->set_output(0, c->Matrix(output_left, output_right)); - return Status::OK(); - }); -} // namespace shape_inference - - -typedef Eigen::ThreadPoolDevice CPUDevice; - -template -class BinarySparseTensorDenseMatMulOp : public OpKernel { - public: - explicit BinarySparseTensorDenseMatMulOp(OpKernelConstruction* ctx) - : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint_a", &adjoint_a_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint_b", &adjoint_b_)); - } - - void Compute(OpKernelContext* ctx) override { - const Tensor* a_indices; - const Tensor* a_values; - const Tensor* a_shape; - const Tensor* b; - OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices)); - OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values)); - OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape)); - OP_REQUIRES_OK(ctx, ctx->input("b", &b)); - - // Check that the dimensions of the two matrices are valid. 
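The dimension checks that follow reduce to one shape rule: the adjoint flags decide which dimension of each operand is the outer (kept) dimension and which is the inner dimension that must match. A compact sketch of that rule, mirroring the shape function and the Compute checks below:

// Shape rule shared by the shape-inference function and the kernel's checks.
#include <cassert>
#include <cstdint>
#include <utility>

std::pair<int64_t, int64_t> sparse_matmul_output_shape(int64_t a0, int64_t a1,
                                                       int64_t b0, int64_t b1,
                                                       bool adjoint_a, bool adjoint_b) {
  const int64_t outer_left  = adjoint_a ? a1 : a0;
  const int64_t inner_left  = adjoint_a ? a0 : a1;
  const int64_t outer_right = adjoint_b ? b0 : b1;
  const int64_t inner_right = adjoint_b ? b1 : b0;
  assert(inner_left == inner_right && "inner dimensions must match");
  return {outer_left, outer_right};  // product is [outer_left x outer_right]
}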
- OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b->shape()), - errors::InvalidArgument("Tensor 'b' is not a matrix")); - - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()), - errors::InvalidArgument("Tensor 'a_shape' is not a vector")); - - OP_REQUIRES( - ctx, a_shape->NumElements() == 2, - errors::InvalidArgument("Tensor 'a_shape' must have 2 elements")); - - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values->shape()), - errors::InvalidArgument("Tensor 'a_values' is not a vector")); - - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()), - errors::InvalidArgument("Tensor 'a_indices' is not a matrix")); - - const int64 nnz = a_indices->shape().dim_size(0); - OP_REQUIRES(ctx, nnz == a_values->NumElements(), - errors::InvalidArgument("Number of rows of a_indices does not " - "match number of entries in a_values")); - - OP_REQUIRES( - ctx, a_indices->shape().dim_size(1) == a_shape->NumElements(), - errors::InvalidArgument("Number of columns of a_indices does not match " - "number of entries in a_shape")); - - auto a_shape_t = a_shape->vec(); - const int64 outer_left = (adjoint_a_) ? a_shape_t(1) : a_shape_t(0); - const int64 outer_right = - (adjoint_b_) ? b->shape().dim_size(0) : b->shape().dim_size(1); - const int64 inner_left = (adjoint_a_) ? a_shape_t(0) : a_shape_t(1); - const int64 inner_right = - (adjoint_b_) ? b->shape().dim_size(1) : b->shape().dim_size(0); - - OP_REQUIRES( - ctx, inner_right == inner_left, - errors::InvalidArgument( - "Cannot multiply A and B because inner dimension does not match: ", - inner_left, " vs. ", inner_right, - ". Did you forget a transpose? " - "Dimensions of A: [", - a_shape_t(0), ", ", a_shape_t(1), - "). Dimensions of B: ", b->shape().DebugString())); - - TensorShape out_shape({outer_left, outer_right}); - Tensor* out = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); - - if (out->NumElements() == 0) { - // If a has shape [0, x] or b has shape [x, 0], the output shape - // is a 0-element matrix, so there is nothing to do. - return; - } - - if (a_values->NumElements() == 0 || b->NumElements() == 0) { - // If a has shape [x, 0] and b has shape [0, y], the - // output shape is [x, y] where x and y are non-zero, so we fill - // the output with zeros. 
- out->flat().device(ctx->eigen_device()) = - out->flat().constant(T(0)); - return; - } - -#define MAYBE_ADJOINT(ADJ_A, ADJ_B) \ - if (adjoint_a_ == ADJ_A && adjoint_b_ == ADJ_B) { \ - Status functor_status = functor::SparseTensorDenseMatMulFunctor< \ - Device, T, Tindices, ADJ_A, \ - ADJ_B>::Compute(ctx, a_indices, a_values, a_shape, b, out); \ - OP_REQUIRES_OK(ctx, functor_status); \ - } - - MAYBE_ADJOINT(false, false); - MAYBE_ADJOINT(false, true); - MAYBE_ADJOINT(true, false); - MAYBE_ADJOINT(true, true); - -#undef MAYBE_ADJOINT - } - - private: - bool adjoint_a_; - bool adjoint_b_; -}; - -#define REGISTER_CPU(TypeT, TypeIndex) \ - REGISTER_KERNEL_BUILDER( \ - Name("BinarySparseTensorDenseMatMul") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices") \ - .HostMemory("a_shape"), \ - BinarySparseTensorDenseMatMulOp); - -#define REGISTER_KERNELS_CPU(T) \ - REGISTER_CPU(T, int64); \ - REGISTER_CPU(T, int32) - -REGISTER_KERNELS_CPU(float); -REGISTER_KERNELS_CPU(double); -REGISTER_KERNELS_CPU(int32); -REGISTER_KERNELS_CPU(complex64); -REGISTER_KERNELS_CPU(complex128); - -namespace functor { - -namespace { -Status KOutOfBoundsError(int64 k, std::size_t i, int rhs_index_a, - std::size_t lhs_right) { - return errors::InvalidArgument("k (", k, ") from index[", i, ",", rhs_index_a, - "] out of bounds (>=", lhs_right, ")"); -} - -Status MOutOfBoundsError(int64 m, std::size_t i, int lhs_index_a, - int64 out_dim0) { - return errors::InvalidArgument("m (", m, ") from index[", i, ",", lhs_index_a, - "] out of bounds (>=", out_dim0, ")"); -} - -} // namespace - - -// The general functor just borrows the code from tf except that add is computed -// instead of mul-add. -template -struct SparseTensorDenseMatMulFunctor { - // Vectorize certain operations above this size. - static const std::size_t kNumVectorize = 32; - - static Status Compute(OpKernelContext* ctx, - const Tensor *a_indices, - const Tensor *a_values, - const Tensor *a_shape, - const Tensor *b, - Tensor *out) { - return EigenCompute(ctx->eigen_device(), out->matrix(), - a_indices->matrix(), a_values->vec(), - b->matrix()); - } - - static Status EigenCompute(const CPUDevice& d, typename TTypes::Matrix out, - typename TTypes::ConstMatrix a_indices, - typename TTypes::ConstVec a_values, - typename TTypes::ConstMatrix b) { - const std::size_t nnz = a_values.size(); - const std::size_t rhs_right = (ADJ_B ? b.dimension(0) : b.dimension(1)); - const std::size_t lhs_right = (ADJ_B ? b.dimension(1) : b.dimension(0)); - const int lhs_index_a = ADJ_A ? 1 : 0; - const int rhs_index_a = ADJ_A ? 0 : 1; - - out.setZero(); - - if (rhs_right < kNumVectorize) { - // Disable vectorization if the RHS of output is too small - auto maybe_adjoint_b = MaybeAdjoint(b); - - for (std::size_t i = 0; i < nnz; ++i) { - const Tindices m = internal::SubtleMustCopy(a_indices(i, lhs_index_a)); - const Tindices k = internal::SubtleMustCopy(a_indices(i, rhs_index_a)); - if (!FastBoundsCheck(k, lhs_right)) { - return KOutOfBoundsError(k, i, rhs_index_a, lhs_right); - } - if (!FastBoundsCheck(m, out.dimension(0))) { - return MOutOfBoundsError(m, i, lhs_index_a, out.dimension(0)); - } - for (std::size_t n = 0; n < rhs_right; ++n) { - const T b_value = maybe_adjoint_b(k, n); - out(m, n) += b_value; - } - } - } else { - // Vectorization via Eigen. - const int b_chip_index = ADJ_B ? 
1 : 0; - -#define LOOP_NNZ(b_passed) \ - for (std::size_t i = 0; i < nnz; ++i) { \ - const Tindices m = internal::SubtleMustCopy(a_indices(i, lhs_index_a)); \ - const Tindices k = internal::SubtleMustCopy(a_indices(i, rhs_index_a)); \ - if (!FastBoundsCheck(k, lhs_right)) { \ - return KOutOfBoundsError(k, i, rhs_index_a, lhs_right); \ - } \ - if (!FastBoundsCheck(m, out.dimension(0))) { \ - return MOutOfBoundsError(m, i, lhs_index_a, out.dimension(0)); \ - } \ - out.template chip<0>(m) += b_passed.template chip(k); \ - } - - - if (ADJ_B) { - // Perform transpose and conjugation on B once, since we chip out B's - // columns in the nnz loop. - Eigen::array shuffle; // preserve dimension order - shuffle[0] = 1; shuffle[1] = 0; - Eigen::Tensor col_major_conj_b = - b.swap_layout().shuffle(shuffle).conjugate(); - LOOP_NNZ(col_major_conj_b); - } else { - LOOP_NNZ(b); - } -#undef LOOP_NNZ - } - return Status::OK(); - } -}; - - -// We have only specified and optimised the case with no matrix transpose, -// since it is the most typical usage in productions. -template -struct SparseTensorDenseMatMulFunctor { - static Status Compute(OpKernelContext* ctx, - const Tensor *a_indices, - const Tensor *a_values, - const Tensor *a_shape, - const Tensor *b, - Tensor *out) { - auto a_indices_ptr = a_indices->flat().data(); - auto b_ptr = b->flat().data(); - auto out_ptr = out->flat().data(); - const int64 nnz = a_indices->shape().dim_size(0); - const int64 outer_left = a_shape->vec()(0); - const int64 outer_right = b->shape().dim_size(1); - ParallelLookupAndSegmentSum(ctx, a_indices_ptr, b_ptr, nnz, - outer_left, outer_right, out_ptr); - return Status::OK(); - } -}; - -} // namespace functor - -} // namespace tensorflow diff --git a/twml/libtwml/src/ops/binary_sparse_dense_matmul.docx b/twml/libtwml/src/ops/binary_sparse_dense_matmul.docx new file mode 100644 index 000000000..980fd6fb0 Binary files /dev/null and b/twml/libtwml/src/ops/binary_sparse_dense_matmul.docx differ diff --git a/twml/libtwml/src/ops/binary_sparse_dense_matmul.h b/twml/libtwml/src/ops/binary_sparse_dense_matmul.h deleted file mode 100644 index 92494af52..000000000 --- a/twml/libtwml/src/ops/binary_sparse_dense_matmul.h +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -// TWML modified to optimize binary features -#ifndef TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_OP_H_ -#define TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_OP_H_ - -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/errors.h" - -namespace tensorflow { - -namespace functor { - -template -struct SparseTensorDenseMatMulFunctor { - static EIGEN_ALWAYS_INLINE Status Compute( - const Device& d, typename TTypes::Matrix out, - typename TTypes::ConstMatrix a_indices, - typename TTypes::ConstVec a_values, typename TTypes::ConstMatrix b); -}; - -template -class MaybeAdjoint; - -template -class MaybeAdjoint { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaybeAdjoint(MATRIX m) : m_(m) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename MATRIX::Scalar operator()( - const typename MATRIX::Index i, const typename MATRIX::Index j) const { - return m_(i, j); - } - - private: - const MATRIX m_; -}; - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T MaybeConj(T v) { - return v; -} - -template -class MaybeAdjoint { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaybeAdjoint(MATRIX m) : m_(m) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename MATRIX::Scalar operator()( - const typename MATRIX::Index i, const typename MATRIX::Index j) const { - return Eigen::numext::conj(m_(j, i)); - } - - private: - const MATRIX m_; -}; - -} // end namespace functor -} // end namespace tensorflow - -#endif // TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_OP_H_ diff --git a/twml/libtwml/src/ops/binary_sparse_dense_matmul_impl.docx b/twml/libtwml/src/ops/binary_sparse_dense_matmul_impl.docx new file mode 100644 index 000000000..27d2d2fe2 Binary files /dev/null and b/twml/libtwml/src/ops/binary_sparse_dense_matmul_impl.docx differ diff --git a/twml/libtwml/src/ops/binary_sparse_dense_matmul_impl.h b/twml/libtwml/src/ops/binary_sparse_dense_matmul_impl.h deleted file mode 100644 index db61647cb..000000000 --- a/twml/libtwml/src/ops/binary_sparse_dense_matmul_impl.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_IMPL_H_ -#define TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_IMPL_H_ - -#include - -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/lib/core/blocking_counter.h" -#include "tensorflow/core/lib/core/threadpool.h" - -namespace tensorflow { -namespace functor { - -// `ConservativeShard` is adopted rather than `Shard` in tensorflow because the -// original `Shard` may generate number of shards more than the number of -// threads, which is not ideal for this case, as it may cause too much overhead. -static void ConservativeShard(int max_parallelism, thread::ThreadPool *workers, - int64 total, int64 cost_per_unit, - std::function work) { - if (total == 0) { - return; - } - max_parallelism = std::min(max_parallelism, workers->NumThreads()); - if (max_parallelism <= 1) { - // Just inline the whole work since we only have 1 thread (core). - work(0, total); - return; - } - cost_per_unit = std::max(1LL, cost_per_unit); - // We shard [0, total) into "num_shards" shards. - // 1 <= num_shards <= num worker threads - // - // If total * cost_per_unit is small, it is not worth shard too - // much. Let us assume each cost unit is 1ns, kMinCostPerShard=10000 - // is 10us. 
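The sharding heuristic that follows caps the shard count twice: by the number of worker threads and by the total work divided by roughly 10us. A small numeric sketch of that arithmetic, assuming (as the comment above does) that a cost unit is about 1ns; the helper name is hypothetical.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Sketch of the shard-count heuristic used just below: never exceed the
// available parallelism, and never make a shard cheaper than ~10us of work.
std::int64_t NumShards(std::int64_t max_parallelism, std::int64_t total,
                       std::int64_t cost_per_unit_ns) {
  const std::int64_t kMinCostPerShard = 10000;  // ~10us if a cost unit is 1ns
  return std::max<std::int64_t>(
      1, std::min(max_parallelism, total * cost_per_unit_ns / kMinCostPerShard));
}

int main() {
  // 1M one-nanosecond units on 8 threads: capped by parallelism -> 8 shards.
  std::printf("%lld\n", (long long)NumShards(8, 1000000, 1));
  // 1000 one-nanosecond units: too cheap to split -> a single shard.
  std::printf("%lld\n", (long long)NumShards(8, 1000, 1));
  return 0;
}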
- static const int64 kMinCostPerShard = 10000; - const int num_shards = - std::max(1, std::min(static_cast(max_parallelism), - total * cost_per_unit / kMinCostPerShard)); - - // Each shard contains up to "block_size" units. [0, total) is sharded - // into: - // [0, block_size), [block_size, 2*block_size), ... - // The 1st shard is done by the caller thread and the other shards - // are dispatched to the worker threads. The last shard may be smaller than - // block_size. - const int64 block_size = (total + num_shards - 1) / num_shards; - if (block_size >= total) { - work(0, total); - return; - } - const int num_shards_used = (total + block_size - 1) / block_size; - BlockingCounter counter(num_shards_used - 1); - for (int64 start = block_size; start < total; start += block_size) { - auto limit = std::min(start + block_size, total); - workers->Schedule([&work, &counter, start, limit]() { - work(start, limit); // Compute the shard. - counter.DecrementCount(); // The shard is done. - }); - } - - // Inline execute the 1st shard. - work(0, std::min(block_size, total)); - counter.Wait(); -} - -static inline void VectorSum(float *a, const float *b, int n) { - for (int i = 0; i < n; ++i) { - a[i] += b[i]; - } -} - -// This func is to vectorize the computation of segment sum. -template -static void LookupAndSegmentSum(const Tindices *a_indices, const float *b, - int nnz, int outer_right, float *output) { - for (std::size_t i = 0; i < nnz; ++i) { - const Tindices m = a_indices[i * 2]; - const Tindices k = a_indices[i * 2 + 1]; - auto output_row_m = output + m * outer_right; - auto b_row_k = b + k * outer_right; - VectorSum(output_row_m, b_row_k, outer_right); - } -} - -// This func enables sharding and multithreading, it comes with an overhead of -// duplicating output buffer to achieve lock free output. So there should not -// be too many threads. -template -static void ParallelLookupAndSegmentSum(OpKernelContext *ctx, - const Tindices *a_indices, - const float *b, int nnz, int outer_left, - int outer_right, float *output) { - auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); - int out_size = outer_left * outer_right; - if (worker_threads.num_threads <= 1) { - memset(output, 0, out_size * sizeof(float)); - LookupAndSegmentSum(a_indices, b, - nnz, outer_right, - output); - return; - } - - // this is to make buffer align with kAllocatorAlignment - int padded_out_size = (out_size + (Allocator::kAllocatorAlignment - 1)) & - ~(Allocator::kAllocatorAlignment - 1); - std::size_t num_bytes = - (worker_threads.num_threads - 1) * padded_out_size * sizeof(float); - auto buffer = std::unique_ptr(reinterpret_cast( - port::AlignedMalloc(num_bytes, Allocator::kAllocatorAlignment))); - float *temp_out = buffer.get(); - - std::atomic thread_index(0); - - auto task = [&](int64 start, int64 limit) { - int local_thread_index = thread_index++; - float *buf_ptr = nullptr; - if (local_thread_index == 0) { - buf_ptr = output; - } else { - buf_ptr = temp_out + (local_thread_index - 1) * padded_out_size; - } - memset(buf_ptr, 0, out_size * sizeof(float)); - - LookupAndSegmentSum(a_indices + start * 2, b, - limit - start, outer_right, - buf_ptr); - }; - - int cost_per_unit = outer_right; - - // We don't use tensorflow shard func as tf may create more shards than - // number of threads. 
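Each shard ultimately runs the same gather-and-accumulate as LookupAndSegmentSum above: every sparse index pair (m, k), whose value is implicitly 1 for binary features, adds row k of the dense matrix into row m of the output. A self-contained sketch on plain arrays (no TensorFlow types; names are illustrative):

#include <cstdio>

// Sketch: for each nonzero (m, k) of a binary sparse matrix, add row k of the
// dense matrix b into row m of the output. No multiply is needed because the
// sparse values are all 1.
void LookupAndSegmentSumSketch(const int* a_indices, const float* b, int nnz,
                               int outer_right, float* output) {
  for (int i = 0; i < nnz; ++i) {
    const int m = a_indices[i * 2];
    const int k = a_indices[i * 2 + 1];
    for (int j = 0; j < outer_right; ++j)
      output[m * outer_right + j] += b[k * outer_right + j];
  }
}

int main() {
  // A is a binary 2x2 sparse matrix with nonzeros at (0,0), (0,1), (1,1);
  // B is 2x2 dense. Output row 0 is b_row0 + b_row1, output row 1 is b_row1.
  const int a_indices[] = {0, 0, 0, 1, 1, 1};
  const float b[] = {1.f, 2.f, 3.f, 4.f};
  float out[4] = {0.f, 0.f, 0.f, 0.f};
  LookupAndSegmentSumSketch(a_indices, b, /*nnz=*/3, /*outer_right=*/2, out);
  std::printf("%g %g / %g %g\n", out[0], out[1], out[2], out[3]);  // 4 6 / 3 4
  return 0;
}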
- ConservativeShard(worker_threads.num_threads, worker_threads.workers, nnz, - static_cast(cost_per_unit), task); - - for (int i = 1; i < thread_index; ++i) { - VectorSum(output, temp_out + (i - 1) * padded_out_size, out_size); - } -} - -} // namespace functor - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_IMPL_H_ \ No newline at end of file diff --git a/twml/libtwml/src/ops/block_format_dataset.cpp b/twml/libtwml/src/ops/block_format_dataset.cpp deleted file mode 100644 index fdf4a9543..000000000 --- a/twml/libtwml/src/ops/block_format_dataset.cpp +++ /dev/null @@ -1,243 +0,0 @@ -#include "block_format_reader.h" - -#include "tensorflow/core/framework/dataset.h" -#include "tensorflow/core/framework/partial_tensor_shape.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/lib/io/random_inputstream.h" - -#if !defined(DISABLE_ZLIB) -#include "tensorflow/core/lib/io/zlib_inputstream.h" -#endif - -#include - -#include -#include -#include - -using namespace tensorflow; - - -inline std::string stripPath(std::string const &file_name) { - const auto pos = file_name.find_last_of("/"); - if (pos == std::string::npos) return file_name; - return file_name.substr(pos + 1); -} - -inline std::string getExtension(std::string const &file_name) { - const auto stripped_file_name = stripPath(file_name); - const auto pos = stripPath(stripped_file_name).find_last_of("."); - if (pos == std::string::npos) return ""; - return stripped_file_name.substr(pos + 1); -} - -REGISTER_OP("BlockFormatDatasetV2") -.Input("filenames: string") -.Input("compression_type: string") -.Input("buffer_size: int64") -.Output("handle: variant") -.SetIsStateful() -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( - -Creates a dataset for streaming BlockFormat data in compressed (e.g. gzip), uncompressed formats. -This op also has the ability stream a dataset containing files from multiple formats mentioned above. - -filenames: A scalar or vector containing the name(s) of the file(s) to be read. -compression_type: A scalar string denoting the compression type. Can be 'none', 'zlib', 'auto'. -buffer_size: A scalar denoting the buffer size to use during decompression. - -Outputs - handle: A handle to the dataset. This handle is later used to create an iterator to stream the data from the dataset. 
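The "auto" compression mode described in this op's doc relies on the stripPath/getExtension helpers above; a stand-alone sketch of the same extension logic, with a hypothetical helper name:

#include <cassert>
#include <string>

// Sketch of the extension detection used when compression_type == "auto":
// strip the directory part, then take everything after the last '.'.
std::string ExtensionOf(const std::string& file_name) {
  const auto slash = file_name.find_last_of('/');
  const std::string base =
      (slash == std::string::npos) ? file_name : file_name.substr(slash + 1);
  const auto dot = base.find_last_of('.');
  return (dot == std::string::npos) ? "" : base.substr(dot + 1);
}

int main() {
  assert(ExtensionOf("/data/part-00000.gz") == "gz");  // read through zlib/gzip
  assert(ExtensionOf("/data/part-00000") == "");       // read uncompressed
  return 0;
}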
- -)doc"); - - -class BlockFormatDatasetV2 : public DatasetOpKernel { - public: - using DatasetOpKernel::DatasetOpKernel; - - void MakeDataset(OpKernelContext* ctx, DatasetBase **output) override { - const Tensor* filenames_tensor; - OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor)); - OP_REQUIRES( - ctx, filenames_tensor->dims() <= 1, - errors::InvalidArgument("`filenames` must be a scalar or a vector.")); - - const auto filenames_flat = filenames_tensor->flat(); - const int64 num_files = filenames_tensor->NumElements(); - std::vector filenames; - filenames.reserve(num_files); - std::copy(filenames_flat.data(), - filenames_flat.data() + num_files, - std::back_inserter(filenames)); - - string compression_type; - OP_REQUIRES_OK( - ctx, tensorflow::data::ParseScalarArgument( - ctx, "compression_type", &compression_type)); - - int64 buffer_size = -1; - OP_REQUIRES_OK( - ctx, tensorflow::data::ParseScalarArgument( - ctx, "buffer_size", &buffer_size)); - - OP_REQUIRES(ctx, buffer_size >= 0, - errors::InvalidArgument( - "`buffer_size` must be >= 0 (0 == no buffering)")); - - OP_REQUIRES(ctx, - compression_type == "auto" || - compression_type == "gz" || - compression_type == "", - errors::InvalidArgument("Unknown extension: ", compression_type)); - - *output = new Dataset(ctx, std::move(filenames), compression_type, buffer_size); - } - - private: - class Dataset : public DatasetBase { - public: - Dataset(OpKernelContext* ctx, - std::vector filenames, - std::string compression_type, - int64 buffer_size) - : DatasetBase(DatasetContext(ctx)), - compression_type_(compression_type), - buffer_size_(buffer_size), - filenames_(std::move(filenames)) - {} - - const DataTypeVector& output_dtypes() const override { - static DataTypeVector* dtypes = new DataTypeVector({DT_STRING}); - return *dtypes; - } - - const std::vector& output_shapes() const override { - static std::vector* shapes = - new std::vector({{}}); - return *shapes; - } - - string DebugString() const override { return "BlockFormatDatasetV2::Dataset"; } - - protected: - Status AsGraphDefInternal(SerializationContext* ctx, - DatasetGraphDefBuilder* b, - Node** output) const override { - Node* filenames = nullptr; - Node* compression_type = nullptr; - Node* buffer_size = nullptr; - TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames)); - TF_RETURN_IF_ERROR(b->AddScalar(compression_type_, &compression_type)); - TF_RETURN_IF_ERROR( - b->AddScalar(buffer_size_, &buffer_size)); - TF_RETURN_IF_ERROR(b->AddDataset( - this, {filenames, compression_type, buffer_size}, output)); - return Status::OK(); - } - - private: - std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { - return std::unique_ptr( - new Iterator({this, strings::StrCat(prefix, "::BlockFormat")})); - } - - class Iterator : public DatasetIterator { - public: - explicit Iterator(const Params ¶ms) - : DatasetIterator(params) {} - - Status GetNextInternal(IteratorContext* ctx, - std::vector* out_tensors, - bool* end_of_sequence) override { - mutex_lock l(mu_); - do { - // We are currently processing a file, so try to read the next record. - if (reader_) { - Tensor result_tensor(cpu_allocator(), DT_STRING, {}); - Status s = reader_->ReadNext(&result_tensor.scalar()()); - if (s.ok()) { - out_tensors->emplace_back(std::move(result_tensor)); - *end_of_sequence = false; - return Status::OK(); - } else if (!errors::IsOutOfRange(s)) { - return s; - } - - // We have reached the end of the current file, so maybe - // move on to next file. 
- reader_.reset(); - ++current_file_index_; - } - - // Iteration ends when there are no more files to process. - if (current_file_index_ == dataset()->filenames_.size()) { - *end_of_sequence = true; - return Status::OK(); - } - - // Actually move on to next file. - const string& next_filename = - dataset()->filenames_[current_file_index_]; - - auto compression_type = dataset()->compression_type_; - int64 buffer_size = dataset()->buffer_size_; - - if (compression_type == "auto") { - compression_type = getExtension(next_filename); - } - - if (compression_type != "gz" && compression_type != "") { - return errors::InvalidArgument("Unknown extension: ", compression_type); - } - - tensorflow::Env* env = tensorflow::Env::Default(); - TF_CHECK_OK(env->NewRandomAccessFile(next_filename, &file_)); - - // RandomAccessInputstream defaults the second param to "false". - // The second parameter "false" is the key issue. - // "false" assumes the ownership of the file is elsewhere. - // But making that "true" causes segfaults down the line. - // So keep the ownership of "file_" in this class and clean up properly. - file_stream_.reset(new tensorflow::io::RandomAccessInputStream(file_.get(), false)); - - if (compression_type == "gz") { - // unpack_stream does not take ownership of file_stream_ -#if !defined(DISABLE_ZLIB) - unpack_stream_.reset(new tensorflow::io::ZlibInputStream( - file_stream_.get(), - buffer_size, - buffer_size, - tensorflow::io::ZlibCompressionOptions::GZIP())); - reader_.reset(new BlockFormatReader(unpack_stream_.get())); -#else - return errors::InvalidArgument("libtwml compiled without zlib support"); -#endif - } else { - unpack_stream_.reset(nullptr); - reader_.reset(new BlockFormatReader(file_stream_.get())); - } - } while (true); - } - - private: - mutex mu_; - uint64_t current_file_index_ GUARDED_BY(mu_) = 0; - std::unique_ptr file_; - std::unique_ptr file_stream_; - std::unique_ptr unpack_stream_; - std::unique_ptr reader_ GUARDED_BY(mu_); - }; - - const std::string compression_type_; - const int64 buffer_size_; - const std::vector filenames_; - }; -}; - -REGISTER_KERNEL_BUILDER( - Name("BlockFormatDatasetV2") - .Device(DEVICE_CPU), - BlockFormatDatasetV2); diff --git a/twml/libtwml/src/ops/block_format_dataset.docx b/twml/libtwml/src/ops/block_format_dataset.docx new file mode 100644 index 000000000..97930b494 Binary files /dev/null and b/twml/libtwml/src/ops/block_format_dataset.docx differ diff --git a/twml/libtwml/src/ops/block_format_reader.docx b/twml/libtwml/src/ops/block_format_reader.docx new file mode 100644 index 000000000..42118df2c Binary files /dev/null and b/twml/libtwml/src/ops/block_format_reader.docx differ diff --git a/twml/libtwml/src/ops/block_format_reader.h b/twml/libtwml/src/ops/block_format_reader.h deleted file mode 100644 index 29450cc03..000000000 --- a/twml/libtwml/src/ops/block_format_reader.h +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/lib/io/random_inputstream.h" - -#include - -#include - -using tensorflow::int64; -using tensorflow::Status; -using std::string; - -class BlockFormatReader : twml::BlockFormatReader { - public: - explicit BlockFormatReader(tensorflow::io::InputStreamInterface *stream) - : twml::BlockFormatReader() , stream_(stream) { - } - - // Read the next 
record. - // Returns OK on success, - // Returns OUT_OF_RANGE for end of file, or something else for an error. - Status ReadNext(string* record) { - if (this->next()) { - return stream_->ReadNBytes(this->current_size(), record); - } - return tensorflow::errors::OutOfRange("eof"); - } - - uint64_t read_bytes(void *dest, int size, int count) { - uint64_t bytesToRead = size * count; - std::string current; - // TODO: Try to merge ReadNBytes and the memcpy below - // ReadNBytes performs a memory copy already. - Status status = stream_->ReadNBytes(bytesToRead, ¤t); - if (!status.ok()) { - return 0; - } - memcpy(dest, current.c_str(), bytesToRead); - return count; - } - - private: - tensorflow::io::InputStreamInterface *stream_; - TF_DISALLOW_COPY_AND_ASSIGN(BlockFormatReader); -}; diff --git a/twml/libtwml/src/ops/compress_sample_ids.cpp b/twml/libtwml/src/ops/compress_sample_ids.cpp deleted file mode 100644 index 3053de471..000000000 --- a/twml/libtwml/src/ops/compress_sample_ids.cpp +++ /dev/null @@ -1,138 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include // std::fill_n - -using namespace tensorflow; - -REGISTER_OP("CompressSampleIds") -.Attr("T: {int32}") -.Input("input: T") -.Output("output: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->Vector(c->kUnknownDim)); - return Status::OK(); - }); - - -template -class CompressSampleIds : public OpKernel { - public: - explicit CompressSampleIds(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - const int N = input.size(); - - // Check for improper input - bool error = (N > 0 && input(0) < 0); - for (int i = 1; !error && i < N; i++) { - error = input(i - 1) > input(i); - } - - OP_REQUIRES( - context, !error, - errors::InvalidArgument( - "Error in CompressSampleIds. 
SampleIds must be non-negative and non-decreasing" - ) - ); - - // choose output size, either last input element + 1, or 0 - int output_size = 0; - if (N > 0) { - output_size = input(N - 1) + 1; - } - - // Create an output tensor - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output(0, TensorShape({output_size}), &output_tensor) - ); - auto output_flat = output_tensor->flat(); - - // Zero-initialize output - for (int i = 0; i < output_size; i++) { - output_flat(i) = 0; - } - - // count how many of each input element - for (int i = 0; i < N; i++) { - output_flat(input(i)) ++; - } - } -}; - -REGISTER_OP("DecompressSampleIds") -.Attr("T: {int32}") -.Input("input: T") -.Output("output: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->Vector(c->kUnknownDim)); - return Status::OK(); - }); - - -template -class DecompressSampleIds : public OpKernel { - public: - explicit DecompressSampleIds(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - const int N = input.size(); - - // Check for improper input - bool error = false; - int output_size = 0; - for (int i = 0; !error && i < N; i++) { - error = input(i) < 0; - output_size += input(i); - } - - OP_REQUIRES( - context, !error, - errors::InvalidArgument( - "Error in DecompressSampleIds. Inputs must be non-negative." - ) - ); - - // Create an output tensor - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output(0, TensorShape({output_size}),&output_tensor) - ); - auto output_flat = output_tensor->flat(); - - T *output_data = output_flat.data(); - for (int current_sample = 0; current_sample < N; current_sample++) { - std::fill_n(output_data, input(current_sample), current_sample); - output_data += input(current_sample); - } - } -}; - - - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("CompressSampleIds") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - CompressSampleIds); \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("DecompressSampleIds") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - DecompressSampleIds); \ - \ - -REGISTER(int32); diff --git a/twml/libtwml/src/ops/compress_sample_ids.docx b/twml/libtwml/src/ops/compress_sample_ids.docx new file mode 100644 index 000000000..c887e58ab Binary files /dev/null and b/twml/libtwml/src/ops/compress_sample_ids.docx differ diff --git a/twml/libtwml/src/ops/contrib/get_substrings.cpp b/twml/libtwml/src/ops/contrib/get_substrings.cpp deleted file mode 100644 index 8cd167e65..000000000 --- a/twml/libtwml/src/ops/contrib/get_substrings.cpp +++ /dev/null @@ -1,116 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "../tensorflow_utils.h" -#include "../resource_utils.h" - -#include -#include - -using std::string; - -void join(const std::set& v, char c, string& s) { - s.clear(); - std::set::iterator it = v.begin(); - while (it != v.end()) { - s += *it; - it++; - if (it != v.end()) s+= c; - } -} - -// cpp function that computes substrings of a given word -std::string computeSubwords(std::string word, int32_t minn, int32_t maxn) { - std::string word2 = "<" + word + ">"; - std::set ngrams; - std::string s; - ngrams.insert(word); - ngrams.insert(word2); - for (size_t i = 0; i < 
word2.size(); i++) { - if ((word2[i] & 0xC0) == 0x80) continue; - for (size_t j = minn; i+j <= word2.size() && j <= maxn; j++) { - ngrams.insert(word2.substr(i, j)); - } - } - join(ngrams, ';', s); - ngrams.clear(); - return s; -} - -// tf-op function that computes substrings for a given tensor of words -template< typename ValueType> - -void ComputeSubStringsTensor(OpKernelContext *context, int32 min_n, int32 max_n) { - try { - const Tensor& values = context->input(0); - - auto values_flat = values.flat(); - - // batch_size from input_size : - const int batch_size = values_flat.size(); - - // define the output tensor - Tensor* substrings = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, values.shape(), &substrings)); - - auto substrings_flat = substrings->flat(); - // compute substrings for the given tensor values - for (int64 i = 0; i < batch_size; i++) { - substrings_flat(i) = computeSubwords(values_flat(i), min_n, max_n); - } - } - catch (const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } -} - -REGISTER_OP("GetSubstrings") -.Attr("ValueType: {string}") -.Attr("min_n: int") -.Attr("max_n: int") -.Input("values: ValueType") -.Output("substrings: ValueType") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP to convert word to substrings of length between min_n and max_n. - -Attr - min_n,max_n: The size of the substrings. - -Input - values: 1D input tensor containing the values. - -Outputs - substrings: A string tensor where substrings are joined by ";". -)doc"); - -template -class GetSubstrings : public OpKernel { - public: - explicit GetSubstrings(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("min_n", &min_n)); - OP_REQUIRES_OK(context, context->GetAttr("max_n", &max_n)); - } - - private: - int32 min_n; - int32 max_n; - void Compute(OpKernelContext *context) override { - ComputeSubStringsTensor(context, min_n, max_n); - } -}; - - -#define REGISTER_SUBSTRINGS(ValueType) \ - REGISTER_KERNEL_BUILDER( \ - Name("GetSubstrings") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("ValueType"), \ - GetSubstrings); \ - -REGISTER_SUBSTRINGS(string) diff --git a/twml/libtwml/src/ops/contrib/get_substrings.docx b/twml/libtwml/src/ops/contrib/get_substrings.docx new file mode 100644 index 000000000..0f5272764 Binary files /dev/null and b/twml/libtwml/src/ops/contrib/get_substrings.docx differ diff --git a/twml/libtwml/src/ops/data_record.cpp b/twml/libtwml/src/ops/data_record.cpp deleted file mode 100644 index 71ea72ac4..000000000 --- a/twml/libtwml/src/ops/data_record.cpp +++ /dev/null @@ -1,1891 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -#include - -using std::string; - -REGISTER_OP("DecodeDataRecord") -.Attr("InputType: {uint8, string}") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("label_features: list(int)") -.Attr("weight_features: list(int) = []") -.Input("input_bytes: InputType") -.Output("data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that creates a handle for the datarecord. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. 
- label_features: list of feature ids representing the labels. - weight_features: list of feature ids representing the weights. Defaults to empty list. - shared_name: name used by the resource handle inside the resource manager. - container: name used by the container of the resources. - -shared_name and container are required when inheriting from ResourceOpKernel. - -Input - input_bytes: Input tensor containing the serialized batch of HashedDataRecords. - -Outputs - data_record_handle: A resource handle to the DataRecord struct. -)doc"); - -template -class DecodeDataRecord : public OpKernel { - public: - explicit DecodeDataRecord(OpKernelConstruction* context) - : OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - - std::vector label_features; - std::vector weight_features; - - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - OP_REQUIRES_OK(context, context->GetAttr("label_features", &label_features)); - OP_REQUIRES_OK(context, context->GetAttr("weight_features", &weight_features)); - - OP_REQUIRES(context, keep_features.size() == keep_codes.size(), - errors::InvalidArgument("keep keys and values must have same size.")); - -#ifdef USE_DENSE_HASH - m_keep_map.set_empty_key(0); - m_labels_map.set_empty_key(0); - m_weights_map.set_empty_key(0); -#endif // USE_DENSE_HASH - - for (uint64_t i = 0; i < keep_features.size(); i++) { - m_keep_map[keep_features[i]] = keep_codes[i]; - } - - for (uint64_t i = 0; i < label_features.size(); i++) { - m_labels_map[label_features[i]] = i; - } - - for (uint64_t i = 0; i < weight_features.size(); i++) { - m_weights_map[weight_features[i]] = i; - } - } - - private: - twml::Map m_keep_map; - twml::Map m_labels_map; - twml::Map m_weights_map; - - void Compute(OpKernelContext* context) override { - try { - DataRecordResource *resource = nullptr; - OP_REQUIRES_OK(context, makeResourceHandle(context, 0, &resource)); - - // Store the input bytes in the resource so it isnt freed before the resource. - // This is necessary because we are not copying the contents for tensors. - resource->input = context->input(0); - int batch_size = getBatchSize(resource->input); - int num_labels = static_cast(m_labels_map.size()); - int num_weights = static_cast(m_weights_map.size()); - - twml::DataRecordReader reader; - reader.setKeepMap(&m_keep_map); - reader.setLabelsMap(&m_labels_map); - - // Do not set weight map if it is empty. This will take a faster path. - if (num_weights != 0) { - reader.setWeightsMap(&m_weights_map); - } - - resource->records.clear(); - resource->records.reserve(batch_size); - for (int i = 0; i < batch_size; i++) { - resource->records.emplace_back(num_labels, num_weights); - } - - for (int64 id = 0; id < batch_size; id++) { - const uint8_t *input_bytes = getInputBytes(resource->input, id); - reader.setBuffer(input_bytes); - // decode the reader - resource->records[id].decode(reader); - } - // This should be fine because m_keep_map should never go out of scope. 
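Every extraction op later in this file applies the same membership test against keep_map before emitting a feature. A tiny sketch of that filter, with std::unordered_map standing in for twml::Map (an assumption on my part, since the map type is defined elsewhere):

#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

int main() {
  // keep_features / keep_codes arrive as two parallel attrs and are zipped
  // into a single lookup table, as in the DecodeDataRecord constructor.
  const std::vector<std::int64_t> keep_features = {1001, 1002};
  const std::vector<std::int64_t> keep_codes = {0, 1};
  std::unordered_map<std::int64_t, std::int64_t> keep_map;
  for (std::size_t i = 0; i < keep_features.size(); ++i)
    keep_map[keep_features[i]] = keep_codes[i];

  // Feature ids missing from keep_map are silently dropped, which is exactly
  // what the `find(...) == end()` checks in the extraction ops below do.
  for (std::int64_t feature_id : std::vector<std::int64_t>{1001, 1002, 9999}) {
    auto it = keep_map.find(feature_id);
    if (it == keep_map.end()) continue;
    std::printf("kept feature %lld -> code %lld\n",
                (long long)feature_id, (long long)it->second);
  }
  return 0;
}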
- resource->keep_map = &m_keep_map; - resource->num_weights = num_weights; - resource->num_labels = num_labels; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -int64_t count_if_exists(const twml::DataRecord::BinaryFeatures &set, - const twml::Map *const keep_map) { - int64_t count = 0; - for (const auto &key : set) { - if (keep_map->find(key) == keep_map->end()) continue; - count++; - } - return count; -} - -// This works for continuous, discrete, and string features -template -int64_t count_if_exists(const twml::Map &map, - const twml::Map *const keep_map) { - int64_t count = 0; - for (const auto &elem : map) { - if (keep_map->find(elem.first) == keep_map->end()) continue; - count++; - } - return count; -} - -int64_t count_if_exists(const twml::DataRecord::SparseBinaryFeatures &map, - const twml::Map *const keep_map) { - int64_t count = 0; - for (const auto &elem : map) { - if (keep_map->find(elem.first) == keep_map->end()) continue; - count += elem.second.size(); - } - return count; -} - -int64_t count_if_exists(const twml::DataRecord::SparseContinuousFeatures &map, - const twml::Map *const keep_map) { - int64_t count = 0; - for (const auto &elem : map) { - if (keep_map->find(elem.first) == keep_map->end()) continue; - count += elem.second.size(); - } - return count; -} - -REGISTER_OP("GetBinaryFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads binary features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - values: always set to 1 (float) -)doc"); - -class GetBinaryFeatures : public OpKernel { - public: - explicit GetBinaryFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_binary_size = count_if_exists(common.getBinary(), handle->keep_map); - int64 total_binary_size = records.size() * common_binary_size; - for (int id = 0; id < records.size(); id++) { - total_binary_size += count_if_exists(handle->records[id].getBinary(), handle->keep_map); - } - const int total_size = static_cast(total_binary_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* ids = nullptr; - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto ids_flat = ids->flat(); - auto values_flat = values->flat(); - - for (int64 id = 0; id < records.size(); id++) { - for (const auto &it : common.getBinary()) { - if (handle->keep_map->find(it) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it; - offset++; - } - for (const auto &it : records[id].getBinary()) { - if (handle->keep_map->find(it) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it; - offset++; - } - } - // All the values for binary features are 1. 
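GetBinaryFeatures below flattens the whole batch into three parallel vectors, the record index, the feature id, and a constant 1.0, which downstream code can treat as COO-style sparse input. A small worked sketch of that layout with plain vectors instead of TF tensors:

#include <cstdio>
#include <set>
#include <vector>

int main() {
  // Two records' binary features (already filtered by keep_map).
  const std::vector<std::set<long long>> records = {{10, 42}, {42}};

  std::vector<long long> ids, keys;
  std::vector<float> values;
  for (std::size_t id = 0; id < records.size(); ++id) {
    for (long long key : records[id]) {
      ids.push_back(static_cast<long long>(id));  // which record in the batch
      keys.push_back(key);                        // the feature id
      values.push_back(1.0f);                     // binary features are always 1
    }
  }
  for (std::size_t i = 0; i < ids.size(); ++i)
    std::printf("(%lld, %lld, %g)\n", ids[i], keys[i], values[i]);
  // (0, 10, 1) (0, 42, 1) (1, 42, 1)
  return 0;
}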
- std::fill(values_flat.data(), values_flat.data() + total_size, 1); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetContinuousFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads continuous features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: Datarecord keys (int64) - values: Datarecord values(float) -)doc"); - -class GetContinuousFeatures : public OpKernel { - public: - explicit GetContinuousFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_continuous_size = count_if_exists(common.getContinuous(), handle->keep_map); - int64 total_continuous_size = records.size() * common_continuous_size; - for (int id = 0; id < records.size(); id++) { - total_continuous_size += count_if_exists(handle->records[id].getContinuous(), - handle->keep_map); - } - const int total_size = static_cast(total_continuous_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* values = nullptr; - Tensor* ids = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto values_flat = values->flat(); - auto ids_flat = ids->flat(); - - for (int64 id = 0; id < records.size(); id++) { - for (const auto &it : common.getContinuous()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - values_flat(offset) = it.second; - offset++; - } - for (const auto &it : records[id].getContinuous()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - values_flat(offset) = it.second; - offset++; - } - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetDiscreteFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads discrete features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - values: DataRecord values(int64) -)doc"); - -class GetDiscreteFeatures : public OpKernel { - public: - explicit GetDiscreteFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_discrete_size = count_if_exists(common.getDiscrete(), handle->keep_map); - int64 total_discrete_size = records.size() * common_discrete_size; - for (int id 
= 0; id < records.size(); id++) { - total_discrete_size += count_if_exists(handle->records[id].getDiscrete(), - handle->keep_map); - } - const int total_size = static_cast(total_discrete_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* values = nullptr; - Tensor* ids = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto values_flat = values->flat(); - auto ids_flat = ids->flat(); - - for (int64 id = 0; id < records.size(); id++) { - for (const auto &it : common.getDiscrete()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - values_flat(offset) = it.second; - offset++; - } - for (const auto &it : records[id].getDiscrete()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - values_flat(offset) = it.second; - offset++; - } - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetStringFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("names: string") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads string features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - names: DataRecord values(string) - values: always set to 1 (float) -)doc"); - -class GetStringFeatures : public OpKernel { - public: - explicit GetStringFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_string_size = count_if_exists(common.getString(), handle->keep_map); - int64 total_string_size = records.size() * common_string_size; - for (int id = 0; id < records.size(); id++) { - total_string_size += count_if_exists(handle->records[id].getString(), - handle->keep_map); - } - const int total_size = static_cast(total_string_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* names = nullptr; - Tensor* ids = nullptr; - Tensor*values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names)); - OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto names_flat = names->flat(); - auto ids_flat = ids->flat(); - auto values_flat = values->flat(); - - std::fill(values_flat.data(), values_flat.data() + total_size, 1); - for (int64 id = 0; id < records.size(); id++) { - for (const auto &it : common.getString()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it.second; - offset++; - } - for (const auto &it : records[id].getString()) { - if 
(handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it.second; - offset++; - } - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetSparseBinaryFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("names: string") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads sparse binary features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - names: DataRecord values(string) - values: always set to 1 (float) -)doc"); - -class GetSparseBinaryFeatures : public OpKernel { - public: - explicit GetSparseBinaryFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_sparse_binary_size = count_if_exists(common.getSparseBinary(), handle->keep_map); - int64 total_sparse_binary_size = records.size() * common_sparse_binary_size; - for (int id = 0; id < records.size(); id++) { - total_sparse_binary_size += count_if_exists(handle->records[id].getSparseBinary(), - handle->keep_map); - } - const int total_size = static_cast(total_sparse_binary_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* names = nullptr; - Tensor* ids = nullptr; - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names)); - OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto names_flat = names->flat(); - auto ids_flat = ids->flat(); - auto values_flat = values->flat(); - - // All the values for sparse binary features are 1. 
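Sparse binary features add one more level of nesting: each kept feature id expands into one output row per string name it carries, again with a constant 1.0 value. A compact sketch using plain standard containers and illustrative data:

#include <cstdio>
#include <map>
#include <set>
#include <string>

int main() {
  // One record's sparse binary features: feature id -> set of names.
  const std::map<long long, std::set<std::string>> sparse = {
      {7, {"en", "fr"}}, {9, {"ios"}}};
  for (const auto& feature : sparse) {
    for (const auto& name : feature.second)
      std::printf("(id=0, key=%lld, name=%s, value=1)\n",
                  feature.first, name.c_str());
  }
  // (id=0, key=7, name=en, ...) (id=0, key=7, name=fr, ...) (id=0, key=9, name=ios, ...)
  return 0;
}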
- std::fill(values_flat.data(), values_flat.data() + total_size, 1); - for (int64 id = 0; id < records.size(); id++) { - for (const auto &it : common.getSparseBinary()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - for (const auto &it_inner : it.second) { - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it_inner; - offset++; - } - } - for (const auto &it : records[id].getSparseBinary()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - for (const auto &it_inner : it.second) { - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it_inner; - offset++; - } - } - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetSparseContinuousFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: float") -.Output("names: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads sparse continuous features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - values: DataRecord values(float) - names: DataRecord values(string) -)doc"); - -class GetSparseContinuousFeatures : public OpKernel { - public: - explicit GetSparseContinuousFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_sparse_continuous_size = count_if_exists(common.getSparseContinuous(), - handle->keep_map); - int64 total_sparse_continuous_size = records.size() * common_sparse_continuous_size; - for (int id = 0; id < records.size(); id++) { - total_sparse_continuous_size += count_if_exists(handle->records[id].getSparseContinuous(), - handle->keep_map); - } - const int total_size = static_cast(total_sparse_continuous_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* values = nullptr; - Tensor* names = nullptr; - Tensor* ids = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values)); - OP_REQUIRES_OK(context, context->allocate_output(3, shape, &names)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto values_flat = values->flat(); - auto names_flat = names->flat(); - auto ids_flat = ids->flat(); - - for (int64 id = 0; id < records.size(); id++) { - // copying the contents of the maps of maps - for (const auto &it : common.getSparseContinuous()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - // for each id; iterate through the number of maps corresponding to that id - for (const auto &it_inner : it.second) { - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it_inner.first; - values_flat(offset) = it_inner.second; - offset++; - } - } - // copying the contents of the maps of maps - for (const auto &it : records[id].getSparseContinuous()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - // for each id; iterate through the number of maps corresponding to that id - for 
(const auto &it_inner : it.second) { - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it_inner.first; - values_flat(offset) = it_inner.second; - offset++; - } - } - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetBatchSizeFromDataRecord") -.Input("data_record_handle: resource") -.Output("batch_size: int64") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that returns batch size from the data record. -Input - data_record_handle: Resource handle to DataRecord - -Outputs - batch_size: Number of records held in the handle. -)doc"); - -class GetBatchSizeFromDataRecord : public OpKernel { - public: - explicit GetBatchSizeFromDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - Tensor *output; - OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output)); - output->scalar()() = handle->records.size(); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetLabelsFromDataRecord") -.Input("data_record_handle: resource") -.Output("labels: float") -.Attr("default_label: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns labels from the data record. - -Attr - default_label: The value used when a label is absent in a data record. - -Input - data_record_handle: Resource handle to DataRecord - -Outputs - labels: A 2D tensor of size [batch_size, num_labels] containing the label values. -)doc"); - -class GetLabelsFromDataRecord : public OpKernel { - private: - float default_label; - - public: - explicit GetLabelsFromDataRecord(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("default_label", &default_label)); - } - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const int num_labels = static_cast(handle->num_labels); - TensorShape shape = {static_cast(handle->records.size()), num_labels}; - - Tensor *labels; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &labels)); - - // The default value of label is not present in data record is std::nanf - // For continuous labels, change that to a default_label or label. - auto func = [this](float label) -> float { - return std::isnan(label) ? default_label : label; - }; - - auto labels_data = labels->flat().data(); - for (const auto &record : records) { - const auto& rec_labels = record.labels(); - labels_data = std::transform(rec_labels.begin(), rec_labels.end(), labels_data, func); - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetWeightsFromDataRecord") -.Input("data_record_handle: resource") -.Output("weights: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns weights from the data record. -Input - data_record_handle: Resource handle to DataRecord - -Outputs - weights: A 2D tensor of size [batch_size, num_weights] containing the weight values. 
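One convention above is worth spelling out: a label absent from a record is stored as NaN, and GetLabelsFromDataRecord rewrites it to default_label on the way out. A minimal sketch of that substitution:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const float default_label = 0.0f;
  // One record's labels, with the second label absent (stored as NaN).
  std::vector<float> labels = {1.0f, std::nanf("")};
  std::transform(labels.begin(), labels.end(), labels.begin(),
                 [&](float label) { return std::isnan(label) ? default_label : label; });
  std::printf("%g %g\n", labels[0], labels[1]);  // 1 0
  return 0;
}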
-)doc"); - -class GetWeightsFromDataRecord : public OpKernel { - public: - explicit GetWeightsFromDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const int num_weights = static_cast(handle->num_weights); - TensorShape shape = {static_cast(handle->records.size()), num_weights}; - - Tensor *weights; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &weights)); - - auto weights_data = weights->flat().data(); - for (const auto &record : records) { - const auto& rec_weights = record.weights(); - weights_data = std::copy(rec_weights.begin(), rec_weights.end(), weights_data); - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -template -void SetValueGroup( -const FeatureType& type, -const int64& feature_id, -const int64& id, -const ValueType& default_value, -TensorType values_flat) { - auto it = type.find(feature_id); - values_flat(id) = (it == type.end()) ? default_value : it->second; -} - -template -// overloading for BinaryFeatures; as it needs to set a value of 1 -void SetValueGroup( -const twml::DataRecord::BinaryFeatures& type, -const int64& feature_id, -const int64& id, -const ValueType& default_value, -TensorType values_flat) { - auto it = type.find(feature_id); - values_flat(id) = (it == type.end()) ? default_value : 1; -} - -// Helper for Group Extraction of Dense Features -template -void ComputeHelperGroupFeaturesAsTensors( -OpKernelContext* context, -const std::vector& feature_ids, -ValueType& default_value, -std::function f) { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - // Output shape is 2D; where the first dimension corresponds to the batch_size - // and the second corresponds to the number of features passed to the TF Op. - const int batch_size = static_cast(handle->records.size()); - const int num_feature_ids = static_cast(feature_ids.size()); - TensorShape shape = {batch_size, num_feature_ids}; - - // Define the output - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &values)); - auto values_flat = values->flat(); - - for (int64 id = 0; id < records.size(); id++) { - const auto &type = f(records[id]); - const auto id_offset = id * feature_ids.size(); - for (int64 fid = 0; fid < feature_ids.size(); fid++) { - auto feature_id = feature_ids[fid]; - // The value is set to default if it does not exist in the current DataRecord - SetValueGroup(type, feature_id, id_offset + fid, default_value, values_flat); - } - } -} - -// Helper for Single Extraction of Dense Features -template -void ComputeHelperFeaturesAsTensors( -OpKernelContext* context, -ValueType& default_value, -int64 feature_id, -std::function f) { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - // Output shape is 2D; where the first dimension corresponds to the batch_size - // and the second corresponds to the number of features passed to the TF Op. 
- const int total_size = static_cast(handle->records.size()); - TensorShape shape = {total_size}; - - // Define the output - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &values)); - auto values_flat = values->flat(); - for (int64 id = 0; id < records.size(); id++) { - const auto &type = f(records[id]); - SetValueGroup(type, feature_id, id, default_value, values_flat); - } -} - -REGISTER_OP("GetBinaryAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Attr("default_value: float") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. -Outputs - values: A Tensor corresponding to the value of the feature_id across multiple DataRecords -)doc"); - -class GetBinaryAsTensor : public OpKernel { - private: - int64 feature_id; - float default_value; - - public: - explicit GetBinaryAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::BinaryFeatures& { return record.getBinary(); }; - ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetContinuousAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Attr("default_value: float") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. 
-Outputs - values: A Tensor corresponding to the value of the feature_id across multiple DataRecords -)doc"); - -class GetContinuousAsTensor : public OpKernel { - private: - int64 feature_id; - float default_value; - - public: - explicit GetContinuousAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::ContinuousFeatures& { return record.getContinuous(); }; - ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetDiscreteAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Attr("default_value: int") -.Output("values: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. -Outputs - values: A Tensor corresponding to the value of the feature_id across multiple DataRecords -)doc"); - -class GetDiscreteAsTensor : public OpKernel { - private: - int64 feature_id; - int64 default_value; - - public: - explicit GetDiscreteAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::DiscreteFeatures& { return record.getDiscrete(); }; - ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetStringAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Attr("default_value: string") -.Output("names: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. 
-Outputs - names: A Tensor corresponding to the value of the feature_id across multiple DataRecords -)doc"); - -class GetStringAsTensor : public OpKernel { - private: - int64 feature_id; - string default_value; - - public: - explicit GetStringAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::StringFeatures& { return record.getString(); }; - ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - - -REGISTER_OP("GetBinaryGroupAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_ids: list(int)") -.Attr("default_value: float") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_ids: List of ids representing the features whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. -Outputs - values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords -)doc"); - - -class GetBinaryGroupAsTensor : public OpKernel { - private: - float default_value; - std::vector feature_ids; - - public: - explicit GetBinaryGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::BinaryFeatures& { return record.getBinary(); }; - ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - - -REGISTER_OP("GetContinuousGroupAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_ids: list(int)") -.Attr("default_value: float") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_ids: List of ids representing the features whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. 
-Outputs - values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords -)doc"); - -class GetContinuousGroupAsTensor : public OpKernel { - private: - float default_value; - std::vector feature_ids; - - public: - explicit GetContinuousGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::ContinuousFeatures& { return record.getContinuous(); }; - ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetDiscreteGroupAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_ids: list(int)") -.Attr("default_value: int") -.Output("values: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_ids: List of ids representing the features whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. -Outputs - values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords -)doc"); - -class GetDiscreteGroupAsTensor : public OpKernel { - private: - std::vector feature_ids; - int64 default_value; - - public: - explicit GetDiscreteGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::DiscreteFeatures& { return record.getDiscrete(); }; - ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetStringGroupAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_ids: list(int)") -.Attr("default_value: string") -.Output("names: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_ids: List of ids representing the features whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. 
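The *GroupAsTensor variants extend the single-feature pattern to a list of feature ids, producing one slot per (record, feature id) pair. The exact layout written by SetValueGroup is not visible in this diff, so the row-major layout below is an assumption; the rest is the same lookup-or-default logic as before.

#include <cstdint>
#include <map>
#include <vector>

using ContinuousFeatures = std::map<int64_t, double>;

// One row per record, one column per requested feature id, flattened
// row-major (assumed layout); missing features fall back to default_value.
std::vector<float> ExtractFeatureGroupAsDense(const std::vector<ContinuousFeatures>& records,
                                              const std::vector<int64_t>& feature_ids,
                                              float default_value) {
  std::vector<float> values(records.size() * feature_ids.size(), default_value);
  for (size_t id = 0; id < records.size(); ++id) {
    for (size_t j = 0; j < feature_ids.size(); ++j) {
      auto it = records[id].find(feature_ids[j]);
      if (it != records[id].end()) {
        values[id * feature_ids.size() + j] = static_cast<float>(it->second);
      }
    }
  }
  return values;
}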
-Outputs - names: A Tensor corresponding to the values of the feature_ids across multiple DataRecords -)doc"); - -class GetStringGroupAsTensor : public OpKernel { - private: - std::vector feature_ids; - string default_value; - - public: - explicit GetStringGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::StringFeatures& { return record.getString(); }; - ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetSparseBinaryAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Output("ids: int64") -.Output("keys: int64") -.Output("names: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns tensors corresponding to the ids, keys and names of a particular -feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - names: DataRecord values(string) -)doc"); -class GetSparseBinaryAsTensor : public OpKernel { - private: - int64 feature_id; - - public: - explicit GetSparseBinaryAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - } - - void Compute(OpKernelContext* context) override { - try { - // We need two passes to the data: - // 1 to compute the output size of the tensor - // 2 to copy the values to the tensor - auto handle = getHandle(context, 0); - const auto &records = handle->records; - - // Creating a vector we increment every time a key is found - std::vector temp_names; - std::vector temp_ids; - - for (int64 id = 0; id < records.size(); id++) { - const auto &sparse_binary = records[id].getSparseBinary(); - auto it = sparse_binary.find(feature_id); - // Find all instances of key in DataRecord - if (it != sparse_binary.end()) { - // insert to temp_names all the values in the dictionary value - temp_names.insert(temp_names.end(), it->second.begin(), it->second.end()); - temp_ids.insert(temp_ids.end(), it->second.size(), id); - } - } - - // The total_size will be the that of the saved vector - const int total_size = static_cast(temp_names.size()); - TensorShape shape = {total_size}; - Tensor* ids = nullptr; - Tensor* keys = nullptr; - Tensor* names = nullptr; - - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names)); - - auto keys_flat = keys->flat(); - auto names_flat = names->flat(); - auto ids_flat = ids->flat(); - - // The feature id value will always be the same - std::fill(keys_flat.data(), keys_flat.data() + total_size, feature_id); - std::copy(temp_names.begin(), temp_names.end(), names_flat.data()); - std::copy(temp_ids.begin(), temp_ids.end(), ids_flat.data()); - } catch (const std::exception &e) { - 
context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetSparseContinuousAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Output("ids: int64") -.Output("keys: int64") -.Output("names: string") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns tensors corresponding to the ids, keys, names and values of a particular -feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - names: DataRecord values(string) - values: DataRecord values(float) -)doc"); -class GetSparseContinuousAsTensor : public OpKernel { - private: - int64 feature_id; - - public: - explicit GetSparseContinuousAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - } - - void Compute(OpKernelContext* context) override { - try { - // We need two passes to the data: - // 1 to compute the output size of the tensor - // 2 to copy the values to the tensor - auto handle = getHandle(context, 0); - const auto &records = handle->records; - - // Creating a vector we increment every time a key is found - std::vector temp_names; - std::vector temp_values; - std::vector temp_ids; - - for (int64 id = 0; id < records.size(); id++) { - const auto &sparse_continuous = records[id].getSparseContinuous(); - auto it = sparse_continuous.find(feature_id); - // Find all instances of key in DataRecord - if (it != sparse_continuous.end()) { - // insert to temp_names all the values in the dictionary value - auto value_map = it->second; - for (auto& elem : value_map) { - temp_names.push_back(elem.first); - temp_values.push_back(elem.second); - temp_ids.push_back(id); - } - } - } - - // The total_size will be the that of the saved vector - const int total_size = static_cast(temp_names.size()); - TensorShape shape = {total_size}; - Tensor* ids = nullptr; - Tensor* keys = nullptr; - Tensor* names = nullptr; - Tensor* values = nullptr; - - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names)); - OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values)); - - auto keys_flat = keys->flat(); - auto names_flat = names->flat(); - auto ids_flat = ids->flat(); - auto values_flat = values->flat(); - - // The feature id value will always be the same - std::fill(keys_flat.data(), keys_flat.data() + total_size, feature_id); - std::copy(temp_names.begin(), temp_names.end(), names_flat.data()); - std::copy(temp_ids.begin(), temp_ids.end(), ids_flat.data()); - std::copy(temp_values.begin(), temp_values.end(), values_flat.data()); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -// Helper function to add ids, keys and values to common vector -inline void addIdsKeysValuesToVectors( - const int64 id, - const int64 key, - const double value, - std::vector& ids, - std::vector& keys, - std::vector& values) { - ids.push_back(id); - keys.push_back(key); - values.push_back(value); -} - -struct KeepFeatures { - KeepFeatures() : vec(), set() {} - template - KeepFeatures(const std::vector 
&keep_features, - const ContainerType *const container) { - vec.reserve(keep_features.size()); -#ifdef USE_DENSE_HASH - set.resize(keep_features.size()); - set.set_empty_key(0); -#else - set.reserve(keep_features.size()); -#endif // USE_DENSE_HASH - set.max_load_factor(0.5); - for (const auto &elem : keep_features) { - if (container->find(elem) == container->end()) continue; - vec.push_back(elem); - set.insert(elem); - } - } - size_t size() const { - return vec.size(); - } - std::vector vec; - twml::Set set; -}; - -// Helper Function to Filter and Hash Feature for Binary Features -void filterAndHashFeature( - const twml::DataRecord::BinaryFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - addIdsKeysValuesToVectors(current_id, *iter, 1, ids, keys, values); - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem) == keep_features.set.end()) continue; - addIdsKeysValuesToVectors(current_id, elem, 1, ids, keys, values); - } - } -} - -// Helper Function to Filter and Hash Feature for Continuous Features -void filterAndHashFeature( - const twml::DataRecord::ContinuousFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - addIdsKeysValuesToVectors(current_id, iter->first, iter->second, ids, keys, values); - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - addIdsKeysValuesToVectors(current_id, elem.first, elem.second, ids, keys, values); - } - } -} - -// Helper Function to Filter and Hash Feature for Discrete Features -void filterAndHashFeature( - const twml::DataRecord::DiscreteFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - int64_t key = twml::mixDiscreteIdAndValue(iter->first, iter->second); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - int64_t key = twml::mixDiscreteIdAndValue(elem.first, elem.second); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } -} - -// Helper Function to Filter and Hash Feature for String Features -void filterAndHashFeature( - const twml::DataRecord::StringFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - int64_t key = twml::mixStringIdAndValue( - iter->first, - iter->second.size(), - reinterpret_cast(iter->second.c_str())); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } 
else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - int64_t key = twml::mixStringIdAndValue( - elem.first, - elem.second.size(), - reinterpret_cast(elem.second.c_str())); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } -} - -// Helper Function to Filter and Hash Feature for Sparse Binary Features -void filterAndHashFeature( - const twml::DataRecord::SparseBinaryFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - for (const auto &name : iter->second) { - int64_t key = twml::mixStringIdAndValue(iter->first, name.size(), - reinterpret_cast(name.c_str())); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - for (const auto &name : elem.second) { - int64_t key = twml::mixStringIdAndValue(elem.first, name.size(), - reinterpret_cast(name.c_str())); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } - } -} - -// Helper Function to Filter and Hash Feature for Sparse Continuous Features -void filterAndHashFeature( - const twml::DataRecord::SparseContinuousFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - for (const auto &map : iter->second) { - int64_t key = twml::mixStringIdAndValue( - iter->first, - map.first.size(), - reinterpret_cast(map.first.c_str())); - addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values); - } - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - for (const auto &map : elem.second) { - int64_t key = twml::mixStringIdAndValue( - elem.first, - map.first.size(), - reinterpret_cast(map.first.c_str())); - addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values); - } - } - } -} - -// Helper Function to Filter and Hash Feature for Sparse Continuous Features -void filterAndHashFeatureCompat( - const twml::DataRecord::SparseContinuousFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - for (const auto &map : iter->second) { - int64_t key = twml::featureId(map.first); - addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values); - } - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - for (const auto &map : elem.second) { - int64_t key = twml::featureId(map.first); - addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values); - } - } - } -} - -void copy_if_exists(std::vector& out, - const std::vector& in, - const twml::Map *const map) { - out.reserve(in.size()); - for (const 
auto &elem : in) { - if (map->find(elem) == map->end()) continue; - out.push_back(elem); - } -} - -void ComputeHashedFeaturesAsTensor(OpKernelContext* context, - const DataRecordResource *const handle, - const KeepFeatures &binary_keep_features, - const KeepFeatures &continuous_keep_features, - const KeepFeatures &discrete_keep_features, - const KeepFeatures &string_keep_features, - const KeepFeatures &sparse_binary_keep_features, - const KeepFeatures &sparse_continuous_keep_features, - bool sparse_continuous_compatibility) { - - const auto &records = handle->records; - uint64_t estimated_size = (binary_keep_features.size() + continuous_keep_features.size() + - discrete_keep_features.size() + string_keep_features.size() + - sparse_binary_keep_features.size() + - sparse_continuous_keep_features.size()); - // Construct temporary vectors for common features - std::vector common_ids, common_keys, temp_ids, temp_keys; - std::vector common_values, temp_values; - common_ids.reserve(estimated_size); - common_keys.reserve(estimated_size); - common_values.reserve(estimated_size); - - const auto &common_binary = handle->common.getBinary(); - const auto &common_continuous = handle->common.getContinuous(); - const auto &common_discrete = handle->common.getDiscrete(); - const auto &common_string = handle->common.getString(); - const auto &common_sparse_binary = handle->common.getSparseBinary(); - const auto &common_sparse_continuous = handle->common.getSparseContinuous(); - - filterAndHashFeature(common_binary, 0, binary_keep_features, - common_ids, common_keys, common_values); - filterAndHashFeature(common_continuous, 0, continuous_keep_features, - common_ids, common_keys, common_values); - filterAndHashFeature(common_discrete, 0, discrete_keep_features, - common_ids, common_keys, common_values); - filterAndHashFeature(common_string, 0, string_keep_features, - common_ids, common_keys, common_values); - filterAndHashFeature(common_sparse_binary, 0, sparse_binary_keep_features, - common_ids, common_keys, common_values); - if (sparse_continuous_compatibility) { - filterAndHashFeatureCompat(common_sparse_continuous, 0, sparse_continuous_keep_features, - common_ids, common_keys, common_values); - } else { - filterAndHashFeature(common_sparse_continuous, 0, sparse_continuous_keep_features, - common_ids, common_keys, common_values); - } - common_ids.clear(); - // Construct temporary vectors for all features - estimated_size = (estimated_size + common_keys.size()) * records.size(); - temp_ids.reserve(estimated_size); - temp_keys.reserve(estimated_size); - temp_values.reserve(estimated_size); - - for (int64 id = 0; id < records.size(); id++) { - temp_ids.insert(temp_ids.end(), common_keys.size(), id); - temp_keys.insert(temp_keys.end(), common_keys.begin(), common_keys.end()); - temp_values.insert(temp_values.end(), common_values.begin(), common_values.end()); - const auto &binary = records[id].getBinary(); - const auto &continuous = records[id].getContinuous(); - const auto &discrete = records[id].getDiscrete(); - const auto &str = records[id].getString(); - const auto &sparse_binary = records[id].getSparseBinary(); - const auto &sparse_continuous = records[id].getSparseContinuous(); - - filterAndHashFeature(binary, id, binary_keep_features, - temp_ids, temp_keys, temp_values); - filterAndHashFeature(continuous, id, continuous_keep_features, - temp_ids, temp_keys, temp_values); - filterAndHashFeature(discrete, id, discrete_keep_features, - temp_ids, temp_keys, temp_values); - filterAndHashFeature(str, id, 
string_keep_features, - temp_ids, temp_keys, temp_values); - filterAndHashFeature(sparse_binary, id, sparse_binary_keep_features, - temp_ids, temp_keys, temp_values); - if (sparse_continuous_compatibility) { - filterAndHashFeatureCompat(sparse_continuous, id, sparse_continuous_keep_features, - temp_ids, temp_keys, temp_values); - } else { - filterAndHashFeature(sparse_continuous, id, sparse_continuous_keep_features, - temp_ids, temp_keys, temp_values); - } - } - - // Copy the temporary vectors into the output Tensors - TensorShape shape = {static_cast(temp_ids.size())}; - Tensor* ids = nullptr; - Tensor* keys = nullptr; - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values)); - auto ids_flat = ids->flat(); - auto keys_flat = keys->flat(); - auto values_flat = values->flat(); - std::copy(temp_ids.begin(), temp_ids.end(), ids_flat.data()); - std::copy(temp_keys.begin(), temp_keys.end(), keys_flat.data()); - std::copy(temp_values.begin(), temp_values.end(), values_flat.data()); -} - -REGISTER_OP("GetHashedFeaturesAsSparseTensor") -.Input("data_record_handle: resource") -.Attr("binary_keep_features: list(int)") -.Attr("continuous_keep_features: list(int)") -.Attr("discrete_keep_features: list(int)") -.Attr("string_keep_features: list(int)") -.Attr("sparse_binary_keep_features: list(int)") -.Attr("sparse_continuous_keep_features: list(int)") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); -}).Doc(R"doc( -A tensorflow OP for returning required features of different type as -a single sparse tensor. Hashing trick is applied. - -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records in the batch (int64) - keys: DataRecord keys (int64) - values: DataRecord values (float) -)doc"); - -class GetHashedFeaturesAsSparseTensor: public OpKernel { - public: - explicit GetHashedFeaturesAsSparseTensor(OpKernelConstruction* context): OpKernel(context) { - // Get the list of features to keep for each feature type - OP_REQUIRES_OK(context, context->GetAttr("binary_keep_features", &binary_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("continuous_keep_features", &continuous_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("discrete_keep_features", &discrete_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("string_keep_features", &string_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("sparse_binary_keep_features", &sparse_binary_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("sparse_continuous_keep_features", &sparse_continuous_keep_features_)); - } - - private: - std::vector binary_keep_features_, continuous_keep_features_, discrete_keep_features_; - std::vector string_keep_features_, sparse_binary_keep_features_, sparse_continuous_keep_features_; - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - // Create a new list of keep features based on the original keep_set. - // This is to ensure compatibility with existing behavior such as: - // - Ensure no new features are decoded in this op. - // - Ensure labels or weights dont get included here. - // TODO: Should we return features requested by user here even if they are labels / weights? 
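Two details of the hashed-feature path are worth calling out: common features are filtered once with record id 0 and then re-emitted for every record in the batch, and each filterAndHashFeature overload iterates over whichever of (keep list, feature map) is smaller before appending one (id, key, value) triplet per kept feature. A condensed sketch of that filtering step for continuous features, with std::unordered_set standing in for twml::Set:

#include <cstdint>
#include <map>
#include <unordered_set>
#include <vector>

using ContinuousFeatures = std::map<int64_t, double>;

// Append (record id, feature key, value) triplets for every kept feature,
// iterating over the smaller side, as the filterAndHashFeature helpers do.
void FilterContinuous(const ContinuousFeatures& features,
                      int64_t current_id,
                      const std::vector<int64_t>& keep_vec,
                      const std::unordered_set<int64_t>& keep_set,
                      std::vector<int64_t>& ids,
                      std::vector<int64_t>& keys,
                      std::vector<double>& values) {
  if (keep_vec.size() < 2 * features.size()) {
    for (int64_t f : keep_vec) {
      auto it = features.find(f);
      if (it == features.end()) continue;
      ids.push_back(current_id);
      keys.push_back(it->first);
      values.push_back(it->second);
    }
  } else {
    for (const auto& elem : features) {
      if (keep_set.count(elem.first) == 0) continue;
      ids.push_back(current_id);
      keys.push_back(elem.first);
      values.push_back(elem.second);
    }
  }
}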
- KeepFeatures binary_keep_features(binary_keep_features_, handle->keep_map); - KeepFeatures continuous_keep_features(continuous_keep_features_, handle->keep_map); - KeepFeatures discrete_keep_features(discrete_keep_features_, handle->keep_map); - KeepFeatures string_keep_features(string_keep_features_, handle->keep_map); - KeepFeatures sparse_binary_keep_features(sparse_binary_keep_features_, handle->keep_map); - KeepFeatures sparse_continuous_keep_features(sparse_continuous_keep_features_, handle->keep_map); - ComputeHashedFeaturesAsTensor(context, handle.get(), - binary_keep_features, - continuous_keep_features, - discrete_keep_features, - string_keep_features, - sparse_binary_keep_features, - sparse_continuous_keep_features, - false); - } catch(const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetHashedFeaturesAsSparseTensorV2") -.Input("data_record_handle: resource") -.Attr("binary_keep_features: list(int)") -.Attr("continuous_keep_features: list(int)") -.Attr("discrete_keep_features: list(int)") -.Attr("string_keep_features: list(int)") -.Attr("sparse_binary_keep_features: list(int)") -.Attr("sparse_continuous_keep_features: list(int)") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("decode_mode: int = 0") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); -}).Doc(R"doc( -A tensorflow OP for returning required features of different type as -a single sparse tensor. Hashing trick is applied. - -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records in the batch (int64) - keys: DataRecord keys (int64) - values: DataRecord values (float) -)doc"); - -class GetHashedFeaturesAsSparseTensorV2: public OpKernel { - public: - explicit GetHashedFeaturesAsSparseTensorV2(OpKernelConstruction* context): OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - std::vector binary_keep_features_, continuous_keep_features_, discrete_keep_features_; - std::vector string_keep_features_, sparse_binary_keep_features_, sparse_continuous_keep_features_; - - // Get the list of features to keep for each feature type - OP_REQUIRES_OK(context, context->GetAttr("binary_keep_features", &binary_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("continuous_keep_features", &continuous_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("discrete_keep_features", &discrete_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("string_keep_features", &string_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("sparse_binary_keep_features", &sparse_binary_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("sparse_continuous_keep_features", &sparse_continuous_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode)); - - twml::Map keep_map; -#ifdef USE_DENSE_HASH - keep_map.set_empty_key(0); -#endif // USE_DENSE_HASH - for (uint64_t i = 0; i < keep_features.size(); i++) { - keep_map[keep_features[i]] = keep_codes[i]; - } - - - binary_keep_features = KeepFeatures(binary_keep_features_, &keep_map); - continuous_keep_features = KeepFeatures(continuous_keep_features_, &keep_map); - discrete_keep_features = 
KeepFeatures(discrete_keep_features_, &keep_map); - string_keep_features = KeepFeatures(string_keep_features_, &keep_map); - sparse_binary_keep_features = KeepFeatures(sparse_binary_keep_features_, &keep_map); - sparse_continuous_keep_features = KeepFeatures(sparse_continuous_keep_features_, &keep_map); - - } - - private: - KeepFeatures binary_keep_features, continuous_keep_features, discrete_keep_features; - KeepFeatures string_keep_features, sparse_binary_keep_features, sparse_continuous_keep_features; - int64 m_decode_mode; - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - // Create a new list of keep features based on the original keep_set. - // This is to ensure compatibility with existing behavior such as: - // - Ensure no new features are decoded in this op. - // - Ensure labels or weights dont get included here. - // TODO: Should we return features requested by user here even if they are labels / weights? - ComputeHashedFeaturesAsTensor(context, handle.get(), - binary_keep_features, - continuous_keep_features, - discrete_keep_features, - string_keep_features, - sparse_binary_keep_features, - sparse_continuous_keep_features, - m_decode_mode == 0); - } catch(const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - - -#define REGISTER_DECODE_DATA_RECORD(InputType) \ - REGISTER_KERNEL_BUILDER( \ - Name("DecodeDataRecord") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("InputType"), \ - DecodeDataRecord); \ - -REGISTER_DECODE_DATA_RECORD(uint8) -REGISTER_DECODE_DATA_RECORD(string) - -#define REGISTER_GETTER(FIELD) \ - REGISTER_KERNEL_BUILDER( \ - Name("Get" #FIELD "Features") \ - .Device(DEVICE_CPU), \ - Get##FIELD##Features); \ - -#define REGISTER_GETTER_FROM_DR(FIELD) \ - REGISTER_KERNEL_BUILDER( \ - Name("Get" #FIELD "FromDataRecord") \ - .Device(DEVICE_CPU), \ - Get##FIELD##FromDataRecord); \ - -#define REGISTER_GETTER_AS_TENSOR(FIELD) \ - REGISTER_KERNEL_BUILDER( \ - Name("Get" #FIELD "AsTensor") \ - .Device(DEVICE_CPU), \ - Get##FIELD##AsTensor); \ - - -#define REGISTER_GETTER_GROUP_AS_TENSOR(FIELD) \ - REGISTER_KERNEL_BUILDER( \ - Name("Get" #FIELD "GroupAsTensor") \ - .Device(DEVICE_CPU), \ - Get##FIELD##GroupAsTensor); \ - -REGISTER_GETTER(Binary) -REGISTER_GETTER(Continuous) -REGISTER_GETTER(Discrete) -REGISTER_GETTER(String) -REGISTER_GETTER(SparseBinary) -REGISTER_GETTER(SparseContinuous) -REGISTER_GETTER_FROM_DR(BatchSize) -REGISTER_GETTER_FROM_DR(Labels) -REGISTER_GETTER_FROM_DR(Weights) -REGISTER_GETTER_AS_TENSOR(Binary) -REGISTER_GETTER_AS_TENSOR(Continuous) -REGISTER_GETTER_AS_TENSOR(Discrete) -REGISTER_GETTER_AS_TENSOR(String) -REGISTER_GETTER_AS_TENSOR(SparseBinary) -REGISTER_GETTER_AS_TENSOR(SparseContinuous) -REGISTER_GETTER_GROUP_AS_TENSOR(Binary) -REGISTER_GETTER_GROUP_AS_TENSOR(Continuous) -REGISTER_GETTER_GROUP_AS_TENSOR(Discrete) -REGISTER_GETTER_GROUP_AS_TENSOR(String) -REGISTER_KERNEL_BUILDER( - Name("GetHashedFeaturesAsSparseTensor") - .Device(DEVICE_CPU), - GetHashedFeaturesAsSparseTensor); -REGISTER_KERNEL_BUILDER( - Name("GetHashedFeaturesAsSparseTensorV2") - .Device(DEVICE_CPU), - GetHashedFeaturesAsSparseTensorV2); diff --git a/twml/libtwml/src/ops/data_record.docx b/twml/libtwml/src/ops/data_record.docx new file mode 100644 index 000000000..30a84df24 Binary files /dev/null and b/twml/libtwml/src/ops/data_record.docx differ diff --git a/twml/libtwml/src/ops/data_record_tensor_writer.cpp b/twml/libtwml/src/ops/data_record_tensor_writer.cpp deleted file 
mode 100644 index 9368c870e..000000000 --- a/twml/libtwml/src/ops/data_record_tensor_writer.cpp +++ /dev/null @@ -1,81 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("DataRecordTensorWriter") -.Attr("T: list({string, int32, int64, float, double, bool})") -.Input("keys: int64") -.Input("values: T") -.Output("result: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that packages keys and dense tensors into a DataRecord. - -values: list of tensors -keys: feature ids from the original DataRecord (int64) - -Outputs - bytes: output DataRecord serialized using Thrift into a uint8 tensor. -)doc"); - -class DataRecordTensorWriter : public OpKernel { - public: - explicit DataRecordTensorWriter(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - const Tensor& keys = context->input(0); - - try { - // set keys as twml::Tensor - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - - // check sizes - uint64_t num_keys = in_keys_.getNumElements(); - uint64_t num_values = context->num_inputs() - 1; - - OP_REQUIRES(context, num_keys == num_values, - errors::InvalidArgument("Number of dense keys and dense tensors do not match")); - - // populate DataRecord object - const int64_t *keys = in_keys_.getData(); - twml::DataRecord record = twml::DataRecord(); - - for (int i = 1; i < context->num_inputs(); i++) { - const twml::RawTensor& value = TFTensor_to_twml_raw_tensor(context->input(i)); - record.addRawTensor(keys[i-1], value); - } - - // determine the length of the encoded result (no memory is copied) - twml::ThriftWriter thrift_dry_writer = twml::ThriftWriter(nullptr, 0, true); - twml::DataRecordWriter record_dry_writer = twml::DataRecordWriter(thrift_dry_writer); - record_dry_writer.write(record); - int len = thrift_dry_writer.getBytesWritten(); - TensorShape result_shape = {1, len}; - - // allocate output tensor - Tensor* result = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, result_shape, &result)); - twml::Tensor out_result = TFTensor_to_twml_tensor(*result); - - // write to output tensor - uint8_t *buffer = out_result.getData(); - twml::ThriftWriter thrift_writer = twml::ThriftWriter(buffer, len, false); - twml::DataRecordWriter record_writer = twml::DataRecordWriter(thrift_writer); - record_writer.write(record); - } catch(const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("DataRecordTensorWriter").Device(DEVICE_CPU), - DataRecordTensorWriter); diff --git a/twml/libtwml/src/ops/data_record_tensor_writer.docx b/twml/libtwml/src/ops/data_record_tensor_writer.docx new file mode 100644 index 000000000..7034b3264 Binary files /dev/null and b/twml/libtwml/src/ops/data_record_tensor_writer.docx differ diff --git a/twml/libtwml/src/ops/discretizer.cpp b/twml/libtwml/src/ops/discretizer.cpp deleted file mode 100644 index 10d1b3c78..000000000 --- a/twml/libtwml/src/ops/discretizer.cpp +++ /dev/null @@ -1,293 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - - -void 
ComputeDiscretizers(OpKernelContext* context, const bool return_bin_indices = false) { - const Tensor& keys = context->input(0); - const Tensor& vals = context->input(1); - const Tensor& bin_ids = context->input(2); - const Tensor& bin_vals = context->input(3); - const Tensor& feature_offsets = context->input(4); - - Tensor* new_keys = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, keys.shape(), - &new_keys)); - Tensor* new_vals = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, keys.shape(), - &new_vals)); - - try { - twml::Tensor out_keys_ = TFTensor_to_twml_tensor(*new_keys); - twml::Tensor out_vals_ = TFTensor_to_twml_tensor(*new_vals); - - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - const twml::Tensor in_vals_ = TFTensor_to_twml_tensor(vals); - const twml::Tensor bin_ids_ = TFTensor_to_twml_tensor(bin_ids); - const twml::Tensor bin_vals_ = TFTensor_to_twml_tensor(bin_vals); - const twml::Tensor feature_offsets_ = TFTensor_to_twml_tensor(feature_offsets); - twml::mdlInfer(out_keys_, out_vals_, - in_keys_, in_vals_, - bin_ids_, bin_vals_, - feature_offsets_, - return_bin_indices); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } -} - -REGISTER_OP("MDL") -.Attr("T: {float, double}") -.Input("keys: int64") -.Input("vals: T") -.Input("bin_ids: int64") -.Input("bin_vals: T") -.Input("feature_offsets: int64") -.Output("new_keys: int64") -.Output("new_vals: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // TODO: check sizes - c->set_output(0, c->input(0)); - c->set_output(1, c->input(0)); - return Status::OK(); -}).Doc(R"doc( - -This operation discretizes a tensor containing continuous features. - -Input - keys: A tensor containing feature ids. - vals: A tensor containing values at corresponding feature ids. - bin_ids: A tensor containing the discretized feature id for a given bin. - bin_vals: A tensor containing the bin boundaries for value at a given feature id. - feature_offsets: Specifies the starting location of bins for a given feature id. - -Expected Sizes: - keys, vals: [N]. - bin_ids, bin_vals: [sum_{n=1}^{n=num_classes} num_bins(n)] - - where - - N is the number of sparse features in the current batch. - - [0, num_classes) represents the range each feature id can take. - - num_bins(n) is the number of bins for a given feature id. - - If num_bins is fixed, then xs, ys are of size [num_classes * num_bins]. - -Expected Types: - keys, bin_ids: int64. - vals: float or double. - bin_vals: same as vals. - -Before using MDL, you should use a hashmap to get the intersection of -input `keys` with the features that MDL knows about: -:: - keys, vals # keys can be in range [0, 1 << 63) - mdl_keys = hashmap.find(keys) # mdl_keys are now in range [0, num_classes_from_calibration) - mdl_keys = where (mdl_keys != -1) # Ignore keys not found - - -Inside MDL, the following is happening: -:: - start = offsets[key[i]] - end = offsets[key[i] + 1] - idx = binary_search for val[i] in [bin_vals[start], bin_vals[end]] - - result_keys[i] = bin_ids[idx] - val[i] = 1 # binary feature value - -Outputs - new_keys: The discretized feature ids with same shape and size as keys. - new_vals: The discretized values with the same shape and size as vals. 
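The doc block above outlines the lookup: feature_offsets gives each calibrated feature id its slice of bin_vals, a binary search places the value inside that slice, and the matching entry of bin_ids becomes the new (binary) feature key. A minimal sketch of that lookup for one (key, value) pair follows; the exact boundary handling inside twml::mdlInfer is not shown in this diff, so the lower_bound/clamping choice here is an assumption, and the key is assumed to already be remapped into [0, num_classes).

#include <algorithm>
#include <cstdint>
#include <vector>

// Discretize one (key, value) pair as described above: binary-search the
// value inside this feature's slice of bin boundaries and return the bin id.
// Assumes the feature has at least one bin.
int64_t DiscretizeOne(int64_t key, double val,
                      const std::vector<int64_t>& bin_ids,
                      const std::vector<double>& bin_vals,
                      const std::vector<int64_t>& feature_offsets) {
  const int64_t start = feature_offsets[key];
  const int64_t end = feature_offsets[key + 1];
  // Index of the first boundary >= val, clamped to the feature's last bin.
  auto it = std::lower_bound(bin_vals.begin() + start, bin_vals.begin() + end, val);
  int64_t idx = std::distance(bin_vals.begin(), it);
  if (idx == end) idx = end - 1;
  return bin_ids[idx];  // the emitted value itself is 1 (binary feature)
}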
- -)doc"); - - -template -class MDL : public OpKernel { - public: - explicit MDL(OpKernelConstruction* context) : OpKernel(context) { - } - - void Compute(OpKernelContext* context) override { - ComputeDiscretizers(context); - } -}; - -REGISTER_OP("PercentileDiscretizer") -.Attr("T: {float, double}") -.Input("keys: int64") -.Input("vals: T") -.Input("bin_ids: int64") -.Input("bin_vals: T") -.Input("feature_offsets: int64") -.Output("new_keys: int64") -.Output("new_vals: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // TODO: check sizes - c->set_output(0, c->input(0)); - c->set_output(1, c->input(0)); - return Status::OK(); -}).Doc(R"doc( - -This operation discretizes a tensor containing continuous features. - -Input - keys: A tensor containing feature ids. - vals: A tensor containing values at corresponding feature ids. - bin_ids: A tensor containing the discretized feature id for a given bin. - bin_vals: A tensor containing the bin boundaries for value at a given feature id. - feature_offsets: Specifies the starting location of bins for a given feature id. - -Expected Sizes: - keys, vals: [N]. - bin_ids, bin_vals: [sum_{n=1}^{n=num_classes} num_bins(n)] - - where - - N is the number of sparse features in the current batch. - - [0, num_classes) represents the range each feature id can take. - - num_bins(n) is the number of bins for a given feature id. - - If num_bins is fixed, then xs, ys are of size [num_classes * num_bins]. - -Expected Types: - keys, bin_ids: int64. - vals: float or double. - bin_vals: same as vals. - -Before using PercentileDiscretizer, you should use a hashmap to get the intersection of -input `keys` with the features that PercentileDiscretizer knows about: -:: - keys, vals # keys can be in range [0, 1 << 63) - percentile_discretizer_keys = hashmap.find(keys) # percentile_discretizer_keys are now in range [0, num_classes_from_calibration) - percentile_discretizer_keys = where (percentile_discretizer_keys != -1) # Ignore keys not found - - -Inside PercentileDiscretizer, the following is happening: -:: - start = offsets[key[i]] - end = offsets[key[i] + 1] - idx = binary_search for val[i] in [bin_vals[start], bin_vals[end]] - - result_keys[i] = bin_ids[idx] - val[i] = 1 # binary feature value - -Outputs - new_keys: The discretized feature ids with same shape and size as keys. - new_vals: The discretized values with the same shape and size as vals. - -)doc"); - -template -class PercentileDiscretizer : public OpKernel { - public: - explicit PercentileDiscretizer(OpKernelConstruction* context) : OpKernel(context) { - } - - void Compute(OpKernelContext* context) override { - ComputeDiscretizers(context); - } -}; - - -REGISTER_OP("PercentileDiscretizerBinIndices") -.Attr("T: {float, double}") -.Input("keys: int64") -.Input("vals: T") -.Input("bin_ids: int64") -.Input("bin_vals: T") -.Input("feature_offsets: int64") -.Output("new_keys: int64") -.Output("new_vals: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // TODO: check sizes - c->set_output(0, c->input(0)); - c->set_output(1, c->input(0)); - return Status::OK(); -}).Doc(R"doc( - -This operation discretizes a tensor containing continuous features. -If the feature id and bin id of the discretized value is the same on multiple runs, they -will always be assigned to the same output key and value, regardless of the bin_id assigned during -calibration. - -Input - keys: A tensor containing feature ids. - vals: A tensor containing values at corresponding feature ids. 
- bin_ids: A tensor containing the discretized feature id for a given bin. - bin_vals: A tensor containing the bin boundaries for value at a given feature id. - feature_offsets: Specifies the starting location of bins for a given feature id. - -Expected Sizes: - keys, vals: [N]. - bin_ids, bin_vals: [sum_{n=1}^{n=num_classes} num_bins(n)] - - where - - N is the number of sparse features in the current batch. - - [0, num_classes) represents the range each feature id can take. - - num_bins(n) is the number of bins for a given feature id. - - If num_bins is fixed, then xs, ys are of size [num_classes * num_bins]. - -Expected Types: - keys, bin_ids: int64. - vals: float or double. - bin_vals: same as vals. - -Before using PercentileDiscretizerBinIndices, you should use a hashmap to get the intersection of -input `keys` with the features that PercentileDiscretizerBinIndices knows about: -:: - keys, vals # keys can be in range [0, 1 << 63) - percentile_discretizer_keys = hashmap.find(keys) # percentile_discretizer_keys are now in range [0, num_classes_from_calibration) - percentile_discretizer_keys = where (percentile_discretizer_keys != -1) # Ignore keys not found - - -Inside PercentileDiscretizerBinIndices, the following is happening: -:: - start = offsets[key[i]] - end = offsets[key[i] + 1] - idx = binary_search for val[i] in [bin_vals[start], bin_vals[end]] - - result_keys[i] = bin_ids[idx] - val[i] = 1 # binary feature value - -Outputs - new_keys: The discretized feature ids with same shape and size as keys. - new_vals: The discretized values with the same shape and size as vals. - -)doc"); - -template -class PercentileDiscretizerBinIndices : public OpKernel { - public: - explicit PercentileDiscretizerBinIndices(OpKernelConstruction* context) : OpKernel(context) { - } - - void Compute(OpKernelContext* context) override { - ComputeDiscretizers(context, true); - } -}; - - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("PercentileDiscretizerBinIndices") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - PercentileDiscretizerBinIndices); \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("PercentileDiscretizer") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - PercentileDiscretizer); \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("MDL") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - MDL); \ - -REGISTER(float); -REGISTER(double); diff --git a/twml/libtwml/src/ops/discretizer.docx b/twml/libtwml/src/ops/discretizer.docx new file mode 100644 index 000000000..acadc7e27 Binary files /dev/null and b/twml/libtwml/src/ops/discretizer.docx differ diff --git a/twml/libtwml/src/ops/feature_extractor.cpp b/twml/libtwml/src/ops/feature_extractor.cpp deleted file mode 100644 index 9e0910bae..000000000 --- a/twml/libtwml/src/ops/feature_extractor.cpp +++ /dev/null @@ -1,134 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include -#include - -REGISTER_OP("FeatureExtractor") -.Attr("T: {float, double} = DT_FLOAT") -.Input("mask_in: bool") -.Input("ids_in: int64") -.Input("keys_in: int64") -.Input("values_in: T") -.Input("codes_in: int64") -.Input("types_in: int8") -.Output("ids_out: int64") -.Output("keys_out: int64") -.Output("values_out: T") -.Output("codes_out: int64") -.Output("types_out: int8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that extracts 
the desired indices of a Tensor based on a mask - -Input - mask_in: boolean Tensor that determines which are the indices to be kept (bool) - ids_in: input indices Tensor (int64) - keys_in: input keys Tensor (int64) - values_in: input values Tensor (float/double) - codes_in: input codes Tensor (int64) - types_in: input types Tensor(int8) - -Outputs - ids_out: output indices Tensor (int64) - keys_out: output keys Tensor (int64) - values_out: output values Tensor (float/double) - codes_out: output codes Tensor (int64) - types_out: output types Tensor(int8) - -)doc"); -template -class FeatureExtractor : public OpKernel { - public: - explicit FeatureExtractor(OpKernelConstruction* context) - : OpKernel(context) {} - - template - bool allequal(const A &t, const U &u) { - return t == u; - } - - template - bool allequal(const A &t, const U &u, Others const &... args) { - return (t == u) && allequal(u, args...); - } - - void Compute(OpKernelContext* context) override { - // Get input tensors - const Tensor& input_mask = context->input(0); - const Tensor& input_ids = context->input(1); - const Tensor& input_keys = context->input(2); - const Tensor& input_values = context->input(3); - const Tensor& input_codes = context->input(4); - const Tensor& input_types = context->input(5); - - auto mask = input_mask.flat(); - auto ids = input_ids.flat(); - auto keys = input_keys.flat(); - auto codes = input_codes.flat(); - auto values = input_values.flat(); - auto types = input_types.flat(); - - // Verify that all Tensors have the same size. - OP_REQUIRES(context, allequal(mask.size(), ids.size(), keys.size(), codes.size(), values.size(), types.size()), - errors::InvalidArgument("all input vectors must be the same size.")); - - // Get the size of the output vectors by counting the numbers of trues. 
- int total_size = 0; - for (int i = 0; i < mask.size(); i++) { - if (mask(i)) - total_size += 1; - } - - // Shape is the number of Trues in the mask Eigen::Tensor - TensorShape shape_out = {total_size}; - - // Create the output tensors - Tensor* output_codes = nullptr; - Tensor* output_ids = nullptr; - Tensor* output_values = nullptr; - Tensor* output_types = nullptr; - Tensor* output_keys = nullptr; - - OP_REQUIRES_OK(context, context->allocate_output(0, shape_out, &output_ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape_out, &output_keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape_out, &output_values)); - OP_REQUIRES_OK(context, context->allocate_output(3, shape_out, &output_codes)); - OP_REQUIRES_OK(context, context->allocate_output(4, shape_out, &output_types)); - - auto output_ids_ = output_ids->flat(); - auto output_keys_ = output_keys->flat(); - auto output_codes_ = output_codes->flat(); - auto output_values_ = output_values->flat(); - auto output_types_ = output_types->flat(); - - // Iterate through the mask and set values to output Eigen::Tensors - int j = 0; - for (int i = 0; i < mask.size(); i++) { - if (mask(i)) { - output_ids_(j) = ids(i); - output_keys_(j) = keys(i); - output_values_(j) = values(i); - output_codes_(j) = codes(i); - output_types_(j) = types(i); - ++j; - } - } - } -}; - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("FeatureExtractor") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - FeatureExtractor); \ - -REGISTER(float); -REGISTER(double); diff --git a/twml/libtwml/src/ops/feature_extractor.docx b/twml/libtwml/src/ops/feature_extractor.docx new file mode 100644 index 000000000..aa1101ab1 Binary files /dev/null and b/twml/libtwml/src/ops/feature_extractor.docx differ diff --git a/twml/libtwml/src/ops/feature_id.cpp b/twml/libtwml/src/ops/feature_id.cpp deleted file mode 100644 index 150b5614c..000000000 --- a/twml/libtwml/src/ops/feature_id.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("FeatureId") -.Attr("feature_names: list(string)") -.Output("output: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that hashes a list of strings into int64. This is used for feature name hashing. - -Attr - feature_names: a list of string feature names (list(string)). - -Outputs - ouput: hashes corresponding to the string feature names (int64). 
-)doc"); - - -class FeatureId : public OpKernel { - private: - std::vector input_vector; - - public: - explicit FeatureId(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_names", &input_vector)); - } - - void Compute(OpKernelContext* context) override { - // Get size of the input_vector and create TensorShape shape - const int total_size = static_cast(input_vector.size()); - TensorShape shape = {total_size}; - - // Create an output tensor - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, - &output_tensor)); - auto output_flat = output_tensor->flat(); - - // Transform the input tensor into a int64 - for (int i = 0; i < total_size; i++) { - output_flat(i) = twml::featureId(input_vector[i]); - } - } -}; - - -REGISTER_KERNEL_BUILDER( - Name("FeatureId") - .Device(DEVICE_CPU), - FeatureId); diff --git a/twml/libtwml/src/ops/feature_id.docx b/twml/libtwml/src/ops/feature_id.docx new file mode 100644 index 000000000..a7ea69c25 Binary files /dev/null and b/twml/libtwml/src/ops/feature_id.docx differ diff --git a/twml/libtwml/src/ops/feature_mask.cpp b/twml/libtwml/src/ops/feature_mask.cpp deleted file mode 100644 index fc1498413..000000000 --- a/twml/libtwml/src/ops/feature_mask.cpp +++ /dev/null @@ -1,83 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include -#include -#include - -REGISTER_OP("FeatureMask") -.Attr("T: {int64, int8}") -.Input("keep: T") -.Attr("list_keep: list(int)") -.Output("mask: bool") - -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that creates a mask of the indices that should be kept. - -Attribute -list_keep: list of values which should be kept(list(int)) - -Input - keep: Tensor for which we will apply the mask (int64, int8) - -Outputs - mask: boolean Tensor. (bool) - -)doc"); -template -class FeatureMask : public OpKernel { - private: - std::set feature_set_keep; - - public: - explicit FeatureMask(OpKernelConstruction* context) - : OpKernel(context) { - std::vector feature_list_keep; - OP_REQUIRES_OK(context, context->GetAttr("list_keep", &feature_list_keep)); - // create set that contains the content of the feature_list_keep, since tensorflow does not allow - // me to directly ouput the contents of list_keep to a set - feature_set_keep = std::set(feature_list_keep.begin(), feature_list_keep.end()); - } - - void Compute(OpKernelContext* context) override { - // Get size of the input_vector and create TensorShape shape - const Tensor& input = context->input(0); - - auto keep = input.flat(); - - // Create an output tensor - Tensor* output_mask = nullptr; - - // Output shape is determined and now we can copy the contents of the vector to the output Tensor. 
- const int total_size_out = static_cast(keep.size()); - - TensorShape shape_out = {total_size_out}; - - OP_REQUIRES_OK(context, context->allocate_output(0, shape_out, &output_mask)); - - auto output_mask_ = output_mask->flat(); - - // Check if value is in set, output is boolean - for (int j = 0; j < keep.size(); j++){ - output_mask_(j) = (feature_set_keep.count(keep(j))); - } - } -}; - - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("FeatureMask") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - FeatureMask); \ - -REGISTER(int64); -REGISTER(int8); diff --git a/twml/libtwml/src/ops/feature_mask.docx b/twml/libtwml/src/ops/feature_mask.docx new file mode 100644 index 000000000..94e0c0724 Binary files /dev/null and b/twml/libtwml/src/ops/feature_mask.docx differ diff --git a/twml/libtwml/src/ops/fixed_length_tensor.cpp b/twml/libtwml/src/ops/fixed_length_tensor.cpp deleted file mode 100644 index 876367ad3..000000000 --- a/twml/libtwml/src/ops/fixed_length_tensor.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -#include -using std::string; - -template -void ComputeFixedLengthTensor(OpKernelContext *context, int64 max_length_) { - try { - const Tensor& segment_ids = context->input(0); - const Tensor& values = context->input(1); - const Tensor& pad_value = context->input(2); - - auto indices_flat = segment_ids.flat(); - auto values_flat = values.flat(); - - auto pad_value_scalar = pad_value.scalar()(); - - // Get maximum length from batch if user hasn't specified it. - int64 max_length = max_length_; - if (max_length < 0 && indices_flat.size() > 0) { - int64 current_id = indices_flat(0); - int64 current_length = 1; - - for (int64 i = 1; i < indices_flat.size(); i++) { - if (current_id == indices_flat(i)) { - current_length++; - } else { - current_id = indices_flat(i); - max_length = std::max(max_length, current_length); - current_length = 1; - } - } - // This is needed if the last batch is the longest sequence. - max_length = std::max(max_length, current_length); - } - - int64 batch_size = 0; - if (calc_batch_size) { - if (indices_flat.size() > 0) { - // The last value of segment_ids will have value batch_size 1; - batch_size = 1 + indices_flat(indices_flat.size() - 1); - } else { - batch_size = 0; - } - } else { - const Tensor& batch_size_tensor = context->input(3); - batch_size = batch_size_tensor.flat()(0); - } - - TensorShape output_shape = {batch_size, max_length}; - Tensor* fixed_length = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &fixed_length)); - - auto fixed_length_flat = fixed_length->flat(); - - int64 n = 0; - int64 offset = 0; - for (int64 i = 0; i < batch_size; i++) { - for (int64 j = 0; j < max_length; j++) { - if (n < indices_flat.size() && indices_flat(n) == i) { - // Copy from variable length tensor. - fixed_length_flat(offset + j) = values_flat(n); - n++; - } else { - // Pad to fixed length. - fixed_length_flat(offset + j) = pad_value_scalar; - } - } - // Corner case: truncate to max_length if user specified max_length < current length. 
- while (n < indices_flat.size() && i == indices_flat(n)) n++; - - // Update output pointer - offset += max_length; - } - } catch (const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } -} - -REGISTER_OP("FixedLengthTensor") -.Attr("IndexType: {int64, int32}") -.Attr("ValueType: {int64, int32, string}") -.Attr("max_length: int") -.Input("segment_ids: IndexType") -.Input("values: ValueType") -.Input("pad_value: ValueType") -.Output("fixed_length: ValueType") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP to convert variable length segments into fixed length tensor. - -Attr - max_length: The size of the inner most (i.e. last) dimension. - -Input - segment_ids: 1D input tensor containing the sorted segment_ids. - values: 1D input tensor containing the values. - pad_value: The value used for padding the fixed length tensor. - -Outputs - fixed_length: A fixed length tensor of size [batch_size, max_length]. -)doc"); - -template -class FixedLengthTensor: public OpKernel { - public: - explicit FixedLengthTensor(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("max_length", &max_length_)); - } - - private: - int64 max_length_; - - void Compute(OpKernelContext *context) override { - ComputeFixedLengthTensor(context, max_length_); - } -}; - -REGISTER_OP("FixedLengthTensorV2") -.Attr("IndexType: {int64, int32}") -.Attr("ValueType: {int64, int32, string}") -.Attr("max_length: int") -.Input("segment_ids: IndexType") -.Input("values: ValueType") -.Input("pad_value: ValueType") -.Input("batch_size: int64") -.Output("fixed_length: ValueType") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP to convert variable length segments into fixed length tensor. - -Attr - max_length: The size of the inner most (i.e. last) dimension. - -Input - segment_ids: 1D input tensor containing the sorted segment_ids. - values: 1D input tensor containing the values. - pad_value: The value used for padding the fixed length tensor. - batch_size: The batch size to use. - -Outputs - fixed_length: A fixed length tensor of size [batch_size, max_length]. 
-)doc"); - -template -class FixedLengthTensorV2: public OpKernel { - public: - explicit FixedLengthTensorV2(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("max_length", &max_length_)); - } - - private: - int64 max_length_; - - void Compute(OpKernelContext *context) override { - ComputeFixedLengthTensor(context, max_length_); - } -}; - -#define REGISTER_SPARSE_TO_FIXED_LENGTH(IndexType, ValueType) \ - REGISTER_KERNEL_BUILDER( \ - Name("FixedLengthTensor") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("IndexType") \ - .TypeConstraint("ValueType"), \ - FixedLengthTensor); \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("FixedLengthTensorV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("IndexType") \ - .TypeConstraint("ValueType"), \ - FixedLengthTensorV2); \ - -REGISTER_SPARSE_TO_FIXED_LENGTH(int64, int64) -REGISTER_SPARSE_TO_FIXED_LENGTH(int64, int32) -REGISTER_SPARSE_TO_FIXED_LENGTH(int64, string) -REGISTER_SPARSE_TO_FIXED_LENGTH(int32, int64) -REGISTER_SPARSE_TO_FIXED_LENGTH(int32, int32) -REGISTER_SPARSE_TO_FIXED_LENGTH(int32, string) diff --git a/twml/libtwml/src/ops/fixed_length_tensor.docx b/twml/libtwml/src/ops/fixed_length_tensor.docx new file mode 100644 index 000000000..e8512e28e Binary files /dev/null and b/twml/libtwml/src/ops/fixed_length_tensor.docx differ diff --git a/twml/libtwml/src/ops/hashed_data_record.cpp b/twml/libtwml/src/ops/hashed_data_record.cpp deleted file mode 100644 index ba094c3d9..000000000 --- a/twml/libtwml/src/ops/hashed_data_record.cpp +++ /dev/null @@ -1,520 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -#include - -REGISTER_OP("DecodeAndHashDataRecord") -.Attr("InputType: {uint8, string}") -.Input("input_bytes: InputType") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("label_features: list(int)") -.Attr("weight_features: list(int) = []") -.Attr("decode_mode: int = 0") -.Output("hashed_data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that creates a handle for the hashed data record. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - label_features: list of feature ids representing the labels. - weight_features: list of feature ids representing the weights. Defaults to empty list. - decode_mode: integer, indicates which decoding method to use. Let a sparse continuous - have a feature_name and a dict of {name: value}. 0 indicates feature_ids are computed - as hash(name). 1 indicates feature_ids are computed as hash(feature_name, name) - shared_name: name used by the resource handle inside the resource manager. - container: name used by the container of the resources. - -Input - input_bytes: Input tensor containing the serialized batch of HashedDataRecords. - -Outputs - hashed_data_record_handle: A resource handle to batch of HashedDataRecords. 
-)doc"); - -template -class DecodeAndHashDataRecord : public OpKernel { - public: - explicit DecodeAndHashDataRecord(OpKernelConstruction* context) - : OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - - std::vector label_features; - std::vector weight_features; - - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - OP_REQUIRES_OK(context, context->GetAttr("label_features", &label_features)); - OP_REQUIRES_OK(context, context->GetAttr("weight_features", &weight_features)); - OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode)); - - OP_REQUIRES(context, keep_features.size() == keep_codes.size(), - errors::InvalidArgument("keep keys and values must have same size.")); - -#ifdef USE_DENSE_HASH - m_keep_map.set_empty_key(0); - m_labels_map.set_empty_key(0); - m_weights_map.set_empty_key(0); -#endif // USE_DENSE_HASH - - for (uint64_t i = 0; i < keep_features.size(); i++) { - m_keep_map[keep_features[i]] = keep_codes[i]; - } - - for (uint64_t i = 0; i < label_features.size(); i++) { - m_labels_map[label_features[i]] = i; - } - - for (uint64_t i = 0; i < weight_features.size(); i++) { - m_weights_map[weight_features[i]] = i; - } - } - - private: - twml::Map m_keep_map; - twml::Map m_labels_map; - twml::Map m_weights_map; - int64 m_decode_mode; - - void Compute(OpKernelContext* context) override { - try { - HashedDataRecordResource *resource = nullptr; - OP_REQUIRES_OK(context, makeResourceHandle(context, 0, &resource)); - - // Store the input bytes in the resource so it isnt freed before the resource. - // This is necessary because we are not copying the contents for tensors. - resource->input = context->input(0); - int batch_size = getBatchSize(resource->input); - int num_labels = static_cast(m_labels_map.size()); - int num_weights = static_cast(m_weights_map.size()); - - twml::HashedDataRecordReader reader; - reader.setKeepMap(&m_keep_map); - reader.setLabelsMap(&m_labels_map); - reader.setDecodeMode(m_decode_mode); - - // Do not set weight map if it is empty. This will take a faster path. - if (num_weights != 0) { - reader.setWeightsMap(&m_weights_map); - } - - resource->records.clear(); - resource->records.reserve(batch_size); - - int64 total_size = 0; - - for (int id = 0; id < batch_size; id++) { - const uint8_t *input_bytes = getInputBytes(resource->input, id); - reader.setBuffer(input_bytes); - resource->records.emplace_back(num_labels, num_weights); - resource->records[id].decode(reader); - total_size += static_cast(resource->records[id].totalSize()); - } - - resource->total_size = total_size; - resource->num_labels = num_labels; - resource->num_weights = num_weights; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetIdsFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("ids: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns unhashed ids from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) -)doc"); - -// This Kernel is used for both training and serving once the resource is created. 
-class GetIdsFromHashedDataRecord : public OpKernel { - public: - explicit GetIdsFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle<HashedDataRecordResource>(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - const int64 common_size = static_cast<int64>(common.totalSize()); - const int64 total_size = handle->total_size; - TensorShape shape = {total_size}; - - Tensor *ids; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - - int id = 0; - int64 offset = 0; - auto ids_flat = ids->flat<int64>(); - for (const auto &record : records) { - // Since common features are added to each input, add the common_size to the current size. - // For training common_size == 0, for serving it can be a non-zero value. - int64 curr_size = static_cast<int64>(record.totalSize()) + common_size; - std::fill(ids_flat.data() + offset, ids_flat.data() + offset + curr_size, id); - offset += curr_size; - id++; - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - - -// OutType: Output Tensor Type. FieldType: The storage type used inside HashedDataRecord. -template<typename OutType, typename FieldType> -class GetOutputFromHashedDataRecord : public OpKernel { - protected: - using Getter = std::function<const std::vector<FieldType>&(const twml::HashedDataRecord &)>; - Getter getter; - - public: - explicit GetOutputFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle<HashedDataRecordResource>(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - const int64 total_size = handle->total_size; - TensorShape shape = {total_size}; - - Tensor *output; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output)); - - const auto &common_output = getter(common); - - auto output_data = output->flat<OutType>().data(); - for (const auto &record : records) { - // This does not copy anything during training, as common_size == 0. - // It will copy the relevant common features coming from a batch prediction request. - output_data = std::copy(common_output.begin(), common_output.end(), output_data); - - // Copy the current record to output. - const auto& rec_output = getter(record); - output_data = std::copy(rec_output.begin(), rec_output.end(), output_data); - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetUKeysFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("ukeys: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns unhashed keys from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - ukeys: unhashed keys / raw feature ids from the original request.
-)doc"); - -class GetUKeysFromHashedDataRecord : public GetOutputFromHashedDataRecord { - public: - explicit GetUKeysFromHashedDataRecord(OpKernelConstruction* context) - : GetOutputFromHashedDataRecord(context){ - getter = [](const twml::HashedDataRecord &record) -> const std::vector & { - return record.keys(); - }; - } -}; - -REGISTER_OP("GetKeysFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("keys: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns keys from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - keys: keys after raw feature ids are hashed with values (int64) -)doc"); - -class GetKeysFromHashedDataRecord : public GetOutputFromHashedDataRecord { - public: - explicit GetKeysFromHashedDataRecord(OpKernelConstruction* context) - : GetOutputFromHashedDataRecord(context){ - getter = [](const twml::HashedDataRecord &record) -> const std::vector & { - return record.transformed_keys(); - }; - } -}; - -REGISTER_OP("GetValuesFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns values from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - values: feature values. -)doc"); - -class GetValuesFromHashedDataRecord : public GetOutputFromHashedDataRecord { - public: - explicit GetValuesFromHashedDataRecord(OpKernelConstruction* context) - : GetOutputFromHashedDataRecord(context){ - getter = [](const twml::HashedDataRecord &record) -> const std::vector & { - return record.values(); - }; - } -}; - -REGISTER_OP("GetCodesFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("codes: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns codes from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - codes: deepbird feature code, usually from A,B,C,D ... in the config. -)doc"); - -class GetCodesFromHashedDataRecord : public GetOutputFromHashedDataRecord { - public: - explicit GetCodesFromHashedDataRecord(OpKernelConstruction* context) - : GetOutputFromHashedDataRecord(context){ - getter = [](const twml::HashedDataRecord &record) -> const std::vector & { - return record.codes(); - }; - } -}; - -REGISTER_OP("GetTypesFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("types: int8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns types from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - types: feature types corresponding to BINARY, DISCRETE, etc. 
-)doc"); - -class GetTypesFromHashedDataRecord : public GetOutputFromHashedDataRecord { - public: - explicit GetTypesFromHashedDataRecord(OpKernelConstruction* context) - : GetOutputFromHashedDataRecord(context){ - getter = [](const twml::HashedDataRecord &record) -> const std::vector & { - return record.types(); - }; - } -}; - -REGISTER_OP("GetBatchSizeFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("batch_size: int64") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that returns batch size from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - batch_size: Number of records held in the handle. -)doc"); - -class GetBatchSizeFromHashedDataRecord : public OpKernel { - public: - explicit GetBatchSizeFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - Tensor *output; - OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output)); - output->scalar()() = handle->records.size(); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetTotalSizeFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("total_size: int64") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that returns total size from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - total_size: Total number of keys / values in the batch. -)doc"); - -class GetTotalSizeFromHashedDataRecord : public OpKernel { - public: - explicit GetTotalSizeFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - - Tensor *output; - OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output)); - output->scalar()() = handle->total_size; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetLabelsFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("labels: float") -.Attr("default_label: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns labels from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - labels: A 2D tensor of size [batch_size, num_labels] containing the label values. -)doc"); - -class GetLabelsFromHashedDataRecord : public OpKernel { - private: - float default_label; - - public: - explicit GetLabelsFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("default_label", &default_label)); - } - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const int num_labels = static_cast(handle->num_labels); - TensorShape shape = {static_cast(handle->records.size()), num_labels}; - - Tensor *labels; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &labels)); - - // The default value of label is not present in data record is std::nanf - // For continuous labels, change that to a default_label or label. 
- auto func = [this](float label) -> float { - return std::isnan(label) ? default_label : label; - }; - - auto labels_data = labels->flat().data(); - for (const auto &record : records) { - const auto& rec_labels = record.labels(); - labels_data = std::transform(rec_labels.begin(), rec_labels.end(), labels_data, func); - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetWeightsFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("weights: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns weights from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - weights: A 2D tensor of size [batch_size, num_weights] containing the weight values. -)doc"); - -class GetWeightsFromHashedDataRecord : public OpKernel { - public: - explicit GetWeightsFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const int num_weights = static_cast(handle->num_weights); - TensorShape shape = {static_cast(handle->records.size()), num_weights}; - - Tensor *weights; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &weights)); - - auto weights_data = weights->flat().data(); - for (const auto &record : records) { - const auto& rec_weights = record.weights(); - weights_data = std::copy(rec_weights.begin(), rec_weights.end(), weights_data); - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - - -#define REGISTER_DECODE_AND_HASH(InputType) \ - REGISTER_KERNEL_BUILDER( \ - Name("DecodeAndHashDataRecord") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("InputType"), \ - DecodeAndHashDataRecord); \ - -REGISTER_DECODE_AND_HASH(uint8) -REGISTER_DECODE_AND_HASH(string) - -#define REGISTER_GETTER(FIELD) \ - REGISTER_KERNEL_BUILDER( \ - Name("Get" #FIELD "FromHashedDataRecord") \ - .Device(DEVICE_CPU), \ - Get##FIELD##FromHashedDataRecord); \ - -REGISTER_GETTER(Ids) -REGISTER_GETTER(UKeys) -REGISTER_GETTER(Keys) -REGISTER_GETTER(Values) -REGISTER_GETTER(Codes) -REGISTER_GETTER(Types) -REGISTER_GETTER(BatchSize) -REGISTER_GETTER(TotalSize) -REGISTER_GETTER(Labels) -REGISTER_GETTER(Weights) diff --git a/twml/libtwml/src/ops/hashed_data_record.docx b/twml/libtwml/src/ops/hashed_data_record.docx new file mode 100644 index 000000000..3bbb6a5c7 Binary files /dev/null and b/twml/libtwml/src/ops/hashed_data_record.docx differ diff --git a/twml/libtwml/src/ops/hashing_discretizer.cpp b/twml/libtwml/src/ops/hashing_discretizer.cpp deleted file mode 100644 index 634f6db33..000000000 --- a/twml/libtwml/src/ops/hashing_discretizer.cpp +++ /dev/null @@ -1,260 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/work_sharder.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -void ComputeHashingDiscretizer( - OpKernelContext*, - int64_t, - const twml::Map &, - int64_t, - int64_t, - int64_t); - -REGISTER_OP("HashingDiscretizer") -.Attr("T: {float, double}") -.Input("input_ids: int64") -.Input("input_vals: T") -.Input("bin_vals: T") -.Attr("feature_ids: tensor = { dtype: DT_INT64 }") 
-.Attr("n_bin: int") -.Attr("output_bits: int") -.Attr("cost_per_unit: int") -.Attr("options: int") -.Output("new_keys: int64") -.Output("new_vals: T") -.SetShapeFn( - [](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - c->set_output(1, c->input(1)); - return Status::OK(); - } -) -.Doc(R"doc( - -This operation discretizes a tensor containing continuous features (if calibrated). - - note - choice of float or double should be consistent among inputs/output - -Input - input_ids(int64): A tensor containing input feature ids (direct from data record). - input_vals(float/double): A tensor containing input values at corresponding feature ids. - - i.e. input_ids[i] <-> input_vals[i] for each i - bin_vals(float/double): A tensor containing the bin boundaries for values of a given feature. - - float or double, matching input_vals - feature_ids(int64 attr): 1D TensorProto of feature IDs seen during calibration - -> hint: look up make_tensor_proto: - proto_init = np.array(values, dtype=np.int64) - tensor_attr = tf.make_tensor_proto(proto_init) - n_bin(int): The number of bin boundary values per feature - -> hence, n_bin + 1 buckets for each feature - output_bits(int): The maximum number of bits to use for the output IDs. - cost_per_unit(int): An estimate of the number of CPU cycles (or nanoseconds - if not CPU-bound) to complete a unit of work. Overestimating creates too - many shards and CPU time will be dominated by per-shard overhead, such as - Context creation. Underestimating may not fully make use of the specified - parallelism. - options(int): selects behavior of the op. - 0x00 in bits{1:0} for std::lower_bound bucket search. - 0x01 in bits{1:0} for linear bucket search - 0x02 in bits{1:0} for std::upper_bound bucket search - 0x00 in bits{4:2} for integer_multiplicative_hashing - 0x01 in bits{4:2} for integer64_multiplicative_hashing - higher bits/other values are reserved for future extensions - -Outputs - new_keys(int64): The discretized feature ids with same shape and size as keys. - new_vals(float or double): The discretized values with the same shape and size as vals. - -Operation - Note that the discretization operation maps observation vectors to higher dimensional - observation vectors. Here, we describe this mapping. - - Let a calibrated feature observation be given by (F,x), where F is the ID of the - feature, and x is some real value (i.e., continuous feature). This kind of - representation is useful for the representation of sparse vectors, where there - are many zeros. - - For example, for a dense feature vector [1.2, 2.4, 3.6], we might have - (0, 1.2) (1, 2.4) and (2, 3.6), with feature IDs indicating the 0th, 1st, and 2nd - elements of the vector. - - The disretizer performs the following operation: - (F,x) -> (map(x|F),1). - Hence, we have that map(x|F) is a new feature ID, and the value observed for that - feature is 1. We might read map(x|F) as 'the map of x for feature F'. - - For each feature F, we associate a (discrete, finite) set of new feature IDs, newIDs(F). - We will then have that map(x|F) is in the set newIDs(F) for any value of x. Each - set member of newIDs(F) is associated with a 'bin', as defined by the bin - boundaries given in the bin_vals input array. For any two different feature IDs F - and G, we would ideally have that INTERSECT(newIDs(F),newIDs(G)) is the empty set. - However, this is not guaranteed for this discretizer. 
- - In the case of this hashing discretizer, map(x|F) can actually be written as follows: - let bucket = bucket(x|F) be the the bucket index for x, according to the - calibration on F. (This is an integer value in [0,n_bin], inclusive) - F is an integer ID. Here, we have that map(x|F) = hash_fn(F,bucket). This has - the desirable property that the new ID depends only on the calibration data - supplied for feature F, and not on any other features in the dataset (e.g., - number of other features present in the calibration data, or order of features - in the dataset). Note that PercentileDiscretizer does NOT have this property. - This comes at the expense of the possibility of output ID collisions, which - we try to minimize through the design of hash_fn. - - Example - consider input vector with a single element, i.e. [x]. - Let's Discretize to one of 2 values, as follows: - Let F=0 for the ID of the single feature in the vector. - Let the bin boundary of feature F=0 be BNDRY(F) = BNDRY(0) since F=0 - bucket = bucket(x|F=0) = 0 if x<=BNDRY(0) else 1 - Let map(x|F) = hash_fn(F=0,bucket=0) if x<=BNDRY(0) else hash_fn(F=0,bucket=1) - If we had another element y in the vector, i.e. [x, y], then we might additionally - Let F=1 for element y. - Let the bin boundary be BNDRY(F) = BNDRY(1) since F=1 - bucket = bucket(x|F=1) = 0 if x<=BNDRY(1) else 1 - Let map(x|F) = hash_fn(F=1,bucket=0) if x<=BNDRY(1) else hash_fn(F=1,bucket=1) - Note how the construction of map(x|F=1) does not depend on whether map(x|F=0) - was constructed. -)doc"); - -template -class HashingDiscretizer : public OpKernel { - public: - explicit HashingDiscretizer(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, - context->GetAttr("n_bin", &n_bin_)); - OP_REQUIRES(context, - n_bin_ > 0, - errors::InvalidArgument("Must have n_bin_ > 0.")); - - OP_REQUIRES_OK(context, - context->GetAttr("output_bits", &output_bits_)); - OP_REQUIRES(context, - output_bits_ > 0, - errors::InvalidArgument("Must have output_bits_ > 0.")); - - OP_REQUIRES_OK(context, - context->GetAttr("cost_per_unit", &cost_per_unit_)); - OP_REQUIRES(context, - cost_per_unit_ >= 0, - errors::InvalidArgument("Must have cost_per_unit >= 0.")); - - OP_REQUIRES_OK(context, - context->GetAttr("options", &options_)); - - // construct the ID_to_index hash map - Tensor feature_IDs; - - // extract the tensors - OP_REQUIRES_OK(context, - context->GetAttr("feature_ids", &feature_IDs)); - - // for access to the data - // int64_t data type is set in to_layer function of the calibrator objects in Python - auto feature_IDs_flat = feature_IDs.flat(); - - // verify proper dimension constraints - OP_REQUIRES(context, - feature_IDs.shape().dims() == 1, - errors::InvalidArgument("feature_ids must be 1D.")); - - // reserve space in the hash map and fill in the values - int64_t num_features = feature_IDs.shape().dim_size(0); -#ifdef USE_DENSE_HASH - ID_to_index_.set_empty_key(0); - ID_to_index_.resize(num_features); -#else - ID_to_index_.reserve(num_features); -#endif // USE_DENSE_HASH - for (int64_t i = 0 ; i < num_features ; i++) { - ID_to_index_[feature_IDs_flat(i)] = i; - } - } - - void Compute(OpKernelContext* context) override { - ComputeHashingDiscretizer( - context, - output_bits_, - ID_to_index_, - n_bin_, - cost_per_unit_, - options_); - } - - private: - twml::Map ID_to_index_; - int n_bin_; - int output_bits_; - int cost_per_unit_; - int options_; -}; - -#define REGISTER(Type) \ - REGISTER_KERNEL_BUILDER( \ - Name("HashingDiscretizer") \ - 
.Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - HashingDiscretizer); \ - -REGISTER(float); -REGISTER(double); - -void ComputeHashingDiscretizer( - OpKernelContext* context, - int64_t output_bits, - const twml::Map &ID_to_index, - int64_t n_bin, - int64_t cost_per_unit, - int64_t options) { - const Tensor& keys = context->input(0); - const Tensor& vals = context->input(1); - const Tensor& bin_vals = context->input(2); - - const int64 output_size = keys.dim_size(0); - - TensorShape output_shape; - OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(&output_size, 1, &output_shape)); - - Tensor* new_keys = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &new_keys)); - Tensor* new_vals = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, &new_vals)); - - try { - twml::Tensor out_keys_ = TFTensor_to_twml_tensor(*new_keys); - twml::Tensor out_vals_ = TFTensor_to_twml_tensor(*new_vals); - - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - const twml::Tensor in_vals_ = TFTensor_to_twml_tensor(vals); - const twml::Tensor bin_vals_ = TFTensor_to_twml_tensor(bin_vals); - - // retrieve the thread pool from the op context - auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - - // Definition of the computation thread - auto task = [&](int64 start, int64 limit) { - twml::hashDiscretizerInfer(out_keys_, out_vals_, - in_keys_, in_vals_, - n_bin, - bin_vals_, - output_bits, - ID_to_index, - start, limit, - options); - }; - - // let Tensorflow split up the work as it sees fit - Shard(worker_threads.num_threads, - worker_threads.workers, - output_size, - static_cast(cost_per_unit), - task); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } -} - diff --git a/twml/libtwml/src/ops/hashing_discretizer.docx b/twml/libtwml/src/ops/hashing_discretizer.docx new file mode 100644 index 000000000..c12ca9805 Binary files /dev/null and b/twml/libtwml/src/ops/hashing_discretizer.docx differ diff --git a/twml/libtwml/src/ops/hashmap.cpp b/twml/libtwml/src/ops/hashmap.cpp deleted file mode 100644 index ce11ff81d..000000000 --- a/twml/libtwml/src/ops/hashmap.cpp +++ /dev/null @@ -1,84 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include - -#include - -using namespace tensorflow; - -REGISTER_OP("Hashmap") -.Input("keys: int64") -.Input("hash_keys: int64") -.Input("hash_values: int64") -.Output("values: int64") -.Output("mask: int8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // TODO: check if the sizes are different in the input - c->set_output(0, c->input(0)); - c->set_output(1, c->input(0)); - return Status::OK(); - }); - - -class Hashmap : public OpKernel { - private: - twml::HashMap hmap; - std::once_flag flag; - - public: - explicit Hashmap(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - // Quick hack - const Tensor& keys = context->input(0); - - std::call_once(this->flag, [this, context](){ - const Tensor& hash_keys = context->input(1); - const Tensor& hash_values = context->input(2); - const auto hash_keys_flat = hash_keys.flat(); - const auto hash_values_flat = hash_values.flat(); - const int64 N = hash_keys_flat.size(); - - for (int64 i = 0; i < N; i++) { - hmap.insert(hash_keys_flat(i), hash_values_flat(i)); - } - }); - - Tensor* values = nullptr; - 
OP_REQUIRES_OK(context, context->allocate_output(0, keys.shape(), - &values)); - - Tensor* mask = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, keys.shape(), - &mask)); - - // copy the values without sharing storage - values->flat<int64>() = keys.flat<int64>(); - - auto keys_flat = keys.flat<int64>(); - auto values_flat = values->flat<int64>(); - auto mask_flat = mask->flat<int8>(); - - // TODO: use twml tensor - const int64 N = keys_flat.size(); - for (int64 i = 0; i < N; i++) { - // values_flat(i), keys_flat(i) return references to tensorflow::int64. - // Using them in hmap.get() was causing issues because of automatic casting. - int64_t val = values_flat(i); - int64_t key = keys_flat(i); - mask_flat(i) = hmap.get(val, key); - values_flat(i) = val; - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("Hashmap") - .Device(DEVICE_CPU), - Hashmap); diff --git a/twml/libtwml/src/ops/hashmap.docx b/twml/libtwml/src/ops/hashmap.docx new file mode 100644 index 000000000..3041c759c Binary files /dev/null and b/twml/libtwml/src/ops/hashmap.docx differ diff --git a/twml/libtwml/src/ops/isotonic_calibration.cpp b/twml/libtwml/src/ops/isotonic_calibration.cpp deleted file mode 100644 index 10a8c22dc..000000000 --- a/twml/libtwml/src/ops/isotonic_calibration.cpp +++ /dev/null @@ -1,81 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("IsotonicCalibration") -.Attr("T: {float, double}") -.Input("input: T") -.Input("xs: T") -.Input("ys: T") -.Output("output: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // output shape should be the same as input shape. - c->set_output(0, c->input(0)); - return Status::OK(); -}).Doc(R"doc( - -This operation calibrates probabilities by fitting to a piece-wise non-decreasing function. - -Input - input: A tensor containing uncalibrated probabilities. - xs: A tensor containing the boundaries of the bins. - ys: A tensor containing calibrated values for the corresponding bins. - -Expected Sizes: - input: [batch_size, num_labels]. - xs, ys: [num_labels, num_bins]. - -Expected Types: - input: float or double. - xs, ys: same as input. - -Outputs - output: A tensor containing calibrated probabilities with same shape and size as input.
- -)doc"); - -template -class IsotonicCalibration : public OpKernel { - public: - explicit IsotonicCalibration(OpKernelConstruction* context) - : OpKernel(context) {} - - - void Compute(OpKernelContext* context) override { - const Tensor& input = context->input(0); - const Tensor& xs = context->input(1); - const Tensor& ys = context->input(2); - - Tensor* output = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output(0, input.shape(), &output)); - - try { - const twml::Tensor twml_input = TFTensor_to_twml_tensor(input); - const twml::Tensor twml_xs = TFTensor_to_twml_tensor(xs); - const twml::Tensor twml_ys = TFTensor_to_twml_tensor(ys); - twml::Tensor twml_output = TFTensor_to_twml_tensor(*output); - - twml::linearInterpolation(twml_output, twml_input, twml_xs, twml_ys); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("IsotonicCalibration") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - IsotonicCalibration); \ - -REGISTER(float); -REGISTER(double); diff --git a/twml/libtwml/src/ops/isotonic_calibration.docx b/twml/libtwml/src/ops/isotonic_calibration.docx new file mode 100644 index 000000000..ab2331255 Binary files /dev/null and b/twml/libtwml/src/ops/isotonic_calibration.docx differ diff --git a/twml/libtwml/src/ops/num_intra_op_threads.cpp b/twml/libtwml/src/ops/num_intra_op_threads.cpp deleted file mode 100644 index 7e5ef0cbf..000000000 --- a/twml/libtwml/src/ops/num_intra_op_threads.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/common_shape_fns.h" - -using namespace tensorflow; - -REGISTER_OP("NumIntraOpThreads") -.Input("x: float32") -.Output("num_intra_op_threads: int32") -.SetShapeFn(tensorflow::shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that returns the number of threads in the intra_op_parallelism pool -This is not part of the Tensorflow API as of the date of writing this doc. Hence, -a tensorflow operation is the best resort. -Input - x: Dummy placeholder so that constant folding is not done by TF GraphOptimizer. - Please refer https://github.com/tensorflow/tensorflow/issues/22546 for more - details. 
-Output - num_intra_op_threads: A scalar tensor corresponding to the number of threads in - the intra_op_parallelism pool -)doc"); - -class NumIntraOpThreads : public OpKernel { - public: - explicit NumIntraOpThreads(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - int num_intra_op_threads = context->device()->tensorflow_cpu_worker_threads()->num_threads; - Tensor* output_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output_tensor)); - auto output_flat = output_tensor->flat(); - output_flat(0) = num_intra_op_threads; - } -}; - -REGISTER_KERNEL_BUILDER(Name("NumIntraOpThreads").Device(DEVICE_CPU), NumIntraOpThreads); diff --git a/twml/libtwml/src/ops/num_intra_op_threads.docx b/twml/libtwml/src/ops/num_intra_op_threads.docx new file mode 100644 index 000000000..ca26ebaa8 Binary files /dev/null and b/twml/libtwml/src/ops/num_intra_op_threads.docx differ diff --git a/twml/libtwml/src/ops/par_add.cpp b/twml/libtwml/src/ops/par_add.cpp deleted file mode 100644 index c03c1ad89..000000000 --- a/twml/libtwml/src/ops/par_add.cpp +++ /dev/null @@ -1,75 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/work_sharder.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/logging.h" -#include - -#include - -using namespace tensorflow; - -REGISTER_OP("ParAdd") - .Input("input_a: float") - .Input("input_b: float") - .Output("a_plus_b: float") - .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }); - - -class ParAddOp : public OpKernel { - public: - explicit ParAddOp(OpKernelConstruction* context) : OpKernel(context) { - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor0 = context->input(0); - auto input_flat0 = input_tensor0.flat(); - const Tensor& input_tensor1 = context->input(1); - auto input_flat1 = input_tensor1.flat(); - - OP_REQUIRES(context, input_tensor0.shape() == input_tensor1.shape(), - errors::InvalidArgument("Input tensors must be identical shape.")); - - // Create an output tensor - Tensor* output_tensor = NULL; - OP_REQUIRES_OK(context, - context->allocate_output(0, - input_tensor0.shape(), - &output_tensor)); - auto output_flat = output_tensor->flat(); - - // PARALLEL ADD - const int N = input_flat0.size(); - - // retrieve the thread pool from the op context - auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - - // Definition of the computation thread - auto task = [=, &input_flat0, &input_flat1, &output_flat](int64 start, int64 limit) { - for (; start < limit; ++start) { - output_flat(start) = input_flat0(start) + input_flat1(start); - } - }; - - // this is a heuristic. 
high number is likely to be sharded into smaller pieces - int64 cost_per_unit = 1; - - // let Tensorflow split up the work as it sees fit - Shard(worker_threads.num_threads, - worker_threads.workers, - N, - cost_per_unit, - task); - } -}; - -REGISTER_KERNEL_BUILDER(Name("ParAdd").Device(DEVICE_CPU), ParAddOp); - - diff --git a/twml/libtwml/src/ops/par_add.docx b/twml/libtwml/src/ops/par_add.docx new file mode 100644 index 000000000..57f97f38a Binary files /dev/null and b/twml/libtwml/src/ops/par_add.docx differ diff --git a/twml/libtwml/src/ops/partition_sparse_tensor.cpp b/twml/libtwml/src/ops/partition_sparse_tensor.cpp deleted file mode 100644 index 4a210ba7f..000000000 --- a/twml/libtwml/src/ops/partition_sparse_tensor.cpp +++ /dev/null @@ -1,125 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("PartitionSparseTensorMod") -.Attr("T: {float, double}") -.Input("indices: int64") -.Input("values: T") -.Output("result: output_types") -.Attr("num_partitions: int") -.Attr("output_types: list({int64, float, double})") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); -}).Doc(R"doc( - -A tensorflow OP that partitions an input batch represented as a sparse tensor -(indices are [ids, keys]) into separate sparse tensors to more optimally place -sparse computations in distributed training. - -Inputs - indices: Indices from sparse tensor ([ids, keys] from the batch). - values: Batch values from the original features dict. - -Attr - num_partitions: Number of partitions to generate. - output_types: A list of types for the output tensors like - [tf.int64, tf.float32, tf.int64, tf.float32, ...] - The length must be 2 * num_partitions (see Outputs below) - -Outputs - List of dense tensors containing for each partition: - - partitioned indices tensor ([ids, keys] from partitioned batch) - - partitioned values tensor - The list lenth is 2 * num_partitions. Example: - [ [ids_1, keys_1], values_1, [ids_2, keys_2], values_2, ... 
] -)doc"); - -template -class PartitionSparseTensorMod : public OpKernel { - private: - int64 num_partitions; - - public: - explicit PartitionSparseTensorMod(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("num_partitions", &num_partitions)); - OP_REQUIRES(context, num_partitions > 0, - errors::InvalidArgument("Number of partitions must be positive")); - } - - void Compute(OpKernelContext* context) override { - // grab input tensors - const Tensor& indices_tensor = context->input(0); // (ids, keys) - const Tensor& values_tensor = context->input(1); - - // check sizes - int64 num_keys = indices_tensor.shape().dim_size(0); - OP_REQUIRES(context, indices_tensor.dims() == 2, - errors::InvalidArgument("Indices tensor must be 2D [ids, keys]")); - OP_REQUIRES(context, indices_tensor.shape().dim_size(1) == 2, - errors::InvalidArgument("Indices tensor must have 2 cols [ids, keys]")); - OP_REQUIRES(context, values_tensor.shape().dim_size(0) == num_keys, - errors::InvalidArgument("Number of values must match number of keys")); - - // grab input vectors - auto indices = indices_tensor.flat(); - auto values = values_tensor.flat(); - - // count the number of features that fall in each partition - std::vector partition_counts(num_partitions); - - for (int i = 0; i < num_keys; i++) { - int64 key = indices(2 * i + 1); - int64 partition_id = key % num_partitions; - partition_counts[partition_id]++; - } - - // allocate outputs for each partition and keep references - std::vector output_indices_partitions; - std::vector output_values_partitions; - output_indices_partitions.reserve(num_partitions); - output_values_partitions.reserve(num_partitions); - - for (int i = 0; i < num_partitions; i++) { - Tensor *output_indices = nullptr, *output_values = nullptr; - TensorShape shape_indices = TensorShape({partition_counts[i], 2}); - TensorShape shape_values = TensorShape({partition_counts[i]}); - - OP_REQUIRES_OK(context, context->allocate_output(2 * i, shape_indices, &output_indices)); - OP_REQUIRES_OK(context, context->allocate_output(2 * i + 1, shape_values, &output_values)); - - output_indices_partitions.push_back(output_indices->flat().data()); - output_values_partitions.push_back(output_values->flat().data()); - } - - // assign a partition id to each feature - // populate tensors for each partition - std::vector partition_indices(num_partitions); - - for (int i = 0; i < num_keys; i++) { - int64 key = indices(2 * i + 1); - int64 pid = key % num_partitions; // partition id - int64 idx = partition_indices[pid]++; - - output_indices_partitions[pid][2 * idx] = indices(2 * i); - output_indices_partitions[pid][2 * idx + 1] = key / num_partitions; - output_values_partitions[pid][idx] = values(i); - } - } -}; - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("PartitionSparseTensorMod") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - PartitionSparseTensorMod); \ - -REGISTER(float); -REGISTER(double); diff --git a/twml/libtwml/src/ops/partition_sparse_tensor.docx b/twml/libtwml/src/ops/partition_sparse_tensor.docx new file mode 100644 index 000000000..148c2dc0b Binary files /dev/null and b/twml/libtwml/src/ops/partition_sparse_tensor.docx differ diff --git a/twml/libtwml/src/ops/percentile_discretizer_v2.cpp b/twml/libtwml/src/ops/percentile_discretizer_v2.cpp deleted file mode 100644 index 2a0dac7d8..000000000 --- a/twml/libtwml/src/ops/percentile_discretizer_v2.cpp +++ /dev/null @@ -1,241 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include 
"tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/work_sharder.h" - -#include -#include "tensorflow_utils.h" - - -using namespace tensorflow; - -void CombinedComputeDiscretizers( - OpKernelContext*, - int64_t, - const twml::Map&, - int64_t); - -REGISTER_OP("PercentileDiscretizerV2") -.Attr("T: {float, double}") -.Input("input_ids: int64") -.Input("input_vals: T") -.Input("bin_ids: int64") -.Input("bin_vals: T") -.Input("feature_offsets: int64") -.Input("start_compute: int64") -.Input("end_compute: int64") -.Attr("output_bits: int") -.Attr("feature_ids: tensor = { dtype: DT_INT64 }") -.Attr("feature_indices: tensor = { dtype: DT_INT64 }") -.Attr("cost_per_unit: int") -.Output("new_keys: int64") -.Output("new_vals: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // TODO: check sizes - c->set_output(0, c->input(0)); - c->set_output(1, c->input(0)); - return Status::OK(); -}).Doc(R"doc( - -This operation discretizes a tensor containing continuous features (if calibrated). - - note - choice of float or double should be consistent among inputs/output - -Input - input_ids(int64): A tensor containing input feature ids (direct from data record). - input_vals: A tensor containing input values at corresponding feature ids. - - i.e. input_ids[i] <-> input_vals[i] for each i - - float or double - bin_ids(int64): A tensor containing the discretized feature id for each bin. - bin_vals: A tensor containing the bin boundaries for values of a given feature. - - float or double - feature_offsets(int64): Specifies the starting location of bins for a given feature id. - start_compute(int64 scalar tensor): which index to start the computation at - end_compute(int64 scalar tensor): which index to end the computation right before - -> for example, (start_compute,end_compute)=(0,10) would compute on 0 thru 9 - output_bits(int): The maximum number of bits to use for the output IDs. - -> 2**out_bits must be greater than bin_ids.size - feature_ids(int64): 1D TensorProto of feature IDs seen during calibration - feature_indices(int64): 1D TensorProto of feature indices corresponding with feature_IDs - -> hint: look up make_tensor_proto: - proto_init = np.array(values, dtype=np.int64) - tensor_attr = tf.make_tensor_proto(my_proto_init) - cost_per_unit(int): An estimate of the number of CPU cycles (or nanoseconds - if not CPU-bound) to complete a unit of work. Overestimating creates too - many shards and CPU time will be dominated by per-shard overhead, such as - Context creation. Underestimating may not fully make use of the specified - parallelism. - -Outputs - new_keys(int64): The discretized feature ids with same shape and size as keys. - new_vals(float or double): The discretized values with the same shape and size as vals. - -Operation - Note that the discretization operation maps observation vectors to higher dimensional - observation vectors. Here, we describe this mapping. - - Let a calibrated feature observation be given by (F,x), where F is the ID of the - feature, and x is some real value (i.e., continuous feature). This kind of - representation is useful for the representation of sparse vectors, where there - are many zeros. - - For example, for a dense feature vector [1.2, 2.4, 3.6], we might have - (0, 1.2) (1, 2.4) and (2, 3.6), with feature IDs indicating the 0th, 1st, and 2nd - elements of the vector - - The disretizer performs the following operation: - (F,x) -> (map(x|F),1). 
- Hence, we have that map(x|F) is a new feature ID, and the value observed for that - feature is 1. We might read map(x|F) as 'the map of x for feature F'. - - For each feature F, we associate a (discrete, finite) set of new feature IDs, newIDs(F). - We will then have that F~(x) is in the set newIDs(F) for any value of x. Each set member - of newIDs(F) is associated with a 'bin', as defined by the bin boundaries given in - the bin_vals input array. For any two different feature IDs F and G, we have that - INTERSECT(newIDs(F),newIDs(G)) is the empty set - - Example - consider input vector with a single element, i.e. [x]. - Let's Discretize to one of 2 values, as follows: - Let F=0 for the ID of the single feature in the vector. - Let the bin boundary of feature F=0 be BNDRY(F) = BNDRY(0) since F=0 - Let newIDs(F) = newIDs(0) = {0,1} - Let map(x|F) = map(x|0) = 0 if x<=BNDRY else 1 - If we had another element y in the vector, i.e. [x, y], then we might additionally - Let F=1 for element y. - Let the bin boundary be BNDRY(F) = BNDRY(1) since F=1 - Let newIDs(F) = newIDs(1) = {2,3} (so as to have empty intersect with newIDs(0)) - Let map(x|F) = map(x|1) = 2 if x<=BNDRY else 3 - Consider vector observation [-0.1, 0.2]. We then represent this as [(0, -0.1), (1, 0.2)] - Let BNDRY(0) = BNDRY(1) = 0. When we discretize the vector, we get: - (0, -0.1) -> (map(-0.1|0), 1) = (0, 1) - (1, 0.2) -> (map( 0.2|1), 1) = (3, 1) - Our output vector is then represented sparsely as [(0, 1), (3, 1)], and the dense - representation of this could be [1, 0, 0, 1] - -)doc"); - -template -class PercentileDiscretizerV2 : public OpKernel { - public: - explicit PercentileDiscretizerV2(OpKernelConstruction* context) : OpKernel(context) { - // get the number of output bits - // for use with features that have not been calibrated - OP_REQUIRES_OK(context, - context->GetAttr("output_bits", &output_bits_)); - OP_REQUIRES_OK(context, - context->GetAttr("cost_per_unit", &cost_per_unit_)); - OP_REQUIRES(context, cost_per_unit_ >= 0, - errors::InvalidArgument("Must have cost_per_unit >= 0.")); - - // construct the ID_to_index hash map - Tensor feature_IDs; - Tensor feature_indices; - - // extract the tensors - OP_REQUIRES_OK(context, - context->GetAttr("feature_ids", &feature_IDs)); - OP_REQUIRES_OK(context, - context->GetAttr("feature_indices", &feature_indices)); - - // for access to the data - // int64_t data type is set in to_layer function of the calibrator objects in Python - auto feature_IDs_flat = feature_IDs.flat(); - auto feature_indices_flat = feature_indices.flat(); - - // verify proper dimension constraints - OP_REQUIRES(context, feature_IDs.shape() == feature_indices.shape(), - errors::InvalidArgument("feature_ids and feature_indices must be identical shape.")); - OP_REQUIRES(context, feature_IDs.shape().dims() == 1, - errors::InvalidArgument("feature_ids and feature_indices must be 1D.")); - - // reserve space in the hash map and fill in the values - int num_features = feature_IDs.shape().dim_size(0); - -#ifdef USE_DENSE_HASH - ID_to_index_.set_empty_key(0); - ID_to_index_.resize(num_features); -#else - ID_to_index_.reserve(num_features); -#endif // USE_DENSE_HASH - for (int i = 0 ; i < num_features ; i++) { - ID_to_index_[feature_IDs_flat(i)] = feature_indices_flat(i); - } - } - - void Compute(OpKernelContext* context) override { - CombinedComputeDiscretizers( - context, - output_bits_, - ID_to_index_, - cost_per_unit_); - } - - private: - twml::Map ID_to_index_; - int output_bits_; - int cost_per_unit_; -}; - 
-#define REGISTER(Type) \ - REGISTER_KERNEL_BUILDER( \ - Name("PercentileDiscretizerV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - PercentileDiscretizerV2); \ - -REGISTER(float); -REGISTER(double); - -void CombinedComputeDiscretizers( - OpKernelContext* context, - int64_t output_bits, - const twml::Map &ID_to_index, - int64_t cost_per_unit) { - const Tensor& keys = context->input(0); - const Tensor& vals = context->input(1); - const Tensor& bin_ids = context->input(2); - const Tensor& bin_vals = context->input(3); - const Tensor& feature_offsets = context->input(4); - - uint64 full_size = keys.dim_size(0); - const int total_size = static_cast(full_size); - TensorShape output_shape = {total_size}; - - Tensor* new_keys = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &new_keys)); - Tensor* new_vals = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, &new_vals)); - - try { - twml::Tensor out_keys_ = TFTensor_to_twml_tensor(*new_keys); - twml::Tensor out_vals_ = TFTensor_to_twml_tensor(*new_vals); - - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - const twml::Tensor in_vals_ = TFTensor_to_twml_tensor(vals); - const twml::Tensor bin_ids_ = TFTensor_to_twml_tensor(bin_ids); - const twml::Tensor bin_vals_ = TFTensor_to_twml_tensor(bin_vals); - const twml::Tensor feature_offsets_ = TFTensor_to_twml_tensor(feature_offsets); - - // retrieve the thread pool from the op context - auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - - // Definition of the computation thread - auto task = [&](int64 start, int64 limit) { - twml::discretizerInfer(out_keys_, out_vals_, - in_keys_, in_vals_, - bin_ids_, bin_vals_, - feature_offsets_, output_bits, - ID_to_index, - start, limit, - start); - }; - - // let Tensorflow split up the work as it sees fit - Shard(worker_threads.num_threads, - worker_threads.workers, - full_size, - static_cast(cost_per_unit), - task); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } -} diff --git a/twml/libtwml/src/ops/percentile_discretizer_v2.docx b/twml/libtwml/src/ops/percentile_discretizer_v2.docx new file mode 100644 index 000000000..d59a94931 Binary files /dev/null and b/twml/libtwml/src/ops/percentile_discretizer_v2.docx differ diff --git a/twml/libtwml/src/ops/resource_utils.docx b/twml/libtwml/src/ops/resource_utils.docx new file mode 100644 index 000000000..3ec29dfe8 Binary files /dev/null and b/twml/libtwml/src/ops/resource_utils.docx differ diff --git a/twml/libtwml/src/ops/resource_utils.h b/twml/libtwml/src/ops/resource_utils.h deleted file mode 100644 index a41fe6845..000000000 --- a/twml/libtwml/src/ops/resource_utils.h +++ /dev/null @@ -1,126 +0,0 @@ -#pragma once - -#include - -#include -#include -#include - -// Add these to make gcc ignore the warnings from tensorflow. -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-compare" - -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/resource_op_kernel.h" - -#pragma GCC diagnostic pop - -#include -#include - -template -void unrefHandle(T *handle) { - handle->Unref(); -} - -template -using unique_handle = std::unique_ptr >; - -// as std::type_index is not abi compatible, we bypass the hash_code checks. 
-// https://github.com/tensorflow/tensorflow/commit/15275d3a14c77e2244ae1155f93243256f08e3ed -#ifdef __APPLE__ -template -Status CreateTwmlResource(OpKernelContext* ctx, const ResourceHandle& p, T* value) { - return ctx->resource_manager()->Create(p.container(), p.name(), value); -} - -template -Status LookupTwmlResource(OpKernelContext* ctx, const ResourceHandle& p, - T** value) { - return ctx->resource_manager()->Lookup(p.container(), p.name(), value); -} -#endif // __APPLE__ - -template -unique_handle getHandle(tensorflow::OpKernelContext* context, int input_idx) { - using namespace tensorflow; - T *ptr = nullptr; -#ifdef __APPLE__ - auto s = LookupTwmlResource(context, HandleFromInput(context, input_idx), &ptr); -#else - auto s = LookupResource(context, HandleFromInput(context, input_idx), &ptr); -#endif // __APPLE__ - - if (!s.ok()) { - throw std::runtime_error("Failed to get resource handle"); - } - return unique_handle(ptr, unrefHandle); -} - -template -const uint8_t *getInputBytes(const Tensor &input, int id) { - return reinterpret_cast(input.flat().data()); -} - -template<> -inline const uint8_t *getInputBytes(const Tensor &input, int id) { - return reinterpret_cast(input.flat()(id).c_str()); -} - -template -const int getBatchSize(const Tensor &input) { - return 1; -} - -template<> -inline const int getBatchSize(const Tensor &input) { - return static_cast(input.NumElements()); -} - -class DataRecordResource : public ResourceBase { - public: - Tensor input; - int64 num_labels; - int64 num_weights; - twml::DataRecord common; - std::vector records; - twml::Map *keep_map; - string DebugString() const override { return "DataRecords resource"; } -}; - -// A thin layer around batch of HashedDataRecords -class HashedDataRecordResource : public ResourceBase { - public: - Tensor input; - int64 total_size; - int64 num_labels; - int64 num_weights; - twml::HashedDataRecord common; - std::vector records; - string DebugString() const override { return "HashedDataRecord Resource"; } -}; - -#define TF_CHECK_STATUS(fn) do { \ - Status s = fn; \ - if (!s.ok()) return s; \ - } while (0) - -template -Status makeResourceHandle(OpKernelContext* context, int out_idx, ResourceType **resource_) { - static std::atomic id; - Tensor* handle_tensor; - TF_CHECK_STATUS(context->allocate_output(out_idx, TensorShape({}), &handle_tensor)); - - ResourceType *resource = new ResourceType(); - const auto resource_name = typeid(ResourceType).name() + std::to_string(id++); - ResourceHandle handle = MakePerStepResourceHandle(context, resource_name); -#ifdef __APPLE__ - TF_CHECK_STATUS(CreateTwmlResource(context, handle, resource)); -#else - TF_CHECK_STATUS(CreateResource(context, handle, resource)); -#endif // __APPLE__ - handle_tensor->scalar()() = handle; - - *resource_ = resource; - return Status::OK(); -} diff --git a/twml/libtwml/src/ops/scripts/get_inc.docx b/twml/libtwml/src/ops/scripts/get_inc.docx new file mode 100644 index 000000000..d65d511e8 Binary files /dev/null and b/twml/libtwml/src/ops/scripts/get_inc.docx differ diff --git a/twml/libtwml/src/ops/scripts/get_inc.py b/twml/libtwml/src/ops/scripts/get_inc.py deleted file mode 100644 index c50edfa90..000000000 --- a/twml/libtwml/src/ops/scripts/get_inc.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Gets the path of headers for the current Tensorflow library""" - -import tensorflow.compat.v1 as tf - -print(tf.sysconfig.get_include(), end='') diff --git a/twml/libtwml/src/ops/scripts/get_inc.sh b/twml/libtwml/src/ops/scripts/get_inc.sh deleted file mode 100755 index 
5cb064338..000000000 --- a/twml/libtwml/src/ops/scripts/get_inc.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -PEX_INTERPRETER=1 "$PYTHON_ENV" "$LIBTWML_HOME"/src/ops/scripts/get_inc.py diff --git a/twml/libtwml/src/ops/scripts/get_lib.docx b/twml/libtwml/src/ops/scripts/get_lib.docx new file mode 100644 index 000000000..3c977fe6f Binary files /dev/null and b/twml/libtwml/src/ops/scripts/get_lib.docx differ diff --git a/twml/libtwml/src/ops/scripts/get_lib.py b/twml/libtwml/src/ops/scripts/get_lib.py deleted file mode 100644 index 7150c48b7..000000000 --- a/twml/libtwml/src/ops/scripts/get_lib.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Gets the path of headers for the current Tensorflow library""" - -import tensorflow.compat.v1 as tf - -print(tf.sysconfig.get_lib(), end='') diff --git a/twml/libtwml/src/ops/scripts/get_lib.sh b/twml/libtwml/src/ops/scripts/get_lib.sh deleted file mode 100755 index 1b9d802b6..000000000 --- a/twml/libtwml/src/ops/scripts/get_lib.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -PEX_INTERPRETER=1 "$PYTHON_ENV" "$LIBTWML_HOME"/src/ops/scripts/get_lib.py diff --git a/twml/libtwml/src/ops/scripts/symlink.docx b/twml/libtwml/src/ops/scripts/symlink.docx new file mode 100644 index 000000000..9cc175fe0 Binary files /dev/null and b/twml/libtwml/src/ops/scripts/symlink.docx differ diff --git a/twml/libtwml/src/ops/scripts/symlink.sh b/twml/libtwml/src/ops/scripts/symlink.sh deleted file mode 100755 index 2ddb76371..000000000 --- a/twml/libtwml/src/ops/scripts/symlink.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh - -#Needed to create a "nice" symlink to _pywrap_tensorflow_internal.so so -#that cmake can link with the library properly. - -#This library is only needed for streaming datasets and is linked with -#libtwml_tf_data.so which will not be used at runtime. - -TF_PYTHON_LIB_DIR=$(PEX_INTERPRETER=1 "$PYTHON_ENV" "$TWML_HOME"/backends/tensorflow/src/scripts/get_lib.py) -TF_INTERNAL_LIB=$TWML_HOME/backends/tensorflow/twml/lib/libtensorflow_internal.so -rm -f "$TF_INTERNAL_LIB" -ln -s "$TF_PYTHON_LIB_DIR"/python/_pywrap_tensorflow_internal.so "$TF_INTERNAL_LIB" diff --git a/twml/libtwml/src/ops/sleep_op.cpp b/twml/libtwml/src/ops/sleep_op.cpp deleted file mode 100644 index dd9a1834c..000000000 --- a/twml/libtwml/src/ops/sleep_op.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include - -using namespace tensorflow; - -REGISTER_OP("Sleep") -.Input("num_milliseconds: int32") -.Output("sleep_time_in_ms: int32") -.SetShapeFn(tensorflow::shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that sleeps for specified number of milliseconds. -This is a proxy to determine the number of inter_op_parallelism pool. -This is not part of the Tensorflow API as of the date of writing this -doc. Hence, a tensorflow operation is the best resort. 
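A typical probe presumably launches several independent Sleep ops at once and times the whole group: with an inter-op pool of k threads, n such ops each sleeping t milliseconds finish in roughly ceil(n / k) * t, so the measured wall time reveals the pool size.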
-Input - num_milliseconds: A scalar tensor corresponding to the number - of milliseconds the operation should sleep for -Output - sleep_time_in_ms: A scalar tensor corresponding to the - actual number of milliseconds for which the operation slept -)doc"); - -class SleepOp : public OpKernel { - public: - explicit SleepOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - - // Sleep for specified milliseconds - auto start = std::chrono::high_resolution_clock::now(); - std::this_thread::sleep_for(std::chrono::milliseconds(input(0))); - auto end = std::chrono::high_resolution_clock::now(); - std::chrono::duration elapsed = end-start; - - // Set the output tensor - Tensor* output_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output_tensor)); - auto output_flat = output_tensor->flat(); - output_flat(0) = elapsed.count(); - } -}; - -REGISTER_KERNEL_BUILDER(Name("Sleep").Device(DEVICE_CPU), SleepOp); diff --git a/twml/libtwml/src/ops/sleep_op.docx b/twml/libtwml/src/ops/sleep_op.docx new file mode 100644 index 000000000..63fc0535f Binary files /dev/null and b/twml/libtwml/src/ops/sleep_op.docx differ diff --git a/twml/libtwml/src/ops/sparse_normalization.cpp b/twml/libtwml/src/ops/sparse_normalization.cpp deleted file mode 100644 index 9b079429c..000000000 --- a/twml/libtwml/src/ops/sparse_normalization.cpp +++ /dev/null @@ -1,378 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -using namespace tensorflow; - -REGISTER_OP("SparseMaxNorm") -.Attr("epsilon: float") -.Input("max_values: Ref(float)") -.Input("indices: int64") -.Input("values: float") -.Input("is_training: bool") -.Output("updated_max_values: Ref(float)") -.Output("normalized_values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that normalizes a batch of sparse inputs based on the current maximum value. - -Input - max_values: float tensor variable representing the max values seen so far. - indices: int64 tensor representing indices representing a feature. - values: float tensor representing values for the current batch. - is_training: bool tensor specifying if the op should be run in training mode or not. - -Outputs - updated_max_values: max_values updated with the current batch. - normalized_values: Input values normalized by the max value seen so far. - -The pseudo code for normalization can be seen below: - - # During training / inference - for i, idx in enumerate(indices): - updated_max_values[idx] = max(max_values[idx], abs(values[i])) - normalized_values[i] = values[i] / updated_max_values[idx] - -)doc"); - -class SparseMaxNorm : public OpKernel { - private: - float epsilon_; - - public: - explicit SparseMaxNorm(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_)); - } - - void Compute(OpKernelContext *context) override { - // We always return the input ref. 
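// Output 0 is declared Ref(float), so the ref-forwarding below makes
// updated_max_values an alias of the max_values variable rather than a copy;
// writes to max_values(idx) in the loop further down are therefore visible to
// the caller. Per element this reduces to (roughly, as in the doc pseudo code):
//   max_value            = max(max_values[idx], |value|)
//   normalized_values[i] = value / max(max_value, epsilon)   // always in [-1, 1]
//   if is_training: max_values[idx] = max_value
// epsilon_ only guards the division; since |value| <= max_value by construction,
// the result never exceeds 1 in magnitude.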
- context->forward_ref_input_to_ref_output(0, 0); - Tensor max_values_tensor = context->mutable_input(0, false); - - OP_REQUIRES(context, max_values_tensor.IsInitialized(), - errors::FailedPrecondition("Attempting to use uninitialized " - "parameters: ", - requested_input(0))); - - const Tensor &indices_tensor = context->input(1); - const Tensor &values_tensor = context->input(2); - const Tensor &is_training_tensor = context->input(3); - - const auto indices = indices_tensor.flat(); - const auto values = values_tensor.flat(); - const bool is_training = is_training_tensor.scalar()(); - - auto max_values = max_values_tensor.flat(); - Tensor *normalized_values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, values_tensor.shape(), - &normalized_values_tensor)); - - auto normalized_values = normalized_values_tensor->flat(); - - const int64 N = indices.size(); - - for (int64 i = 0; i < N; i++) { - int64 idx = indices(i); - float value = values(i); - float max_value = std::max(max_values(idx), std::abs(value)); - - // Guaranteed to be between [-1, 1]. - normalized_values(i) = value / std::max(max_value, epsilon_); - - if (is_training) { - max_values(idx) = max_value; - } - } - } -}; - -REGISTER_OP("SparseBatchNorm") -.Attr("input_size: int") -.Attr("epsilon: float") -.Input("means: Ref(float)") -.Input("variances: Ref(float)") -.Input("indices: int64") -.Input("values: float") -.Input("is_training: bool") -.Output("updated_means: Ref(float)") -.Output("updated_vars: Ref(float)") -.Output("normalized_values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that performs batch normalization. - -Attr - input_size: Size of the inputs. - epsilon: The minimum value of the variance. - -Input - mean: float tensor variable representing the running mean seen so far. - variances: float tensor variable representing the running variance seen so far. - indices: int64 tensor representing indices representing a feature. - values: float tensor representing values for the current batch. - is_training: bool tensor specifying if the op should be run in training mode or not. - -Outputs - updated_means: mean updated with the current batch. - updated_vars: variances updated with the current batch. - normalized_values: Input values normalized by the max value seen so far. - -The pseudo code for normalization can be seen below: - - if is_training: - means, variances = update_metrics(means, variances, values) - - normalized_values = (values - means) / sqrt(variances + epsilon) - return normalized_values * gamma + beta - -)doc"); - -class SparseBatchNorm : public OpKernel { - private: - std::vector counts_; - std::vector m2s_; - float epsilon_; - - public: - explicit SparseBatchNorm(OpKernelConstruction *context) : OpKernel(context) { - int64 input_size; - OP_REQUIRES_OK(context, context->GetAttr("input_size", &input_size)); - OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_)); - counts_.resize(input_size); - m2s_.resize(input_size); - } - - void Compute(OpKernelContext *context) override { - // We always return the input ref. 
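// The training branch further down is Welford's online update (the "online
// algorithm" from the variance reference cited there), applied independently
// per feature index. In outline, for each observed value x at index idx:
//   count   += 1
//   delta    = x - mean
//   mean    += delta / count
//   delta2   = x - mean              // recomputed with the updated mean
//   M2      += delta * delta2
//   variance = M2 / count            // population variance so far
// count and M2 live in counts_ / m2s_ on the kernel instance itself, so they
// persist across batches but are not saved alongside the means/variances
// variables.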
- context->forward_ref_input_to_ref_output(0, 0); - context->forward_ref_input_to_ref_output(1, 1); - - Tensor means_tensor = context->mutable_input(0, true); - Tensor variances_tensor = context->mutable_input(1, true); - - OP_REQUIRES(context, means_tensor.IsInitialized(), - errors::FailedPrecondition("Attempting to use uninitialized " - "parameters: ", - requested_input(0))); - - OP_REQUIRES(context, variances_tensor.IsInitialized(), - errors::FailedPrecondition("Attempting to use uninitialized " - "parameters: ", - requested_input(1))); - - const Tensor &indices_tensor = context->input(2); - const Tensor &values_tensor = context->input(3); - const Tensor &is_training_tensor = context->input(4); - - const auto indices = indices_tensor.flat(); - const auto values = values_tensor.flat(); - const bool is_training = is_training_tensor.scalar()(); - - auto means = means_tensor.flat(); - auto variances = variances_tensor.flat(); - Tensor *normalized_values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(2, values_tensor.shape(), - &normalized_values_tensor)); - - auto normalized_values = normalized_values_tensor->flat(); - const int64 N = indices.size(); - - if (is_training) { - // Accumulate, mean, count, sum of squared differences. - // Reference wiki: - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm - // Reference paper: - // https://www.jstor.org/stable/1266577?seq=1#page_scan_tab_contents - for (int64 i = 0; i < N; i++) { - int64 idx = indices(i); - int64 count = counts_[idx] + 1; - - float value = values(i); - float old_mean = means(idx); - float old_delta = value - old_mean; - float new_mean = old_mean + old_delta / count; - float new_delta = value - new_mean; - - counts_[idx] = count; - m2s_[idx] += new_delta * old_delta; - means(idx) = new_mean; - variances(idx) = m2s_[idx] / count; - } - } - - // Normalize the values - for (int64 i = 0; i < N; i++) { - int64 idx = indices(i); - float stdev = std::sqrt(variances(idx) + epsilon_); - normalized_values(i) = (values(i) - means(idx)) / stdev; - } - } -}; - -REGISTER_OP("SparseMaxNormInference") -.Attr("epsilon: float") -.Input("max_values: float") -.Input("indices: int64") -.Input("values: float") -.Output("normalized_values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that normalizes a batch of sparse inputs based on the current maximum value. -This is the inference OP. - -Input - max_values: float tensor representing the max values seen so far. - indices: int64 tensor representing indices representing a feature. - values: float tensor representing values for the current batch. - -Outputs - normalized_values: Input values normalized by the max value seen so far. 
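Unlike SparseMaxNorm above, max_values here is a plain float input rather than a Ref(float) variable, so the running maxima are read but never written back; the update shown in the pseudo code below only affects a per-example temporary.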
- -The pseudo code for normalization can be seen below: - - # During inference - for i, idx in enumerate(indices): - updated_max_values[idx] = max(max_values[idx], abs(values[i])) - normalized_values[i] = values[i] / updated_max_values[idx] - -)doc"); - -class SparseMaxNormInference : public OpKernel { - private: - float epsilon_; - - public: - explicit SparseMaxNormInference(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_)); - } - - void Compute(OpKernelContext *context) override { - const Tensor &max_values_tensor = context->input(0); - const Tensor &indices_tensor = context->input(1); - const Tensor &values_tensor = context->input(2); - - const auto max_values = max_values_tensor.flat(); - const auto indices = indices_tensor.flat(); - const auto values = values_tensor.flat(); - - Tensor *normalized_values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, values_tensor.shape(), - &normalized_values_tensor)); - - auto normalized_values = normalized_values_tensor->flat(); - - const int64 N = indices.size(); - - for (int64 i = 0; i < N; i++) { - int64 idx = indices(i); - float value = values(i); - float max_value = std::max(max_values(idx), std::abs(value)); - - // Guaranteed to be between [-1, 1]. - normalized_values(i) = value / std::max(max_value, epsilon_); - } - } -}; - -REGISTER_OP("SparseMaxNormTraining") -.Attr("epsilon: float") -.Input("max_values: float") -.Input("indices: int64") -.Input("values: float") -.Output("updated_max_values: float") -.Output("normalized_values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that normalizes a batch of sparse inputs based on the current maximum value. -This is the training OP. - -Input - max_values: float tensor variable representing the max values seen so far. - indices: int64 tensor representing indices representing a feature. - values: float tensor representing values for the current batch. - -Outputs - updated_max_values: max_values updated with the current batch. - normalized_values: Input values normalized by the max value seen so far. 
- -The pseudo code for normalization can be seen below: - - # During training - for i, idx in enumerate(indices): - updated_max_values[idx] = max(max_values[idx], abs(values[i])) - normalized_values[i] = values[i] / updated_max_values[idx] - -)doc"); - -class SparseMaxNormTraining : public OpKernel { - private: - float epsilon_; - - public: - explicit SparseMaxNormTraining(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_)); - } - - void Compute(OpKernelContext *context) override { - const Tensor &max_values_tensor = context->input(0); - const Tensor &indices_tensor = context->input(1); - const Tensor &values_tensor = context->input(2); - - const auto max_values = max_values_tensor.flat(); - const auto indices = indices_tensor.flat(); - const auto values = values_tensor.flat(); - - Tensor *updated_max_values_tensor = nullptr; - Tensor *normalized_values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, max_values_tensor.shape(), - &updated_max_values_tensor)); - OP_REQUIRES_OK(context, context->allocate_output(1, values_tensor.shape(), - &normalized_values_tensor)); - - auto updated_max_values = updated_max_values_tensor->flat(); - auto normalized_values = normalized_values_tensor->flat(); - - const int64 N = indices.size(); - - // This copy is needed because the values of updated_max_values are originally garbage. - // Also note that N is not the same as max_values.size() - std::copy(max_values.data(), max_values.data() + max_values.size(), updated_max_values.data()); - - for (int64 i = 0; i < N; i++) { - int64 idx = indices(i); - float value = values(i); - float updated_max_value = std::max(updated_max_values(idx), std::abs(value)); - // Guaranteed to be between [-1, 1]. - normalized_values(i) = value / std::max(updated_max_value, epsilon_); - // Saving the updated_max_values - updated_max_values(idx) = updated_max_value; - } - } -}; - - - - -REGISTER_KERNEL_BUILDER( - Name("SparseMaxNorm") - .Device(DEVICE_CPU), - SparseMaxNorm); - -REGISTER_KERNEL_BUILDER( - Name("SparseBatchNorm") - .Device(DEVICE_CPU), - SparseBatchNorm); - -REGISTER_KERNEL_BUILDER( - Name("SparseMaxNormInference") - .Device(DEVICE_CPU), - SparseMaxNormInference); - -REGISTER_KERNEL_BUILDER( - Name("SparseMaxNormTraining") - .Device(DEVICE_CPU), - SparseMaxNormTraining); diff --git a/twml/libtwml/src/ops/sparse_normalization.docx b/twml/libtwml/src/ops/sparse_normalization.docx new file mode 100644 index 000000000..fca487ab1 Binary files /dev/null and b/twml/libtwml/src/ops/sparse_normalization.docx differ diff --git a/twml/libtwml/src/ops/tensor_record.cpp b/twml/libtwml/src/ops/tensor_record.cpp deleted file mode 100644 index ad044e378..000000000 --- a/twml/libtwml/src/ops/tensor_record.cpp +++ /dev/null @@ -1,692 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -#include -using std::string; - -REGISTER_OP("GetStringTensorsFromDataRecord") -.Attr("feature_id: int") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("strings: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns string tensors from the data record. - -Attr - feature_id: The hashed id of the feature name. 
- -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - ids: A 1D int64 tensor representing the input index in a given batch. - strings: A 1D string tensor representing the decoded strings from the batch. -)doc"); - -REGISTER_OP("GetStringTensorsFromHashedDataRecord") -.Attr("feature_id: int") -.Input("hashed_data_record_handle: resource") -.Output("ids: int64") -.Output("strings: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns string tensors from the hashed data record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - ids: A 1D int64 tensor representing the input index in a given batch. - strings: A 1D string tensor representing the decoded strings from the batch. -)doc"); - -template -class GetStringTensorsOp : public OpKernel { - private: - int64 feature_id; - - public: - explicit GetStringTensorsOp(OpKernelConstruction *context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - } - - void Compute(OpKernelContext *context) override { - auto handle = getHandle(context, 0); - const int64 batch_size = static_cast(handle->records.size()); - const auto &records = handle->records; - - try { - int64 total_size = 0; - for (const auto &record : records) { - try { - const auto &tensor = record.getRawTensor(feature_id); - total_size += static_cast(tensor.getNumElements()); - } catch(const std::out_of_range &err) { - LOG(WARNING) << "Ignoring missing string tensor with key: " << feature_id << std::endl; - continue; - } - } - - twml::ThriftReader reader(nullptr); - TensorShape shape = {total_size}; - Tensor *strings_tensor = nullptr; - Tensor *ids_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids_tensor)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &strings_tensor)); - - auto strings_data = strings_tensor->flat().data(); - auto ids_data = ids_tensor->flat().data(); - - for (int64 i = 0; i < batch_size; i++) { - const auto &record = records[i]; - try { - const twml::RawTensor &tensor = record.getRawTensor(feature_id); - const uint8_t *buffer = static_cast(tensor.getData()); - const int64 num_strings = static_cast(tensor.getNumElements()); - reader.setBuffer(buffer); - - for (int64 j = 0; j < num_strings; j++) { - const uint8_t *curr_begin = nullptr; - const auto curr_length = reader.getRawBuffer(&curr_begin); - strings_data[j] = std::string(curr_begin, curr_begin + curr_length); - ids_data[j] = i; - } - ids_data += num_strings; - strings_data += num_strings; - } catch(const std::out_of_range &err) { - continue; - } - } - } catch(const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("GetStringTensorsFromDataRecord") - .Device(DEVICE_CPU), - GetStringTensorsOp); - -REGISTER_KERNEL_BUILDER( - Name("GetStringTensorsFromHashedDataRecord") - .Device(DEVICE_CPU), - GetStringTensorsOp); - -REGISTER_OP("GetTensorsFromDataRecord") -.Attr("assert_shape: bool") -.Attr("feature_id: int") -.Input("data_record_handle: resource") -.Output("output: string") -.Output("out_shape: int64") -.Output("out_type: string") -.Output("out_endian: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns tensors from the data 
record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - output: A 2D byte tensor representing the requested feature. - out_shape: A tensor containing [batch_size, thrift_shape]. - out_type: Output type returned as a string tensor of size 1. - out_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. -)doc"); - -REGISTER_OP("GetTensorsFromHashedDataRecord") -.Attr("assert_shape: bool") -.Attr("feature_id: int") -.Input("hashed_data_record_handle: resource") -.Output("output: string") -.Output("out_shape: int64") -.Output("out_type: string") -.Output("out_endian: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns decodes and tensors from the hashed data record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - output: A 2D byte tensor representing the requested feature. - out_shape: A tensor containing [batch_size, thrift_shape]. - out_type: Output type returned as a string tensor of size 1. - out_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. -)doc"); - -template -class GetTensorsOp : public OpKernel { - private: - bool assert_shape; - int64 feature_id; - - public: - explicit GetTensorsOp(OpKernelConstruction *context) - : OpKernel(context), assert_shape(true) { - OP_REQUIRES_OK(context, context->GetAttr("assert_shape", &assert_shape)); - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - } - - void Compute(OpKernelContext *context) override { - auto handle = getHandle(context, 0); - uint64 batch_size = handle->records.size(); - const auto &records = handle->records; - - try { - TensorShape raw_shape = {static_cast(batch_size)}; - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, raw_shape, &output_tensor)); - auto output_flat = output_tensor->flat(); - auto output_data = output_flat.data(); - - twml_type type = TWML_TYPE_UNKNOWN; - bool is_big_endian = false; - - std::vector shape(1, batch_size); - uint64 length = 0; - - for (auto record : records) { - const twml::RawTensor tensor = record.getRawTensor(feature_id); - const auto &curr_dims = tensor.getDims(); - const auto curr_type = tensor.getType(); - const bool curr_is_big_endian = tensor.is_big_endian(); - const uint64 curr_length = tensor.getRawLength(); - - // Create the output tensor based on first tensor - if (shape.size() == 1) { - // Push the shape of individual tensors into shape - shape.reserve(curr_dims.size() + 1); - shape.insert(shape.end(), curr_dims.begin(), curr_dims.end()); - type = curr_type; - is_big_endian = curr_is_big_endian; - length = curr_length; - - } else { - if (assert_shape) { - // Assert shape of all tensors is the same. - bool is_same_shape = std::equal(shape.begin() + 1, shape.end(), curr_dims.begin()); - - if (!is_same_shape || length != curr_length) { - throw std::runtime_error("TensorShape mismatch for feature_id: " - + std::to_string(feature_id)); - } - } - - // Assert type and endianness of all tensors is the same. 
- if (type != curr_type || is_big_endian != curr_is_big_endian) { - throw std::runtime_error("Tensor type mismatch for feature_id: " - + std::to_string(feature_id)); - } - } - - // Copy from datarecord to output - const uint8 *tensor_data = reinterpret_cast(tensor.getData()); - *output_data = std::string(tensor_data, tensor_data + curr_length); - - // Increment it for the next tensor in the batch. - output_data++; - } - - Tensor *shape_tensor = nullptr; - TensorShape shape_shape = {static_cast(shape.size())}; - OP_REQUIRES_OK(context, context->allocate_output(1, shape_shape, &shape_tensor)); - auto shape_flat = shape_tensor->flat(); - for (int i = 0; i < static_cast(shape.size()); i++) { - shape_flat(i) = shape[i]; - } - - Tensor* type_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(2, {}, &type_tensor)); - type_tensor->scalar()() = twml::getTypeName(type); - - Tensor* endian_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(3, {}, &endian_tensor)); - endian_tensor->scalar()() = is_big_endian; - } catch(const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("GetTensorsFromDataRecord") - .Device(DEVICE_CPU), - GetTensorsOp); - -REGISTER_KERNEL_BUILDER( - Name("GetTensorsFromHashedDataRecord") - .Device(DEVICE_CPU), - GetTensorsOp); - -REGISTER_OP("GetTensorsWithMissingMaskFromDataRecord") -.Attr("assert_shape: bool") -.Attr("feature_id: int") -.Attr("default_shape: list(int)") -.Attr("dtype_size: int") -.Input("data_record_handle: resource") -.Output("output: string") -.Output("out_type: string") -.Output("out_endian: uint8") -.Output("is_found: bool") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns tensors from the data record. - -Attr - assert_shape: Specifies if the shape needs to be same across the batch. - feature_id: The hashed id of the feature name. - default_shape: Expected shape of output tensor. - dtype_size: expected size of each element. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - output: A 2D byte tensor representing the requested feature. - out_type: A string tensor represnting the type. - out_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. - is_missing: A boolean tensor of length batch_size represnting if the tensor was found for an input. -)doc"); - -REGISTER_OP("GetTensorsWithMissingMaskFromHashedDataRecord") -.Attr("assert_shape: bool") -.Attr("feature_id: int") -.Attr("default_shape: list(int)") -.Attr("dtype_size: int") -.Input("hashed_data_record_handle: resource") -.Output("output: string") -.Output("out_type: string") -.Output("out_endian: uint8") -.Output("is_found: bool") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns tensors from the data record. - -Attr - assert_shape: Specifies if the shape needs to be same across the batch. - feature_id: The hashed id of the feature name. - default_shape: Expected shape of output tensor. - dtype_size: expected size of each element. - -Input - hashed_data_record_handle: Resource handle to HashedDataRecord. - -Outputs - output: A 2D byte tensor representing the requested feature. - out_type: A string tensor represnting the type. - out_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. 
- is_missing: A boolean tensor of length batch_size represnting if the tensor was found for an input. -)doc"); - -template -class GetTensorsWithMissingMaskOp : public OpKernel { - private: - bool assert_shape; - int64 feature_id; - int64 dtype_size; - std::vector shape; - - public: - explicit GetTensorsWithMissingMaskOp(OpKernelConstruction *context) - : OpKernel(context), assert_shape(true) { - OP_REQUIRES_OK(context, context->GetAttr("assert_shape", &assert_shape)); - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - OP_REQUIRES_OK(context, context->GetAttr("default_shape", &shape)); - OP_REQUIRES_OK(context, context->GetAttr("dtype_size", &dtype_size)); - } - - void Compute(OpKernelContext *context) override { - auto handle = getHandle(context, 0); - uint64 batch_size = handle->records.size(); - const auto &records = handle->records; - - try { - TensorShape raw_shape = {static_cast(batch_size)}; - Tensor* output_tensor = nullptr; - Tensor* is_found_tensor = nullptr; - - OP_REQUIRES_OK(context, context->allocate_output(0, raw_shape, &output_tensor)); - OP_REQUIRES_OK(context, context->allocate_output(3, raw_shape, &is_found_tensor)); - - auto output_flat = output_tensor->flat(); - auto output_data = output_flat.data(); - auto is_found_data = is_found_tensor->flat().data(); - - twml_type type = TWML_TYPE_UNKNOWN; - bool is_big_endian = false; - - uint64 length = std::accumulate(shape.begin(), shape.end(), dtype_size, std::multiplies()); - for (auto record : records) { - try { - const twml::RawTensor tensor = record.getRawTensor(feature_id); - const auto &curr_dims = tensor.getDims(); - const auto curr_type = tensor.getType(); - const bool curr_is_big_endian = tensor.is_big_endian(); - const uint64 curr_length = tensor.getRawLength(); - - if (type == TWML_TYPE_UNKNOWN) { - type = curr_type; - is_big_endian = curr_is_big_endian; - // FloatTensors are stored as a list of doubles. - // If the requested dtype_size is 4, update the length. - // NOTE: All the missing tensors before this have wrong length, this is fixed at the end. - if (type == TWML_TYPE_DOUBLE && is_big_endian && dtype_size == 4) { - length = length * 2; - } - } else { - // Assert type and endianness of all tensors is the same. - if (type != curr_type || is_big_endian != curr_is_big_endian) { - throw std::runtime_error("Tensor type mismatch for feature_id: " - + std::to_string(feature_id)); - } - } - - // Assert shape of all tensors is the same. - if (assert_shape && type != TWML_TYPE_UNKNOWN) { - // Assert shape of all tensors is the same. - bool is_same_shape = std::equal(shape.begin(), shape.end(), curr_dims.begin()); - - if (!is_same_shape || length != curr_length) { - throw std::runtime_error("TensorShape mismatch for feature_id: " - + std::to_string(feature_id)); - } - } - - // Copy from datarecord to output - const uint8 *tensor_data = reinterpret_cast(tensor.getData()); - *output_data = std::string(tensor_data, tensor_data + curr_length); - *is_found_data = true; - } catch(const std::out_of_range &err) { - *output_data = std::string(); - output_data->resize(length); - *is_found_data = false; - } - - // Increment it for the next tensor in the batch. - output_data++; - is_found_data++; - } - - // Reset pointers to the beginning - output_data = output_flat.data(); - is_found_data = is_found_tensor->flat().data(); - - // Resize any missing tensors before type (and hence true length) was known. 
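// Before the first present tensor is seen, `length` comes from the caller's
// default_shape and dtype_size; if the stored type then turns out to be
// big-endian DOUBLE while dtype_size is 4 (FloatTensors are encoded as 8-byte
// doubles), `length` is doubled above. Placeholders written for records that
// were missing before that point are therefore too short, and this pass grows
// them to the corrected length.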
- if (type == TWML_TYPE_DOUBLE) { - for (int64 i = 0; i < static_cast(records.size()); i++) { - if (!is_found_data[i]) { - output_data[i].resize(length); - } - } - } - - Tensor* type_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, {}, &type_tensor)); - type_tensor->scalar()() = twml::getTypeName(type); - - Tensor* endian_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(2, {}, &endian_tensor)); - endian_tensor->scalar()() = is_big_endian; - } catch(const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("GetTensorsWithMissingMaskFromDataRecord") - .Device(DEVICE_CPU), - GetTensorsWithMissingMaskOp); - -REGISTER_KERNEL_BUILDER( - Name("GetTensorsWithMissingMaskFromHashedDataRecord") - .Device(DEVICE_CPU), - GetTensorsWithMissingMaskOp); - -REGISTER_OP("GetSparseTensorsFromDataRecord") -.Attr("feature_id: int") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("indices: string") -.Output("values: string") -.Output("dense_shape: int64") -.Output("values_type: string") -.Output("valueendian: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns tensors from the data record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - ids: A 1D tensor representing which input in the batch the value belongs to. - indices: An string tensor containing indices of the sparse tensor as bytes. - values: An string tensor containing values of the sparse tensor as bytes. - dense_shape: A tensor containing [batch_size, thrift_shape]. - values_type: The data type of value tensor returned as a string tensor of size 1. - values_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. -)doc"); - -REGISTER_OP("GetSparseTensorsFromHashedDataRecord") -.Attr("feature_id: int") -.Input("hashed_data_record_handle: resource") -.Output("ids: int64") -.Output("indices: string") -.Output("values: string") -.Output("dense_shape: int64") -.Output("values_type: string") -.Output("values_endian: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns tensors from the data record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - ids: A 1D tensor representing which input in the batch the value belongs to. - indices: An string tensor containing indices of the sparse tensor as bytes. - values: An string tensor containing values of the sparse tensor as bytes. - dense_shape: A tensor containing [batch_size, thrift_shape]. - values_type: The data type of value tensor returned as a string tensor of size 1. - values_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. 
-)doc"); - -template -class GetSparseTensorsOp : public OpKernel { - private: - int64 feature_id; - - public: - explicit GetSparseTensorsOp(OpKernelConstruction *context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - } - - void Compute(OpKernelContext *context) override { - auto handle = getHandle(context, 0); - const int64 batch_size = static_cast(handle->records.size()); - const auto &records = handle->records; - - try { - twml_type type = TWML_TYPE_UNKNOWN; - bool is_big_endian = false; - - std::vector shape(1, batch_size); - - int64 total_length = 0; - std::vector lengths; - lengths.reserve(batch_size); - - int64 total_indices_length = 0; - std::vector indices_raw_lengths; - std::vector indices_data_ptrs; - indices_raw_lengths.reserve(batch_size); - indices_data_ptrs.reserve(batch_size); - - int64 total_values_length = 0; - std::vector values_raw_lengths; - std::vector values_data_ptrs; - values_raw_lengths.reserve(batch_size); - values_data_ptrs.reserve(batch_size); - - for (auto record : records) { - const twml::RawSparseTensor sparse_tensor = record.getRawSparseTensor(feature_id); - const twml::RawTensor indices = sparse_tensor.indices(); - const twml::RawTensor values = sparse_tensor.values(); - const auto &dense_shape = sparse_tensor.denseShape(); - const auto indices_type = indices.getType(); - const auto indices_is_big_endian = indices.is_big_endian(); - const auto values_type = values.getType(); - const bool values_is_big_endian = values.is_big_endian(); - - const uint64 indices_length = indices.getDims().back(); - const uint64 values_length = values.getDims().back(); - - auto indices_raw_length = indices.getRawLength(); - auto values_raw_length = values.getRawLength(); - - auto indices_data_ptr = reinterpret_cast(indices.getData()); - auto values_data_ptr = reinterpret_cast(values.getData()); - - indices_raw_lengths.push_back(indices_raw_length); - values_raw_lengths.push_back(values_raw_length); - - indices_data_ptrs.push_back(indices_data_ptr); - values_data_ptrs.push_back(values_data_ptr); - - total_indices_length += indices_raw_length; - total_values_length += values_raw_length; - - if (shape.size() == 1) { - shape.reserve(dense_shape.size() + 1); - shape.insert(shape.end(), dense_shape.begin(), dense_shape.end()); - type = values_type; - is_big_endian = values_is_big_endian; - } - - // Assert shape of all tensors is the same. - if (!std::equal(shape.begin() + 1, shape.end(), dense_shape.begin())) { - throw std::runtime_error("dense_shape of sparse tensors doesn't match for feature_id: " - + std::to_string(feature_id)); - } - // Assert type of all values tensor is the same. - if (type != values_type || is_big_endian != values_is_big_endian) { - throw std::runtime_error("The type of values do not match for feature_id: " - + std::to_string(feature_id)); - } - // Assert indices tensor is big endian and of type INT64. 
- if (indices_type != TWML_TYPE_INT64 || !indices_is_big_endian) { - throw std::runtime_error("Unexpected type for index tensor for feature_id: " - + std::to_string(feature_id)); - } - - if (indices_length != values_length) { - throw std::runtime_error("The length of values and indices does not match for : " - + std::to_string(feature_id)); - } - - lengths.push_back(indices_length); - total_length += indices_length; - } - - Tensor* ids_tensor = nullptr; - TensorShape ids_shape = {static_cast(total_length)}; - OP_REQUIRES_OK(context, context->allocate_output(0, ids_shape, &ids_tensor)); - auto ids_tensor_flat = ids_tensor->flat(); - auto ids_tensor_data = ids_tensor_flat.data(); - - TensorShape raw_shape = {static_cast(1)}; - - Tensor* indices_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, raw_shape, &indices_tensor)); - auto indices_tensor_flat = indices_tensor->flat(); - auto indices_tensor_string = indices_tensor_flat.data(); - indices_tensor_string->resize(total_indices_length); - auto indices_tensor_iter = indices_tensor_string->begin(); - - Tensor* values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(2, raw_shape, &values_tensor)); - auto values_tensor_flat = values_tensor->flat(); - auto values_tensor_string = values_tensor_flat.data(); - values_tensor_string->resize(total_values_length); - auto values_tensor_iter = values_tensor_string->begin(); - - for (int64 i = 0; i < batch_size; i++) { - // Fill in the data for id == i for all values in the current input. - std::fill(ids_tensor_data, ids_tensor_data + lengths[i], i); - ids_tensor_data += lengths[i]; - - indices_tensor_iter = std::copy(indices_data_ptrs[i], - indices_data_ptrs[i] + indices_raw_lengths[i], - indices_tensor_iter); - - values_tensor_iter = std::copy(values_data_ptrs[i], - values_data_ptrs[i] + values_raw_lengths[i], - values_tensor_iter); - } - - Tensor *shape_tensor = nullptr; - TensorShape shape_shape = {static_cast(shape.size())}; - OP_REQUIRES_OK(context, context->allocate_output(3, shape_shape, &shape_tensor)); - auto shape_flat = shape_tensor->flat(); - for (int i = 0; i < static_cast(shape.size()); i++) { - shape_flat(i) = shape[i]; - } - - Tensor* type_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(4, {}, &type_tensor)); - type_tensor->scalar()() = twml::getTypeName(type); - - Tensor* endian_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(5, {}, &endian_tensor)); - endian_tensor->scalar()() = is_big_endian; - } catch(const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("GetSparseTensorsFromDataRecord") - .Device(DEVICE_CPU), - GetSparseTensorsOp); - -REGISTER_KERNEL_BUILDER( - Name("GetSparseTensorsFromHashedDataRecord") - .Device(DEVICE_CPU), - GetSparseTensorsOp); diff --git a/twml/libtwml/src/ops/tensor_record.docx b/twml/libtwml/src/ops/tensor_record.docx new file mode 100644 index 000000000..708a8b7bc Binary files /dev/null and b/twml/libtwml/src/ops/tensor_record.docx differ diff --git a/twml/libtwml/src/ops/tensorflow_utils.cpp b/twml/libtwml/src/ops/tensorflow_utils.cpp deleted file mode 100644 index 95ebc7e4c..000000000 --- a/twml/libtwml/src/ops/tensorflow_utils.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include "tensorflow_utils.h" -#include -#include - -twml::Tensor TFTensor_to_twml_tensor(Tensor &input) { - int ndims = input.dims(); - std::vector dims(ndims); - std::vector strides(ndims); - for (int i = 0; i < ndims; 
i++) { - dims[i] = input.dim_size(i); - } - uint64_t stride = 1; - for (int i = ndims-1; i >= 0; i--) { - strides[i] = stride; - stride *= dims[i]; - } - - switch (input.dtype()) { - case DT_INT8: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_INT8); - case DT_UINT8: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_UINT8); - case DT_INT32: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_INT32); - case DT_INT64: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_INT64); - case DT_FLOAT: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_FLOAT); - case DT_DOUBLE: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_DOUBLE); - case DT_BOOL: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_BOOL); - case DT_STRING: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_STRING); - default: - throw twml::Error(TWML_ERR_TYPE, "Unknown tensor data type."); - break; - } -} - -const twml::Tensor TFTensor_to_twml_tensor(const Tensor &input) { - // TODO: define some type of constant tensor, which should be used for inputs to force not - // changing - return TFTensor_to_twml_tensor(const_cast(input)); -} - -twml::RawTensor TFTensor_to_twml_raw_tensor(Tensor &input) { - int ndims = input.dims(); - std::vector dims(ndims); - std::vector strides(ndims); - for (int i = 0; i < ndims; i++) { - dims[i] = input.dim_size(i); - } - uint64_t stride = 1; - for (int i = ndims-1; i >= 0; i--) { - strides[i] = stride; - stride *= dims[i]; - } - - switch (input.dtype()) { - case DT_INT8: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_INT8, false, input.flat().size()); - case DT_UINT8: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_UINT8, false, input.flat().size()); - case DT_INT32: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_INT32, false, input.flat().size()); - case DT_INT64: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_INT64, false, input.flat().size()); - case DT_FLOAT: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_FLOAT, false, input.flat().size()); - case DT_DOUBLE: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_DOUBLE, false, input.flat().size()); - case DT_BOOL: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_BOOL, false, input.flat().size()); - case DT_STRING: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_STRING, false, input.flat().size()); - default: - throw twml::Error(TWML_ERR_TYPE, "Unknown tensor data type."); - break; - } -} - -const twml::RawTensor TFTensor_to_twml_raw_tensor(const Tensor &input) { - // TODO: define some type of constant tensor, which should be used for inputs to force not - // changing - return TFTensor_to_twml_raw_tensor(const_cast(input)); -} diff --git a/twml/libtwml/src/ops/tensorflow_utils.docx b/twml/libtwml/src/ops/tensorflow_utils.docx new file mode 100644 index 000000000..9a21a5893 Binary files /dev/null and b/twml/libtwml/src/ops/tensorflow_utils.docx differ diff --git a/twml/libtwml/src/ops/tensorflow_utils.h b/twml/libtwml/src/ops/tensorflow_utils.h deleted file mode 100644 index 4940f680d..000000000 --- a/twml/libtwml/src/ops/tensorflow_utils.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" 
-#include - -using namespace tensorflow; -twml::Tensor TFTensor_to_twml_tensor(Tensor &input); -twml::RawTensor TFTensor_to_twml_raw_tensor(Tensor &input); -const twml::Tensor TFTensor_to_twml_tensor(const Tensor &input); -const twml::RawTensor TFTensor_to_twml_raw_tensor(const Tensor &input); - diff --git a/twml/libtwml/src/ops/var_length_reader.cpp b/twml/libtwml/src/ops/var_length_reader.cpp deleted file mode 100644 index 62b5fc2a1..000000000 --- a/twml/libtwml/src/ops/var_length_reader.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -using namespace tensorflow; - -REGISTER_OP("VarLengthReader") -.Input("input1: int32") -.Output("output: int32") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - ::tensorflow::shape_inference::ShapeHandle input; - // check that input has only 1 dimension. - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input)); - // there's no inference on output shape. - return Status::OK(); - }); - - -class VarLengthReaderOp : public OpKernel { - public: - explicit VarLengthReaderOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - - // get the first element in the input tensor, use it as output shape. - int32 len = input(0); - TensorShape output_shape = {1, len}; - - // Create an output tensor, the size is determined by the content of input. - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor)); - - auto output_flat = output_tensor->flat(); - - // Fill output with ones. 
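// The output shape {1, len} is taken from the runtime value of the first input
// element, which is why the shape function above only checks that the input is
// rank-1 and leaves the output shape undefined at graph-construction time.
// For example, an input whose first element is 3 yields [[1, 1, 1]].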
- const int N = output_flat.size(); - for (int i = 0; i < N; i++) { - output_flat(i) = 1; - } - } -}; - -REGISTER_KERNEL_BUILDER(Name("VarLengthReader").Device(DEVICE_CPU), VarLengthReaderOp); diff --git a/twml/libtwml/src/ops/var_length_reader.docx b/twml/libtwml/src/ops/var_length_reader.docx new file mode 100644 index 000000000..5775c0039 Binary files /dev/null and b/twml/libtwml/src/ops/var_length_reader.docx differ diff --git a/twml/setup.cfg b/twml/setup.cfg deleted file mode 100644 index d887f33c2..000000000 --- a/twml/setup.cfg +++ /dev/null @@ -1,8 +0,0 @@ -[bdist_wheel] -universal=1 - -[build] -build-lib=build_dir - -[bdist] -bdist-base=build_dir diff --git a/twml/setup.docx b/twml/setup.docx new file mode 100644 index 000000000..532712b25 Binary files /dev/null and b/twml/setup.docx differ diff --git a/twml/setup.py b/twml/setup.py deleted file mode 100644 index 7e4003bae..000000000 --- a/twml/setup.py +++ /dev/null @@ -1,29 +0,0 @@ -import os - -from setuptools import find_packages, setup - - -THIS_DIR = os.path.dirname(os.path.realpath(__file__)) -TWML_TEST_DATA_DIR = os.path.join(THIS_DIR, 'twml/tests/data') - -data_files = [] -for parent, children, files in os.walk(TWML_TEST_DATA_DIR): - data_files += [os.path.join(parent, f) for f in files] - -setup( - name='twml', - version='2.0', - description="Tensorflow wrapper for twml", - packages=find_packages(exclude=["build"]), - install_requires=[ - 'thriftpy2', - 'numpy', - 'pyyaml', - 'future', - 'scikit-learn', - 'scipy' - ], - package_data={ - 'twml': data_files, - }, -) diff --git a/twml/twml/__init__.docx b/twml/twml/__init__.docx new file mode 100644 index 000000000..a85035854 Binary files /dev/null and b/twml/twml/__init__.docx differ diff --git a/twml/twml/__init__.py b/twml/twml/__init__.py deleted file mode 100644 index 0c96df68b..000000000 --- a/twml/twml/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -""" Importing the pyton op wrappers """ - -import os - -# Import from twitter.deepbird -from twitter.deepbird.logging.log_level import set_logging_level # noqa: F401 -from twitter.deepbird.sparse import SparseTensor # noqa: F401 -from twitter.deepbird.sparse import sparse_dense_matmul # noqa: F401 - -from .util import dynamic_partition, feature_id, limit_bits, limit_sparse_tensor_size # noqa: F401 -from .util import write_file, fixed_length_tensor, setup_tf_logging_formatter # noqa: F401 -from .array import Array # noqa: F401 - -# Module to parse feature patterns and match them from data_spec.json -from .feature_config import FeatureConfig, FeatureConfigBuilder # noqa: F401 - -# Data record streaming, reading, writing, and parsing. -from .dataset import * # noqa: T400 -from .readers import * # noqa: T400 -from .block_format_writer import * # noqa: T400 - -# Graph output functions -from .export_output_fns import * # noqa: T400 - -# Input parsers -from .parsers import * # noqa: T400 - -# Input functions -from .input_fns import * # noqa: T400 - -# Feature filter functions -from .filters import * # noqa: T400 - -# Custom argparser for Trainer -from .argument_parser import * # noqa: T400 - -from . import constants # noqa: F401 -from . import errors # noqa: F401 -from . import layers # noqa: F401 -from . import lookup # noqa: F401 -from . import readers # noqa: F401 -from . import summary # noqa: F401 -from . import tensorboard # noqa: F401 - -import tensorflow.compat.v1 as tf # noqa: F402 -tf.disable_eager_execution() - -# TODO: Figure out a better way to deal with this. 
-if 'OMP_NUM_THREADS' not in os.environ and 'MKL_NUM_THREADS' not in os.environ: - os.environ["OMP_NUM_THREADS"] = '1' - -# Import all custom C++ ops -from libtwml import add1, partition_sparse_tensor, CLIB # noqa: F401 - -# Configure logging levels to info for various frameworks -set_logging_level('INFO') - -from . import contrib # noqa: F401 -from . import hooks # noqa: F401 -from . import trainers # noqa: F401 -from . import metrics # noqa: F401 diff --git a/twml/twml/argument_parser.docx b/twml/twml/argument_parser.docx new file mode 100644 index 000000000..8dac82c82 Binary files /dev/null and b/twml/twml/argument_parser.docx differ diff --git a/twml/twml/argument_parser.py b/twml/twml/argument_parser.py deleted file mode 100644 index c771eebdf..000000000 --- a/twml/twml/argument_parser.py +++ /dev/null @@ -1,561 +0,0 @@ -# pylint: disable=protected-access, arguments-differ -""" -Command-line argument parsing for the Trainer. -""" -import argparse -from argparse import ArgumentError -from operator import attrgetter -import tempfile - -import twml -import tensorflow.compat.v1 as tf - - -SERIAL = "serial" -TREE = "tree" -LOG_LEVELS = { - "debug": tf.logging.DEBUG, - "info": tf.logging.INFO, - "warn": tf.logging.WARN, - "error": tf.logging.ERROR} - - -class SortingHelpFormatter(argparse.HelpFormatter): - """ - Used to sort args alphabetically in the help message. - """ - - def add_arguments(self, actions): - actions = sorted(actions, key=attrgetter('option_strings')) - super(SortingHelpFormatter, self).add_arguments(actions) - - -def _set_log_level(level=None): - """Sets the tensorflow log level to the input level.""" - if level is None: - return None - level = level.lower() - if level not in LOG_LEVELS.keys(): - raise ValueError(f"Unexpected log level {level} was given but expected one of {LOG_LEVELS.keys()}.") - tf.logging.set_verbosity(LOG_LEVELS[level]) - tf.logging.info(f"Setting tensorflow logging level to {level} or {LOG_LEVELS[level]}") - return level - - -def get_trainer_parser(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Args: - learning_rate_decay: - Defaults to False. When True, parses learning rate decay arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) - - parser.add_argument( - "--save_dir", type=str, default=tempfile.mkdtemp(), - help="Path to the training result directory." - "supports local filesystem path and hdfs://default/ which requires " - "setting HDFS configuration via env variable HADOOP_CONF_DIR ") - parser.add_argument( - "--export_dir", type=str, default=None, - help="Path to the directory to export a SavedModel for prediction servers.") - parser.add_argument( - "--log_aggregation_app_id", type=str, default=None, - help="specify app_id for log aggregation. 
disabled by default.") - parser.add_argument( - "--train.batch_size", "--train_batch_size", type=int, default=32, - dest='train_batch_size', - help="number of samples per training batch") - parser.add_argument( - "--eval.batch_size", "--eval_batch_size", type=int, default=32, - dest='eval_batch_size', - help="number of samples per cross-validation batch. Defaults to train_batch_size") - parser.add_argument( - "--train.learning_rate", "--learning_rate", type=float, default=0.002, - dest='learning_rate', - help="learning rate. Scales the gradient update.") - parser.add_argument( - "--train.steps", "--train_steps", type=int, default=-1, - dest='train_steps', - help="number of training batches before running evaluation." - "Defaults to -1 (runs through entire dataset). " - "Only used for Trainer.[train,learn]. " - "For Trainer.train_and_evaluate, use train.max_steps instead. ") - parser.add_argument( - "--eval.steps", "--eval_steps", type=int, default=-1, - dest="eval_steps", - help="number of steps per evaluation. Each batch is a step." - "Defaults to -1 (runs through entire dataset). ") - parser.add_argument( - "--eval.period", "--eval_period", type=int, default=600, - dest="eval_period", - help="Trainer.train_and_evaluate waits for this long after each evaluation. " - "Defaults to 600 seconds (evaluate every ten minutes). " - "Note that anything lower than 10*60seconds is probably a bad idea because TF saves " - "checkpoints every 10mins by default. eval.delay is time to wait before doing first eval. " - "eval.period is time between successive evals.") - parser.add_argument( - "--eval.delay", "--eval_delay", type=int, default=120, - dest="eval_delay", - help="Trainer.train_and_evaluate waits for this long before performing the first evaluation" - "Defaults to 120 seconds (evaluate after first 2 minutes of training). " - "eval.delay is time to wait before doing first eval. " - "eval.period is time between successive evals.") - parser.add_argument( - "--train.max_steps", "--train_max_steps", type=int, default=None, - dest="train_max_steps", - help="Stop training after this many global steps. Each training batch is its own step." - "If set to None, step after one train()/evaluate() call. Useful when train.steps=-1." - "If set to a non-positive value, loop forever. Usually useful with early stopping.") - parser.add_argument( - "--train.log_metrics", dest="train_log_metrics", action="store_true", default=False, - help="Set this to true to see metrics during training. " - "WARNING: metrics during training does not represent model performance. " - "WARNING: use for debugging only as this slows down training.") - parser.add_argument( - "--train.early_stop_patience", "--early_stop_patience", type=int, default=-1, - dest="early_stop_patience", - help="max number of evaluations (epochs) to wait for an improvement in the early_stop_metric." - "Defaults to -1 (no early-stopping)." - "NOTE: This can not be enabled when --distributed is also set.") - parser.add_argument( - "--train.early_stop_tolerance", "--early_stop_tolerance", type=float, default=0, - dest="early_stop_tolerance", - help="a non-negative tolerance for comparing early_stop_metric." - "e.g. when maximizing the condition is current_metric > best_metric + tolerance." - "Defaults to 0.") - parser.add_argument( - "--train.dataset_shards", "--train_dataset_shards", - dest="train_dataset_shards", - type=int, default=None, - help="An int value that indicates the number of partitions (shards) for the dataset. 
This is" - " useful for codistillation and other techniques that require each worker to train on disjoint" - " partitions of the dataset.") - parser.add_argument( - "--train.dataset_shard_index", "--train_dataset_shard_index", - dest="train_dataset_shard_index", - type=int, default=None, - help="An int value (starting at zero) that indicates which partition (shard) of the dataset" - " to use if --train.dataset_shards is set.") - parser.add_argument( - "--continue_from_checkpoint", dest="continue_from_checkpoint", action="store_true", - help="DEPRECATED. This option is currently a no-op." - " Continuing from the provided checkpoint is now the default." - " Use --overwrite_save_dir if you would like to override it instead" - " and restart training from scratch.") - parser.add_argument( - "--overwrite_save_dir", dest="overwrite_save_dir", action="store_true", - help="Delete the contents of the current save_dir if it exists") - parser.add_argument( - "--data_threads", "--num_threads", type=int, default=2, - dest="num_threads", - help="Number of threads to use for loading the dataset. " - "num_threads is deprecated and to be removed in future versions. Use data_threads.") - parser.add_argument( - "--max_duration", "--max_duration", type=float, default=None, - dest="max_duration", - help="Maximum duration (in secs) that training/validation will be allowed to run for before being automatically terminated.") - parser.add_argument( - "--num_workers", type=int, default=None, - help="Number of workers to use when training in hogwild manner on a single node.") - parser.add_argument( - "--distributed", dest="distributed", action="store_true", - help="Pass this flag to use train_and_evaluate to train in a distributed fashion" - "NOTE: You can not use early stopping when --distributed is enabled" - ) - parser.add_argument( - "--distributed_training_cleanup", - dest="distributed_training_cleanup", - action="store_true", - help="Set if using distributed training on GKE to stop TwitterSetDeployment" - "from continuing training upon restarts (will be deprecated once we migrate off" - "TwitterSetDeployment for distributed training on GKE)." - ) - parser.add_argument( - "--disable_auto_ps_shutdown", default=False, action="store_true", - help="Disable the functionality of automatically shutting down parameter server after " - "distributed training complete (either succeed or failed)." - ) - parser.add_argument( - "--disable_tensorboard", default=False, action="store_true", - help="Do not start the TensorBoard server." - ) - parser.add_argument( - "--tensorboard_port", type=int, default=None, - help="Port for tensorboard to run on. Ignored if --disable_tensorboard is set.") - parser.add_argument( - "--health_port", type=int, default=None, - help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." - "Not user-facing as it is set automatically by the twml_cli." - ) - parser.add_argument( - "--stats_port", type=int, default=None, - help="Port to listen on for stats endpoints" - ) - parser.add_argument( - "--experiment_tracking_path", - dest="experiment_tracking_path", - type=str, default=None, - help="The tracking path of this experiment. Format: \ - user_name:project_name:experiment_name:run_name. The path is used to track and display \ - a record of this experiment on ML Dashboard. Note: this embedded experiment tracking is \ - disabled when the deprecated Model Repo TrackRun is used in your model config. 
") - parser.add_argument( - "--disable_experiment_tracking", - dest="disable_experiment_tracking", - action="store_true", - help="Whether experiment tracking should be disabled.") - parser.add_argument( - "--config.save_checkpoints_secs", "--save_checkpoints_secs", type=int, default=600, - dest='save_checkpoints_secs', - help="Configures the tf.estimator.RunConfig.save_checkpoints_secs attribute. " - "Specifies how often checkpoints are saved in seconds. Defaults to 10*60 seconds.") - parser.add_argument( - "--config.keep_checkpoint_max", "--keep_checkpoint_max", type=int, default=20, - dest='keep_checkpoint_max', - help="Configures the tf.estimator.RunConfig.keep_checkpoint_max attribute. " - "Specifies how many checkpoints to keep. Defaults to 20.") - parser.add_argument( - "--config.tf_random_seed", "--tf_random_seed", type=int, default=None, - dest='tf_random_seed', - help="Configures the tf.estimator.RunConfig.tf_random_seed attribute. " - "Specifies the seed to use. Defaults to None.") - parser.add_argument( - "--optimizer", type=str, default='SGD', - help="Optimizer to use: SGD (Default), Adagrad, Adam, Ftrl, Momentum, RMSProp, LazyAdam, DGC.") - parser.add_argument( - "--gradient_noise_scale", type=float, default=None, - help="adds 0-mean normal noise scaled by this value. Defaults to None.") - parser.add_argument( - "--clip_gradients", type=float, default=None, - help="If specified, a global clipping is applied to prevent " - "the norm of the gradient to exceed this value. Defaults to None.") - parser.add_argument( - "--dgc.density", "--dgc_density", type=float, default=0.1, - dest="dgc_density", - help="Specifies gradient density level when using deep gradient compression optimizer." - "E.g., default value being 0.1 means that only top 10%% most significant rows " - "(based on absolute value sums) are kept." - ) - parser.add_argument( - "--dgc.density_decay", "--dgc_density_decay", type=bool, default=True, - dest="dgc_density_decay", - help="Specifies whether to (exponentially) decay the gradient density level when" - " doing gradient compression. If set 'False', the 'density_decay_steps', " - "'density_decay_rate' and 'min_density' arguments will be ignored." - ) - parser.add_argument( - "--dgc.density_decay_steps", "--dgc_density_decay_steps", type=int, default=10000, - dest="dgc_density_decay_steps", - help="Specifies the step interval to perform density decay." - ) - parser.add_argument( - "--dgc.density_decay_rate", "--dgc_density_decay_rate", type=float, default=0.5, - dest="dgc_density_decay_rate", - help="Specifies the decay rate when perfoming density decay." - ) - parser.add_argument( - "--dgc.min_density", "--dgc_min_density", type=float, default=0.1, - dest="dgc_min_density", - help="Specifies the minimum density level when perfoming density decay." - ) - parser.add_argument( - "--dgc.accumulation", "--dgc_accumulation", type=bool, default=False, - dest="dgc_accumulation", - help="Specifies whether to accumulate small gradients when using deep gradient compression " - "optimizer." - ) - parser.add_argument( - "--show_optimizer_summaries", dest="show_optimizer_summaries", action="store_true", - help="When specified, displays gradients and learning rate in tensorboard." - "Turning it on has 10-20%% performance hit. Enable for debugging only") - - parser.add_argument( - "--num_mkl_threads", dest="num_mkl_threads", default=1, type=int, - help="Specifies how many threads to use for MKL" - "inter_op_ parallelism_threds is set to TWML_NUM_CPUS / num_mkl_threads." 
- "intra_op_parallelism_threads is set to num_mkl_threads.") - - parser.add_argument("--verbosity", type=_set_log_level, choices=LOG_LEVELS.keys(), default=None, - help="Sets log level to a given verbosity.") - - parser.add_argument( - "--feature_importance.algorithm", dest="feature_importance_algorithm", - type=str, default=TREE, choices=[SERIAL, TREE], - help=""" - There are two algorithms that the module supports, `serial` and `tree`. - The `serial` algorithm computes feature importances for each feature, and - the `tree` algorithm groups features by feature name prefix, computes feature - importances for groups of features, and then only 'zooms-in' on a group when the - importance is greater than the `--feature_importance.sensitivity` value. The `tree` algorithm - will usually run faster, but for relatively unimportant features it will only compute an - upper bound rather than an exact importance value. We suggest that users generally stick - to the `tree` algorithm, unless if they have a very small number of features or - near-random model performance. - """) - - parser.add_argument( - "--feature_importance.sensitivity", dest="feature_importance_sensitivity", type=float, default=0.03, - help=""" - The maximum amount that permuting a feature group can cause the model performance (determined - by `feature_importance.metric`) to drop before the algorithm decides to not expand the feature - group. This is only used for the `tree` algorithm. - """) - - parser.add_argument( - "--feature_importance.dont_build_tree", dest="dont_build_tree", action="store_true", default=False, - help=""" - If True, don't build the feature trie for the tree algorithm and only use the extra_groups - """) - - parser.add_argument( - "--feature_importance.split_feature_group_on_period", dest="split_feature_group_on_period", action="store_true", default=False, - help="If true, split feature groups by the period rather than the optimal prefix. Only used for the TREE algorithm") - - parser.add_argument( - "--feature_importance.example_count", dest="feature_importance_example_count", type=int, default=10000, - help=""" - The number of examples used to compute feature importance. - Larger values yield more reliable results, but also take longer to compute. - These records are loaded into memory. This number is agnostic to batch size. - """) - - parser.add_argument( - "--feature_importance.data_dir", dest="feature_importance_data_dir", type=str, default=None, - help="Path to the dataset used to compute feature importance." - "supports local filesystem path and hdfs://default/ which requires " - "setting HDFS configuration via env variable HADOOP_CONF_DIR " - "Defaults to eval_data_dir") - - parser.add_argument( - "--feature_importance.metric", dest="feature_importance_metric", type=str, default="roc_auc", - help="The metric used to determine when to stop expanding the feature importance tree. This is only used for the `tree` algorithm.") - - parser.add_argument( - "--feature_importance.is_metric_larger_the_better", dest="feature_importance_is_metric_larger_the_better", action="store_true", default=False, - help="If true, interpret `--feature_importance.metric` to be a metric where larger values are better (e.g. ROC_AUC)") - - parser.add_argument( - "--feature_importance.is_metric_smaller_the_better", dest="feature_importance_is_metric_smaller_the_better", action="store_true", default=False, - help="If true, interpret `--feature_importance.metric` to be a metric where smaller values are better (e.g. 
LOSS)") - - subparsers = parser.add_subparsers(help='Learning Rate Decay Functions. Can only pass 1.' - 'Should be specified after all the optional arguments' - 'and followed by its specific args' - 'e.g. --learning_rate 0.01 inverse_learning_rate_decay_fn' - ' --decay_rate 0.0004 --min_learning_rate 0.001', - dest='learning_rate_decay') - - # Create the parser for the "exponential_learning_rate_decay_fn" - parser_exponential = subparsers.add_parser('exponential_learning_rate_decay', - help='Exponential learning rate decay. ' - 'Exponential decay implements:' - 'decayed_learning_rate = learning_rate * ' - 'exponential_decay_rate ^ ' - '(global_step / decay_steps') - parser_exponential.add_argument( - "--decay_steps", type=float, default=None, - help="Required for 'exponential' learning_rate_decay.") - parser_exponential.add_argument( - "--exponential_decay_rate", type=float, default=None, - help="Required for 'exponential' learning_rate_decay. Must be positive. ") - - # Create the parser for the "polynomial_learning_rate_decay_fn" - parser_polynomial = subparsers.add_parser('polynomial_learning_rate_decay', - help='Polynomial learning rate decay. ' - 'Polynomial decay implements: ' - 'global_step = min(global_step, decay_steps)' - 'decayed_learning_rate = ' - '(learning_rate - end_learning_rate) * ' - '(1 - global_step / decay_steps) ^ ' - '(polynomial_power) + end_learning_rate' - 'So for linear decay you can use a ' - 'polynomial_power=1 (the default)') - parser_polynomial.add_argument( - "--end_learning_rate", type=float, default=0.0001, - help="Required for 'polynomial' learning_rate_decay (ignored otherwise).") - parser_polynomial.add_argument( - "--polynomial_power", type=float, default=0.0001, - help="Required for 'polynomial' learning_rate_decay." - "The power of the polynomial. Defaults to linear, 1.0.") - parser_polynomial.add_argument( - "--decay_steps", type=float, default=None, - help="Required for 'polynomial' learning_rate_decay. ") - - # Create the parser for the "piecewise_constant_learning_rate_decay_fn" - parser_piecewise_constant = subparsers.add_parser('piecewise_constant_learning_rate_decay', - help='Piecewise Constant ' - 'learning rate decay. ' - 'For piecewise_constant, ' - 'consider this example: ' - 'We want to use a learning rate ' - 'that is 1.0 for' - 'the first 100000 steps,' - '0.5 for steps 100001 to 110000, ' - 'and 0.1 for any additional steps. ' - 'To do so, specify ' - '--piecewise_constant_boundaries=100000,110000' - '--piecewise_constant_values=1.0,0.5,0.1') - parser_piecewise_constant.add_argument( - "--piecewise_constant_values", - action=parse_comma_separated_list(element_type=float), - default=None, - help="Required for 'piecewise_constant_values' learning_rate_decay. " - "A list of comma seperated floats or ints that specifies the values " - "for the intervals defined by boundaries. It should have one more " - "element than boundaries.") - parser_piecewise_constant.add_argument( - "--piecewise_constant_boundaries", - action=parse_comma_separated_list(element_type=int), - default=None, - help="Required for 'piecewise_constant_values' learning_rate_decay. " - "A list of comma seperated integers, with strictly increasing entries.") - - # Create the parser for the "inverse_learning_rate_decay_fn" - parser_inverse = subparsers.add_parser('inverse_learning_rate_decay', - help='Inverse Leaning rate decay. 
 - - # Create the parser for the "inverse_learning_rate_decay_fn" - parser_inverse = subparsers.add_parser('inverse_learning_rate_decay', - help='Inverse Learning rate decay. ' - 'Inverse implements: ' - 'decayed_lr = max(lr / (1 + decay_rate * ' - 'floor(global_step / decay_step)), ' - 'min_learning_rate). ' - 'When decay_step=1 this mimics the behaviour ' - 'of the default learning rate decay ' - 'of DeepBird v1.') - - parser_inverse.add_argument( - "--decay_rate", type=float, default=None, - help="Required for 'inverse' learning_rate_decay. Rate at which we decay the learning rate.") - parser_inverse.add_argument( - "--min_learning_rate", type=float, default=None, - help="Required for 'inverse' learning_rate_decay. Minimum possible learning_rate.") - parser_inverse.add_argument( - "--decay_steps", type=float, default=1, - help="Required for 'inverse' learning_rate_decay.") - - # Create the parser for the "cosine_learning_rate_decay_fn" - parser_cosine = subparsers.add_parser('cosine_learning_rate_decay', - help='Cosine Learning rate decay. ' - 'Cosine implements: ' - 'decayed_lr = 0.5 * (1 + cos(pi * ' - 'global_step / decay_steps)) * lr' - ) - - parser_cosine.add_argument( - "--alpha", type=float, default=0, - help="A scalar float32 or float64 Tensor or a Python number. " - "Minimum learning rate value as a fraction of learning_rate.") - parser_cosine.add_argument( - "--decay_steps", type=float, - help="Required for 'cosine' learning_rate_decay.") - - # Create the parser for the "cosine_restart_learning_rate_decay_fn" - parser_cosine_restart = subparsers.add_parser('cosine_restarts_learning_rate_decay', - help='Applies cosine decay with restarts ' - 'to the learning rate. ' - 'See [Loshchilov & Hutter, ICLR2016], ' - 'SGDR: Stochastic ' - 'Gradient Descent with Warm Restarts. ' - 'https://arxiv.org/abs/1608.03983' - ) - parser_cosine_restart.add_argument( - "--first_decay_steps", type=float, - help="Required for 'cosine_restarts' learning_rate_decay.") - parser_cosine_restart.add_argument( - "--alpha", type=float, default=0, - help="A scalar float32 or float64 Tensor or a Python number. " - "Minimum learning rate value as a fraction of learning_rate.") - parser_cosine_restart.add_argument( - "--t_mul", type=float, default=2, - help="A scalar float32 or float64 Tensor or a Python number. " - "Used to derive the number of iterations in the i-th period.") - parser_cosine_restart.add_argument( - "--m_mul", type=float, default=1, - help="A scalar float32 or float64 Tensor or a Python number. " - "Used to derive the initial learning rate of the i-th period.") - - # Create dummy parser for None, which is the default. 
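For reference, the inverse, exponential and cosine schedules documented in the help strings above reduce to a few lines of arithmetic. A hedged, plain-Python sketch of the formulas as written (illustrative only; the trainer itself uses the corresponding TensorFlow ops):

import math

def inverse_decay(lr, global_step, decay_rate, decay_steps=1, min_learning_rate=0.0):
    # decayed_lr = max(lr / (1 + decay_rate * floor(global_step / decay_steps)), min_learning_rate)
    return max(lr / (1.0 + decay_rate * (global_step // decay_steps)), min_learning_rate)

def exponential_decay(lr, global_step, exponential_decay_rate, decay_steps):
    # decayed_learning_rate = learning_rate * exponential_decay_rate ^ (global_step / decay_steps)
    return lr * exponential_decay_rate ** (global_step / decay_steps)

def cosine_decay(lr, global_step, decay_steps, alpha=0.0):
    # decayed_lr = 0.5 * (1 + cos(pi * global_step / decay_steps)) * lr,
    # with alpha acting as a floor expressed as a fraction of lr.
    cosine = 0.5 * (1.0 + math.cos(math.pi * min(global_step, decay_steps) / decay_steps))
    return lr * ((1.0 - alpha) * cosine + alpha)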
- parser_default = subparsers.add_parser( - 'no_learning_rate_decay', - help='No learning rate decay') # noqa: F841 - - parser.set_default_subparser('no_learning_rate_decay') - - return parser - - -class DefaultSubcommandArgParse(argparse.ArgumentParser): - """ - Subclass of argparse.ArgumentParser that sets default parser - """ - _DEFAULT_SUBPARSER = None - - def set_default_subparser(self, name): - """ - sets the default subparser - """ - self._DEFAULT_SUBPARSER = name - - def _parse_known_args(self, arg_strings, *args, **kwargs): - """ - Overwrites _parse_known_args - """ - in_args = set(arg_strings) - d_sp = self._DEFAULT_SUBPARSER - if d_sp is not None and not {'-h', '--help'}.intersection(in_args): - for x_val in self._subparsers._actions: - subparser_found = ( - isinstance(x_val, argparse._SubParsersAction) and - in_args.intersection(x_val._name_parser_map.keys()) - ) - if subparser_found: - break - else: - # insert default in first position, this implies no - # global options without a sub_parsers specified - arg_strings = arg_strings + [d_sp] - return super(DefaultSubcommandArgParse, self)._parse_known_args( - arg_strings, *args, **kwargs - ) - - def _check_value(self, action, value): - try: - super(DefaultSubcommandArgParse, self)._check_value( - action, value - ) - except ArgumentError as error: - error.message += ("\nERROR: Deepbird is trying to interpret \"{}\" as a value of {}. If this is not what you expected, " - "then most likely one of the following two things are happening: Either one of your cli arguments are not recognized, " - "probably {} or whichever argument you are passing {} as a value to OR you are passing in an argument after " - "the `learning_rate_decay` argument.\n").format(value, action.dest, value, value) - raise error - - -def parse_comma_separated_list(element_type=str): - """ - Generates an argparse.Action that converts a string representing a comma separated list to a - list and converts each element to a specified type. - """ - - # pylint: disable-msg=too-few-public-methods - class _ParseCommaSeparatedList(argparse.Action): - """ - Converts a string representing a comma separated list to a list and converts each element to a - specified type. - """ - - def __call__(self, parser, namespace, values, option_string=None): - if values is not None: - values = [element_type(v) for v in values.split(',')] - setattr(namespace, self.dest, values) - - return _ParseCommaSeparatedList diff --git a/twml/twml/array.docx b/twml/twml/array.docx new file mode 100644 index 000000000..00dd68b63 Binary files /dev/null and b/twml/twml/array.docx differ diff --git a/twml/twml/array.py b/twml/twml/array.py deleted file mode 100644 index a8524a06d..000000000 --- a/twml/twml/array.py +++ /dev/null @@ -1,101 +0,0 @@ -"""Module containing wrapper class to allow numpy arrays to work with twml functions""" - -import ctypes as ct - -from absl import logging -from libtwml import CLIB -import numpy as np - - -_NP_TO_TWML_TYPE = { - 'float32': ct.c_int(1), - 'float64': ct.c_int(2), - 'int32': ct.c_int(3), - 'int64': ct.c_int(4), - 'int8': ct.c_int(5), - 'uint8': ct.c_int(6), -} - - -class Array(object): - """ - Wrapper class to allow numpy arrays to work with twml functions. - """ - - def __init__(self, array): - """ - Wraps numpy array and creates a handle that can be passed to C functions from libtwml. 
- - array: Numpy array - """ - if not isinstance(array, np.ndarray): - raise TypeError("Input must be a numpy array") - - try: - ttype = _NP_TO_TWML_TYPE[array.dtype.name] - except KeyError as err: - logging.error("Unsupported numpy type") - raise err - - handle = ct.c_void_p(0) - ndim = ct.c_int(array.ndim) - dims = array.ctypes.get_shape() - isize = array.dtype.itemsize - - strides_t = ct.c_size_t * array.ndim - strides = strides_t(*[n // isize for n in array.strides]) - - err = CLIB.twml_tensor_create(ct.pointer(handle), - array.ctypes.get_as_parameter(), - ndim, dims, strides, ttype) - - if err != 1000: - raise RuntimeError("Error from libtwml") - - # Store the numpy array to ensure it isn't deleted before self - self._array = array - - self._handle = handle - - self._type = ttype - - @property - def handle(self): - """ - Return the twml handle - """ - return self._handle - - @property - def shape(self): - """ - Return the shape - """ - return self._array.shape - - @property - def ndim(self): - """ - Return the shape - """ - return self._array.ndim - - @property - def array(self): - """ - Return the numpy array - """ - return self._array - - @property - def dtype(self): - """ - Return numpy dtype - """ - return self._array.dtype - - def __del__(self): - """ - Delete the handle - """ - CLIB.twml_tensor_delete(self._handle) diff --git a/twml/twml/block_format_writer.docx b/twml/twml/block_format_writer.docx new file mode 100644 index 000000000..14ecc5e38 Binary files /dev/null and b/twml/twml/block_format_writer.docx differ diff --git a/twml/twml/block_format_writer.py b/twml/twml/block_format_writer.py deleted file mode 100644 index 9c4a9b6a8..000000000 --- a/twml/twml/block_format_writer.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Module containing wrapper class to write block format data""" -import ctypes as ct - -from libtwml import CLIB - - -class BlockFormatWriter(object): - """ - Class to write block format file. - """ - - def __init__(self, file_name, records_per_block=100): - file_name = file_name - if not isinstance(file_name, str): - raise ValueError("file_name has to be of type str") - - self.file_name = ct.c_char_p(file_name.encode()) - self.records_per_block = ct.c_int(int(records_per_block)) - handle = ct.c_void_p(0) - err = CLIB.block_format_writer_create(ct.pointer(handle), - self.file_name, - self.records_per_block) - self._handle = None - # 1000 means TWML_ERR_NONE - if err != 1000: - raise RuntimeError("Error from libtwml") - self._handle = handle - - @property - def handle(self): - """ - Return the handle - """ - return self._handle - - def write(self, class_name, record): - """ - Write a record. - - Note: `record` needs to be in a format that can be converted to ctypes.c_char_p. - """ - if not isinstance(class_name, str): - raise ValueError("class_name has to be of type str") - - record_len = len(record) - class_name = ct.c_char_p(class_name.encode()) - record = ct.c_char_p(record) - err = CLIB.block_format_write(self._handle, class_name, record, record_len) - if err != 1000: - raise RuntimeError("Error from libtwml") - - def flush(self): - """ - Flush records in buffer to outputfile. 
- """ - err = CLIB.block_format_flush(self._handle) - if err != 1000: - raise RuntimeError("Error from libtwml") - - def __del__(self): - """ - Delete the handle - """ - if self._handle: - CLIB.block_format_writer_delete(self._handle) diff --git a/twml/twml/constants.docx b/twml/twml/constants.docx new file mode 100644 index 000000000..fcdc7e305 Binary files /dev/null and b/twml/twml/constants.docx differ diff --git a/twml/twml/constants.py b/twml/twml/constants.py deleted file mode 100644 index c6c726eed..000000000 --- a/twml/twml/constants.py +++ /dev/null @@ -1,11 +0,0 @@ -# These should coincide with 'enum class DecodeMode' values in HashedDataRecordReader.h - -from twitter.deepbird.io.legacy.constants import ( - DECODE_MODES, # noqa: F401 - DEFAULT_DECODE_MODE, # noqa: F401 - HASH_FNAME_AND_VALNAME, # noqa: F401 - HASH_VALNAME, # noqa: F401 - HashingDiscretizerOptions, # noqa: F401 - DEFAULT_ZOOKEEPER_BASE_ZNODE, # noqa: F401 - DEFAULT_ZOOKEEPER_HOST, # noqa: F401 -) diff --git a/twml/twml/contrib/__init__.docx b/twml/twml/contrib/__init__.docx new file mode 100644 index 000000000..07f8e9d4b Binary files /dev/null and b/twml/twml/contrib/__init__.docx differ diff --git a/twml/twml/contrib/__init__.py b/twml/twml/contrib/__init__.py deleted file mode 100644 index 1a5e8efe4..000000000 --- a/twml/twml/contrib/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# pylint: disable=wildcard-import -""" experimental and contributed modules """ - -from . import layers # noqa: F401 -from . import feature_importances # noqa: F401 -from . import calibrators # noqa: F401 -from . import readers # noqa: F401 -from . import utils # noqa: F401 -from . import build_graphs_fns # noqa: F401 -from . import feature_config # noqa: F401 -from . import parsers # noqa: F401 -from . import initializers # noqa: F401 -from . import export # noqa: F401 -from . import feature_config_parsers # noqa: F401 - -# These imports do not work with TF 2.x and are not needed either. -# If you are using TF 2.x, use the modular targets under src/python/twitter/deepbird. -import tensorflow -from . import trainers # noqa: F401 -from . import metrics # noqa: F401 -from . import hooks # noqa: F401 diff --git a/twml/twml/contrib/build_graphs_fns.docx b/twml/twml/contrib/build_graphs_fns.docx new file mode 100644 index 000000000..4fad2fe7f Binary files /dev/null and b/twml/twml/contrib/build_graphs_fns.docx differ diff --git a/twml/twml/contrib/build_graphs_fns.py b/twml/twml/contrib/build_graphs_fns.py deleted file mode 100644 index 829f61512..000000000 --- a/twml/twml/contrib/build_graphs_fns.py +++ /dev/null @@ -1,32 +0,0 @@ -# pylint: disable=unused-argument, missing-docstring -''' -Common build graphs that can be reused -''' -import tensorflow.compat.v1 as tf - - -def get_saved_modules_graph(input_graph_fn): - """ - Get common graph for stitching different saved modules for export. - This graph is used to save checkpoints; and then export the modules - as a unity. - Args: - features: - model features - params: - model params - input_graph_fn: - main logic for the stitching - Returns: - build_graph - """ - def build_graph(features, label, mode, params, config=None): - output = input_graph_fn(features, params) - # If mode is train, we just need to assign a dummy loss - # and update the train op. This is done to save the graph to save_dir. 
- if mode == 'train': - loss = tf.constant(1) - train_op = tf.assign_add(tf.train.get_global_step(), 1) - return {'train_op': train_op, 'loss': loss} - return output - return build_graph diff --git a/twml/twml/contrib/calibrators/__init__.docx b/twml/twml/contrib/calibrators/__init__.docx new file mode 100644 index 000000000..1ceea2abd Binary files /dev/null and b/twml/twml/contrib/calibrators/__init__.docx differ diff --git a/twml/twml/contrib/calibrators/__init__.py b/twml/twml/contrib/calibrators/__init__.py deleted file mode 100644 index 02181ed12..000000000 --- a/twml/twml/contrib/calibrators/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# pylint: disable=wildcard-import -""" -This module contains classes used for calibration. -Typically, each calibrator defines a ``twml.calibrator.Calibrator`` subclass -and a ``twml.calibrator.CalibrationFeature``. -The latter manages weights and values of individual features. -The former manages a set of ``CalibratorFeatures`` -(although some ``Calibrators`` don't use ``CalibrationFeature``). -Ultimately, the ``Calibrator`` should produce an initialized layer via its ``to_layer()`` method. -""" - -from .common_calibrators import calibrate_discretizer_and_export, add_discretizer_arguments # noqa: F401 -from .calibrator import Calibrator # noqa: F401 -from .mdl import MDLCalibrator # noqa: F401 -from .isotonic import IsotonicCalibrator # noqa: F401 -from .percentile_discretizer import PercentileDiscretizerCalibrator # noqa: F401 -from .hashed_percentile_discretizer import HashedPercentileDiscretizerCalibrator # noqa: F401 -from .hashing_discretizer import HashingDiscretizerCalibrator # noqa: F401 \ No newline at end of file diff --git a/twml/twml/contrib/calibrators/calibrator.docx b/twml/twml/contrib/calibrators/calibrator.docx new file mode 100644 index 000000000..9f99ef399 Binary files /dev/null and b/twml/twml/contrib/calibrators/calibrator.docx differ diff --git a/twml/twml/contrib/calibrators/calibrator.py b/twml/twml/contrib/calibrators/calibrator.py deleted file mode 100644 index 7408412e0..000000000 --- a/twml/twml/contrib/calibrators/calibrator.py +++ /dev/null @@ -1,157 +0,0 @@ -# pylint: disable=missing-docstring, unused-argument -''' Contains the base classes for CalibrationFeature and Calibrator ''' - - -from collections import defaultdict - -import numpy as np -import tensorflow.compat.v1 as tf -import tensorflow_hub as hub -import twml -import twml.util - - -class CalibrationFeature(object): - ''' - Accumulates values and weights for individual features. - Typically, each unique feature defined in the accumulated SparseTensor or Tensor - would have its own CalibrationFeature instance. - ''' - - def __init__(self, feature_id): - ''' Constructs a CalibrationFeature - - Arguments: - feature_id: - number identifying the feature. - ''' - self.feature_id = feature_id - self._calibrated = False - self._features_dict = defaultdict(list) - - def add_values(self, new_features): - ''' - Extends lists to contain the values in this batch - ''' - for key in new_features: - self._features_dict[key].append(new_features[key]) - - def _concat_arrays(self): - ''' - This class calls this function after you have added all the values. 
- It creates a dictionary with the concatenated arrays - ''' - self._features_dict.update((k, np.concatenate(v)) for k, v in self._features_dict.items()) - - def calibrate(self, *args, **kwargs): - raise NotImplementedError - - -class Calibrator(object): - ''' - Accumulates features and their respective values for Calibration. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate by calling ``calibrate()``; - 3. convert to a twml.layers layer by calling ``to_layer()``. - - Note you can only use one calibrator per Trainer. - ''' - - def __init__(self, calibrator_name=None, **kwargs): - ''' - Arguments: - calibrator_name: - Default: if set to None it will be the same as the class name. - Please be reminded that if the model contains many calibrators - of the same type, the calibrator_name should be changed to avoid confusion. - ''' - self._calibrated = False - if calibrator_name is None: - calibrator_name = twml.util.to_snake_case(self.__class__.__name__) - self._calibrator_name = calibrator_name - self._kwargs = kwargs - - @property - def is_calibrated(self): - return self._calibrated - - @property - def name(self): - return self._calibrator_name - - def accumulate(self, *args, **kwargs): - '''Accumulates features and their respective values for Calibration.''' - raise NotImplementedError - - def calibrate(self): - '''Calibrates after the accumulation has ended.''' - self._calibrated = True - - def to_layer(self, name=None): - ''' - Returns a twml.layers.Layer instance with the result of the calibrator. - - Arguments: - name: - name-scope of the layer - ''' - raise NotImplementedError - - def get_layer_args(self): - ''' - Returns layer arguments required to implement multi-phase training. - - Returns: - dictionary of Layer constructor arguments to initialize the - layer Variables. Typically, this should contain enough information - to initialize empty layer Variables of the correct size, which will then - be filled with the right data using init_map. - ''' - raise NotImplementedError - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_dir. - Arguments: - save_dir: - name of the saving directory. - name: - name for the calibrator. Default (string): "default". - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate(). Cannot save() prior to calibrate()") - - # This module allows the calibrator to be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - inputs = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - output = calibrator_layer(inputs) - # creates the signature to the calibrator module - hub.add_signature(inputs=inputs, outputs=output, name=name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - def write_summary(self, writer, sess=None): - """ - This method is called by save() to write tensorboard summaries to disk. - See MDLCalibrator.write_summary for an example. - By default, the method does nothing. It can be overridden by child classes. - - Arguments: - writer: - `tf.summary.FileWriter - `_ - instance. 
- The ``writer`` is used to add summaries to event files for inclusion in tensorboard. - sess (optional): - `tf.Session `_ - instance. The ``sess`` is used to produces summaries for the writer. - """ diff --git a/twml/twml/contrib/calibrators/common_calibrators.docx b/twml/twml/contrib/calibrators/common_calibrators.docx new file mode 100644 index 000000000..232b66260 Binary files /dev/null and b/twml/twml/contrib/calibrators/common_calibrators.docx differ diff --git a/twml/twml/contrib/calibrators/common_calibrators.py b/twml/twml/contrib/calibrators/common_calibrators.py deleted file mode 100644 index 5301901e4..000000000 --- a/twml/twml/contrib/calibrators/common_calibrators.py +++ /dev/null @@ -1,707 +0,0 @@ -# pylint: disable=invalid-name, no-member, unused-argument -""" -This module contains common calibrate and export functions for calibrators. -""" - -# These 3 TODO are encapsulated by CX-11446 -# TODO: many of these functions hardcode datarecords yet don't allow passing a parse_fn. -# TODO: provide more generic (non DataRecord specific) functions -# TODO: many of these functions aren't common at all. -# For example, Discretizer functions should be moved to PercentileDiscretizer. - -import copy -import os -import time - -from absl import logging -import tensorflow.compat.v1 as tf -import tensorflow_hub as hub -import twml -from twml.argument_parser import SortingHelpFormatter -from twml.input_fns import data_record_input_fn -from twml.util import list_files_by_datetime, sanitize_hdfs_path -from twml.contrib.calibrators.isotonic import IsotonicCalibrator - - -def calibrator_arguments(parser): - """ - Calibrator Parameters to add to relevant parameters to the DataRecordTrainerParser. - Otherwise, if alone in a file, it just creates its own default parser. - Arguments: - parser: - Parser with the options to the model - """ - parser.add_argument("--calibrator.save_dir", type=str, - dest="calibrator_save_dir", - help="Path to save or load calibrator calibration") - parser.add_argument("--calibrator_batch_size", type=int, default=128, - dest="calibrator_batch_size", - help="calibrator batch size") - parser.add_argument("--calibrator_parts_downsampling_rate", type=float, default=1, - dest="calibrator_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--calibrator_max_steps", type=int, default=None, - dest="calibrator_max_steps", - help="Max Steps taken by calibrator to accumulate samples") - parser.add_argument("--calibrator_num_bins", type=int, default=22, - dest="calibrator_num_bins", - help="Num bins of calibrator") - parser.add_argument("--isotonic_calibrator", dest='isotonic_calibrator', action='store_true', - help="Isotonic Calibrator present") - parser.add_argument("--calibrator_keep_rate", type=float, default=1.0, - dest="calibrator_keep_rate", - help="Keep rate") - return parser - - -def _generate_files_by_datetime(params): - - files = list_files_by_datetime( - base_path=sanitize_hdfs_path(params.train_data_dir), - start_datetime=params.train_start_datetime, - end_datetime=params.train_end_datetime, - datetime_prefix_format=params.datetime_format, - extension="lzo", - parallelism=1, - hour_resolution=params.hour_resolution, - sort=True) - - return files - - -def get_calibrate_input_fn(parse_fn, params): - """ - Default input function used for the calibrator. 
- Arguments: - parse_fn: - Parse_fn - params: - Parameters - Returns: - input_fn - """ - - return lambda: data_record_input_fn( - files=_generate_files_by_datetime(params), - batch_size=params.calibrator_batch_size, - parse_fn=parse_fn, - num_threads=1, - repeat=False, - keep_rate=params.calibrator_keep_rate, - parts_downsampling_rate=params.calibrator_parts_downsampling_rate, - shards=None, - shard_index=None, - shuffle=True, - shuffle_files=True, - interleave=True) - - -def get_discretize_input_fn(parse_fn, params): - """ - Default input function used for the calibrator. - Arguments: - parse_fn: - Parse_fn - params: - Parameters - Returns: - input_fn - """ - - return lambda: data_record_input_fn( - files=_generate_files_by_datetime(params), - batch_size=params.discretizer_batch_size, - parse_fn=parse_fn, - num_threads=1, - repeat=False, - keep_rate=params.discretizer_keep_rate, - parts_downsampling_rate=params.discretizer_parts_downsampling_rate, - shards=None, - shard_index=None, - shuffle=True, - shuffle_files=True, - interleave=True) - - -def discretizer_arguments(parser=None): - """ - Discretizer Parameters to add to relevant parameters to the DataRecordTrainerParser. - Otherwise, if alone in a file, it just creates its own default parser. - Arguments: - parser: - Parser with the options to the model. Defaults to None - """ - - if parser is None: - parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) - parser.add_argument( - "--overwrite_save_dir", dest="overwrite_save_dir", action="store_true", - help="Delete the contents of the current save_dir if it exists") - parser.add_argument( - "--train.data_dir", "--train_data_dir", type=str, default=None, - dest="train_data_dir", - help="Path to the training data directory." - "Supports local and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--train.start_date", "--train_start_datetime", - type=str, default=None, - dest="train_start_datetime", - help="Starting date for training inside the train data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--train.end_date", "--train_end_datetime", type=str, default=None, - dest="train_end_datetime", - help="Ending date for training inside the train data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--datetime_format", type=str, default="%Y/%m/%d", - help="Date format for training and evaluation datasets." - "Has to be a format that is understood by python datetime." - "e.g. %Y/%m/%d for 2019/01/15." - "Used only if {train/eval}.{start/end}_date are provided.") - parser.add_argument( - "--hour_resolution", type=int, default=None, - help="Specify the hourly resolution of the stored data.") - parser.add_argument( - "--tensorboard_port", type=int, default=None, - help="Port for tensorboard to run on.") - parser.add_argument( - "--stats_port", type=int, default=None, - help="Port for stats server to run on.") - parser.add_argument( - "--health_port", type=int, default=None, - help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." - "Not user-facing as it is set automatically by the twml_cli." - ) - parser.add_argument( - "--data_spec", type=str, default=None, - help="Path to data specification JSON file. 
This file is used to decode DataRecords") - parser.add_argument("--discretizer.save_dir", type=str, - dest="discretizer_save_dir", - help="Path to save or load discretizer calibration") - parser.add_argument("--discretizer_batch_size", type=int, default=128, - dest="discretizer_batch_size", - help="Discretizer batch size") - parser.add_argument("--discretizer_keep_rate", type=float, default=0.0008, - dest="discretizer_keep_rate", - help="Keep rate") - parser.add_argument("--discretizer_parts_downsampling_rate", type=float, default=0.2, - dest="discretizer_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--discretizer_max_steps", type=int, default=None, - dest="discretizer_max_steps", - help="Max Steps taken by discretizer to accumulate samples") - return parser - - -def calibrate(trainer, params, build_graph, input_fn, debug=False): - """ - Calibrate Isotonic Calibration - Arguments: - trainer: - Trainer - params: - Parameters - build_graph: - Build Graph used to be the input to the calibrator - input_fn: - Input Function specified by the user - debug: - Defaults to False. Returns the calibrator - """ - - if trainer._estimator.config.is_chief: - - # overwrite the current save_dir - if params.overwrite_save_dir and tf.io.gfile.exists(params.calibrator_save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % params.calibrator_save_dir) - tf.io.gfile.rmtree(params.calibrator_save_dir) - - calibrator = IsotonicCalibrator(params.calibrator_num_bins) - - # chief trains discretizer - logging.info("Chief training calibrator") - - # Accumulate the features for each calibrator - features, labels = input_fn() - if 'weights' not in features: - raise ValueError("Weights need to be returned as part of the parse_fn") - weights = features.pop('weights') - - preds = build_graph(features=features, label=None, mode='infer', params=params, config=None) - init = tf.global_variables_initializer() - table_init = tf.tables_initializer() - with tf.Session() as sess: - sess.run(init) - sess.run(table_init) - count = 0 - max_steps = params.calibrator_max_steps or -1 - while max_steps <= 0 or count <= max_steps: - try: - weights_vals, labels_vals, preds_vals = sess.run([weights, labels, preds['output']]) - calibrator.accumulate(preds_vals, labels_vals, weights_vals.flatten()) - except tf.errors.OutOfRangeError: - break - count += 1 - - calibrator.calibrate() - calibrator.save(params.calibrator_save_dir) - trainer.estimator._params.isotonic_calibrator = True - - if debug: - return calibrator - - else: - calibrator_save_dir = twml.util.sanitize_hdfs_path(params.calibrator_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(calibrator_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % calibrator_save_dir) - time.sleep(60) - - -def discretize(params, feature_config, input_fn, debug=False): - """ - Discretizes continuous features - Arguments: - params: - Parameters - input_fn: - Input Function specified by the user - debug: - Defaults to False. 
Returns the calibrator - """ - - if (os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" or "num_workers" not in params or - params.num_workers is None): - - # overwrite the current save_dir - if params.overwrite_save_dir and tf.io.gfile.exists(params.discretizer_save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % params.discretizer_save_dir) - tf.io.gfile.rmtree(params.discretizer_save_dir) - - config_map = feature_config() - discretize_dict = config_map['discretize_config'] - - # chief trains discretizer - logging.info("Chief training discretizer") - - batch = input_fn() - # Accumulate the features for each calibrator - with tf.Session() as sess: - count = 0 - max_steps = params.discretizer_max_steps or -1 - while max_steps <= 0 or count <= max_steps: - try: - inputs = sess.run(batch) - for name, clbrt in discretize_dict.items(): - clbrt.accumulate_features(inputs[0], name) - except tf.errors.OutOfRangeError: - break - count += 1 - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - for name, clbrt in discretize_dict.items(): - clbrt.calibrate() - clbrt.add_hub_signatures(name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(params.discretizer_save_dir, session) - - for name, clbrt in discretize_dict.items(): - clbrt.write_summary_json(params.discretizer_save_dir, name) - - if debug: - return discretize_dict - - else: - # wait for the file to be removed (if necessary) - # should be removed after an actual fix applied - time.sleep(60) - discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(discretizer_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) - time.sleep(60) - - -def add_discretizer_arguments(parser): - """ - Add discretizer-specific command-line arguments to a Trainer parser. - - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - - parser.add_argument("--discretizer.save_dir", type=str, - dest="discretizer_save_dir", - help="Path to save or load discretizer calibration") - parser.add_argument("--discretizer.batch_size", type=int, default=128, - dest="discretizer_batch_size", - help="Discretizer batch size") - parser.add_argument("--discretizer.keep_rate", type=float, default=0.0008, - dest="discretizer_keep_rate", - help="Keep rate") - parser.add_argument("--discretizer.parts_downsampling_rate", type=float, default=0.2, - dest="discretizer_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--discretizer.num_bins", type=int, default=20, - dest="discretizer_num_bins", - help="Number of bins per feature") - parser.add_argument("--discretizer.output_size_bits", type=int, default=22, - dest="discretizer_output_size_bits", - help="Number of bits allocated to the output size") - return parser - - -def add_isotonic_calibrator_arguments(parser): - """ - Add discretizer-specific command-line arguments to a Trainer parser. 
- - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - parser.add_argument("--calibrator.num_bins", type=int, - default=25000, dest="calibrator_num_bins", - help="number of bins for isotonic calibration") - parser.add_argument("--calibrator.parts_downsampling_rate", type=float, default=0.1, - dest="calibrator_parts_downsampling_rate", help="Parts downsampling rate") - parser.add_argument("--calibrator.save_dir", type=str, - dest="calibrator_save_dir", help="Path to save or load calibrator output") - parser.add_argument("--calibrator.load_tensorflow_module", type=str, default=None, - dest="calibrator_load_tensorflow_module", - help="Location from where to load a pretrained graph from. \ - Typically, this is where the MLP graph is saved") - parser.add_argument("--calibrator.export_mlp_module_name", type=str, default='tf_hub_mlp', - help="Name for loaded hub signature", - dest="export_mlp_module_name") - parser.add_argument("--calibrator.export_isotonic_module_name", - type=str, default="tf_hub_isotonic", - dest="calibrator_export_module_name", - help="export module name") - parser.add_argument("--calibrator.final_evaluation_steps", type=int, - dest="calibrator_final_evaluation_steps", default=None, - help="number of steps for final evaluation") - parser.add_argument("--calibrator.train_steps", type=int, default=-1, - dest="calibrator_train_steps", - help="number of steps for calibration") - parser.add_argument("--calibrator.batch_size", type=int, default=1024, - dest="calibrator_batch_size", - help="Calibrator batch size") - parser.add_argument("--calibrator.is_calibrating", action='store_true', - dest="is_calibrating", - help="Dummy argument to allow running in chief worker") - return parser - - -def calibrate_calibrator_and_export(name, calibrator, build_graph_fn, params, feature_config, - run_eval=True, input_fn=None, metric_fn=None, - export_task_type_overrider=None): - """ - Pre-set `isotonic calibrator` calibrator. - Args: - name: - scope name used for the calibrator - calibrator: - calibrator that will be calibrated and exported. - build_graph_fn: - build graph function for the calibrator - params: - params passed to the calibrator - feature_config: - feature config which will be passed to the trainer - export_task_type_overrider: - the task type for exporting the calibrator - if specified, this will override the default export task type in trainer.hub_export(..) - """ - - # create calibrator params - params_c = copy.deepcopy(params) - params_c.data_threads = 1 - params_c.num_workers = 1 - params_c.continue_from_checkpoint = True - params_c.overwrite_save_dir = False - params_c.stats_port = None - - # Automatically load from the saved Tensorflow Hub module if not specified. - if params_c.calibrator_load_tensorflow_module is None: - path_saved_tensorflow_model = os.path.join(params.save_dir, params.export_mlp_module_name) - params_c.calibrator_load_tensorflow_module = path_saved_tensorflow_model - - if "calibrator_parts_downsampling_rate" in params_c: - params_c.train_parts_downsampling_rate = params_c.calibrator_parts_downsampling_rate - if "calibrator_save_dir" in params_c: - params_c.save_dir = params_c.calibrator_save_dir - if "calibrator_batch_size" in params_c: - params_c.train_batch_size = params_c.calibrator_batch_size - params_c.eval_batch_size = params_c.calibrator_batch_size - # TODO: Deprecate this option. It is not actually used. 
Calibrator - # simply iterates until the end of input_fn. - if "calibrator_train_steps" in params_c: - params_c.train_steps = params_c.calibrator_train_steps - - if metric_fn is None: - metric_fn = twml.metrics.get_multi_binary_class_metric_fn(None) - - # Common Trainer which will also be used by all workers - trainer = twml.trainers.DataRecordTrainer( - name=name, - params=params_c, - feature_config=feature_config, - build_graph_fn=build_graph_fn, - save_dir=params_c.save_dir, - metric_fn=metric_fn - ) - - if trainer._estimator.config.is_chief: - - # Chief trains calibrator - logging.info("Chief training calibrator") - - # Disregard hogwild config - os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") - os.environ["TWML_HOGWILD_PORTS"] = "" - - hooks = None - if params_c.calibrator_train_steps > 0: - hooks = [twml.hooks.StepProgressHook(params_c.calibrator_train_steps)] - - def parse_fn(input_x): - fc_parse_fn = feature_config.get_parse_fn() - features, labels = fc_parse_fn(input_x) - features['labels'] = labels - return features, labels - - if input_fn is None: - input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) - - # Calibrate stage - trainer.estimator._params.mode = 'calibrate' - trainer.calibrate(calibrator=calibrator, - input_fn=input_fn, - steps=params_c.calibrator_train_steps, - hooks=hooks) - - # Save Checkpoint - # We need to train for 1 step, to save the graph to checkpoint. - # This is done just by the chief. - # We need to set the mode to evaluate to save the graph that will be consumed - # In the final evaluation - trainer.estimator._params.mode = 'evaluate' - trainer.train(input_fn=input_fn, steps=1) - - # Restore hogwild setup - if os_twml_hogwild_ports is not None: - os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports - else: - # Workers wait for calibration to be ready - final_calibrator_path = os.path.join(params_c.calibrator_save_dir, - params_c.calibrator_export_module_name) - - final_calibrator_path = twml.util.sanitize_hdfs_path(final_calibrator_path) - - while not tf.io.gfile.exists(final_calibrator_path + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % final_calibrator_path) - time.sleep(60) - - # Evaluate stage - if run_eval: - trainer.estimator._params.mode = 'evaluate' - # This will allow the Evaluate method to be run in Hogwild - # trainer.estimator._params.continue_from_checkpoint = True - trainer.evaluate(name='test', input_fn=input_fn, steps=params_c.calibrator_final_evaluation_steps) - - trainer.hub_export(name=params_c.calibrator_export_module_name, - export_task_type_overrider=export_task_type_overrider, - serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn()) - - return trainer - - -def calibrate_discretizer_and_export(name, calibrator, build_graph_fn, params, feature_config): - """ - Pre-set percentile discretizer calibrator. - Args: - name: - scope name used for the calibrator - calibrator: - calibrator that will be calibrated and exported. - build_graph_fn: - build graph function for the calibrator - params: - params passed to the calibrator - feature_config: - feature config or input_fn which will be passed to the trainer. 
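As a usage sketch, calibrate_calibrator_and_export above is typically driven from a small main() that reuses the trainer's own argument parser. This is a hedged illustration only: every name prefixed with my_ is a placeholder, and the parser helper is assumed to be the one the docstrings above refer to as Trainer.get_trainer_parser.

import twml
from twml.contrib.calibrators import IsotonicCalibrator
from twml.contrib.calibrators.common_calibrators import (
    add_isotonic_calibrator_arguments,
    calibrate_calibrator_and_export,
)

def main():
    # Assumed helper for building the standard trainer parser.
    parser = twml.trainers.DataRecordTrainer.get_trainer_parser()
    parser = add_isotonic_calibrator_arguments(parser)
    params = parser.parse_args()

    calibrator = IsotonicCalibrator(params.calibrator_num_bins)
    calibrate_calibrator_and_export(
        name="my_calibration",             # placeholder scope name
        calibrator=calibrator,
        build_graph_fn=my_build_graph,     # placeholder build_graph function
        params=params,
        feature_config=my_feature_config,  # placeholder FeatureConfig
    )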
- """ - - if (os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" or "num_workers" not in params or - params.num_workers is None): - - # chief trains discretizer - logging.info("Chief training discretizer") - - # disregard hogwild config - os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") - os.environ["TWML_HOGWILD_PORTS"] = "" - - # create discretizer params - params_c = copy.deepcopy(params) - params_c.data_threads = 1 - params_c.train_steps = -1 - params_c.train_max_steps = None - params_c.eval_steps = -1 - params_c.num_workers = 1 - params_c.tensorboard_port = None - params_c.stats_port = None - - if "discretizer_batch_size" in params_c: - params_c.train_batch_size = params_c.discretizer_batch_size - params_c.eval_batch_size = params_c.discretizer_batch_size - if "discretizer_keep_rate" in params_c: - params_c.train_keep_rate = params_c.discretizer_keep_rate - if "discretizer_parts_downsampling_rate" in params_c: - params_c.train_parts_downsampling_rate = params_c.discretizer_parts_downsampling_rate - if "discretizer_save_dir" in params_c: - params_c.save_dir = params_c.discretizer_save_dir - - # train discretizer - trainer = twml.trainers.DataRecordTrainer( - name=name, - params=params_c, - build_graph_fn=build_graph_fn, - save_dir=params_c.save_dir, - ) - - if isinstance(feature_config, twml.feature_config.FeatureConfig): - parse_fn = twml.parsers.get_continuous_parse_fn(feature_config) - input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) - elif callable(feature_config): - input_fn = feature_config - else: - got_type = type(feature_config).__name__ - raise ValueError( - "Expecting feature_config to be FeatureConfig or function got %s" % got_type) - - hooks = None - if params_c.train_steps > 0: - hooks = [twml.hooks.StepProgressHook(params_c.train_steps)] - - trainer.calibrate(calibrator=calibrator, input_fn=input_fn, - steps=params_c.train_steps, hooks=hooks) - # restore hogwild setup - if os_twml_hogwild_ports is not None: - os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports - else: - discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(discretizer_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) - time.sleep(60) - - -def build_percentile_discretizer_graph(features, label, mode, params, config=None): - """ - Pre-set Percentile Discretizer Build Graph - Follows the same signature as build_graph - """ - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - weights = tf.reshape(features['weights'], tf.reshape(features['batch_size'], [1])) - if isinstance(sparse_tf, tf.SparseTensor): - indices = sparse_tf.indices[:, 1] - ids = sparse_tf.indices[:, 0] - elif isinstance(sparse_tf, twml.SparseTensor): - indices = sparse_tf.indices - ids = sparse_tf.ids - - # Return weights, feature_ids, feature_values - weights = tf.gather(params=weights, indices=ids) - feature_ids = indices - feature_values = sparse_tf.values - # Update train_op and assign dummy_loss - train_op = tf.assign_add(tf.train.get_global_step(), 1) - loss = tf.constant(1) - if mode == 'train': - return {'train_op': train_op, 'loss': loss} - return {'feature_ids': feature_ids, 'feature_values': feature_values, 'weights': weights} - - -def isotonic_module(mode, params): - """ - Common Isotonic Calibrator module for Hub Export - """ - inputs = tf.sparse_placeholder(tf.float32, name="sparse_input") - 
mlp = hub.Module(params.calibrator_load_tensorflow_module) - logits = mlp(inputs, signature=params.export_mlp_module_name) - isotonic_calibrator = hub.Module(params.save_dir) - output = isotonic_calibrator(logits, signature="isotonic_calibrator") - hub.add_signature(inputs={"sparse_input": inputs}, - outputs={"default": output}, - name=params.calibrator_export_module_name) - - -def build_isotonic_graph_from_inputs(inputs, features, label, mode, params, config=None, isotonic_fn=None): - """ - Helper function to build_isotonic_graph - Pre-set Isotonic Calibrator Build Graph - Follows the same signature as build_graph - """ - if params.mode == 'calibrate': - mlp = hub.Module(params.calibrator_load_tensorflow_module) - logits = mlp(inputs, signature=params.export_mlp_module_name) - weights = tf.reshape(features['weights'], tf.reshape(features['batch_size'], [1])) - # Update train_op and assign dummy_loss - train_op = tf.assign_add(tf.train.get_global_step(), 1) - loss = tf.constant(1) - if mode == 'train': - return {'train_op': train_op, 'loss': loss} - return {'predictions': logits, 'targets': features['labels'], 'weights': weights} - else: - if isotonic_fn is None: - isotonic_spec = twml.util.create_module_spec(mlp_fn=isotonic_module, mode=mode, params=params) - else: - isotonic_spec = twml.util.create_module_spec(mlp_fn=isotonic_fn, mode=mode, params=params) - output_hub = hub.Module(isotonic_spec, - name=params.calibrator_export_module_name) - hub.register_module_for_export(output_hub, params.calibrator_export_module_name) - output = output_hub(inputs, signature=params.calibrator_export_module_name) - output = tf.clip_by_value(output, 0, 1) - loss = tf.reduce_sum(tf.stop_gradient(output)) - train_op = tf.assign_add(tf.train.get_global_step(), 1) - return {'train_op': train_op, 'loss': loss, 'output': output} - - -def build_isotonic_graph(features, label, mode, params, config=None, export_discretizer=True): - """ - Pre-set Isotonic Calibrator Build Graph - Follows the same signature as build_graph - This assumes that MLP already contains all modules (include percentile - discretizer); if export_discretizer is set - then it does not export the MDL phase. 
- """ - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - if export_discretizer: - return build_isotonic_graph_from_inputs(sparse_tf, features, label, mode, params, config) - discretizer = hub.Module(params.discretizer_path) - - if params.discretizer_signature is None: - discretizer_signature = "percentile_discretizer_calibrator" - else: - discretizer_signature = params.discretizer_signature - input_sparse = discretizer(sparse_tf, signature=discretizer_signature) - return build_isotonic_graph_from_inputs(input_sparse, features, label, mode, params, config) diff --git a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.docx b/twml/twml/contrib/calibrators/hashed_percentile_discretizer.docx new file mode 100644 index 000000000..6be90b65b Binary files /dev/null and b/twml/twml/contrib/calibrators/hashed_percentile_discretizer.docx differ diff --git a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py b/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py deleted file mode 100644 index e14f62303..000000000 --- a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py +++ /dev/null @@ -1,22 +0,0 @@ -# pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains HashedPercentileDiscretizerCalibrator used for calibration ''' -from .percentile_discretizer import PercentileDiscretizerCalibrator - -import twml - - -class HashedPercentileDiscretizerCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for HashedPercentileDiscretizer calibration. - This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's - `to_layer` method returns a HashedPercentileDiscretizer instead. - ''' - - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - return twml.contrib.layers.HashedPercentileDiscretizer( - n_feature=n_feature, n_bin=self._n_bin, - name=name, out_bits=self._out_bits, - hash_keys=hash_map_keys, hash_values=hash_map_values, - bin_ids=self._bin_ids.flatten(), bin_values=self._bin_vals.flatten(), - feature_offsets=feature_offsets - ) diff --git a/twml/twml/contrib/calibrators/hashing_discretizer.docx b/twml/twml/contrib/calibrators/hashing_discretizer.docx new file mode 100644 index 000000000..aab1d3827 Binary files /dev/null and b/twml/twml/contrib/calibrators/hashing_discretizer.docx differ diff --git a/twml/twml/contrib/calibrators/hashing_discretizer.py b/twml/twml/contrib/calibrators/hashing_discretizer.py deleted file mode 100644 index 965ced934..000000000 --- a/twml/twml/contrib/calibrators/hashing_discretizer.py +++ /dev/null @@ -1,35 +0,0 @@ -# pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains HashedPercentileDiscretizerCalibrator used for calibration ''' -from .percentile_discretizer import PercentileDiscretizerCalibrator - -import numpy as np -import twml - - -class HashingDiscretizerCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for HashingDiscretizer calibration. - This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's - `to_layer` method returns a HashingDiscretizer instead. 
- ''' - - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - # Need to sort hash_map_keys according to hash_map_values - # just in case they're not in order of being put in the dict - # hash_map_values is already 0 through len(hash_map_values)-1 - hash_map_keys = hash_map_keys.flatten() - # why is this float32 in PercentileDiscretizerCalibrator.to_layer ???? - # need int for indexing - hash_map_values = hash_map_values.flatten().astype(np.int32) - feature_ids = np.zeros((len(hash_map_keys),), dtype=np.int64) - for idx in range(len(hash_map_keys)): - feature_ids[hash_map_values[idx]] = hash_map_keys[idx] - - return twml.contrib.layers.HashingDiscretizer( - feature_ids=feature_ids, - bin_vals=self._bin_vals.flatten(), - n_bin=self._n_bin + 1, # (self._n_bin + 1) bin_vals for each feature_id - out_bits=self._out_bits, - cost_per_unit=500, - name=name - ) diff --git a/twml/twml/contrib/calibrators/isotonic.docx b/twml/twml/contrib/calibrators/isotonic.docx new file mode 100644 index 000000000..509b028e3 Binary files /dev/null and b/twml/twml/contrib/calibrators/isotonic.docx differ diff --git a/twml/twml/contrib/calibrators/isotonic.py b/twml/twml/contrib/calibrators/isotonic.py deleted file mode 100644 index d03a75ff8..000000000 --- a/twml/twml/contrib/calibrators/isotonic.py +++ /dev/null @@ -1,317 +0,0 @@ -# pylint: disable=arguments-differ, unused-argument -''' Contains Isotonic Calibration''' - -from .calibrator import CalibrationFeature, Calibrator - -from absl import logging -import numpy as np -from sklearn.isotonic import isotonic_regression -import tensorflow.compat.v1 as tf -import tensorflow_hub as hub -import twml -import twml.layers - - -DEFAULT_SAMPLE_WEIGHT = 1 - - -def sort_values(inputs, target, weight, ascending=True): - ''' - Sorts arrays based on the first array. - - Arguments: - inputs: - 1D array which will dictate the order which the remainder 2 arrays will be sorted - target: - 1D array - weight: - 1D array - ascending: - Boolean. If set to True (the default), sorts values in ascending order. - - Returns: - sorted inputs: - 1D array sorted by the order of `ascending` - sorted targets: - 1D array - sorted weight: - 1D array - ''' - # assert that the length of inputs and target are the same - if len(inputs) != len(target): - raise ValueError('Expecting inputs and target sizes to match') - # assert that the length of inputs and weight are the same - if len(inputs) != len(weight): - raise ValueError('Expecting inputs and weight sizes to match') - inds = inputs.argsort() - if not ascending: - inds = inds[::-1] - return inputs[inds], target[inds], weight[inds] - - -class IsotonicFeature(CalibrationFeature): - ''' - IsotonicFeature adds values, weights and targets to each feature and then runs - isotonic regression by calling `sklearn.isotonic.isotonic_regression - `_ - ''' - - def _get_bin_boundaries(self, n_samples, bins, similar_bins): - """ - Calculates the sample indices that define bin boundaries - - Arguments: - n_samples: - (int) number of samples - bins: - (int) number of bins. Needs to be smaller or equal than n_samples. - similar_bins: - (bool) If True, samples will be distributed in bins of equal size (up to one sample). - If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples. - Note that equal_bins=False can create a last bins with a very large number of samples. 
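# Worked example of the two binning schemes described above (the implementation,
# _get_bin_boundaries, follows just below): n_samples=10, bins=3.
import numpy as np

n_samples, bins = 10, 3
step = n_samples // bins  # 3

# similar_bins=True: bins of near-equal size (up to one sample)
similar = np.append(np.linspace(0, n_samples - step, num=bins, dtype=int), n_samples)
# -> [0, 3, 7, 10]  (bin sizes 3, 4, 3)

# similar_bins=False: fixed step, the last bin absorbs the remainder
fixed = np.append(np.arange(0, step * bins, step), n_samples)
# -> [0, 3, 6, 10]  (bin sizes 3, 3, 4)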
- - Returns: - (list[int]) List of sample indices defining bin boundaries - """ - - if bins > n_samples: - raise ValueError( - "The number of bins needs to be less than or equal to the number of samples. " - "Currently bins={0} and n_samples={1}.".format(bins, n_samples) - ) - - step = n_samples // bins - - if similar_bins: - # dtype=int will floor the linspace - bin_boundaries = np.linspace(0, n_samples - step, num=bins, dtype=int) - else: - bin_boundaries = range(0, step * bins, step) - - bin_boundaries = np.append(bin_boundaries, n_samples) - - return bin_boundaries - - def calibrate(self, bins, similar_bins=False, debug=False): - '''Calibrates the IsotonicFeature into calibrated weights and bias. - - 1. Sorts the values of the feature class, based on the order of values - 2. Performs isotonic regression using sklearn.isotonic.isotonic_regression - 3. Performs the binning of the samples, in order to obtain the final weight and bias - which will be used for inference - - Note that this method can only be called once. - - Arguments: - bins: - number of bins. - similar_bins: - If True, samples will be distributed in bins of equal size (up to one sample). - If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples. - Note that equal_bins=False can create a last bins with a very large number of samples. - debug: - Defaults to False. If debug is set to true, output other parameters useful for debugging. - - Returns: - [calibrated weight, calibrated bias] - ''' - if self._calibrated: - raise RuntimeError("Can only calibrate once") - # parse through the dict to obtain the targets, weights and values - self._concat_arrays() - feature_targets = self._features_dict['targets'] - feature_values = self._features_dict['values'] - feature_weights = self._features_dict['weights'] - srtd_feature_values, srtd_feature_targets, srtd_feature_weights = sort_values( - inputs=feature_values, - target=feature_targets, - weight=feature_weights - ) - calibrated_feature_values = isotonic_regression( - srtd_feature_targets, sample_weight=srtd_feature_weights) - # create the final outputs for the prediction of each class - bpreds = [] - btargets = [] - bweights = [] - rpreds = [] - - # Create bin boundaries - bin_boundaries = self._get_bin_boundaries( - len(calibrated_feature_values), bins, similar_bins=similar_bins) - - for sidx, eidx in zip(bin_boundaries, bin_boundaries[1:]): - # separate each one of the arrays based on their respective bins - lpreds = srtd_feature_values[int(sidx):int(eidx)] - lrpreds = calibrated_feature_values[int(sidx):int(eidx)] - ltargets = srtd_feature_targets[int(sidx):int(eidx)] - lweights = srtd_feature_weights[int(sidx):int(eidx)] - - # calculate the outputs (including the bpreds and rpreds) - bpreds.append(np.sum(lpreds * lweights) / (np.squeeze(np.sum(lweights)))) - rpreds.append(np.sum(lrpreds * lweights) / (np.squeeze(np.sum(lweights)))) - btargets.append(np.sum(ltargets * lweights) / (np.squeeze(np.sum(lweights)))) - bweights.append(np.squeeze(np.sum(lweights))) - # transposing the bpreds and rpreds which will be used as input to the inference step - bpreds = np.asarray(bpreds).T - rpreds = np.asarray(rpreds).T - btargets = np.asarray(btargets).T - bweights = np.asarray(bweights).T - # setting _calibrated to be True which is necessary in order to prevent it to re-calibrate - self._calibrated = True - if debug: - return bpreds, rpreds, btargets, bweights - return bpreds, rpreds - - -class IsotonicCalibrator(Calibrator): - ''' 
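# Self-contained numpy/sklearn illustration of the calibrate() steps above:
# sort by raw score, run isotonic regression on the targets, then take weighted
# per-bin averages. Equal-size index bins are used here for brevity; the real
# code derives the boundaries from _get_bin_boundaries.
import numpy as np
from sklearn.isotonic import isotonic_regression

rng = np.random.RandomState(0)
raw = rng.uniform(size=100).astype(np.float32)               # raw model scores
targets = (rng.uniform(size=100) < raw).astype(np.float32)   # noisy binary labels
weights = np.ones(100, dtype=np.float32)

order = np.argsort(raw)
raw, targets, weights = raw[order], targets[order], weights[order]
calibrated = isotonic_regression(targets, sample_weight=weights)  # monotone fit

boundaries = np.linspace(0, raw.size, num=5, dtype=int)           # 4 bins
bpreds = [np.average(raw[s:e], weights=weights[s:e])              # mean raw score per bin
          for s, e in zip(boundaries, boundaries[1:])]
rpreds = [np.average(calibrated[s:e], weights=weights[s:e])       # mean calibrated score per bin
          for s, e in zip(boundaries, boundaries[1:])]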
Accumulates features and their respective values for isotonic calibration. - Internally, each feature's values is accumulated via its own isotonicFeature object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into Isotonic ``bpreds``, ``rpreds`` by calling ``calibrate()``; and - 3. convert to a ``twml.layers.Isotonic`` layer by calling ``to_layer()``. - - ''' - - def __init__(self, n_bin, similar_bins=False, **kwargs): - ''' Constructs an isotonicCalibrator instance. - - Arguments: - n_bin: - the number of bins per feature to use for isotonic. - Note that each feature actually maps to ``n_bin+1`` output IDs. - ''' - super(IsotonicCalibrator, self).__init__(**kwargs) - self._n_bin = n_bin - self._similar_bins = similar_bins - self._ys_input = [] - self._xs_input = [] - self._isotonic_feature_dict = {} - - def accumulate_feature(self, output): - ''' - Wrapper around accumulate for trainer API. - Arguments: - output: output of prediction of build_graph for calibrator - ''' - weights = output['weights'] if 'weights' in output else None - return self.accumulate(output['predictions'], output['targets'], weights) - - def accumulate(self, predictions, targets, weights=None): - ''' - Accumulate a single batch of class predictions, class targets and class weights. - These are accumulated until calibrate() is called. - - Arguments: - predictions: - float matrix of class values. Each dimension corresponds to a different class. - Shape is ``[n, d]``, where d is the number of classes. - targets: - float matrix of class targets. Each dimension corresponds to a different class. - Shape ``[n, d]``, where d is the number of classes. - weights: - Defaults to weights of 1. - 1D array containing the weights of each prediction. - ''' - if predictions.shape != targets.shape: - raise ValueError( - 'Expecting predictions.shape == targets.shape, got %s and %s instead' % - (str(predictions.shape), str(targets.shape))) - if weights is not None: - if weights.ndim != 1: - raise ValueError('Expecting 1D weight, got %dD instead' % weights.ndim) - elif weights.size != predictions.shape[0]: - raise ValueError( - 'Expecting predictions.shape[0] == weights.size, got %d != %d instead' % - (predictions.shape[0], weights.size)) - # iterate through the rows of predictions and sets one class to each row - if weights is None: - weights = np.full(predictions.shape[0], fill_value=DEFAULT_SAMPLE_WEIGHT) - for class_key in range(predictions.shape[1]): - # gets the predictions and targets for that class - class_predictions = predictions[:, class_key] - class_targets = targets[:, class_key] - if class_key not in self._isotonic_feature_dict: - isotonic_feature = IsotonicFeature(class_key) - self._isotonic_feature_dict[class_key] = isotonic_feature - else: - isotonic_feature = self._isotonic_feature_dict[class_key] - isotonic_feature.add_values({'values': class_predictions, 'weights': weights, - 'targets': class_targets}) - - def calibrate(self, debug=False): - ''' - Calibrates each IsotonicFeature after accumulation is complete. - Results are stored in ``self._ys_input`` and ``self._xs_input`` - - Arguments: - debug: - Defaults to False. If set to true, returns the ``xs_input`` and ``ys_input``. 
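# Hedged usage sketch of the accumulate -> calibrate -> to_layer flow described
# above (shapes only; real batches come from a prediction loop, and the import
# path mirrors how feature_config.py refers to these calibrator classes).
import numpy as np
from twml.contrib.calibrators import IsotonicCalibrator

calibrator = IsotonicCalibrator(n_bin=5)
predictions = np.random.RandomState(0).uniform(size=(64, 2))           # [n, d] class scores
targets = (np.random.RandomState(1).uniform(size=(64, 2)) < 0.5) * 1.0  # [n, d] labels
weights = np.ones(64)

calibrator.accumulate(predictions, targets, weights)  # call once per batch
calibrator.calibrate()
isotonic_layer = calibrator.to_layer()                # twml.layers.Isotonic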
- ''' - super(IsotonicCalibrator, self).calibrate() - bias_temp = [] - weight_temp = [] - logging.info("Beginning isotonic calibration.") - isotonic_features_dict = self._isotonic_feature_dict - for class_id in isotonic_features_dict: - bpreds, rpreds = isotonic_features_dict[class_id].calibrate(bins=self._n_bin, similar_bins=self._similar_bins) - weight_temp.append(bpreds) - bias_temp.append(rpreds) - # save isotonic results onto a matrix - self._xs_input = np.array(weight_temp, dtype=np.float32) - self._ys_input = np.array(bias_temp, dtype=np.float32) - logging.info("Isotonic calibration finished.") - if debug: - return np.array(weight_temp), np.array(bias_temp) - return None - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory. Default (string): "default". - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - logging.info("You probably do not need to save the isotonic layer. \ - So feel free to set save to False in the Trainer. \ - Additionally this only saves the layer not the whole graph.") - - def calibrator_module(): - ''' - Way to save Isotonic layer - ''' - # The input to isotonic is a dense layer - inputs = tf.placeholder(tf.float32) - calibrator_layer = self.to_layer() - output = calibrator_layer(inputs) - # creates the signature to the calibrator module - hub.add_signature(inputs=inputs, outputs=output, name=name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - def to_layer(self): - """ Returns a twml.layers.Isotonic Layer that can be used for feature discretization. - """ - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - isotonic_layer = twml.layers.Isotonic( - n_unit=self._xs_input.shape[0], n_bin=self._xs_input.shape[1], - xs_input=self._xs_input, ys_input=self._ys_input, - **self._kwargs) - - return isotonic_layer - - def get_layer_args(self, name=None): - """ Returns layer args. See ``Calibrator.get_layer_args`` for more detailed documentation """ - return {'n_unit': self._xs_input.shape[0], 'n_bin': self._xs_input.shape[1]} diff --git a/twml/twml/contrib/calibrators/mdl.docx b/twml/twml/contrib/calibrators/mdl.docx new file mode 100644 index 000000000..90852a401 Binary files /dev/null and b/twml/twml/contrib/calibrators/mdl.docx differ diff --git a/twml/twml/contrib/calibrators/mdl.py b/twml/twml/contrib/calibrators/mdl.py deleted file mode 100644 index 0fe3265a4..000000000 --- a/twml/twml/contrib/calibrators/mdl.py +++ /dev/null @@ -1,118 +0,0 @@ -# pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains MDLFeature and MDLCalibrator used for MDL calibration ''' - - -import os - -from .percentile_discretizer import PercentileDiscretizerCalibrator, PercentileDiscretizerFeature - -from absl import logging -import numpy as np -import tensorflow.compat.v1 as tf -import twml -import twml.layers - - -DEFAULT_SAMPLE_WEIGHT = 1 - - -class MDLFeature(PercentileDiscretizerFeature): - ''' Accumulates and calibrates a single sparse MDL feature. 
''' - - -class MDLCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for MDL calibration. - Internally, each feature's values is accumulated via its own ``MDLFeature`` object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into MDL bin_vals by calling ``calibrate()``; and - 3. convert to a twml.layers.MDL layer by calling ``to_layer()``. - - ''' - - def to_layer(self, name=None): - """ - Returns a twml.layers.PercentileDiscretizer Layer - that can be used for feature discretization. - - Arguments: - name: - name-scope of the PercentileDiscretizer layer - """ - n_feature = len(self._discretizer_feature_dict) - max_discretizer_feature = n_feature * (self._n_bin + 1) - - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - if self._bin_ids.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_ids.shape[0] \ - != len(self._discretizer_feature_dict)") - if self._bin_vals.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_vals.shape[0] \ - != len(self._discretizer_feature_dict)") - - # can add at most #features * (n_bin+1) new feature ids - if 2**self._out_bits <= max_discretizer_feature: - raise ValueError("""Maximum number of features created by discretizer is - %d but requested that the output be limited to %d values (%d bits), - which is smaller than that. Please ensure the output has enough bits - to represent at least the new features""" - % (max_discretizer_feature, 2**self._out_bits, self._out_bits)) - - # build feature_offsets, hash_map_keys, hash_map_values - feature_offsets = np.arange(0, max_discretizer_feature, - self._n_bin + 1, dtype='int64') - hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) - hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) - - discretizer = twml.layers.MDL( - n_feature=n_feature, n_bin=self._n_bin, - name=name, out_bits=self._out_bits, - hash_keys=hash_map_keys, hash_values=hash_map_values, - bin_ids=self._bin_ids.flatten(), bin_values=self._bin_vals.flatten(), - feature_offsets=feature_offsets, - **self._kwargs - ) - - return discretizer - - def save(self, save_dir, name='calibrator', verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory - name: - name for the graph scope. Passed to to_layer(name=name) to set - scope of layer. 
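# Worked example of the capacity check and feature_offsets construction in
# to_layer() above: 3 calibrated features with n_bin=4 produce 3 * (4 + 1) = 15
# output ids, so out_bits=4 (16 ids) is just enough.
import numpy as np

n_feature, n_bin, out_bits = 3, 4, 4
max_discretizer_feature = n_feature * (n_bin + 1)   # 15
assert 2 ** out_bits > max_discretizer_feature      # 16 > 15

feature_offsets = np.arange(0, max_discretizer_feature, n_bin + 1, dtype='int64')
# -> [0, 5, 10]: each feature's bin ids start at its own offset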
- ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - layer_args = self.get_layer_args() - - calibrator_filename = os.path.join(save_dir, name + '.json.tf') - calibrator_dict = { - 'layer_args': layer_args, - 'saved_layer_scope': name + '/', - } - twml.write_file(calibrator_filename, calibrator_dict, encode='json') - - if verbose: - logging.info("The layer graph and other information necessary ") - logging.info("for multi-phase training is saved in directory:") - logging.info(save_dir) - logging.info("This directory can be specified as --init_from_dir argument.") - logging.info("") - logging.info("Other information is available in: %s.json.tf", name) - logging.info("This file can be loaded with twml.read_file(decode='json) to obtain ") - logging.info("layer_args, saved_layer_scope and variable_names") - - graph = tf.Graph() - # save graph for tensorboard as well - writer = tf.summary.FileWriter(logdir=save_dir, graph=graph) - - with tf.Session(graph=graph) as sess: - self.write_summary(writer, sess) - writer.flush() diff --git a/twml/twml/contrib/calibrators/percentile_discretizer.docx b/twml/twml/contrib/calibrators/percentile_discretizer.docx new file mode 100644 index 000000000..548b0c7a6 Binary files /dev/null and b/twml/twml/contrib/calibrators/percentile_discretizer.docx differ diff --git a/twml/twml/contrib/calibrators/percentile_discretizer.py b/twml/twml/contrib/calibrators/percentile_discretizer.py deleted file mode 100644 index eefce62c2..000000000 --- a/twml/twml/contrib/calibrators/percentile_discretizer.py +++ /dev/null @@ -1,577 +0,0 @@ -# pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains PercentileDiscretizerFeature and PercentileDiscretizerCalibrator used \ - for PercentileDiscretizer calibration ''' - - - -from .calibrator import CalibrationFeature, Calibrator - -import os -import numpy as np -import tensorflow.compat.v1 as tf -import tensorflow_hub as hub -import twml -import twml.layers - - -DEFAULT_SAMPLE_WEIGHT = 1 - - -class PercentileDiscretizerFeature(CalibrationFeature): - ''' Accumulates and calibrates a single sparse PercentileDiscretizer feature. ''' - - @staticmethod - def _gather_debug_info(values, indices, bin_vals, bin_counts_buffer): - ''' - Determine how many training values fell into a given bin during calibration. - This is calculated by finding the index of the first appearance of each bin - boundary in values (values may repeat, so that isn't trivially in indices.) - Subtracting each bin boundary index from the next tells you how many values fall in - that bin. - To get this to calculate the last bin correctly, len(values) is appended to the - list of bound indices. - - This assumes that ``bin_vals`` excludes np.inf bin boundaries when - PercentileDiscretizer was calibrated - with fewer values than bins. - - Arguments: - values: - 1D ndarray of the PercentileDiscretizerFeature's accumulated values, sorted ascending - indices: - 1D int32 ndarray of the indices (in values) of the bin boundaries - bin_vals: - 1D ndarray containing the bin boundaries - bin_counts_buffer: - ndarray buffer for returning the PercentileDiscretizer histogram - ''' - # np.flatnonzero(np.diff(x)) gives you the indices i in x s.t. 
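# Quick illustration of the idiom referenced in the comment above:
# np.flatnonzero(np.diff(x)) returns the indices i where x[i] != x[i + 1].
import numpy as np

x = np.array([1, 1, 2, 2, 2, 3])
np.flatnonzero(np.diff(x))  # -> array([1, 4])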
x[i] != x[i+1] - # append index of the last bin since that cannot be empty with how - # PercentileDiscretizer is implemented - nonempty_bins = np.append(np.flatnonzero(np.diff(bin_vals)), len(bin_vals) - 1) - bin_start_indices = indices.take(nonempty_bins) - - # if multiples of a bin's lower bound value exist, find the first one - for (i, idx) in enumerate(bin_start_indices): - cur_idx = idx - while cur_idx > 0 and values[cur_idx] == values[cur_idx - 1]: - bin_start_indices[i] = cur_idx = cur_idx - 1 - - # the end of each bin is the start of the next bin, - # until the last, which is the end of the array - # broadcast the counts to the nonempty bins, 0 otherwise - bin_counts_buffer[:] = 0 - bin_counts_buffer[nonempty_bins] = np.diff(np.append(bin_start_indices, values.size)) - - def calibrate( - self, - bin_vals, percentiles, percentile_indices, - bin_counts_buffer=None): - '''Calibrates the PercentileDiscretizerFeature into bin values for - use in PercentileDiscretizerCalibrator. - Note that this method can only be called once. - - Arguments: - bin_vals: - Row in the PercentileDiscretizerCalibrator.bin_vals matrix corresponding to this feature. - Will be updated with the results of the calibration. - A 1D ndarray. - percentiles: - 1D array of size n_bin with values ranging from 0 to 1. - For example, ``percentiles = np.linspace(0, 1, num=self._n_bin+1, dtype=np.float32)`` - percentile_indices: - Empty 1D array of size n_bin used to store intermediate results when - calling twml.twml_optim_nearest_interpolation(). - For example, np.empty(self._n_bin + 1, dtype=np.float32). - bin_counts_buffer: - optional ndarray buffer used for retaining count of values per PercentileDiscretizer - bucket (for debug and feature exploration purposes) - - Returns: - calibrated bin_vals for use by ``PercentileDiscretizerCalibrator`` - ''' - if self._calibrated: - raise RuntimeError("Can only calibrate once") - if bin_vals.ndim != 1: - raise RuntimeError("Expecting bin_vals row") - - # # concatenate values and weights buffers - self._concat_arrays() - feature_values = self._features_dict['values'] - feature_weights = self._features_dict['weights'] - - # get features ready for the bins, order array indices by feature values. 
- indices = np.argsort(feature_values) - - # get ordered values and weights using array indices - values = feature_values.take(indices) - weights = feature_weights.take(indices) - - # Normalizes the sum of weights to be between 0 and 1 - weights = np.cumsum(weights, out=feature_weights) - weights -= weights[0] - if weights[-1] > 0: # prevent zero-division - weights /= weights[-1] - - # Check if we have less values than bin_vals - if values.size < bin_vals.size: - # Fills all the bins with a value that won't ever be reached - bin_vals.fill(np.inf) - # Forces the first to be -inf - bin_vals[0] = -np.inf - # Copies the values as boundaries - bin_vals[1:values.size + 1] = values - - if bin_counts_buffer is not None: - # slice out bins with +/-np.inf boundary -- their count will be zero anyway - # we can't just assume all other bins will have 1 value since there can be dups - short_indices = np.arange(values.size, dtype=np.int32) - bin_counts_buffer.fill(0) - self._gather_debug_info( - values, short_indices, bin_vals[1:values.size + 1], - bin_counts_buffer[1:values.size + 1]) - - else: - # Gets the indices for the values that define the boundary for the bins - indices_float = np.arange(0, weights.size, dtype=np.float32) - - # Gets things in the correct shape for the linear interpolation - weights = weights.reshape(1, weights.size) - indices_float = indices_float.reshape(1, weights.size) - - # wrap ndarrays into twml.Array - percentiles_tarray = twml.Array(percentiles.reshape(percentiles.size, 1)) - weights_tarray = twml.Array(weights) - indices_float_tarray = twml.Array(indices_float) - percentile_indices_tarray = twml.Array(percentile_indices.reshape(percentiles.size, 1)) - - # Performs the binary search to find the indices corresponding to the percentiles - err = twml.CLIB.twml_optim_nearest_interpolation( - percentile_indices_tarray.handle, percentiles_tarray.handle, # output, input - weights_tarray.handle, indices_float_tarray.handle # xs, ys - ) - if err != 1000: - raise ValueError("""twml.CLIB.twml_optim_nearest_interpolation - caught an error (see previous stdout). Error code: """ % err) - - indices = indices[:bin_vals.size] - indices[:] = percentile_indices - indices[0] = 0 - indices[-1] = weights.size - 1 - - # Gets the values at those indices and copies them into bin_vals - values.take(indices, out=bin_vals) - - # get # of values per bucket - if bin_counts_buffer is not None: - self._gather_debug_info(values, indices, bin_vals, bin_counts_buffer) - - self._calibrated = True - - -class PercentileDiscretizerCalibrator(Calibrator): - ''' Accumulates features and their respective values for PercentileDiscretizer calibration. - Internally, each feature's values is accumulated via its own - ``PercentileDiscretizerFeature`` object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into PercentileDiscretizer bin_vals by calling ``calibrate()``; and - 3. convert to a twml.layers.PercentileDiscretizer layer by calling ``to_layer()``. - - ''' - - def __init__(self, n_bin, out_bits, bin_histogram=True, - allow_empty_calibration=False, **kwargs): - ''' Constructs an PercentileDiscretizerCalibrator instance. - - Arguments: - n_bin: - the number of bins per feature to use for PercentileDiscretizer. - Note that each feature actually maps to n_bin+1 output IDs. - out_bits: - The maximum number of bits to use for the output IDs. - 2**out_bits must be greater than bin_ids.size or an error is raised. 
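# Rough numpy analogue of the boundary selection in
# PercentileDiscretizerFeature.calibrate() above (assumption: np.interp does
# linear interpolation, whereas twml.CLIB.twml_optim_nearest_interpolation is
# nearest-neighbour; the idea is the same).
import numpy as np

values = np.sort(np.random.RandomState(0).exponential(size=1000)).astype(np.float32)
weights = np.ones_like(values)

cdf = np.cumsum(weights)
cdf = (cdf - cdf[0]) / (cdf[-1] - cdf[0])            # normalized cumulative weight in [0, 1]

n_bin = 4
percentiles = np.linspace(0, 1, num=n_bin + 1)
idx = np.round(np.interp(percentiles, cdf, np.arange(values.size))).astype(int)
idx[0], idx[-1] = 0, values.size - 1                 # force the extreme boundaries
bin_vals = values[idx]                               # approximate bin boundaries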
- bin_histogram: - When True (the default), gathers information during calibration - to build a bin_histogram. - allow_empty_calibration: - allows operation where we might not calibrate any features. - Default False to error out if no features were calibrated. - Typically, values of uncalibrated features pass through discretizers - untouched (though the feature ids will be truncated to obey out_bits). - ''' - super(PercentileDiscretizerCalibrator, self).__init__(**kwargs) - self._n_bin = n_bin - self._out_bits = out_bits - - self._bin_ids = None - self._bin_vals = np.empty(0, dtype=np.float32) # Note changed from 64 (v1) to 32 (v2) - - self._bin_histogram = bin_histogram - self._bin_histogram_dict = None - - self._hash_map_counter = 0 - self._hash_map = {} - - self._discretizer_feature_dict = {} - self._allow_empty_calibration = allow_empty_calibration - - @property - def bin_ids(self): - ''' - Gets bin_ids - ''' - return self._bin_ids - - @property - def bin_vals(self): - ''' - Gets bin_vals - ''' - return self._bin_vals - - @property - def hash_map(self): - ''' - Gets hash_map - ''' - return self._hash_map - - @property - def discretizer_feature_dict(self): - ''' - Gets feature_dict - ''' - return self._discretizer_feature_dict - - def accumulate_features(self, inputs, name): - ''' - Wrapper around accumulate for PercentileDiscretizer. - Arguments: - inputs: - batch that will be accumulated - name: - name of the tensor that will be accumulated - - ''' - sparse_tf = inputs[name] - indices = sparse_tf.indices[:, 1] - ids = sparse_tf.indices[:, 0] - weights = np.take(inputs["weights"], ids) - return self.accumulate(indices, sparse_tf.values, weights) - - def accumulate_feature(self, output): - ''' - Wrapper around accumulate for trainer API. - Arguments: - output: - output of prediction of build_graph for calibrator - ''' - return self.accumulate(output['feature_ids'], output['feature_values'], output['weights']) - - def accumulate(self, feature_keys, feature_vals, weights=None): - '''Accumulate a single batch of feature keys, values and weights. - - These are accumulate until ``calibrate()`` is called. - - Arguments: - feature_keys: - 1D int64 array of feature keys. - feature_vals: - 1D float array of feature values. Each element of this array - maps to the commensurate element in ``feature_keys``. - weights: - Defaults to weights of 1. - 1D array containing the weights of each feature key, value pair. - Typically, this is the weight of each sample (but you still need - to provide one weight per key,value pair). - Each element of this array maps to the commensurate element in feature_keys. 
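# Hedged usage sketch of accumulate() as documented above; keys, values and
# weights are parallel 1D arrays, one entry per (feature id, value) pair.
import numpy as np
from twml.contrib.calibrators import PercentileDiscretizerCalibrator

calibrator = PercentileDiscretizerCalibrator(n_bin=2, out_bits=22)
calibrator.accumulate(
    feature_keys=np.array([101, 101, 101, 202, 202], dtype=np.int64),
    feature_vals=np.array([0.1, 0.7, 0.9, 1.5, 3.0], dtype=np.float32),
    weights=np.ones(5, dtype=np.float32),
)
# ... repeat per batch, then calibrator.calibrate() and calibrator.to_layer()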
- ''' - if feature_keys.ndim != 1: - raise ValueError('Expecting 1D feature_keys, got %dD' % feature_keys.ndim) - if feature_vals.ndim != 1: - raise ValueError('Expecting 1D feature_values, got %dD' % feature_vals.ndim) - if feature_vals.size != feature_keys.size: - raise ValueError( - 'Expecting feature_keys.size == feature_values.size, got %d != %d' % - (feature_keys.size, feature_vals.size)) - if weights is not None: - weights = np.squeeze(weights) - if weights.ndim != 1: - raise ValueError('Expecting 1D weights, got %dD' % weights.ndim) - elif weights.size != feature_keys.size: - raise ValueError( - 'Expecting feature_keys.size == weights.size, got %d != %d' % - (feature_keys.size, weights.size)) - if weights is None: - weights = np.full(feature_vals.size, fill_value=DEFAULT_SAMPLE_WEIGHT) - unique_keys = np.unique(feature_keys) - for feature_id in unique_keys: - idx = np.where(feature_keys == feature_id) - if feature_id not in self._discretizer_feature_dict: - self._hash_map[feature_id] = self._hash_map_counter - # unlike v1, the hash_map_counter is incremented AFTER assignment. - # This makes the hash_map features zero-indexed: 0, 1, 2 instead of 1, 2, 3 - self._hash_map_counter += 1 - # creates a new cache if we never saw the feature before - discretizer_feature = PercentileDiscretizerFeature(feature_id) - self._discretizer_feature_dict[feature_id] = discretizer_feature - else: - discretizer_feature = self._discretizer_feature_dict[feature_id] - discretizer_feature.add_values({'values': feature_vals[idx], 'weights': weights[idx]}) - - def calibrate(self, debug=False): - ''' - Calibrates each PercentileDiscretizer feature after accumulation is complete. - - Arguments: - debug: - Boolean to request debug info be returned by the method. - (see Returns section below) - - The calibration results are stored in two matrices: - bin_ids: - 2D array of size number of accumulate ``features x n_bin+1``. - Contains the new IDs generated by PercentileDiscretizer. Each row maps to a feature. - Each row maps to different value bins. The IDs - are in the range ``1 -> bin_ids.size+1`` - bin_vals: - 2D array of the same size as bin_ids. - Each row maps to a feature. Each row contains the bin boundaries. - These boundaries represent feature values. - - Returns: - if debug is True, the method returns - - - 1D int64 array of feature_ids - - 2D float32 array copy of bin_vals (the bin boundaries) for each feature - - 2D int64 array of bin counts corresponding to the bin boundaries - - ''' - n_feature = len(self._discretizer_feature_dict) - if n_feature == 0 and not self._allow_empty_calibration: - raise RuntimeError("Need to accumulate some features for calibration\n" - "Likely, the calibration data is empty. 
This can\n" - "happen if the dataset is small, or if the following\n" - "cli args are set too low:\n" - " --discretizer_keep_rate (default=0.0008)\n" - " --discretizer_parts_downsampling_rate (default=0.2)\n" - "Consider increasing the values of these args.\n" - "To allow empty calibration data (and degenerate discretizer),\n" - "use the allow_empty_calibration input of the constructor.") - - self._bin_ids = np.arange(1, n_feature * (self._n_bin + 1) + 1) - self._bin_ids = self._bin_ids.reshape(n_feature, self._n_bin + 1) - - self._bin_vals.resize(n_feature, self._n_bin + 1) - - # buffers shared by PercentileDiscretizerFeature.calibrate() - percentile_indices = np.empty(self._n_bin + 1, dtype=np.float32) - - # Tensor from 0 to 1 in the number of steps provided - percentiles = np.linspace(0, 1, num=self._n_bin + 1, dtype=np.float32) - - if debug or self._bin_histogram: - debug_feature_ids = np.empty(n_feature, dtype=np.int64) - bin_counts = np.empty((n_feature, self._n_bin + 1), dtype=np.int64) - - # progress bar for calibration phase - progress_bar = tf.keras.utils.Progbar(n_feature) - - discretizer_features_dict = self._discretizer_feature_dict - for i, feature_id in enumerate(discretizer_features_dict): - if debug or self._bin_histogram: - debug_feature_ids[self._hash_map[feature_id]] = feature_id - bin_counts_buffer = bin_counts[self._hash_map[feature_id]] - else: - bin_counts_buffer = None - - # calibrate each PercentileDiscretizer feature (puts results in bin_vals) - discretizer_features_dict[feature_id].calibrate( - self._bin_vals[self._hash_map[feature_id]], # Gets feature-values - percentiles, percentile_indices, - bin_counts_buffer=bin_counts_buffer - ) - - # update progress bar 20 times - if (i % max(1.0, round(n_feature / 20)) == 0) or (i == n_feature - 1): - progress_bar.update(i + 1) - - super(PercentileDiscretizerCalibrator, self).calibrate() - - if self._bin_histogram: - # save bin histogram data for later - self._bin_histogram_dict = { - 'feature_ids': debug_feature_ids, - 'bin_counts': bin_counts, - 'bin_vals': self._bin_vals, - 'out_bits': self._out_bits, - } - - if debug: - return debug_feature_ids, self._bin_vals.copy(), bin_counts - - return None - - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - return twml.layers.PercentileDiscretizer( - n_feature=n_feature, - n_bin=self._n_bin, - out_bits=self._out_bits, - bin_values=self._bin_vals.flatten(), - hash_keys=hash_map_keys, - hash_values=hash_map_values.astype(np.int64), - bin_ids=self._bin_ids.flatten().astype(np.int64), - feature_offsets=feature_offsets, - name=name, - **self._kwargs - ) - - def to_layer(self, name=None): - """ - Returns a twml.layers.PercentileDiscretizer Layer - that can be used for feature discretization. 
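# Layout of the bin_ids matrix built above: with 2 calibrated features and
# n_bin=2, ids 1..6 are assigned row by row, one row per feature.
import numpy as np

n_feature, n_bin = 2, 2
bin_ids = np.arange(1, n_feature * (n_bin + 1) + 1).reshape(n_feature, n_bin + 1)
# -> [[1, 2, 3],
#     [4, 5, 6]]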
- - Arguments: - name: - name-scope of the PercentileDiscretizer layer - """ - n_feature = len(self._discretizer_feature_dict) - max_discretizer_feature = n_feature * (self._n_bin + 1) - - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - if self._bin_ids.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_ids.shape[0] \ - != len(self._discretizer_feature_dict)") - if self._bin_vals.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_vals.shape[0] \ - != len(self._discretizer_feature_dict)") - - # can add at most #features * (n_bin+1) new feature ids - if 2**self._out_bits <= max_discretizer_feature: - raise ValueError("""Maximum number of features created by discretizer is - %d but requested that the output be limited to %d values (%d bits), - which is smaller than that. Please ensure the output has enough bits - to represent at least the new features""" - % (max_discretizer_feature, 2**self._out_bits, self._out_bits)) - - # build feature_offsets, hash_map_keys, hash_map_values - feature_offsets = np.arange(0, max_discretizer_feature, - self._n_bin + 1, dtype='int64') - hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) - hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) - - discretizer = self._create_discretizer_layer(n_feature, hash_map_keys, - hash_map_values, feature_offsets, name) - - return discretizer - - def get_layer_args(self): - ''' - Returns layer arguments required to implement multi-phase training. - See twml.calibrator.Calibrator.get_layer_args for more detailed documentation. - ''' - layer_args = { - 'n_feature': len(self._discretizer_feature_dict), - 'n_bin': self._n_bin, - 'out_bits': self._out_bits, - } - - return layer_args - - def add_hub_signatures(self, name): - """ - Add Hub Signatures for each calibrator - - Arguments: - name: - Calibrator name - """ - sparse_tf = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - hub.add_signature( - inputs=sparse_tf, - outputs=calibrator_layer(sparse_tf, keep_inputs=False), - name=name) - - def write_summary(self, writer, sess=None): - """ - This method is called by save() to write a histogram of - PercentileDiscretizer feature bins to disk. A histogram is included for each - feature. - - Arguments: - writer: - tf.summary.FilteWriter instance. - used to add summaries to event files for inclusion in tensorboard. - sess: - tf.Session instance. Used to produces summaries for the writer. - """ - bin_counts_ph = tf.placeholder(tf.int64) - bin_counts = self._bin_histogram_dict['bin_counts'] - - # Record that distribution into a histogram summary - histo = tf.summary.histogram("discretizer_feature_bin_counts", bin_counts_ph) - for i in range(bin_counts.shape[0]): - bin_counts_summary = sess.run(histo, feed_dict={bin_counts_ph: bin_counts[i]}) - writer.add_summary(bin_counts_summary, global_step=i) - - def write_summary_json(self, save_dir, name="default"): - """ - Export bin information to HDFS. - - Arguments: - save_dir: - name of the saving directory. - name: - prefix of the saved hub signature. Default (string): "default". - """ - # Since the size is small: (# of bins) * (# of features), we always dump the file. 
- discretizer_export_bin_filename = os.path.join(save_dir, name + '_bin.json') - discretizer_export_bin_dict = { - 'feature_ids': self._bin_histogram_dict['feature_ids'].tolist(), - 'bin_boundaries': self._bin_histogram_dict['bin_vals'].tolist(), - 'output_bits': self._bin_histogram_dict['out_bits'] - } - twml.write_file(discretizer_export_bin_filename, discretizer_export_bin_dict, encode='json') - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory using TF Hub. - Arguments: - save_dir: - name of the saving directory. - name: - prefix of the saved hub signature. Default (string): "default". - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - inputs = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - # creates the signature to the calibrator module - hub.add_signature( - inputs=inputs, - outputs=calibrator_layer(inputs, keep_inputs=False), - name=name) - # and another signature for keep_inputs mode - hub.add_signature( - inputs=inputs, - outputs=calibrator_layer(inputs, keep_inputs=True), - name=name + '_keep_inputs') - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - self.write_summary_json(save_dir, name) diff --git a/twml/twml/contrib/eventbus/input_fn.docx b/twml/twml/contrib/eventbus/input_fn.docx new file mode 100644 index 000000000..a36b666dd Binary files /dev/null and b/twml/twml/contrib/eventbus/input_fn.docx differ diff --git a/twml/twml/contrib/eventbus/input_fn.py b/twml/twml/contrib/eventbus/input_fn.py deleted file mode 100644 index c184d9434..000000000 --- a/twml/twml/contrib/eventbus/input_fn.py +++ /dev/null @@ -1,59 +0,0 @@ -from reader import EventBusPipedBinaryRecordReader -import tensorflow.compat.v1 as tf -import twml - - -""" -This module provides input function for DeepBird v2 training. -The training data records are loaded from an EventBus reader. -""" - - -def get_eventbus_data_record_generator(eventbus_reader): - """ - This module provides a data record generater from EventBus reader. - - Args: - eventbus_reader: EventBus reader - - Returns: - gen: Data record generater - """ - eventbus_reader.initialize() - counter = [0] - - def gen(): - while True: - record = eventbus_reader.read() - if eventbus_reader.debug: - tf.logging.warn("counter: {}".format(counter[0])) - with open('tmp_record_{}.bin'.format(counter[0]), 'wb') as f: - f.write(record) - counter[0] = counter[0] + 1 - yield record - return gen - - -def get_eventbus_data_record_dataset(eventbus_reader, parse_fn, batch_size): - """ - This module generates batch data for training from a data record generator. - """ - dataset = tf.data.Dataset.from_generator( - get_eventbus_data_record_generator(eventbus_reader), tf.string, tf.TensorShape([])) - return dataset.batch(batch_size).map(parse_fn, num_parallel_calls=4).prefetch(buffer_size=10) - - -def get_train_input_fn(feature_config, params, parse_fn=None): - """ - This module provides input function for DeepBird v2 training. - It gets batched training data from data record generator. 
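# Hedged usage sketch (feature_config and params are assumptions; params is
# expected to carry the EventBus reader settings and train_batch_size used in
# the implementation below): the returned closure lazily builds the batched,
# parsed tf.data.Dataset.
train_input_fn = get_train_input_fn(feature_config, params)
dataset = train_input_fn()  # tf.data.Dataset of parsed, batched data records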
- """ - eventbus_reader = EventBusPipedBinaryRecordReader( - params.jar_file, params.num_eb_threads, params.subscriber_id, - filter_str=params.filter_str, debug=params.debug) - - train_parse_fn = parse_fn or twml.parsers.get_sparse_parse_fn( - feature_config, ["ids", "keys", "values", "batch_size", "weights"]) - - return lambda: get_eventbus_data_record_dataset( - eventbus_reader, train_parse_fn, params.train_batch_size) diff --git a/twml/twml/contrib/eventbus/reader.docx b/twml/twml/contrib/eventbus/reader.docx new file mode 100644 index 000000000..f6567f277 Binary files /dev/null and b/twml/twml/contrib/eventbus/reader.docx differ diff --git a/twml/twml/contrib/eventbus/reader.py b/twml/twml/contrib/eventbus/reader.py deleted file mode 100644 index 2f8e2749e..000000000 --- a/twml/twml/contrib/eventbus/reader.py +++ /dev/null @@ -1,119 +0,0 @@ -import io -import logging -import subprocess -from threading import Lock - -""" -This module provides a binary data record reader for EventBus data. -It starts a EventBus subscriber in a separate process to receive EventBus streaming data. -The subscriber is supposed to outputs received data through PIPE to this module. -This module parses input and output binary data record to serve as a record reader. -""" - - -class BinaryRecordReader(object): - def initialize(self): - pass - - def read(self): - """Read raw bytes for one record - """ - raise NotImplementedError - - def close(self): - pass - - -class ReadableWrapper(object): - def __init__(self, internal): - self.internal = internal - - def __getattr__(self, name): - return getattr(self.internal, name) - - def readable(self): - return True - - -class EventBusPipedBinaryRecordReader(BinaryRecordReader): - - JAVA = '/usr/lib/jvm/java-11-twitter/bin/java' - RECORD_SEPARATOR_HEX = [ - 0x29, 0xd8, 0xd5, 0x06, 0x58, 0xcd, 0x4c, 0x29, - 0xb2, 0xbc, 0x57, 0x99, 0x21, 0x71, 0xbd, 0xff - ] - RECORD_SEPARATOR = ''.join([chr(i) for i in RECORD_SEPARATOR_HEX]) - RECORD_SEPARATOR_LENGTH = len(RECORD_SEPARATOR) - CHUNK_SIZE = 8192 - - def __init__(self, jar_file, num_eb_threads, subscriber_id, - filter_str=None, buffer_size=32768, debug=False): - self.jar_file = jar_file - self.num_eb_threads = num_eb_threads - self.subscriber_id = subscriber_id - self.filter_str = filter_str if filter_str else '""' - self.buffer_size = buffer_size - self.lock = Lock() - self._pipe = None - self._buffered_reader = None - self._bytes_buffer = None - - self.debug = debug - - def initialize(self): - if not self._pipe: - self._pipe = subprocess.Popen( - [ - self.JAVA, '-jar', self.jar_file, - '-subscriberId', self.subscriber_id, - '-numThreads', str(self.num_eb_threads), - '-dataFilter', self.filter_str, - '-debug' if self.debug else '' - ], - stdout=subprocess.PIPE - ) - self._buffered_reader = io.BufferedReader( - ReadableWrapper(self._pipe.stdout), self.buffer_size) - self._bytes_buffer = io.BytesIO() - else: - logging.warning('Already initialized') - - def _find_next_record(self): - tail = [''] - while True: - chunk = tail[0] + self._buffered_reader.read(self.CHUNK_SIZE) - index = chunk.find(self.RECORD_SEPARATOR) - if index < 0: - self._bytes_buffer.write(chunk[:-self.RECORD_SEPARATOR_LENGTH]) - tail[0] = chunk[-self.RECORD_SEPARATOR_LENGTH:] - else: - self._bytes_buffer.write(chunk[:index]) - return chunk[(index + self.RECORD_SEPARATOR_LENGTH):] - - def _read(self): - with self.lock: - remaining = self._find_next_record() - record = self._bytes_buffer.getvalue() - # clean up buffer - self._bytes_buffer.close() - 
self._bytes_buffer = io.BytesIO() - self._bytes_buffer.write(remaining) - - return record - - def read(self): - while True: - try: - return self._read() - except Exception as e: - logging.error("Error reading bytes for next record: {}".format(e)) - if self.debug: - raise - - def close(self): - try: - self._bytes_buffer.close() - self._buffered_reader.close() - self._pipe.terminate() - except Exception as e: - logging.error("Error closing reader: {}".format(e)) diff --git a/twml/twml/contrib/export/__init__.docx b/twml/twml/contrib/export/__init__.docx new file mode 100644 index 000000000..654a1bdce Binary files /dev/null and b/twml/twml/contrib/export/__init__.docx differ diff --git a/twml/twml/contrib/export/__init__.py b/twml/twml/contrib/export/__init__.py deleted file mode 100644 index 99892dcfa..000000000 --- a/twml/twml/contrib/export/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from . import export_fn # noqa: F401 -from . import exporters # noqa: F401 diff --git a/twml/twml/contrib/export/export_fn.docx b/twml/twml/contrib/export/export_fn.docx new file mode 100644 index 000000000..12893ee05 Binary files /dev/null and b/twml/twml/contrib/export/export_fn.docx differ diff --git a/twml/twml/contrib/export/export_fn.py b/twml/twml/contrib/export/export_fn.py deleted file mode 100644 index 6e59fff07..000000000 --- a/twml/twml/contrib/export/export_fn.py +++ /dev/null @@ -1,264 +0,0 @@ -""" -Functions for exporting models for different modes. -""" -from collections import OrderedDict -import os - -import tensorflow.compat.v1 as tf -from tensorflow.python.estimator.export import export -import twml -import yaml - - -def get_sparse_batch_supervised_input_receiver_fn(feature_config, keep_fields=None): - """Gets supervised_input_receiver_fn that decodes a BatchPredictionRequest as sparse tensors - with labels and weights as defined in feature_config. - This input_receiver_fn is required for exporting models with 'train' mode to be trained with - Java API - - Args: - feature_config (FeatureConfig): deepbird v2 feature config object - keep_fields (list): list of fields to keep - - Returns: - supervised_input_receiver_fn: input_receiver_fn used for train mode - """ - def supervised_input_receiver_fn(): - serialized_request = tf.placeholder(dtype=tf.uint8, name='request') - receiver_tensors = {'request': serialized_request} - - bpr = twml.contrib.readers.HashedBatchPredictionRequest(serialized_request, feature_config) - features = bpr.get_sparse_features() if keep_fields is None else bpr.get_features(keep_fields) - features['weights'] = bpr.weights - labels = bpr.labels - features, labels = bpr.apply_filter(features, labels) - - return export.SupervisedInputReceiver(features, labels, receiver_tensors) - - return supervised_input_receiver_fn - - -def update_build_graph_fn_for_train(build_graph_fn): - """Updates a build_graph_fn by inserting in graph output a serialized BatchPredictionResponse - similar to the export_output_fns for serving. - The key difference here is that - 1. We insert serialized BatchPredictionResponse in graph output with key 'prediction' instead of - creating an export_output object. This is because of the way estimators export model in 'train' - mode doesn't take custom export_output - 2. 
We only do it when `mode == 'train'` to avoid altering the graph when exporting - for 'infer' mode - - Args: - build_graph_fn (Callable): deepbird v2 build graph function - - Returns: - new_build_graph_fn: An updated build_graph_fn that inserts serialized BatchPredictResponse - to graph output when in 'train' mode - """ - def new_build_graph_fn(features, label, mode, params, config=None): - output = build_graph_fn(features, label, mode, params, config) - if mode == tf.estimator.ModeKeys.TRAIN: - output.update( - twml.export_output_fns.batch_prediction_continuous_output_fn(output)[ - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs - ) - return output - return new_build_graph_fn - - -def export_model_for_train_and_infer( - trainer, feature_config, keep_fields, export_dir, as_text=False): - """Function for exporting model with both 'train' and 'infer' mode. - - This means the exported saved_model.pb will contain two meta graphs, one with tag 'train' - and the other with tag 'serve', and it can be loaded in Java API with either tag depending on - the use case - - Args: - trainer (DataRecordTrainer): deepbird v2 DataRecordTrainer - feature_config (FeatureConfig): deepbird v2 feature config - keep_fields (list of string): list of field keys, e.g. - ('ids', 'keys', 'values', 'batch_size', 'total_size', 'codes') - export_dir (str): a directory (local or hdfs) to export model to - as_text (bool): if True, write 'saved_model.pb' as binary file, else write - 'saved_model.pbtxt' as human readable text file. Default False - """ - train_input_receiver_fn = get_sparse_batch_supervised_input_receiver_fn( - feature_config, keep_fields) - predict_input_receiver_fn = twml.parsers.get_sparse_serving_input_receiver_fn( - feature_config, keep_fields) - trainer._export_output_fn = twml.export_output_fns.batch_prediction_continuous_output_fn - trainer._build_graph_fn = update_build_graph_fn_for_train(trainer._build_graph_fn) - trainer._estimator._export_all_saved_models( - export_dir_base=export_dir, - input_receiver_fn_map={ - tf.estimator.ModeKeys.TRAIN: train_input_receiver_fn, - tf.estimator.ModeKeys.PREDICT: predict_input_receiver_fn - }, - as_text=as_text, - ) - - trainer.export_model_effects(export_dir) - - -def export_all_models_with_receivers(estimator, export_dir, - train_input_receiver_fn, - eval_input_receiver_fn, - predict_input_receiver_fn, - export_output_fn, - export_modes=('train', 'eval', 'predict'), - register_model_fn=None, - feature_spec=None, - checkpoint_path=None, - log_features=True): - """ - Function for exporting a model with train, eval, and infer modes. - - Args: - estimator: - Should be of type tf.estimator.Estimator. - You can get this from trainer using trainer.estimator - export_dir: - Directory to export the model. - train_input_receiver_fn: - Input receiver for train interface. - eval_input_receiver_fn: - Input receiver for eval interface. - predict_input_receiver_fn: - Input receiver for predict interface. - export_output_fn: - export_output_fn to be used for serving. - export_modes: - A list to Specify what modes to export. Can be "train", "eval", "predict". - Defaults to ["train", "eval", "predict"] - register_model_fn: - An optional function which is called with export_dir after models are exported. - Defaults to None. - Returns: - The timestamped directory the models are exported to. - """ - # TODO: Fix for hogwild / distributed training. 
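# Hedged usage sketch of export_model_for_train_and_infer above (the trainer and
# feature_config objects, the export directory, and the field list are
# assumptions; the field list mirrors the docstring's example).
export_model_for_train_and_infer(
    trainer=trainer,
    feature_config=feature_config,
    keep_fields=('ids', 'keys', 'values', 'batch_size', 'total_size', 'codes'),
    export_dir='/tmp/exported_model',  # hypothetical path, local or HDFS
    as_text=False,
)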
- - if export_dir is None: - raise ValueError("export_dir can not be None") - export_dir = twml.util.sanitize_hdfs_path(export_dir) - input_receiver_fn_map = {} - - if "train" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.TRAIN] = train_input_receiver_fn - - if "eval" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.EVAL] = eval_input_receiver_fn - - if "predict" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.PREDICT] = predict_input_receiver_fn - - export_dir = estimator._export_all_saved_models( - export_dir_base=export_dir, - input_receiver_fn_map=input_receiver_fn_map, - checkpoint_path=checkpoint_path, - ) - - if register_model_fn is not None: - register_model_fn(export_dir, feature_spec, log_features) - - return export_dir - - -def export_all_models(trainer, - export_dir, - parse_fn, - serving_input_receiver_fn, - export_output_fn=None, - export_modes=('train', 'eval', 'predict'), - feature_spec=None, - checkpoint=None, - log_features=True): - """ - Function for exporting a model with train, eval, and infer modes. - - Args: - trainer: - An object of type twml.trainers.Trainer. - export_dir: - Directory to export the model. - parse_fn: - The parse function used parse the inputs for train and eval. - serving_input_receiver_fn: - The input receiver function used during serving. - export_output_fn: - export_output_fn to be used for serving. - export_modes: - A list to Specify what modes to export. Can be "train", "eval", "predict". - Defaults to ["train", "eval", "predict"] - feature_spec: - A dictionary obtained from FeatureConfig.get_feature_spec() to serialize - as feature_spec.yaml in export_dir. - Defaults to None - Returns: - The timestamped directory the models are exported to. - """ - # Only export from chief in hogwild or distributed modes. - if trainer.params.get('distributed', False) and not trainer.estimator.config.is_chief: - tf.logging.info("Trainer.export_model ignored due to instance not being chief.") - return - - if feature_spec is None: - if getattr(trainer, '_feature_config') is None: - raise ValueError("feature_spec is set to None." - "Please pass feature_spec=feature_config.get_feature_spec() to the export_all_model function") - else: - feature_spec = trainer._feature_config.get_feature_spec() - - export_dir = twml.util.sanitize_hdfs_path(export_dir) - old_export_output_fn = trainer._export_output_fn - trainer._export_output_fn = export_output_fn - supervised_input_receiver_fn = twml.parsers.convert_to_supervised_input_receiver_fn(parse_fn) - if not checkpoint: - checkpoint = trainer.best_or_latest_checkpoint - - export_dir = export_all_models_with_receivers(estimator=trainer.estimator, - export_dir=export_dir, - train_input_receiver_fn=supervised_input_receiver_fn, - eval_input_receiver_fn=supervised_input_receiver_fn, - predict_input_receiver_fn=serving_input_receiver_fn, - export_output_fn=export_output_fn, - export_modes=export_modes, - register_model_fn=trainer.export_model_effects, - feature_spec=feature_spec, - checkpoint_path=checkpoint, - log_features=log_features) - trainer._export_output_fn = old_export_output_fn - return export_dir - - -def export_feature_spec(dir_path, feature_spec_dict): - """ - Exports a FeatureConfig.get_feature_spec() dict to /feature_spec.yaml. 
- """ - def ordered_dict_representer(dumper, data): - return dumper.represent_mapping('tag:yaml.org,2002:map', data.items()) - - try: - # needed for Python 2 - yaml.add_representer(str, yaml.representer.SafeRepresenter.represent_str) - yaml.add_representer(unicode, yaml.representer.SafeRepresenter.represent_unicode) - except NameError: - # 'unicode' type doesn't exist on Python 3 - # PyYAML handles unicode correctly in Python 3 - pass - - yaml.add_representer(OrderedDict, ordered_dict_representer) - - fbase = "feature_spec.yaml" - fname = fbase.encode('utf-8') if type(dir_path) != str else fbase - file_path = os.path.join(dir_path, fname) - with tf.io.gfile.GFile(file_path, mode='w') as f: - yaml.dump(feature_spec_dict, f, default_flow_style=False, allow_unicode=True) - tf.logging.info("Exported feature spec to %s" % file_path) - - return file_path - - -# Keep the alias for compatibility. -get_supervised_input_receiver_fn = twml.parsers.convert_to_supervised_input_receiver_fn diff --git a/twml/twml/contrib/export/exporters.docx b/twml/twml/contrib/export/exporters.docx new file mode 100644 index 000000000..0733f2881 Binary files /dev/null and b/twml/twml/contrib/export/exporters.docx differ diff --git a/twml/twml/contrib/export/exporters.py b/twml/twml/contrib/export/exporters.py deleted file mode 100644 index 122955cbc..000000000 --- a/twml/twml/contrib/export/exporters.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -Wrappers around tf.estimator.Exporters to export models and save checkpoints. -""" -import os - -import tensorflow.compat.v1 as tf -from tensorflow.python.estimator import exporter -import twml - - -class _AllSavedModelsExporter(tf.estimator.Exporter): - """Internal exporter class to be used for exporting models for different modes.""" - - def __init__(self, - name, - input_receiver_fn_map, - backup_checkpoints, - assets_extra=None, - as_text=False): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - assets_extra: Additional assets to be included in the exported model. - as_text: Specifies if the exported model should be in a human readable text format. - """ - self._name = name - self._input_receiver_fn_map = input_receiver_fn_map - self._backup_checkpoints = backup_checkpoints - self._assets_extra = assets_extra - self._as_text = as_text - - @property - def name(self): - return self._name - - def export(self, estimator, export_path, checkpoint_path, eval_result, - is_the_final_export): - del is_the_final_export - - export_path = twml.util.sanitize_hdfs_path(export_path) - checkpoint_path = twml.util.sanitize_hdfs_path(checkpoint_path) - - if self._backup_checkpoints: - backup_path = os.path.join(export_path, "checkpoints") - # Ensure backup_path is created. makedirs passes if dir already exists. - tf.io.gfile.makedirs(backup_path) - twml.util.backup_checkpoint(checkpoint_path, backup_path, empty_backup=False) - - export_result = estimator.experimental_export_all_saved_models( - export_path, - self._input_receiver_fn_map, - assets_extra=self._assets_extra, - as_text=self._as_text, - checkpoint_path=checkpoint_path) - - return export_result - - -class BestExporter(tf.estimator.BestExporter): - """ - This class inherits from tf.estimator.BestExporter with the following differences: - - It also creates a backup of the best checkpoint. 
- - It can export the model for multiple modes. - - A backup / export is performed every time the evaluated metric is better - than that of previous models. - """ - - def __init__(self, - name='best_exporter', - input_receiver_fn_map=None, - backup_checkpoints=True, - event_file_pattern='eval/*.tfevents.*', - compare_fn=exporter._loss_smaller, - assets_extra=None, - as_text=False, - exports_to_keep=5): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - - Note: - Check the following documentation for more information about the remaining args: - https://www.tensorflow.org/api_docs/python/tf/estimator/BestExporter - """ - serving_input_receiver_fn = input_receiver_fn_map.get(tf.estimator.ModeKeys.PREDICT) - - super(BestExporter, self).__init__( - name, serving_input_receiver_fn, event_file_pattern, compare_fn, - assets_extra, as_text, exports_to_keep) - - if not hasattr(self, "_saved_model_exporter"): - raise AttributeError( - "_saved_model_exporter needs to exist for this exporter to work." - " This is potentially broken because of an internal change in Tensorflow") - - # Override the saved_model_exporter with _AllSavedModelsExporter - self._saved_model_exporter = _AllSavedModelsExporter( - name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text) - - -class LatestExporter(tf.estimator.LatestExporter): - """ - This class inherits from tf.estimator.LatestExporter with the following differences: - - It also creates a backup of the latest checkpoint. - - It can export the model for multiple modes. - - A backup / export is performed every time the exporter runs, using the latest checkpoint. - """ - - def __init__(self, - name='latest_exporter', - input_receiver_fn_map=None, - backup_checkpoints=True, - assets_extra=None, - as_text=False, - exports_to_keep=5): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - - Note: - Check the following documentation for more information about the remaining args: - https://www.tensorflow.org/api_docs/python/tf/estimator/LatestExporter - """ - serving_input_receiver_fn = input_receiver_fn_map.get(tf.estimator.ModeKeys.PREDICT) - - super(LatestExporter, self).__init__( - name, serving_input_receiver_fn, assets_extra, as_text, exports_to_keep) - - if not hasattr(self, "_saved_model_exporter"): - raise AttributeError( - "_saved_model_exporter needs to exist for this exporter to work." 
- " This is potentially broken because of an internal change in Tensorflow") - - # Override the saved_model_exporter with SaveAllmodelsexporter - self._saved_model_exporter = _AllSavedModelsExporter( - name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text) diff --git a/twml/twml/contrib/feature_config.docx b/twml/twml/contrib/feature_config.docx new file mode 100644 index 000000000..067c2e259 Binary files /dev/null and b/twml/twml/contrib/feature_config.docx differ diff --git a/twml/twml/contrib/feature_config.py b/twml/twml/contrib/feature_config.py deleted file mode 100644 index 833695751..000000000 --- a/twml/twml/contrib/feature_config.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Feature configuration for DeepBird jobs returns dictionary of sparse and dense Features -""" -from twitter.deepbird.io.legacy.contrib import feature_config -import twml - - -class FeatureConfig(feature_config.FeatureConfig): - def get_feature_spec(self): - """ - Generates a serialization-friendly dict representing this FeatureConfig. - """ - doc = super(FeatureConfig, self).get_feature_spec() - - # Override the class in the spec. - doc["class"] = "twml.contrib.FeatureConfig" - - return doc - - -class FeatureConfigBuilder(feature_config.FeatureConfigBuilder): - # Overwrite self.build() to return twml.FeatureConfig instead - def build(self): - """ - Returns an instance of FeatureConfig with the features passed to the FeatureConfigBuilder. - """ - - ( - keep_tensors, - keep_sparse_tensors, - feature_map, - features_add, - feature_name_to_feature_parser, - feature_in_bq_name, - ) = self._build() - - discretize_dict = {} - for config in self._sparse_extraction_configs: - if config.discretize_num_bins and config.discretize_output_size_bits: - if config.discretize_type == "percentile": - calibrator = twml.contrib.calibrators.PercentileDiscretizerCalibrator - elif config.discretize_type == "hashed_percentile": - calibrator = twml.contrib.calibrators.HashedPercentileDiscretizerCalibrator - elif config.discretize_type == "hashing": - calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator - else: - raise ValueError("Unsupported discretizer type: " + config.discretize_type) - discretize_dict[config.output_name] = calibrator( - config.discretize_num_bins, - config.discretize_output_size_bits, - allow_empty_calibration=config.allow_empty_calibration, - ) - elif config.discretize_num_bins or config.discretize_output_size_bits: - raise ValueError( - "Discretize_num_bins AND discretize_output_size_bits need to be in the FeatureConfig" - ) - - return FeatureConfig( - features={}, - labels=self._labels, - weight=self._weight, - filters=self._filter_features, - tensor_types=keep_tensors, - sparse_tensor_types=keep_sparse_tensors, - feature_types=feature_map, - sparse_extraction_configs=self._sparse_extraction_configs, - feature_extraction_configs=self._feature_extraction_configs, - feature_group_extraction_configs=self._feature_group_extraction_configs, - image_configs=self._image_configs, - discretize_config=discretize_dict, - feature_ids=features_add, - decode_mode=self._decode_mode, - legacy_sparse=self._legacy_sparse, - feature_name_to_feature_parser=feature_name_to_feature_parser, - feature_in_bq_name=feature_in_bq_name, - ) - - -TensorExtractionConfig = feature_config.TensorExtractionConfig - -FeatureGroupExtractionConfig = feature_config.FeatureGroupExtractionConfig - -ImageExtractionConfig = feature_config.ImageExtractionConfig - -_set_tensor_namedtuple = feature_config._set_tensor_namedtuple diff 
--git a/twml/twml/contrib/feature_config_parsers.docx b/twml/twml/contrib/feature_config_parsers.docx new file mode 100644 index 000000000..b4c127127 Binary files /dev/null and b/twml/twml/contrib/feature_config_parsers.docx differ diff --git a/twml/twml/contrib/feature_config_parsers.py b/twml/twml/contrib/feature_config_parsers.py deleted file mode 100644 index 83c402e2e..000000000 --- a/twml/twml/contrib/feature_config_parsers.py +++ /dev/null @@ -1,224 +0,0 @@ -"""Utility functions to create FeatureConfig objects from feature_spec.yaml files""" -import os -import re - -import tensorflow.compat.v1 as tf -import yaml -from twml.feature_config import FeatureConfigBuilder -from twml.contrib.feature_config import FeatureConfigBuilder as FeatureConfigBuilderV2 - - -def _get_config_version(config_dict): - doc = config_dict - supported_classes = { - "twml.FeatureConfig": "v1", - "twml.contrib.FeatureConfig": "v2" - } - if "class" not in doc: - raise ValueError("'class' key not found") - if doc["class"] not in supported_classes.keys(): - raise ValueError("Class %s not supported. Supported clases are %s" - % (doc["class"], supported_classes.keys())) - return supported_classes[doc["class"]] - - -def _validate_config_dict_v1(config_dict): - """ - Validate spec exported by twml.FeatureConfig - """ - doc = config_dict - - def malformed_error(msg): - raise ValueError("twml.FeatureConfig: Malformed feature_spec. %s" % msg) - - if doc["class"] != "twml.FeatureConfig": - malformed_error("'class' is not twml.FeatureConfig") - if "format" not in doc: - malformed_error("'format' key not found") - - # validate spec exported by twml.FeatureConfig - if doc["format"] == "exported": - dict_keys = ["features", "labels", "weight", "tensors", "sparse_tensors"] - for key in dict_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != dict: - malformed_error("'%s' is not a dict" % key) - if "filters" not in doc: - malformed_error("'filters' key not found") - elif type(doc["filters"]) != list: - malformed_error("'filters' is not a list") - - # validate spec provided by modeler - elif doc["format"] == "manual": - raise NotImplementedError("Manual config support not yet implemented") - else: - malformed_error("'format' must be 'exported' or 'manual'") - - -def _validate_config_dict_v2(config_dict): - """ - Validate spec exported by twml.contrib.FeatureConfig - """ - doc = config_dict - - def malformed_error(msg): - raise ValueError("twml.contrib.FeatureConfig: Malformed feature_spec. 
%s" % msg) - - if doc["class"] != "twml.contrib.FeatureConfig": - malformed_error("'class' is not twml.contrib.FeatureConfig") - if "format" not in doc: - malformed_error("'format key not found'") - - # validate spec exported by twml.contrib.FeatureConfig (basic validation only) - if doc["format"] == "exported": - dict_keys = ["features", "labels", "weight", "tensors", "sparseTensors", "discretizeConfig"] - for key in dict_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != dict: - malformed_error("'%s' is not a dict" % key) - list_keys = ["sparseFeatureGroups", "denseFeatureGroups", "denseFeatures", "images", "filters"] - for key in list_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != list: - malformed_error("'%s' is not a list" % key) - - # validate spec provided by modeler - elif doc["format"] == "manual": - raise NotImplementedError("Manual config support not yet implemented") - else: - malformed_error("'format' must be 'exported' or 'manual'") - - -def _create_feature_config_v1(config_dict, data_spec_path): - fc_builder = FeatureConfigBuilder(data_spec_path) - - if config_dict["format"] == "exported": - # add features - for feature_info in config_dict["features"].values(): - feature_name = re.escape(feature_info["featureName"]) - feature_group = feature_info["featureGroup"] - fc_builder.add_feature(feature_name, feature_group) - # add labels - labels = [] - for label_info in config_dict["labels"].values(): - labels.append(label_info["featureName"]) - fc_builder.add_labels(labels) - # feature filters - for feature_name in config_dict["filters"]: - fc_builder.add_filter(feature_name) - # weight - if config_dict["weight"]: - weight_feature = list(config_dict["weight"].values())[0]["featureName"] - fc_builder.define_weight(weight_feature) - else: - raise ValueError("Format '%s' not implemented" % config_dict["format"]) - - return fc_builder.build() - - -def _create_feature_config_v2(config_dict, data_spec_path): - fc_builder = FeatureConfigBuilderV2(data_spec_path) - - if config_dict["format"] == "exported": - # add sparse group extraction configs - for sparse_group in config_dict["sparseFeatureGroups"]: - fids = sparse_group["features"].keys() - fnames = [sparse_group["features"][fid]["featureName"] for fid in fids] - fc_builder.extract_features_as_hashed_sparse( - feature_regexes=[re.escape(fname) for fname in fnames], - output_tensor_name=sparse_group["outputName"], - hash_space_size_bits=sparse_group["hashSpaceBits"], - discretize_num_bins=sparse_group["discretize"]["numBins"], - discretize_output_size_bits=sparse_group["discretize"]["outputSizeBits"], - discretize_type=sparse_group["discretize"]["type"], - type_filter=sparse_group["filterType"]) - - # add dense group extraction configs - for dense_group in config_dict["denseFeatureGroups"]: - fids = dense_group["features"].keys() - fnames = [dense_group["features"][fid]["featureName"] for fid in fids] - fc_builder.extract_feature_group( - feature_regexes=[re.escape(fname) for fname in fnames], - group_name=dense_group["outputName"], - type_filter=dense_group["filterType"], - default_value=dense_group["defaultValue"]) - - # add dense feature configs - for dense_features in config_dict["denseFeatures"]: - fids = dense_features["features"].keys() - fnames = [dense_features["features"][fid]["featureName"] for fid in fids] - default_value = dense_features["defaultValue"] - if len(fnames) == 1 and type(default_value) != dict: - 
fc_builder.extract_feature( - feature_name=re.escape(fnames[0]), - expected_shape=dense_features["expectedShape"], - default_value=dense_features["defaultValue"]) - else: - fc_builder.extract_features( - feature_regexes=[re.escape(fname) for fname in fnames], - default_value_map=dense_features["defaultValue"]) - - # add image feature configs - for image in config_dict["images"]: - fc_builder.extract_image( - feature_name=image["featureName"], - preprocess=image["preprocess"], - out_type=tf.as_dtype(image["outType"].lower()), - channels=image["channels"], - default_image=image["defaultImage"], - ) - - # add other tensor features (non-image) - tensor_fnames = [] - image_fnames = [img["featureName"] for img in config_dict["images"]] - for tensor_fname in config_dict["tensors"]: - if tensor_fname not in image_fnames: - tensor_fnames.append(tensor_fname) - for sparse_tensor_fname in config_dict["sparseTensors"]: - tensor_fnames.append(sparse_tensor_fname) - fc_builder.extract_tensors(tensor_fnames) - - # add labels - labels = [] - for label_info in config_dict["labels"].values(): - labels.append(label_info["featureName"]) - fc_builder.add_labels(labels) - - else: - raise ValueError("Format '%s' not implemented" % config_dict["format"]) - - return fc_builder.build() - - -def create_feature_config_from_dict(config_dict, data_spec_path): - """ - Create a FeatureConfig object from a feature spec dict. - """ - config_version = _get_config_version(config_dict) - if config_version == "v1": - _validate_config_dict_v1(config_dict) - feature_config = _create_feature_config_v1(config_dict, data_spec_path) - elif config_version == "v2": - _validate_config_dict_v2(config_dict) - feature_config = _create_feature_config_v2(config_dict, data_spec_path) - else: - raise ValueError("version not supported") - - return feature_config - - -def create_feature_config(config_path, data_spec_path): - """ - Create a FeatureConfig object from a feature_spec.yaml file. 
- """ - _, ext = os.path.splitext(config_path) - if ext not in ['.yaml', '.yml']: - raise ValueError("create_feature_config_from_yaml: Only .yaml/.yml supported") - - with tf.io.gfile.GFile(config_path, mode='r') as fs: - config_dict = yaml.safe_load(fs) - - return create_feature_config_from_dict(config_dict, data_spec_path) diff --git a/twml/twml/contrib/feature_importances/__init__.docx b/twml/twml/contrib/feature_importances/__init__.docx new file mode 100644 index 000000000..68d5880ef Binary files /dev/null and b/twml/twml/contrib/feature_importances/__init__.docx differ diff --git a/twml/twml/contrib/feature_importances/__init__.py b/twml/twml/contrib/feature_importances/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/twml/twml/contrib/feature_importances/feature_importances.docx b/twml/twml/contrib/feature_importances/feature_importances.docx new file mode 100644 index 000000000..dd9164e08 Binary files /dev/null and b/twml/twml/contrib/feature_importances/feature_importances.docx differ diff --git a/twml/twml/contrib/feature_importances/feature_importances.py b/twml/twml/contrib/feature_importances/feature_importances.py deleted file mode 100644 index a8bfcc129..000000000 --- a/twml/twml/contrib/feature_importances/feature_importances.py +++ /dev/null @@ -1,414 +0,0 @@ -# checkstyle: noqa - -import time -from collections import defaultdict - -from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient -from com.twitter.mlmetastore.modelrepo.core import FeatureImportance, FeatureNames -from twitter.deepbird.io.util import match_feature_regex_list - -from twml.contrib.feature_importances.helpers import ( - _get_feature_name_from_config, - _get_feature_types_from_records, - _get_metrics_hook, - _expand_prefix, - longest_common_prefix, - write_list_to_hdfs_gfile) -from twml.contrib.feature_importances.feature_permutation import PermutedInputFnFactory -from twml.tracking import ExperimentTracker - -from tensorflow.compat.v1 import logging -from requests.exceptions import HTTPError, RetryError -from queue import Queue - - -SERIAL = "serial" -TREE = "tree" -INDIVIDUAL = "Individual" -GROUP = "Group" -ROC_AUC = "roc_auc" -RCE = "rce" -LOSS = "loss" - - -def _repartition(feature_list_queue, fnames_ftypes, split_feature_group_on_period): - """ - Iterate through letters to partition each feature by prefix, and then put each tuple - (prefix, feature_partition) into the feature_list_queue - Args: - prefix (str): The prefix shared by each feature in list_of_feature_types - feature_list_queue (Queue<(str, list<(str, str)>)>): The queue of feature groups - fnames_ftypes (list<(str, str)>): List of (fname, ftype) pairs. Each fname begins with prefix - split_feature_group_on_period (str): If true, require that feature groups end in a period - Returns: - Updated queue with each group in fnames_ftypes - """ - assert len(fnames_ftypes) > 1 - - split_character = "." 
if split_feature_group_on_period else None - # Compute the longest prefix of the words - prefix = longest_common_prefix( - strings=[fname for fname, _ in fnames_ftypes], split_character=split_character) - - # Separate the features by prefix - prefix_to_features = defaultdict(list) - for fname, ftype in fnames_ftypes: - assert fname.startswith(prefix) - new_prefix = _expand_prefix(fname=fname, prefix=prefix, split_character=split_character) - prefix_to_features[new_prefix].append((fname, ftype)) - - # Add all of the new partitions to the queue - for new_prefix, fname_ftype_list in prefix_to_features.items(): - extended_new_prefix = longest_common_prefix( - strings=[fname for fname, _ in fname_ftype_list], split_character=split_character) - assert extended_new_prefix.startswith(new_prefix) - feature_list_queue.put((extended_new_prefix, fname_ftype_list)) - return feature_list_queue - - -def _infer_if_is_metric_larger_the_better(stopping_metric): - # Infers whether a metric should be interpreted such that larger numbers are better (e.g. ROC_AUC), as opposed to - # larger numbers being worse (e.g. LOSS) - if stopping_metric is None: - raise ValueError("Error: Stopping Metric cannot be None") - elif stopping_metric.startswith(LOSS): - logging.info("Interpreting {} to be a metric where larger numbers are worse".format(stopping_metric)) - is_metric_larger_the_better = False - else: - logging.info("Interpreting {} to be a metric where larger numbers are better".format(stopping_metric)) - is_metric_larger_the_better = True - return is_metric_larger_the_better - - -def _check_whether_tree_should_expand(baseline_performance, computed_performance, sensitivity, stopping_metric, is_metric_larger_the_better): - """ - Returns True if - - the metric is positive (e.g. ROC_AUC) and computed_performance is nontrivially smaller than the baseline_performance - - the metric is negative (e.g. LOSS) and computed_performance is nontrivially larger than the baseline_performance - """ - difference = ((baseline_performance[stopping_metric] - computed_performance[stopping_metric]) / - baseline_performance[stopping_metric]) - - if not is_metric_larger_the_better: - difference = -difference - - logging.info( - "Found a {} difference of {}. 
 Sensitivity is {}.".format("positive" if is_metric_larger_the_better else "negative", difference, sensitivity)) - return difference > sensitivity - - -def _compute_multiple_permuted_performances_from_trainer( - factory, fname_ftypes, trainer, parse_fn, record_count): - """Compute performances with fname and ftype permuted - """ - metrics_hook = _get_metrics_hook(trainer) - trainer._estimator.evaluate( - input_fn=factory.get_permuted_input_fn( - batch_size=trainer._params.eval_batch_size, parse_fn=parse_fn, fname_ftypes=fname_ftypes), - steps=(record_count + trainer._params.eval_batch_size) // trainer._params.eval_batch_size, - hooks=[metrics_hook], - checkpoint_path=trainer.best_or_latest_checkpoint) - return metrics_hook.metric_values - - -def _get_extra_feature_group_performances(factory, trainer, parse_fn, extra_groups, feature_to_type, record_count): - """Compute performance differences for the extra feature groups - """ - extra_group_feature_performance_results = {} - for group_name, raw_feature_regex_list in extra_groups.items(): - start = time.time() - fnames = match_feature_regex_list( - features=feature_to_type.keys(), - feature_regex_list=[regex for regex in raw_feature_regex_list], - preprocess=False, - as_dict=False) - - fnames_ftypes = [(fname, feature_to_type[fname]) for fname in fnames] - - logging.info("Extracted extra group {} with features {}".format(group_name, fnames_ftypes)) - extra_group_feature_performance_results[group_name] = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fnames_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - group_name, int(time.time() - start))) - return extra_group_feature_performance_results - - -def _feature_importances_tree_algorithm( - data_dir, trainer, parse_fn, fnames, stopping_metric, file_list=None, datarecord_filter_fn=None, split_feature_group_on_period=True, - record_count=99999, is_metric_larger_the_better=None, sensitivity=0.025, extra_groups=None, dont_build_tree=False): - """Tree algorithm for feature and feature group importances. This algorithm builds a prefix tree of - the feature names and then traverses the tree with a BFS. At each node (i.e. a group of features with - a shared prefix) the algorithm computes the performance of the model when we permute all features - in the group. The algorithm only zooms in on groups that impact the performance by more than - sensitivity. As a result, features that affect the model performance by less than sensitivity will - not have an exact importance. - Args: - data_dir: (str): The location of the training or testing data to compute importances over. - If None, the trainer._eval_files are used - trainer: (DataRecordTrainer): A DataRecordTrainer object - parse_fn: (function): The parse_fn used by eval_input_fn - fnames (list): The list of feature names - stopping_metric (str): The metric to use to determine when to stop expanding trees - file_list (list): The list of filenames. Exactly one of file_list and data_dir should be - provided - datarecord_filter_fn (function): a function that takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and returns a boolean value indicating whether this data record should be kept by the feature importance module. 
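# For illustration, a datarecord_filter_fn of the shape described above could look like the
# following minimal sketch; the continuousFeatures field name is only an assumption about
# the com.twitter.ml.api.ttypes.DataRecord thrift struct, not something defined here.
def keep_records_with_continuous_features(record):
    # Keep the record only when it carries at least one continuous feature.
    return bool(getattr(record, "continuousFeatures", None))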
- split_feature_group_on_period (boolean): If true, split feature groups by period rather than on - optimal prefix - record_count (int): The number of records to compute importances over - is_metric_larger_the_better (boolean): If true, assume that stopping_metric is a metric where larger - values are better (e.g. ROC-AUC) - sensitivity (float): The smallest change in performance to continue to expand the tree - extra_groups (dict>): A dictionary mapping the name of extra feature groups to the list of - the names of the features in the group. You should only supply a value for this argument if you have a set - of features that you want to evaluate as a group but don't share a prefix - dont_build_tree (boolean): If True, don't build the tree and only compute the extra_groups importances - Returns: - A dictionary that contains the individual and group feature importances - """ - factory = PermutedInputFnFactory( - data_dir=data_dir, record_count=record_count, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn) - baseline_performance = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=[], - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - out = {"None": baseline_performance} - - if stopping_metric not in baseline_performance: - raise ValueError("The stopping metric '{}' not found in baseline_performance. Metrics are {}".format( - stopping_metric, list(baseline_performance.keys()))) - - is_metric_larger_the_better = ( - is_metric_larger_the_better if is_metric_larger_the_better is not None else _infer_if_is_metric_larger_the_better(stopping_metric)) - logging.info("Using {} as the stopping metric for the tree algorithm".format(stopping_metric)) - - feature_to_type = _get_feature_types_from_records(records=factory.records, fnames=fnames) - all_feature_types = list(feature_to_type.items()) - - individual_feature_performances = {} - feature_group_performances = {} - if dont_build_tree: - logging.info("Not building feature importance trie. Will only compute importances for the extra_groups") - else: - logging.info("Building feature importance trie") - # Each element in the Queue will be a tuple of (prefix, list_of_feature_type_pairs) where - # each feature in list_of_feature_type_pairs will have have the prefix "prefix" - feature_list_queue = _repartition( - feature_list_queue=Queue(), fnames_ftypes=all_feature_types, split_feature_group_on_period=split_feature_group_on_period) - - while not feature_list_queue.empty(): - # Pop the queue. We should never have an empty list in the queue - prefix, fnames_ftypes = feature_list_queue.get() - assert len(fnames_ftypes) > 0 - - # Compute performance from permuting all features in fname_ftypes - logging.info( - "\n\nComputing importances for {} ({}...). 
{} elements left in the queue \n\n".format( - prefix, fnames_ftypes[:5], feature_list_queue.qsize())) - start = time.time() - computed_performance = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fnames_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - prefix, int(time.time() - start))) - if len(fnames_ftypes) == 1: - individual_feature_performances[fnames_ftypes[0][0]] = computed_performance - else: - feature_group_performances[prefix] = computed_performance - # Dig deeper into the features in fname_ftypes only if there is more than one feature in the - # list and the performance drop is nontrivial - logging.info("Checking performance for {} ({}...)".format(prefix, fnames_ftypes[:5])) - check = _check_whether_tree_should_expand( - baseline_performance=baseline_performance, computed_performance=computed_performance, - sensitivity=sensitivity, stopping_metric=stopping_metric, is_metric_larger_the_better=is_metric_larger_the_better) - if len(fnames_ftypes) > 1 and check: - logging.info("Expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) - feature_list_queue = _repartition( - feature_list_queue=feature_list_queue, fnames_ftypes=fnames_ftypes, split_feature_group_on_period=split_feature_group_on_period) - else: - logging.info("Not expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) - - # Baseline performance is grouped in with individual_feature_importance_results - individual_feature_performance_results = dict( - out, **{k: v for k, v in individual_feature_performances.items()}) - group_feature_performance_results = {k: v for k, v in feature_group_performances.items()} - - if extra_groups is not None: - logging.info("Computing performances for extra groups {}".format(extra_groups.keys())) - for group_name, performances in _get_extra_feature_group_performances( - factory=factory, - trainer=trainer, - parse_fn=parse_fn, - extra_groups=extra_groups, - feature_to_type=feature_to_type, - record_count=record_count).items(): - group_feature_performance_results[group_name] = performances - else: - logging.info("Not computing performances for extra groups") - - return {INDIVIDUAL: individual_feature_performance_results, - GROUP: group_feature_performance_results} - - -def _feature_importances_serial_algorithm( - data_dir, trainer, parse_fn, fnames, file_list=None, datarecord_filter_fn=None, factory=None, record_count=99999): - """Serial algorithm for feature importances. This algorithm computes the - importance of each feature. - """ - factory = PermutedInputFnFactory( - data_dir=data_dir, record_count=record_count, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn) - feature_to_type = _get_feature_types_from_records(records=factory.records, fnames=fnames) - - out = {} - for fname, ftype in list(feature_to_type.items()) + [(None, None)]: - logging.info("\n\nComputing importances for {}\n\n".format(fname)) - start = time.time() - fname_ftypes = [(fname, ftype)] if fname is not None else [] - out[str(fname)] = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fname_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - fname, int(time.time() - start))) - # The serial algorithm does not compute group feature results. 
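# Read as a sketch: a feature's importance is the drop of a chosen metric relative to the
# unpermuted baseline, mirroring how write_feature_importances_to_hdfs reads these results
# below. Assumes `out` is the dict returned by either algorithm above and that "roc_auc"
# is one of the evaluated metrics.
results = out[INDIVIDUAL]                 # {str(fname): metrics_dict}; key "None" holds the baseline
baseline = results["None"][ROC_AUC]
drops = {fname: baseline - perf[ROC_AUC]
         for fname, perf in results.items() if fname != "None"}
# A larger drop means the model relied more heavily on that feature.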
- return {INDIVIDUAL: out, GROUP: {}} - - -def _process_feature_name_for_mldash(feature_name): - # Using a forward slash in the name causes feature importance writing to fail because strato interprets it as - # part of a url - return feature_name.replace("/", "__") - - -def compute_feature_importances( - trainer, data_dir=None, feature_config=None, algorithm=TREE, parse_fn=None, datarecord_filter_fn=None, **kwargs): - """Perform a feature importance analysis on a trained model - Args: - trainer: (DataRecordTrainer): A DataRecordTrainer object - data_dir: (str): The location of the training or testing data to compute importances over. - If None, the trainer._eval_files are used - feature_config (contrib.FeatureConfig): The feature config object. If this is not provided, it - is taken from the trainer - algorithm (str): The algorithm to use - parse_fn: (function): The parse_fn used by eval_input_fn. By default this is - feature_config.get_parse_fn() - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - - # We only use the trainer's eval files if an override data_dir is not provided - if data_dir is None: - logging.info("Using trainer._eval_files (found {} as files)".format(trainer._eval_files)) - file_list = trainer._eval_files - else: - logging.info("data_dir provided. Looking at {} for data.".format(data_dir)) - file_list = None - - feature_config = feature_config or trainer._feature_config - out = {} - if not feature_config: - logging.warn("WARN: Not computing feature importance because trainer._feature_config is None") - out = None - else: - parse_fn = parse_fn if parse_fn is not None else feature_config.get_parse_fn() - fnames = _get_feature_name_from_config(feature_config) - logging.info("Computing importances for {}".format(fnames)) - logging.info("Using the {} feature importance computation algorithm".format(algorithm)) - algorithm = { - SERIAL: _feature_importances_serial_algorithm, - TREE: _feature_importances_tree_algorithm}[algorithm] - out = algorithm(data_dir=data_dir, trainer=trainer, parse_fn=parse_fn, fnames=fnames, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn, **kwargs) - return out - - -def write_feature_importances_to_hdfs( - trainer, feature_importances, output_path=None, metric="roc_auc"): - """Publish a feature importance analysis to hdfs as a tsv - Args: - (see compute_feature_importances for other args) - trainer (Trainer) - feature_importances (dict): Dictionary of feature importances - output_path (str): The remote or local file to write the feature importances to. 
If not - provided, this is inferred to be the trainer save dir - metric (str): The metric to write to tsv - """ - # String formatting appends (Individual) or (Group) to feature name depending on type - perfs = {"{} ({})".format(k, importance_key) if k != "None" else k: v[metric] - for importance_key, importance_value in feature_importances.items() - for k, v in importance_value.items()} - - output_path = ("{}/feature_importances-{}".format( - trainer._save_dir[:-1] if trainer._save_dir.endswith('/') else trainer._save_dir, - output_path if output_path is not None else str(time.time()))) - - if len(perfs) > 0: - logging.info("Writing feature_importances for {} to hdfs".format(perfs.keys())) - entries = [ - { - "name": name, - "drop": perfs["None"] - perfs[name], - "pdrop": 100 * (perfs["None"] - perfs[name]) / (perfs["None"] + 1e-8), - "perf": perfs[name] - } for name in perfs.keys()] - out = ["Name\tPerformance Drop\tPercent Performance Drop\tPerformance"] - for entry in sorted(entries, key=lambda d: d["drop"]): - out.append("{name}\t{drop}\t{pdrop}%\t{perf}".format(**entry)) - logging.info("\n".join(out)) - write_list_to_hdfs_gfile(out, output_path) - logging.info("Wrote feature feature_importances to {}".format(output_path)) - else: - logging.info("Not writing feature_importances to hdfs") - return output_path - - -def write_feature_importances_to_ml_dash(trainer, feature_importances, feature_config=None): - # type: (DataRecordTrainer, FeatureConfig, dict) -> None - """Publish feature importances + all feature names to ML Metastore - Args: - trainer: (DataRecordTrainer): A DataRecordTrainer object - feature_config (contrib.FeatureConfig): The feature config object. If this is not provided, it - is taken from the trainer - feature_importances (dict, default=None): Dictionary of precomputed feature importances - feature_importance_metric (str, default=None): The metric to write to ML Dashboard - """ - experiment_tracking_path = trainer.experiment_tracker.tracking_path\ - if trainer.experiment_tracker.tracking_path\ - else ExperimentTracker.guess_path(trainer._save_dir) - - logging.info('Computing feature importances for run: {}'.format(experiment_tracking_path)) - - feature_importance_list = [] - for key in feature_importances: - for feature, imps in feature_importances[key].items(): - logging.info('FEATURE NAME: {}'.format(feature)) - feature_name = feature.split(' (').pop(0) - for metric_name, value in imps.items(): - try: - imps[metric_name] = float(value) - logging.info('Wrote feature importance value {} for metric: {}'.format(str(value), metric_name)) - except Exception as ex: - logging.error("Skipping writing metric:{} to ML Metastore due to invalid metric value: {} or value type: {}. 
Exception: {}".format(metric_name, str(value), type(value), str(ex))) - pass - - feature_importance_list.append(FeatureImportance( - run_id=experiment_tracking_path, - feature_name=_process_feature_name_for_mldash(feature_name), - feature_importance_metrics=imps, - is_group=key == GROUP - )) - -# setting feature config to match the one used in compute_feature_importances - feature_config = feature_config or trainer._feature_config - feature_names = FeatureNames( - run_id=experiment_tracking_path, - names=list(feature_config.features.keys()) - ) - - try: - client = ModelRepoClient() - logging.info('Writing feature importances to ML Metastore') - client.add_feature_importances(feature_importance_list) - logging.info('Writing feature names to ML Metastore') - client.add_feature_names(feature_names) - except (HTTPError, RetryError) as err: - logging.error('Feature importance is not being written due to: ' - 'HTTPError when attempting to write to ML Metastore: \n{}.'.format(err)) diff --git a/twml/twml/contrib/feature_importances/feature_permutation.docx b/twml/twml/contrib/feature_importances/feature_permutation.docx new file mode 100644 index 000000000..c54243886 Binary files /dev/null and b/twml/twml/contrib/feature_importances/feature_permutation.docx differ diff --git a/twml/twml/contrib/feature_importances/feature_permutation.py b/twml/twml/contrib/feature_importances/feature_permutation.py deleted file mode 100644 index 809f5fde0..000000000 --- a/twml/twml/contrib/feature_importances/feature_permutation.py +++ /dev/null @@ -1,129 +0,0 @@ -from copy import deepcopy -import random -import types - -from twitter.deepbird.util.thrift.simple_converters import ( - bytes_to_thrift_object, thrift_object_to_bytes) - -from tensorflow.compat.v1 import logging -from com.twitter.ml.api.ttypes import DataRecord # pylint: disable=import-error -import tensorflow.compat.v1 as tf -import twml - - -class PermutedInputFnFactory(object): - - def __init__(self, data_dir, record_count, file_list=None, datarecord_filter_fn=None): - """ - Args: - data_dir (str): The location of the records on hdfs - record_count (int): The number of records to process - file_list (list, default=None): The list of data files on HDFS. If provided, use this instead - of data_dir - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - if not (data_dir is None) ^ (file_list is None): - raise ValueError("Exactly one of data_dir and file_list can be provided. 
Got {} for data_dir and {} for file_list".format( - data_dir, file_list)) - - file_list = file_list if file_list is not None else twml.util.list_files(twml.util.preprocess_path(data_dir)) - _next_batch = twml.input_fns.default_input_fn(file_list, 1, lambda x: x, - num_threads=2, shuffle=True, shuffle_files=True) - self.records = [] - # Validate datarecord_filter_fn - if datarecord_filter_fn is not None and not isinstance(datarecord_filter_fn, types.FunctionType): - raise TypeError("datarecord_filter_fn is not function type") - with tf.Session() as sess: - for i in range(record_count): - try: - record = bytes_to_thrift_object(sess.run(_next_batch)[0], DataRecord) - if datarecord_filter_fn is None or datarecord_filter_fn(record): - self.records.append(record) - except tf.errors.OutOfRangeError: - logging.info("Stopping after reading {} records out of {}".format(i, record_count)) - break - if datarecord_filter_fn: - logging.info("datarecord_filter_fn has been applied; keeping {} records out of {}".format(len(self.records), record_count)) - - def _get_record_generator(self): - return (thrift_object_to_bytes(r) for r in self.records) - - def get_permuted_input_fn(self, batch_size, parse_fn, fname_ftypes): - """Get an input function that passes in a preset number of records that have been feature permuted - Args: - parse_fn (function): The function to parse inputs - fname_ftypes: (list<(str, str)>): The names and types of the features to permute - """ - def permuted_parse_pyfn(bytes_array): - out = [] - for b in bytes_array: - rec = bytes_to_thrift_object(b, DataRecord) - if fname_ftypes: - rec = _permutate_features(rec, fname_ftypes=fname_ftypes, records=self.records) - out.append(thrift_object_to_bytes(rec)) - return [out] - - def permuted_parse_fn(bytes_tensor): - parsed_bytes_tensor = parse_fn(tf.py_func(permuted_parse_pyfn, [bytes_tensor], tf.string)) - return parsed_bytes_tensor - - def input_fn(batch_size=batch_size, parse_fn=parse_fn, factory=self): - return (tf.data.Dataset - .from_generator(self._get_record_generator, tf.string) - .batch(batch_size) - .map(permuted_parse_fn, 4) - .make_one_shot_iterator() - .get_next()) - return input_fn - - -def _permutate_features(rec, fname_ftypes, records): - """Replace a feature value with a value from random selected record - Args: - rec: (datarecord): A datarecord returned from DataRecordGenerator - fname_ftypes: (list<(str, str)>): The names and types of the features to permute - records: (list): The records to sample from - Returns: - The record with the feature permuted - """ - rec_new = deepcopy(rec) - rec_replace = random.choice(records) - - # If the replacement datarecord does not have the feature type entirely, add it in - # to make the logic a bit simpler - for fname, feature_type in fname_ftypes: - fid = twml.feature_id(fname)[0] - if rec_replace.__dict__.get(feature_type, None) is None: - rec_replace.__dict__[feature_type] = ( - dict() if feature_type != 'binaryFeatures' else set()) - if rec_new.__dict__.get(feature_type, None) is None: - rec_new.__dict__[feature_type] = ( - dict() if feature_type != 'binaryFeatures' else set()) - - if feature_type != 'binaryFeatures': - if fid not in rec_replace.__dict__[feature_type] and fid in rec_new.__dict__.get(feature_type, dict()): - # If the replacement datarecord does not contain the feature but the original does - del rec_new.__dict__[feature_type][fid] - elif fid in rec_replace.__dict__[feature_type]: - # If the replacement datarecord does contain the feature - if 
rec_new.__dict__[feature_type] is None: - rec_new.__dict__[feature_type] = dict() - rec_new.__dict__[feature_type][fid] = rec_replace.__dict__[feature_type][fid] - else: - # If neither datarecord contains this feature - pass - else: - if fid not in rec_replace.__dict__[feature_type] and fid in rec_new.__dict__.get(feature_type, set()): - # If the replacement datarecord does not contain the feature but the original does - rec_new.__dict__[feature_type].remove(fid) - elif fid in rec_replace.__dict__[feature_type]: - # If the replacement datarecord does contain the feature - if rec_new.__dict__[feature_type] is None: - rec_new.__dict__[feature_type] = set() - rec_new.__dict__[feature_type].add(fid) - # If neither datarecord contains this feature - else: - # If neither datarecord contains this feature - pass - return rec_new diff --git a/twml/twml/contrib/feature_importances/helpers.docx b/twml/twml/contrib/feature_importances/helpers.docx new file mode 100644 index 000000000..1969a63f5 Binary files /dev/null and b/twml/twml/contrib/feature_importances/helpers.docx differ diff --git a/twml/twml/contrib/feature_importances/helpers.py b/twml/twml/contrib/feature_importances/helpers.py deleted file mode 100644 index f3f600e8b..000000000 --- a/twml/twml/contrib/feature_importances/helpers.py +++ /dev/null @@ -1,96 +0,0 @@ -import uuid - -from tensorflow.compat.v1 import logging -import twml -import tensorflow.compat.v1 as tf - - -def write_list_to_hdfs_gfile(list_to_write, output_path): - """Use tensorflow gfile to write a list to a location on hdfs""" - locname = "/tmp/{}".format(str(uuid.uuid4())) - with open(locname, "w") as f: - for row in list_to_write: - f.write("%s\n" % row) - tf.io.gfile.copy(locname, output_path, overwrite=False) - - -def decode_str_or_unicode(str_or_unicode): - return str_or_unicode.decode() if hasattr(str_or_unicode, 'decode') else str_or_unicode - - -def longest_common_prefix(strings, split_character): - """ - Args: - string (list): The list of strings to find the longest common prefix of - split_character (str): If not None, require that the return string end in this character or - be the length of the entire string - Returns: - The string corresponding to the longest common prefix - """ - sorted_strings = sorted(strings) - s1, s2 = sorted_strings[0], sorted_strings[-1] - if s1 == s2: - # If the strings are the same, just return the full string - out = s1 - else: - # If the strings are not the same, return the longest common prefix optionally ending in split_character - ix = 0 - for i in range(min(len(s1), len(s2))): - if s1[i] != s2[i]: - break - if split_character is None or s1[i] == split_character: - ix = i + 1 - out = s1[:ix] - return out - - -def _expand_prefix(fname, prefix, split_character): - if len(fname) == len(prefix): - # If the prefix is already the full feature, just take the feature name - out = fname - elif split_character is None: - # Advance the prefix by one character - out = fname[:len(prefix) + 1] - else: - # Advance the prefix to the next instance of split_character or the end of the string - for ix in range(len(prefix), len(fname)): - if fname[ix] == split_character: - break - out = fname[:ix + 1] - return out - - -def _get_feature_types_from_records(records, fnames): - # This method gets the types of the features in fnames by looking at the datarecords themselves. 
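# A small worked example of the longest_common_prefix and _expand_prefix helpers defined
# above, using hypothetical feature names, showing how groups are split on '.':
names = ["user.engagement.clicks", "user.engagement.likes", "user.profile.age"]
longest_common_prefix(strings=names, split_character=".")                             # -> "user."
_expand_prefix(fname="user.engagement.clicks", prefix="user.", split_character=".")   # -> "user.engagement."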
- # The reason why we do this rather than extract the feature types from the feature_config is - that the feature naming conventions in the feature_config are different from those in the - datarecords. - fids = [twml.feature_id(fname)[0] for fname in fnames] - feature_to_type = {} - for record in records: - for feature_type, values in record.__dict__.items(): - if values is not None: - included_ids = set(values) - for fname, fid in zip(fnames, fids): - if fid in included_ids: - feature_to_type[fname] = feature_type - return feature_to_type - - -def _get_metrics_hook(trainer): - def get_metrics_fn(trainer=trainer): - return {k: v[0] for k, v in trainer.current_estimator_spec.eval_metric_ops.items()} - return twml.hooks.GetMetricsHook(get_metrics_fn=get_metrics_fn) - - -def _get_feature_name_from_config(feature_config): - """Extract the names of the features from a feature config object - """ - decoded_feature_names = [] - for f in feature_config.get_feature_spec()['features'].values(): - try: - fname = decode_str_or_unicode(f['featureName']) - except UnicodeEncodeError as e: - logging.error("Encountered decoding exception when decoding %s: %s" % (f, e)) - continue - decoded_feature_names.append(fname) - return decoded_feature_names diff --git a/twml/twml/contrib/hooks.docx b/twml/twml/contrib/hooks.docx new file mode 100644 index 000000000..80ecc1407 Binary files /dev/null and b/twml/twml/contrib/hooks.docx differ diff --git a/twml/twml/contrib/hooks.py b/twml/twml/contrib/hooks.py deleted file mode 100644 index 6d68831fc..000000000 --- a/twml/twml/contrib/hooks.py +++ /dev/null @@ -1,42 +0,0 @@ -import datetime - -from absl import logging -import pytz -import tensorflow.compat.v1 as tf - - -class StopAtTimeHook(tf.train.SessionRunHook): - """ - Hook that stops training at a fixed datetime - """ - - def __init__(self, stop_time): - """ - Arguments: - stop_time: - a datetime.datetime or a datetime.timedelta specifying when to stop. - For naive datetime.datetime objects (with no time zone specified), - UTC time zone is assumed. 
- """ - if isinstance(stop_time, datetime.timedelta): - self._stop_datetime = pytz.utc.localize(datetime.datetime.utcnow() + stop_time) - elif isinstance(stop_time, datetime.datetime): - if stop_time.tzinfo is None: - self._stop_datetime = pytz.utc.localize(stop_time) - else: - self._stop_datetime = stop_time.astimezone(pytz.UTC) - else: - raise ValueError("Expecting datetime or timedelta for stop_time arg") - self._stop_requested = False - - def after_run(self, run_context, run_values): - delta = self._stop_datetime - pytz.utc.localize(datetime.datetime.utcnow()) - if delta.total_seconds() <= 0: - logging.info("StopAtTimeHook reached stop_time; requesting stop") - run_context.request_stop() - self._stop_requested = True - - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested diff --git a/twml/twml/contrib/initializers.docx b/twml/twml/contrib/initializers.docx new file mode 100644 index 000000000..cf805ed46 Binary files /dev/null and b/twml/twml/contrib/initializers.docx differ diff --git a/twml/twml/contrib/initializers.py b/twml/twml/contrib/initializers.py deleted file mode 100644 index 52bad3a19..000000000 --- a/twml/twml/contrib/initializers.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np -import tensorflow.compat.v1 as tf - - -TWML_INIT_FEED_KEY = "TWML_INIT_FEED_COLLECTION" - - -class PartitionConstant(tf.keras.initializers.Constant): - """A constant initializer that supports partitions""" - - def __call__(self, shape, dtype=None, partition_info=None): - if partition_info is not None: - if not isinstance(self.value, np.ndarray): - raise ValueError( - "Currently, PartitionConstant only supports " - "partitioning on np.ndarrays. Got {}".format(type(self.value).__name__)) - offsets = partition_info.var_offset - indices = tuple([slice(offset, offset + size) for offset, size in zip(offsets, shape)]) - subset = self.value[indices] - return subset - else: - return self.value - - -partition_constant_initializer = PartitionConstant - - -class PlaceholderInitializer(tf.keras.initializers.Initializer): - """A placeholder initializer that supports partitions""" - - def __init__(self, shape, dtype): - self.dtype = dtype - self.value = tf.placeholder(dtype=dtype, shape=shape) - - def __call__(self, shape, dtype=None, partition_info=None): - if partition_info is not None: - if self.dtype != dtype: - raise ValueError("dtype does not match placeholder dtype") - offsets = partition_info.var_offset - indices = tuple([slice(offset, offset + size) for offset, size in zip(offsets, shape)]) - subset = self.value[indices] - return subset - else: - return self.value - - -def get_init_feed_dict(): - """Get the init feed dictionary to be used when running the init op.""" - # Get the reference to the collection. 
- init_feed_collection = tf.get_collection(TWML_INIT_FEED_KEY) - init_feed_dict = {} - for d in init_feed_collection: - init_feed_dict.update(d) - return init_feed_dict - - -def clear_init_feed_collection(): - """Clear the init feed collection.""" - init_feed_collection = tf.get_collection_ref(TWML_INIT_FEED_KEY) - while init_feed_collection: - init_feed_collection.pop() diff --git a/twml/twml/contrib/layers/__init__.docx b/twml/twml/contrib/layers/__init__.docx new file mode 100644 index 000000000..e62ad0a4f Binary files /dev/null and b/twml/twml/contrib/layers/__init__.docx differ diff --git a/twml/twml/contrib/layers/__init__.py b/twml/twml/contrib/layers/__init__.py deleted file mode 100644 index aa6e7d7e4..000000000 --- a/twml/twml/contrib/layers/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# pylint: disable=wildcard-import -""" This module contains all contrib Layers. """ - -from .hashed_percentile_discretizer import HashedPercentileDiscretizer # noqa: F401 -from .hashing_discretizer import HashingDiscretizer # noqa: F401 -from .mask_layer import MaskLayer # noqa: F401 -from .embedding_lookup import EmbeddingLookup # noqa: F401 -from .factorization_machine import FactorizationMachine # noqa: F401 -from .full_dense import full_dense, FullDense # noqa: F401 -from .stacked_rnn import StackedRNN, stacked_rnn # noqa: F401 -from .zscore_normalization import ZscoreNormalization, zscore_normalization # noqa: F401 diff --git a/twml/twml/contrib/layers/embedding_lookup.docx b/twml/twml/contrib/layers/embedding_lookup.docx new file mode 100644 index 000000000..8a27345ec Binary files /dev/null and b/twml/twml/contrib/layers/embedding_lookup.docx differ diff --git a/twml/twml/contrib/layers/embedding_lookup.py b/twml/twml/contrib/layers/embedding_lookup.py deleted file mode 100644 index c83dc7edd..000000000 --- a/twml/twml/contrib/layers/embedding_lookup.py +++ /dev/null @@ -1,419 +0,0 @@ -import os -import re -import time - -from collections import OrderedDict - -from absl import logging -import numpy as np -import tensorflow.compat.v1 as tf -from tensorflow.python.ops.lookup_ops import index_table_from_tensor - -import twml - -# Padding is 0, UNK is 1: -PAD_WORD_ID = 0 -OOV_WORD_ID = 1 - - -def load_initializers_from_csv( - embedding_path, vocab_size=-1, embedding_size=None, separator=None, vocab=None -): - """ - Loads embeddings saved in the `glove format `_. - The glove format is a txt file separated by spaces. - Each line looks like: "word 0.00001 0.2334 ...". - - Arguments: - embedding_path: - path to the embeddings file on HDFS (hdfs://default/...) - or its local_path (/path/to/...). - The embedding_path may also specify a pattern. In which case, the embeddings - are read in the lexical order of the filenames that match the order. - vocab_size: - the maximum size of the vocabulary. The top ``vocab_size`` words in the file - are included in the vocabulary. If you specify a positive vocab_size, - the words are expected to be in descending order of frequency. - This allows the embeddings to be easily filtered to top vocab_size words. - Reducing the vocab_size acts as a regularizer, preventing the model to overfit on rarer words. - A negative vocab_size loads all embeddings. - Reducing the vocab_size may also help with memory issues, - allowing the embedding initializers to fit inside the graph. - embedding_size: - Defaults to None. If None, the embedding size is infered from the file name. 
- For example, ``glove.300d.txt`` and ``glove300d200.txt`` will both be inferred - as ``embedding_size=300``. If this can't be done, the ``embedding_size`` is - inferred from the first line in the file. If ``embedding_size`` is provided, - only the last ``embedding_size`` values of each line are considered. This - allows the line parser to recover from partial word parsing errors. - separator: - Specifies the separator to use when splitting each line into values. - Default value is a whitespace (same as glove format). - vocab: - OrderedDict mapping words to np.array embedding vectors. Initializes the vocabulary. - Duplicate words found in the file are ignored. - Defaults to a vocabulary of two words:: - - vocab = OrderedDict() - vocab[''] = np.random.randn(embedding_size) - vocab[''] = np.random.randn(embedding_size) - - Returns: - tuple of (vocab_initializer, weight_initializer, shape) - - vocab_initializer: - A tf.constant_initializer containing a vector of word strings of size vocab_size. - weight_initializer: - A twml.contrib.initializers.partition_constant_initializer containing - the weight matrix of embeddings of size vocab_size x embedding_size. - shape: - A tuple of (vocab_size, embedding_size). - - """ - - start = time.time() - - embedding_path = twml.util.sanitize_hdfs_path(embedding_path) - - is_user_vocab = True - if vocab is None: - vocab = OrderedDict() - vocab[''] = True - vocab[''] = True - is_user_vocab = False - elif not isinstance(vocab, OrderedDict): - raise RuntimeError( - "Expecting vocab argument of type OrderedDict or None. " - "Got type %s instead." % type(vocab).__name__ - ) - - if embedding_size is None: - embedding_file = os.path.basename(embedding_path) - match = re.search(r"[^\d]([\d]+)d", embedding_file) - if match is not None: - embedding_size = int(match.group(1)) - - if embedding_size is not None and not isinstance(embedding_size, int): - raise RuntimeError( - "Expecting embedding_size argument of type int or None. " - "Got type %s, instead." % type(embedding_size).__name__ - ) - - embedding_paths = sorted(tf.io.gfile.glob(embedding_path)) - - if len(embedding_paths) > 1: - raise ValueError( - "You are most likely using the wrong --embedding.path" - ) - - embedding_path = embedding_paths[0] - logging.info("Reading embeddings file from path %s.." % embedding_path) - - with tf.io.gfile.GFile(embedding_path) as f: - lines = f.readlines() - - logging.info("Done reading embeddings file from path %s." % embedding_path) - - logging.info("Parsing vocabulary and embeddings...") - - for line in lines: - # Word and weights separated by space - values = line.strip().split(separator) - # Word is first symbol on each line - word = values[0] - - if word not in vocab: - if embedding_size is None or embedding_size <= 0: - # get all elements after the first one. - word_weights = values[1:] - embedding_size = len(word_weights) - else: - # get the last embedding_size elements - word_weights = values[-min(embedding_size, len(values) - 1) :] - - try: - if len(word_weights) != embedding_size: - raise ValueError - - word_weights = np.asarray(word_weights, dtype=np.float32) - vocab[word] = word_weights - except ValueError: - logging.info("Wasn't able to load embeddings for word '%s'. 
Ignoring it" % word) - - vocab_len = len(vocab) - if vocab_size > 0 and vocab_len == vocab_size: - # Limit vocabulary to top terms - break - elif (vocab_len % 1000) == 0: - logging.info("Loaded %d words into vocab" % vocab_len) - - else: - logging.info("found duplicate word: %s" % word) - - if not is_user_vocab: - vocab[''] = np.random.randn(embedding_size) - vocab[''] = np.random.randn(embedding_size) - - words = list(vocab.keys()) - weights = list(vocab.values()) - - weights = np.asarray(weights, dtype=np.float32) - assert weights.shape[0] == len(vocab) - assert weights.shape[1] == embedding_size - - vocab_initializer = tf.constant_initializer(words, tf.string) - weight_initializer = twml.contrib.initializers.PartitionConstant(weights, tf.float32) - - logging.info("Loaded %d embeddings in %d seconds." % (len(vocab), time.time() - start)) - return vocab_initializer, weight_initializer, weights.shape - - -def add_parser_arguments(parser): - """ - Adds the embedding.path and embedding.vocab_size command-line arguments to the parser. - These can be used to call an initializer loader function like - the ``load_initializers_from_csv`` function. - - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - - parser.add_argument( - "--embedding.path", - "--embedding_path", - dest="embedding_path", - type=str, - default=None, - help="When specified, loads glove embeddings from .txt glove file", - ) - parser.add_argument( - "--embedding.vocab_size", - "--embedding_vocab_size", - dest="embedding_vocab_size", - type=int, - default=-1, - help="Size of vocabulary. Uses this many of the most frequent terms. Defaults to -1 (use full vocab).", - ) - - return parser - - -class EmbeddingLookup(twml.layers.Layer): - """Layer for looking up embeddings. - Transforms a sequence of strings to a sequence of embeddings. - - Arguments: - vocab_size: - The number of word strings and embeddings in the vocabulary. - output_size: - Long or Integer, dimensionality of the output space. The embedding vector size. - vocab_initializer: - Initializer function for the vocabulary. Required. The initializer should - return a list of strings of size vocab_size. - weight_initializer: - Initializer function for the weight matrix of size vocab_size x output_size. - This argument defaults to zeros_initializer(). - This is valid when the EmbeddingLookup is the first layer of - parameters but should be changed otherwise. - trainable: - Boolean, if `True` adds variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - Defaults to True: trains the embeddings. - num_oov_buckets: - The number of buckets to use for OOV strings. These bucket ids occur after the vocab bucket - ids. Hashing is used to assign OOV strings to these buckets. If `num_oov_buckets` is not - specified, index `OOV_WORD_ID` is used for OOV strings. - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column, does not support yet) - weight_regularizer: - Regularizer function for the weight matrix. 
- Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - dtype: - Defaults to tf.float32. Specifies the dtype of the weights. - use_placeholder: - Defaults to True. - If set to `True`, the initializer is passed via a placeholder. The initializer in this case needs to be of type `keras.initializers.Constant`. - If set to `False`, the initializer becomes part of the graph. This can sometimes be beyond what protobuf clients support. - checkpoint_dir: - Default to None. - If set to the path of a checkpoint, load embedding from the checkpoint. - convert_to_lowercase: - Default to True. - Converting all string inputs to lowercase. - - Notes: If `use_placeholder` is set to `True`, the feed dictionary can be accessed by calling `twml.contrib.initializers.get_init_feed_dict()`. - """ - - def __init__( - self, - vocab_size, - output_size, - vocab_initializer, - weight_initializer=None, - trainable=True, - num_oov_buckets=None, - oov_word_id=None, - name=None, - num_partitions=1, - partition_axis=0, - weight_regularizer=None, - dtype=None, - use_placeholder=True, - checkpoint_dir=None, - convert_to_lowercase=True, - **kwargs, - ): - if dtype is None: - # prevents a bug where the parent class defaults to the type of the first input tensor. - dtype = tf.float32 - super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs) - # Weights initialization is set to 0s. This is safe for full sparse layers because - # you are supposed to learn your embedding from the label. - - is_constant_init = isinstance(weight_initializer, tf.keras.initializers.Constant) - if use_placeholder and (not is_constant_init) and (weight_initializer is not None): - raise ValueError("Weight initializer should be a `Constant` or `None`.") - - if weight_initializer is None: - self.weight_initializer = tf.zeros_initializer() - else: - self.weight_initializer = weight_initializer - self.use_placeholder = use_placeholder - self.checkpoint_dir = checkpoint_dir - self.convert_to_lowercase = convert_to_lowercase - - self.vocab_initializer = vocab_initializer - self.vocab_size = vocab_size - self.output_size = output_size - self.num_partitions = num_partitions - self.partition_axis = partition_axis - self.weight_regularizer = weight_regularizer - self.trainable = trainable - self.oov_word_id = oov_word_id - self.num_oov_buckets = num_oov_buckets - - if self.oov_word_id is not None and self.num_oov_buckets is not None: - raise ValueError("At most one of oov_word_id or num_oov_buckets should be specified") - elif self.oov_word_id is None and self.num_oov_buckets is None: - self.oov_word_id = OOV_WORD_ID # use the default OOV word id - - if partition_axis != 0: - raise NotImplementedError("embedding_lookup only supports partition_axis = 0") - - def build(self, input_shapes): - """ - creates the ``vocab`` and ``weight`` Variables - of shape ``[vocab_size]`` and ``[vocab_size, output_size]`` respectively. 
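
The constructor constraints described above can be summarized with a small, hypothetical instantiation; ``my_vocab_init`` and ``pretrained_weights`` are stand-ins for real initializer objects and a real weight matrix, and this is a sketch rather than a tested recipe::

    import tensorflow.compat.v1 as tf

    layer = EmbeddingLookup(
        vocab_size=10000,
        output_size=300,
        vocab_initializer=my_vocab_init,
        # with use_placeholder=True (the default) the weight initializer must be
        # a tf.keras.initializers.Constant or None (which falls back to zeros)
        weight_initializer=tf.keras.initializers.Constant(pretrained_weights),
        num_oov_buckets=4,   # at most one of num_oov_buckets / oov_word_id may be set
    )
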
- """ - partitioner = None - - additional_buckets_for_oov = self.num_oov_buckets if self.num_oov_buckets is not None else 0 - shape = [self.vocab_size + additional_buckets_for_oov, self.output_size] - - if self.use_placeholder: - embedding_weight_initializer = twml.contrib.initializers.PlaceholderInitializer( - shape, self.dtype - ) - tf.add_to_collection( - twml.contrib.initializers.TWML_INIT_FEED_KEY, - {embedding_weight_initializer.value: self.weight_initializer.value}, - ) - else: - embedding_weight_initializer = self.weight_initializer - - if self.num_partitions: - partition_axis = int(self.partition_axis) - partitioner = tf.fixed_size_partitioner(self.num_partitions, axis=partition_axis) - else: - # Regular variables do not like it when you pass both constant tensors and shape - if not callable(self.weight_initializer): - shape = None - - self.vocab = self.add_variable( - 'vocab', - initializer=self.vocab_initializer, - shape=[self.vocab_size], - dtype=tf.string, - trainable=False, - ) - - self.weight = self.add_variable( - 'weight', - initializer=None if self.checkpoint_dir is not None else embedding_weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=self.trainable, - partitioner=partitioner, - ) - if self.checkpoint_dir is not None: - twml.trainers.trainer.init_from_checkpoint(self.checkpoint_dir, {'weight': self.weight.name}) - - self.built = True - - def call( - self, inputs, debug=False, oov_summaries=False, **kwargs - ): # pylint: disable=unused-argument - """Converts word strings to word ids using the vocabulary lookup table. - Then converts the word ids to their commensurate embedding vector. - - Arguments: - inputs: - A tensor of word strings. Typically, of size batch_size x seq_len. - debug: - When True, prints the input strings and their commensurate input_ids. - Defaults to False. - oov_summaries: - When True, log the out-of-vocabulary (OOV) rate to TensorBoard - Defaults to False. - - Returns: - The mapping of input word strings to output embedding vectors. - Given an input of shape ``batch_size x seq_len``, the output has shape - ``batch_size x seq_len x embedding_size``. 
- """ - if self.convert_to_lowercase: - inputs = tf.strings.lower(inputs) - if self.num_oov_buckets is None: - lookup_table = index_table_from_tensor(self.vocab, default_value=self.oov_word_id) - else: - lookup_table = index_table_from_tensor(self.vocab, num_oov_buckets=self.num_oov_buckets) - input_ids = lookup_table.lookup(inputs) - - if oov_summaries: - oov_count = tf.reduce_sum( - tf.cast(tf.math.equal(input_ids, self.oov_word_id), tf.dtypes.float32) - ) - valid_count = tf.reduce_sum( - tf.cast(tf.math.not_equal(input_ids, PAD_WORD_ID), tf.dtypes.float32) - ) - oov_rate = oov_count / valid_count - tf.summary.scalar('OOV_rate', oov_rate) - - if debug: - - def print_debug(): - return tf.print("input_strings:", inputs, "\ninput_ids: ", input_ids, summarize=140) - - with tf.control_dependencies([twml.util.do_every_n_steps(print_debug, 1000)]): - input_ids = tf.identity(input_ids) - - output_embeddings = tf.nn.embedding_lookup( - params=self.weight, ids=input_ids, partition_strategy='div' - ) - - output_shape = inputs.shape.concatenate(tf.TensorShape([self.output_size])) - output_embeddings.set_shape(output_shape) - - return output_embeddings diff --git a/twml/twml/contrib/layers/factorization_machine.docx b/twml/twml/contrib/layers/factorization_machine.docx new file mode 100644 index 000000000..0ee332e10 Binary files /dev/null and b/twml/twml/contrib/layers/factorization_machine.docx differ diff --git a/twml/twml/contrib/layers/factorization_machine.py b/twml/twml/contrib/layers/factorization_machine.py deleted file mode 100644 index 3b8adae42..000000000 --- a/twml/twml/contrib/layers/factorization_machine.py +++ /dev/null @@ -1,179 +0,0 @@ -# pylint: disable=no-member, arguments-differ, attribute-defined-outside-init, unused-argument -""" -Implementing factorization Layer -""" - -from twitter.deepbird.sparse.sparse_ops import _pad_empty_outputs - -import tensorflow.compat.v1 as tf -import twml -from twml.layers.layer import Layer - - -class FactorizationMachine(Layer): - """factorization machine layer class. - This layer implements the factorization machine operation. - The paper is "Factorization Machines" by Steffen Rendle. - TDD: go/tf-fm-tdd - - Arguments: - num_latent_variables: - num of latent variables - The number of parameter in this layer is num_latent_variables x n where n is number of - input features. - weight_initializer: - Initializer function for the weight matrix. - This argument defaults to zeros_initializer(). - This is valid when the FullSparse is the first layer of - parameters but should be changed otherwise. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). 
If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. - This parameter can also be a list of binary values if `inputs` passed to `call` a list. - """ - - def __init__(self, - num_latent_variables=10, - weight_initializer=None, - activation=None, - trainable=True, - name=None, - use_sparse_grads=True, - use_binary_values=False, - weight_regularizer=None, - substract_self_cross=True, - **kwargs): - super(FactorizationMachine, self).__init__(trainable=trainable, name=name, **kwargs) - - if weight_initializer is None: - weight_initializer = tf.zeros_initializer() - self.weight_initializer = weight_initializer - self.num_latent_variables = num_latent_variables - self.activation = activation - self.use_sparse_grads = use_sparse_grads - self.use_binary_values = use_binary_values - self.weight_regularizer = weight_regularizer - self.substract_self_cross = substract_self_cross - - def build(self, input_shape): - """ - creates``weight`` Variable of shape``[input_size, num_latent_variables]``. - - """ - - shape = [input_shape[1], self.num_latent_variables] - - # There is a 2GB limitation for each tensor because of protobuf. - # 2**30 is 1GB. 2 * (2**30) is 2GB. - dtype = tf.as_dtype(self.dtype) - requested_size = input_shape[1] * self.num_latent_variables * dtype.size - if (requested_size >= 2**31): - raise ValueError("Weight tensor can not be larger than 2GB. " % - "Requested Dimensions(%d, %d) of type %s (%d bytes total)" - (input_shape[1], self.num_latent_variables, dtype.name)) - - if not callable(self.weight_initializer): - shape = None - - # dense tensor - self.weight = self.add_variable( - 'weight', - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=True, - ) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - A SparseTensor - Returns: - - If `inputs` is `SparseTensor`, then returns a number with cross info - """ - # The following are given: - # - inputs is a sparse tensor, we call it sp_x. - # - The dense_v tensor is a dense matrix, whose row i - # corresponds to the vector V_i. 
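
The 2 GB protobuf limit checked in ``build`` translates into a simple bound on the weight shape; for example, with float32 weights and an illustrative feature count::

    input_size = 10_000_000        # number of sparse input features (example value)
    num_latent_variables = 50
    bytes_per_float32 = 4

    requested_bytes = input_size * num_latent_variables * bytes_per_float32
    print(requested_bytes / 2**30)   # ~1.86 GB, just under the 2**31-byte limit
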
- # weights has shape [num_features, k] - sp_x = inputs - if isinstance(inputs, twml.SparseTensor): - sp_x = inputs.to_tf() - elif not isinstance(sp_x, tf.SparseTensor): - raise TypeError("The sp_x must be of type tf.SparseTensor or twml.SparseTensor") - - indices = sp_x.indices[:, 1] - batch_ids = sp_x.indices[:, 0] - values = tf.reshape(sp_x.values, [-1, 1], name=self.name) - if self.use_sparse_grads: - v = tf.nn.embedding_lookup(self.weight, indices) - # if (self.use_binary_values): - # values = tf.ones(tf.shape(values), dtype=values.dtype) - v_times_x = v * values - # First term: Sum_k [Sum_i (v_ik * x_i)]^2 - all_crosses = tf.segment_sum(v_times_x, batch_ids, name=self.name) - all_crosses_squared = tf.reduce_sum((all_crosses * all_crosses), 1) - - if self.substract_self_cross: - # Second term: Sum_k Sum_i [ (v_ik * x_i)^2 ] - v_times_x_2 = v_times_x**2 - self_crosses = tf.reduce_sum(tf.segment_sum(v_times_x_2, batch_ids, name=self.name), 1) - outputs = all_crosses_squared - self_crosses - else: - outputs = all_crosses_squared - else: - # need to check if prediction is faster with code below - crossTerm = tf.reduce_sum((tf.sparse_tensor_dense_matmul(sp_x, self.weight)**2), 1) - - if self.substract_self_cross: - # compute self-cross term - self_crossTerm = tf.reduce_sum(tf.segment_sum((tf.gather(self.weight, indices) * values)**2, batch_ids), 1) - outputs = crossTerm - self_crossTerm - else: - outputs = crossTerm - - if self.activation is not None: - outputs = self.activation(outputs) - - outputs = tf.reshape(outputs, [-1, 1], name=self.name) - outputs = _pad_empty_outputs(outputs, tf.cast(sp_x.dense_shape[0], tf.int32)) - # set more explicit and static shape to avoid shape inference error - # valueError: The last dimension of the inputs to `Dense` should be defined. Found `None` - outputs.set_shape([None, 1]) - return outputs diff --git a/twml/twml/contrib/layers/full_dense.docx b/twml/twml/contrib/layers/full_dense.docx new file mode 100644 index 000000000..d23300198 Binary files /dev/null and b/twml/twml/contrib/layers/full_dense.docx differ diff --git a/twml/twml/contrib/layers/full_dense.py b/twml/twml/contrib/layers/full_dense.py deleted file mode 100644 index ad78a91a4..000000000 --- a/twml/twml/contrib/layers/full_dense.py +++ /dev/null @@ -1,380 +0,0 @@ -# pylint: disable=no-member,arguments-differ, attribute-defined-outside-init -""" -Implementing Full Dense Layer -""" -from twml.layers import Layer - -import tensorflow.compat.v1 as tf -from tensorflow.python.layers import core - - -class FullDense(Layer): - """ - Full-connected, Dense input layer class. - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Where ``activation`` is the activation function passed as the ``activation`` - argument (if not ``None``), ``weight`` is a weights matrix created by the layer, - and ``bias`` is a bias vector created by the layer. - - However, this layer breaks up ``weight`` into ``num_partitions`` parts, - for the purpose of even disribution of weights across parameter servers - for distributed training. - - Note - This layer is created to allow distributed training optimizations, - but can also be used for single node training (e.g. hogwild) without - code modification - - Arguments: - output_size: - Integer or Long, dimensionality of the output space. - weight_initializer: - Initializer function for the weight matrix. - weight_regularizer: - Regularizer function for the weight matrix. 
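
The segment-sum formulation above relies on the "square of the sum minus the sum of squares" identity per latent dimension, which equals the full pairwise interaction term of Rendle's factorization machine (without a 1/2 factor). A small numpy check of that identity, illustrative only and with no sparse tensors involved::

    import numpy as np

    rng = np.random.default_rng(0)
    n_features, k = 6, 3
    V = rng.normal(size=(n_features, k))   # one latent vector per feature
    x = rng.normal(size=n_features)        # dense view of one example

    vx = V * x[:, None]                    # v_ik * x_i
    layer_style = np.sum(np.sum(vx, axis=0) ** 2 - np.sum(vx ** 2, axis=0))

    pairwise = sum(np.dot(V[i], V[j]) * x[i] * x[j]
                   for i in range(n_features) for j in range(n_features) if i != j)

    assert np.isclose(layer_style, pairwise)   # == 2 * sum_{i<j} <v_i, v_j> x_i x_j
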
- Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - num_partitions: - Number of pieces to partition the weights into. This layer does - column partitioning of the weights, which is equivalent to - processing the input tensor with multiple fully connected layers - of smaller output size, and then concatenating these outputs - activation: - Activation function (callable). Set it to None to maintain a linear activation. - use_bias: - Boolean whether to include a bias parameter in the layer - bias_initializer: - Initializer function for the bias. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - - Properties: - output_size: - Python integer, dimensionality of the output space. - activation: - Activation function (callable). - weight_initializer: - Initializer instance (or name) for the weight matrix. - bias_initializer: - Initializer instance (or name) for the bias. - weights: - list of underlying weight and bias matrix components. no guarantee on order of elements - weight_regularizer: - Regularizer instance for the weight matrix (callable) - bias_regularizer: - Regularizer instance for the bias (callable). - activity_regularizer: - Regularizer instance for the output (callable) - weight_constraint: - Constraint function for the weight matrix. - bias_constraint: - Constraint function for the bias. 
- """ - - def __init__(self, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=3, - activation=None, - use_bias=True, - bias_initializer=tf.zeros_initializer(), - bias_regularizer=None, - activity_regularizer=None, - trainable=True, - name=None, - **kwargs): - super(FullDense, self).__init__(trainable=trainable, name=name, **kwargs) - self._output_sizes = self._get_output_partition_sizes(output_size, num_partitions) - self._units = output_size - self._activation = activation - self._weight_initializer = weight_initializer - self._bias_initializer = bias_initializer - self._weight_regularizer = weight_regularizer - self._bias_regularizer = bias_regularizer - self._weight_constraint = weight_constraint - self._bias_constraint = bias_constraint - self._use_bias = use_bias - # NOTE - many initializers depend on fan_in and fan_out - # - as such, initialization here may be different than - # - for a non-partitioned FullDense - self._parts = [core.Dense(units=out_size, - activation=activation, - use_bias=use_bias, - kernel_initializer=weight_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=weight_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - **kwargs) for out_size in self._output_sizes] - - @staticmethod - def _get_output_partition_sizes(out_size, num_parts): - """ Returns the appropriate output sizes of the partitions """ - boundaries = [out_size * n // num_parts for n in range(num_parts + 1)] - return [k - j for j, k in zip(boundaries[:], boundaries[1:])] - - def build(self, input_shapes): - """ Create the appropriately sized weights and biases in each layer partition """ - if isinstance(input_shapes, (list, tuple)): - input_shape = input_shapes[0] - is_compatible = True - for other_shape in input_shapes[1:]: - is_compatible &= input_shape.is_compatible_with(other_shape) - if not is_compatible: - raise ValueError("Input shapes %s are not compatible." 
% input_shapes) - else: - input_shape = input_shapes - - for part in self._parts: - part.build(input_shape) - - self.built = True - - @property - def units(self): - """ Returns the number of output units of the layer """ - return self._units - - @property - def output_size(self): - """ Returns the number of output units of the layer """ - return self._units - - @property - def activation(self): - """ Returns the activation function """ - return self._activation - - @property - def weight_initializer(self): - """ Returns the weight_initializer """ - return self._weight_initializer - - @property - def weight_regularizer(self): - """ Returns the weight_regularizer """ - return self._weight_regularizer - - @property - def weight_constraint(self): - """ Returns the weight_constraint """ - return self._weight_constraint - - @property - def bias_initializer(self): - """ Returns the bias_initializer """ - return self._bias_initializer - - @property - def bias_regularizer(self): - """ Returns the bias_regularizer """ - return self._bias_regularizer - - @property - def bias_constraint(self): - """ Returns the bias_constraint """ - return self._bias_constraint - - @property - def use_bias(self): - """ Returns whether a bias is used in the layer """ - return self._use_bias - - @property - def trainable_variables(self): - """ Returns the trainable variables of the layer """ - trainable_vars = [] - for pt in self._parts: - trainable_vars += pt.trainable_variables - return trainable_vars - - @property - def trainable_weights(self): - """ Returns the trainable variables of the layer """ - return self.trainable_variables - - @property - def non_trainable_variables(self): - """ Returns the non-trainable variables of the layer """ - non_trainable_vars = [] - for pt in self._parts: - non_trainable_vars += pt.non_trainable_variables - return non_trainable_vars - - @property - def non_trainable_weights(self): - """ Returns the non-trainable variables of the layer """ - return self.non_trainable_variables - - @property - def variables(self): - """ Returns a list of all weights and biases in this layer """ - layer_vars = [] - for pt in self._parts: - layer_vars += pt.weights - return layer_vars - - @property - def weights(self): - """ Returns a list of all weights and biases in this layer """ - return self.variables - - @property - def dtype(self): - """ Returns the dtype of the layers weights """ - return self._parts[0].dtype - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - A dense Tensor or a list of such. - If `inputs` is a list, all tensors must have same `dense_shape`. - - Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + accumulate_n([sp_a * dense_b for sp_a in inputs])`. 
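
The column split is computed by ``_get_output_partition_sizes``; reproducing the same arithmetic for 10 output units over the default 3 partitions::

    def get_output_partition_sizes(out_size, num_parts):
        # same arithmetic as the static method above
        boundaries = [out_size * n // num_parts for n in range(num_parts + 1)]
        return [k - j for j, k in zip(boundaries, boundaries[1:])]

    print(get_output_partition_sizes(10, 3))   # [3, 3, 4] -> concatenated back to 10 columns
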
- """ - if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - outputs = [] - for inp in inputs: - part_outputs = [part(inp) for part in self._parts] - outputs.append(tf.concat(part_outputs, axis=-1)) - - return tf.accumulate_n(outputs) - - -def full_dense(inputs, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=3, - activation=None, - use_bias=True, - bias_initializer=tf.zeros_initializer(), - bias_regularizer=None, - activity_regularizer=None, - trainable=True, - name=None, - reuse=None, - **kwargs): - """Functional interface for the fully-connected dense-input layer. - This layer implements the operation: - `outputs = activation(inputs.weight + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `weight` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). - - However, this layer breaks up ``weight`` into ``num_partitions`` parts, - for the purpose of even disribution of weights across parameter servers - for distributed training. - - Note - This layer is created to allow distributed training optimizations, - but can also be used for single node training (e.g. hogwild) without - code modification - - Arguments: - inputs: Tensor input. - output_size: Integer or Long, dimensionality of the output space. - weight_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.get_variable`. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - num_partitions: - Number of pieces to partition the weights into. This layer does - column partitioning of the weights, which is equivalent to - processing the input tensor with multiple fully connected layers - of smaller output size, and then concatenating these outputs - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - bias_initializer: - Initializer function for the bias. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - trainable: - Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: - String, the name of the layer. - reuse: - Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor with shape `inputs.shape[:-1] + [output_size]`. 
- """ - if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - dtype = inputs[0].dtype.base_dtype - - layer = FullDense(output_size=output_size, - weight_initializer=weight_initializer, - weight_regularizer=weight_regularizer, - weight_constraint=weight_constraint, - bias_constraint=bias_constraint, - num_partitions=num_partitions, - activation=activation, - use_bias=use_bias, - bias_initializer=bias_initializer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - trainable=trainable, - name=name, - dtype=dtype, - _scope=name, - _reuse=reuse, - **kwargs) - - return layer(inputs) diff --git a/twml/twml/contrib/layers/hashed_percentile_discretizer.docx b/twml/twml/contrib/layers/hashed_percentile_discretizer.docx new file mode 100644 index 000000000..08587d88e Binary files /dev/null and b/twml/twml/contrib/layers/hashed_percentile_discretizer.docx differ diff --git a/twml/twml/contrib/layers/hashed_percentile_discretizer.py b/twml/twml/contrib/layers/hashed_percentile_discretizer.py deleted file mode 100644 index b32c3be8d..000000000 --- a/twml/twml/contrib/layers/hashed_percentile_discretizer.py +++ /dev/null @@ -1,217 +0,0 @@ -# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes -""" -Implementing HashedPercentileDiscretizer Layer -""" - - -from twitter.deepbird.util.hashing import ( - integer_multiplicative_hashing_uniform, - integer_multiplicative_hashing, -) # noqa: F401 - -from libtwml import percentile_discretizer_bin_indices -import numpy as np -import tensorflow.compat.v1 as tf -import twml -from twml.layers.layer import Layer -from twml.layers.partition import Partition -from twml.layers.stitch import Stitch - - -class HashedPercentileDiscretizer(Layer): - """ - HashedPercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator - after accumulating data - and performing minimum description length (PercentileDiscretizer) calibration. - - HashedPercentileDiscretizer takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an HashedPercentileDiscretizer - bin. - Each HashedPercentileDiscretizer input feature is converted to n_bin bins. - Each HashedPercentileDiscretizer calibration tries to find bin delimiters such - that the number of features values - per bin is roughly equal (for each given HashedPercentileDiscretizer feature). - Note that if an input feature is rarely used, so will its associated output bin/features. - The difference between this layer and PercentileDiscretizer is that the - DeterministicPercentileDiscretize always assigns the same output id in the SparseTensor to the - same input feature id + bin. This is useful if you want to user transfer learning on pre-trained - sparse to dense embedding layers, but re-calibrate your discretizer on newer data. - """ - - def __init__(self, n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, - hash_fn=integer_multiplicative_hashing_uniform, **kwargs): - """ - Creates a non-initialized `HashedPercentileDiscretizer` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during HashedPercentileDiscretizer calibration. 
- This is the number of features in the hash map. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of HashedPercentileDiscretizer bins used for - HashedPercentileDiscretizer calibration. Used to initialize bin_values, hash_keys, - hash_values, bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the features ID that HashedPercentileDiscretizer discretizes and knows - about. The hash map (hash_keys->hash_values) is used for two reasons: - 1. divide inputs into two feature spaces: - HashedPercentileDiscretizer vs non-HashedPercentileDiscretizer - 2. transate the HashedPercentileDiscretizer features into a hash_feature ID that - HashedPercentileDiscretizer understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for HashedPercentileDiscretizer. - bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the HashedPercentileDiscretizer features will be translated to. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, it's value bin's are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and a inputs value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). - hash_fn: - a function that takes in `feature_ids`, `bucket_indices` and `output_size` and - hashes the bucketed features into the `output_size` buckets. The default uses knuth's - multiplicative hashing - """ - super(HashedPercentileDiscretizer, self).__init__(**kwargs) - - max_discretizer_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - if not self.built: - self.build(input_shape=None) - - # build variables - self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - self._out_bits = out_bits - - hash_keys = hash_keys - if hash_keys is None: - hash_keys = np.empty(n_feature, dtype=np.int64) - - hash_values = hash_values - if hash_values is None: - hash_values = np.empty(n_feature, dtype=np.int64) - - initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) - self.hash_map = tf.lookup.StaticHashTable(initializer, -1) - self.bin_ids = bin_ids - if bin_ids is None: - bin_ids = np.empty(max_discretizer_feature, dtype=np.int64) - - self.bin_values = bin_values - if bin_values is None: - bin_values = np.empty(max_discretizer_feature, dtype=np.float32) - - self.feature_offsets = feature_offsets - if feature_offsets is None: - feature_offsets = np.empty(n_feature, dtype=np.int64) - - self.hash_fn = hash_fn - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer: - hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. 
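
The bin layout described above can be pictured with a tiny example; the real lookup happens inside the ``percentile_discretizer_bin_indices`` C++ op, so the boundaries and indexing below are only an illustration of the data layout::

    import numpy as np

    n_bin = 4
    j = 2                                                  # hash_feature ID of a calibrated feature
    bin_values = np.arange(3 * n_bin, dtype=np.float32)    # made-up boundaries for 3 features

    value = 9.5
    feature_bins = bin_values[j * n_bin:(j + 1) * n_bin]   # boundaries for feature j: [8, 9, 10, 11]
    bucket = np.searchsorted(feature_bins, value, side='right') - 1   # falls in bin 1 (between 9 and 10)
    bin_index = j * n_bin + bucket                         # index into bin_ids
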
- """ - # build layers - self.partition = Partition() - self.stitch = Stitch() - # make sure this is last - self.built = True - - def call(self, inputs, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements HashedPercentileDiscretizer inference where inputs are intersected with a - hash_map. - Part of the inputs are discretized using twml.discretizer - to produce a discretizer_output SparseTensor. - This SparseTensor is then joined with the original inputs SparseTensor, - but only for the inputs keys that did not get discretized. - - Args: - inputs: A 2D SparseTensor that is input to HashedPercentileDiscretizer for - discretization. It has a dense_shape of [batch_size, input_size] - name: A name for the operation (optional). - Returns: - A `SparseTensor` of the same type as `inputs`. - Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. - """ - if isinstance(inputs, tf.SparseTensor): - inputs = twml.SparseTensor.from_tf(inputs) - - assert(isinstance(inputs, twml.SparseTensor)) - - # sparse column indices - ids = inputs.ids - # sparse row indices - keys = inputs.indices - # sparse values - vals = inputs.values - - hashed_keys = self.hash_map.lookup(keys) - hashed_keys = tf.cast(hashed_keys, tf.int64) - - found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) - partition_ids = tf.cast(found, tf.int32) - - found = tf.reshape(found, [-1]) - continuous_feature_ids = tf.boolean_mask(keys, found) - - vals, key, indices = self.partition(partition_ids, vals, tf.where(found, hashed_keys, keys)) - non_discretizer_keys, discretizer_in_keys = key - non_discretizer_vals, discretizer_in_vals = vals - - non_discretizer_keys = twml.util.limit_bits(non_discretizer_keys, self._out_bits) - self.non_discretizer_keys = non_discretizer_keys - - # run HashedPercentileDiscretizer on the keys/values it knows about - output = percentile_discretizer_bin_indices(discretizer_in_keys, - discretizer_in_vals, - self.bin_ids, - self.bin_values, - self.feature_offsets) - discretizer_bucket_idxs, discretizer_vals = output - new_discretizer_keys = self.hash_fn(continuous_feature_ids, discretizer_bucket_idxs, - self.output_size) - # Stitch the keys and values from discretizer and non discretizer indices back, with help - # of the Stitch Layer - self.discretizer_out_keys = new_discretizer_keys - - concat_data = self.stitch([non_discretizer_vals, discretizer_vals], - [non_discretizer_keys, new_discretizer_keys], - indices) - - concat_vals, concat_keys = concat_data - - # Generate output shape using _compute_output_shape - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self.output_size] - return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() diff --git a/twml/twml/contrib/layers/hashing_discretizer.docx b/twml/twml/contrib/layers/hashing_discretizer.docx new file mode 100644 index 000000000..7cab90c27 Binary files /dev/null and b/twml/twml/contrib/layers/hashing_discretizer.docx differ diff --git a/twml/twml/contrib/layers/hashing_discretizer.py b/twml/twml/contrib/layers/hashing_discretizer.py deleted file mode 100644 index 2a8244f4b..000000000 --- a/twml/twml/contrib/layers/hashing_discretizer.py +++ /dev/null @@ -1,156 +0,0 @@ -# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes -""" -Implementing HashingDiscretizer Layer -""" - - -import libtwml -import tensorflow.compat.v1 as tf -import twml -from twml.constants import HashingDiscretizerOptions -from twml.layers.layer 
import Layer - - -class HashingDiscretizer(Layer): - """A layer that discretizes continuous features, with hashed feature assignments - - HashingDiscretizer converts sparse continuous features into sparse - binary features. Each binary output feature indicates the presence of a - value in a HashingDiscretizer bin. - - Each calibrated HashingDiscretizer input feature is converted to n_bin+1 bins. - - - n_bin bin boundaries for each feature (i.e. len(bin_vals[id])==n_bin) defines n_bin+1 bins - - bin assignment = sum(bin_vals 0: - # pass all inputs to the c++ op - # the op determines whether to discretize (when a feature is calibrated), - # or whether to simply limit bits and pass through (when not calibrated) - # NOTE - Hashing is done in C++ - discretizer_keys, discretizer_vals = libtwml.ops.hashing_discretizer( - input_ids=keys, # Input - input_vals=vals, # Input - bin_vals=self._bin_vals, # Input - feature_ids=tf.make_tensor_proto(self._feature_ids), # Attr - n_bin=self._n_bin, # Attr - output_bits=self._out_bits, # Attr - cost_per_unit=self.cost_per_unit, # Attr - options=self._options, # Attr - ) - else: - discretizer_keys = twml.util.limit_bits(keys, self._out_bits) - discretizer_vals = vals - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_size = tf.convert_to_tensor(1 << self._out_bits, tf.int64) - output_shape = [batch_size, output_size] - - return twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf() diff --git a/twml/twml/contrib/layers/mask_layer.docx b/twml/twml/contrib/layers/mask_layer.docx new file mode 100644 index 000000000..dbbc06a1a Binary files /dev/null and b/twml/twml/contrib/layers/mask_layer.docx differ diff --git a/twml/twml/contrib/layers/mask_layer.py b/twml/twml/contrib/layers/mask_layer.py deleted file mode 100644 index f5e788c7b..000000000 --- a/twml/twml/contrib/layers/mask_layer.py +++ /dev/null @@ -1,29 +0,0 @@ -from twml.contrib.pruning import apply_mask -from twml.layers import Layer - - -class MaskLayer(Layer): - """ - This layer corresponds to `twml.contrib.pruning.apply_mask`. - - It applies a binary mask to mask out channels of a given tensor. The masks can be - optimized using `twml.contrib.trainers.PruningDataRecordTrainer`. - """ - - def call(self, inputs, **kwargs): - """ - Applies a binary mask to the channels of the input. - - Arguments: - inputs: - input tensor - **kwargs: - additional keyword arguments - - Returns: - Masked tensor - """ - return apply_mask(inputs) - - def compute_output_shape(self, input_shape): - return input_shape diff --git a/twml/twml/contrib/layers/stacked_rnn.docx b/twml/twml/contrib/layers/stacked_rnn.docx new file mode 100644 index 000000000..e3a0ad212 Binary files /dev/null and b/twml/twml/contrib/layers/stacked_rnn.docx differ diff --git a/twml/twml/contrib/layers/stacked_rnn.py b/twml/twml/contrib/layers/stacked_rnn.py deleted file mode 100644 index e05f5d853..000000000 --- a/twml/twml/contrib/layers/stacked_rnn.py +++ /dev/null @@ -1,189 +0,0 @@ - -from twitter.deepbird.compat.v1.rnn import stack_bidirectional_dynamic_rnn - -import tensorflow.compat.v1 as tf -import tensorflow -import twml - - -def _get_rnn_cell_creator(cell_type): - if cell_type == "LSTM": - Cell = tf.nn.rnn_cell.LSTMCell - elif cell_type == "GRU": - Cell = tf.nn.rnn_cell.GRUCell - else: - raise ValueError("cell_type: %s is not supported." - "It should be one of 'LSTM' or 'GRU'." 
% cell_type) - return Cell - - -def _apply_dropout_wrapper(rnn_cells, dropout): - """ Apply dropout wrapper around each cell if necessary """ - if rnn_cells is None: - return None - - cells = [] - for i, dropout_rate in enumerate(dropout): - cell = rnn_cells[i] - if dropout_rate > 0: - cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=(1.0 - dropout_rate)) - cells.append(cell) - return cells - - -def _create_bidirectional_rnn_cell(num_units, dropout, cell_type): - scope_name = "lstm" if cell_type else "gru" - with tf.variable_scope(scope_name): - Cell = _get_rnn_cell_creator(cell_type) - cells_forward = [Cell(output_size) for output_size in num_units] - cells_backward = [Cell(output_size) for output_size in num_units] - cells_forward = _apply_dropout_wrapper(cells_forward, dropout) - cells_backward = _apply_dropout_wrapper(cells_backward, dropout) - - def stacked_rnn_cell(inputs, sequence_lengths): - with tf.variable_scope(scope_name): - outputs, final_states, _ = stack_bidirectional_dynamic_rnn( - cells_fw=cells_forward, cells_bw=cells_backward, inputs=inputs, - sequence_length=sequence_lengths, dtype=inputs.dtype) - return final_states[-1][-1] - - return stacked_rnn_cell - - -def _create_unidirectional_rnn_cell(num_units, dropout, cell_type): - scope_name = "lstm" if cell_type else "gru" - with tf.variable_scope(scope_name): - Cell = _get_rnn_cell_creator(cell_type) - cells = [Cell(output_size) for output_size in num_units] - cells = _apply_dropout_wrapper(cells, dropout) - multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells) - - def stacked_rnn_cell(inputs, sequence_lengths): - with tf.variable_scope(scope_name): - outputs, final_states = tf.nn.static_rnn( - multi_cell, - tf.unstack(inputs, axis=1), - dtype=inputs.dtype, - sequence_length=sequence_lengths) - return final_states[-1].h - - return stacked_rnn_cell - - -def _create_regular_rnn_cell(num_units, dropout, cell_type, is_bidirectional): - if is_bidirectional: - return _create_bidirectional_rnn_cell(num_units, dropout, cell_type) - else: - return _create_unidirectional_rnn_cell(num_units, dropout, cell_type) - - -class StackedRNN(twml.layers.Layer): - """ - Layer for stacking RNN modules. - This layer provides a unified interface for RNN modules that perform well on CPUs and GPUs. - - Arguments: - num_units: - A list specifying the number of units per layer. - dropout: - Dropout applied to the input of each cell. - If list, has to dropout used for each layer. - If number, the same amount of dropout is used everywhere. - Defaults to 0. - is_training: - Flag to specify if the layer is used in training mode or not. - cell_type: - Sepcifies the type of RNN. Can be "LSTM". "GRU" is not yet implemented. - is_bidirectional: - Specifies if the stacked RNN layer is bidirectional. - This is for forward compatibility, this is not yet implemented. - Defaults to False. 
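
A minimal use of the functional interface described above, assuming TF1 graph mode; the placeholders are stand-ins for a real input pipeline::

    import tensorflow.compat.v1 as tf

    inputs = tf.placeholder(tf.float32, [None, 20, 128])   # [batch, max_seq_len, embedding_size]
    sequence_lengths = tf.placeholder(tf.int32, [None])

    final_state = stacked_rnn(
        inputs, sequence_lengths,
        num_units=[64, 32],   # two stacked LSTM layers
        dropout=0.2,          # a scalar is broadcast to every layer
        cell_type="LSTM",     # "GRU" and bidirectional stacks are not implemented
    )
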
- """ - - def __init__(self, - num_units, - dropout=0, - is_training=True, - cell_type="LSTM", - is_bidirectional=False, - name="stacked_rnn"): - - super(StackedRNN, self).__init__(name=name) - - if (is_bidirectional): - raise NotImplementedError("Bidirectional RNN is not yet implemented") - - if (cell_type != "LSTM"): - raise NotImplementedError("Only LSTMs are supported") - - if not isinstance(num_units, (list, tuple)): - num_units = [num_units] - else: - num_units = num_units - - self.num_layers = len(num_units) - if not isinstance(dropout, (tuple, list)): - dropout = [dropout] * self.num_layers - else: - dropout = dropout - - self.is_training = is_training - - is_gpu_available = twml.contrib.utils.is_gpu_available() - same_unit_size = all(size == num_units[0] for size in num_units) - same_dropout_rate = any(val == dropout[0] for val in dropout) - - self.stacked_rnn_cell = None - self.num_units = num_units - self.dropout = dropout - self.cell_type = cell_type - self.is_bidirectional = is_bidirectional - - def build(self, input_shape): - self.stacked_rnn_cell = _create_regular_rnn_cell(self.num_units, - self.dropout, - self.cell_type, - self.is_bidirectional) - - def call(self, inputs, sequence_lengths): - """ - Arguments: - inputs: - A tensor of size [batch_size, max_sequence_length, embedding_size]. - sequence_lengths: - The length of each input sequence in the batch. Should be of size [batch_size]. - Returns: - final_output - The output of at the end of sequence_length. - """ - return self.stacked_rnn_cell(inputs, sequence_lengths) - - -def stacked_rnn(inputs, sequence_lengths, num_units, - dropout=0, is_training=True, - cell_type="LSTM", is_bidirectional=False, name="stacked_rnn"): - """Functional interface for StackedRNN - Arguments: - inputs: - A tensor of size [batch_size, max_sequence_length, embedding_size]. - sequence_lengths: - The length of each input sequence in the batch. Should be of size [batch_size]. - num_units: - A list specifying the number of units per layer. - dropout: - Dropout applied to the input of each cell. - If list, has to dropout used for each layer. - If number, the same amount of dropout is used everywhere. - Defaults to 0. - is_training: - Flag to specify if the layer is used in training mode or not. - cell_type: - Sepcifies the type of RNN. Can be "LSTM" or "GRU". - is_bidirectional: - Specifies if the stacked RNN layer is bidirectional. - Defaults to False. - Returns - outputs, state. - """ - rnn = StackedRNN(num_units, dropout, is_training, cell_type, is_bidirectional, name) - return rnn(inputs, sequence_lengths) diff --git a/twml/twml/contrib/layers/zscore_normalization.docx b/twml/twml/contrib/layers/zscore_normalization.docx new file mode 100644 index 000000000..3aa7a6837 Binary files /dev/null and b/twml/twml/contrib/layers/zscore_normalization.docx differ diff --git a/twml/twml/contrib/layers/zscore_normalization.py b/twml/twml/contrib/layers/zscore_normalization.py deleted file mode 100644 index 8a1064965..000000000 --- a/twml/twml/contrib/layers/zscore_normalization.py +++ /dev/null @@ -1,247 +0,0 @@ -""" -Contains the twml.layers.ZscoreNormalization layer. -""" -from twml.layers.layer import Layer -import tensorflow.compat.v1 as tf - -from tensorflow.python.training import moving_averages - - -# This is copied from tensorflow.contrib.framework.python.ops.add_model_variable in 1.15 -# Not available in 2.x -# TODO: Figure out if this is really necessary. 
-def _add_model_variable(var): - """Adds a variable to the `GraphKeys.MODEL_VARIABLES` collection. - Args: - var: a variable. - """ - if var not in tf.get_collection(tf.GraphKeys.MODEL_VARIABLES): - tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, var) - - -def update_moving_variable(batch_var, moving_var, decay, zero_debias=True, name=None): - update_op = moving_averages.assign_moving_average( - moving_var, batch_var, decay, zero_debias=zero_debias, name=None) - _add_model_variable(moving_var) - with tf.control_dependencies([update_op]): - return tf.identity(moving_var) - - -class ZscoreNormalization(Layer): - """ - Perform z-score normalization using moving mean and std. - Missing values are not included during mean/std calculation - This layer should only be used right after input layer. - - Args: - decay: - using large decay to include longer moving means. - data_type: - use float64 to prevent overflow during variance calculation. - name: - Layer name - Returns: - A layer representing the output of the ZscoreNormalization transformation. - """ - - def __init__( - self, - decay=0.9999, - data_type=tf.float64, - name=None, - **kwargs): - super(ZscoreNormalization, self).__init__(name=name, **kwargs) - self.epsilon = tf.constant(1., data_type) - self.decay = decay - self.data_type = data_type - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the moving_mean and moving_var tf.Variables of the layer.""" - input_dim = input_shape[1] - self.moving_mean = self.add_variable( - '{}_mean/EMA'.format(self.name), - initializer=tf.constant_initializer(), - shape=[input_dim], - dtype=self.data_type, - trainable=False - ) - self.moving_var = self.add_variable( - '{}_variance/EMA'.format(self.name), - initializer=tf.constant_initializer(), - shape=[input_dim], - dtype=self.data_type, - trainable=False - ) - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - """ - - return input_shape - - def _training_pass(self, input, dense_mask, input_dtype, handle_single, zero_debias): - epsilon = self.epsilon - moving_mean, moving_var = self.moving_mean, self.moving_var - # calculate the number of exisiting value for each feature - tensor_batch_num = tf.reduce_sum(tf.cast(dense_mask, self.data_type), axis=0) - mask_ones = tf.cast(tensor_batch_num, tf.bool) - eps_vector = tf.fill(tf.shape(tensor_batch_num), epsilon) - # the following filled 0 with epision - tensor_batch_num_eps = tf.where(mask_ones, - tensor_batch_num, - eps_vector - ) - tensor_batch_num_eps_broacast = tf.expand_dims(tensor_batch_num_eps, 0) - tensor_batch_divided = input / tensor_batch_num_eps_broacast - tensor_batch_mean = tf.reduce_sum(tensor_batch_divided, axis=0) - - # update moving mean here, and use it to calculate the std. - tensor_moving_mean = update_moving_variable(tensor_batch_mean, moving_mean, self.decay, - zero_debias, name="mean_ema_op") - - tensor_batch_sub_mean = input - tf.expand_dims(tensor_moving_mean, 0) - tensor_batch_sub_mean = tf.where(dense_mask, - tensor_batch_sub_mean, - tf.zeros_like(tensor_batch_sub_mean)) - # divided by sqrt(n) before square, and then do summation for numeric stability. 
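
The masked statistics computed in the training pass amount to a per-feature mean and variance over only the present values; in plain numpy, simplified to use the batch mean instead of the moving mean and without zero-debias or epsilon handling::

    import numpy as np

    x = np.array([[1.0, 0.0],    # 0.0 marks a missing value
                  [3.0, 4.0],
                  [5.0, 6.0]])
    mask = x != 0

    count = mask.sum(axis=0)                              # non-missing values per feature
    mean = (x / np.maximum(count, 1)).sum(axis=0)         # [3.0, 5.0]
    centered = np.where(mask, x - mean, 0.0)
    var = (centered ** 2 / np.maximum(count, 1)).sum(axis=0)
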
- broad_sqrt_num_eps = tf.expand_dims(tf.sqrt(tensor_batch_num_eps), 0) - tensor_batch_sub_mean_div = tensor_batch_sub_mean / broad_sqrt_num_eps - tensor_batch_sub_mean_div_square = tf.square(tensor_batch_sub_mean_div) - tensor_batch_var = tf.reduce_sum(tensor_batch_sub_mean_div_square, axis=0) - - # update moving var here, dont replace 0 with eps before updating. - tensor_moving_var = update_moving_variable(tensor_batch_var, moving_var, self.decay, - zero_debias, name="var_ema_op") - - # if std is 0, replace it with epsilon - tensor_moving_std = tf.sqrt(tensor_moving_var) - tensor_moving_std_eps = tf.where(tf.equal(tensor_moving_std, 0), - eps_vector, - tensor_moving_std) - - missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(tensor_moving_std_eps, 0) - - if handle_single: - # if std==0 and value not missing, reset it to 1. - moving_var_mask_zero = tf.math.equal(tensor_moving_var, 0) - moving_var_mask_zero = tf.expand_dims(moving_var_mask_zero, 0) - missing_input_norm = tf.where( - tf.math.logical_and(dense_mask, moving_var_mask_zero), - tf.ones_like(missing_input_norm), - missing_input_norm - ) - if input_dtype != self.data_type: - missing_input_norm = tf.cast(missing_input_norm, input_dtype) - return missing_input_norm - - def _infer_pass(self, input, dense_mask, input_dtype, handle_single): - epsilon = tf.cast(self.epsilon, input_dtype) - testing_moving_mean = tf.cast(self.moving_mean, input_dtype) - tensor_moving_std = tf.cast(tf.sqrt(self.moving_var), input_dtype) - - broad_mean = tf.expand_dims(testing_moving_mean, 0) - tensor_batch_sub_mean = input - broad_mean - - tensor_batch_sub_mean = tf.where(dense_mask, - tensor_batch_sub_mean, - tf.zeros_like(tensor_batch_sub_mean) - ) - tensor_moving_std_eps = tf.where(tf.equal(tensor_moving_std, 0), - tf.fill(tf.shape(tensor_moving_std), epsilon), - tensor_moving_std) - missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(tensor_moving_std_eps, 0) - if handle_single: - # if std==0 and value not missing, reset it to 1. - moving_var_broad = tf.expand_dims(tensor_moving_std, 0) - moving_var_mask_zero = tf.math.logical_not(tf.cast(moving_var_broad, tf.bool)) - - missing_input_norm = tf.where(tf.math.logical_and(dense_mask, moving_var_mask_zero), - tf.ones_like(missing_input_norm), - missing_input_norm - ) - return missing_input_norm - - def call( - self, - input, - is_training, - dense_mask=None, - zero_debias=True, - handle_single=False): - """ - Args: - ----------- - input: B x D : float32/float64 - missing value must be set to 0. - is_training: bool - training phase or testing phase - dense_mask: B x D : bool - missing value should be marked as 0, non-missing as 1. same shape as input - zero_debias: bool - bias correction of the moving average. (biased towards 0 in the beginning. - see adam paper. https://arxiv.org/abs/1412.6980) - handle_single: bool - if std==0, and feature is not missing value, set the value to 1, instead of 0. - This is super rare if input only consists of continous feature. - But if one-hot feature is included, - they will all have same values 1, in that case, make sure to set handle_single to true. 
- """ - - if dense_mask is None: - dense_mask = tf.math.logical_not(tf.equal(input, 0)) - input_dtype = input.dtype - - if is_training: - if input_dtype != self.data_type: - input = tf.cast(input, self.data_type) - return self._training_pass(input, dense_mask, input_dtype, handle_single, zero_debias) - else: - return self._infer_pass(input, dense_mask, input_dtype, handle_single) - - -def zscore_normalization( - input, - is_training, - decay=0.9999, - data_type=tf.float64, - name=None, - dense_mask=None, - zero_debias=True, - handle_single=False, **kwargs): - """ - Args: - ------------ - input: B x D : float32/float64 - missing value must be set to 0. - is_training: bool - training phase or testing phase - decay: - using large decay to include longer moving means. - data_type: - use float64 to zprevent overflow during variance calculation. - name: - Layer name - dense_mask: B x D : bool - missing value should be marked as 0, non-missing as 1. same shape as input - zero_debias: bool - bias correction of the moving average. (biased towards 0 in the beginning. - see adam paper. https://arxiv.org/abs/1412.6980) - handle_single: bool - if std==0, and feature is not missing value, set the value to 1, instead of 0. - This is super rare if input only consists of continous feature. - But if one-hot feature is included, - they will all have same values 1, in that case, make sure to set handle_single to true. - """ - - norm_layer = ZscoreNormalization(decay=decay, data_type=data_type, name=name, **kwargs) - return norm_layer(input, - is_training, - dense_mask=dense_mask, - zero_debias=zero_debias, - handle_single=handle_single) diff --git a/twml/twml/contrib/metrics/__init__.docx b/twml/twml/contrib/metrics/__init__.docx new file mode 100644 index 000000000..e0ce59de3 Binary files /dev/null and b/twml/twml/contrib/metrics/__init__.docx differ diff --git a/twml/twml/contrib/metrics/__init__.py b/twml/twml/contrib/metrics/__init__.py deleted file mode 100644 index 37e6563c9..000000000 --- a/twml/twml/contrib/metrics/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# pylint: disable=wildcard-import -"""This module contains experimental metric(s) for search and ranking""" - -from .search_metrics import get_search_metric_fn, ndcg # noqa: F401 -from .metrics import * # noqa: F401 diff --git a/twml/twml/contrib/metrics/metrics.docx b/twml/twml/contrib/metrics/metrics.docx new file mode 100644 index 000000000..048aba692 Binary files /dev/null and b/twml/twml/contrib/metrics/metrics.docx differ diff --git a/twml/twml/contrib/metrics/metrics.py b/twml/twml/contrib/metrics/metrics.py deleted file mode 100644 index dea1a5273..000000000 --- a/twml/twml/contrib/metrics/metrics.py +++ /dev/null @@ -1,209 +0,0 @@ -""" -Module containing extra tensorflow metrics used at Twitter. -This module conforms to conventions used by tf.metrics.*. -In particular, each metric constructs two subgraphs: value_op and update_op: - - The value op is used to fetch the current metric value. - - The update_op is used to accumulate into the metric. - -Note: similar to tf.metrics.*, metrics in here do not support multi-label learning. -We will have to write wrapper classes to create one metric per label. - -Note: similar to tf.metrics.*, batches added into a metric via its update_op are cumulative! 
- -""" - -from collections import OrderedDict - -import tensorflow.compat.v1 as tf -from twml.metrics import get_multi_binary_class_metric_fn - - - -# checkstyle: noqa -def get_partial_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1, predcols=None): - - def get_eval_metric_ops(graph_output, labels, weights): - if predcols is None: - preds = graph_output['output'] - else: - if isinstance(predcols, int): - predcol_list=[predcols] - else: - predcol_list=list(predcols) - for col in predcol_list: - assert 0 <= col < graph_output['output'].shape[class_dim], 'Invalid Prediction Column Index !' - preds = tf.gather(graph_output['output'], indices=predcol_list, axis=class_dim) # [batchSz, num_col] - labels = tf.gather(labels, indices=predcol_list, axis=class_dim) # [batchSz, num_col] - - predInfo = {'output': preds} - if 'threshold' in graph_output: - predInfo['threshold'] = graph_output['threshold'] - if 'hard_output' in graph_output: - predInfo['hard_output'] = graph_output['hard_output'] - - metrics_op = get_multi_binary_class_metric_fn(metrics, classes, class_dim) - metrics_op_res = metrics_op(predInfo, labels, weights) - return metrics_op_res - - return get_eval_metric_ops - - - -# Numeric Prediction Performance among TopK Predictions -def mean_numeric_label_topK(labels, predictions, weights, name, topK_id): - top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0) # [topK, 1] - return tf.metrics.mean(values=top_k_labels, name=name) - -def mean_gated_numeric_label_topK(labels, predictions, weights, name, topK_id, bar=2.0): - assert isinstance(bar, int) or isinstance(bar, float), "bar must be int or float" - top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0) # [topK, 1] - gated_top_k_labels = tf.cast(top_k_labels > bar*1.0, tf.int32) - return tf.metrics.mean(values=gated_top_k_labels, name=name) - -SUPPORTED_NUMERIC_METRICS = { - 'mean_numeric_label_topk': mean_numeric_label_topK, - 'mean_gated_numeric_label_topk': mean_gated_numeric_label_topK -} -DEFAULT_NUMERIC_METRICS = ['mean_numeric_label_topk', 'mean_gated_numeric_label_topk'] - - - -def get_metric_topK_fn_helper(targetMetrics, supportedMetrics_op, metrics=None, topK=(5,5,5), predcol=None, labelcol=None): - """ - :param targetMetrics: Target Metric List - :param supportedMetrics_op: Supported Metric Operators Dict - :param metrics: Metric Set to evaluate - :param topK: (topK_min, topK_max, topK_delta) Tuple - :param predcol: Prediction Column Index - :param labelcol: Label Column Index - :return: - """ - # pylint: disable=dict-keys-not-iterating - if targetMetrics is None or supportedMetrics_op is None: - raise ValueError("Invalid Target Metric List/op !") - - targetMetrics = set([m.lower() for m in targetMetrics]) - if metrics is None: - metrics = list(targetMetrics) - else: - metrics = [m.lower() for m in metrics if m.lower() in targetMetrics] - - num_k = int((topK[1]-topK[0])/topK[2]+1) - topK_list = [topK[0]+d*topK[2] for d in range(num_k)] - if 1 not in topK_list: - topK_list = [1] + topK_list - - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. - """ - eval_metric_ops = OrderedDict() - - if predcol is None: - pred = graph_output['output'] - else: - assert 0 <= predcol < graph_output['output'].shape[1], 'Invalid Prediction Column Index !' 
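
What the two top-K helpers above measure, sketched in numpy; the real versions are streaming ``tf.metrics.mean`` ops fed with the gathered labels::

    import numpy as np

    scores = np.array([0.9, 0.1, 0.7, 0.4])   # model scores, used only for ranking
    labels = np.array([3.0, 0.0, 5.0, 1.0])   # numeric labels
    K, bar = 2, 2.0

    top_k_id = np.argsort(-scores)[:K]        # indices of the K highest scores -> [0, 2]
    mean_numeric_label_top_k = labels[top_k_id].mean()               # (3.0 + 5.0) / 2 = 4.0
    mean_gated_numeric_label_top_k = (labels[top_k_id] > bar).mean() # both exceed 2.0 -> 1.0
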
- assert labelcol is not None - pred = tf.reshape(graph_output['output'][:, predcol], shape=[-1, 1]) - labels = tf.reshape(labels[:, labelcol], shape=[-1, 1]) - numOut = graph_output['output'].shape[1] - pred_score = tf.reshape(graph_output['output'][:, numOut-1], shape=[-1, 1]) - - # add metrics to eval_metric_ops dict - for metric_name in metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - if metric_name in supportedMetrics_op: - metric_factory = supportedMetrics_op.get(metric_name) - - if 'topk' not in metric_name: - value_op, update_op = metric_factory( - labels=labels, - predictions=pred, - weights=weights, - name=metric_name) - eval_metric_ops[metric_name] = (value_op, update_op) - else: - for K in topK_list: - K_min = tf.minimum(K, tf.shape(pred_score)[0]) - topK_id = tf.nn.top_k(tf.reshape(pred_score, shape=[-1]), k=K_min)[1] # [topK] - value_op, update_op = metric_factory( - labels=labels, - predictions=pred, - weights=weights, - name=metric_name+'__k_'+str(K), - topK_id=topK_id) - eval_metric_ops[metric_name+'__k_'+str(K)] = (value_op, update_op) - - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops - - - -def get_numeric_metric_fn(metrics=None, topK=(5,5,5), predcol=None, labelcol=None): - if metrics is None: - metrics = list(DEFAULT_NUMERIC_METRICS) - metrics = list(set(metrics)) - - metric_op = get_metric_topK_fn_helper(targetMetrics=list(DEFAULT_NUMERIC_METRICS), - supportedMetrics_op=SUPPORTED_NUMERIC_METRICS, - metrics=metrics, topK=topK, predcol=predcol, labelcol=labelcol) - return metric_op - - - -def get_single_binary_task_metric_fn(metrics, classnames, topK=(5,5,5), use_topK=False): - """ - graph_output['output']: [BatchSz, 1] [pred_Task1] - labels: [BatchSz, 2] [Task1, NumericLabel] - """ - def get_eval_metric_ops(graph_output, labels, weights): - metric_op_base = get_partial_multi_binary_class_metric_fn(metrics, predcols=0, classes=classnames) - classnames_unw = ['unweighted_'+cs for cs in classnames] - metric_op_unw = get_partial_multi_binary_class_metric_fn(metrics, predcols=0, classes=classnames_unw) - - metrics_base_res = metric_op_base(graph_output, labels, weights) - metrics_unw_res = metric_op_unw(graph_output, labels, None) - metrics_base_res.update(metrics_unw_res) - - if use_topK: - metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=topK, predcol=0, labelcol=1) - metrics_numeric_res = metric_op_numeric(graph_output, labels, weights) - metrics_base_res.update(metrics_numeric_res) - return metrics_base_res - - return get_eval_metric_ops - - -def get_dual_binary_tasks_metric_fn(metrics, classnames, topK=(5,5,5), use_topK=False): - """ - graph_output['output']: [BatchSz, 3] [pred_Task1, pred_Task2, Score] - labels: [BatchSz, 3] [Task1, Task2, NumericLabel] - """ - def get_eval_metric_ops(graph_output, labels, weights): - - metric_op_base = get_partial_multi_binary_class_metric_fn(metrics, predcols=[0, 1], classes=classnames) - classnames_unw = ['unweighted_'+cs for cs in classnames] - metric_op_unw = get_partial_multi_binary_class_metric_fn(metrics, predcols=[0, 1], classes=classnames_unw) - - metrics_base_res = metric_op_base(graph_output, labels, weights) - metrics_unw_res = metric_op_unw(graph_output, labels, None) - metrics_base_res.update(metrics_unw_res) - - if use_topK: - metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=topK, predcol=2, labelcol=2) - metrics_numeric_res = metric_op_numeric(graph_output, labels, weights) - 
metrics_base_res.update(metrics_numeric_res) - return metrics_base_res - - return get_eval_metric_ops diff --git a/twml/twml/contrib/metrics/search_metrics.docx b/twml/twml/contrib/metrics/search_metrics.docx new file mode 100644 index 000000000..10e616ead Binary files /dev/null and b/twml/twml/contrib/metrics/search_metrics.docx differ diff --git a/twml/twml/contrib/metrics/search_metrics.py b/twml/twml/contrib/metrics/search_metrics.py deleted file mode 100644 index 7d7a502f1..000000000 --- a/twml/twml/contrib/metrics/search_metrics.py +++ /dev/null @@ -1,292 +0,0 @@ -""" -Module containing extra tensorflow metrics used at Twitter. -This module conforms to conventions used by tf.metrics.*. -In particular, each metric constructs two subgraphs: value_op and update_op: - - The value op is used to fetch the current metric value. - - The update_op is used to accumulate into the metric. - -Note: similar to tf.metrics.*, metrics in here do not support multi-label learning. -We will have to write wrapper classes to create one metric per label. - -Note: similar to tf.metrics.*, batches added into a metric via its update_op are cumulative! - -""" - -from collections import OrderedDict -from functools import partial - -import tensorflow.compat.v1 as tf -from tensorflow.python.eager import context -from tensorflow.python.framework import dtypes, ops -from tensorflow.python.ops import array_ops, state_ops -import twml -from twml.contrib.utils import math_fns - - -def ndcg(labels, predictions, - metrics_collections=None, - updates_collections=None, - name=None, - top_k_int=1): - # pylint: disable=unused-argument - """ - Compute full normalized discounted cumulative gain (ndcg) based on predictions - ndcg = dcg_k/idcg_k, k is a cut off ranking postion - There are a few variants of ndcg - The dcg (discounted cumulative gain) formula used in - twml.contrib.metrics.ndcg is:: - - \\sum_{i=1}^k \frac{2^{relevance\\_score} -1}{\\log_{2}(i + 1)} - - k is the length of items to be ranked in a batch/query - Notice that whether k will be replaced with a fixed value requires discussions - The scores in predictions are transformed to order and relevance scores to calculate ndcg - A relevance score means how relevant a DataRecord is to a particular query - - Arguments: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Returns: - ndcg: A `Tensor` representing the ndcg score. - update_op: A update operation used to accumulate data into this metric. 
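As a cross-check of the DCG definition quoted in the docstring above (gain 2^rel - 1, discount log2(i + 1)), a short NumPy reference sketch follows; it is only an illustration with made-up labels and scores — the production computation lives in twml.contrib.utils.math_fns (cal_ndcg):

    import numpy as np

    def dcg(relevances, k):
        rel = np.asarray(relevances, dtype=float)[:k]
        positions = np.arange(1, rel.size + 1)
        return np.sum((2.0 ** rel - 1.0) / np.log2(positions + 1))

    def ndcg(labels, scores, k):
        order = np.argsort(scores)[::-1]       # rank items by predicted score, descending
        ideal = np.sort(labels)[::-1]          # the best achievable ordering
        idcg = dcg(ideal, k)
        return dcg(np.asarray(labels, dtype=float)[order], k) / idcg if idcg > 0 else 0.0

    # one query with four documents; a higher label means a more relevant document
    print(ndcg(labels=[3, 2, 0, 1], scores=[0.2, 0.9, 0.1, 0.4], k=3))   # ~0.76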
- """ - with tf.variable_scope(name, 'ndcg', (labels, predictions)): - label_scores = tf.to_float(labels, name='label_to_float') - predicted_scores = tf.to_float(predictions, name='predictions_to_float') - - if context.executing_eagerly(): - raise RuntimeError('ndcg is not supported when eager execution ' - 'is enabled.') - - total_ndcg = _metric_variable([], dtypes.float32, name='total_ndcg') - count_query = _metric_variable([], dtypes.float32, name='query_count') - - # actual ndcg cutoff position top_k_int - max_prediction_size = array_ops.size(predicted_scores) - top_k_int = tf.minimum(max_prediction_size, top_k_int) - # the ndcg score of the batch - ndcg = math_fns.cal_ndcg(label_scores, - predicted_scores, top_k_int=top_k_int) - # add ndcg of the current batch to total_ndcg - update_total_op = state_ops.assign_add(total_ndcg, ndcg) - with ops.control_dependencies([ndcg]): - # count_query stores the number of queries - # count_query increases by 1 for each batch/query - update_count_op = state_ops.assign_add(count_query, 1) - - mean_ndcg = math_fns.safe_div(total_ndcg, count_query, 'mean_ndcg') - update_op = math_fns.safe_div(update_total_op, update_count_op, 'update_mean_ndcg_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean_ndcg) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return mean_ndcg, update_op - - -# Copied from metrics_impl.py with minor modifications. -# https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39 -def _metric_variable(shape, dtype, validate_shape=True, name=None): - """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" - - return tf.Variable( - lambda: tf.zeros(shape, dtype), - trainable=False, - collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], - validate_shape=validate_shape, - name=name) - - -# binary metric_name: (metric, requires thresholded output) -SUPPORTED_BINARY_CLASS_METRICS = { - # TWML binary metrics - 'rce': (twml.metrics.rce, False), - 'nrce': (partial(twml.metrics.rce, normalize=True), False), - # CTR measures positive sample ratio. This terminology is inherited from Ads. - 'ctr': (twml.metrics.ctr, False), - # predicted CTR measures predicted positive ratio. - 'predicted_ctr': (twml.metrics.predicted_ctr, False), - # thresholded metrics - 'accuracy': (tf.metrics.accuracy, True), - 'precision': (tf.metrics.precision, True), - 'recall': (tf.metrics.recall, True), - # tensorflow metrics - 'roc_auc': (partial(tf.metrics.auc, curve='ROC'), False), - 'pr_auc': (partial(tf.metrics.auc, curve='PR'), False), -} - -# search metric_name: metric -SUPPORTED_SEARCH_METRICS = { - # TWML search metrics - # ndcg needs the raw prediction scores to sort - 'ndcg': ndcg, -} - - -def get_search_metric_fn(binary_metrics=None, search_metrics=None, - ndcg_top_ks=[1, 3, 5, 10], use_binary_metrics=False): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for ranking. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - The following graph_output keys are recognized: - output: - the raw predictions. Required. 
- threshold: - Only used in SUPPORTED_BINARY_CLASS_METRICS - If the lables are 0s and 1s - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - Only used in SUPPORTED_BINARY_CLASS_METRICS - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Arguments: - only used in pointwise learning-to-rank - - binary_metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - ctr (same as positive sample ratio.) - - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When binary_metrics is None (the default), it defaults to all supported metrics - - search_metrics (list of String): - a list of metrics of interest. E.g. ['ndcg'] - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - ndcg - - NOTE: ndcg works for ranking-relatd problems. - A batch contains all DataRecords that belong to the same query - If pair_in_batch_mode used in scalding -- a batch contains a pair of DataRecords - that belong to the same query and have different labels -- ndcg does not apply in here. - - When search_metrics is None (the default), it defaults to all supported search metrics - currently only 'ndcg' - - ndcg_top_ks (list of integers): - The cut-off ranking postions for a query - When ndcg_top_ks is None or empty (the default), it defaults to [1, 3, 5, 10] - - use_binary_metrics: - False (default) - Only set it to true in pointwise learning-to-rank - """ - # pylint: disable=dict-keys-not-iterating - - if ndcg_top_ks is None or not ndcg_top_ks: - ndcg_top_ks = [1, 3, 5, 10] - - if search_metrics is None: - search_metrics = list(SUPPORTED_SEARCH_METRICS.keys()) - - if binary_metrics is None and use_binary_metrics: - # Added SUPPORTED_BINARY_CLASS_METRICS in twml.metics as well - # they are only used in pointwise learing-to-rank - binary_metrics = list(SUPPORTED_BINARY_CLASS_METRICS.keys()) - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. 
- """ - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - # hard_preds is a tensor - # check hard_preds is None and then check if it is empty - if hard_preds is None or tf.equal(tf.size(hard_preds), 0): - hard_preds = tf.greater_equal(preds, threshold) - - # add search metrics to eval_metric_ops dict - for metric_name in search_metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - if metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - search_metric_factory = SUPPORTED_SEARCH_METRICS.get(metric_name) - if search_metric_factory: - if metric_name == 'ndcg': - for top_k in ndcg_top_ks: - # metric name will show as ndcg_1, ndcg_10, ... - metric_name_ndcg_top_k = metric_name + '_' + str(top_k) - top_k_int = tf.constant(top_k, dtype=tf.int32) - # Note: having weights in ndcg does not make much sense - # Because ndcg already has position weights/discounts - # Thus weights are not applied in ndcg metric - value_op, update_op = search_metric_factory( - labels=labels, - predictions=preds, - name=metric_name_ndcg_top_k, - top_k_int=top_k_int) - eval_metric_ops[metric_name_ndcg_top_k] = (value_op, update_op) - else: - raise ValueError('Cannot find the search metric named ' + metric_name) - - if use_binary_metrics: - # add binary metrics to eval_metric_ops dict - for metric_name in binary_metrics: - - if metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - metric_name = metric_name.lower() # metric name are case insensitive. - binary_metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - if binary_metric_factory: - value_op, update_op = binary_metric_factory( - labels=labels, - predictions=(hard_preds if requires_threshold else preds), - weights=weights, - name=metric_name) - eval_metric_ops[metric_name] = (value_op, update_op) - else: - raise ValueError('Cannot find the binary metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops diff --git a/twml/twml/contrib/optimizers/__init__.docx b/twml/twml/contrib/optimizers/__init__.docx new file mode 100644 index 000000000..4e5632bf8 Binary files /dev/null and b/twml/twml/contrib/optimizers/__init__.docx differ diff --git a/twml/twml/contrib/optimizers/__init__.py b/twml/twml/contrib/optimizers/__init__.py deleted file mode 100644 index 112b2b410..000000000 --- a/twml/twml/contrib/optimizers/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# pylint: disable=wildcard-import -"""This module contains experimental optimizer classes""" -from .deep_gradient_compression_optimizer import DeepGradientCompressionOptimizer # noqa: F401 -from .pruning_optimizer import PruningOptimizer # noqa: F401 diff --git a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.docx b/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.docx new file mode 100644 index 000000000..7f527b525 Binary files /dev/null and b/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.docx differ diff --git a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py b/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py deleted file mode 100644 index 2c71ed13f..000000000 --- a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -A custom optimizer to implement Deep Gradient Compression. 
The general idea of -gradient compression is to compress the gradients exchanged across machines, -in order to reduce the communication overhead of distributing computing efforts. -More details in https://arxiv.org/abs/1712.01887 -""" - -# TODO: Test how much communication overhead this DeepGradientCompressionOptimizer can reduce under -# multi-GPU and distributed setting. - -import tensorflow.compat.v1 as tf - - -def compute_threshold(grad, density): - """ - A utility function to compute the threshold for gradient sparsification, given the gradient - tensor and the density. - Args: - grad(tf.Tensor): - Gradient tensor for some variable. - density(float): - Density degree when sparsifying gradients. - Returns(float): - Threshold for gradient sparsification. - """ - flat_grad = tf.reshape(grad, [-1]) - abs_flat_grad = tf.abs(flat_grad) - size = tf.shape(abs_flat_grad)[0] - k = tf.maximum(tf.constant(1), - tf.cast(tf.scalar_mul(density, tf.cast(size, tf.float32)), tf.int32)) - topk, _ = tf.nn.top_k(abs_flat_grad, k, False) - return topk[-1] - - -def get_top_row_indices(values, density): - """ - A utility function to get indices of most significant rows, given the density degree. - Args: - values(tf.Tensor): - Gradient or locally accumulated gradient for some variable. - density(float): - Density degree when filtering out rows. - Returns(list(int)): - Indices of most significant rows. - """ - abs_values = tf.abs(values) - - try: - row_num = tf.shape(abs_values)[0] - k = tf.maximum(tf.constant(1), - tf.cast(tf.scalar_mul(density, tf.cast(row_num, tf.float32)), tf.int32)) - row_sums = tf.squeeze(tf.reduce_sum(values, axis=1, keepdims=True)) - _, top_row_indices = tf.nn.top_k(row_sums, k=k, sorted=False) - # print "abs_values", abs_values, "row_sums", row_sums - return top_row_indices - # return tf.range(row_num) - - except ValueError: # if the tensor is 0-D or 1-D - return None - - -class DeepGradientCompressionOptimizer(tf.train.GradientDescentOptimizer): - """ - A custom optimizer to implement Deep Gradient Compression (https://arxiv.org/abs/1712.01887). - """ - - def __init__(self, learning_rate, use_locking=False, name="Sparse", - density=1.0, - density_decay=False, - density_decay_steps=10000, - density_decay_rate=0.5, - min_density=0.1, - accumulation=False): - super(DeepGradientCompressionOptimizer, self).__init__(learning_rate, use_locking, name) - self._initial_density_t = tf.convert_to_tensor(density) - self._density_decay = density_decay - dtype = self._initial_density_t.dtype - self._density_decay_steps_t = tf.convert_to_tensor(density_decay_steps, dtype) - self._density_decay_rate_t = tf.convert_to_tensor(density_decay_rate, dtype) - self._min_density_t = tf.convert_to_tensor(min_density, dtype) - self._accumulation = accumulation - - def _prepare(self): - super(DeepGradientCompressionOptimizer, self)._prepare() - if not self._density_decay: - self._density_t = self._initial_density_t - else: - dtype = self._initial_density_t.dtype - global_step = tf.cast(tf.train.get_global_step(), dtype) - p = tf.floor(tf.divide(global_step, self._density_decay_steps_t)) - decayed_density = tf.multiply(self._initial_density_t, - tf.pow(self._density_decay_rate_t, p)) - self._density_t = tf.maximum(self._min_density_t, decayed_density) - - def _create_slots(self, var_list): - """ - Create a slot variable to accumulate gradients locally for each variable in `var_list`. - Args: - var_list(list(tf.Variable)): - List of variables to accumulate gradients locally for. 
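get_top_row_indices above keeps only the most significant rows of a gradient before it is exchanged across workers. A NumPy sketch of the same selection rule, with made-up values (note that, like the TF code, it ranks rows by their signed row sums rather than by magnitude):

    import numpy as np

    def top_row_indices(grad, density):
        # keep max(1, density * num_rows) rows, ranked by row sum
        k = max(1, int(density * grad.shape[0]))
        row_sums = grad.sum(axis=1)             # signed sums, mirroring the TF implementation
        return np.argsort(row_sums)[-k:]

    rng = np.random.default_rng(0)
    grad = rng.standard_normal((8, 4))
    keep = top_row_indices(grad, density=0.25)  # 2 of the 8 rows survive
    print(keep, grad[keep].shape)               # (2, 4)

Only the selected rows (packed into a tf.IndexedSlices) are applied to the variable; with accumulation=True the remaining rows keep accumulating in the g_buffer slot until they become significant enough to be sent.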
- """ - for var in var_list: - self._zeros_slot(var, "g_buffer", self._name) - - def _apply_dense(self, grad, var): - if not self._accumulation: - top_row_indices = get_top_row_indices(grad, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_dense(grad, var) - - sparsified_values = tf.gather(grad, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - else: - g_buffer = self.get_slot(var, "g_buffer") - - g_buffer = tf.assign_add(g_buffer, grad) - - top_row_indices = get_top_row_indices(g_buffer, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_dense(grad, var) - - sparsified_values = tf.gather(g_buffer, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - update_var = super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - update_g_buffer = tf.scatter_update(g_buffer, sparsified_indices, tf.zeros_like( - sparsified_values)) - - return tf.group(*[update_var, update_g_buffer]) - - def _apply_sparse_duplicate_indices(self, grad, var): - if not self._accumulation: - top_row_indices = get_top_row_indices(grad.values, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices(grad, var) # noqa: E501 - - sparsified_values = tf.gather(grad.values, top_row_indices) - sparsified_indices = tf.gather(grad.indices, top_row_indices) - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - else: - g_buffer = self.get_slot(var, "g_buffer") - - g_buffer = tf.scatter_update(g_buffer, grad.indices, grad.values) - - top_row_indices = get_top_row_indices(g_buffer, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, - self)._apply_sparse_duplicate_indices(grad, var) - - sparsified_values = tf.gather(g_buffer, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - update_var = super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - update_g_buffer = tf.scatter_update(g_buffer, sparsified_indices, tf.zeros_like( - sparsified_values)) - - return tf.group(*[update_var, update_g_buffer]) diff --git a/twml/twml/contrib/optimizers/pruning_optimizer.docx b/twml/twml/contrib/optimizers/pruning_optimizer.docx new file mode 100644 index 000000000..fe543ae11 Binary files /dev/null and b/twml/twml/contrib/optimizers/pruning_optimizer.docx differ diff --git a/twml/twml/contrib/optimizers/pruning_optimizer.py b/twml/twml/contrib/optimizers/pruning_optimizer.py deleted file mode 100644 index 2bcd612ed..000000000 --- a/twml/twml/contrib/optimizers/pruning_optimizer.py +++ /dev/null @@ -1,164 +0,0 @@ -""" -Provides a general optimizer for pruning features of a neural network. - -The optimizer estimates the computational cost of features, combines this information with pruning -signals indicating their usefulness, and disables features via binary masks at regular intervals. 
- -To make a layer prunable, use `twml.contrib.pruning.apply_mask`: - - dense1 = tf.layers.dense(inputs=inputs, units=50, activation=tf.nn.relu) - dense1 = apply_mask(dense1) - -To prune the network, apply PruningOptimizer to any cross-entropy loss: - - loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) - - optimizer = PruningOptimizer(learning_rate=0.001, momentum=0.5) - minimize = optimizer.minimize( - loss=loss, - prune_every=10, - burn_in=100, - global_step=tf.train.get_global_step()) -""" - -import tensorflow.compat.v1 as tf - -from twml.contrib.pruning import computational_cost, prune, update_pruning_signals -from twml.contrib.pruning import MASK_COLLECTION - - -class PruningOptimizer(tf.train.MomentumOptimizer): - """ - Updates parameters with SGD and pruning masks using Fisher pruning. - - Arguments: - learning_rate: float - Learning rate of SGD - - momentum: float - Momentum used by SGD - - use_locking: bool - If `True`, use locks for update operations - - name: str - Optional name prefix for the operations created when applying gradients - - use_nesterov: bool - If `True`, use Nesterov momentum - """ - - def __init__( - self, - learning_rate, - momentum=0.9, - use_locking=False, - name="PruningOptimizer", - use_nesterov=False): - super(PruningOptimizer, self).__init__( - learning_rate=learning_rate, - momentum=momentum, - use_locking=use_locking, - name=name, - use_nesterov=use_nesterov) - - def minimize( - self, - loss, - prune_every=100, - burn_in=0, - decay=.96, - flops_weight='AUTO', - flops_target=0, - update_params=None, - method='Fisher', - *args, - **kwargs): - """ - Create operations to minimize loss and to prune features. - - A pruning signal measures the importance of feature maps. This is weighed against the - computational cost of computing a feature map. Features are then iteratively pruned - based on a weighted average of feature importance S and computational cost C (in FLOPs): - - $$S + w * C$$ - - Setting `flops_weight` to 'AUTO' is the most convenient and recommended option, but not - necessarily optimal. 
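A tiny, made-up example of how the trade-off described above plays out: under flops_weight='AUTO', the minimize() body divides each pruning signal by its per-feature FLOP cost, and the feature with the smallest resulting score is pruned first.

    import numpy as np

    signals = np.array([0.08, 0.50, 0.02, 0.30])   # hypothetical importance estimates S
    costs = np.array([1e4, 1e4, 5e4, 2e4])          # hypothetical per-feature FLOPs C

    auto_score = signals / (costs + 1e-6)           # the 'AUTO' trade-off
    print(int(np.argmin(auto_score)))               # -> 2: least useful relative to its cost, pruned first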
- - Arguments: - loss: tf.Tensor - The value to minimize - - prune_every: int - One entry of a mask is set to zero only every few update steps - - burn_in: int - Pruning starts only after this many parameter updates - - decay: float - Controls exponential moving average of pruning signals - - flops_weight: float or str - Controls the targeted trade-off between computational complexity and performance - - flops_target: float - Stop pruning when computational complexity is less or this many floating point ops - - update_params: tf.Operation - Optional training operation used instead of MomentumOptimizer to update parameters - - method: str - Method used to compute pruning signal (currently only supports 'Fisher') - - Returns: - A `tf.Operation` updating parameters and pruning masks - - References: - * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 - """ - - # gradient-based updates of parameters - if update_params is None: - update_params = super(PruningOptimizer, self).minimize(loss, *args, **kwargs) - - masks = tf.get_collection(MASK_COLLECTION) - - with tf.variable_scope('pruning_opt', reuse=True): - # estimate computational cost per data point - batch_size = tf.cast(tf.shape(masks[0].tensor), loss.dtype)[0] - cost = tf.divide(computational_cost(loss), batch_size, name='computational_cost') - - tf.summary.scalar('computational_cost', cost) - - if masks: - signals = update_pruning_signals(loss, masks=masks, decay=decay, method=method) - - # estimate computational cost per feature map - costs = tf.gradients(cost, masks) - - # trade off computational complexity and performance - if flops_weight.upper() == 'AUTO': - signals = [s / (c + 1e-6) for s, c in zip(signals, costs)] - elif not isinstance(flops_weight, float) or flops_weight != 0.: - signals = [s - flops_weight * c for s, c in zip(signals, costs)] - - counter = tf.Variable(0, name='pruning_counter') - counter = tf.assign_add(counter, 1, use_locking=True) - - # only prune every so often after a burn-in phase - pruning_cond = tf.logical_and(counter > burn_in, tf.equal(counter % prune_every, 0)) - - # stop pruning after reaching threshold - if flops_target > 0: - pruning_cond = tf.logical_and(pruning_cond, tf.greater(cost, flops_target)) - - update_masks = tf.cond( - pruning_cond, - lambda: prune(signals, masks=masks), - lambda: tf.group(masks)) - - return tf.group([update_params, update_masks]) - - # no masks found - return update_params diff --git a/twml/twml/contrib/parsers.docx b/twml/twml/contrib/parsers.docx new file mode 100644 index 000000000..3719329fc Binary files /dev/null and b/twml/twml/contrib/parsers.docx differ diff --git a/twml/twml/contrib/parsers.py b/twml/twml/contrib/parsers.py deleted file mode 100644 index a27f2acbd..000000000 --- a/twml/twml/contrib/parsers.py +++ /dev/null @@ -1,21 +0,0 @@ -''' -Contains implementations of functions to parse the contrib.FeatureConfig - -Modelers can use the functions in this module as the the train/eval_parse_fn of -the DataRecordTrainer constructor to customize how to parse their datasets. - -Modelers may also provide custom implementations of train/eval_parse_fn using these as reference. 
-''' - -from twitter.deepbird.io.legacy.contrib.parsers import ( - _convert_to_fixed_length_tensor, # noqa: F401 - _get_input_receiver_fn_feature_dict, # noqa: F401 - _merge_dictionaries, # noqa: F401 - get_features_as_tensor_dict, # noqa: F401 - get_keras_parse_fn, # noqa: F401 - get_serving_input_receiver_fn_feature_dict, # noqa: F401 - get_string_tensor_parse_fn, # noqa: F401 - get_string_tensor_serving_input_receiver_fn, # noqa: F401 - get_supervised_input_receiver_fn_feature_dict, # noqa: F401 - parse_string_tensor, # noqa: F401 -) diff --git a/twml/twml/contrib/pruning.docx b/twml/twml/contrib/pruning.docx new file mode 100644 index 000000000..7800a5611 Binary files /dev/null and b/twml/twml/contrib/pruning.docx differ diff --git a/twml/twml/contrib/pruning.py b/twml/twml/contrib/pruning.py deleted file mode 100644 index b6ddee693..000000000 --- a/twml/twml/contrib/pruning.py +++ /dev/null @@ -1,363 +0,0 @@ -""" -This module implements tools for pruning neural networks. - -In particular, it provides tools for dealing with masks: - - features = apply_mask(features) - -The function `apply_mask` applies a binary mask to the channels of a given tensor. Consider the -following loss: - - logits = tf.matmul(features, weights) - loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) - -Each mask has a corresponding pruning signal. The function `update_pruning_signals` will update and -return these signals: - - signals = update_pruning_signals(loss) - -The pruning operation will zero out the mask entry with the smallest corresponding pruning signal: - - prune(signals) - -The following function allows us to estimate the computational cost of a graph (number of FLOPs): - - cost = computational_cost(loss) - -To compute the cost of each feature per data point, we can do: - - costs = tf.gradients(cost / batch_size, masks) - -The current implementation of `computational_cost` is designed to work with standard feed-forward -and convolutional network architectures only, but may fail with more complicated architectures. -""" - - -import numpy as np -import tensorflow.compat.v1 as tf - -MASK_COLLECTION = 'pruning/masks' -MASK_EXTENDED_COLLECTION = 'pruning/masks_extended' -OP_COLLECTION = 'pruning/ops' - - -def apply_mask(tensor, name='pruning'): - """ - Point-wise multiplies a tensor with a binary mask. - - During training, pruning is simulated by setting entries of the mask to zero. 
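Pulling the pieces of the module docstring above together, a minimal graph-mode sketch of the intended workflow, assuming twml is installed; the shapes, layer sizes, and loss are placeholders chosen purely for illustration:

    import tensorflow.compat.v1 as tf
    from twml.contrib.pruning import (
        apply_mask, computational_cost, prune, update_pruning_signals)

    tf.disable_eager_execution()
    features = tf.placeholder(tf.float32, [None, 100])
    labels = tf.placeholder(tf.int32, [None])

    hidden = tf.layers.dense(features, 50, activation=tf.nn.relu)
    hidden = apply_mask(hidden)             # make this layer's 50 channels prunable

    logits = tf.layers.dense(hidden, 10)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    signals = update_pruning_signals(loss)  # running Fisher estimates, one tensor per mask
    prune_op = prune(signals)               # zeroes the mask entry with the weakest signal
    cost = computational_cost(loss)         # estimated FLOPs of the (pruned) graph

During training, prune_op would be run every few steps, which is exactly what the PruningOptimizer above wires up automatically.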
- - Arguments: - tensor: tf.Tensor - A tensor where the last dimension represents channels which will be masked - - Returns: - `tf.Tensor` with same shape as `tensor` - """ - - tensor_shape = tensor.shape - - with tf.variable_scope(name, reuse=True): - # allocate masks and corresponding pruning signals - mask = tf.Variable(tf.ones(tensor.shape.as_list()[-1]), trainable=False, name='mask') - pruning_signal = tf.Variable(tf.zeros_like(mask), trainable=False, name='signal') - - # extending masks is a trick to get a separate gradient for each data point - mask_extended = extend_mask(mask, tensor) - - # store extended mask, pruning signal, and other vars for easy access later - mask.extended = mask_extended - mask.pruning_signal = pruning_signal - mask.tensor = tensor - - # mask tensor - tensor = tf.multiply(tensor, mask_extended) - tensor.set_shape(tensor_shape) - tensor._mask = mask - - tf.add_to_collection(MASK_COLLECTION, mask) - tf.add_to_collection(MASK_EXTENDED_COLLECTION, mask.extended) - tf.add_to_collection(OP_COLLECTION, tensor.op) - - return tensor - - -def extend_mask(mask, tensor): - """ - Repeats the mask for each data point stored in a tensor. - - If `tensor` is AxBxC dimensional and `mask` is C dimensional, returns an Ax1xC dimensional - tensor with A copies or `mask`. - - Arguments: - mask: tf.Tensor - The mask which will be extended - - tensor: tf.Tensor - The tensor to which the extended mask will be applied - - Returns: - The extended mask - """ - - batch_size = tf.shape(tensor)[:1] - ones = tf.ones([tf.rank(tensor) - 1], dtype=batch_size.dtype) - multiples = tf.concat([batch_size, ones], 0) - mask_shape = tf.concat([ones, [-1]], 0) - return tf.tile(tf.reshape(mask, mask_shape), multiples) - - -def find_input_mask(tensor): - """ - Find ancestral mask affecting the number of pruned channels of a tensor. - - Arguments: - tensor: tf.Tensor - Tensor for which to identify relevant mask - - Returns: - A `tf.Tensor` or `None` - """ - - if hasattr(tensor, '_mask'): - return tensor._mask - if tensor.op.type in ['MatMul', 'Conv1D', 'Conv2D', 'Conv3D', 'Transpose']: - # op produces a new number of channels, preceding mask therefore irrelevant - return None - if not tensor.op.inputs: - return None - for input in tensor.op.inputs: - mask = find_input_mask(input) - if mask is not None: - return mask - - -def find_output_mask(tensor): - """ - Find mask applied to the tensor or one of its descendants if it affects the tensor's pruned shape. - - Arguments: - tensor: tf.Tensor or tf.Variable - Tensor for which to identify relevant mask - - Returns: - A `tf.Tensor` or `None` - """ - - if isinstance(tensor, tf.Variable): - return find_output_mask(tensor.op.outputs[0]) - if hasattr(tensor, '_mask'): - return tensor._mask - for op in tensor.consumers(): - if len(op.outputs) != 1: - continue - if op.type in ['MatMul', 'Conv1D', 'Conv2D', 'Conv3D']: - # masks of descendants are only relevant if tensor is right-multiplied - if tensor == op.inputs[1]: - return find_output_mask(op.outputs[0]) - return None - mask = find_output_mask(op.outputs[0]) - if mask is not None: - return mask - - -def find_mask(tensor): - """ - Returns masks indicating channels of the tensor that are effectively removed from the graph. 
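The shape arithmetic in extend_mask above is easiest to see with concrete numbers. A NumPy equivalent follows; the tiled copy is what later yields a separate mask gradient per data point, which the Fisher pruning signal needs:

    import numpy as np

    def extend_mask_np(mask, tensor):
        # reshape the C-dimensional mask to 1 x ... x C, then tile it across the batch axis
        lead = (1,) * (tensor.ndim - 1)
        reshaped = mask.reshape(lead + (-1,))
        return np.tile(reshaped, (tensor.shape[0],) + lead)

    tensor = np.zeros((5, 7, 3))                 # A=5 examples, B=7, C=3 channels
    mask = np.ones(3)
    print(extend_mask_np(mask, tensor).shape)    # (5, 1, 3), i.e. one mask copy per example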
- - Arguments: - tensor: tf.Tensor - Tensor for which to compute a mask - - Returns: - A `tf.Tensor` with binary entries indicating disabled channels - """ - - input_mask = find_input_mask(tensor) - output_mask = find_output_mask(tensor) - if input_mask is None: - return output_mask - if output_mask is None: - return input_mask - if input_mask is output_mask: - return input_mask - return input_mask * output_mask - - -def pruned_shape(tensor): - """ - Computes the shape of a tensor after taking into account pruning of channels. - - Note that the shape will only differ in the last dimension, even if other dimensions are also - effectively disabled by pruning masks. - - Arguments: - tensor: tf.Tensor - Tensor for which to compute a pruned shape - - Returns: - A `tf.Tensor[tf.float32]` representing the pruned shape - """ - - mask = find_mask(tensor) - - if mask is None: - return tf.cast(tf.shape(tensor), tf.float32) - - return tf.concat([ - tf.cast(tf.shape(tensor)[:-1], mask.dtype), - tf.reduce_sum(mask, keepdims=True)], 0) - - -def computational_cost(op_or_tensor, _observed=None): - """ - Estimates the computational complexity of a pruned graph (number of floating point operations). - - This function currently only supports sequential graphs such as those of MLPs and - simple CNNs with 2D convolutions in NHWC format. - - Note that the computational cost returned by this function is proportional to batch size. - - Arguments: - op_or_tensor: tf.Tensor or tf.Operation - Root node of graph for which to compute computational cost - - Returns: - A `tf.Tensor` representing a number of floating point operations - """ - - cost = tf.constant(0.) - - # exclude cost of computing extended pruning masks - masks_extended = [mask.extended for mask in tf.get_collection(MASK_COLLECTION)] - if op_or_tensor in masks_extended: - return cost - - # convert tensor to op - op = op_or_tensor.op if isinstance(op_or_tensor, (tf.Tensor, tf.Variable)) else op_or_tensor - - # make sure cost of op will not be counted twice - if _observed is None: - _observed = [] - elif op in _observed: - return cost - _observed.append(op) - - # compute cost of computing inputs - for tensor in op.inputs: - cost = cost + computational_cost(tensor, _observed) - - # add cost of operation - if op.op_def is None or op in tf.get_collection(OP_COLLECTION): - # exclude cost of undefined ops and pruning ops - return cost - - elif op.op_def.name == 'MatMul': - shape_a = pruned_shape(op.inputs[0]) - shape_b = pruned_shape(op.inputs[1]) - return cost + shape_a[0] * shape_b[1] * (2. * shape_a[1] - 1.) - - elif op.op_def.name in ['Add', 'Mul', 'BiasAdd']: - return cost + tf.cond( - tf.size(op.inputs[0]) > tf.size(op.inputs[1]), - lambda: tf.reduce_prod(pruned_shape(op.inputs[0])), - lambda: tf.reduce_prod(pruned_shape(op.inputs[1]))) - - elif op.op_def.name in ['Conv2D']: - output_shape = pruned_shape(op.outputs[0]) - input_shape = pruned_shape(op.inputs[0]) - kernel_shape = pruned_shape(op.inputs[1]) - inner_prod_cost = (tf.reduce_prod(kernel_shape[:2]) * input_shape[-1] * 2. - 1.) - return cost + tf.reduce_prod(output_shape) * inner_prod_cost - - return cost - - -def update_pruning_signals(loss, decay=.96, masks=None, method='Fisher'): - """ - For each mask, computes corresponding pruning signals indicating the importance of a feature. 
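The MatMul branch of computational_cost above charges shape_a[0] * shape_b[1] * (2 * shape_a[1] - 1) FLOPs: each output element needs k multiplies and k - 1 adds. A quick sanity check with illustrative sizes:

    def matmul_flops(m, k, n):
        # [m, k] x [k, n]: m * n outputs, each costing k multiplies + (k - 1) adds
        return m * n * (2 * k - 1)

    # a batch of 32 examples through a 100 -> 50 dense layer, ignoring the bias add
    print(matmul_flops(32, 100, 50))   # 318400

Note that this cost is proportional to batch size, which is why PruningOptimizer divides it by the batch size before comparing against flops_target.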
- - Arguments: - loss: tf.Tensor - Any cross-entropy loss - - decay: float - Controls exponential moving average of pruning signals - - method: str - Method used to compute pruning signal (currently only supports 'Fisher') - - Returns: - A `list[tf.Tensor]` of pruning signals corresponding to masks - - References: - * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 - """ - - if masks is None: - masks = tf.get_collection(MASK_COLLECTION) - - if method not in ['Fisher']: - raise ValueError('Pruning method \'{0}\' not supported.'.format(method)) - - if not masks: - return [] - - with tf.variable_scope('pruning_opt', reuse=True): - # compute gradients of extended masks (yields separate gradient for each data point) - grads = tf.gradients(loss, [m.extended for m in masks]) - - # estimate Fisher pruning signals from batch - signals_batch = [tf.squeeze(tf.reduce_mean(tf.square(g), 0)) for g in grads] - - # update pruning signals - signals = [m.pruning_signal for m in masks] - signals = [tf.assign(s, decay * s + (1. - decay) * f, use_locking=True) - for s, f in zip(signals, signals_batch)] - - return signals - - -def prune(signals, masks=None): - """ - Prunes a single feature by zeroing the mask entry with the smallest pruning signal. - - Arguments: - signals: list[tf.Tensor] - A list of pruning signals - - masks: list[tf.Tensor] - A list of corresponding masks, defaults to `tf.get_collection(MASK_COLLECTION)` - - Returns: - A `tf.Operation` which updates masks - """ - - if masks is None: - masks = tf.get_collection(MASK_COLLECTION) - - with tf.variable_scope('pruning_opt', reuse=True): - # make sure we don't select already pruned units - signals = [tf.where(m > .5, s, tf.zeros_like(s) + np.inf) for m, s in zip(masks, signals)] - - # find units with smallest pruning signal in each layer - min_idx = [tf.argmin(s) for s in signals] - min_signals = [s[i] for s, i in zip(signals, min_idx)] - - # find layer with smallest pruning signal - l = tf.argmin(min_signals) - - # construct pruning operations, one for each mask - updates = [] - for k, i in enumerate(min_idx): - # set mask of layer l to 0 where pruning signal is smallest - updates.append( - tf.cond( - tf.equal(l, k), - lambda: tf.scatter_update( - masks[k], tf.Print(i, [i], message="Pruning layer [{0}] at index ".format(k)), 0.), - lambda: masks[k])) - - updates = tf.group(updates, name='prune') - - return updates diff --git a/twml/twml/contrib/readers/__init__.docx b/twml/twml/contrib/readers/__init__.docx new file mode 100644 index 000000000..4518bfc68 Binary files /dev/null and b/twml/twml/contrib/readers/__init__.docx differ diff --git a/twml/twml/contrib/readers/__init__.py b/twml/twml/contrib/readers/__init__.py deleted file mode 100644 index e96cf0449..000000000 --- a/twml/twml/contrib/readers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# pylint: disable=wildcard-import -"""This module contains experimental readers classes""" -from .batch_prediction_request import BatchPredictionRequest # noqa: F401 -from .data_record import DataRecord # noqa: F401 -from .hashed_batch_prediction_request import HashedBatchPredictionRequest # noqa: F401 diff --git a/twml/twml/contrib/readers/batch_prediction_request.docx b/twml/twml/contrib/readers/batch_prediction_request.docx new file mode 100644 index 000000000..4bb4ee853 Binary files /dev/null and b/twml/twml/contrib/readers/batch_prediction_request.docx differ diff --git a/twml/twml/contrib/readers/batch_prediction_request.py 
b/twml/twml/contrib/readers/batch_prediction_request.py deleted file mode 100644 index 4408b33b4..000000000 --- a/twml/twml/contrib/readers/batch_prediction_request.py +++ /dev/null @@ -1,8 +0,0 @@ -# pylint: disable=invalid-name -""" -This module implements the reader for BatchPredictionRequest. -""" - -from twitter.deepbird.io.legacy.contrib.readers.batch_prediction_request import ( - BatchPredictionRequest # noqa: F401 -) diff --git a/twml/twml/contrib/readers/data_record.docx b/twml/twml/contrib/readers/data_record.docx new file mode 100644 index 000000000..f2f6381e6 Binary files /dev/null and b/twml/twml/contrib/readers/data_record.docx differ diff --git a/twml/twml/contrib/readers/data_record.py b/twml/twml/contrib/readers/data_record.py deleted file mode 100644 index ae8cc0b68..000000000 --- a/twml/twml/contrib/readers/data_record.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -This module includes facilities for manipulating data records in DeepBird v2. -This contains a submodule that allows for easy feature access as Tensors. -The result of this subclass methods are dictionaries of Tensors and SparseTensors -""" - -from twitter.deepbird.io.legacy.contrib.readers.data_record import ( - SUPPORTED_DENSE_FEATURE_TYPES, # noqa: F401 - DataRecord, # noqa: F401 -) diff --git a/twml/twml/contrib/readers/hashed_batch_prediction_request.docx b/twml/twml/contrib/readers/hashed_batch_prediction_request.docx new file mode 100644 index 000000000..510eab79e Binary files /dev/null and b/twml/twml/contrib/readers/hashed_batch_prediction_request.docx differ diff --git a/twml/twml/contrib/readers/hashed_batch_prediction_request.py b/twml/twml/contrib/readers/hashed_batch_prediction_request.py deleted file mode 100644 index 3454f8483..000000000 --- a/twml/twml/contrib/readers/hashed_batch_prediction_request.py +++ /dev/null @@ -1,8 +0,0 @@ -# pylint: disable=invalid-name -""" -This module implements the reader for HashedBatchPredictionRequest. 
-""" - -from twitter.deepbird.io.legacy.contrib.readers.hashed_batch_prediction_request import ( - HashedBatchPredictionRequest # noqa: F401 -) diff --git a/twml/twml/contrib/trainers/__init__.docx b/twml/twml/contrib/trainers/__init__.docx new file mode 100644 index 000000000..16246b19b Binary files /dev/null and b/twml/twml/contrib/trainers/__init__.docx differ diff --git a/twml/twml/contrib/trainers/__init__.py b/twml/twml/contrib/trainers/__init__.py deleted file mode 100644 index 3226cd805..000000000 --- a/twml/twml/contrib/trainers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# pylint: disable=wildcard-import -"""This module contains experimental trainer classes""" -from .batch_prediction_request_trainer import BatchPredictionRequestTrainer # noqa: F401 -from .pruning_data_record_trainer import PruningDataRecordTrainer # noqa: F401 -from .trainer_utils import build_keras_trainer # noqa: F401 diff --git a/twml/twml/contrib/trainers/batch_prediction_request_trainer.docx b/twml/twml/contrib/trainers/batch_prediction_request_trainer.docx new file mode 100644 index 000000000..e62b56cdc Binary files /dev/null and b/twml/twml/contrib/trainers/batch_prediction_request_trainer.docx differ diff --git a/twml/twml/contrib/trainers/batch_prediction_request_trainer.py b/twml/twml/contrib/trainers/batch_prediction_request_trainer.py deleted file mode 100644 index 2effa87ed..000000000 --- a/twml/twml/contrib/trainers/batch_prediction_request_trainer.py +++ /dev/null @@ -1,180 +0,0 @@ -# pylint: disable=arguments-differ, invalid-name -""" -This file contains the DataRecordTrainer class. -""" -import warnings - -import twml -from twml.trainers import DataRecordTrainer - - -class BatchPredictionRequestTrainer(DataRecordTrainer): # pylint: disable=abstract-method - """ - The ``BatchPredictionRequestTrainer`` implementation is intended to satisfy use cases - that input is BatchPredictionRequest at Twitter and also where only the build_graph methods - needs to be overridden. For this reason, ``Trainer.[train,eval]_input_fn`` methods - assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. - - For use-cases that differ from this common Twitter use-case, - further Trainer methods can be overridden. - If that still doesn't provide enough flexibility, the user can always - use the tf.estimator.Esimator or tf.session.run directly. - """ - - def __init__( - self, name, params, - build_graph_fn, - feature_config=None, - **kwargs): - """ - The BatchPredictionRequestTrainer constructor builds a - ``tf.estimator.Estimator`` and stores it in self.estimator. - For this reason, BatchPredictionRequestTrainer accepts the same Estimator constructor arguments. - It also accepts additional arguments to facilitate metric evaluation and multi-phase training - (init_from_dir, init_map). - - Args: - parent arguments: - See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation - for a full list of arguments accepted by the parent class. - name, params, build_graph_fn (and other parent class args): - see documentation for twml.Trainer and twml.DataRecordTrainer doc. - feature_config: - An object of type FeatureConfig describing what features to decode. - Defaults to None. But it is needed in the following cases: - - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` - - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. 
- - **kwargs: - further kwargs can be specified and passed to the Estimator constructor. - """ - - # Check and update train_batch_size and eval_batch_size in params before initialization - # to print correct parameter logs and does not stop running - # This overwrites batch_size parameter constrains in twml.trainers.Trainer.check_params - updated_params = self.check_batch_size_params(params) - super(BatchPredictionRequestTrainer, self).__init__( - name=name, params=updated_params, build_graph_fn=build_graph_fn, **kwargs) - - def check_batch_size_params(self, params): - """ Verify that params has the correct key,values """ - # updated_params is an instance of tensorflow.contrib.training.HParams - updated_params = twml.util.convert_to_hparams(params) - param_values = updated_params.values() - - # twml.trainers.Trainer.check_params already checks other constraints, - # such as being an integer - if 'train_batch_size' in param_values: - if not isinstance(updated_params.train_batch_size, int): - raise ValueError("Expecting params.train_batch_size to be an integer.") - if param_values['train_batch_size'] != 1: - # This can be a bit annoying to force users to pass the batch sizes, - # but it is good to let them know what they actually use in the models - # Use warning instead of ValueError in there to continue the run - # and print out that train_batch_size is changed - warnings.warn('You are processing BatchPredictionRequest data, ' - 'train_batch_size is always 1.\n' - 'The number of DataRecords in a batch is determined by the size ' - 'of each BatchPredictionRequest.\n' - 'If you did not pass train.batch_size or eval.batch_size, and ' - 'the default batch_size 32 was in use,\n' - 'please pass --train.batch_size 1 --eval.batch_size 1') - # If the upper error warning, change/pass --train.batch_size 1 - # so that train_batch_size = 1 - updated_params.train_batch_size = 1 - - if 'eval_batch_size' in param_values: - if not isinstance(updated_params.train_batch_size, int): - raise ValueError('Expecting params.eval_batch_size to be an integer.') - if param_values['eval_batch_size'] != 1: - # This can be a bit annoying to force users to pass the batch sizes, - # but it is good to let them know what they actually use in the models - # Use warning instead of ValueError in there to continue the run - # and print out that eval_batch_size is changed - warnings.warn('You are processing BatchPredictionRequest data, ' - 'eval_batch_size is also always 1.\n' - 'The number of DataRecords in a batch is determined by the size ' - 'of each BatchPredictionRequest.\n' - 'If you did not pass train.batch_size or eval.batch_size, and ' - 'the default batch_size 32 was in use,\n' - 'please pass --train.batch_size 1 --eval.batch_size 1') - # If the upper warning raises, change/pass --eval.batch_size 1 - # so that eval_batch_size = 1 - updated_params.eval_batch_size = 1 - - if 'eval_batch_size' not in param_values: - updated_params.eval_batch_size = 1 - - if not updated_params.eval_batch_size: - updated_params.eval_batch_size = 1 - - return updated_params - - @staticmethod - def add_batch_prediction_request_arguments(): - """ - Add commandline args to parse typically for the BatchPredictionRequestTrainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. 
- - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - parser = super(BatchPredictionRequestTrainer, - BatchPredictionRequestTrainer).add_parser_arguments() - - # mlp arguments - parser.add_argument( - '--model.use_existing_discretizer', action='store_true', - dest="model_use_existing_discretizer", - help='Load a pre-trained calibration or train a new one') - parser.add_argument( - '--model.use_binary_values', action='store_true', - dest='model_use_binary_values', - help='Use the use_binary_values optimization') - - # control hom many featues we keep in sparse tensors - # 12 is enough for learning-to-rank for now - parser.add_argument( - '--input_size_bits', type=int, default=12, - help='Number of bits allocated to the input size') - - parser.add_argument( - '--loss_function', type=str, default='ranknet', - dest='loss_function', - help='Options are pairwise: ranknet (default), lambdarank, ' - 'listnet, listmle, attrank, ' - 'pointwise') - - # whether convert sparse tensors to dense tensor - # in order to use dense normalization methods - parser.add_argument( - '--use_dense_tensor', action='store_true', - dest='use_dense_tensor', - default=False, - help='If use_dense_tensor is False, ' - 'sparse tensor and spare normalization are in use. ' - 'If use_dense_tensor is True, ' - 'dense tensor and dense normalization are in use.') - - parser.add_argument( - '--dense_normalization', type=str, default='mean_max_normalizaiton', - dest='dense_normalization', - help='Options are mean_max_normalizaiton (default), standard_normalizaiton') - - parser.add_argument( - '--sparse_normalization', type=str, default='SparseMaxNorm', - dest='sparse_normalization', - help='Options are SparseMaxNorm (default), SparseBatchNorm') - - # so far only used in pairwise learning-to-rank - parser.add_argument( - '--mask', type=str, default='full_mask', - dest='mask', - help='Options are full_mask (default), diag_mask') - - return parser diff --git a/twml/twml/contrib/trainers/pruning_data_record_trainer.docx b/twml/twml/contrib/trainers/pruning_data_record_trainer.docx new file mode 100644 index 000000000..73fe142c8 Binary files /dev/null and b/twml/twml/contrib/trainers/pruning_data_record_trainer.docx differ diff --git a/twml/twml/contrib/trainers/pruning_data_record_trainer.py b/twml/twml/contrib/trainers/pruning_data_record_trainer.py deleted file mode 100644 index 4796e5390..000000000 --- a/twml/twml/contrib/trainers/pruning_data_record_trainer.py +++ /dev/null @@ -1,59 +0,0 @@ -import tensorflow.compat.v1 as tf - -from twml.trainers import DataRecordTrainer -from twml.contrib.optimizers import PruningOptimizer - - -class PruningDataRecordTrainer(DataRecordTrainer): - @staticmethod - def get_train_op(params, loss): - train_op = DataRecordTrainer.get_train_op(params, loss) - - optimizer = PruningOptimizer(learning_rate=params.get('learning_rate')) - - return optimizer.minimize( - loss=loss, - prune_every=params.get('pruning_iter', 5000), - burn_in=params.get('pruning_burn_in', 100000), - decay=params.get('pruning_decay', .9999), - flops_target=params.get('pruning_flops_target', 250000), - update_params=train_op, - global_step=tf.train.get_global_step()) - - def __init__(self, name, params, build_graph_fn, feature_config=None, **kwargs): - kwargs['optimize_loss_fn'] = self.get_train_op - - super(PruningDataRecordTrainer, self).__init__( - name=name, - 
params=params, - build_graph_fn=build_graph_fn, - feature_config=feature_config, - **kwargs) - - def export_model(self, *args, **kwargs): - # TODO: modify graph before exporting to take into account masks - return super(PruningDataRecordTrainer, self).export_model(*args, **kwargs) - - @staticmethod - def add_parser_arguments(): - parser = DataRecordTrainer.add_parser_arguments() - parser.add_argument( - "--pruning.iter", "--pruning_iter", type=int, default=5000, - dest="pruning_iter", - help="A single feature or feature map is pruned every this many iterations") - parser.add_argument( - "--pruning.burn_in", "--pruning_burn_in", type=int, default=100000, - dest="pruning_burn_in", - help="Only start pruning after collecting statistics for this many training steps") - parser.add_argument( - "--pruning.flops_target", "--pruning_flops_target", type=int, default=250000, - dest="pruning_flops_target", - help="Stop pruning when estimated number of floating point operations reached this target. \ - For example, a small feed-forward network might require 250,000 FLOPs to run.") - parser.add_argument( - "--pruning.decay", "--pruning_decay", type=float, default=.9999, - dest="pruning_decay", - help="A float value in [0.0, 1.0) controlling an exponential moving average of pruning \ - signal statistics. A value of 0.9999 can be thought of as averaging statistics over 10,000 \ - steps.") - return parser diff --git a/twml/twml/contrib/trainers/trainer_utils.docx b/twml/twml/contrib/trainers/trainer_utils.docx new file mode 100644 index 000000000..e89ea33c7 Binary files /dev/null and b/twml/twml/contrib/trainers/trainer_utils.docx differ diff --git a/twml/twml/contrib/trainers/trainer_utils.py b/twml/twml/contrib/trainers/trainer_utils.py deleted file mode 100644 index f279571be..000000000 --- a/twml/twml/contrib/trainers/trainer_utils.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -This is a temporary close gap solution that allows TensorFlow users to do exploration and -experimentation using Keras models, and production training using twml Trainer. - -As of now (Q4 2019), Keras model training using `model.fit()` has various issues, making it unfit -for production training: - 1. `model.fit()` is slow in TF 1.14. This will be fixed with future TensorFlow updates. - 2. `model.fit()` crashes during model saving or in eager mode when the input has SparseTensor. - 3. Models saved using TF 2.0 API cannot be served by TensorFlow's Java API. - -Until MLCE team resolves the above issues, MLCE team recommends the following: - - Please feel free to use Keras models for experimentation and exploration. - - Please stick to twml Trainer for production training & exporting, - especially if you want to serve your model using Twitter's prediction servers. - -This module provide tooling for easily training keras models using twml Trainer. - -This module takes a Keras model that performs binary classification, and returns a -`twml.trainers.Trainer` object performing the same task. -The common way to use the returned Trainer object is to call its -`train`, `evaluate`, `learn`, or `train_and_evaluate` method with an input function. -This input function can be created from the tf.data.Dataset you used with your Keras model. - -.. note: this util handles the most common case. If you have cases not satisfied by this util, - consider writing your own build_graph to wrap your keras models. 
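A hedged usage sketch of the build_keras_trainer helper defined below, assuming twml is installed; the toy model factory and save_dir are made up, the model follows the documented contract (a dict of tensors in, a probability out), and depending on your setup additional Trainer options may need to be forwarded as keyword arguments:

    import tensorflow.compat.v2 as tf
    from twml.contrib.trainers import build_keras_trainer

    def model_factory():
        # hypothetical binary classifier over a single dense feature tensor
        dense = tf.keras.Input(shape=(10,), name="continuous")
        prob = tf.keras.layers.Dense(1, activation="sigmoid")(dense)
        return tf.keras.Model(inputs={"continuous": dense}, outputs=prob)

    trainer = build_keras_trainer(
        name="keras_binary_example",
        model_factory=model_factory,
        save_dir="/tmp/keras_binary_example")   # can also be an HDFS path

    # trainer.train(input_fn=...), trainer.evaluate(input_fn=...), etc., as with any twml Trainer.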
-""" -from twitter.deepbird.hparam import HParams - -import tensorflow # noqa: F401 -import tensorflow.compat.v2 as tf - -import twml - - -def build_keras_trainer( - name, - model_factory, - save_dir, - loss_fn=None, - metrics_fn=None, - **kwargs): - """ - Compile the given model_factory into a twml Trainer. - - Args: - name: a string name for the returned twml Trainer. - - model_factory: a callable that returns a keras model when called. - This keras model is expected to solve a binary classification problem. - This keras model takes a dict of tensors as input, and outputs a logit or probability. - - save_dir: a directory where the trainer saves data. Can be an HDFS path. - - loss_fn: the loss function to use. Defaults to tf.keras.losses.BinaryCrossentropy. - - metrics_fn: metrics function used by TensorFlow estimators. - Defaults to twml.metrics.get_binary_class_metric_fn(). - - **kwargs: for people familiar with twml Trainer's options, they can be passed in here - as kwargs, and they will be forwarded to Trainer as opts. - See https://cgit.twitter.biz/source/tree/twml/twml/argument_parser.py#n43 for available args. - - Returns: - a twml.trainers.Trainer object which can be used for training and exporting models. - """ - build_graph = create_build_graph_fn(model_factory, loss_fn) - - if metrics_fn is None: - metrics_fn = twml.metrics.get_binary_class_metric_fn() - - opts = HParams(**kwargs) - opts.add_hparam('save_dir', save_dir) - - return twml.trainers.Trainer( - name, - opts, - build_graph_fn=build_graph, - save_dir=save_dir, - metric_fn=metrics_fn) - - -def create_build_graph_fn(model_factory, loss_fn=None): - """Create a build graph function from the given keras model.""" - - def build_graph(features, label, mode, params, config=None): - # create model from model factory. - model = model_factory() - - # create loss function if the user didn't specify one. 
- if loss_fn is None: - build_graph_loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False) - else: - build_graph_loss_fn = loss_fn - - output = model(features) - if mode == 'infer': - loss = None - else: - weights = features.get('weights', None) - loss = build_graph_loss_fn(y_true=label, y_pred=output, sample_weight=weights) - - if isinstance(output, dict): - if loss is None: - return output - else: - output['loss'] = loss - return output - else: - return {'output': output, 'loss': loss} - - return build_graph diff --git a/twml/twml/contrib/utils/__init__.docx b/twml/twml/contrib/utils/__init__.docx new file mode 100644 index 000000000..7f0fb4e6d Binary files /dev/null and b/twml/twml/contrib/utils/__init__.docx differ diff --git a/twml/twml/contrib/utils/__init__.py b/twml/twml/contrib/utils/__init__.py deleted file mode 100644 index 56a083048..000000000 --- a/twml/twml/contrib/utils/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# pylint: disable=wildcard-import -"""This module contains experimental util functions for contrib.""" - -from .math_fns import safe_div, safe_log, cal_ndcg, cal_swapped_ndcg # noqa: F401 -from .masks import diag_mask, full_mask # noqa: F401 -from .normalizer import mean_max_normalizaiton, standard_normalizaiton # noqa: F401 -from .scores import get_pairwise_scores, get_pairwise_label_scores # noqa: F401 -# pointwise functions -from .loss_fns import get_pointwise_loss # noqa: F401 -# ranknet functions -from .loss_fns import get_pair_loss # noqa: F401 -# listwise functions -from .loss_fns import get_attrank_loss, get_listnet_loss, get_listmle_loss # noqa: F401 -# lambdarank functions -from .loss_fns import get_lambda_pair_loss # noqa: F401 -from .device import get_device_map, get_gpu_list, get_gpu_count, is_gpu_available # noqa: F401 -from .similarities import cosine_similarity # noqa: F401 -from . import interp # noqa: F401 diff --git a/twml/twml/contrib/utils/datasets.docx b/twml/twml/contrib/utils/datasets.docx new file mode 100644 index 000000000..3970f4ff7 Binary files /dev/null and b/twml/twml/contrib/utils/datasets.docx differ diff --git a/twml/twml/contrib/utils/datasets.py b/twml/twml/contrib/utils/datasets.py deleted file mode 100644 index d31ea3ae4..000000000 --- a/twml/twml/contrib/utils/datasets.py +++ /dev/null @@ -1,93 +0,0 @@ -import random - -import twml - -get_time_based_dataset_files = twml.util.list_files_by_datetime - - -def resolve_train_and_eval_files_overlap( - train_files, eval_files, fraction_kept_for_eval, seed=None -): - """Resolve any overlap between train and eval files. - - Specifically, if there's an overlap between `train_files` and `eval_files`, then a fraction of - the overlap (i.e. `fraction_kept_for_eval`) will be randomly assigned (exclusively) to the - `eval_files`. - - The following example demonstrates its usage: - - >>> orig_train_files = ['f1', 'f2', 'f3', 'f4'] - >>> orig_eval_files = ['f1', 'f2', 'f3'] - >>> resolved_train_files, resolved_eval_files = resolve_train_and_eval_files_overlap( - ... orig_train_files, orig_eval_files, 0.5 - ... ) - >>> set(resolved_train_files) & set(resolved_eval_files) == set() - True - >>> len(resolved_train_files) == 3 - True - >>> len(resolved_eval_files) == 2 - True - - Args: - train_files: A list of the files used for training. - eval_files: A list of the files used for validation. - fraction_kept_for_eval: A fraction of files in the intersection between `train_files` and - `eval_files` exclusively kept for evaluation. - seed: A seed for generating random numbers. 
- - Returns: - A tuple `(new_train_files, new_eval_files)` with the overlapping resolved. - """ - - rng = random.Random(seed) - - train_files = set(train_files) - eval_files = set(eval_files) - overlapping_files = train_files & eval_files - train_files_selected_for_eval = set(rng.sample( - overlapping_files, - int(len(overlapping_files) * fraction_kept_for_eval) - )) - train_files = train_files - train_files_selected_for_eval - eval_files = (eval_files - overlapping_files) | train_files_selected_for_eval - return list(train_files), list(eval_files) - - -def get_time_based_dataset_files_for_train_and_eval( - base_path, - train_start_datetime, - train_end_datetime, - eval_start_datetime, - eval_end_datetime, - fraction_kept_for_eval, - datetime_prefix_format='%Y/%m/%d/%H', - extension='lzo', - parallelism=1 -): - """Get train/eval dataset files organized with a time-based prefix. - - This is just a convenience built around `get_dataset_files_prefixed_by_time` and - `resolve_train_and_eval_files_overlap`. Please refer to these functions for documentation. - """ - - train_files = get_time_based_dataset_files( - base_path=base_path, - start_datetime=train_start_datetime, - end_datetime=train_end_datetime, - datetime_prefix_format=datetime_prefix_format, - extension=extension, - parallelism=parallelism - ) - eval_files = get_time_based_dataset_files( - base_path=base_path, - start_datetime=eval_start_datetime, - end_datetime=eval_end_datetime, - datetime_prefix_format=datetime_prefix_format, - extension=extension, - parallelism=parallelism - ) - return resolve_train_and_eval_files_overlap( - train_files=train_files, - eval_files=eval_files, - fraction_kept_for_eval=fraction_kept_for_eval - ) diff --git a/twml/twml/contrib/utils/device.docx b/twml/twml/contrib/utils/device.docx new file mode 100644 index 000000000..7bf7611a9 Binary files /dev/null and b/twml/twml/contrib/utils/device.docx differ diff --git a/twml/twml/contrib/utils/device.py b/twml/twml/contrib/utils/device.py deleted file mode 100644 index ab189c98a..000000000 --- a/twml/twml/contrib/utils/device.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Functions to query devices being used by tensorflow -""" - -from tensorflow.python.client import device_lib - - -def get_device_map(): - """Returns the map of device name to device type""" - local_device_protos = device_lib.list_local_devices() - return {x.name: x.device_type for x in local_device_protos} - - -def get_gpu_list(): - """Returns the list of GPUs available""" - device_map = get_device_map() - return [name for name in device_map if device_map[name] == 'GPU'] - - -def get_gpu_count(): - """Returns the count of GPUs available""" - return len(get_gpu_list()) - - -def is_gpu_available(): - """Returns if GPUs are available""" - return get_gpu_count() > 0 diff --git a/twml/twml/contrib/utils/interp.docx b/twml/twml/contrib/utils/interp.docx new file mode 100644 index 000000000..2f33c21c5 Binary files /dev/null and b/twml/twml/contrib/utils/interp.docx differ diff --git a/twml/twml/contrib/utils/interp.py b/twml/twml/contrib/utils/interp.py deleted file mode 100644 index 419d89030..000000000 --- a/twml/twml/contrib/utils/interp.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Interpolation functions -""" - -import libtwml -import tensorflow.compat.v1 as tf -import twml - - -def linear_interp1(inputs, ref_inputs, ref_outputs): - """ - Perform 1D linear interpolation. - Arguments: - inputs: - The query input values. - ref_inputs: - Reference grid points used for interpolation. 
- ref_outputs: - Reference output values used for interpolation. - - Returns: - The interpolated outputs for the requested input values. - """ - - inputs = tf.convert_to_tensor(inputs) - ref_inputs = tf.convert_to_tensor(ref_inputs) - ref_outputs = tf.convert_to_tensor(ref_outputs) - - ndims = inputs.shape.ndims - ref_inputs_ndims = ref_inputs.shape.ndims - ref_outputs_ndims = ref_inputs.shape.ndims - - if (ref_inputs_ndims != ndims): - raise ValueError("Dimension mismatch. inputs: %d, ref_inputs: %d" % (ndims, ref_inputs_ndims)) - - if (ref_outputs_ndims != ndims): - raise ValueError("Dimension mismatch. inputs: %d, ref_outputs: %d" % (ndims, ref_outputs_ndims)) - - if ndims > 2: - raise ValueError("Input dimensions should be < 2D. But got %d." % ndims) - - original_input_shape = tf.shape(inputs) - # This is needed because isotonic_calibration expects: - # - inputs of size [num_samples, num_classes] - # - ref_inputs, ref_outputs of size [num_classes, num_bins] - inputs = tf.reshape(inputs, [-1, 1]) - ref_inputs = tf.reshape(ref_inputs, [1, -1]) - ref_outputs = tf.reshape(ref_outputs, [1, -1]) - - # isotonic_calibration is simply doing linear interpolation. - # This needs to be renamed in the future to make it consistent. - outputs = libtwml.ops.isotonic_calibration(inputs, ref_inputs, ref_outputs) - return tf.reshape(outputs, original_input_shape) - - -def linear_interp1_by_class(inputs, input_classes, ref_inputs, ref_outputs): - """ - Perform 1D linear interpolation. - Arguments: - inputs: - The query input values. - input_classes: - The class index to use from the reference grid. - ref_inputs: - Reference 2D grid points used for interpolation. - Each row denotes the grid from a different class. - ref_outputs: - Reference 2D output values used for interpolation. - Each row denotes the grid from a different class. - - Returns: - The interpolated outputs for the requested input values. - """ - - inputs = tf.convert_to_tensor(inputs) - input_classes = tf.convert_to_tensor(input_classes) - ref_inputs = tf.convert_to_tensor(ref_inputs) - ref_outputs = tf.convert_to_tensor(ref_outputs) - - original_input_shape = tf.shape(inputs) - - # pass through - def in_func(x): - return x - - # indexed function - def cond_func(i, fn): - idx = input_classes[i] - x = tf.expand_dims(fn(), axis=0) - return linear_interp1(x, ref_inputs[idx], ref_outputs[idx]) - - # Use while loop for now, needs to be replace by a custom C++ op later. 
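To make the interpolation helper concrete, a hedged sketch of calling linear_interp1 from the removed twml.contrib.utils.interp module; the grid values are made up, and the call requires the libtwml custom op (isotonic_calibration) to be available.

import tensorflow.compat.v1 as tf
from twml.contrib.utils import interp

ref_inputs = tf.constant([0.0, 1.0, 2.0, 3.0])      # reference grid points
ref_outputs = tf.constant([0.0, 10.0, 20.0, 30.0])  # reference values at those points
queries = tf.constant([0.5, 2.25])                   # points to interpolate at

interpolated = interp.linear_interp1(queries, ref_inputs, ref_outputs)

with tf.Session() as sess:
    # Under a piecewise-linear reading of the op, this is roughly [5.0, 22.5].
    print(sess.run(interpolated))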
- outputs = twml.util.batch_apply(in_func, inputs, cond_func=cond_func) - return tf.reshape(outputs, original_input_shape) diff --git a/twml/twml/contrib/utils/loss_fns.docx b/twml/twml/contrib/utils/loss_fns.docx new file mode 100644 index 000000000..4fa7b2159 Binary files /dev/null and b/twml/twml/contrib/utils/loss_fns.docx differ diff --git a/twml/twml/contrib/utils/loss_fns.py b/twml/twml/contrib/utils/loss_fns.py deleted file mode 100644 index eb25b430a..000000000 --- a/twml/twml/contrib/utils/loss_fns.py +++ /dev/null @@ -1,302 +0,0 @@ -import tensorflow.compat.v1 as tf -from twml.contrib.utils import masks, math_fns - - -def get_pair_loss(pairwise_label_scores, pairwise_predicted_scores, - params): - """ - Paiwise learning-to-rank ranknet loss - Check paper https://www.microsoft.com/en-us/research/publication/ - learning-to-rank-using-gradient-descent/ - for more information - Args: - pairwise_label_scores: a dense tensor of shape [n_data, n_data] - pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] - n_data is the number of tweet candidates in a BatchPredictionRequest - params: network parameters - mask options: full_mask and diag_mask - Returns: - average loss over pairs defined by the masks - """ - n_data = tf.shape(pairwise_label_scores)[0] - if params.mask == "full_mask": - # full_mask that only covers pairs that have different labels - # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) - mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) - else: - # diag_mask that covers all pairs - # (only selfs/diags are 0s) - mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) - - # pairwise sigmoid_cross_entropy_with_logits loss - loss = tf.cond(tf.equal(pair_count, 0), lambda: 0., - lambda: _get_average_cross_entropy_loss(pairwise_label_scores, - pairwise_predicted_scores, mask, pair_count)) - return loss - - -def get_lambda_pair_loss(pairwise_label_scores, pairwise_predicted_scores, - params, swapped_ndcg): - """ - Paiwise learning-to-rank lambdarank loss - faster than the previous gradient method - Note: this loss depends on ranknet cross-entropy - delta NDCG is applied to ranknet cross-entropy - Hence, it is still a gradient descent method - Check paper http://citeseerx.ist.psu.edu/viewdoc/ - download?doi=10.1.1.180.634&rep=rep1&type=pdf for more information - for more information - Args: - pairwise_label_scores: a dense tensor of shape [n_data, n_data] - pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] - n_data is the number of tweet candidates in a BatchPredictionRequest - params: network parameters - swapped_ndcg: swapped ndcg of shape [n_data, n_data] - ndcg values when swapping each pair in the prediction ranking order - mask options: full_mask and diag_mask - Returns: - average loss over pairs defined by the masks - """ - n_data = tf.shape(pairwise_label_scores)[0] - if params.mask == "full_mask": - # full_mask that only covers pairs that have different labels - # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) - mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) - else: - # diag_mask that covers all pairs - # (only selfs/diags are 0s) - mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) - - # pairwise sigmoid_cross_entropy_with_logits loss - loss = tf.cond(tf.equal(pair_count, 0), lambda: 0., - lambda: _get_average_cross_entropy_loss(pairwise_label_scores, - pairwise_predicted_scores, mask, pair_count, swapped_ndcg)) - return loss - - -def 
_get_average_cross_entropy_loss(pairwise_label_scores, pairwise_predicted_scores, - mask, pair_count, swapped_ndcg=None): - """ - Average the loss for a batchPredictionRequest based on a desired number of pairs - """ - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=pairwise_label_scores, - logits=pairwise_predicted_scores) - loss = mask * loss - if swapped_ndcg is not None: - loss = loss * swapped_ndcg - loss = tf.reduce_sum(loss) / pair_count - return loss - - -def get_listmle_loss(labels, predicted_scores): - r""" - listwise learning-to-rank listMLE loss - Note: Simplified MLE formula is used in here (omit the proof in here) - \sum_{s=1}^{n-1} (-predicted_scores + ln(\sum_{i=s}^n exp(predicted_scores))) - n is tf.shape(predicted_scores)[0] - Check paper http://icml2008.cs.helsinki.fi/papers/167.pdf for more information - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - Returns: - average loss - """ - labels = tf.reshape(labels, [-1, 1]) - n_data = tf.shape(labels)[0] - predicted_scores = tf.reshape(predicted_scores, [-1, 1]) - - predicted_scores_ordered_by_labels = _get_ordered_predicted_scores(labels, - predicted_scores, n_data) - - loss = (-1) * tf.reduce_sum(predicted_scores) - # sum over 1 to n_data - 1 - temp = tf.gather(predicted_scores_ordered_by_labels, [n_data - 1]) - temp = tf.reshape(temp, []) - loss = tf.add(loss, temp) - - exps = tf.exp(predicted_scores_ordered_by_labels) - exp_sum = tf.reduce_sum(exps) - # clip exp_sum for safer log - loss = tf.add(loss, math_fns.safe_log(exp_sum)) - - iteration = tf.constant(0) - - def _cond(iteration, loss, exp_sum, exp): - return tf.less(iteration, n_data - 2) - - def _gen_loop_body(): - def loop_body(iteration, loss, exp_sum, exps): - temp = tf.gather(exps, [iteration]) - temp = tf.reshape(temp, []) - exp_sum = tf.subtract(exp_sum, temp) - # clip exp_sum for safer log - loss = tf.add(loss, math_fns.safe_log(exp_sum)) - return tf.add(iteration, 1), loss, exp_sum, exps - return loop_body - - iteration, loss, exp_sum, exps = tf.while_loop(_cond, _gen_loop_body(), - (iteration, loss, exp_sum, exps)) - loss = loss / tf.cast(n_data, dtype=tf.float32) - return loss - - -def _get_ordered_predicted_scores(labels, predicted_scores, n_data): - """ - Order predicted_scores based on sorted labels - """ - sorted_labels, ordered_labels_indices = tf.nn.top_k( - tf.transpose(labels), k=n_data) - ordered_labels_indices = tf.transpose(ordered_labels_indices) - predicted_scores_ordered_by_labels = tf.gather_nd(predicted_scores, - ordered_labels_indices) - return predicted_scores_ordered_by_labels - - -def get_attrank_loss(labels, predicted_scores, weights=None): - """ - Modified listwise learning-to-rank AttRank loss - Check paper https://arxiv.org/abs/1804.05936 for more information - Note: there is an inconsistency between the paper statement and - their public code - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - # The authors immeplemented the following, which is basically listnet - # attention_labels = _get_attentions(labels) - # attention_labels = tf.reshape(attention_labels, [1, -1]) - # predicted_scores = tf.reshape(predicted_scores, [1, -1]) - # loss = 
tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=attention_labels, - # logits=predicted_scores)) - - # The paper proposed the following - # attention_labels = _get_attentions(labels) - # # However the following line is wrong based on their statement - # # as _get_attentions can give 0 results when input < 0 - # # and the result cannot be used in _get_attrank_cross_entropy - # # log(a_i^S) - # # attention_predicted_scores = _get_attentions(predicted_scores) - # loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) - # # the range of attention_predicted_scores is [0, 1) - # # this gives sigmoid [0.5, 0.732) - # # hence, it is not good to use in sigmoid_cross_entropy_with_logits either - - # Implemented the following instead - # _get_attentions is applied to labels - # softmax is applied to predicted_scores - reshaped_labels = tf.reshape(labels, [1, -1]) - attention_labels = _get_attentions(reshaped_labels) - reshaped_predicted_scores = tf.reshape(predicted_scores, [1, -1]) - attention_predicted_scores = tf.nn.softmax(reshaped_predicted_scores) - loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) - return loss - - -def _get_attentions(raw_scores): - """ - Used in attention weights in AttRank loss - for a query/batch/batchPreidictionRequest - (a rectified softmax function) - """ - not_consider = tf.less_equal(raw_scores, 0) - mask = tf.ones(tf.shape(raw_scores)) - tf.cast(not_consider, dtype=tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - expon_labels = mask * tf.exp(raw_scores) - - expon_label_sum = tf.reduce_sum(expon_labels) - # expon_label_sum is safe as a denominator - attentions = math_fns.safe_div(expon_labels, expon_label_sum) - return attentions - - -def _get_attrank_cross_entropy(labels, logits): - # logits is not safe based on their satement - # do not use this function directly elsewhere - results = labels * math_fns.safe_log(logits) + (1 - labels) * math_fns.safe_log(1 - logits) - results = (-1) * results - results = tf.reduce_mean(results) - return results - - -def get_listnet_loss(labels, predicted_scores, weights=None): - """ - Listwise learning-to-rank listet loss - Check paper https://www.microsoft.com/en-us/research/ - wp-content/uploads/2016/02/tr-2007-40.pdf - for more information - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - # top one probability is the same as softmax - labels_top_one_probs = _get_top_one_probs(labels) - predicted_scores_top_one_probs = _get_top_one_probs(predicted_scores) - - if weights is None: - loss = tf.reduce_mean( - _get_listnet_cross_entropy(labels=labels_top_one_probs, - logits=predicted_scores_top_one_probs)) - return loss - - loss = tf.reduce_mean( - _get_listnet_cross_entropy(labels=labels_top_one_probs, - logits=predicted_scores_top_one_probs) * weights) / tf.reduce_mean(weights) - return loss - - -def _get_top_one_probs(labels): - """ - Used in listnet top-one probabilities - for a query/batch/batchPreidictionRequest - (essentially a softmax function) - """ - expon_labels = tf.exp(labels) - expon_label_sum = tf.reduce_sum(expon_labels) - # expon_label_sum is safe as a denominator - attentions = expon_labels / expon_label_sum - return attentions - - -def _get_listnet_cross_entropy(labels, logits): - """ - Used in listnet - cross entropy 
on top-one probabilities - between ideal/label top-one probabilities - and predicted/logits top-one probabilities - for a query/batch/batchPreidictionRequest - """ - # it is safe to use log on logits - # that come from _get_top_one_probs - # do not use this function directly elsewhere - results = (-1) * labels * math_fns.safe_log(logits) - return results - - -def get_pointwise_loss(labels, predicted_scores, weights=None): - """ - Pointwise learning-to-rank pointwise loss - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - if weights is None: - loss = tf.reduce_mean( - tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, - logits=predicted_scores)) - return loss - loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, - logits=predicted_scores) * weights) / tf.reduce_mean(weights) - return loss diff --git a/twml/twml/contrib/utils/masks.docx b/twml/twml/contrib/utils/masks.docx new file mode 100644 index 000000000..5751a9acd Binary files /dev/null and b/twml/twml/contrib/utils/masks.docx differ diff --git a/twml/twml/contrib/utils/masks.py b/twml/twml/contrib/utils/masks.py deleted file mode 100644 index f3143dc52..000000000 --- a/twml/twml/contrib/utils/masks.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow.compat.v1 as tf - - -def diag_mask(n_data, pairwise_label_scores): - """ - This is so far only used in pariwise learning-to-rank - Args: - n_data: a int `Tensor`. - pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. - Returns: - values in pairwise_label_scores except the diagonal - each cell contains a paiwise score difference - only selfs/diags are 0s - """ - mask = tf.ones([n_data, n_data]) - tf.diag(tf.ones([n_data])) - mask = tf.cast(mask, dtype=tf.float32) - pair_count = tf.to_float(n_data) * (tf.to_float(n_data) - 1) - pair_count = tf.cast(pair_count, dtype=tf.float32) - return mask, pair_count - - -def full_mask(n_data, pairwise_label_scores): - """ - This is so far only used in pariwise learning-to-rank - Args: - n_data: a int `Tensor`. - pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. 
- Returns: - values in pairwise_label_scores except pairs that have the same labels - each cell contains a paiwise score difference - all pairwise_label_scores = 0.5: selfs and same labels are 0s - """ - not_consider = tf.equal(pairwise_label_scores, 0.5) - mask = tf.ones([n_data, n_data]) - tf.cast(not_consider, dtype=tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - pair_count = tf.reduce_sum(mask) - pair_count = tf.cast(pair_count, dtype=tf.float32) - return mask, pair_count diff --git a/twml/twml/contrib/utils/math_fns.docx b/twml/twml/contrib/utils/math_fns.docx new file mode 100644 index 000000000..d2dd4dbc2 Binary files /dev/null and b/twml/twml/contrib/utils/math_fns.docx differ diff --git a/twml/twml/contrib/utils/math_fns.py b/twml/twml/contrib/utils/math_fns.py deleted file mode 100644 index 2d9e72282..000000000 --- a/twml/twml/contrib/utils/math_fns.py +++ /dev/null @@ -1,171 +0,0 @@ -import tensorflow.compat.v1 as tf -from tensorflow.python.ops import array_ops, math_ops - - -# Copied from metrics_impl.py -# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/metrics_impl.py#L216 -def safe_div(numerator, denominator, name=None): - """ - Example usage: calculating NDCG = DCG / IDCG to handle cases when - IDCG = 0 returns 0 instead of Infinity - Do not use this dividing funciton unless it makes sense to your problem - Divides two tensors element-wise, returns 0 if the denominator is <= 0. - Args: - numerator: a real `Tensor`. - denominator: a real `Tensor`, with dtype matching `numerator`. - name: Name for the returned op. - Returns: - 0 if `denominator` <= 0, else `numerator` / `denominator` - """ - t = math_ops.truediv(numerator, denominator) - zero = array_ops.zeros_like(t, dtype=denominator.dtype) - condition = math_ops.greater(denominator, zero) - zero = math_ops.cast(zero, t.dtype) - return array_ops.where(condition, t, zero, name=name) - - -def cal_ndcg(label_scores, predicted_scores, top_k_int=1): - """ - Calculate NDCG score for top_k_int ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: An int or an int `Tensor`. - Returns: - a `Tensor` that holds DCG / IDCG. - """ - sorted_labels, predicted_order = _get_ranking_orders( - label_scores, predicted_scores, top_k_int=top_k_int) - - predicted_relevance = _get_relevance_scores(predicted_order) - sorted_relevance = _get_relevance_scores(sorted_labels) - - cg_discount = _get_cg_discount(top_k_int) - - dcg = _dcg_idcg(predicted_relevance, cg_discount) - idcg = _dcg_idcg(sorted_relevance, cg_discount) - # the ndcg score of the batch - # idcg is 0 if label_scores are all 0 - ndcg = safe_div(dcg, idcg, 'one_ndcg') - return ndcg - - -def cal_swapped_ndcg(label_scores, predicted_scores, top_k_int): - """ - Calculate swapped NDCG score in Lambda Rank for full/top k ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: An int or an int `Tensor`. - Returns: - a `Tensor` that holds swapped NDCG by . 
- """ - sorted_labels, predicted_order = _get_ranking_orders( - label_scores, predicted_scores, top_k_int=top_k_int) - - predicted_relevance = _get_relevance_scores(predicted_order) - sorted_relevance = _get_relevance_scores(sorted_labels) - - cg_discount = _get_cg_discount(top_k_int) - - # cg_discount is safe as a denominator - dcg_k = predicted_relevance / cg_discount - dcg = tf.reduce_sum(dcg_k) - - idcg_k = sorted_relevance / cg_discount - idcg = tf.reduce_sum(idcg_k) - - ndcg = safe_div(dcg, idcg, 'ndcg_in_lambdarank_training') - - # remove the gain from label i then add the gain from label j - tiled_ij = tf.tile(dcg_k, [1, top_k_int]) - new_ij = (predicted_relevance / tf.transpose(cg_discount)) - - tiled_ji = tf.tile(tf.transpose(dcg_k), [top_k_int, 1]) - new_ji = tf.transpose(predicted_relevance) / cg_discount - - # if swap i and j, remove the stale cg for i, then add the new cg for i, - # remove the stale cg for j, and then add the new cg for j - new_dcg = dcg - tiled_ij + new_ij - tiled_ji + new_ji - - new_ndcg = safe_div(new_dcg, idcg, 'new_ndcg_in_lambdarank_training') - swapped_ndcg = tf.abs(ndcg - new_ndcg) - return swapped_ndcg - - -def _dcg_idcg(relevance_scores, cg_discount): - """ - Calculate DCG scores for top_k_int ranking positions - Args: - relevance_scores: a real `Tensor`. - cg_discount: a real `Tensor`, with dtype matching relevance_scores - Returns: - a `Tensor` that holds \\sum_{i=1}^k \frac{relevance_scores_k}{cg_discount} - """ - # cg_discount is safe - dcg_k = relevance_scores / cg_discount - return tf.reduce_sum(dcg_k) - - -def _get_ranking_orders(label_scores, predicted_scores, top_k_int=1): - """ - Calculate DCG scores for top_k_int ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: an integer or an int `Tensor`. - Returns: - two `Tensors` that hold sorted_labels: the ground truth relevance socres - and predicted_order: relevance socres based on sorted predicted_scores - """ - # sort predictions_scores and label_scores - # size [batch_size/num of DataRecords, 1] - label_scores = tf.reshape(label_scores, [-1, 1]) - predicted_scores = tf.reshape(predicted_scores, [-1, 1]) - # sorted_labels contians the relevance scores of the correct order - sorted_labels, ordered_labels_indices = tf.nn.top_k( - tf.transpose(label_scores), k=top_k_int) - sorted_labels = tf.transpose(sorted_labels) - # sort predicitons and use the indices to obtain the relevance scores of the predicted order - sorted_predictions, ordered_predictions_indices = tf.nn.top_k( - tf.transpose(predicted_scores), k=top_k_int) - ordered_predictions_indices_for_labels = tf.transpose(ordered_predictions_indices) - # predicted_order contians the relevance scores of the predicted order - predicted_order = tf.gather_nd(label_scores, ordered_predictions_indices_for_labels) - return sorted_labels, predicted_order - - -def _get_cg_discount(top_k_int=1): - r""" - Calculate discounted gain factor for ranking position till top_k_int - Args: - top_k_int: An int or an int `Tensor`. 
- Returns: - a `Tensor` that holds \log_{2}(i + 1), i \in [1, k] - """ - log_2 = tf.log(tf.constant(2.0, dtype=tf.float32)) - # top_k_range needs to start from 1 to top_k_int - top_k_range = tf.range(top_k_int) + 1 - top_k_range = tf.reshape(top_k_range, [-1, 1]) - # cast top_k_range to float - top_k_range = tf.cast(top_k_range, dtype=tf.float32) - cg_discount = tf.log(top_k_range + 1.0) / log_2 - return cg_discount - - -def _get_relevance_scores(scores): - return 2 ** scores - 1 - - -def safe_log(raw_scores, name=None): - """ - Calculate log of a tensor, handling cases that - raw_scores are close to 0s - Args: - raw_scores: An float `Tensor`. - Returns: - A float `Tensor` that hols the safe log base e of input - """ - epsilon = 1E-8 - clipped_raw_scores = tf.maximum(raw_scores, epsilon) - return tf.log(clipped_raw_scores) diff --git a/twml/twml/contrib/utils/normalizer.docx b/twml/twml/contrib/utils/normalizer.docx new file mode 100644 index 000000000..6f2a56123 Binary files /dev/null and b/twml/twml/contrib/utils/normalizer.docx differ diff --git a/twml/twml/contrib/utils/normalizer.py b/twml/twml/contrib/utils/normalizer.py deleted file mode 100644 index a6a7035b8..000000000 --- a/twml/twml/contrib/utils/normalizer.py +++ /dev/null @@ -1,39 +0,0 @@ -import tensorflow.compat.v1 as tf -from twml.contrib.utils import math_fns - - -def mean_max_normalizaiton(dense_tensor): - """ - In-batch normalization - Args: - dense_tensor: A dense `Tensor`. - Returns: - (dense_tensor - mean) / abs(max value) - Note: - when dense_tensor is of size [1, ?] it will give 0 - If this is not what you want handle it outside the function - """ - dense_mean = tf.reduce_mean(dense_tensor, reduction_indices=[0]) - dense_abs_max = tf.abs(tf.reduce_max(dense_tensor, reduction_indices=[0])) - dense_tensor = math_fns.safe_div(dense_tensor - dense_mean, dense_abs_max, - 'mean_max_normalization_in_batch') - return dense_tensor - - -def standard_normalizaiton(dense_tensor): - """ - In-batch normalization - z-normalization or standard_normalization in batch - Args: - dense_tensor: A dense `Tensor`. - Returns: - (dense_tensor - mean) / variance - Note: - when dense_tensor is of size [1, ?] it will give 0 - If this is not what you want handle it outside the function - """ - epsilon = 1E-7 - dense_mean, dense_variance = tf.nn.moments(dense_tensor, 0) - # using epsilon is safer than math_fns.safe_div in here - dense_tensor = (dense_tensor - dense_mean) / (dense_variance + epsilon) - return dense_tensor diff --git a/twml/twml/contrib/utils/scores.docx b/twml/twml/contrib/utils/scores.docx new file mode 100644 index 000000000..cdbbeaffa Binary files /dev/null and b/twml/twml/contrib/utils/scores.docx differ diff --git a/twml/twml/contrib/utils/scores.py b/twml/twml/contrib/utils/scores.py deleted file mode 100644 index 84e792c13..000000000 --- a/twml/twml/contrib/utils/scores.py +++ /dev/null @@ -1,33 +0,0 @@ -import tensorflow.compat.v1 as tf - - -def get_pairwise_scores(tensor_input): - """ - This is so far used in pariwise learning-to-rank - - Arguments: - tensor_input: a dense `Tensor` of shape [n_data, 1] - n_data is the number of teet candidates - - Returns: - pairwise scores: a dense `Tensor` of shape [n_data, n_data]. 
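A small hedged sketch of what the two in-batch normalizers above compute, using a made-up batch (the misspelled function names are the names actually exported by the removed module):

import tensorflow.compat.v1 as tf
from twml.contrib.utils import mean_max_normalizaiton, standard_normalizaiton

dense = tf.constant([[1.0, 10.0],
                     [2.0, 20.0],
                     [3.0, 30.0]])

mean_max = mean_max_normalizaiton(dense)   # (x - column_mean) / |column_max|
standard = standard_normalizaiton(dense)   # (x - column_mean) / (column_variance + 1e-7)

with tf.Session() as sess:
    print(sess.run([mean_max, standard]))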
- """ - return tensor_input - tf.transpose(tensor_input) - - -def get_pairwise_label_scores(labels): - """ - This is so far used in pariwise learning-to-rank - Args: - labels: a dense `Tensor` of shape [n_data, 1] - n_data is the number of teet candidates - Returns: - pairwise label scores: a dense `Tensor` of shape [n_data, n_data]. - each value is within [0, 1] - """ - # raw pairwise label scores/differences - pairwise_label_scores = get_pairwise_scores(labels) - # sanity check to make sure values in differences_ij are [-1, 1] - differences_ij = tf.maximum(tf.minimum(1.0, pairwise_label_scores), -1.0) - # values in pairwise_label_scores are within [0, 1] for cross entropy - return (1.0 / 2.0) * (1.0 + differences_ij) diff --git a/twml/twml/contrib/utils/similarities.docx b/twml/twml/contrib/utils/similarities.docx new file mode 100644 index 000000000..90cc59736 Binary files /dev/null and b/twml/twml/contrib/utils/similarities.docx differ diff --git a/twml/twml/contrib/utils/similarities.py b/twml/twml/contrib/utils/similarities.py deleted file mode 100644 index 212065f88..000000000 --- a/twml/twml/contrib/utils/similarities.py +++ /dev/null @@ -1,17 +0,0 @@ -import tensorflow.compat.v1 as tf - - -def cosine_similarity(x1, x2, axis): - """ - cosine similarity of two tensors. - - Arguments: - x1: - A tf.Tensor - x2: - A tf.Tensor - axis: Dimension along which to normalize. - """ - normalize_x1 = tf.nn.l2_normalize(x1, axis=axis) - normalize_x2 = tf.nn.l2_normalize(x2, axis=axis) - return tf.reduce_sum(tf.multiply(normalize_x1, normalize_x2), axis=axis) diff --git a/twml/twml/dataset.docx b/twml/twml/dataset.docx new file mode 100644 index 000000000..d2ee986e7 Binary files /dev/null and b/twml/twml/dataset.docx differ diff --git a/twml/twml/dataset.py b/twml/twml/dataset.py deleted file mode 100644 index 4356fdc7c..000000000 --- a/twml/twml/dataset.py +++ /dev/null @@ -1,372 +0,0 @@ -""" -This module implements custom tf.data.datasets for twml. -""" -import numbers - -from absl import logging -from kazoo.client import KazooClient -from libtwml import OPLIB -import tensorflow.compat.v1 as tf -from twml.constants import DEFAULT_ZOOKEEPER_BASE_ZNODE, DEFAULT_ZOOKEEPER_HOST - - -class BlockFormatDataset(tf.data.Dataset): - """A ``tf.data.Dataset`` comprising records from one or more TFRecord files.""" - - def __init__(self, filenames, compression_type="auto", buffer_size=1 << 20): - """ - Creates a ``BlockFormatDataset``. - - Args: - filenames: - A `tf.string` tensor containing one or more filenames. - compression_type: - A string specifying the compression type. - Can be one of 'gz' (or 'gzip'), 'none', 'auto' (default). - When compression_type == 'auto', it is inferred from file extension. - buffer_size: - Buffer size to be used during decompression. default: 1<<20. - """ - self._filenames = tf.convert_to_tensor(filenames, dtype=tf.string, name="filenames") - self._compression_type = tf.convert_to_tensor(compression_type.lower(), name="compression_type") - self._buffer_size = tf.convert_to_tensor(buffer_size, dtype=tf.int64, name="buffer_size") - # Parent class calss self._as_variant_tensor in init. So call this at the end. - super(BlockFormatDataset, self).__init__() - - def _as_variant_tensor(self): - """ - Create the resource handle for the dataset. 
- """ - try: - block_format_dataset = __import__("libtwml_internal").OPLIB.block_format_dataset - return block_format_dataset(self._filenames) - except ImportError: - block_format_dataset = OPLIB.block_format_dataset_v2 - return block_format_dataset(self._filenames, self._compression_type, self._buffer_size) - - def _inputs(self): - return [] - - @property - def output_shapes(self): - """Return output shapes""" - return tf.TensorShape([]) - - @property - def output_types(self): - """Return output types""" - return tf.string - - @property - def output_classes(self): - """Return output classes""" - return tf.Tensor - - -def downsample_dataset(dataset, sample_rate, rate_name): - """ - Downsample a tf.data.Dataset at sample_rate - """ - if sample_rate is None or sample_rate == 1.0: - return dataset - elif not isinstance(sample_rate, numbers.Real): - raise TypeError("dataset %s must be a real number" % rate_name) - elif sample_rate <= 0 or sample_rate > 1: - raise ValueError("dataset %s must be in range (0, 1])" % rate_name) - return dataset.filter(lambda _: tf.squeeze(tf.random_uniform([1])) < sample_rate) - - -def _filenames_dataset(files, shards=None, shard_index=None): - """ - Get a tf.data.Dataset with file names from a list of files - Optionally shard the file list (see stream_block_format_dataset) - """ - files = tf.data.Dataset.from_tensor_slices(files) - - if [shards, shard_index] != [None, None]: - logging.info("Sharding files dataset (index: %d, shards: %d)" % (shard_index, shards)) - files = files.shard(num_shards=shards, index=shard_index) - - return files - - -def stream_block_format_dataset( - files, parse_fn, batch_size, num_threads, - shuffle=True, repeat=False, - block_length=None, part_file_parallelism=None, file_shuffle_size=None, - record_shuffle_size=None, dataset_fn=None, - keep_rate=None, parts_downsampling_rate=None, prefetch_size=2, - shards=None, shard_index=None, shuffle_files=True, interleave=True): - """ - Helper function to stream a list of part files. - - Args: - files: - List of input files which will create a dataset. - parse_fn: - A function that takes a byte tensor containing a datarecord and decodes it. - batch_size: - The batch size for each step. - num_threads: - Number of threads working on the data in parallel. - shuffle: - Shuffle records within each file using ``record_shuffle_size``. Defaults to True. - repeat: - Repeat the dataset indefinitely. Defaults to False. - Useful when you want to use an ``[train,eval]_steps`` greater than the size of the dataset - (otherwise ``Estimator.[train,evaluate]`` stop when the end of the dataset is reached). - block_length (optional): - Number of consecutive records to pull from a single part file. - Defaults to batch_size. - part_file_parallelism (optional): - Number of part files to read from in parallel. Once a part file is completely read, it will - be replaced by the next part file in the part file list. - - ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies - the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or - equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand, - if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader - thread pool will be underutilized, since it can never be the case that every reader thread has - a part file to read from. - - file_shuffle_size (optional): - the buffer_size used for shuffling of the list of files. 
- Defaults to 1000. For example, if you have 2000 files, the first - 1000 files are shuffled together, iterated through, then the next 1000 files are shuffled - and iterated through. - record_shuffle_size (optional): - the ``buffer_size`` used for shuffling records in each thread. - Defaults to ``batch_size * 8`` records. - dataset_fn (optional): - A function of that modifies the dataset after it reads different interleaved parts files. - Defaults to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - return dataset.batch(batch_size).map(parse_fn, 1) - - keep_rate (optional): - A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli - distribution with p = 1 - keep_rate. - Defaults to None (no records dropped). - - parts_downsampling_rate (optional): - A float value in ``(0.0, 1.0]`` that indicates the factor by which to downsample part files. - For example, a value of 0.2 means only 20 percent of part files become part of the dataset. - - Note that this argument is only useful in conjunction with a [train,eval]_steps of -1 - (that is, when the entire dataset is used). Furthermore, note that even in this case, each - epoch will see a different set of part files. This is because new part files are re-sampled - every epoch. In other words, this argument is only provided for backwards compatibility with - DeepBird v1. We recommend you use a smaller [train,eval]_steps (or specify a keep_rate) - instead. - - shards (optional): - Number of partitions to shard the dataset into. This is useful for codistillation and other - techniques that require each worker to train on disjoint partitions of the dataset. - The dataset is not sharded by default. - - shard_index (optional): - Which partition of the dataset to use if ``shards`` is set. - - shuffle_files (optional): - Shuffle the list of files. Defaults to True. - When False, files are iterated in the order they are passed in. - - interleave (optional): - Interleave records from multiple files in parallel. Defaults to True. - - Returns: - tf.data.DataSet of batches of HashedDataRecord resource handles decoded and streamed online. - """ - # Creating a dataset from an input directory - - files = _filenames_dataset(files, shards=shards, shard_index=shard_index) - - file_shuffle_size = file_shuffle_size if file_shuffle_size is not None else 100000 - record_shuffle_size = record_shuffle_size if record_shuffle_size is not None else (batch_size * 8) - block_length = block_length if block_length is not None else batch_size - - logging.info("NUM_THREADS: %d", num_threads) - - if repeat: - files = files.repeat() - - if shuffle_files: - # Randomly shuffle the files list. - files = files.shuffle(buffer_size=file_shuffle_size) - - # Downsample parts files - files = downsample_dataset(files, parts_downsampling_rate, "parts_downsampling_rate") - - # Interleave the result from BlockFormatDataset - # block_length == batch_size results in batch_size records being read from a single file. - def map_fn(filenames): - '''function that maps each filename to a BlockFormatDataset''' - # reach each file using BlockFormatDataset - dataset = BlockFormatDataset(filenames) - - # early prefetching can sometimes improve performance (like on GCS) - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) - - # Shuffling before repeating ensures strong ordering. 
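To make the long argument list above concrete (the docstring and implementation continue below), a hedged usage sketch: the part-file paths are made up, and parse_fn is a stand-in for the decoder normally generated from a twml.FeatureConfig.

import twml

files = ['hdfs:///user/example/dataset/part-00000.lzo',
         'hdfs:///user/example/dataset/part-00001.lzo']

def parse_fn(serialized_batch):
    # Stand-in decoder; a real parse_fn turns a batch of serialized
    # DataRecords into feature tensors (typically built from a FeatureConfig).
    return serialized_batch

def input_fn():
    return twml.dataset.stream_block_format_dataset(
        files=files,
        parse_fn=parse_fn,
        batch_size=128,
        num_threads=4,
        shuffle=True,
        repeat=False)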
- if shuffle: - dataset = dataset.shuffle(buffer_size=record_shuffle_size) - - return dataset - - if interleave: - part_file_parallelism = num_threads if part_file_parallelism is None else part_file_parallelism - dataset = files.interleave( - map_fn, cycle_length=part_file_parallelism, block_length=block_length, num_parallel_calls=num_threads) - else: - dataset = files.flat_map(map_fn) - - # Downsample DataRecords - dataset = downsample_dataset(dataset, keep_rate, "keep_rate") - - if dataset_fn is None: - # Create a batch of datarecords and decode them - return dataset.batch(batch_size).map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE).prefetch(prefetch_size) - - return dataset_fn(dataset, parse_fn, batch_size) - - -def cx_zk_path(path): - if path is None: - raise ValueError("Path for zookeeper dataset pointer is None. You must specify a path.") - return_path = "/".join([DEFAULT_ZOOKEEPER_BASE_ZNODE, path]) - logging.info("Zookeeper path is: {}".format(return_path)) - return return_path - - -def zookeeper_ordered_dataset( - files, parse_fn, batch_size, zk_counter_path, repeat=False, - num_threads=2, block_length=None, part_file_parallelism=None, - batch_shuffle_size=None, file_keep_rate=None, record_keep_rate=None, - prefetch_size=2, interleave=False, dataset_fn=None, verbose=False): - """ - Make a tf.Dataset given an ordered list of filenames, using Zookeeper to keep track of - which file to read, and to coordinate multiple workers. - - Args: - files: - ordered list of (typically HDFS) filenames. This must remain consistent - between different workers, and between worker restarts (e.g. in the case - of instance failure or preemption). - To ensure this remains consistent, consider using the --train.files_list - option from DataRecordTrainer. - parse_fn: - A function that takes a byte tensor containing a datarecord and decodes it. - batch_size: - The batch size for each step. - zk_counter_path: - Path under the root node for the underlying zookeeper shared counter that - is used to coordinate distributed iteration over the list of files. - Full path will be `'/'.join([DEFAULT_ZOOKEEPER_BASE_ZNODE, zk_counter_path])`. - repeat: - Default False. Set True to repeat over the files forever. - num_threads: - Default 2. Number of threads working on the data in parallel. - Only used if interleave=True. - block_length: - Default None. Number of consecutive records to pull from a single part file. - If None, then block_length=batch_size will be used. - Only used if interleave=True. - part_file_parallelism: - Default None. Number of part files to read from in parallel. Once a part file is completely - read, it will be replaced by the next part file indicated by the zookeeper counter. - Only used if interleave=True. - - ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies - the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or - equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand, - if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader - thread pool will be underutilized, since it can never be the case that every reader thread has - a part file to read from. - - batch_shuffle_size: - Default None. Size of shuffle buffer, for shuffling that will be applied after batching. - if None, then batches will not be shuffled. Ignored if dataset_fn is provided. - file_keep_rate: - Default None. 
Fraction of files to keep, or None to keep all files. - record_keep_rate: - Default None. Fraction of records to keep, or None to keep all records. - prefetch_size: - Default 2. Number of parsed batches to prefetch. Ignored if dataset_fn is provided. - interleave: - Default False. Set True to use tf.data.Dataset.interleave rather than flat_map. - dataset_fn: - A function that is applied to the dataset of individual records, after - these have been read from the parts files. - If ``None`` (the default), the behavior will be as though dataset_fn were set to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - dataset = dataset.batch(batch_size) - dataset = dataset.map(parse_fn, tf.data.experimental.AUTOTUNE) - if batch_shuffle_size: - dataset = dataset.shuffle(batch_shuffle_size) - return dataset.prefetch(prefetch_size) - - verbose: - Default False. Set True to log the names of files loaded by TF. - """ - block_length = batch_size if block_length is None else block_length - part_file_parallelism = num_threads if part_file_parallelism is None else part_file_parallelism - - def zk_index_generator(my_files=files): - zk = KazooClient(hosts=DEFAULT_ZOOKEEPER_HOST) - zk.start() - my_counter = zk.Counter(cx_zk_path(zk_counter_path), default=0) - while True: - my_counter += 1 - counter_pre_value = my_counter.pre_value - if repeat: - counter_pre_value = counter_pre_value % len(my_files) - if counter_pre_value >= len(my_files): - break - else: - chosen_file = my_files[counter_pre_value] - if verbose: - logging.info("{}. yielding {}".format(counter_pre_value, chosen_file)) - yield chosen_file - zk.stop() - - files = tf.data.Dataset.from_generator(zk_index_generator, tf.string) - - # Downsample parts files - files = downsample_dataset(files, file_keep_rate, "file_keep_rate") - - def map_fn(filenames): - return BlockFormatDataset(filenames).prefetch(20) - - # Dont interleave for sequential training - if interleave: - dataset = files.interleave( - map_fn, - cycle_length=part_file_parallelism, - block_length=block_length, - num_parallel_calls=num_threads) - else: - dataset = files.flat_map(map_fn) - - # Downsample DataRecords - dataset = downsample_dataset(dataset, record_keep_rate, "record_keep_rate") - - if dataset_fn is None: - # Create a batch of datarecords and decode them - dataset = dataset.batch(batch_size) - dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) - # shuffle after batching and parsing for performance reasons - # faster b/c 1 random selection is made per batch rather than per record - if batch_shuffle_size: - dataset = dataset.shuffle(buffer_size=batch_shuffle_size) - dataset = dataset.prefetch(prefetch_size) - - else: - dataset = dataset_fn(dataset, parse_fn, batch_size) - - return dataset diff --git a/twml/twml/errors.docx b/twml/twml/errors.docx new file mode 100644 index 000000000..46c86d076 Binary files /dev/null and b/twml/twml/errors.docx differ diff --git a/twml/twml/errors.py b/twml/twml/errors.py deleted file mode 100644 index 9b50fcd79..000000000 --- a/twml/twml/errors.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -Error classes for twml -""" - - -class EarlyStopError(Exception): - """Exception used to indicate evaluator needs to early stop.""" - pass - - -class CheckpointNotFoundError(Exception): - """Exception used to indicate a checkpoint hasnt been found.""" - pass diff --git a/twml/twml/export_output_fns.docx b/twml/twml/export_output_fns.docx new file mode 100644 index 000000000..fbcfa1cba Binary files /dev/null and 
b/twml/twml/export_output_fns.docx differ diff --git a/twml/twml/export_output_fns.py b/twml/twml/export_output_fns.py deleted file mode 100644 index f72e1d0fe..000000000 --- a/twml/twml/export_output_fns.py +++ /dev/null @@ -1,17 +0,0 @@ -''' -Contains implemenations of DataRecordTrainer.get_export_output_fns that specify how to -export model graph outputs from build_graph to DataRecords for prediction servers. - -Modelers can use the functions in this module as the export_output_fn parameter of -the DataRecordTrainer constructor to customize how to export their model outputs. - -Modelers may also provide a custom implementation of export_output_fn using these as reference. -''' - -# pylint: disable=invalid-name -from twitter.deepbird.io.legacy.export_output_fns import ( - batch_prediction_continuous_output_fn, # noqa: F401 - batch_prediction_tensor_output_fn, # noqa: F401 - default_output_fn, # noqa: F401 - variable_length_continuous_output_fn, # noqa: F401 -) diff --git a/twml/twml/feature_config.docx b/twml/twml/feature_config.docx new file mode 100644 index 000000000..dbf611229 Binary files /dev/null and b/twml/twml/feature_config.docx differ diff --git a/twml/twml/feature_config.py b/twml/twml/feature_config.py deleted file mode 100644 index 37004f442..000000000 --- a/twml/twml/feature_config.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Feature configuration for DeepBird jobs: -- Which features to keep -- Which features to blacklist -- Which features are labels -- Which feature is the weight -""" - -from twitter.deepbird.io.legacy import feature_config - - -class FeatureConfig(feature_config.FeatureConfig): - def get_feature_spec(self): - """ - Generates a serialization-friendly dict representing this FeatureConfig. - """ - doc = super(FeatureConfig, self).get_feature_spec() - # Override the class in the spec. - doc["class"] = "twml.FeatureConfig" - return doc - - -class FeatureConfigBuilder(feature_config.FeatureConfigBuilder): - def build(self): - # Overwrite self.build() to return twml.FeatureConfig instead - """ - Builds and returns FeatureConfig object. - """ - - ( - features, - tensor_types, - sparse_tensor_types, - feature_map, - feature_name_to_feature_parser, - feature_in_bq_name, - ) = self._build() - - return FeatureConfig( - features=features, - labels=self._labels, - weight=self._weight, - filters=self._filter_features, - tensor_types=tensor_types, - sparse_tensor_types=sparse_tensor_types, - feature_types=feature_map, - decode_mode=self._decode_mode, - legacy_sparse=self._legacy_sparse, - feature_name_to_feature_parser=self._feature_name_to_feature_parser, - feature_in_bq_name=self._feature_in_bq_name, - ) - - -_name_to_id = feature_config._name_to_id diff --git a/twml/twml/filters.docx b/twml/twml/filters.docx new file mode 100644 index 000000000..d2e28ec9d Binary files /dev/null and b/twml/twml/filters.docx differ diff --git a/twml/twml/filters.py b/twml/twml/filters.py deleted file mode 100644 index e48633808..000000000 --- a/twml/twml/filters.py +++ /dev/null @@ -1,9 +0,0 @@ -''' -Includes functions to filter features dict build from -data records. 
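As the removed export_output_fns docstring above notes, these functions are meant to be handed to the DataRecordTrainer constructor. A schematic sketch; params, build_graph, and feature_config stand in for objects a real trainer script would construct.

import twml
from twml.export_output_fns import batch_prediction_continuous_output_fn

def make_trainer(params, build_graph, feature_config):
    # export_output_fn controls how build_graph outputs are exported to
    # DataRecords for prediction servers.
    return twml.trainers.DataRecordTrainer(
        name='example_model',
        params=params,
        build_graph_fn=build_graph,
        feature_config=feature_config,
        export_output_fn=batch_prediction_continuous_output_fn)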
-''' - -from twitter.deepbird.io.legacy.filters import ( - balance_binary_class_samples, # noqa: F401 - sparse_keep_feature_if, # noqa: F401 - sparse_keep_sample_if) # noqa: F401 diff --git a/twml/twml/hooks.docx b/twml/twml/hooks.docx new file mode 100644 index 000000000..f043d1cb1 Binary files /dev/null and b/twml/twml/hooks.docx differ diff --git a/twml/twml/hooks.py b/twml/twml/hooks.py deleted file mode 100644 index cdf733535..000000000 --- a/twml/twml/hooks.py +++ /dev/null @@ -1,562 +0,0 @@ -""" This file contains tf.train.SessionRunHooks defined by TWML """ -from datetime import datetime -import json -import operator -import os - -from absl import logging -import numpy as np -import tensorflow.compat.v1 as tf -from tensorflow.python.training.basic_session_run_hooks import NeverTriggerTimer, SecondOrStepTimer -import twml - - -class StepProgressHook(tf.train.SessionRunHook): - """Hook that displays a progress bar to monitor global step progress """ - - def __init__(self, max_step): - """ - Initializes a `StepProgressHook`. - This hook displays a progress bar for max_steps. - - Note that this hook only works for training and calibration. - - Args: - max_steps: - maximum steps to monitor in progress bar. - When this many steps is reached, the progress bar will be full. - """ - self._max_step = max_step - self._start_step = 0 - self._global_step_tensor = None - self._progress_bar = None - - def begin(self): - """ sets the global_step_tensor """ - self._global_step_tensor = tf.train.get_or_create_global_step() - if self._global_step_tensor is None: - raise RuntimeError("Global step should be created to use StepProgressHook.") - - def after_create_session(self, session, coord): - """ creates the progress bar and keeps track of the first global step upon session creation """ - global_step = session.run(self._global_step_tensor) - self._start_step = global_step - self._progress_bar = tf.keras.utils.Progbar(self._max_step) - - def before_run(self, run_context): # pylint: disable=unused-argument - """ invoked before calling session.run """ - return tf.train.SessionRunArgs(self._global_step_tensor) - - def after_run(self, run_context, run_values): - """ invoked after run is called. Updates the progress bar. """ - step = run_context.session.run(self._global_step_tensor) - self._progress_bar.update(step - self._start_step) - - -class GetMetricsHook(tf.train.SessionRunHook): - """ - Hook used to obtain evaluation metrics. - Typically used for early-stopping by obtaining the value of a - metric at the end of an epoch. - Note that the metric tensor and its commensurate update Op - are responsible for aggregating the metric during the session - (one session per epoch). Used for evaluation. - """ - - def __init__(self, get_metrics_fn): - """GetMetricsHook constructor. - - Args: - get_metrics_fn: - Function that returns a dict mapping metric keys to - tensors as a tf.Tensor. - See Trainer.learn for an example use-case. - """ - - self._get_metrics_fn = get_metrics_fn - self._metric_tensors = None - self.metric_values = None - - def begin(self): - """ sets the global_step_tensor and metric tensor""" - self._metric_tensors = self._get_metrics_fn() - assert isinstance(self._metric_tensors, dict) - - def end(self, session): - self.metric_values = session.run(self._metric_tensors) - - -class EarlyStopHook(GetMetricsHook): - """ - A GetMetricsHook augmented with early-stopping logic for use - within the Trainer.learn method. 
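A hedged sketch of GetMetricsHook in isolation; the stand-alone accuracy metric and MonitoredSession are illustrative, whereas in twml the metric dict usually comes from an EstimatorSpec's eval_metric_ops (value ops only), as EarlyStopHook below does.

import tensorflow.compat.v1 as tf
from twml.hooks import GetMetricsHook

labels = tf.constant([1.0, 0.0, 1.0])
predictions = tf.constant([1.0, 1.0, 1.0])
accuracy_value, accuracy_update = tf.metrics.accuracy(labels=labels, predictions=predictions)

hook = GetMetricsHook(get_metrics_fn=lambda: {'accuracy': accuracy_value})

with tf.train.MonitoredSession(hooks=[hook]) as sess:
    sess.run(accuracy_update)

# end() has run once the session closes, so the aggregated value is available:
print(hook.metric_values)  # e.g. {'accuracy': 0.6666667}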
- """ - - def __init__(self, - metric, - patience, - minimize, - get_estimator_spec_fn, - checkpoint_dir, - file_path=None, - exit_on_end=True, - start_epoch=0, - tolerance=0): - """ - Prepare early-stopping hook and variables. - - Args: - metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - minimize: - Set this to True for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - tolerance: - A non-negative tolerance for comparing early_stop_metric. - e.g. when maximizing the condition is current_metric > best_metric + tolerance." - Defaults to 0. - get_estimator_spec_fn: - function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - checkpoint_dir: - path to directory containing the Estimator checkpoints. - file_path: - path to file that is used by this hook to communicate early-stopping - to StopIfExistsHook. This hook would be used for evaluation, while - the StopIfExistsHooks (the listeners) would be used for training. - When the file is created, the StopIfExistsHooks detect and terminate training. - This argument is used by ``Trainer.train_and_evaluate``. - exit_on_end: - when the end() method is called to indicate that the session is terminating, - and exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the evaluation job. - This is set to False by the trainer for non distributed jobs. - start_epoch: - Specifies the starting epoch number. This is used for logging purposes only. 
- """ - if not isinstance(metric, str): - raise ValueError("Expecting string for metric arg") - if not isinstance(patience, int): - raise ValueError("Expecting positive number for metric arg") - - self.should_stop = False - self._metric = metric - self._patience = patience - self._current_patience = patience - self._checkpoint_dir = checkpoint_dir - self._exit_on_end = exit_on_end - self._latest_checkpoint_path = None - # used for distributed training (tf.estimator.train_and_evaluate) - self._file_path = file_path - self._epoch = start_epoch - if self._file_path is not None: - # TODO try to read epoch from a file that we create - if tf.io.gfile.exists(self._file_path): - # delete the file if it exists (not sure this makes sense) - logging.info("EarlyStopHook: Removing existing file: %s.", self._file_path) - tf.io.gfile.remove(self._file_path) - - # best_checkpoint dir will contain the best checkpoint - self._best_checkpoint_path = os.path.join(checkpoint_dir, 'best_checkpoint') - self._eval_checkpoint_path = os.path.join(checkpoint_dir, 'eval_checkpoint') - self._best_metric_path = os.path.join(self._best_checkpoint_path, self._metric) - - if tf.io.gfile.exists(self._best_metric_path): - with tf.io.gfile.GFile(self._best_metric_path, mode="r") as f: - best_metric_from_file = float(f.read()) - else: - best_metric_from_file = None - - if minimize: - # current < best : is better - self._is_better_than = operator.lt - # worse metric possible - if best_metric_from_file is None: - self._best_metric = np.inf - else: - self._best_metric = best_metric_from_file - tolerance - # used for printing - self._early_stop_name = "minimum" - else: - # current > best : is better - self._is_better_than = operator.gt - # worse metric possible - if best_metric_from_file is None: - self._best_metric = -np.inf - else: - self._best_metric = best_metric_from_file + tolerance - # used for printing - self._early_stop_name = "maximum" - - def get_metrics_fn(): - """ function to get metric tensors to early-stopping """ - estimator_spec = get_estimator_spec_fn() - eval_metric_ops = estimator_spec.eval_metric_ops - if metric not in eval_metric_ops: - raise ValueError( - "Expecting early_stop_metric '%s' key in eval_metric_ops dict" - % (metric)) - # get the value_op from the (value_op, update_op) value - return {k: v[0] for k, v in eval_metric_ops.items()} - - # initialize GetMetricsHook to get current value of metric from session - super(EarlyStopHook, self).__init__(get_metrics_fn=get_metrics_fn) - - def early_stop(self, epoch): - """ - Looks at the current value of the early stopping metric. - Decrements current patience. If metric improves, patience is reset - and latest checkpoint is moved to checkpoint_dir/best_checkpoint. - If current patience reaches zero, returns True. - - Args: - epoch: - The current epoch number. - - Returns: - True when early-stopped. False otherwise. 
- """ - # decrement patience - self._current_patience -= 1 - - # get the current metric value - current_metric = self.metric_values[self._metric] - - if self._is_better_than(current_metric, self._best_metric): - # save best version of model - self._best_metric = current_metric - logging.info( - "Found new %s %s=%f @ epoch %d", - self._early_stop_name, self._metric, self._best_metric, epoch) - # backup the file to checkpoint_dir/best_checkpoint - assert self._latest_checkpoint_path, "expecting latest checkpoint" - logging.info("Backing up " + self._latest_checkpoint_path) - - try: - eval_checkpoint = tf.train.latest_checkpoint(self._eval_checkpoint_path) - twml.util.backup_checkpoint( - checkpoint_path_prefix=eval_checkpoint, - backup_path=self._best_checkpoint_path) - except twml.errors.CheckpointNotFoundError as ex: - msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" - raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) - - tf.io.gfile.makedirs(os.path.dirname(self._best_metric_path)) - with tf.io.gfile.GFile(self._best_metric_path, mode="w") as f: - # Write with enough precision - f.write("%.8f" % self._best_metric) - - # reset patience - self._current_patience = self._patience - - elif self._current_patience > 0: - logging.info("No new %s found after %d epochs", - self._early_stop_name, self._patience - self._current_patience) - elif self._current_patience == 0: - logging.info( - "No new %s found after %d epochs. Early-stopping experiment.", - self._early_stop_name, self._patience) - return True - - return False - - def cleanup_checkpoints(self): - """ - makes it so that the best checkpoint is the only checkpoint - in checkpoint_dir. - """ - raise NotImplementedError("cleanup_checkpoints is no longer supported") - - def end(self, session): - """ - This method is called at the end of an evaluation/epoch. - When file_path constructor argument is provided, this - will call ``early_stop()``. - When ``early_stop()`` returns True, it creates the file_path, - which will be detected by StopIfExistsHooks - and stop training for all workers and the chief. It will - also call ``cleanup_checkpoints()``. - """ - super(EarlyStopHook, self).end(session) - - # Checks for early stopping criteria and makes a backup - self.should_stop = self.early_stop(self._epoch) - - if self._file_path is not None: - if self.should_stop: - # create a file to inform workers - with tf.io.gfile.GFile(self._file_path, "wb") as gfile: - gfile.write("early-stop\n") - # makes the best checkpoint the only checkpoint in save_dir. - msg = "early-stopping evaluation at epoch %d" % self._epoch - logging.info(msg) - if self._exit_on_end: - raise twml.errors.EarlyStopError(msg) - else: - self._latest_checkpoint_path = None - - self._epoch += 1 - - def begin(self): - """ - Saves the latest_checkpoint in case it gets superseded by another checkpoint. - Remember that when used with train_and_evaluate, the chief saves checkpoints - continuouly. The chief could save a checkpoint after evaluation started. - So saving the checkpoint at the beginning of evaluation ensures that we - later save the correct best checkpoint. 
- """ - super(EarlyStopHook, self).begin() - self._latest_checkpoint_path = tf.train.latest_checkpoint(self._checkpoint_dir) - - assert self._latest_checkpoint_path, "expecting latest checkpoint" - # Backup to temporary directory - try: - twml.util.backup_checkpoint( - checkpoint_path_prefix=self._latest_checkpoint_path, - backup_path=self._eval_checkpoint_path) - except twml.errors.CheckpointNotFoundError as ex: - msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" - raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) - - -class MetricsUpdateHook(GetMetricsHook): - """ - A GetMetricsHook augmented with logic to map SessionRun events to metrics updates. - It is mainly used by `TrackRun` to persist model metrics via Model Repo. - """ - - def __init__(self, - get_estimator_spec_fn, - add_metrics_fn, - every_n_iter=None, - every_n_secs=None - ): - """ - Args: - get_estimator_spec_fn: - function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - add_metrics_fn: `function` callback used to report metrics, called automatically - at the end of every epoch. - every_n_iter: `int`, log the metrics once every N local - steps taken in the current epoch. - every_n_secs: `int` or `float`, log the metrics once every N - seconds passed in the current epoch. Exactly one of `every_n_iter` and `every_n_secs` - should be provided. - Raises: - ValueError: if `every_n_iter` is non-positive or if not exactly one of `every_n_iter` and - `every_n_secs` is set when `add_progress_metrics_fn` is provided. - """ - only_log_at_end = (every_n_iter is None) and (every_n_secs is None) - - if (not only_log_at_end and every_n_iter and every_n_secs): - raise ValueError( - 'exactly one of every_n_iter and every_n_secs must be provided' - ) - - # TODO: should have a minimum to avoid too many calls to ModelRepo? - if every_n_iter is not None and every_n_iter <= 0: - raise ValueError("invalid every_n_iter=%s." % every_n_iter) - - self._timer = ( - NeverTriggerTimer() if only_log_at_end else - SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_iter) - ) - - self._should_trigger = False - self._iter_count = 0 - - self._add_metrics_fn = add_metrics_fn - - def get_metrics_fn(): - """ - Function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - """ - estimator_spec = get_estimator_spec_fn() - eval_metric_ops = estimator_spec.eval_metric_ops - # get the value_op from the (value_op, update_op) value - return {k: v[0] for k, v in eval_metric_ops.items()} - super(MetricsUpdateHook, self).__init__(get_metrics_fn=get_metrics_fn) - - def report_metrics(self): - """ - Triggers a metrics report. - """ - self._timer.update_last_triggered_step(self._iter_count) - if self.metric_values is not None: - self._add_metrics_fn(self.metric_values) - - def begin(self): - """ - Triggered before each epoch. - """ - self._timer.reset() - self._iter_count = 0 - return super(MetricsUpdateHook, self).begin() - - def before_run(self, run_context): - """ - Triggered before each step. - """ - self._should_trigger = self._timer.should_trigger_for_step(self._iter_count) - return super(MetricsUpdateHook, self).before_run(run_context) - - def after_run(self, run_context, run_values): - """ - Triggered after each step. 
- """ - if self._should_trigger: - self.report_metrics() - self._iter_count += 1 - return super(MetricsUpdateHook, self).after_run(run_context, run_values) - - def end(self, session): - """ - Triggered after each epoch. - """ - self.report_metrics() - return super(MetricsUpdateHook, self).end(session) - - -class EarlyStopDuration(tf.train.SessionRunHook): - """ - Hook that can be used to terminate a job (training or validation) after a certain duration. - The hook is fault tolerant, i.e., if a job is allotted 1 hour to run and fails after 45 minutes, - then it will only run for 15 minutes once restarted. - - Args: - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - overwrite: - A boolean. If set to True, this hook will overwrite the file containing the elapsed time - since the beginning of the job. In a distributed setting, this will be used so only one - job writes to the file while all others will have read access. In a distributed setting, - if all executors have this parameter set to False, then it just means that the hook will - not be fault tolerant. When restarted, the job will restart the clock from 0. - - save_dir: - String. A directory (located on a file system that is Tensorflow compatible) where - we can store the file which contains the record of the elapsed time. This file is what makes - the hook faul tolerant. - - exit_on_end: - when exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the job. - This is usually set to True to kill a validation job in a distributed setting. - """ - - def __init__(self, max_duration: float, exit_on_end: bool, save_dir: str, overwrite: bool): - self._overwrite = overwrite - self._save_dir = save_dir - self._exit_on_end = exit_on_end - self._max_duration = max_duration - self._last_time_check = datetime.now() - - # Initialize elapse time file - if overwrite: - self.elapsed_time() - - @property - def elapsed_file_path(self): - return os.path.join(self._save_dir, "early_stop_duration.txt") - - def early_stop(self) -> bool: - return self.elapsed_time() > self._max_duration - - def elapsed_time(self) -> float: - # Recorded elapsed time is 0 unless it's been recorded in a file already - recorded_elapsed_time = 0 - if tf.io.gfile.exists(self.elapsed_file_path): - with tf.io.gfile.GFile(self.elapsed_file_path, mode="r") as file: - recorded_elapsed_time = json.loads(file.read())["elapsed_time"] - - elapsed_time = recorded_elapsed_time + (datetime.now() - self._last_time_check).total_seconds() - self._last_time_check = datetime.now() - - if self._overwrite: - # Record the actualized new elapsed time to the file - tf.io.gfile.makedirs(os.path.dirname(self.elapsed_file_path)) - with tf.io.gfile.GFile(self.elapsed_file_path, mode="w") as file: - record = { - "elapsed_time": elapsed_time, - "max_duration": self._max_duration - } - file.write(json.dumps(record, indent=2)) - - return elapsed_time - - def before_run(self, run_context: tf.estimator.SessionRunContext) -> None: - if self.early_stop(): - message = f""" - Stopping job which now exceeded the maximum duration of {self._max_duration} seconds. - """ - logging.info(message) - run_context.request_stop() - - if self._exit_on_end: - raise twml.errors.EarlyStopError(message) - - -class StopAtStepHook(tf.train.StopAtStepHook): - """ - Overrides ``tf.train.StopAtStepHook`` so that - a ``stop_requested`` property can be accessed to determine - if this hook requested a stop. 
- """ - - def __init__(self, *args, **kwargs): - super(StopAtStepHook, self).__init__(*args, **kwargs) - self._stop_requested = False - - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested - - def after_run(self, run_context, run_values): - """ sets self.stop_requested to true when requesting a stop """ - super(StopAtStepHook, self).after_run(run_context, run_values) - self._stop_requested = run_context.stop_requested - - -class StopIfExistsHook(tf.train.SessionRunHook): - """ - Hook that requests stop if a file exists. - This hook is used with the EarlyStopHook to implement - early-stopping for distributed training (tf.estimator.train_and_evaluate). - """ - - def __init__(self, file_path): - """ - Arguments: - file_path: - path to file. When this hook detects that the file exists, - it requests a stop, which effectively kills this worker. - """ - self._file_path = file_path - self._stop_requested = False - - def after_run(self, run_context, run_values): - if tf.io.gfile.exists(self._file_path): - logging.info("Early-stopping file detected; requesting stop") - run_context.request_stop() - self._stop_requested = True - - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested diff --git a/twml/twml/input_fns.docx b/twml/twml/input_fns.docx new file mode 100644 index 000000000..56a883c07 Binary files /dev/null and b/twml/twml/input_fns.docx differ diff --git a/twml/twml/input_fns.py b/twml/twml/input_fns.py deleted file mode 100644 index 394fc8674..000000000 --- a/twml/twml/input_fns.py +++ /dev/null @@ -1,129 +0,0 @@ -''' -Contains implementations of functions to read input data. -''' -from .dataset import stream_block_format_dataset - -import tensorflow.compat.v1 as tf - - -def data_record_input_fn( - files, batch_size, parse_fn, - num_threads=2, repeat=False, dataset_fn=None, - keep_rate=None, parts_downsampling_rate=None, - shards=None, shard_index=None, shuffle=True, shuffle_files=True, interleave=True, - initializable=False, log_tf_data_summaries=False, - **kwargs): - """ - Returns a nested structure of tf.Tensors containing the next element. - Used by ``train_input_fn`` and ``eval_input_fn`` in DataRecordTrainer. - By default, works with DataRecord dataset for compressed partition files. - - Args: - files: - List of files that will be parsed. - batch_size: - number of samples per batch. - parse_fn: - function passed to data loading for parsing individual data records. - Usually one of the decoder functions like ``parsers.get_sparse_parse_fn``. - num_threads (optional): - number of threads used for loading data. Defaults to 2. - repeat (optional): - Repeat the dataset indefinitely. Defaults to False. - Useful when you want to use ``train_steps`` or ``eval_steps`` - greater than the size of the dataset - (otherwise Estimator.[train,evaluate] stops when the end of the dataset is reached). - dataset_fn (optional): - A function that modifies the dataset after it reads different interleaved parts files. - Defaults to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - return dataset.batch(batch_size).map(parse_fn, 1) - - keep_rate (optional): - A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli - distribution with p = 1 - keep_rate. - Defaults to None (no records dropped). - - parts_downsampling_rate (optional): - A float value in (0.0, 1.0] that indicates the factor by which to downsample part files. 
- For example, a value of 0.2 means only 20 percent of part files become part of the dataset. - - shards (optional): - Number of partitions to shard the dataset into. This is useful for codistillation - (https://arxiv.org/pdf/1804.03235.pdf) and other techniques that require each worker to - train on disjoint partitions of the dataset. - The dataset is not sharded by default. - - shard_index (optional): - Which partition of the dataset to use if ``shards`` is set. - - shuffle (optional): - Whether to shuffle the records. Defaults to True. - - shuffle_files (optional): - Shuffle the list of files. Defaults to True. - When False, files are iterated in the order they are passed in. - - interleave (optional): - Interleave records from multiple files in parallel. Defaults to True. - - initializable (optional): - A boolean indicator. When the Dataset Iterator depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value (false) - is used for most plain iterators. - - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - Iterator of elements of the dataset. - """ - if not parse_fn: - raise ValueError("default_input_fn requires a parse_fn") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - dataset = stream_block_format_dataset( - files=files, - parse_fn=parse_fn, - batch_size=batch_size, - repeat=repeat, - num_threads=num_threads, - dataset_fn=dataset_fn, - keep_rate=keep_rate, - parts_downsampling_rate=parts_downsampling_rate, - shards=shards, - shard_index=shard_index, - shuffle=shuffle, - shuffle_files=shuffle_files, - interleave=interleave, - **kwargs - ) - - # Add a tf.data.experimental.StatsAggregator - # https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/data/experimental/StatsAggregator - if log_tf_data_summaries: - aggregator = tf.data.experimental.StatsAggregator() - options = tf.data.Options() - options.experimental_stats.aggregator = aggregator - dataset = dataset.with_options(options) - stats_summary = aggregator.get_summary() - tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary) - - if initializable: - # when the data parsing dpends on some HashTable or Tensor, the iterator is initalizable and - # therefore we need to be run explicitly - iterator = dataset.make_initializable_iterator() - tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer) - else: - iterator = dataset.make_one_shot_iterator() - return iterator.get_next() - - -default_input_fn = data_record_input_fn # pylint: disable=invalid-name diff --git a/twml/twml/layers/__init__.docx b/twml/twml/layers/__init__.docx new file mode 100644 index 000000000..d28b078ca Binary files /dev/null and b/twml/twml/layers/__init__.docx differ diff --git a/twml/twml/layers/__init__.py b/twml/twml/layers/__init__.py deleted file mode 100644 index 917c61867..000000000 --- a/twml/twml/layers/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# pylint: disable=wildcard-import -""" -This module contains the ``tf.layers.Layer`` subclasses implemented in twml. -Layers are used to instantiate common subgraphs. -Typically, these layers are used when defining a ``build_graph_fn`` -for the ``twml.trainers.Trainer``. 
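# Editor's note (illustrative sketch, not part of the original file): wiring the reader
# above into a training input_fn; the part-file path is hypothetical and `my_parse_fn`
# would normally come from something like parsers.get_sparse_parse_fn.
def train_input_fn():
    return data_record_input_fn(
        files=["hdfs:///user/me/training_data/part-00000.lzo"],
        batch_size=256,
        parse_fn=my_parse_fn,
        num_threads=4,
        repeat=True,                   # needed when train_steps exceeds one pass over the data
        parts_downsampling_rate=0.2,   # keep roughly 20% of the part files
    )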
-""" - -from .batch_prediction_tensor_writer import BatchPredictionTensorWriter # noqa: F401 -from .batch_prediction_writer import BatchPredictionWriter # noqa: F401 -from .data_record_tensor_writer import DataRecordTensorWriter # noqa: F401 -from .full_dense import full_dense, FullDense # noqa: F401 -from .full_sparse import full_sparse, FullSparse # noqa: F401 -from .isotonic import Isotonic # noqa: F401 -from .layer import Layer # noqa: F401 -from .mdl import MDL # noqa: F401 -from .partition import Partition # noqa: F401 -from .percentile_discretizer import PercentileDiscretizer # noqa: F401 -from .sequential import Sequential # noqa: F401 -from .sparse_max_norm import MaxNorm, sparse_max_norm, SparseMaxNorm # noqa: F401 -from .stitch import Stitch # noqa: F401 diff --git a/twml/twml/layers/batch_prediction_tensor_writer.docx b/twml/twml/layers/batch_prediction_tensor_writer.docx new file mode 100644 index 000000000..0b1805c47 Binary files /dev/null and b/twml/twml/layers/batch_prediction_tensor_writer.docx differ diff --git a/twml/twml/layers/batch_prediction_tensor_writer.py b/twml/twml/layers/batch_prediction_tensor_writer.py deleted file mode 100644 index 3f6633a8e..000000000 --- a/twml/twml/layers/batch_prediction_tensor_writer.py +++ /dev/null @@ -1,51 +0,0 @@ -# pylint: disable=no-member, invalid-name -""" -Implementing Writer Layer -""" -from .layer import Layer - -import libtwml - - -class BatchPredictionTensorWriter(Layer): - """ - A layer that packages keys and dense tensors into a BatchPredictionResponse. - Typically used at the out of an exported model for use in a the PredictionEngine - (that is, in production) when model predictions are dense tensors. - - Arguments: - keys: - keys to hashmap - Output: - output: - a BatchPredictionResponse serialized using Thrift into a uint8 tensor. - """ - - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(BatchPredictionTensorWriter, self).__init__(**kwargs) - self.keys = keys - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError - - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. - - Arguments: - values: - dense tensors corresponding to keys in hashmap - - Returns: - The output from the layer - """ - write_op = libtwml.ops.batch_prediction_tensor_response_writer(self.keys, values) - return write_op diff --git a/twml/twml/layers/batch_prediction_writer.docx b/twml/twml/layers/batch_prediction_writer.docx new file mode 100644 index 000000000..2409ec853 Binary files /dev/null and b/twml/twml/layers/batch_prediction_writer.docx differ diff --git a/twml/twml/layers/batch_prediction_writer.py b/twml/twml/layers/batch_prediction_writer.py deleted file mode 100644 index 118d21921..000000000 --- a/twml/twml/layers/batch_prediction_writer.py +++ /dev/null @@ -1,51 +0,0 @@ -# pylint: disable=no-member, invalid-name -""" -Implementing Writer Layer -""" -from .layer import Layer - -import libtwml - - -class BatchPredictionWriter(Layer): - """ - A layer that packages keys and values into a BatchPredictionResponse. - Typically used at the out of an exported model for use in a the PredictionEngine - (that is, in production). 
- - Arguments: - keys: - keys to hashmap - Output: - output: - a BatchPredictionResponse serialized using Thrift into a uint8 tensor. - """ - - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(BatchPredictionWriter, self).__init__(**kwargs) - self.keys = keys - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError - - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. - - Arguments: - values: - values corresponding to keys in hashmap - - Returns: - The output from the layer - """ - write_op = libtwml.ops.batch_prediction_response_writer(self.keys, values) - return write_op diff --git a/twml/twml/layers/data_record_tensor_writer.docx b/twml/twml/layers/data_record_tensor_writer.docx new file mode 100644 index 000000000..06685a08a Binary files /dev/null and b/twml/twml/layers/data_record_tensor_writer.docx differ diff --git a/twml/twml/layers/data_record_tensor_writer.py b/twml/twml/layers/data_record_tensor_writer.py deleted file mode 100644 index 0f70186b4..000000000 --- a/twml/twml/layers/data_record_tensor_writer.py +++ /dev/null @@ -1,50 +0,0 @@ -# pylint: disable=no-member, invalid-name -""" -Implementing Writer Layer -""" -from .layer import Layer - -import libtwml - - -class DataRecordTensorWriter(Layer): - """ - A layer that packages keys and dense tensors into a DataRecord. - This layer was initially added to support exporting user embeddings as tensors. - - Arguments: - keys: - keys to hashmap - Output: - output: - a DataRecord serialized using Thrift into a uint8 tensor - """ - - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(DataRecordTensorWriter, self).__init__(**kwargs) - self.keys = keys - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. 
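# Editor's note (illustrative sketch, not part of the original file): the writer layers
# above share the same call pattern; `response_keys` and `predictions` are hypothetical
# graph tensors.
writer = BatchPredictionWriter(keys=response_keys)
serialized = writer(predictions)   # uint8 tensor holding the Thrift-serialized
                                   # BatchPredictionResponse, suitable for export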
- - Arguments: - values: - dense tensors corresponding to keys in hashmap - - Returns: - The output from the layer - """ - write_op = libtwml.ops.data_record_tensor_writer(self.keys, values) - return write_op diff --git a/twml/twml/layers/full_dense.docx b/twml/twml/layers/full_dense.docx new file mode 100644 index 000000000..50c73cef9 Binary files /dev/null and b/twml/twml/layers/full_dense.docx differ diff --git a/twml/twml/layers/full_dense.py b/twml/twml/layers/full_dense.py deleted file mode 100644 index 9c354ad3e..000000000 --- a/twml/twml/layers/full_dense.py +++ /dev/null @@ -1,259 +0,0 @@ -# pylint: disable=no-member,arguments-differ, attribute-defined-outside-init -""" -Implementing Full Dense Layer -""" -from tensorflow.python.layers import core as core_layers -from tensorflow.python.ops import init_ops -from tensorflow.python.framework import tensor_shape -from tensorflow.python.keras.engine.base_layer import InputSpec -import tensorflow.compat.v1 as tf - - -class FullDense(core_layers.Dense): - """ - Densely-connected layer class. - This is wrapping tensorflow.python.layers.core.Dense - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Where ``activation`` is the activation function passed as the ``activation`` - argument (if not ``None``), ``weight`` is a weights matrix created by the layer, - and ``bias`` is a bias vector created by the layer. - - Arguments: - output_size: - Integer or Long, dimensionality of the output space. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - weight_initializer: - Initializer function for the weight matrix. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - - Properties: - output_size: - Python integer, dimensionality of the output space. - activation: - Activation function (callable). - weight_initializer: - Initializer instance (or name) for the weight matrix. - bias_initializer: - Initializer instance (or name) for the bias. - weight: - Weight matrix (TensorFlow variable or tensor). (weight) - bias: - Bias vector, if applicable (TensorFlow variable or tensor). - weight_regularizer: - Regularizer instance for the weight matrix (callable) - bias_regularizer: - Regularizer instance for the bias (callable). 
- activity_regularizer: - Regularizer instance for the output (callable) - weight_constraint: - Constraint function for the weight matrix. - bias_constraint: - Constraint function for the bias. - - """ - - def __init__(self, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=None, - **kwargs): - super(FullDense, self).__init__(units=output_size, - kernel_initializer=weight_initializer, - kernel_regularizer=weight_regularizer, - kernel_constraint=weight_constraint, - **kwargs) - self._num_partitions = num_partitions - - def build(self, input_shape): - ''' - code adapted from TF 1.12 Keras Dense layer: - https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/layers/core.py#L930-L956 - ''' - input_shape = tensor_shape.TensorShape(input_shape) - if input_shape[-1] is None: - raise ValueError('The last dimension of the inputs to `Dense` ' - 'should be defined. Found `None`.') - self.input_spec = InputSpec(min_ndim=2, - axes={-1: input_shape[-1]}) - - partitioner = None - if self._num_partitions: - partitioner = tf.fixed_size_partitioner(self._num_partitions) - - self.kernel = self.add_weight( - 'kernel', - shape=[input_shape[-1], self.units], - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - dtype=self.dtype, - partitioner=partitioner, - trainable=True) - - if self.use_bias: - self.bias = self.add_weight( - 'bias', - shape=[self.units, ], - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - dtype=self.dtype, - trainable=True) - else: - self.bias = None - self.built = True - - @property - def output_size(self): - """ - Returns output_size - """ - return self.units - - @property - def weight(self): - """ - Returns weight - """ - return self.kernel - - @property - def weight_regularizer(self): - """ - Returns weight_regularizer - """ - return self.kernel_regularizer - - @property - def weight_initializer(self): - """ - Returns weight_initializer - """ - return self.kernel_initializer - - @property - def weight_constraint(self): - """ - Returns weight_constraint - """ - return self.kernel_constraint - - -def full_dense(inputs, output_size, - activation=None, - use_bias=True, - weight_initializer=None, - bias_initializer=init_ops.zeros_initializer(), - weight_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - weight_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - num_partitions=None, - reuse=None): - """Functional interface for the densely-connected layer. - This layer implements the operation: - `outputs = activation(inputs.weight + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `weight` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). - - Arguments: - inputs: Tensor input. - units: Integer or Long, dimensionality of the output space. - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - weight_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.get_variable`. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. 
- Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: - Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: - String, the name of the layer. - reuse: - Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor the same shape as `inputs` except the last dimension is of - size `units`. - - Raises: - ValueError: if eager execution is enabled. - """ - layer = FullDense(output_size, - activation=activation, - use_bias=use_bias, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - weight_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - weight_constraint=weight_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - dtype=inputs.dtype.base_dtype, - num_partitions=num_partitions, - _scope=name, - _reuse=reuse) - return layer.apply(inputs) diff --git a/twml/twml/layers/full_sparse.docx b/twml/twml/layers/full_sparse.docx new file mode 100644 index 000000000..49118a85e Binary files /dev/null and b/twml/twml/layers/full_sparse.docx differ diff --git a/twml/twml/layers/full_sparse.py b/twml/twml/layers/full_sparse.py deleted file mode 100644 index 4f0f21930..000000000 --- a/twml/twml/layers/full_sparse.py +++ /dev/null @@ -1,370 +0,0 @@ -# pylint: disable=no-member, arguments-differ, attribute-defined-outside-init, unused-argument -""" -Implementing Full Sparse Layer -""" - -import math - -from twitter.deepbird.sparse import sparse_dense_matmul - -from .layer import Layer - -import tensorflow.compat.v1 as tf -import twml - - -class FullSparse(Layer): - """Fully-sparse layer class. - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Arguments: - output_size: - Long or Integer, dimensionality of the output space. - input_size: - The number of input units. (Deprecated) - weight_initializer: - Initializer function for the weight matrix. - This argument defaults to zeros_initializer(). - This is valid when the FullSparse is the first layer of - parameters but should be changed otherwise. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect - activation: - Activation function (callable). Set it to None to maintain a linear activation. - bias_initializer: - Initializer function for the bias. 
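# Editor's note (illustrative sketch, not part of the original file): the functional wrapper
# above inside a build_graph_fn; `features` stands in for a dense [batch_size, n] tensor.
hidden = full_dense(features, output_size=128, activation=tf.nn.relu, name="hidden1")
logits = full_dense(hidden, output_size=1, name="logits")
# If weight_regularizer/bias_regularizer are set, add tf.losses.get_regularization_loss()
# to the training loss so the penalties take effect.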
- This argument defaults to tf.constant_initializer(1/output_size) - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column) - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. - This parameter can also be a list of binary values if `inputs` passed to `call` a list. - use_compression: - Default False. Set True to enable data compression techniques for - optimization of network traffic for distributed training. - use_binary_sparse_dense_matmul: - If binary sparse dense matmul op is to be used. It will only be enabled if - `use_binary_values` is set true. It only should be used for inference, best practice is - to set `use_binary_sparse_dense_matmul = not is_training`. - """ - - def __init__(self, - output_size, - input_size=None, - weight_initializer=None, - activation=None, - bias_initializer=None, - trainable=True, - name=None, - use_sparse_grads=True, - num_partitions=None, - partition_axis=0, - use_binary_values=False, - bias_regularizer=None, - weight_regularizer=None, - use_compression=False, - use_binary_sparse_dense_matmul=False, - **kwargs): - super(FullSparse, self).__init__(trainable=trainable, name=name, **kwargs) - # TODO - remove input_size warning. - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - - # The bias initialization and weights initialization is set to match v1's implementation. - if bias_initializer is None: - bias_initializer = tf.constant_initializer(1 / output_size) - # Weights initialization is set to 0s. This is safe for full sparse layers because - # you are supposed to learn your embedding from the label. 
- if weight_initializer is None: - weight_initializer = tf.zeros_initializer() - self.weight_initializer = weight_initializer - self.bias_initializer = bias_initializer - self.output_size = output_size - self.activation = activation - self.use_sparse_grads = use_sparse_grads - self.num_partitions = num_partitions - if partition_axis != 0 and partition_axis != 1: - raise ValueError('partition_axis must be 0 or 1') - self.partition_axis = partition_axis - self.use_binary_values = use_binary_values - self.weight_regularizer = weight_regularizer - self.bias_regularizer = bias_regularizer - self._use_compression = use_compression - self._cast_indices_dtype = tf.int32 if self._use_compression else None - self.use_binary_sparse_dense_matmul = use_binary_sparse_dense_matmul - - def _make_weight_var(self, shape, partitioner): - self.weight = self.add_variable( - 'weight', - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=True, - partitioner=partitioner, - ) - - def build(self, input_shapes): - """ - creates the ``bias`` and ``weight`` Variables - of shape ``[output_size]`` and ``[input_size, output_size]`` respectively. - """ - - if isinstance(input_shapes, (list, tuple)): - input_shape = input_shapes[0] - is_compatible = True - for other_shape in input_shapes[1:]: - is_compatible &= input_shape.is_compatible_with(other_shape) - if not is_compatible: - raise ValueError("Input shapes %s are not compatible." % input_shapes) - else: - input_shape = input_shapes - - self.bias = self.add_variable( - 'bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - shape=[self.output_size, ], - dtype=self.dtype, - trainable=True - ) - - partitioner = None - shape = [input_shape[1], self.output_size] - - # There is a 2gb limitation for each tensor because of protobuf. - # 2**30 is 1GB. 2 * (2**30) is 2GB. - dtype = tf.as_dtype(self.dtype) - num_partitions = 1 if self.num_partitions is None else self.num_partitions - in_shape = input_shape[1] - out_shape = self.output_size - - # when v2 behavior is disabled, in_shape is tf.Dimension. otherwise it is int. - if isinstance(in_shape, tf.Dimension): - in_shape = in_shape.value - - if in_shape is None: - raise ValueError("Input tensor should have shape." - " You can set it using twml.util.limit_sparse_tensor_size") - - (split_dim, other_dim) = (in_shape, out_shape) if self.partition_axis == 0 else (out_shape, in_shape) - requested_size = math.ceil(float(split_dim) / num_partitions) * other_dim * dtype.size - if (requested_size >= 2**31): - raise ValueError("Weight tensor partitions cannot be larger than 2GB.\n" - "Requested Dimensions(%d, %d) of type %s (%d bytes total) over %d partitions.\n" - "Possible solutions:\n" - "- reduce the params.output_size_bits\n" - "- reduce the output_size of the sparse_layer\n" - "- specify a larger num_partitions argument\n" - "- reduce input_size_bits" % - (in_shape, self.output_size, dtype.name, requested_size, num_partitions)) - - if self.num_partitions: - partition_axis = int(self.partition_axis) - partitioner = tf.fixed_size_partitioner(self.num_partitions, axis=partition_axis) - else: - # Regular variables do not like it when you pass both constant tensors and shape - if not callable(self.weight_initializer): - shape = None - - self._make_weight_var(shape, partitioner) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. 
- - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - A SparseTensor or a list of SparseTensors. - If `inputs` is a list, all tensors must have same `dense_shape`. - - Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + add_n([sp_a * dense_b for sp_a in inputs])`. - - """ - if isinstance(inputs, (list, tuple)): - - if isinstance(self.use_binary_values, (list, tuple)): - use_binary_values = self.use_binary_values - else: - use_binary_values = [self.use_binary_values] * len(inputs) - - num_inputs = len(inputs) - if num_inputs != len(use_binary_values): - raise ValueError("#inputs is %d while #use_binary_values is %d" - % (num_inputs, len(use_binary_values))) - - outputs = [] - for n in range(num_inputs): - outputs.append(sparse_dense_matmul(inputs[n], self.weight, - self.use_sparse_grads, - use_binary_values[n], - name='sparse_mm_' + str(n), - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul)) - outputs = tf.accumulate_n(outputs) - else: - - if isinstance(self.use_binary_values, (list, tuple)): - raise ValueError("use_binary_values can not be %s when inputs is %s" % - (type(self.use_binary_values), type(inputs))) - - outputs = sparse_dense_matmul(inputs, self.weight, - self.use_sparse_grads, - self.use_binary_values, - name='sparse_mm', - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul) - - if self.bias is not None: - outputs = tf.nn.bias_add(outputs, self.bias) - - if self.activation is not None: - return self.activation(outputs) # pylint: disable=not-callable - return outputs - - -def full_sparse( - inputs, output_size, - input_size=None, - activation=None, - bias_regularizer=None, - weight_regularizer=None, - bias_initializer=None, - weight_initializer=None, - trainable=True, - name=None, - reuse=None, - use_sparse_grads=True, - num_partitions=None, - partition_axis=0, - use_binary_values=False, - use_compression=False): - """Functional interface for the sparsely-connected layer. - - Arguments: - inputs: - A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) - output_size: - Long or Integer, dimensionality of the output space. - weight_initializer: - Initializer function for the weight matrix. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. 
Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column) - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. - use_compression: - Default False. Set True to enable data compression techniques for - optimization of network traffic for distributed training. - Returns: - Outputs a ``tf.Tensor`` of size ``[batch_size x output_size]``. - """ - # TODO - remove input_size warning. - if input_size: - raise ValueError('input_size is deprecated - it is now \ - automatically inferred from your input.') - - dtype = None - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - dtype = inputs.dtype.base_dtype - - if isinstance(inputs, (list, tuple)): - inputs = [inp.to_tf() if isinstance(inp, twml.SparseTensor) else inp for inp in inputs] - dtype = inputs[0].dtype.base_dtype - - layer = FullSparse(output_size=output_size, - activation=activation, - trainable=trainable, - name=name, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - weight_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - dtype=dtype, - _scope=name, - _reuse=reuse, - use_sparse_grads=use_sparse_grads, - num_partitions=num_partitions, - partition_axis=partition_axis, - use_compression=use_compression, - use_binary_values=use_binary_values) - return layer(inputs) diff --git a/twml/twml/layers/isotonic.docx b/twml/twml/layers/isotonic.docx new file mode 100644 index 000000000..789f92573 Binary files /dev/null and b/twml/twml/layers/isotonic.docx differ diff --git a/twml/twml/layers/isotonic.py b/twml/twml/layers/isotonic.py deleted file mode 100644 index 7113f7af4..000000000 --- a/twml/twml/layers/isotonic.py +++ /dev/null @@ -1,76 +0,0 @@ -# pylint: disable=no-member, invalid-name, attribute-defined-outside-init -""" -Contains the Isotonic Layer -""" - -from .layer import Layer - -import libtwml -import numpy as np - - -class Isotonic(Layer): - """ - This layer is created by the IsotonicCalibrator. - Typically it is used intead of sigmoid activation on the output unit. - - Arguments: - n_unit: - number of input units to the layer (same as number of output units). - n_bin: - number of bins used for isotonic calibration. - More bins means a more precise isotonic function. - Less bins means a more regularized isotonic function. - xs_input: - A tensor containing the boundaries of the bins. - ys_input: - A tensor containing calibrated values for the corresponding bins. - - Output: - output: - A layer containing calibrated probabilities with same shape and size as input. - Expected Sizes: - xs_input, ys_input: - [n_unit, n_bin]. - Expected Types: - xs_input, ys_input: - same as input. 
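# Editor's note (illustrative sketch, not part of the original file): the sparse counterpart,
# assuming `sparse_features` is a twml.SparseTensor (or tf.SparseTensor) whose input size
# has been bounded, e.g. via twml.util.limit_sparse_tensor_size.
sparse_logits = full_sparse(
    sparse_features,
    output_size=1,
    use_sparse_grads=True,    # keep the gradient to the (large) weight matrix sparse
    use_binary_values=True,   # valid when all non-zero input values are 1, e.g. after MDL
)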
- """ - - def __init__(self, n_unit, n_bin, xs_input=None, ys_input=None, **kwargs): - super(Isotonic, self).__init__(**kwargs) - - self._n_unit = n_unit - self._n_bin = n_bin - - self.xs_input = np.empty([n_unit, n_bin], dtype=np.float32) if xs_input is None else xs_input - self.ys_input = np.empty([n_unit, n_bin], dtype=np.float32) if ys_input is None else ys_input - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the variables of the layer.""" - - self.built = True - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: input tensor(s). - - Returns: - The output from the layer - """ - calibrate_op = libtwml.ops.isotonic_calibration(inputs, self.xs_input, self.ys_input) - return calibrate_op diff --git a/twml/twml/layers/layer.docx b/twml/twml/layers/layer.docx new file mode 100644 index 000000000..9a1789f9a Binary files /dev/null and b/twml/twml/layers/layer.docx differ diff --git a/twml/twml/layers/layer.py b/twml/twml/layers/layer.py deleted file mode 100644 index c1b00eb13..000000000 --- a/twml/twml/layers/layer.py +++ /dev/null @@ -1,50 +0,0 @@ -# pylint: disable=no-member -""" -Implementing a base layer for twml -""" -import tensorflow.compat.v1 as tf -from tensorflow.python.layers import base - - -class Layer(base.Layer): - """ - Base Layer implementation for twml. - Overloads `twml.layers.Layer - `_ - from tensorflow and adds a couple of custom methods. - """ - - @property - def init(self): - """ - Return initializer ops. By default returns tf.no_op(). - This method is overwritten by classes like twml.layers.MDL, which - uses a HashTable internally, that must be initialized with its own op. - """ - return tf.no_op() - - def call(self, inputs, **kwargs): - """The logic of the layer lives here. - - Arguments: - inputs: - input tensor(s). - **kwargs: - additional keyword arguments. - - Returns: - Output tensor(s). - """ - raise NotImplementedError - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError diff --git a/twml/twml/layers/mdl.docx b/twml/twml/layers/mdl.docx new file mode 100644 index 000000000..a87cfe529 Binary files /dev/null and b/twml/twml/layers/mdl.docx differ diff --git a/twml/twml/layers/mdl.py b/twml/twml/layers/mdl.py deleted file mode 100644 index cf4018afa..000000000 --- a/twml/twml/layers/mdl.py +++ /dev/null @@ -1,256 +0,0 @@ -# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes -""" -Implementing MDL Layer -""" - - -from .layer import Layer -from .partition import Partition -from .stitch import Stitch - -import libtwml -import numpy as np -import tensorflow.compat.v1 as tf -import twml - - -class MDL(Layer): # noqa: T000 - """ - MDL layer is constructed by MDLCalibrator after accumulating data - and performing minimum description length (MDL) calibration. 
- - MDL takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an MDL bin. - Each MDL input feature is converted to n_bin bins. - Each MDL calibration tries to find bin delimiters such that the number of features values - per bin is roughly equal (for each given MDL feature). - Note that if an input feature is rarely used, so will its associated output bin/features. - """ - - def __init__( - self, - n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, **kwargs): - """ - Creates a non-initialized `MDL` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during MDL calibration. - This is the number of features in the hash map. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of MDL bins used for MDL calibration. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the features ID that MDL discretizes and knows about. - The hash map (hash_keys->hash_values) is used for two reasons: - 1. divide inputs into two feature spaces: MDL vs non-MDL - 2. transate the MDL features into a hash_feature ID that MDL understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for MDL. - bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the MDL features will be translated to. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, it's value bin's are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and a inputs value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). - """ - super(MDL, self).__init__(**kwargs) - tf.logging.warning("MDL will be deprecated. 
Please use PercentileDiscretizer instead") - - max_mdl_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - self._hash_keys_initializer = tf.constant_initializer( - hash_keys if hash_keys is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - self._hash_values_initializer = tf.constant_initializer( - hash_values if hash_values is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - self._bin_ids_initializer = tf.constant_initializer( - bin_ids if bin_ids is not None - else np.empty(max_mdl_feature, dtype=np.int64), - dtype=np.int64 - ) - self._bin_values_initializer = tf.constant_initializer( - bin_values if bin_values is not None - else np.empty(max_mdl_feature, dtype=np.float32), - dtype=np.float32 - ) - self._feature_offsets_initializer = tf.constant_initializer( - feature_offsets if feature_offsets is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - - # note that calling build here is an exception as typically __call__ would call build(). - # We call it here because we need to initialize hash_map. - # Also note that the variable_scope is set by add_variable in build() - if not self.built: - self.build(input_shape=None) - - self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer: - hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. - """ - - # build layers - self.partition = Partition() - self.stitch = Stitch() - - # build variables - - hash_keys = self.add_variable( - 'hash_keys', - initializer=self._hash_keys_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - hash_values = self.add_variable( - 'hash_values', - initializer=self._hash_values_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - # hashmap converts known features into range [0, n_feature) - initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) - self.hash_map = tf.lookup.StaticHashTable(initializer, -1) - - self.bin_ids = self.add_variable( - 'bin_ids', - initializer=self._bin_ids_initializer, - shape=[self._n_feature * (self._n_bin + 1)], - dtype=tf.int64, - trainable=False) - - self.bin_values = self.add_variable( - 'bin_values', - initializer=self._bin_values_initializer, - shape=[self._n_feature * (self._n_bin + 1)], - dtype=tf.float32, - trainable=False) - - self.feature_offsets = self.add_variable( - 'feature_offsets', - initializer=self._feature_offsets_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - # make sure this is last - self.built = True - - def call(self, inputs, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements MDL inference where inputs are intersected with a hash_map. - Part of the inputs are discretized using twml.mdl to produce a mdl_output SparseTensor. - This SparseTensor is then joined with the original inputs SparseTensor, - but only for the inputs keys that did not get discretized. - - Args: - inputs: A 2D SparseTensor that is input to MDL for discretization. - It has a dense_shape of [batch_size, input_size] - name: A name for the operation (optional). - Returns: - A `SparseTensor` of the same type as `inputs`. - Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. 
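For intuition, here is a hedged NumPy sketch of the equal-frequency binning that MDL (and PercentileDiscretizer) calibration aims for. It is illustrative only; it is neither the MDLCalibrator nor the libtwml op.

.. code-block:: python

    # Illustrative only: equal-frequency bin boundaries for one feature,
    # similar in spirit to what MDL/PercentileDiscretizer calibration produces.
    import numpy as np

    vals = np.random.gamma(shape=2.0, size=10000)   # observed feature values
    n_bin = 4
    # interior boundaries at the 25th/50th/75th percentiles -> ~equal mass per bin
    boundaries = np.percentile(vals, np.linspace(0, 100, n_bin + 1)[1:-1])
    bin_ids = np.searchsorted(boundaries, vals)      # values mapped into [0, n_bin)
    print(np.bincount(bin_ids))                      # roughly [2500, 2500, 2500, 2500]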
- """ - if isinstance(inputs, tf.SparseTensor): - inputs = twml.SparseTensor.from_tf(inputs) - - assert(isinstance(inputs, twml.SparseTensor)) - - # sparse column indices - ids = inputs.ids - # sparse row indices - keys = inputs.indices - # sparse values - vals = inputs.values - - # get intersect(keys, hash_map) - hashed_keys = self.hash_map.lookup(keys) - - found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) - partition_ids = tf.cast(found, tf.int32) - - vals, key, indices = self.partition(partition_ids, vals, tf.where(found, hashed_keys, keys)) - non_mdl_keys, mdl_in_keys = key - non_mdl_vals, mdl_in_vals = vals - - self.non_mdl_keys = non_mdl_keys - - # run MDL on the keys/values it knows about - mdl_keys, mdl_vals = libtwml.ops.mdl(mdl_in_keys, mdl_in_vals, self.bin_ids, self.bin_values, - self.feature_offsets) - - # handle output ID conflicts - mdl_size = tf.size(self.bin_ids, out_type=tf.int64) - non_mdl_size = tf.subtract(self.output_size, mdl_size) - non_mdl_keys = tf.add(tf.floormod(non_mdl_keys, non_mdl_size), mdl_size) - - # Stitch the keys and values from mdl and non mdl indices back, with help - # of the Stitch Layer - - # out for inference checking - self.mdl_out_keys = mdl_keys - - concat_data = self.stitch([non_mdl_vals, mdl_vals], - [non_mdl_keys, mdl_keys], - indices) - - concat_vals, concat_keys = concat_data - - # Generate output shape using _compute_output_shape - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self.output_size] - return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError diff --git a/twml/twml/layers/partition.docx b/twml/twml/layers/partition.docx new file mode 100644 index 000000000..917d7d417 Binary files /dev/null and b/twml/twml/layers/partition.docx differ diff --git a/twml/twml/layers/partition.py b/twml/twml/layers/partition.py deleted file mode 100644 index 0e7c85f18..000000000 --- a/twml/twml/layers/partition.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Implementing partition Layer -""" - - -from .layer import Layer - -import tensorflow.compat.v1 as tf - - -class Partition(Layer): - """ - This layer implements: - - .. code-block:: python - - tf.dynamic_partition(input_vals, partition_ids, self.partitions) - - Input: - partitions: - the number of partitions which we will divide the hashmap keys/bvalues - - Output: - A layer that performs partitioning - """ - - def __init__(self, partitions=2, **kwargs): - self.partitions = partitions - super(Partition, self).__init__(**kwargs) - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, partition_ids, input_vals, input_keys, **kwargs): - """This layer is responsible for partitioning the values/keys of a hashmap - - Arguments: - partition_ids: - Tensor that is equivalent to boolean (int32). - input_vals: - Tensor that represents the values of the hashmap(float). 
- input_keys: - Tensor that represents the keys of the hashmap(float) - - Returns: - The output of the partition layer, which is a list of lists which looks - something like: - - .. code-block:: python - - [[vals_0, vals_1], [keys_0, keys_1], [indices_0, indices_1]] - - where: - vals_x: - values of the hashmap for partition x - keys_x: - keys of the hashmap for partition x - indices_x: - indices of the hashmap for partition x - """ - partioned_val = tf.dynamic_partition(input_vals, partition_ids, self.partitions) - partioned_keys = tf.dynamic_partition(input_keys, partition_ids, self.partitions) - partioned_indices = tf.dynamic_partition(tf.range(tf.shape(partition_ids)[0]), - tf.cast(partition_ids, tf.int32), self.partitions) - return [partioned_val, partioned_keys, partioned_indices] diff --git a/twml/twml/layers/percentile_discretizer.docx b/twml/twml/layers/percentile_discretizer.docx new file mode 100644 index 000000000..aefbffc50 Binary files /dev/null and b/twml/twml/layers/percentile_discretizer.docx differ diff --git a/twml/twml/layers/percentile_discretizer.py b/twml/twml/layers/percentile_discretizer.py deleted file mode 100644 index 55bb4de8c..000000000 --- a/twml/twml/layers/percentile_discretizer.py +++ /dev/null @@ -1,209 +0,0 @@ -# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes -""" -Implementing PercentileDiscretizer Layer -""" - - -import libtwml -import numpy as np -import tensorflow.compat.v1 as tf -import twml -from twml.layers import Layer - - -class PercentileDiscretizer(Layer): - """ - PercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator after - accumulating data and performing percentile bucket calibration. - - PercentileDiscretizer takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an PercentileDiscretizer bin. - Each PercentileDiscretizer input feature is converted to n_bin bins. - Each PercentileDiscretizer calibration tries to find bin delimiters such - that the number of features values per bin is roughly equal (for - each given PercentileDiscretizer feature). In other words, bins are calibrated to be approx. - equiprobable, according to the given calibration data. - Note that if an input feature is rarely used, so will its associated output bin/features. - """ - - def __init__( - self, - n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, num_parts=1, cost_per_unit=100, **kwargs): - """ - Creates a non-initialized `PercentileDiscretizer` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - If there are no calibrated features, then the discretizer will only apply - twml.util.limit_bits to the the feature keys (aka "feature_ids"). Essentially, - the discretizer will be a "no-operation", other than obeying `out_bits` - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during PercentileDiscretizer calibration. - This is the number of features in the hash map. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of PercentileDiscretizer bins used for PercentileDiscretizer calibration. 
- Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the features ID that PercentileDiscretizer discretizes and knows about. - The hash map (hash_keys->hash_values) is used for two reasons: - 1. divide inputs into two feature spaces: - PercentileDiscretizer vs non-PercentileDiscretizer - 2. transate the PercentileDiscretizer features into a hash_feature ID that - PercentileDiscretizer understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for PercentileDiscretizer. - bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the PercentileDiscretizer features will be translated to. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, it's value bin's are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and a inputs value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). - """ - - super(PercentileDiscretizer, self).__init__(**kwargs) - - if not self.built: - self.build(input_shape=None) - - max_discretizer_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - # build variables - self._out_bits = out_bits - self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - self._hash_keys = (hash_keys if hash_keys is not None else - np.empty(n_feature, dtype=np.int64)) - self._hash_values = (hash_values if hash_values is not None else - np.empty(n_feature, dtype=np.int64)) - self._bin_ids = (bin_ids if bin_ids is not None else - np.empty(max_discretizer_feature, dtype=np.int64)) - self._bin_values = (bin_values if bin_values is not None else - np.empty(max_discretizer_feature, dtype=np.float32)) - self._feature_offsets = (feature_offsets if feature_offsets is not None else - np.empty(n_feature, dtype=np.int64)) - self.num_parts = num_parts - self.cost_per_unit = cost_per_unit - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer - """ - self.built = True - - def call(self, inputs, keep_inputs=False, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements PercentileDiscretizer inference where inputs are intersected with a hash_map. - Input features that were not calibrated have their feature IDs truncated, so as - to be less than 1< 0: - discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2( - input_ids=keys, # inc key assigned to feature_id, or -1 - input_vals=vals, # the observed feature values - bin_ids=self._bin_ids, # n_feat X (n_bin+1) 2D arange - bin_vals=self._bin_values, # bin boundaries - feature_offsets=self._feature_offsets, # 0 : nbin_1 : max_feat - output_bits=self._out_bits, - feature_ids=tf.make_tensor_proto(self._hash_keys), # feature ids to build internal hash map - feature_indices=tf.make_tensor_proto(self._hash_values), # keys associated w/ feat. 
indices - start_compute=tf.constant(0, shape=[], dtype=tf.int64), - end_compute=tf.constant(-1, shape=[], dtype=tf.int64), - cost_per_unit=self.cost_per_unit - ) - else: - discretizer_keys = twml.util.limit_bits(keys, self._out_bits) - discretizer_vals = vals - # don't 2x the input. - keep_inputs = False - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self._output_size] - - output = twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf() - - if keep_inputs: - # Note the non-discretized features will end up doubled, - # since these are already in `output` - # handle output ID conflicts - mdl_size = self._n_feature * (self._n_bin + 1) - non_mdl_size = tf.subtract(self._output_size, mdl_size) - input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size) - - new_input = twml.SparseTensor( - ids=ids, indices=input_keys, values=vals, dense_shape=output_shape).to_tf() - - # concatenate discretizer output with original input - sparse_add = tf.sparse_add(new_input, output) - output = tf.SparseTensor(sparse_add.indices, sparse_add.values, output_shape) - - return output - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError diff --git a/twml/twml/layers/sequential.docx b/twml/twml/layers/sequential.docx new file mode 100644 index 000000000..8f3976668 Binary files /dev/null and b/twml/twml/layers/sequential.docx differ diff --git a/twml/twml/layers/sequential.py b/twml/twml/layers/sequential.py deleted file mode 100644 index c0d4b92cc..000000000 --- a/twml/twml/layers/sequential.py +++ /dev/null @@ -1,160 +0,0 @@ -""" -Implementing Sequential Layer container -""" - - -from .layer import Layer - -from tensorflow import keras -from tensorflow.python.layers import base - - -class Sequential(Layer): - """ - A sequential stack of layers. - - Arguments: - layers: list of layers to add to the model. - - Output: - the output of the sequential layers - """ - - def __init__(self, layers=None, **kwargs): - self._layers = [] # Stack of layers. - self._layer_names = [] # Stack of layers names - self._layer_outputs = [] - # Add to the model any layers passed to the constructor. - if layers: - for layer in layers: - self.add(layer) - super(Sequential, self).__init__(**kwargs) - - def add(self, layer): - """Adds a layer instance on top of the layer stack. - - Arguments: - layer: - layer instance. - - Raises: - TypeError: - if the layer argument is not instance of base.Layer - """ - if not isinstance(layer, base.Layer) and not isinstance(layer, keras.layers.Layer): - raise TypeError('The added layer must be an instance of class Layer') - - if layer.name in self._layer_names: - raise ValueError('Layer with name %s already exists in sequential layer' % layer.name) - - self._layers.append(layer) - self._layer_names.append(layer.name) - - def pop(self): - """Removes the last layer in the model. - - Raises: - TypeError: - if there are no layers in the model. - """ - if not self._layers or not self._layer_names: - raise TypeError('There are no layers in the model.') - self._layers.pop() - self._layer_names.pop() - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - input tensor(s). 
- - Returns: - The output of the sequential layers - """ - self._layer_outputs = [] - for layer in self._layers: - # don't use layer.call because you want to build individual layers - inputs = layer(inputs) # overwrites the current input after it has been processed - self._layer_outputs.append(inputs) - return inputs - - @property - def layers(self): - """ Return the layers in the sequential layer """ - return self._layers - - @property - def layer_names(self): - """ Return the layer names in the sequential layer """ - return self._layer_names - - @property - def layer_outputs(self): - """ Return the layer outputs in the sequential layer """ - return self._layer_outputs - - def get(self, key): - """Retrieves the n-th layer. - - Arguments: - key: - index of the layer - - Output: - The n-th layer where n is equal to the key. - """ - return self._layers[key] - - def get_output(self, key): - """Retrieves the n-th layer output. - - Arguments: - key: - index of the layer - - Output: - The intermediary output equivalent to the nth layer, where n is equal to the key. - """ - return self._layer_outputs[key] - - def get_layer_by_name(self, name): - """Retrieves the layer corresponding to the name. - - Arguments: - name: - name of the layer - - Output: - list of layers that have the name desired - """ - return self._layers[self._layer_names.index(name)] - - def get_layer_output_by_name(self, name): - """Retrieves the layer output corresponding to the name. - - Arguments: - name: - name of the layer - - Output: - list of the output of the layers that have the desired name - """ - return self._layer_outputs[self._layer_names.index(name)] - - @property - def init(self): - """ returns a list of initialization ops (one per layer) """ - return [layer.init for layer in self._layers] - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError diff --git a/twml/twml/layers/sparse_max_norm.docx b/twml/twml/layers/sparse_max_norm.docx new file mode 100644 index 000000000..3bfbffb9b Binary files /dev/null and b/twml/twml/layers/sparse_max_norm.docx differ diff --git a/twml/twml/layers/sparse_max_norm.py b/twml/twml/layers/sparse_max_norm.py deleted file mode 100644 index e1f423fe0..000000000 --- a/twml/twml/layers/sparse_max_norm.py +++ /dev/null @@ -1,221 +0,0 @@ -# pylint: disable=no-member, attribute-defined-outside-init, duplicate-code -""" -Contains the twml.layers.SparseMaxNorm layer. -""" -from .layer import Layer - -from libtwml import OPLIB -import tensorflow.compat.v1 as tf -import twml - - -class SparseMaxNorm(Layer): - """ - Computes a max-normalization and adds bias to the sparse_input, - forwards that through a sparse affine transform followed - by an non-linear activation on the resulting dense representation. - - This layer has two parameters, one of which learns through gradient descent: - bias_x (optional): - vector of shape [input_size]. Learned through gradient descent. - max_x: - vector of shape [input_size]. Holds the maximas of input ``x`` for normalization. - Either calibrated through SparseMaxNorm calibrator, or calibrated online, or both. - - The pseudo-code for this layer looks like: - - .. 
code-block:: python - - abs_x = abs(x) - normed_x = clip_by_value(x / max_x, -1, 1) - biased_x = normed_x + bias_x - return biased - - - Args: - max_x_initializer: - initializer vector of shape [input_size] used by variable `max_x` - bias_x_initializer: - initializer vector of shape [input_size] used by parameter `bias_x` - is_training: - Are we training the layer to learn the normalization maximas. - If set to True, max_x will be able to learn. This is independent of bias_x - epsilon: - The minimum value used for max_x. Defaults to 1E-5. - use_bias: - Default True. Set to False to not use a bias term. - - Returns: - A layer representing the output of the sparse_max_norm transformation. - """ - - def __init__( - self, - input_size=None, - max_x_initializer=None, - bias_x_initializer=None, - is_training=True, - epsilon=1E-5, - use_bias=True, - **kwargs): - - super(SparseMaxNorm, self).__init__(**kwargs) - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - if max_x_initializer is None: - max_x_initializer = tf.zeros_initializer() - self.max_x_initializer = max_x_initializer - - self._use_bias = use_bias - if use_bias: - if bias_x_initializer is None: - bias_x_initializer = tf.zeros_initializer() - self.bias_x_initializer = bias_x_initializer - - self.epsilon = epsilon - self.is_training = is_training - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the max_x and bias_x tf.Variables of the layer.""" - - self.max_x = self.add_variable( - 'max_x', - initializer=self.max_x_initializer, - shape=[input_shape[1]], - dtype=tf.float32, - trainable=False) - - if self._use_bias: - self.bias_x = self.add_variable( - 'bias_x', - initializer=self.bias_x_initializer, - shape=[input_shape[1]], - dtype=tf.float32, - trainable=True) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def _call(self, inputs, **kwargs): # pylint: disable=unused-argument - """ - The forward propagation logic of the layer lives here. - - Arguments: - sparse_input: - A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` - Returns: - A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can - be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. 
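A hedged NumPy sketch of the max-norm transform spelled out in the pseudo-code above; the actual layer keeps ``max_x`` in a ``tf.Variable`` and applies this to the values of a ``tf.SparseTensor``, gathered per column index.

.. code-block:: python

    # Hedged NumPy sketch of the per-feature max-norm described above.
    import numpy as np

    x      = np.array([0.5, -3.0, 10.0])   # sparse values for some feature columns
    max_x  = np.array([1.0,  2.0,  4.0])   # running per-feature maxima
    bias_x = np.array([0.1,  0.0, -0.1])   # learned per-feature bias

    normed = np.clip(x / max_x, -1.0, 1.0)
    out = normed + bias_x
    # array([ 0.6, -1. ,  0.9])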
- """ - - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - elif not isinstance(inputs, tf.SparseTensor): - raise TypeError("The inputs must be of type tf.SparseTensor or twml.SparseTensor") - - indices_x = inputs.indices[:, 1] - values_x = inputs.values - - if self.is_training is False: - normalized_x = OPLIB.sparse_max_norm_inference(self.max_x, - indices_x, - values_x, - self.epsilon) - - update_op = tf.no_op() - else: - max_x, normalized_x = OPLIB.sparse_max_norm_training(self.max_x, - indices_x, - values_x, - self.epsilon) - - update_op = tf.assign(self.max_x, max_x) - - with tf.control_dependencies([update_op]): - normalized_x = tf.stop_gradient(normalized_x) - - # add input bias - if self._use_bias: - normalized_x = normalized_x + tf.gather(self.bias_x, indices_x) - - # convert back to sparse tensor - return tf.SparseTensor(inputs.indices, normalized_x, inputs.dense_shape) - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """ - The forward propagation logic of the layer lives here. - - Arguments: - sparse_input: - A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` - Returns: - A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can - be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. - """ - with tf.device(self.max_x.device): - return self._call(inputs, **kwargs) - -# For backwards compatiblity and also because I don't want to change all the tests. -MaxNorm = SparseMaxNorm - - -def sparse_max_norm(inputs, - input_size=None, - max_x_initializer=None, - bias_x_initializer=None, - is_training=True, - epsilon=1E-5, - use_bias=True, - name=None, - reuse=None): - """ - Functional inteface to SparseMaxNorm. - - Args: - inputs: - A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) - input_size: - number of input units - max_x_initializer: - initializer vector of shape [input_size] used by variable `max_x` - bias_x_initializer: - initializer vector of shape [input_size] used by parameter `bias_x` - is_training: - Are we training the layer to learn the normalization maximas. - If set to True, max_x will be able to learn. This is independent of bias_x - epsilon: - The minimum value used for max_x. Defaults to 1E-5. - use_bias: - Default True. Set to False to not use a bias term. - - Returns: - Output after normalizing with the max value. - """ - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - - layer = SparseMaxNorm(max_x_initializer=max_x_initializer, - bias_x_initializer=bias_x_initializer, - is_training=is_training, - epsilon=epsilon, - use_bias=use_bias, - name=name, - _scope=name, - _reuse=reuse) - return layer(inputs) diff --git a/twml/twml/layers/stitch.docx b/twml/twml/layers/stitch.docx new file mode 100644 index 000000000..a0c956fe7 Binary files /dev/null and b/twml/twml/layers/stitch.docx differ diff --git a/twml/twml/layers/stitch.py b/twml/twml/layers/stitch.py deleted file mode 100644 index 51dffdb8e..000000000 --- a/twml/twml/layers/stitch.py +++ /dev/null @@ -1,54 +0,0 @@ -# pylint: disable=useless-super-delegation -""" -Implementing Stitch Layer -""" - - -from .layer import Layer - -import tensorflow.compat.v1 as tf - - -class Stitch(Layer): - """ - This layer is responsible for stitching a partioned layer together. 
- - Output: - A layer that performs stitching - """ - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, partioned_val, partioned_keys, - partioned_indices, **kwargs): # pylint: disable=unused-argument, arguments-differ - """ - This layer is responsible for stitching a partioned layer together. - - Input: - partioned_val: - a list of partioned Tensors which represent the vals of the hashmap - partioned_keys: - a list of partioned Tensors which represent the keys of the hashmap - partioned_indices: - a list of partioned Tensors which represent the indices of the hashmap - Output: - List which contains: [output_vals, output_keys] - output_vals: - Values of the HashMap (float) - output_keys: - Keys of HashMap (float) - """ - indices = [tf.to_int32(index) for index in partioned_indices] - concat_keys = tf.dynamic_stitch(indices, partioned_keys) - concat_vals = tf.dynamic_stitch(indices, partioned_val) - return [concat_vals, concat_keys] diff --git a/twml/twml/learning_rate_decay.docx b/twml/twml/learning_rate_decay.docx new file mode 100644 index 000000000..164154a7c Binary files /dev/null and b/twml/twml/learning_rate_decay.docx differ diff --git a/twml/twml/learning_rate_decay.py b/twml/twml/learning_rate_decay.py deleted file mode 100644 index be522d75b..000000000 --- a/twml/twml/learning_rate_decay.py +++ /dev/null @@ -1,168 +0,0 @@ -# pylint: disable=too-many-branches -""" This module includes functions for managing learning rate decay """ -import tensorflow.compat.v1 as tf - - -def get_learning_rate_decay_fn(params): - """ - Returns a learning rate decay function that takes the initial - learning_rate and global_step - as arguments and returns the current learning rate. - - Currently supports params.learning_rate_decay values of: - exponential | polynomial | piecewise_constant | cosine | cosine restarts. - See `Decaying the Leanring Rate - `_ for details. - - Arguments: - params: - a tensorflow.contrib.train.HParams object containing the relevant hyperparameters. 
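A hedged usage sketch, assuming a TF 1.x runtime where ``tf.contrib.training.HParams`` is available, and using the exponential-decay settings this function validates below.

.. code-block:: python

    # Hedged usage sketch; field names match the checks in the function below.
    import tensorflow.compat.v1 as tf
    from twml.learning_rate_decay import get_learning_rate_decay_fn

    params = tf.contrib.training.HParams(
        learning_rate_decay='exponential_learning_rate_decay',
        decay_steps=1000,
        exponential_decay_rate=0.96)

    decay_fn = get_learning_rate_decay_fn(params)
    global_step = tf.train.get_or_create_global_step()
    lr = decay_fn(learning_rate=0.01, global_step=global_step)  # scalar Tensor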
- """ - paramsv = params.values() - if 'learning_rate_decay' not in paramsv or params.learning_rate_decay == 'no_learning_rate_decay': - return None - elif params.learning_rate_decay == 'exponential_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'exponential'") - if 'exponential_decay_rate' not in paramsv: - raise ValueError("Expecting params.exponential_decay_rate for " - "params.learning_rate_decay == 'exponential'") - - def exponential_decay_fn(learning_rate, global_step): - """ exponential decay function to be passed to optimize_loss """ - return tf.train.exponential_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - decay_rate=params.exponential_decay_rate - ) - return exponential_decay_fn - elif params.learning_rate_decay == 'piecewise_constant_learning_rate_decay': - if 'piecewise_constant_boundaries' not in paramsv: - raise ValueError("Expecting params.piecewise_constant_boundaries for " - "params.learning_rate_decay == 'piecewise_constant'") - if 'piecewise_constant_values' not in paramsv: - raise ValueError("Expecting params.piecewise_constant_values for " - "params.learning_rate_decay == 'piecewise_constant'") - # pylint: disable=unused-argument - - def piecewise_constant_fn(learning_rate, global_step): - """ piecewise_constant decay function to be passed to optimize_loss """ - return tf.train.piecewise_constant( - x=global_step, - boundaries=params.piecewise_constant_boundaries, - values=params.piecewise_constant_values - ) - return piecewise_constant_fn - elif params.learning_rate_decay == 'polynomial_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'polynomial'") - if 'end_learning_rate' not in paramsv: - raise ValueError("Expecting params.end_learning_rate for " - "params.learning_rate_decay == 'polynomial'") - - def polynomial_decay_fn(learning_rate, global_step): - """ polynomial decay function to be passed to optimize_loss """ - return tf.train.polynomial_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - end_learning_rate=params.end_learning_rate, - power=params.polynomial_power if 'polynomial_power' in paramsv else 1.0, - ) - return polynomial_decay_fn - - elif params.learning_rate_decay == 'inverse_learning_rate_decay': - if 'min_learning_rate' not in paramsv: - raise ValueError("Expecting params.min_learning_rate for " - "params.learning_rate_decay == 'inverse'") - if 'decay_rate' not in paramsv: - raise ValueError("Expecting params.decay_rate for " - "params.learning_rate_decay == 'inverse'") - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'inverse'") - - def bounded_inverse_time_decay_fn(learning_rate, global_step): - ''' - Returns the decayed learning_rate by applying the function: - decayed_lr = max(lr /(1 + decay_rate * floor(global_step /decay_step)), - min_learning_rate) - Arguments: - learning_rate: - A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: - A scalar `int32` or `int64` `Tensor` or a Python number. - Global step to use for the decay computation. Must not be negative. - min_learning_rate: - A scalar `int32` or `int64` `Tensor` or a Python number. - Minimum possible learning_rate. 
The decayed learning_rate will not be - smaller than the min_learning_rate - decay_steps: - How often to apply decay. In dbv1, this should be 1. - decay_rate: - A scalar `int32` or `int64` `Tensor` or a Python number. - Rate in which we decay the learning rate. - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - ''' - decayed_rate = tf.train.inverse_time_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - decay_rate=params.decay_rate) - # Getting dtype of returned Tensor - dtype = decayed_rate.dtype - # Casting the min_learning rate the same dtype as decayes rate - min_learning_rate = tf.cast(params.min_learning_rate, dtype) - # Returning the maximum between the two - return tf.maximum(decayed_rate, min_learning_rate) - - return bounded_inverse_time_decay_fn - - elif params.learning_rate_decay == 'cosine_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'cosine_decay'") - if "alpha" not in paramsv: - raise ValueError("Expecting params.alpha for " - "params.learning_rate_decay == 'cosine_decay'") - def cosine_decay_fn(learning_rate, global_step): - """ cosine decay function to be passed to optimize_loss """ - return tf.train.cosine_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - alpha=params.alpha - ) - return cosine_decay_fn - elif params.learning_rate_decay == 'cosine_restarts_learning_rate_decay': - if 'first_decay_steps' not in paramsv: - raise ValueError("Expecting params.first_decay_steps for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if 't_mul' not in paramsv: - raise ValueError("Expecting params.t_mul for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if 'm_mul' not in paramsv: - raise ValueError("Expecting params.m_mul for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if "alpha" not in paramsv: - raise ValueError("Expecting params.alpha for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - def cosine_restart_decay_fn(learning_rate, global_step): - """ cosine decay function to be passed to optimize_loss """ - return tf.train.cosine_decay_restarts( - learning_rate=learning_rate, - global_step=global_step, - first_decay_steps=params.first_decay_steps, - t_mul=params.t_mul, - m_mul=params.m_mul, - alpha=params.alpha - ) - return cosine_restart_decay_fn - - raise ValueError("Unsupported params.learning_rate_decay: %s" % params.learning_rate_decay) diff --git a/twml/twml/lookup/__init__.docx b/twml/twml/lookup/__init__.docx new file mode 100644 index 000000000..acdf0704b Binary files /dev/null and b/twml/twml/lookup/__init__.docx differ diff --git a/twml/twml/lookup/__init__.py b/twml/twml/lookup/__init__.py deleted file mode 100644 index 87392d719..000000000 --- a/twml/twml/lookup/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from tensorflow.python.ops.lookup_ops import ( - index_table_from_file, - index_table_from_tensor, - index_to_string_table_from_file -) # noqa: F401 - - -""" -NOTE: Using `from tensorflow.python.ops.lookup_ops import index_table_from_tensor` in the code works. -This stub exists because it was easier to refactor code because twml is widely used. 
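Since ``twml.lookup`` only re-exports the TensorFlow lookup ops, usage is identical to the originals. A hedged TF1 sketch follows; exact keyword names may vary slightly across TF 1.x versions.

.. code-block:: python

    # Hedged TF1 sketch of the re-exported lookup op.
    import tensorflow.compat.v1 as tf
    import twml.lookup

    table = twml.lookup.index_table_from_tensor(
        tf.constant(["cat", "dog", "bird"]), default_value=-1)
    ids = table.lookup(tf.constant(["dog", "fish"]))   # -> [1, -1]

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        print(sess.run(ids))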
-""" diff --git a/twml/twml/metrics.docx b/twml/twml/metrics.docx new file mode 100644 index 000000000..9ed2ef17d Binary files /dev/null and b/twml/twml/metrics.docx differ diff --git a/twml/twml/metrics.py b/twml/twml/metrics.py deleted file mode 100644 index ee2f82b74..000000000 --- a/twml/twml/metrics.py +++ /dev/null @@ -1,1380 +0,0 @@ -""" -This module contains custom tensorflow metrics used at Twitter. -Its components conform to conventions used by the ``tf.metrics`` module. - -""" - -from collections import OrderedDict -from functools import partial - -import numpy as np -import tensorboard as tb -import tensorflow.compat.v1 as tf - - -CLAMP_EPSILON = 0.00001 - - -def total_weight_metric( - labels, - predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - with tf.variable_scope(name, 'total_weight', (labels, predictions, weights)): - total_weight = _metric_variable(name='total_weight', shape=[], dtype=tf.float64) - - if weights is None: - weights = tf.cast(tf.size(labels), total_weight.dtype, name="default_weight") - else: - weights = tf.cast(weights, total_weight.dtype) - - # add up the weights to get total weight of the eval set - update_total_weight = tf.assign_add(total_weight, tf.reduce_sum(weights), name="update_op") - - value_op = tf.identity(total_weight) - update_op = tf.identity(update_total_weight) - - if metrics_collections: - tf.add_to_collections(metrics_collections, value_op) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return value_op, update_op - - -def num_samples_metric( - labels, - predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - with tf.variable_scope(name, 'num_samples', (labels, predictions, weights)): - num_samples = _metric_variable(name='num_samples', shape=[], dtype=tf.float64) - update_num_samples = tf.assign_add(num_samples, tf.cast(tf.size(labels), num_samples.dtype), name="update_op") - - value_op = tf.identity(num_samples) - update_op = tf.identity(update_num_samples) - - if metrics_collections: - tf.add_to_collections(metrics_collections, value_op) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return value_op, update_op - - -def ctr(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - # pylint: disable=unused-argument - """ - Compute the weighted average positive sample ratio based on labels - (i.e. weighted average percentage of positive labels). - The name `ctr` (click-through-rate) is from legacy. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - ctr: A `Tensor` representing positive sample ratio. - update_op: A update operation used to accumulate data into this metric. 
- """ - return tf.metrics.mean( - values=labels, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=name) - - -def predicted_ctr(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - # pylint: disable=unused-argument - """ - Compute the weighted average positive ratio based on predictions, - (i.e. weighted averaged predicted positive probability). - The name `ctr` (click-through-rate) is from legacy. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - predicted_ctr: A `Tensor` representing the predicted positive ratio. - update_op: A update operation used to accumulate data into this metric. - """ - return tf.metrics.mean( - values=predictions, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=name) - - -def prediction_std_dev(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - """ - Compute the weighted standard deviation of the predictions. - Note - this is not a confidence interval metric. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - metric value: A `Tensor` representing the value of the metric on the data accumulated so far. - update_op: A update operation used to accumulate data into this metric. 
- """ - with tf.variable_scope(name, 'pred_std_dev', (labels, predictions, weights)): - labels = tf.cast(labels, tf.float64) - predictions = tf.cast(predictions, tf.float64) - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float64, name="default_weight") - else: - weights = tf.cast(weights, tf.float64) - - # State kept during streaming of examples - total_weighted_preds = _metric_variable( - name='total_weighted_preds', shape=[], dtype=tf.float64) - total_weighted_preds_sq = _metric_variable( - name='total_weighted_preds_sq', shape=[], dtype=tf.float64) - total_weights = _metric_variable( - name='total_weights', shape=[], dtype=tf.float64) - - # Update state - update_total_weighted_preds = tf.assign_add(total_weighted_preds, tf.reduce_sum(weights * predictions)) - update_total_weighted_preds_sq = tf.assign_add(total_weighted_preds_sq, tf.reduce_sum(weights * predictions * predictions)) - update_total_weights = tf.assign_add(total_weights, tf.reduce_sum(weights)) - - # Compute output - def compute_output(tot_w, tot_wp, tot_wpp): - return tf.math.sqrt(tot_wpp / tot_w - (tot_wp / tot_w) ** 2) - std_dev_est = compute_output(total_weights, total_weighted_preds, total_weighted_preds_sq) - update_std_dev_est = compute_output(update_total_weights, update_total_weighted_preds, update_total_weighted_preds_sq) - - if metrics_collections: - tf.add_to_collections(metrics_collections, std_dev_est) - - if updates_collections: - tf.add_to_collections(updates_collections, update_std_dev_est) - - return std_dev_est, update_std_dev_est - - -def _get_arce_predictions(predictions, weights, label_weighted, labels, - up_weight, deprecated_rce, - total_positive, update_total_positive): - """ - Returns the ARCE predictions, total_positive, update_total_positive and weights - used by the rest of the twml.metrics.rce metric computation. 
- """ - predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") - label_weighted_comp = tf.subtract(tf.reduce_sum(weights), tf.reduce_sum(label_weighted)) - pred_weight_comp = tf.subtract(tf.reduce_sum(weights), tf.reduce_sum(predictions_weighted)) - normalizer_comp = label_weighted_comp / pred_weight_comp - - if up_weight is False: - total_positive_unweighted = _metric_variable( - name='total_positive_unweighted', shape=[], dtype=tf.float32) - - update_total_positive_unweighted = tf.assign_add( - total_positive_unweighted, tf.reduce_sum(labels), - name="total_positive_unweighted_update") - - if deprecated_rce: - normalizer = tf.reduce_sum(labels) / tf.reduce_sum(label_weighted) - else: - # sum of labels / sum of weighted labels - normalizer = update_total_positive_unweighted / update_total_positive - - label_comp = tf.subtract(tf.to_float(tf.size(labels)), tf.reduce_sum(labels)) - normalizer_comp = label_comp / label_weighted_comp - - # note that up_weight=True changes these for the rest of the twml.metric.rce computation - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - total_positive = total_positive_unweighted - update_total_positive = update_total_positive_unweighted - else: - if deprecated_rce: - normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - else: - # normalizer used for NRCE (and ARCE with up_weight=True) - total_prediction = _metric_variable(name='total_prediction', shape=[], dtype=tf.float32) - - # update the variable holding the sum of weighted predictions - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(predictions_weighted), name="total_prediction_update") - - # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - # but it measure normalizer over batch was too flawed an approximation. - normalizer = update_total_positive / update_total_prediction - - pred_comp = tf.subtract(tf.ones(shape=tf.shape(labels), dtype=tf.float32), predictions) - pred_comp_norm = tf.multiply(pred_comp, normalizer_comp, name="normalized_predictions_comp") - pred_num = tf.multiply(predictions, normalizer, name="normalized_pred_numerator") - pred_denom = tf.add(pred_num, pred_comp_norm, name="normalized_pred_denominator") - predictions = pred_num / pred_denom - - return predictions, total_positive, update_total_positive, weights - - -def rce(labels, predictions, - weights=None, - normalize=False, - arce=False, - up_weight=True, - metrics_collections=None, - updates_collections=None, - name=None, - deprecated_rce=False): - """ - Compute the relative cross entropy (RCE). - The RCE is a relative measurement compared to the baseline model's performance. - The baseline model always predicts average click-through-rate (CTR). - The RCE measures, in percentage, how much better the predictions are, compared - to the baseline model, in terms of cross entropy loss. - - y = label; p = prediction; - binary cross entropy = y * log(p) + (1-y) * log(1-p) - - Args: - labels: - the ground true value. - predictions: - the predicted values, whose shape must match labels. - weights: - optional weights, whose shape must match labels . Weight is 1 if not set. - normalize: - if set to true, produce NRCEs used at Twitter. (normalize preds by weights first) - NOTE: if you don't understand what NRCE is, please don't use it. - arce: - if set to true, produces `ARCE `_. - This can only be activated if `normalize=True`. 
- up_weight: - if set to true, produces arce in the up_weighted space (considers CTR after up_weighting - data), while False gives arce in the original space (only considers CTR before up_weighting). - In the actual version, this flag can only be activated if arce is True. - Notice that the actual version of NRCE corresponds to up_weight=True. - metrics_collections: - optional list of collections to add this metric into. - updates_collections: - optional list of collections to add the associated update_op into. - name: - an optional variable_scope name. - deprecated_rce: - enables the previous NRCE/ARCE calculations which calculated some label metrics - on the batch instead of on all batches seen so far. Note that the older metric - calculation is less stable, especially for smaller batch sizes. You should probably - never have to set this to True. - - Return: - rce_value: - A ``Tensor`` representing the RCE. - update_op: - A update operation used to accumulate data into this metric. - - .. note:: Must have at least 1 positive and 1 negative sample accumulated, - or RCE will come out as NaN. - """ - with tf.variable_scope(name, 'rce', (labels, predictions, weights)): - labels = tf.to_float(labels, name="label_to_float") - predictions = tf.to_float(predictions, name="predictions_to_float") - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - else: - weights = tf.to_float(weights, name="weight_to_float") - - total_positive = _metric_variable(name='total_positive', shape=[], dtype=tf.float32) - total_loss = _metric_variable(name='total_loss', shape=[], dtype=tf.float32) - total_weight = _metric_variable(name='total_weight', shape=[], dtype=tf.float32) - - label_weighted = tf.multiply(labels, weights, name="weighted_label") - - update_total_positive = tf.assign_add( - total_positive, tf.reduce_sum(label_weighted), name="total_pos_update") - - if arce: - if normalize is False: - raise ValueError('This configuration of parameters is not actually allowed') - - predictions, total_positive, update_total_positive, weights = _get_arce_predictions( - predictions=predictions, weights=weights, deprecated_rce=deprecated_rce, - label_weighted=label_weighted, labels=labels, up_weight=up_weight, - total_positive=total_positive, update_total_positive=update_total_positive) - - elif normalize: - predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") - - if deprecated_rce: - normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - else: - total_prediction = _metric_variable(name='total_prediction', shape=[], dtype=tf.float32) - - # update the variable holding the sum of weighted predictions - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(predictions_weighted), name="total_prediction_update") - - # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - # but it measure normalizer over batch was too flawed an approximation. 
- normalizer = update_total_positive / update_total_prediction - - # NRCE - predictions = tf.multiply(predictions, normalizer, name="normalized_predictions") - - # clamp predictions to keep log(p) stable - clip_p = tf.clip_by_value(predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p") - logloss = _binary_cross_entropy(pred=clip_p, target=labels, name="logloss") - - logloss_weighted = tf.multiply(logloss, weights, name="weighted_logloss") - - update_total_loss = tf.assign_add( - total_loss, tf.reduce_sum(logloss_weighted), name="total_loss_update") - update_total_weight = tf.assign_add( - total_weight, tf.reduce_sum(weights), name="total_weight_update") - - # metric value retrieval subgraph - ctr1 = tf.truediv(total_positive, total_weight, name="ctr") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - baseline_ce = _binary_cross_entropy(pred=ctr1, target=ctr1, name="baseline_ce") - pred_ce = tf.truediv(total_loss, total_weight, name="pred_ce") - - rce_t = tf.multiply( - 1.0 - tf.truediv(pred_ce, baseline_ce), - 100, - name="rce") - - # metric update subgraph - ctr2 = tf.truediv(update_total_positive, update_total_weight, name="ctr_update") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - baseline_ce2 = _binary_cross_entropy(pred=ctr2, target=ctr2, name="baseline_ce_update") - pred_ce2 = tf.truediv(update_total_loss, update_total_weight, name="pred_ce_update") - - update_op = tf.multiply( - 1.0 - tf.truediv(pred_ce2, baseline_ce2), - 100, - name="update_op") - - if metrics_collections: - tf.add_to_collections(metrics_collections, rce_t) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return rce_t, update_op - - -def ce(p_true, p_est=None): - if p_est is None: - p_est = p_true - return _binary_cross_entropy(pred=p_est, target=p_true, name=None) - - -def rce_transform(outputs, labels, weights): - ''' - Construct an OrderedDict of quantities to aggregate over eval batches - outputs, labels, weights are TensorFlow tensors, and are assumed to - be of shape [N] for batch_size = N - Each entry in the output OrderedDict should also be of shape [N] - ''' - out_vals = OrderedDict() - out_vals['weighted_loss'] = weights * ce(p_true=labels, p_est=outputs) - out_vals['weighted_labels'] = labels * weights - out_vals['weight'] = weights - return out_vals - - -def rce_metric(aggregates): - ''' - input ``aggregates`` is an OrderedDict with the same keys as those created - by rce_transform(). The dict values are the aggregates (reduce_sum) - of the values produced by rce_transform(), and should be scalars. - output is the value of RCE - ''' - # cummulative weighted loss of model predictions - total_weighted_loss = aggregates['weighted_loss'] - total_weighted_labels = aggregates['weighted_labels'] - total_weight = aggregates['weight'] - - model_average_loss = total_weighted_loss / total_weight - baseline_average_loss = ce(total_weighted_labels / total_weight) - return 100.0 * (1 - model_average_loss / baseline_average_loss) - - -def metric_std_err(labels, predictions, - weights=None, - transform=rce_transform, metric=rce_metric, - metrics_collections=None, - updates_collections=None, - name='rce_std_err'): - """ - Compute the weighted standard error of the RCE metric on this eval set. 
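A small NumPy check of the RCE definition implemented above, RCE = 100 * (1 - CE(model) / CE(baseline)), where the baseline always predicts the average label; weights are omitted for brevity.

.. code-block:: python

    # NumPy check of RCE = 100 * (1 - model_CE / baseline_CE), unweighted.
    import numpy as np

    y = np.array([1., 0., 1., 0.])
    p = np.array([0.8, 0.3, 0.6, 0.1])

    def ce(target, pred):
        return -np.mean(target * np.log(pred) + (1 - target) * np.log(1 - pred))

    ctr = y.mean()                          # 0.5, the baseline prediction
    rce = 100.0 * (1.0 - ce(y, p) / ce(y, np.full_like(y, ctr)))
    print(round(rce, 2))                    # ~56.86, i.e. ~57% better than baseline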
- This can be used for confidence intervals and unpaired hypothesis tests. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - transform: a function of the following form: - - .. code-block:: python - - def transform(outputs, labels, weights): - out_vals = OrderedDict() - ... - return out_vals - - where outputs, labels, and weights are all tensors of shape [eval_batch_size]. - The returned OrderedDict() should have values that are tensors of shape [eval_batch_size]. - These will be aggregated across many batches in the eval dataset, to produce - one scalar value per key of out_vals. - metric: a function of the following form - - .. code-block:: python - - def metric(aggregates): - ... - return metric_value - - where aggregates is an OrderedDict() having the same keys created by transform(). - Each of the corresponding dict values is the reduce_sum of the values produced by - transform(), and is a TF scalar. The return value should be a scalar representing - the value of the desired metric. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - metric value: A `Tensor` representing the value of the metric on the data accumulated so far. - update_op: A update operation used to accumulate data into this metric. - """ - with tf.variable_scope(name, 'metric_std_err', (labels, predictions, weights)): - labels = tf.cast(labels, tf.float64) - predictions = tf.cast(predictions, tf.float64) - - if weights is None: - weights = tf.ones_like(labels, dtype=tf.float64, name="default_weight") - else: - weights = tf.cast(weights, tf.float64) - - labels = tf.reshape(labels, [-1]) - predictions = tf.reshape(predictions, [-1]) - predictions = tf.clip_by_value(predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p") - weights = tf.reshape(weights, [-1]) - - # first apply the supplied transform function to the output, label, weight data - # returns an OrderedDict of 1xN tensors for N input samples - # for each sample, compute f = transform(pred, l, w) - transformed = transform(predictions, labels, weights) - - # we track 3 types of aggregate information - # 1. total number of samples - # 2. aggregated transformed samples (moment1), i.e. sum(f) - # 3. aggregated crosses of transformed samples (moment2), i.e. sum(f*f^T) - - # count total number of samples - sample_count = _metric_variable( - name='sample_count', shape=[], dtype=tf.int64) - update_sample_count = tf.assign_add(sample_count, tf.size(labels, out_type=sample_count.dtype)) - - # compose the ordered dict into a single vector - # so f can be treated as a single column vector rather than a collection of scalars - N = len(transformed) - transformed_vec = tf.stack(list(transformed.values()), axis=1) - - # compute and update transformed samples (1st order statistics) - # i.e. accumulate f into F as F += sum(f) - aggregates_1 = _metric_variable( - name='aggregates_1', shape=[N], dtype=tf.float64) - update_aggregates_1 = tf.assign_add(aggregates_1, tf.reduce_sum(transformed_vec, axis=0)) - - # compute and update crossed transformed samples (2nd order statistics) - # i.e. 
accumulate f*f^T into F2 as F2 += sum(f*transpose(f)) - aggregates_2 = _metric_variable( - name='aggregates_2', shape=[N, N], dtype=tf.float64) - moment_2_temp = ( - tf.reshape(transformed_vec, shape=[-1, N, 1]) - * tf.reshape(transformed_vec, shape=[-1, 1, N]) - ) - update_aggregates_2 = tf.assign_add(aggregates_2, tf.reduce_sum(moment_2_temp, axis=0)) - - def compute_output(agg_1, agg_2, samp_cnt): - # decompose the aggregates back into a dict to pass to the user-supplied metric fn - aggregates_dict = OrderedDict() - for i, key in enumerate(transformed.keys()): - aggregates_dict[key] = agg_1[i] - - metric_value = metric(aggregates_dict) - - # derivative of metric with respect to the 1st order aggregates - # i.e. d M(agg1) / d agg1 - metric_prime = tf.gradients(metric_value, agg_1, stop_gradients=agg_1) - - # estimated covariance of agg_1 - # cov(F) = sum(f*f^T) - (sum(f) * sum(f)^T) / N - # = agg_2 - (agg_1 * agg_1^T) / N - N_covariance_estimate = agg_2 - ( - tf.reshape(agg_1, shape=[-1, 1]) - @ tf.reshape(agg_1, shape=[1, -1]) - / tf.cast(samp_cnt, dtype=tf.float64) - ) - - # push N_covariance_estimate through a linearization of metric around agg_1 - # metric var = transpose(d M(agg1) / d agg1) * cov(F) * (d M(agg1) / d agg1) - metric_variance = ( - tf.reshape(metric_prime, shape=[1, -1]) - @ N_covariance_estimate - @ tf.reshape(metric_prime, shape=[-1, 1]) - ) - # result should be a single element, but the matmul is 2D - metric_variance = metric_variance[0][0] - metric_stderr = tf.sqrt(metric_variance) - return metric_stderr - - metric_stderr = compute_output(aggregates_1, aggregates_2, sample_count) - update_metric_stderr = compute_output(update_aggregates_1, update_aggregates_2, update_sample_count) - - if metrics_collections: - tf.add_to_collections(metrics_collections, metric_stderr) - - if updates_collections: - tf.add_to_collections(updates_collections, update_metric_stderr) - - return metric_stderr, update_metric_stderr - - -def lolly_nrce(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - """ - Compute the Lolly NRCE. - - Note: As this NRCE calculation uses Taylor expansion, it becomes inaccurate when the ctr is large, - especially when the adjusted ctr goes above 1.0. - - Calculation: - - :: - - NRCE: lolly NRCE - BCE: baseline cross entropy - NCE: normalized cross entropy - CE: cross entropy - y_i: label of example i - p_i: prediction of example i - y: ctr - p: average prediction - a: normalizer - - Assumes any p_i and a * p_i is within [0, 1) - NRCE = (1 - NCE / BCE) * 100 - BCE = - sum_i(y_i * log(y) + (1 - y_i) * log(1 - y)) - = - (y * log(y) + (1 - y) * log(1 - y)) - a = y / p - CE = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) - NCE = - sum_i(y_i * log(a * p_i) + (1 - y_i) * log(1 - a * p_i)) - = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) - - sum_i(y_i * log(a)) - + sum_i((1 - y_i) * log(1 - p_i)) - - sum_i((1 - y_i) * log(1 - a * p_i)) - ~= CE - sum_i(y_i) * log(a) - + sum_i((1 - y_i) * (- sum_{j=1~5}(p_i^j / j))) - - sum_i((1 - y_i) * (- sum_{j=1~5}(a^j * p_i^j / j))) - # Takes 5 items from the Taylor expansion, can be increased if needed - # Error for each example is O(p_i^6) - = CE - sum_i(y_i) * log(a) - - sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) / j) - + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * a^j / j) - = CE - sum_i(y_i) * log(a) - + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * (a^j - 1) / j) - - Thus we keep track of CE, sum_i(y_i), sum_i((1 - y_i) * p_i^j) for j=1~5. 
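A small NumPy check of the five-term Taylor approximation the derivation above uses for log(1 - a * p_i):

.. code-block:: python

    # NumPy check of the 5-term Taylor approximation used above:
    # log(1 - x) ~= -(x + x**2/2 + x**3/3 + x**4/4 + x**5/5), error O(x**6)
    import numpy as np

    x = 0.1                                 # e.g. x = a * p_i for a small prediction
    exact  = np.log(1.0 - x)
    approx = -sum(x ** j / j for j in range(1, 6))
    print(exact, approx, abs(exact - approx))   # error ~1.8e-07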
- We also keep track of p and y by sum_i(y_i), sum_i(p_i), sum_i(1) so that - we can get a at the end, which leads to this NRCE. - - NRCE uses ctr and average pctr to normalize the pctrs. - It removes the impact of prediction error from RCE. - Usually NRCE is higher as the prediction error impact on RCE is negative. - Removing prediction error in our model can make RCE closer to NRCE and thus improve RCE. - - In Lolly NRCE we use ctr and average pctr of the whole dataset. - We thus remove the dataset level error in NRCE calculation. - In this case, when we want to improve RCE to the level of NRCE, - it is achievable as dataset level prediction error is easy to remove by calibration. - Lolly NRCE is thus a good estimate about the potential gain by adding calibration. - - In DBv2 NRCE, we use per-batch ctr and average pctr. We remove the batch level error. - This error is difficult to remove by modeling improvement, - at least not by simple calibration. - It thus cannot indicate the same opportunity as the Lolly NRCE does. - - Args: - labels: - the ground true value. - predictions: - the predicted values, whose shape must match labels. - weights: - optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: - optional list of collections to add this metric into. - updates_collections: - optional list of collections to add the associated update_op into. - name: - an optional variable_scope name. - - Return: - rce_value: - A ``Tensor`` representing the RCE. - update_op: - A update operation used to accumulate data into this metric. - - Note: Must have at least 1 positive and 1 negative sample accumulated, - or NRCE will come out as NaN. - """ - with tf.variable_scope(name, "lolly_nrce", (labels, predictions, weights)): - labels = tf.to_float(labels, name="label_to_float") - predictions = tf.to_float(predictions, name="predictions_to_float") - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - else: - weights = tf.to_float(weights, name="weight_to_float") - - positive_weights = tf.multiply(labels, weights, name="positive_weights") - - # clamp predictions to keep log(p) stable - clip_predictions = tf.clip_by_value( - predictions, - CLAMP_EPSILON, - 1.0 - CLAMP_EPSILON, - name="clip_predictions") - weighted_predictions = tf.multiply( - predictions, weights, - name="weighted_predictions") - - logloss = _binary_cross_entropy(pred=clip_predictions, target=labels, name="logloss") - weighted_logloss = tf.multiply(logloss, weights, name="weighted_logloss") - - negatives = tf.subtract( - tf.ones(shape=tf.shape(labels), dtype=tf.float32), - labels, - name="negatives") - negative_predictions = tf.multiply( - predictions, - negatives, - name="negative_predictions") - weighted_negative_predictions = tf.multiply( - negative_predictions, weights, - name="weighted_negative_predictions") - negative_squared_predictions = tf.multiply( - negative_predictions, - negative_predictions, - name="negative_squared_predictions") - weighted_negative_squared_predictions = tf.multiply( - negative_squared_predictions, weights, - name="weighted_negative_squared_predictions") - negative_cubed_predictions = tf.multiply( - negative_squared_predictions, - negative_predictions, - name="negative_cubed_predictions") - weighted_negative_cubed_predictions = tf.multiply( - negative_cubed_predictions, weights, - name="weighted_negative_cubed_predictions") - negative_quartic_predictions = tf.multiply( - negative_cubed_predictions, - 
negative_predictions, - name="negative_quartic_predictions") - weighted_negative_quartic_predictions = tf.multiply( - negative_quartic_predictions, weights, - name="weighted_negative_quartic_predictions") - negative_quintic_predictions = tf.multiply( - negative_quartic_predictions, - negative_predictions, - name="negative_quintic_predictions") - weighted_negative_quintic_predictions = tf.multiply( - negative_quintic_predictions, weights, - name="weighted_negative_quintic_predictions") - - # Tracked stats - total_positive = _metric_variable(name="total_positive", shape=[], dtype=tf.float32) - total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) - - total_prediction = _metric_variable(name="total_prediction", shape=[], dtype=tf.float32) - - total_negative_prediction = _metric_variable( - name="total_negative_prediction", - shape=[], dtype=tf.float32) - total_negative_squared_prediction = _metric_variable( - name="total_negative_squared_prediction", - shape=[], dtype=tf.float32) - total_negative_cubed_prediction = _metric_variable( - name="total_negative_cubed_prediction", - shape=[], dtype=tf.float32) - total_negative_quartic_prediction = _metric_variable( - name="total_negative_quartic_prediction", - shape=[], dtype=tf.float32) - total_negative_quintic_prediction = _metric_variable( - name="total_negative_quintic_prediction", - shape=[], dtype=tf.float32) - - total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) - - # Update tracked stats - update_total_positive = tf.assign_add( - total_positive, tf.reduce_sum(positive_weights), name="total_positive_update") - update_total_weight = tf.assign_add( - total_weight, tf.reduce_sum(weights), name="total_weight_update") - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(weighted_predictions), name="total_prediction_update") - update_total_negative_prediction = tf.assign_add( - total_negative_prediction, - tf.reduce_sum(weighted_negative_predictions), name="total_negative_prediction_update") - update_total_negative_squared_prediction = tf.assign_add( - total_negative_squared_prediction, - tf.reduce_sum(weighted_negative_squared_predictions), - name="total_negative_squared_prediction_update") - update_total_negative_cubed_prediction = tf.assign_add( - total_negative_cubed_prediction, - tf.reduce_sum(weighted_negative_cubed_predictions), - name="total_negative_cubed_prediction_update") - update_total_negative_quartic_prediction = tf.assign_add( - total_negative_quartic_prediction, - tf.reduce_sum(weighted_negative_quartic_predictions), - name="total_negative_quartic_prediction_update") - update_total_negative_quintic_prediction = tf.assign_add( - total_negative_quintic_prediction, - tf.reduce_sum(weighted_negative_quintic_predictions), - name="total_negative_quintic_prediction_update") - update_total_loss = tf.assign_add( - total_loss, tf.reduce_sum(weighted_logloss), name="total_loss_update") - - # metric value retrieval subgraph - # ctr of this batch - positive_rate = tf.truediv(total_positive, total_weight, name="positive_rate") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. 
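As a side note (not from the original source): the simplification referenced here is simply that when the prediction is a constant equal to the observed positive rate y, every example has the same cross entropy, so the weighted average collapses to the binary entropy of the ctr, BCE = -(y * log(y) + (1 - y) * log(1 - y)). A quick standalone check for the unweighted case:

.. code-block:: python

    import numpy as np

    y = np.array([1., 0., 0., 1., 0.])      # labels
    ctr = y.mean()                           # constant baseline prediction
    per_example = -(y * np.log(ctr) + (1 - y) * np.log(1 - ctr))
    entropy = -(ctr * np.log(ctr) + (1 - ctr) * np.log(1 - ctr))
    assert np.isclose(per_example.mean(), entropy)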
- baseline_loss = _binary_cross_entropy( - pred=positive_rate, - target=positive_rate, - name="baseline_loss") - - # normalizing ratio for nrce - # calculated using total ctr and pctr so the last batch has the dataset ctr and pctr - normalizer = tf.truediv(total_positive, total_prediction, name="normalizer") - # Taylor expansion to calculate nl = - sum(y * log(p * a) + (1 - y) * log (1 - p * a)) - # log(1 - p * a) = -sum_{i=1~+inf}(a^i * x^i / i) - # log(1 - p) = -sum_{i=1~+inf}(a^i * x^i / i) - normalized_loss = ( - total_loss - - total_positive * tf.log(normalizer) + - total_negative_prediction * (normalizer - 1) + - total_negative_squared_prediction * (normalizer * normalizer - 1) / 2 + - total_negative_cubed_prediction * - (normalizer * normalizer * normalizer - 1) / 3 + - total_negative_quartic_prediction * - (normalizer * normalizer * normalizer * normalizer - 1) / 4 + - total_negative_quintic_prediction * - (normalizer * normalizer * normalizer * normalizer * normalizer - 1) / 5) - - # average normalized loss - avg_loss = tf.truediv(normalized_loss, total_weight, name="avg_loss") - - nrce_t = tf.multiply( - 1.0 - tf.truediv(avg_loss, baseline_loss), - 100, - name="lolly_nrce") - - # metric update subgraph - update_positive_rate = tf.truediv( - update_total_positive, - update_total_weight, - name="update_positive_rate") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - update_baseline_loss = _binary_cross_entropy( - pred=update_positive_rate, - target=update_positive_rate, - name="update_baseline_loss") - - update_normalizer = tf.truediv( - update_total_positive, - update_total_prediction, - name="update_normalizer") - update_normalized_loss = ( - update_total_loss - - update_total_positive * tf.log(update_normalizer) + - update_total_negative_prediction * - (update_normalizer - 1) + - update_total_negative_squared_prediction * - (update_normalizer * update_normalizer - 1) / 2 + - update_total_negative_cubed_prediction * - (update_normalizer * update_normalizer * update_normalizer - 1) / 3 + - update_total_negative_quartic_prediction * - (update_normalizer * update_normalizer * update_normalizer * - update_normalizer - 1) / 4 + - update_total_negative_quintic_prediction * - (update_normalizer * update_normalizer * update_normalizer * - update_normalizer * update_normalizer - 1) / 5) - - update_avg_loss = tf.truediv( - update_normalized_loss, - update_total_weight, - name="update_avg_loss") - - update_op = tf.multiply( - 1.0 - tf.truediv(update_avg_loss, update_baseline_loss), - 100, - name="update_op") - - if metrics_collections: - tf.add_to_collections(metrics_collections, nrce_t) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return nrce_t, update_op - - -def _binary_cross_entropy(pred, target, name): - return - tf.add( - target * tf.log(pred), - (1.0 - target) * tf.log(1.0 - pred), - name=name) - - -# Copied from metrics_impl.py with minor modifications. 
-# https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39 -def _metric_variable(shape, dtype, validate_shape=True, name=None): - """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" - - return tf.Variable( - lambda: tf.zeros(shape, dtype), - trainable=False, - collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], - validate_shape=validate_shape, - name=name) - -PERCENTILES = np.linspace(0, 1, 101, dtype=np.float32) - -# metric_name: (metric, requires thresholded output) -SUPPORTED_BINARY_CLASS_METRICS = { - # TWML metrics - 'total_weight': (total_weight_metric, False), - 'num_samples': (num_samples_metric, False), - 'rce': (rce, False), - 'rce_std_err': (partial(metric_std_err, transform=rce_transform, metric=rce_metric, name='rce_std_err'), False), - 'nrce': (partial(rce, normalize=True), False), - 'lolly_nrce': (lolly_nrce, False), - 'arce': (partial(rce, normalize=True, arce=True), False), - 'arce_original': (partial(rce, normalize=True, arce=True, up_weight=False), False), - # CTR measures positive sample ratio. This terminology is inherited from Ads. - 'ctr': (ctr, False), - # predicted CTR measures predicted positive ratio. - 'predicted_ctr': (predicted_ctr, False), - 'pred_std_dev': (prediction_std_dev, False), - # thresholded metrics - 'accuracy': (tf.metrics.accuracy, True), - 'precision': (tf.metrics.precision, True), - 'recall': (tf.metrics.recall, True), - - 'false_positives': (tf.metrics.false_positives, True), - 'false_negatives': (tf.metrics.false_negatives, True), - 'true_positives': (tf.metrics.true_positives, True), - 'true_negatives': (tf.metrics.true_negatives, True), - - 'precision_at_percentiles': (partial(tf.metrics.precision_at_thresholds, thresholds=PERCENTILES), False), - 'recall_at_percentiles': (partial(tf.metrics.recall_at_thresholds, thresholds=PERCENTILES), False), - 'false_positives_at_percentiles': (partial(tf.metrics.false_positives_at_thresholds, thresholds=PERCENTILES), False), - 'false_negatives_at_percentiles': (partial(tf.metrics.false_negatives_at_thresholds, thresholds=PERCENTILES), False), - 'true_positives_at_percentiles': (partial(tf.metrics.true_positives_at_thresholds, thresholds=PERCENTILES), False), - 'true_negatives_at_percentiles': (partial(tf.metrics.true_negatives_at_thresholds, thresholds=PERCENTILES), False), - - # tensorflow metrics - 'roc_auc': (partial(tf.metrics.auc, curve='ROC', - summation_method='careful_interpolation'), False), - 'pr_auc': (partial(tf.metrics.auc, curve='PR', - summation_method='careful_interpolation'), False), - - # tensorboard curves - 'pr_curve': (tb.summary.v1.pr_curve_streaming_op, False), - - # deprecated metrics - 'deprecated_nrce': (partial(rce, normalize=True, deprecated_rce=True), False), - 'deprecated_arce': (partial(rce, normalize=True, arce=True, deprecated_rce=True), False), - 'deprecated_arce_original': (partial(rce, normalize=True, arce=True, - up_weight=False, deprecated_rce=True), False) -} - -# default metrics provided by get_binary_class_metric_fn -DEFAULT_BINARY_CLASS_METRICS = ['total_weight', 'num_samples', 'rce', 'rce_std_err', - 'nrce', 'arce', 'ctr', 'predicted_ctr', 'pred_std_dev', - 'accuracy', 'precision', 'recall', 'roc_auc', 'pr_auc'] - - -def get_binary_class_metric_fn(metrics=None): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... 
- return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for binary classification. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - The following graph_output keys are recognized: - output: - the raw predictions between 0 and 1. Required. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from following supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - - ctr (same as positive sample ratio.) - - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - `arce `_ (a more recent proposed improvment over NRCE) - - arce_original - - lolly_nrce (NRCE as it is computed in Lolly, with Taylor expansion) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - pr_curve (precision-recall curve) - - deprecated_arce (ARCE as it was calculated before a stability fix) - - deprecated_nrce (NRCE as it was calculated before a stability fix) - - Example of metrics list with mixture of string and tuple: - metrics = [ - 'rce','nrce', - 'roc_auc', # default roc_auc metric - ( - 'roc_auc_500', # give this metric a name - partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500), # the metric fn - False, # whether the metric requires thresholded output - )] - - NOTE: When predicting rare events roc_auc can be underestimated. Increasing num_threshold - can reduce the underestimation. See go/roc-auc-pitfall for more details. - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - """ - # pylint: disable=dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(DEFAULT_BINARY_CLASS_METRICS) - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. 
- """ - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - if hard_preds is None: - hard_preds = tf.greater_equal(preds, threshold) - - # add metrics to eval_metric_ops dict - for metric in metrics: - if isinstance(metric, tuple) and len(metric) == 3: - metric_name, metric_factory, requires_threshold = metric - metric_name = metric_name.lower() - elif isinstance(metric, str): - metric_name = metric.lower() # metric name are case insensitive. - metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - else: - raise ValueError("Metric should be either string or tuple of length 3.") - - if metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - if metric_factory: - value_op, update_op = metric_factory( - labels=labels, - predictions=(hard_preds if requires_threshold else preds), - weights=weights, name=metric_name) - eval_metric_ops[metric_name] = (value_op, update_op) - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops - - -def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for concatenated binary classifications. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - In multiple binary classification problems, the - ``predictions`` (that is, ``graph_output['output']``) - are expected to have shape ``batch_size x n_classes``, - where ``n_classes`` is the number of binary classification. - Binary classification at output[i] is expected to discriminate between ``classes[i]`` (1) - and NOT ``classes[i]`` (0). The labels should be of the same shape as ``graph_output`` - with binary values (0 or 1). The weights can be of size ``batch_size`` or - ``batch_size x n_classes``. The ``class_dim`` contain separate probabilities, - and need to have separate metrics. - - The following graph_output keys are recognized: - output: - the raw predictions between 0 and 1. Required. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - metrics (list of Metrics): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from following supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - - ctr (same as positive sample ratio.) 
- - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - pr_curve (precision-recall curve) - - Example of metrics list with mixture of string and tuple: - metrics = [ - 'rce','nrce', - 'roc_auc', # default roc_auc metric - ( - 'roc_auc_500', # give this metric a name - partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500), # the metric fn - False, # whether the metric requires thresholded output - )] - - NOTE: When prediction on rare events, roc_auc can be underestimated. Increase num_threshold - can reduce the underestimation. See go/roc-auc-pitfall for more details. - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - - classes (list of strings): - In case of multiple binary class models, the names for each class or label. - These are used to display metrics on tensorboard. - If these are not specified, the index in the class or label dimension is used, and you'll - get metrics on tensorboard named like: accuracy_0, accuracy_1, etc. - - class_dim (number): - Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes. - """ - # pylint: disable=invalid-name,dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(DEFAULT_BINARY_CLASS_METRICS) - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. - """ - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - if hard_preds is None: - hard_preds = tf.greater_equal(preds, threshold) - - shape = labels.get_shape() - # basic sanity check: multi_metric dimension must exist - assert len(shape) > class_dim, "Dimension specified by class_dim does not exist." - - num_labels = shape[class_dim] - # If we are doing multi-class / multi-label metric, the number of classes / labels must - # be know at graph construction time. This dimension cannot have size None. - assert num_labels is not None, "The multi-metric dimension cannot be None." 
- assert classes is None or len(classes) == num_labels, ( - "Number of classes must match the number of labels") - - weights_shape = weights.get_shape() if weights is not None else None - if weights_shape is None: - num_weights = None - elif len(weights_shape) > 1: - num_weights = weights_shape[class_dim] - else: - num_weights = 1 - - for i in range(num_labels): - - # add metrics to eval_metric_ops dict - for metric in metrics: - if isinstance(metric, tuple) and len(metric) == 3: - metric_name, metric_factory, requires_threshold = metric - metric_name = metric_name.lower() - elif isinstance(metric, str): - metric_name = metric.lower() # metric name are case insensitive. - metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - else: - raise ValueError("Metric should be either string or tuple of length 3.") - - class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i)) - - if class_metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - class_labels = tf.gather(labels, indices=[i], axis=class_dim) - class_preds = tf.gather(preds, indices=[i], axis=class_dim) - class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) - - if num_weights is None: - class_weights = None - elif num_weights == num_labels: - class_weights = tf.gather(weights, indices=[i], axis=class_dim) - elif num_weights == 1: - class_weights = weights - else: - raise ValueError("num_weights (%d) and num_labels (%d) do not match" - % (num_weights, num_labels)) - - if metric_factory: - value_op, update_op = metric_factory( - labels=class_labels, - predictions=(class_hard_preds if requires_threshold else class_preds), - weights=class_weights, name=class_metric_name) - eval_metric_ops[class_metric_name] = (value_op, update_op) - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops - - -def _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=True): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops with uncalibrated output. - - The following graph_output keys are recognized: - uncalibrated_output: - the uncalibrated raw predictions between 0 and 1. Required. - output: - the calibrated predictions between 0 and 1. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - calibrated_metric_fn: metrics function with calibration and weight. - keep_weight: Bool indicating whether we keep weight. - """ - metric_scope = 'uncalibrated' if keep_weight else 'unweighted' - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. 
- """ - with tf.variable_scope(metric_scope): - if 'uncalibrated_output' not in graph_output: - raise Exception("Missing uncalibrated_output in graph_output!") - un_calibrated_weights = weights if keep_weight else tf.ones_like(weights) - uncalibrated_output = { - 'output': graph_output['uncalibrated_output'], - 'threshold': graph_output.get('threshold', 0.5), - 'hard_output': graph_output.get('hard_output'), - **{k: v for k, v in graph_output.items() if k not in ['output', 'threshold', 'hard_output']} - } - - eval_metrics_ops = calibrated_metric_fn(uncalibrated_output, labels, un_calibrated_weights) - - renamed_metrics_ops = {f'{metric_scope}_{k}': v for k, v in eval_metrics_ops.items()} - return renamed_metrics_ops - - return get_eval_metric_ops - - -def get_multi_binary_class_uncalibrated_metric_fn( - metrics, classes=None, class_dim=1, keep_weight=True): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for concatenated binary classifications without calibration. - - Note: 'uncalibrated_output' is required key in graph_output. - - The main use case for this function is: - - 1) To calculated roc-auc for rare event. - Calibrated prediction score for rare events will be concentrated near zero. As a result, - the roc-auc can be seriously underestimated with current implementation in tf.metric.auc. - Since roc-auc is invariant against calibration, we can directly use uncalibrated score for roc-auc. - For more details, please refer to: go/roc-auc-invariance. - - 2) To set keep_weight=False and get unweighted and uncalibrated metrics. - This is useful to eval how the model is fitted to its actual training data, since - often time the model is trained without weight. - - Args: - metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - - classes (list of strings): - In case of multiple binary class models, the names for each class or label. - These are used to display metrics on tensorboard. - If these are not specified, the index in the class or label dimension is used, and you'll - get metrics on tensorboard named like: accuracy_0, accuracy_1, etc. - - class_dim (number): - Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes. - - keep_weight (bool): - Whether to keep weights for the metric. - """ - - calibrated_metric_fn = get_multi_binary_class_metric_fn( - metrics, classes=classes, class_dim=class_dim) - return _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=keep_weight) - - -def combine_metric_fns(*fn_list): - """ - Combine multiple metric functions. - For example, we can combine metrics function generated by - get_multi_binary_class_metric_fn and get_multi_binary_class_uncalibrated_metric_fn. - - Args: - *fn_list: Multiple metric functions to be combined - - Returns: - Combined metric function. 
- """ - def combined_metric_ops(*args, **kwargs): - eval_metric_ops = OrderedDict() - for fn in fn_list: - eval_metric_ops.update(fn(*args, **kwargs)) - return eval_metric_ops - return combined_metric_ops diff --git a/twml/twml/optimizers/__init__.docx b/twml/twml/optimizers/__init__.docx new file mode 100644 index 000000000..1e544ffb8 Binary files /dev/null and b/twml/twml/optimizers/__init__.docx differ diff --git a/twml/twml/optimizers/__init__.py b/twml/twml/optimizers/__init__.py deleted file mode 100644 index eaa29883c..000000000 --- a/twml/twml/optimizers/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from twitter.deepbird.compat.v1.optimizers import ( - LazyAdamOptimizer, - optimize_loss, - OPTIMIZER_SUMMARIES) # noqa: F401 diff --git a/twml/twml/parsers.docx b/twml/twml/parsers.docx new file mode 100644 index 000000000..c3915fd13 Binary files /dev/null and b/twml/twml/parsers.docx differ diff --git a/twml/twml/parsers.py b/twml/twml/parsers.py deleted file mode 100644 index eac60083a..000000000 --- a/twml/twml/parsers.py +++ /dev/null @@ -1,20 +0,0 @@ -''' -Contains implementations of functions to parse training and evaluation data. - -Modelers can use the functions in this module as the the train/eval_parse_fn of -the DataRecordTrainer constructor to customize how to parse their datasets. - -Modelers may also provide custom implementations of train/eval_parse_fn using these as reference. -''' - -from twitter.deepbird.io.legacy.parsers import ( - convert_to_supervised_input_receiver_fn, # noqa: F401 - get_continuous_parse_fn, # noqa: F401 - get_default_parse_fn, # noqa: F401 - get_features_as_tensor_dict, # noqa: F401 - get_labels_in_features_parse_fn, # noqa: F401 - get_serving_input_receiver_fn_feature_dict, # noqa: F401 - get_sparse_parse_fn, # noqa: F401 - get_sparse_serving_input_receiver_fn, # noqa: F401 - get_tensor_parse_fn, # noqa: F401 -) diff --git a/twml/twml/readers/__init__.docx b/twml/twml/readers/__init__.docx new file mode 100644 index 000000000..9d6714f46 Binary files /dev/null and b/twml/twml/readers/__init__.docx differ diff --git a/twml/twml/readers/__init__.py b/twml/twml/readers/__init__.py deleted file mode 100644 index 06a6d79f5..000000000 --- a/twml/twml/readers/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# pylint: disable=wildcard-import -""" This module contains data readers """ - -from .batch_prediction_request import BatchPredictionRequest # noqa: F401 -from .data_record import DataRecord, SPARSE_DATA_RECORD_FEATURE_FIELDS # noqa: F401 -from .hashed_batch_prediction_request import HashedBatchPredictionRequest # noqa: F401 -from .hashed_data_record import HashedDataRecord # noqa: F401 \ No newline at end of file diff --git a/twml/twml/readers/batch_prediction_request.docx b/twml/twml/readers/batch_prediction_request.docx new file mode 100644 index 000000000..8865a45ae Binary files /dev/null and b/twml/twml/readers/batch_prediction_request.docx differ diff --git a/twml/twml/readers/batch_prediction_request.py b/twml/twml/readers/batch_prediction_request.py deleted file mode 100644 index 512a8c514..000000000 --- a/twml/twml/readers/batch_prediction_request.py +++ /dev/null @@ -1,8 +0,0 @@ -# pylint: disable=invalid-name -""" -This module implements the reader for BatchPredictionRequest. 
-""" - -from twitter.deepbird.io.legacy.readers.batch_prediction_request import ( - BatchPredictionRequest # noqa: F401 -) diff --git a/twml/twml/readers/data_record.docx b/twml/twml/readers/data_record.docx new file mode 100644 index 000000000..b445f7b3c Binary files /dev/null and b/twml/twml/readers/data_record.docx differ diff --git a/twml/twml/readers/data_record.py b/twml/twml/readers/data_record.py deleted file mode 100644 index d1c377afd..000000000 --- a/twml/twml/readers/data_record.py +++ /dev/null @@ -1,15 +0,0 @@ -# pylint: disable=invalid-name -""" -This module includes facilities for manipulating data records. -""" - -from twitter.deepbird.io.legacy.readers.data_record import ( - _SPEC_TO_TF, # noqa: F401 - SPARSE_DATA_RECORD_FEATURE_FIELDS, # noqa: F401 - _FeaturesBase, # noqa: F401 - _Features, # noqa: F401 - _DiscreteFeatures, # noqa: F401 - _StringFeatures, # noqa: F401 - _BaseDataRecord, # noqa: F401 - DataRecord, # noqa: F401 -) diff --git a/twml/twml/readers/hashed_batch_prediction_request.docx b/twml/twml/readers/hashed_batch_prediction_request.docx new file mode 100644 index 000000000..777ff4525 Binary files /dev/null and b/twml/twml/readers/hashed_batch_prediction_request.docx differ diff --git a/twml/twml/readers/hashed_batch_prediction_request.py b/twml/twml/readers/hashed_batch_prediction_request.py deleted file mode 100644 index 5850c4497..000000000 --- a/twml/twml/readers/hashed_batch_prediction_request.py +++ /dev/null @@ -1,8 +0,0 @@ -# pylint: disable=invalid-name -""" -This module implements the reader for HashedBatchPredictionRequest. -""" - -from twitter.deepbird.io.legacy.readers.hashed_batch_prediction_request import ( - HashedBatchPredictionRequest # noqa: F401 -) diff --git a/twml/twml/readers/hashed_data_record.docx b/twml/twml/readers/hashed_data_record.docx new file mode 100644 index 000000000..b87f3a133 Binary files /dev/null and b/twml/twml/readers/hashed_data_record.docx differ diff --git a/twml/twml/readers/hashed_data_record.py b/twml/twml/readers/hashed_data_record.py deleted file mode 100644 index 1ff9ce816..000000000 --- a/twml/twml/readers/hashed_data_record.py +++ /dev/null @@ -1,12 +0,0 @@ -# checkstyle: noqa -# pylint: disable=invalid-name -""" -This module includes facilities for manipulating hashed data records. -""" - -from twitter.deepbird.io.legacy.readers.hashed_data_record import ( - _HASHED_FIELDS, - _FEATURE_NAMES, - _FEATURE_TYPES, - HashedDataRecord, -) diff --git a/twml/twml/saved_model_cli/__init__.docx b/twml/twml/saved_model_cli/__init__.docx new file mode 100644 index 000000000..f127ba712 Binary files /dev/null and b/twml/twml/saved_model_cli/__init__.docx differ diff --git a/twml/twml/saved_model_cli/__init__.py b/twml/twml/saved_model_cli/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/twml/twml/saved_model_cli/__main__.docx b/twml/twml/saved_model_cli/__main__.docx new file mode 100644 index 000000000..16ca05910 Binary files /dev/null and b/twml/twml/saved_model_cli/__main__.docx differ diff --git a/twml/twml/saved_model_cli/__main__.py b/twml/twml/saved_model_cli/__main__.py deleted file mode 100644 index ad5326431..000000000 --- a/twml/twml/saved_model_cli/__main__.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -This module is responsible for running saved_model_cli. 
-""" -import sys - -from tensorflow.python.tools import saved_model_cli - -if __name__ == '__main__': - sys.exit(saved_model_cli.main()) diff --git a/twml/twml/summary/__init__.docx b/twml/twml/summary/__init__.docx new file mode 100644 index 000000000..a7add1e11 Binary files /dev/null and b/twml/twml/summary/__init__.docx differ diff --git a/twml/twml/summary/__init__.py b/twml/twml/summary/__init__.py deleted file mode 100644 index 284d7cf3f..000000000 --- a/twml/twml/summary/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from tensorflow.python.ops.summary_ops_v2 import flush # noqa: F401 - -""" -NOTE: Using `from tensorflow.python.ops.summary_ops_v2 import flush` in the code works. -This stub exists because it was easier to refactor code because twml is widely used. -""" diff --git a/twml/twml/tensorboard/__init__.docx b/twml/twml/tensorboard/__init__.docx new file mode 100644 index 000000000..f127ba712 Binary files /dev/null and b/twml/twml/tensorboard/__init__.docx differ diff --git a/twml/twml/tensorboard/__init__.py b/twml/twml/tensorboard/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/twml/twml/tensorboard/__main__.docx b/twml/twml/tensorboard/__main__.docx new file mode 100644 index 000000000..1c17bb976 Binary files /dev/null and b/twml/twml/tensorboard/__main__.docx differ diff --git a/twml/twml/tensorboard/__main__.py b/twml/twml/tensorboard/__main__.py deleted file mode 100644 index c426060d1..000000000 --- a/twml/twml/tensorboard/__main__.py +++ /dev/null @@ -1,16 +0,0 @@ -""" -This module is responsible for running tensorboard. -""" -import logging -import re -import sys - -from tensorboard.main import run_main - - -if __name__ == '__main__': - # Tensorboard relies on werkzeug for its HTTP server which logs at info level - # by default - logging.getLogger('werkzeug').setLevel(logging.WARNING) - sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) - sys.exit(run_main()) diff --git a/twml/twml/tensorio.docx b/twml/twml/tensorio.docx new file mode 100644 index 000000000..344617f63 Binary files /dev/null and b/twml/twml/tensorio.docx differ diff --git a/twml/twml/tensorio.py b/twml/twml/tensorio.py deleted file mode 100644 index bc551ac56..000000000 --- a/twml/twml/tensorio.py +++ /dev/null @@ -1,161 +0,0 @@ -# pylint: disable=missing-docstring, bare-except, pointless-statement, -# pointless-string-statement, redundant-unittest-assert, no-else-return, -# no-member, old-style-class, dangerous-default-value, protected-access, -# too-few-public-methods - -import os - -import numpy as np -import yaml - - -""" -Utility to load tensors serialized by Deepbird V1. - -Note that Deepbird V1 serialize tensor names as \"weight\".\'1\'. -For user-friendliness, the quotes are removed from the tensor names. -""" - - -# helper class used to assist hierarchical key access by remembering intermediate keys. -class _KeyRecorder(object): - def __init__(self, tensorio, keys=[]): - self.tensorio = tensorio - self.keys = keys - - def __getitem__(self, k): - new_keys = self.keys + [str(k)] - prefix = ".".join(new_keys) - - key_list = self.tensorio.list_tensors() - - # if we have a complete key, load the tensor. - if prefix in key_list: - return self.tensorio._load(prefix) - - # we don't have a complete key yet, but at least one tensor should start with this prefix. - for k_value in key_list: - if k_value.startswith(prefix): - return _KeyRecorder(self.tensorio, new_keys) - - # if no key starts with the prefix, this _key_recorder is not valid. 
- raise ValueError("Key not found: " + prefix) - - -# convert tensorio tensor type to numpy data type. -# also returns element size in bytes. -def _get_data_type(data_type): - if data_type == 'Double': - return (np.float64, 8) - - if data_type == 'Float': - return (np.float32, 4) - - if data_type == 'Int': - return (np.int32, 4) - - if data_type == 'Long': - return (np.int64, 8) - - if data_type == 'Byte': - return (np.int8, 1) - - raise ValueError('Unexpected tensorio data type: ' + data_type) - - -class TensorIO(object): - """ - Construct a TensorIO class. - tensorio_path: a directory containing tensors serialized using tensorio. tar file not supported. - mmap_tensor: - By default, loaded tensors use mmap storage. - Set this to false to not use mmap. Useful when loading multiple tensors. - """ - - def __init__(self, tensorio_path, mmap_tensor=True): - self._tensorio_path = tensorio_path - self._mmap_tensor = mmap_tensor - - # Make sure we can locate spec.yaml. - yaml_file = os.path.join(tensorio_path, 'spec.yaml') - if not os.path.exists(yaml_file): - raise ValueError('Invalid tensorio path: no spec.yaml found.') - - # load spec.yaml. - with open(yaml_file, 'r') as file_open: - # Note that tensor names in the yaml are like this: \"weight\".\'1\' - # For user-friendliness, we remove the quotes. - _spec = yaml.safe_load(file_open) - self._spec = {k.replace("'", '').replace('"', ''): v for (k, v) in _spec.items()} - - def list_tensors(self): - """ - Returns a list of tensors saved in the given path. - """ - return self._spec.keys() - - def _load_tensor(self, name): - """ - Load Tensor with the given name. - Raise value error if the named tensor is not found. - Returns a numpy array if the named tensor is found. - """ - tensor_info = self._spec[name] - if tensor_info['type'] != 'tensor': - raise ValueError('Trying to load a tensor of unknown type: ' + tensor_info['type']) - - filename = os.path.join(self._tensorio_path, tensor_info['filename']) - (data_type, element_size) = _get_data_type(tensor_info['tensorType']) - - np_array = np.memmap( - filename, - dtype=data_type, - mode='r', - # -1 because lua offset is 1 based. - offset=(tensor_info['offset'] - 1) * element_size, - shape=tuple(tensor_info['size']), - order='C', - ) - - return np_array if self._mmap_tensor else np_array[:].copy() - - def _load_nontensor_data(self, name): - """ - Load non-tensor data with the given name. - Returns a python string. - """ - tensor_info = self._spec[name] - return tensor_info['data'] - - def _load(self, name): - """ - Load data serialized under the given name, it could be a tensor or regular data. - """ - if name not in self._spec: - raise ValueError('The specified key {} is not found in {}'.format(name, self._tensorio_path)) - - data_type = self._spec[name]['type'] - if data_type == 'tensor': - return self._load_tensor(name) - else: - return self._load_nontensor_data(name) - - def load_all(self): - """ - Load all tensors stored in the tensorio directory. - Returns a dictionary from tensor name to numpy arrays. - """ - return {k: self._load(k) for k in self._spec} - - ########################################### - # The below are utilities for convenience # - ########################################### - def __getitem__(self, k): - """ - Shorthand for _load_tensor, but also supports hierarchical access like: tensorio['a']['b']['1'] - """ - if k in self._spec: - # We have a full tensor name, directly load it. 
- return self._load_tensor(k) - else: - return _KeyRecorder(self)[k] diff --git a/twml/twml/tracking/__init__.docx b/twml/twml/tracking/__init__.docx new file mode 100644 index 000000000..34c62826d Binary files /dev/null and b/twml/twml/tracking/__init__.docx differ diff --git a/twml/twml/tracking/__init__.py b/twml/twml/tracking/__init__.py deleted file mode 100644 index 008a59f70..000000000 --- a/twml/twml/tracking/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" -This module contains the ExperimentTracker class. -""" - -from .experiment_tracker import ExperimentTracker # noqa: F401 diff --git a/twml/twml/tracking/experiment_tracker.docx b/twml/twml/tracking/experiment_tracker.docx new file mode 100644 index 000000000..72473d486 Binary files /dev/null and b/twml/twml/tracking/experiment_tracker.docx differ diff --git a/twml/twml/tracking/experiment_tracker.py b/twml/twml/tracking/experiment_tracker.py deleted file mode 100644 index 4f275ba4b..000000000 --- a/twml/twml/tracking/experiment_tracker.py +++ /dev/null @@ -1,543 +0,0 @@ -""" -This module contains the experiment tracker for tracking training in ML Metastore -""" -from contextlib import contextmanager -from datetime import datetime -import getpass -import hashlib -import os -import re -import sys -import time - -from absl import logging -import tensorflow.compat.v1 as tf -from twml.hooks import MetricsUpdateHook - - -try: - from urllib import quote as encode_url -except ImportError: - from urllib.parse import quote as encode_url - - -try: - # ML Metastore packages might not be available on GCP. - # If they are not found, tracking is disabled - import requests - from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient - from com.twitter.mlmetastore.modelrepo.core.path import ( - check_valid_id, get_components_from_id, generate_id) - from com.twitter.mlmetastore.modelrepo.core import ( - DeepbirdRun, Experiment, FeatureConfig, FeatureConfigFeature, Model, ProgressReport, Project, StatusUpdate) -except ImportError: - ModelRepoClient = None - - -class ExperimentTracker(object): - """ - A tracker that records twml runs in ML Metastore. - """ - - def __init__(self, params, run_config, save_dir): - """ - - Args: - params (python dict): - The trainer params. ExperimentTracker uses `params.experiment_tracking_path` (String) and - `params.disable_experiment_tracking`. - If `experiment_tracking_path` is set to None, the tracker tries to guess a path with - save_dir. - If `disable_experiment_tracking` is True, the tracker is disabled. - run_config (tf.estimator.RunConfig): - The run config used by the estimator. - save_dir (str): - save_dir of the trainer - """ - if isinstance(params, dict): - self._params = params - else: - # preserving backward compatibility for people still using HParams - logging.warning("Please stop using HParams and use python dicts. 
HParams are removed in TF 2") - self._params = dict((k, v) for k, v in params.values().items() if v != 'null') - self._run_config = run_config - self._graceful_shutdown_port = self._params.get('health_port') - - self.tracking_path = self._params.get('experiment_tracking_path') - is_tracking_path_too_long = self.tracking_path is not None and len(self.tracking_path) > 256 - - if is_tracking_path_too_long: - raise ValueError("Experiment Tracking Path longer than 256 characters") - - self.disabled = ( - self._params.get('disable_experiment_tracking', False) or - not self._is_env_eligible_for_tracking() or - ModelRepoClient is None - ) - - self._is_hogwild = bool(os.environ.get('TWML_HOGWILD_PORTS')) - - self._is_distributed = bool(os.environ.get('TF_CONFIG')) - - self._client = None if self.disabled else ModelRepoClient() - - run_name_from_environ = self.run_name_from_environ() - run_name_can_be_inferred = ( - self.tracking_path is not None or run_name_from_environ is not None) - - # Turn the flags off as needed in hogwild / distributed - if self._is_hogwild or self._is_distributed: - self._env_eligible_for_recording_experiment = ( - self._run_config.task_type == "evaluator") - if run_name_can_be_inferred: - self._env_eligible_for_recording_export_metadata = ( - self._run_config.task_type == "chief") - else: - logging.info( - 'experiment_tracking_path is not set and can not be inferred. ' - 'Recording export metadata is disabled because the chief node and eval node ' - 'are setting different experiment tracking paths.') - self._env_eligible_for_recording_export_metadata = False - else: - # Defaults to True - self._env_eligible_for_recording_experiment = True - self._env_eligible_for_recording_export_metadata = True - - if not self.disabled: - # Sanitize passed in experiment tracking paths. e.g. own:proJ:exp:Run.Name - # -> own:proj:exp:Run_Name - if self.tracking_path: - try: - check_valid_id(self.tracking_path) - except ValueError as err: - logging.error(f'Invalid experiment tracking path provided. Sanitizing: {self.tracking_path}\nError: {err}') - self.tracking_path = generate_id( - owner=self.path['owner'], - project_name=self.path['project_name'], - experiment_name=self.path['experiment_name'], - run_name=self.path['run_name'] - ) - logging.error(f'Generated sanitized experiment tracking path: {self.tracking_path}') - else: - logging.info( - 'No experiment_tracking_path set. Experiment Tracker will try to guess a path') - self.tracking_path = self.guess_path(save_dir, run_name_from_environ) - logging.info('Guessed path: %s', self.tracking_path) - - # additional check to see if generated path is valid - try: - check_valid_id(self.tracking_path) - except ValueError as err: - logging.error( - 'Could not generate valid experiment tracking path. Disabling tracking. ' + - 'Error:\n{}'.format(err) - ) - self.disabled = True - - self.project_id = None if self.disabled else '{}:{}'.format( - self.path['owner'], self.path['project_name']) - self.base_run_id = None if self.disabled else self.tracking_path - self._current_run_name_suffix = None - - self._current_tracker_hook = None - - if self.disabled: - logging.info('Experiment Tracker is disabled') - else: - logging.info('Experiment Tracker initialized with base run id: %s', self.base_run_id) - - @contextmanager - def track_experiment(self, eval_hooks, get_estimator_spec_fn, name=None): - """ - A context manager for tracking experiment. It should wrap the training loop. - An experiment tracker eval hook is appended to eval_hooks to collect metrics. 
- - Args: - eval_hooks (list): - The list of eval_hooks to be used. When it's not None, and does not contain any , - MetricsUpdateHook an experiment tracker eval hook is appended to it. When it contains - any MetricsUpdateHook, this tracker is disabled to avoid conflict with legacy Model Repo - tracker (`TrackRun`). - get_estimator_spec_fn (func): - A function to get the current EstimatorSpec of the trainer, used by the eval hook. - name (str); - Name of this training or evaluation. Used as a suffix of the run_id. - - Returns: - The tracker's eval hook which is appended to eval_hooks. - """ - - # disable this tracker if legacy TrackRun hook is present - # TODO: remove this once we completely deprecate the old TrackRun interface - if eval_hooks is not None: - self.disabled = self.disabled or any(isinstance(x, MetricsUpdateHook) for x in eval_hooks) - - logging.info('Is environment eligible for recording experiment: %s', - self._env_eligible_for_recording_experiment) - - if self._env_eligible_for_recording_experiment and self._graceful_shutdown_port: - requests.post('http://localhost:{}/track_training_start'.format( - self._graceful_shutdown_port - )) - - if self.disabled or eval_hooks is None: - yield None - else: - assert self._current_tracker_hook is None, 'experiment tracking has been started already' - - if name is not None: - self._current_run_name_suffix = '_' + name - - logging.info('Starting experiment tracking. Path: %s', self._current_run_id) - logging.info('Is environment eligible for recording export metadata: %s', - self._env_eligible_for_recording_export_metadata) - logging.info('This run will be available at: http://go/mldash/experiments/%s', - encode_url(self.experiment_id)) - - try: - self._record_run() - self._add_run_status(StatusUpdate(self._current_run_id, status='RUNNING')) - self._register_for_graceful_shutdown() - - self._current_tracker_hook = self.create_eval_hook(get_estimator_spec_fn) - except Exception as err: - logging.error( - 'Failed to record run. This experiment will not be tracked. Error: %s', str(err)) - self._current_tracker_hook = None - - if self._current_tracker_hook is None: - yield None - else: - try: - eval_hooks.append(self._current_tracker_hook) - yield self._current_tracker_hook - except Exception as err: - self._add_run_status( - StatusUpdate(self._current_run_id, status='FAILED', description=str(err))) - self._deregister_for_graceful_shutdown() - self._current_tracker_hook = None - self._current_run_name_suffix = None - logging.error('Experiment tracking done. Experiment failed.') - raise - - try: - if self._current_tracker_hook.metric_values: - self._record_update(self._current_tracker_hook.metric_values) - self._add_run_status(StatusUpdate(self._current_run_id, status='SUCCESS')) - logging.info('Experiment tracking done. Experiment succeeded.') - except Exception as err: - logging.error( - 'Failed to update mark run as successful. Error: %s', str(err)) - finally: - self._deregister_for_graceful_shutdown() - self._current_tracker_hook = None - self._current_run_name_suffix = None - - def create_eval_hook(self, get_estimator_spec_fn): - """ - Create an eval_hook to track eval metrics - - Args: - get_estimator_spec_fn (func): - A function that returns the current EstimatorSpec of the trainer. - """ - return MetricsUpdateHook( - get_estimator_spec_fn=get_estimator_spec_fn, - add_metrics_fn=self._record_update) - - def register_model(self, export_path): - """ - Record the exported model. 
- - Args: - export_path (str): - The path to the exported model. - """ - if self.disabled: - return None - - try: - logging.info('Model is exported to %s. Computing hash of the model.', export_path) - model_hash = self.compute_model_hash(export_path) - logging.info('Model hash: %s. Registering it in ML Metastore.', model_hash) - self._client.register_model(Model(model_hash, self.path['owner'], self.base_run_id)) - except Exception as err: - logging.error('Failed to register model. Error: %s', str(err)) - - def export_feature_spec(self, feature_spec_dict): - """ - Export feature spec to ML Metastore (go/ml-metastore). - - Please note that the feature list in FeatureConfig only keeps the list of feature hash ids due - to the 1mb upper limit for values in manhattan, and more specific information (feature type, - feature name) for each feature config feature is stored separately in FeatureConfigFeature dataset. - - Args: - feature_spec_dict (dict): A dictionary obtained from FeatureConfig.get_feature_spec() - """ - if self.disabled or not self._env_eligible_for_recording_export_metadata: - return None - - try: - logging.info('Exporting feature spec to ML Metastore.') - feature_list = feature_spec_dict['features'] - label_list = feature_spec_dict['labels'] - weight_list = feature_spec_dict['weight'] - self._client.add_feature_config(FeatureConfig(self._current_run_id, list(feature_list.keys()), - list(label_list.keys()), list(weight_list.keys()))) - - feature_config_features = [ - FeatureConfigFeature( - hash_id=_feature_hash_id, - feature_name=_feature['featureName'], - feature_type=_feature['featureType'] - ) - for _feature_hash_id, _feature in zip(feature_list.keys(), feature_list.values()) - ] - self._client.add_feature_config_features(list(feature_list.keys()), feature_config_features) - - feature_config_labels = [ - FeatureConfigFeature( - hash_id=_label_hash_id, - feature_name=_label['featureName'] - ) - for _label_hash_id, _label in zip(label_list.keys(), label_list.values()) - ] - self._client.add_feature_config_features(list(label_list.keys()), feature_config_labels) - - feature_config_weights = [ - FeatureConfigFeature( - hash_id=_weight_hash_id, - feature_name=_weight['featureName'], - feature_type=_weight['featureType'] - ) - for _weight_hash_id, _weight in zip(weight_list.keys(), weight_list.values()) - ] - self._client.add_feature_config_features(list(weight_list.keys()), feature_config_weights) - - except Exception as err: - logging.error('Failed to export feature spec. Error: %s', str(err)) - - @property - def path(self): - if self.disabled: - return None - return get_components_from_id(self.tracking_path, ensure_valid_id=False) - - @property - def experiment_id(self): - if self.disabled: - return None - return '%s:%s:%s' % (self.path['owner'], self.path['project_name'], - self.path['experiment_name']) - - @property - def _current_run_name(self): - """ - Return the current run name. - """ - if self._current_run_name_suffix is not None: - return self.path['run_name'] + self._current_run_name_suffix - else: - return self.path['run_name'] - - @property - def _current_run_id(self): - """ - Return the current run id. - """ - if self._current_run_name_suffix is not None: - return self.base_run_id + self._current_run_name_suffix - else: - return self.base_run_id - - def get_run_status(self) -> str: - if not self.disabled: - return self._client.get_latest_dbv2_status(self._current_run_id) - - def _add_run_status(self, status): - """ - Add run status with underlying client. 
- - Args: - status (StatusUpdate): - The status update to add. - """ - if not self.disabled and self._env_eligible_for_recording_experiment: - self._client.add_run_status(status) - - def _record_run(self): - """ - Record the run in ML Metastore. - """ - if self.disabled or not self._env_eligible_for_recording_experiment: - return None - - if not self._client.project_exists(self.project_id): - self._client.add_project(Project(self.path['project_name'], self.path['owner'])) - time.sleep(1) - - if not self._client.experiment_exists(self.experiment_id): - self._client.add_experiment(Experiment( - self.path['experiment_name'], self.path['owner'], self.project_id, '')) - time.sleep(1) - - run = DeepbirdRun(self.experiment_id, self._current_run_name, '', - {'raw_command': ' '.join(sys.argv)}, self._params) - self._client.add_deepbird_run(run, force=True) - time.sleep(1) - - def _record_update(self, metrics): - """ - Record metrics update in ML Metastore. - - Args: - metrics (dict): - The dict of the metrics and their values. - """ - - if self.disabled or not self._env_eligible_for_recording_experiment: - return None - - reported_metrics = {} - for k, v in metrics.items(): - - if hasattr(v, 'item'): - reported_metrics[k] = v.item() if v.size == 1 else str(v.tolist()) - else: - logging.warning("Ignoring %s because the value (%s) is not valid" % (k, str(v))) - - report = ProgressReport(self._current_run_id, reported_metrics) - - try: - self._client.add_progress_report(report) - except Exception as err: - logging.error('Failed to record metrics in ML Metastore. Error: {}'.format(err)) - logging.error('Run ID: {}'.format(self._current_run_id)) - logging.error('Progress Report: {}'.format(report.to_json_string())) - - def _register_for_graceful_shutdown(self): - """ - Register the tracker with the health server, enabling graceful shutdown. - - Returns: - (Response) health server response - """ - if self._graceful_shutdown_port and not self.disabled and self._env_eligible_for_recording_experiment: - return requests.post('http://localhost:{}/register_id/{}'.format( - self._graceful_shutdown_port, - self._current_run_id - )) - - def _deregister_for_graceful_shutdown(self): - """ - Deregister the tracker with the health server, disabling graceful shutdown. - - Returns: - (Response) health server response - """ - if self._graceful_shutdown_port and not self.disabled and self._env_eligible_for_recording_experiment: - return requests.post('http://localhost:{}/deregister_id/{}'.format( - self._graceful_shutdown_port, - self._current_run_id - )) - - def _is_env_eligible_for_tracking(self): - """ - Determine if experiment tracking should run in the env. - """ - is_unit_test = ( - os.environ.get('PYTEST_CURRENT_TEST') is not None and - os.environ.get('TEST_EXP_TRACKER') is None - ) - - is_running_on_ci = ( - getpass.getuser() == 'scoot-service' and - os.environ.get('TEST_EXP_TRACKER') is None - ) - - return ( - not is_unit_test and - not is_running_on_ci - ) - - @classmethod - def run_name_from_environ(cls): - """ - Create run id from environment if possible. - """ - job_name = os.environ.get("TWML_JOB_NAME") - job_launch_time = os.environ.get("TWML_JOB_LAUNCH_TIME") - - if not job_name or not job_launch_time: - return None - - try: - # job_launch_time should be in isoformat - # python2 doesnt support datetime.fromisoformat, so use hardcoded format string. 
- job_launch_time_formatted = datetime.strptime(job_launch_time, - "%Y-%m-%dT%H:%M:%S.%f") - except ValueError: - # Fallback in case aurora config is generating datetime in a different format. - job_launch_time_formatted = (job_launch_time - .replace("-", "_").replace("T", "_") - .replace(":", "_").replace(".", "_")) - - return '{}_{}'.format( - job_name, job_launch_time_formatted.strftime('%m_%d_%Y_%I_%M_%p')) - - @classmethod - def guess_path(cls, save_dir, run_name=None): - """ - Guess an experiment tracking path based on save_dir. - - Returns: - (str) guessed path - """ - if not run_name: - run_name = 'Unnamed_{}'.format(datetime.now().strftime('%m_%d_%Y_%I_%M_%p')) - - if save_dir.startswith('hdfs://'): - path_match = re.search(r'/user/([a-z0-9\-_]+)/([a-z0-9\-_]+)', save_dir) - - if path_match: - groups = path_match.groups() - user = groups[0] - project_name = groups[1] - - return generate_id(user, 'default', project_name, run_name) - - user = getpass.getuser() - project_name = re.sub(r'^[a-z0-9\-_]', os.path.basename(save_dir), '') - if not project_name: - project_name = 'unnamed' - - return generate_id(user, 'default', project_name, run_name) - - @classmethod - def compute_model_hash(cls, export_path): - """ - Computes the hash of an exported model. This is a gfile version of - twitter.mlmetastore.common.versioning.compute_hash. The two functions should generate - the same hash when given the same model. - - Args: - export_path (str): - The path to the exported model. - - Returns: - (str) hash of the exported model - """ - paths = [] - for path, subdirs, files in tf.io.gfile.walk(export_path): - for name in sorted(files): - paths.append(os.path.join(path, name)) - - paths.sort() - hash_object = hashlib.new('sha1') - - for path in paths: - with tf.io.gfile.GFile(path, "rb") as file: - hash_object.update(file.read()) - - return hash_object.hexdigest() diff --git a/twml/twml/trainers/__init__.docx b/twml/twml/trainers/__init__.docx new file mode 100644 index 000000000..af8c4e285 Binary files /dev/null and b/twml/twml/trainers/__init__.docx differ diff --git a/twml/twml/trainers/__init__.py b/twml/twml/trainers/__init__.py deleted file mode 100644 index e6664d9a6..000000000 --- a/twml/twml/trainers/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# pylint: disable=wildcard-import -""" -This module contains the Trainer and DataRecordTrainer classes. -Trainers wrap a -`tf.estimator.Estimator -`_. -""" - -from .trainer import Trainer # noqa: F401 -from .data_record_trainer import DataRecordTrainer # noqa: F401 diff --git a/twml/twml/trainers/data_record_trainer.docx b/twml/twml/trainers/data_record_trainer.docx new file mode 100644 index 000000000..112339a9f Binary files /dev/null and b/twml/twml/trainers/data_record_trainer.docx differ diff --git a/twml/twml/trainers/data_record_trainer.py b/twml/twml/trainers/data_record_trainer.py deleted file mode 100644 index 76dd16f80..000000000 --- a/twml/twml/trainers/data_record_trainer.py +++ /dev/null @@ -1,821 +0,0 @@ -# pylint: disable=arguments-differ, invalid-name -""" -This module contains the ``DataRecordTrainer``. -Unlike the parent ``Trainer`` class, the ``DataRecordTrainer`` -is used specifically for processing data records. -It abstracts away a lot of the intricacies of working with DataRecords. -`DataRecord `_ is the main piping format for data samples. 
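The ``compute_model_hash`` helper removed just above has a simple contract: collect every file under the export directory, sort the paths, and feed the raw bytes through SHA-1. A rough plain-filesystem sketch of the same scheme (using ``os.walk`` and ``hashlib`` instead of ``tf.io.gfile``, so it only covers local paths)::

    import hashlib
    import os

    def compute_model_hash(export_path):
        # Same scheme as the deleted gfile-based helper: sorted paths, SHA-1 over file contents.
        paths = []
        for root, _subdirs, files in os.walk(export_path):
            for name in sorted(files):
                paths.append(os.path.join(root, name))
        paths.sort()

        digest = hashlib.new('sha1')
        for path in paths:
            with open(path, 'rb') as handle:
                digest.update(handle.read())
        return digest.hexdigest()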
-The `DataRecordTrainer` assumes training data and production responses and requests -to be organized as the `Thrift prediction service API - -A ``DataRecord`` is a Thrift struct that defines how to encode the data: - -:: - - struct DataRecord { - 1: optional set binaryFeatures; // stores BINARY features - 2: optional map continuousFeatures; // stores CONTINUOUS features - 3: optional map discreteFeatures; // stores DISCRETE features - 4: optional map stringFeatures; // stores STRING features - 5: optional map> sparseBinaryFeatures; // stores sparse BINARY features - 6: optional map> sparseContinuousFeatures; // sparse CONTINUOUS feature - 7: optional map blobFeatures; // stores features as BLOBs (binary large objects) - 8: optional map tensors; // stores TENSOR features - 9: optional map sparseTensors; // stores SPARSE_TENSOR features - } - - -A significant portion of Twitter data is hydrated -and then temporarily stored on HDFS as DataRecords. -The files are compressed (.gz or .lzo) partitions of data records. -These form supervised datasets. Each sample captures the relationship -between input and output (cause and effect). -To create your own dataset, please see https://github.com/twitter/elephant-bird. - -The default ``DataRecordTrainer.[train,evaluate,learn]()`` reads these datarecords. -The data is a read from multiple ``part-*.[compression]`` files. -The default behavior of ``DataRecordTrainer`` is to read sparse features from ``DataRecords``. -This is a legacy default piping format at Twitter. -The ``DataRecordTrainer`` is flexible enough for research and yet simple enough -for a new beginner ML practioner. - -By means of the feature string to key hashing function, -the ``[train,eval]_feature_config`` constructor arguments -control which features can be used as sample labels, sample weights, -or sample features. -Samples ids, and feature keys, feature values and feature weights -can be skipped, included, excluded or used as labels, weights, or features. -This allows you to easily define and control sparse distributions of -named features. - -Yet sparse data is difficult to work with. We are currently working to -optimize the sparse operations due to inefficiencies in the gradient descent -and parameter update processes. There are efforts underway -to minimize the footprint of sparse data as it is inefficient to process. -CPUs and GPUs much prefer dense tensor data. -""" - -import datetime - -import tensorflow.compat.v1 as tf -from twitter.deepbird.io.dal import dal_to_hdfs_path, is_dal_path -import twml -from twml.trainers import Trainer -from twml.contrib.feature_importances.feature_importances import ( - compute_feature_importances, - TREE, - write_feature_importances_to_hdfs, - write_feature_importances_to_ml_dash) -from absl import logging - - -class DataRecordTrainer(Trainer): # pylint: disable=abstract-method - """ - The ``DataRecordTrainer`` implementation is intended to satisfy the most common use cases - at Twitter where only the build_graph methods needs to be overridden. - For this reason, ``Trainer.[train,eval]_input_fn`` methods - assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. - - For use-cases that differ from this common Twitter use-case, - further Trainer methods can be overridden. - If that still doesn't provide enough flexibility, the user can always - use the tf.estimator.Esimator or tf.session.run directly. 
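To make the container concrete: each map in the ``DataRecord`` struct above is keyed by the 64-bit hash of a feature name, so one record is essentially a bundle of id-to-value maps. A framework-free sketch of a single record (the ids below are made up; real records are Thrift objects, not dicts)::

    data_record = {
        'binaryFeatures': {4022903587512061837},             # feature is present
        'continuousFeatures': {8940262788645591883: 0.73},   # id -> double
        'discreteFeatures': {1134622275559440436: 3},        # id -> int64
        'stringFeatures': {7767168253937506381: 'en'},       # id -> string
    }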
- """ - - def __init__( - self, name, params, - build_graph_fn, - feature_config=None, - **kwargs): - """ - The DataRecordTrainer constructor builds a - ``tf.estimator.Estimator`` and stores it in self.estimator. - For this reason, DataRecordTrainer accepts the same Estimator constructor arguments. - It also accepts additional arguments to facilitate metric evaluation and multi-phase training - (init_from_dir, init_map). - - Args: - parent arguments: - See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation - for a full list of arguments accepted by the parent class. - name, params, build_graph_fn (and other parent class args): - see documentation for twml.Trainer doc. - feature_config: - An object of type FeatureConfig describing what features to decode. - Defaults to None. But it is needed in the following cases: - - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` - - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. - - **kwargs: - further kwargs can be specified and passed to the Estimator constructor. - """ - - # NOTE: DO NOT MODIFY `params` BEFORE THIS CALL. - super(DataRecordTrainer, self).__init__( - name=name, params=params, build_graph_fn=build_graph_fn, **kwargs) - - self._feature_config = feature_config - - # date range parameters common to both training and evaluation data: - hour_resolution = self.params.get("hour_resolution", 1) - data_threads = self.params.get("data_threads", 4) - datetime_format = self.params.get("datetime_format", "%Y/%m/%d") - - # retrieve the desired training dataset files - self._train_files = self.build_files_list( - files_list_path=self.params.get("train_files_list", None), - data_dir=self.params.get("train_data_dir", None), - start_datetime=self.params.get("train_start_datetime", None), - end_datetime=self.params.get("train_end_datetime", None), - datetime_format=datetime_format, data_threads=data_threads, - hour_resolution=hour_resolution, maybe_save=self.is_chief(), - overwrite=self.params.get("train_overwrite_files_list", False), - ) - - # retrieve the desired evaluation dataset files - eval_name = self.params.get("eval_name", None) - - if eval_name == "train": - self._eval_files = self._train_files - else: - self._eval_files = self.build_files_list( - files_list_path=self.params.get("eval_files_list", None), - data_dir=self.params.get("eval_data_dir", None), - start_datetime=self.params.get("eval_start_datetime", None), - end_datetime=self.params.get("eval_end_datetime", None), - datetime_format=datetime_format, data_threads=data_threads, - hour_resolution=hour_resolution, maybe_save=self.is_chief(), - overwrite=self.params.get("eval_overwrite_files_list", False), - ) - - if not self.params.get("allow_train_eval_overlap"): - # if there is overlap between train and eval, error out! 
- if self._train_files and self._eval_files: - overlap_files = set(self._train_files) & set(self._eval_files) - else: - overlap_files = set() - if overlap_files: - raise ValueError("There is an overlap between train and eval files:\n %s" % - (overlap_files)) - - @staticmethod - def build_hdfs_files_list( - files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite): - if files_list_path: - files_list_path = twml.util.preprocess_path(files_list_path) - - if isinstance(start_datetime, datetime.datetime): - start_datetime = start_datetime.strftime(datetime_format) - if isinstance(end_datetime, datetime.datetime): - end_datetime = end_datetime.strftime(datetime_format) - - list_files_by_datetime_args = { - "base_path": data_dir, - "start_datetime": start_datetime, - "end_datetime": end_datetime, - "datetime_prefix_format": datetime_format, - "extension": "lzo", - "parallelism": data_threads, - "hour_resolution": hour_resolution, - "sort": True, - } - - # no cache of data file paths, just get the list by scraping the directory - if not files_list_path or not tf.io.gfile.exists(files_list_path): - # twml.util.list_files_by_datetime returns None if data_dir is None. - # twml.util.list_files_by_datetime passes through data_dir if data_dir is a list - files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args) - else: - # the cached data file paths file exists. - files_info = twml.util.read_file(files_list_path, decode="json") - # use the cached list if data params match current params, - # or if current params are None - # Not including None checks for datetime_format and hour_resolution, - # since those are shared between eval and training. - if (all(param is None for param in [data_dir, start_datetime, end_datetime]) or - (files_info["data_dir"] == data_dir and - files_info["start_datetime"] == start_datetime and - files_info["end_datetime"] == end_datetime and - files_info["datetime_format"] == datetime_format and - files_info["hour_resolution"] == hour_resolution)): - files_list = files_info["files"] - elif overwrite: - # current params are not none and don't match saved params - # `overwrite` indicates we should thus update the list - files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args) - else: - # dont update the cached list - raise ValueError("Information in files_list is inconsistent with provided args.\n" - "Did you intend to overwrite files_list using " - "--train.overwrite_files_list or --eval.overwrite_files_list?\n" - "If you instead want to use the paths in files_list, ensure that " - "data_dir, start_datetime, and end_datetime are None.") - - if maybe_save and files_list_path and (overwrite or not tf.io.gfile.exists(files_list_path)): - save_dict = {} - save_dict["files"] = files_list - save_dict["data_dir"] = data_dir - save_dict["start_datetime"] = start_datetime - save_dict["end_datetime"] = end_datetime - save_dict["datetime_format"] = datetime_format - save_dict["hour_resolution"] = hour_resolution - twml.util.write_file(files_list_path, save_dict, encode="json") - - return files_list - - @staticmethod - def build_files_list(files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite): - ''' - When specifying DAL datasets, only data_dir, start_dateime, and end_datetime - should be given with the format: - - dal://{cluster}/{role}/{dataset_name}/{env} - - ''' - if not data_dir or not 
is_dal_path(data_dir): - logging.warn(f"Please consider specifying a dal:// dataset rather than passing a physical hdfs path.") - return DataRecordTrainer.build_hdfs_files_list( - files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite) - - del datetime_format - del data_threads - del hour_resolution - del maybe_save - del overwrite - - return dal_to_hdfs_path( - path=data_dir, - start_datetime=start_datetime, - end_datetime=end_datetime, - ) - - @property - def train_files(self): - return self._train_files - - @property - def eval_files(self): - return self._eval_files - - @staticmethod - def add_parser_arguments(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `Trainer code <_modules/twml/trainers/trainer.html#Trainer.add_parser_arguments>`_ - and `DataRecordTrainer code - <_modules/twml/trainers/trainer.html#DataRecordTrainer.add_parser_arguments>`_ - for a list and description of all cmd-line arguments. - - Args: - learning_rate_decay: - Defaults to False. When True, parses learning rate decay arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - parser = super(DataRecordTrainer, DataRecordTrainer).add_parser_arguments() - parser.add_argument( - "--train.files_list", "--train_files_list", type=str, default=None, - dest="train_files_list", - help="Path for a json file storing information on training data.\n" - "Specifically, the file at files_list should contain the dataset parameters " - "for constructing the list of data files, and the list of data file paths.\n" - "If the json file does not exist, other args are used to construct the " - "training files list, and that list will be saved to the indicated json file.\n" - "If the json file does exist, and current args are consistent with " - "saved args, or are all None, then the saved files list will be used.\n" - "If current args are not consistent with the saved args, then error out " - "if train_overwrite_files_list==False, else overwrite files_list with " - "a newly constructed list.") - parser.add_argument( - "--train.overwrite_files_list", "--train_overwrite_files_list", action="store_true", default=False, - dest="train_overwrite_files_list", - help="When the --train.files_list param is used, indicates whether to " - "overwrite the existing --train.files_list when there are differences " - "between the current and saved dataset args. Default (False) is to " - "error out if files_list exists and differs from current params.") - parser.add_argument( - "--train.data_dir", "--train_data_dir", type=str, default=None, - dest="train_data_dir", - help="Path to the training data directory." - "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " - "and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--train.start_date", "--train_start_datetime", - type=str, default=None, - dest="train_start_datetime", - help="Starting date for training inside the train data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--train.end_date", "--train_end_datetime", type=str, default=None, - dest="train_end_datetime", - help="Ending date for training inside the train data dir." - "The end datetime is inclusive." - "e.g. 
2019/01/15") - parser.add_argument( - "--eval.files_list", "--eval_files_list", type=str, default=None, - dest="eval_files_list", - help="Path for a json file storing information on evaluation data.\n" - "Specifically, the file at files_list should contain the dataset parameters " - "for constructing the list of data files, and the list of data file paths.\n" - "If the json file does not exist, other args are used to construct the " - "evaluation files list, and that list will be saved to the indicated json file.\n" - "If the json file does exist, and current args are consistent with " - "saved args, or are all None, then the saved files list will be used.\n" - "If current args are not consistent with the saved args, then error out " - "if eval_overwrite_files_list==False, else overwrite files_list with " - "a newly constructed list.") - parser.add_argument( - "--eval.overwrite_files_list", "--eval_overwrite_files_list", action="store_true", default=False, - dest="eval_overwrite_files_list", - help="When the --eval.files_list param is used, indicates whether to " - "overwrite the existing --eval.files_list when there are differences " - "between the current and saved dataset args. Default (False) is to " - "error out if files_list exists and differs from current params.") - parser.add_argument( - "--eval.data_dir", "--eval_data_dir", type=str, default=None, - dest="eval_data_dir", - help="Path to the cross-validation data directory." - "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " - "and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--eval.start_date", "--eval_start_datetime", - type=str, default=None, - dest="eval_start_datetime", - help="Starting date for evaluating inside the eval data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--eval.end_date", "--eval_end_datetime", type=str, default=None, - dest="eval_end_datetime", - help="Ending date for evaluating inside the eval data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--datetime_format", type=str, default="%Y/%m/%d", - help="Date format for training and evaluation datasets." - "Has to be a format that is understood by python datetime." - "e.g. %%Y/%%m/%%d for 2019/01/15." - "Used only if {train/eval}.{start/end}_date are provided.") - parser.add_argument( - "--hour_resolution", type=int, default=None, - help="Specify the hourly resolution of the stored data.") - parser.add_argument( - "--data_spec", type=str, required=True, - help="Path to data specification JSON file. This file is used to decode DataRecords") - parser.add_argument( - "--train.keep_rate", "--train_keep_rate", type=float, default=None, - dest="train_keep_rate", - help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ - distribution with p = 1 - keep_rate.") - parser.add_argument( - "--eval.keep_rate", "--eval_keep_rate", type=float, default=None, - dest="eval_keep_rate", - help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ - distribution with p = 1 - keep_rate.") - parser.add_argument( - "--train.parts_downsampling_rate", "--train_parts_downsampling_rate", - dest="train_parts_downsampling_rate", - type=float, default=None, - help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ - files. 
For example, a value of 0.2 means only 20 percent of part files become part of the \ - dataset.") - parser.add_argument( - "--eval.parts_downsampling_rate", "--eval_parts_downsampling_rate", - dest="eval_parts_downsampling_rate", - type=float, default=None, - help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ - files. For example, a value of 0.2 means only 20 percent of part files become part of the \ - dataset.") - parser.add_argument( - "--allow_train_eval_overlap", - dest="allow_train_eval_overlap", - action="store_true", - help="Allow overlap between train and eval datasets." - ) - parser.add_argument( - "--eval_name", type=str, default=None, - help="String denoting what we want to name the eval. If this is `train`, then we eval on \ - the training dataset." - ) - return parser - - def contrib_run_feature_importances(self, feature_importances_parse_fn=None, write_to_hdfs=True, extra_groups=None, datarecord_filter_fn=None, datarecord_filter_run_name=None): - """Compute feature importances on a trained model (this is a contrib feature) - Args: - feature_importances_parse_fn (fn): The same parse_fn that we use for training/evaluation. - Defaults to feature_config.get_parse_fn() - write_to_hdfs (bool): Setting this to True writes the feature importance metrics to HDFS - extra_groups (dict>): A dictionary mapping the name of extra feature groups to the list of - the names of the features in the group - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - logging.info("Computing feature importance") - algorithm = self._params.feature_importance_algorithm - - kwargs = {} - if algorithm == TREE: - kwargs["split_feature_group_on_period"] = self._params.split_feature_group_on_period - kwargs["stopping_metric"] = self._params.feature_importance_metric - kwargs["sensitivity"] = self._params.feature_importance_sensitivity - kwargs["dont_build_tree"] = self._params.dont_build_tree - kwargs["extra_groups"] = extra_groups - if self._params.feature_importance_is_metric_larger_the_better: - # The user has specified that the stopping metric is one where larger values are better (e.g. ROC_AUC) - kwargs["is_metric_larger_the_better"] = True - elif self._params.feature_importance_is_metric_smaller_the_better: - # The user has specified that the stopping metric is one where smaller values are better (e.g. 
LOSS) - kwargs["is_metric_larger_the_better"] = False - else: - # The user has not specified which direction is better for the stopping metric - kwargs["is_metric_larger_the_better"] = None - logging.info("Using the tree algorithm with kwargs {}".format(kwargs)) - - feature_importances = compute_feature_importances( - trainer=self, - data_dir=self._params.get('feature_importance_data_dir'), - feature_config=self._feature_config, - algorithm=algorithm, - record_count=self._params.feature_importance_example_count, - parse_fn=feature_importances_parse_fn, - datarecord_filter_fn=datarecord_filter_fn, - **kwargs) - - if not feature_importances: - logging.info("Feature importances returned None") - else: - if write_to_hdfs: - logging.info("Writing feature importance to HDFS") - write_feature_importances_to_hdfs( - trainer=self, - feature_importances=feature_importances, - output_path=datarecord_filter_run_name, - metric=self._params.get('feature_importance_metric')) - else: - logging.info("Not writing feature importance to HDFS") - - logging.info("Writing feature importance to ML Metastore") - write_feature_importances_to_ml_dash( - trainer=self, feature_importances=feature_importances) - return feature_importances - - def export_model(self, serving_input_receiver_fn=None, - export_output_fn=None, - export_dir=None, checkpoint_path=None, - feature_spec=None): - """ - Export the model for prediction. Typically, the exported model - will later be run in production servers. This method is called - by the user to export the PREDICT graph to disk. - - Internally, this method calls `tf.estimator.Estimator.export_savedmodel - `_. - - Args: - serving_input_receiver_fn (Function): - function preparing the model for inference requests. - If not set; defaults to the the serving input receiver fn set by the FeatureConfig. - export_output_fn (Function): - Function to export the graph_output (output of build_graph) for - prediction. Takes a graph_output dict as sole argument and returns - the export_output_fns dict. - Defaults to ``twml.export_output_fns.batch_prediction_continuous_output_fn``. - export_dir: - directory to export a SavedModel for prediction servers. - Defaults to ``[save_dir]/exported_models``. - checkpoint_path: - the checkpoint path to export. If None (the default), the most recent checkpoint - found within the model directory ``save_dir`` is chosen. - - Returns: - The export directory where the PREDICT graph is saved. - """ - if serving_input_receiver_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - serving_input_receiver_fn = self._feature_config.get_serving_input_receiver_fn() - - if feature_spec is None: - if self._feature_config is None: - raise ValueError("feature_spec can not be inferred." 
- "Please pass feature_spec=feature_config.get_feature_spec() to the trainer.export_model method") - else: - feature_spec = self._feature_config.get_feature_spec() - - if isinstance(serving_input_receiver_fn, twml.feature_config.FeatureConfig): - raise ValueError("Cannot pass FeatureConfig as a parameter to serving_input_receiver_fn") - elif not callable(serving_input_receiver_fn): - raise ValueError("Expecting Function for serving_input_receiver_fn") - - if export_output_fn is None: - export_output_fn = twml.export_output_fns.batch_prediction_continuous_output_fn - - return super(DataRecordTrainer, self).export_model( - export_dir=export_dir, - serving_input_receiver_fn=serving_input_receiver_fn, - checkpoint_path=checkpoint_path, - export_output_fn=export_output_fn, - feature_spec=feature_spec, - ) - - def get_train_input_fn( - self, parse_fn=None, repeat=None, shuffle=True, interleave=True, shuffle_files=None, - initializable=False, log_tf_data_summaries=False, **kwargs): - """ - This method is used to create input function used by estimator.train(). - - Args: - parse_fn: - Function to parse a data record into a set of features. - Defaults to the parser returned by the FeatureConfig selected - repeat (optional): - Specifies if the dataset is to be repeated. Defaults to `params.train_steps > 0`. - This ensures the training is run for atleast `params.train_steps`. - Toggling this to `False` results in training finishing when one of the following happens: - - The entire dataset has been trained upon once. - - `params.train_steps` has been reached. - shuffle (optional): - Specifies if the files and records in the files need to be shuffled. - When `True`, files are shuffled, and records of each files are shuffled. - When `False`, files are read in alpha-numerical order. Also when `False` - the dataset is sharded among workers for Hogwild and distributed training - if no sharding configuration is provided in `params.train_dataset_shards`. - Defaults to `True`. - interleave (optional): - Specifies if records from multiple files need to be interleaved in parallel. - Defaults to `True`. - shuffle_files (optional): - Shuffle the list of files. Defaults to 'Shuffle' if not provided. - initializable (optional): - A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value - (false) is used for most plain iterators. - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - An input_fn that can be consumed by `estimator.train()`. 
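The returned closure delegates to ``twml.input_fns.default_input_fn``, which additionally handles keep_rate, part-file downsampling, sharding and the LZO-compressed DataRecord readers. As a conceptual sketch only, the shuffle/interleave/repeat behaviour described above corresponds to a plain ``tf.data`` pipeline along these lines (``TFRecordDataset`` stands in for the real DataRecord reader)::

    import tensorflow.compat.v1 as tf

    def make_input_fn(files, parse_fn, batch_size, shuffle=True, repeat=False):
        def input_fn():
            dataset = tf.data.Dataset.from_tensor_slices(files)
            if shuffle:
                dataset = dataset.shuffle(len(files))     # shuffle the file list
            dataset = dataset.interleave(                 # read several part files in parallel
                tf.data.TFRecordDataset, cycle_length=4)
            if shuffle:
                dataset = dataset.shuffle(10000)          # shuffle records across files
            dataset = dataset.batch(batch_size)
            dataset = dataset.map(parse_fn)               # decode a batch into feature tensors
            return dataset.repeat() if repeat else dataset
        return input_fn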
- """ - if parse_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - parse_fn = self._feature_config.get_parse_fn() - - if not callable(parse_fn): - raise ValueError("Expecting parse_fn to be a function.") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - if repeat is None: - repeat = self.params.train_steps > 0 or self.params.get('distributed', False) - - if not shuffle and self.num_workers > 1 and self.params.train_dataset_shards is None: - num_shards = self.num_workers - shard_index = self.worker_index - else: - num_shards = self.params.train_dataset_shards - shard_index = self.params.train_dataset_shard_index - - return lambda: twml.input_fns.default_input_fn( - files=self._train_files, - batch_size=self.params.train_batch_size, - parse_fn=parse_fn, - num_threads=self.params.num_threads, - repeat=repeat, - keep_rate=self.params.train_keep_rate, - parts_downsampling_rate=self.params.train_parts_downsampling_rate, - shards=num_shards, - shard_index=shard_index, - shuffle=shuffle, - shuffle_files=(shuffle if shuffle_files is None else shuffle_files), - interleave=interleave, - initializable=initializable, - log_tf_data_summaries=log_tf_data_summaries, - **kwargs) - - def get_eval_input_fn( - self, parse_fn=None, repeat=None, - shuffle=True, interleave=True, - shuffle_files=None, initializable=False, log_tf_data_summaries=False, **kwargs): - """ - This method is used to create input function used by estimator.eval(). - - Args: - parse_fn: - Function to parse a data record into a set of features. - Defaults to twml.parsers.get_sparse_parse_fn(feature_config). - repeat (optional): - Specifies if the dataset is to be repeated. Defaults to `params.eval_steps > 0`. - This ensures the evaluation is run for atleast `params.eval_steps`. - Toggling this to `False` results in evaluation finishing when one of the following happens: - - The entire dataset has been evaled upon once. - - `params.eval_steps` has been reached. - shuffle (optional): - Specifies if the files and records in the files need to be shuffled. - When `False`, files are read in alpha-numerical order. - When `True`, files are shuffled, and records of each files are shuffled. - Defaults to `True`. - interleave (optional): - Specifies if records from multiple files need to be interleaved in parallel. - Defaults to `True`. - shuffle_files (optional): - Shuffles the list of files. Defaults to 'Shuffle' if not provided. - initializable (optional): - A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value - (false) is used for most plain iterators. - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - An input_fn that can be consumed by `estimator.eval()`. 
- """ - if parse_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - parse_fn = self._feature_config.get_parse_fn() - - if not self._eval_files: - raise ValueError("`eval_files` was not present in `params` passed to `DataRecordTrainer`") - - if not callable(parse_fn): - raise ValueError("Expecting parse_fn to be a function.") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - if repeat is None: - repeat = self.params.eval_steps > 0 - - return lambda: twml.input_fns.default_input_fn( - files=self._eval_files, - batch_size=self.params.eval_batch_size, - parse_fn=parse_fn, - num_threads=self.params.num_threads, - repeat=repeat, - keep_rate=self.params.eval_keep_rate, - parts_downsampling_rate=self.params.eval_parts_downsampling_rate, - shuffle=shuffle, - shuffle_files=(shuffle if shuffle_files is None else shuffle_files), - interleave=interleave, - initializable=initializable, - log_tf_data_summaries=log_tf_data_summaries, - **kwargs - ) - - def _assert_train_files(self): - if not self._train_files: - raise ValueError("train.data_dir was not set in params passed to DataRecordTrainer.") - - def _assert_eval_files(self): - if not self._eval_files: - raise ValueError("eval.data_dir was not set in params passed to DataRecordTrainer.") - - def train(self, input_fn=None, steps=None, hooks=None): - """ - Makes input functions optional. input_fn defaults to self.get_train_input_fn(). - See Trainer for more detailed documentation documentation. - """ - if input_fn is None: - self._assert_train_files() - input_fn = input_fn if input_fn else self.get_train_input_fn() - super(DataRecordTrainer, self).train(input_fn=input_fn, steps=steps, hooks=hooks) - - def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): - """ - Makes input functions optional. input_fn defaults to self.get_eval_input_fn(). - See Trainer for more detailed documentation. - """ - if input_fn is None: - self._assert_eval_files() - input_fn = input_fn if input_fn else self.get_eval_input_fn(repeat=False) - return super(DataRecordTrainer, self).evaluate( - input_fn=input_fn, - steps=steps, - hooks=hooks, - name=name - ) - - def learn(self, train_input_fn=None, eval_input_fn=None, **kwargs): - """ - Overrides ``Trainer.learn`` to make ``input_fn`` functions optional. - Respectively, ``train_input_fn`` and ``eval_input_fn`` default to - ``self.train_input_fn`` and ``self.eval_input_fn``. - See ``Trainer.learn`` for more detailed documentation. - """ - if train_input_fn is None: - self._assert_train_files() - if eval_input_fn is None: - self._assert_eval_files() - train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() - eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() - - super(DataRecordTrainer, self).learn( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - **kwargs - ) - - def train_and_evaluate(self, - train_input_fn=None, eval_input_fn=None, - **kwargs): - """ - Overrides ``Trainer.train_and_evaluate`` to make ``input_fn`` functions optional. - Respectively, ``train_input_fn`` and ``eval_input_fn`` default to - ``self.train_input_fn`` and ``self.eval_input_fn``. - See ``Trainer.train_and_evaluate`` for detailed documentation. 
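Because the input functions default as described above, a typical job only wires up command-line params, a ``build_graph_fn`` and a ``FeatureConfig``. A rough end-to-end sketch (all names besides the twml entry points are illustrative)::

    from twml.trainers import DataRecordTrainer

    def build_graph(features, label, mode, params, config=None):
        ...  # return {'output': ..., 'loss': ...} as documented in trainer.py

    parser = DataRecordTrainer.add_parser_arguments()
    params = parser.parse_args()   # e.g. --train.data_dir, --eval.data_dir, --data_spec, ...

    feature_config = ...  # a FeatureConfig describing which features/labels/weights to decode

    trainer = DataRecordTrainer(
        name='example_model',
        params=params,
        build_graph_fn=build_graph,
        feature_config=feature_config)

    # Train and evaluate with the default DataRecord input functions.
    trainer.learn()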
- """ - if train_input_fn is None: - self._assert_train_files() - if eval_input_fn is None: - self._assert_eval_files() - train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() - eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() - - super(DataRecordTrainer, self).train_and_evaluate( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - **kwargs - ) - - def _model_fn(self, features, labels, mode, params, config=None): - """ - Overrides the _model_fn to correct for the features shape of the sparse features - extracted with the contrib.FeatureConfig - """ - if isinstance(self._feature_config, twml.contrib.feature_config.FeatureConfig): - # Fix the shape of the features. The features dictionary will be modified to - # contain the shape changes. - twml.util.fix_shape_sparse(features, self._feature_config) - return super(DataRecordTrainer, self)._model_fn( - features=features, - labels=labels, - mode=mode, - params=params, - config=config - ) - - def calibrate(self, - calibrator, - input_fn=None, - steps=None, - save_calibrator=True, - hooks=None): - """ - Makes input functions optional. input_fn defaults to self.train_input_fn. - See Trainer for more detailed documentation. - """ - if input_fn is None: - self._assert_train_files() - input_fn = input_fn if input_fn else self.get_train_input_fn() - super(DataRecordTrainer, self).calibrate(calibrator=calibrator, - input_fn=input_fn, - steps=steps, - save_calibrator=save_calibrator, - hooks=hooks) - - def save_checkpoints_and_export_model(self, - serving_input_receiver_fn, - export_output_fn=None, - export_dir=None, - checkpoint_path=None, - input_fn=None): - """ - Exports saved module after saving checkpoint to save_dir. - Please note that to use this method, you need to assign a loss to the output - of the build_graph (for the train mode). - See export_model for more detailed information. - """ - self.train(input_fn=input_fn, steps=1) - self.export_model(serving_input_receiver_fn, export_output_fn, export_dir, checkpoint_path) - - def save_checkpoints_and_evaluate(self, - input_fn=None, - steps=None, - hooks=None, - name=None): - """ - Evaluates model after saving checkpoint to save_dir. - Please note that to use this method, you need to assign a loss to the output - of the build_graph (for the train mode). - See evaluate for more detailed information. - """ - self.train(input_fn=input_fn, steps=1) - self.evaluate(input_fn, steps, hooks, name) diff --git a/twml/twml/trainers/trainer.docx b/twml/twml/trainers/trainer.docx new file mode 100644 index 000000000..9e343ccfe Binary files /dev/null and b/twml/twml/trainers/trainer.docx differ diff --git a/twml/twml/trainers/trainer.py b/twml/twml/trainers/trainer.py deleted file mode 100644 index e51b4e0fd..000000000 --- a/twml/twml/trainers/trainer.py +++ /dev/null @@ -1,1777 +0,0 @@ -# pylint: disable=too-many-lines -""" -``twml.trainers.Trainer`` is a wrapper around `tf.estimator.Estimator -`_ -to expose an easier to use API by -hiding rarely used config knobs and supplying default values. - -The `Trainer` facilitates multi-phase training commonly used at Twitter: e.g. -MDL calibration -> MLP training -> Isotonic calibration. -The `Trainer` also facilitates hyperparameters tuning, -with its simple `add_parser_arguments()` method. - -Learning rate decay functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Please note that we have four learning rate decay functions to choose from. 
-Additionally, each trainer can only take one learning rate decay function and its parameters. -If that is not the case, it will throw an error. -Also, please note that the learning rate decay is a positional argument and should be placed as -the last argument to the trainer, as you can see in the example above. -The four learning decays options are: - -1. inverse_learning_rate_decay: - - The function returns the decayed learning rate. It is computed as: - - :: - - decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /decay_step) - final_decayed_learning_rate = max(decayed_learning_rate, min_learning_rate) - - -2. polynomial_learning_rate_decay: - - The function returns the decayed learning rate. It is computed as: - - :: - - global_step = min(global_step, decay_steps) - decayed_learning_rate = (learning_rate - end_learning_rate) * - (1 - global_step / decay_steps) ^ (power) + - end_learning_rate - - -3. piecewise_constant_learning_rate_decay: - - Piecewise constant from boundaries and interval values. - - Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 for - the next 10000 steps, and 0.1 for any additional steps. - - :: - - global_step = tf.Variable(0, trainable=False) - boundaries = [100000, 110000] - values = [1.0, 0.5, 0.1] - learning_rate = tf.train.piecewise_constant(global_step, boundaries, values) - -4. exponential_learning_rate_decay: - - The function returns the decayed learning rate. It is computed as: - - :: - - decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) - -""" - -import datetime -import functools -import math -from operator import itemgetter -import os -import pprint as pp -import random -from string import Template -import subprocess -import sys -import time -from threading import Thread - -from twitter.common.metrics import AtomicGauge -from twitter.deepbird.stats_server import utils as stats_server_utils -from twitter.deepbird.stats_server.stats_exporter import StatsExporter -from twitter.ml.common import metrics -from twitter.ml.common.kubernetes import kubectl_delete_by_name, Resource -from twitter.ml.twml.status import get_distributed_training_job_status, TrainingJobStatus - -from absl import logging -from twml.optimizers import LazyAdamOptimizer, optimize_loss, OPTIMIZER_SUMMARIES -from twml.contrib.optimizers import DeepGradientCompressionOptimizer -from twml.tracking import ExperimentTracker -from twml.util import (delete_file_or_dir, - get_distributed_training_job_path, - sanitize_hdfs_path) -try: - from urllib import quote as encode_url -except ImportError: - from urllib.parse import quote as encode_url -import tensorflow.compat.v1 as tf -import tensorflow -import tensorflow_hub as hub - -import twitter.ml.twml.kubernetes.status as k8s_status -import twml -import twml.export_output_fns -import twml.learning_rate_decay -import twml.metrics - - -_CLUSTER_TEMPLATE = Template('''{ - "cluster": { - "ps": [$PS], - "chief": [$CHIEF], - "worker": [$WORKER] - }, - "task": {"type": "$TYPE", "index": $INDEX} -} -''') - - -def init_from_checkpoint(init_dir, init_map): - """ - Wrapper around tf.train.init_from_checkpoint - """ - if init_dir: - init_dir = sanitize_hdfs_path(init_dir) - tf.train.init_from_checkpoint(init_dir, init_map) - - -class Trainer(object): - """ - This class wraps ``tf.estimator.Estimator`` to make construction, saving, and loading easier. 
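The two closed-form decays above are easy to sanity-check numerically. A plain-Python restatement of the inverse and exponential variants, with parameter names matching the formulas quoted in the module docstring::

    def inverse_decay(learning_rate, global_step, decay_rate, decay_step, min_learning_rate=0.0):
        decayed = learning_rate / (1 + decay_rate * global_step / float(decay_step))
        return max(decayed, min_learning_rate)

    def exponential_decay(learning_rate, global_step, decay_rate, decay_steps):
        return learning_rate * decay_rate ** (global_step / float(decay_steps))

    # e.g. learning_rate=0.1, decay_rate=0.5, decay_step(s)=1000, at global_step=2000:
    #   inverse_decay     -> 0.1 / (1 + 0.5 * 2) = 0.05
    #   exponential_decay -> 0.1 * 0.5 ** 2      = 0.025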
- Supports multi-phase training (for example, use a Trainer for MDL calibration, then - another for training the rest of the model, then another for isotonic calibration). - The Trainer also implements a training and evaluation loop via the ``learn()`` method. - Each Trainer is associated to a fixed set of hyper parameters (params), and a single model - specified by ``build_graph``. Given these constraints, a single Trainer can be called - multiple times for training and evaluation over multiple epochs. - - However, if you intend to try different sets of hyper-parameters, we recommend you instantiate - a different Trainer for each such experiment. That way, each experiment can be tracked - in a different ``save_dir``. Indeed, after calling ``learn``, a Trainer's save_dir will contain - checkpoints of the model (its graph, and variables), and the history of metrics (for example, - evaluation accuracy at each epoch), and other store observations like the average time per step. - The latter metrics can be viewed by pointing - TensorBoard to the save_dir and accessing TensorBoard via your browser. - """ - - def __init__(self, name, params, build_graph_fn, - metric_fn=None, - optimize_loss_fn=None, - run_config=None, - save_dir=None, - init_from_dir=None, - init_map=None, - warm_start_from=None, - profiler_steps=None, - **kwargs): - """ - - Args: - name (String): - string name of this estimator; used as scope names for variables and tensors. - params (HParams, Namespace, or Dict): - hyper-parameters to be passed to Estimator constructor. - Must include params.train_batch_size and params.eval_batch_size. - Note that params is passed to twml.util.convert_to_hparams() to produce an HParams. - build_graph_fn: - A function for building tensorflow graphs. - This matches TensorFlow Estimator's model_fn signature. - For example, - - .. code-block:: python - - def build_graph(features, label, mode, params, config=None): - # Implements a simple binary logistic regression model - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - - logits = twml.layers.full_sparse(sparse_tf, 1 << params.input_size_bits, 1) - - if mode == 'infer': - loss = None - else: - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits) - loss = twml.util.weighted_average(loss, features['weights']) - - output = tf.nn.sigmoid(logits) - - return {'output': output, 'loss': loss} - - Args: - features (dict of Tensor keyed by a string name): - input tensors. - mode (tf.estimator.ModeKeys / String): - one of 'train', 'eval', 'infer'. - label (Tensor): - if in ``mode == 'train'`` mode, these contain the corresponding labels for input. - params (HParams): - hyper parameters that control how to build a graph. - config: - the RunConfig object passed to Estimator constructor. - - This function is expected to return a dictionary containing the following keys: - - * 'output': a node representing model output; required. - * 'loss': (required) a loss node used for optimization; required for training and - evaluation. - * 'train_op': (optional) an operation that minimizes the loss (as output by - `tf.train.Optimizer.minimize`). If train_op is specified, train_op is used - for optimization as opposed to loss. Loss is always logged to tensorboard. - - Notes: - - * any tf.summary written inside build graph are logged to tensorboard during training. - * the ``build_graph_fn`` is called once or twice per epoch (once per training, - once per evaluation). 
All data loading (and preprocessing) logic not required - for serving should be in the ``input_fn`` passed to ``learn``, ``train``, - ``evalulate``, etc. - - optimize_loss_fn: - Defaults to Trainer.get_train_op. A function that takes params and loss as arguments - and returns a training op. The training op is used to update parameters (that is, to learn). - metric_fn: - A function that returns the eval_metric_ops dict given graph_output, labels and weights. - Defaults to None. - Use ``twml.metrics.get_binary_class_metric_fn()`` to return a ``metric_fn`` - which implements many binary classification metrics. - run_config (RunConfig): - optional configuration to be passed to Estimator constructor. Defaults to None. - save_dir (String): - optional directory where to save model checkpoints, - tensorboard event files and trained parameters. - Overwrites and defaults to run_config.model_dir. - init_from_dir (String): - optional directory to load weights from. - if set to None (the default), do not init from any directory. - init_map (map from String to String): - Must be specified if init_from_dir is specified. - Defines which scopes and variables to load. - Keys are the variables and scopes to load from the directory. - Values are the destinations (in the current graph) to load into. - See tf.init_from_checkpoint for more information. - Note that the the trainer prepends name_scope of the form `name`/model/ to the name_scope - of any variable defined inside `build_graph_fn` and this should be taken into account when - defining the values. - warm_start_from: - Optional string filepath to a checkpoint to warm-start from, - or a tf.estimator.WarmStartSettings object to fully configure warm-starting. - If the string filepath is provided instead of a WarmStartSettings, - then all variables are warm-started, and it is assumed that - vocabularies and Tensor names are unchanged. - profiler_steps (Integer): - Defaults to None. If set defines the number of steps in the - `tf.train.ProfileHook `_. - Captures CPU/GPU profiling information every ``profiler_steps`` steps or seconds. - When executing ``learn``, ``train`` or ``predict`` methods, - with ``profiler_steps`` set to a number, - a ``timeline_X.json`` file is created in the save_dir. This file contains profiling data - storedin Chrome trace format. To view stored data, use the Chrome browser to follow - these steps: - - 1) Go to the page chrome://tracing. - 2) In the upper left corner, you will find Load button. - 3) Press it and load our JSON file, which can be found in the ``save_dir`` - - *Warning*: This could create too many these json files which can be a potential problem, - e.g. for HDFS there is normally quota forfile count, so use with caution. - - Note: this argument is ignored when a non-None ``hooks`` argument is pasesd to - ``train``, ``learn``, or ``predict`` methods. The hook can be added manually by passing - ``trainer.train(..., hooks=myhooks.extend(trainer.get_train_hooks()))``, for example. - """ - - if tensorflow.__version__ >= "2.0": - RuntimeError("Trainer not yet supported for Tensorflow >= 2.0") - - self._name = name - self._build_graph_fn = build_graph_fn - self._metric_fn = metric_fn - self._tensorboard_handle = None - self._current_estimator_spec = None # holds the current estimator spec - self._profiler_steps = profiler_steps - self._export_output_fn = None - self._is_early_stopping = False - - # NOTE: Sanitize all HDFS paths first. 
- save_dir = sanitize_hdfs_path(save_dir) - init_from_dir = sanitize_hdfs_path(init_from_dir) - - # warm_start_from can be of type tf.estimator.WarmStartSettings. - if isinstance(warm_start_from, str): - warm_start_from = sanitize_hdfs_path(warm_start_from) - - # convert to twitter.deepbird.hparam.hparam.HParams object - params = twml.util.convert_to_hparams(params) - - # keep a copy of the params because calling self._estimator.params creates a deepcopy - self._params = params - self.check_params() - - self._using_hogwild = True if os.environ.get('TWML_HOGWILD_PORTS') else False - # configure Hogwild (needs to be called before RunConfig is created) - self._hogwild_setup() - - if not run_config: - session_config = tf.ConfigProto() - # By default each process tries to allocate (almost) all of the memory. - # This option ensures the gpu memory grows dynamically instead. - session_config.gpu_options.allow_growth = True # pylint: disable=no-member - - if 'TWML_NUM_CPUS' in os.environ: - num_available_cpus = int(os.environ.get("TWML_MESOS_CPU", "8")) - if params.num_mkl_threads > 1: - os.environ["OMP_NUM_THREADS"] = str(params.num_mkl_threads) - os.environ["MKL_NUM_THREADS"] = str(params.num_mkl_threads) - session_config.inter_op_parallelism_threads = num_available_cpus // params.num_mkl_threads - session_config.intra_op_parallelism_threads = params.num_mkl_threads - - run_config = tf.estimator.RunConfig( - session_config=session_config, - keep_checkpoint_max=self._params.get('keep_checkpoint_max', 20), - log_step_count_steps=10000, - save_checkpoints_secs=self._params.get('save_checkpoints_secs', 600), - tf_random_seed=self._tf_random_seed()) - elif not isinstance(run_config, tf.estimator.RunConfig): - raise ValueError("Expecting run_config argument of type None or tf.estimator.RunConfig" - "Got %s instead." % type(run_config).__name__) - elif os.environ.get('TWML_HOGWILD_PORTS'): - raise ValueError("Custom RunConfig not supported with Hogwild") - - if run_config.model_dir is None and save_dir is None: - raise ValueError( - "Expecting either save_dir or run_config.model_dir to be specified. Got None for each.") - elif run_config.model_dir is None: - run_config = run_config.replace(model_dir=save_dir) - elif save_dir is None: - save_dir = run_config.model_dir - - self._save_dir = save_dir - self.experiment_tracker = ExperimentTracker(self._params, run_config, self._save_dir) - - # Check if should delete the tsd running this training job. In certain use case when - # there are other tf operations following trainer.train_and_evaluate (or trainer.learn), - # additional state files need to be specified to ensure those steps are executed after job restart. - kwargs['gke_state_files'] = kwargs.get('gke_state_files', ['_SUCCESS']) - self._maybe_del_tsd_exit(kwargs['gke_state_files']) - logging.info("Checkpoint and event files will be saved at save_dir=%s", save_dir) - self._optimize_loss_fn = self.get_train_op if optimize_loss_fn is None else optimize_loss_fn - - # overwrite the current save_dir - if self._params.get('overwrite_save_dir') and tf.io.gfile.exists(self._save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % self._save_dir) - # if distributed or hogwild: - if self._params.get('distributed', False): - # sleep for 30 seconds to allow each worker to get to this point. 
- time.sleep(30) - if run_config.is_chief: - logging.info("Chief deleting the save_dir now") - delete_file_or_dir(self._save_dir) - # sleep for 30 seconds to allow each worker to get to this point. - time.sleep(30) - else: - delete_file_or_dir(self._save_dir) - - # Exposing stats to a /vars.json endpoint that will be collected - # by the absorber - if self._params.get('stats_port'): - try: - stats_server_utils.start_stats_server(self._params.get('stats_port'), self._save_dir) - except Exception as err: - logging.error('Failed to start the stats server. Error: %s', str(err)) - - checkpoint = os.path.join(self._save_dir, 'checkpoint') - if tf.io.gfile.exists(checkpoint): - logging.info("The provided save_dir directory %s already exists." - " Training will be resumed." - % checkpoint) - - self._maybe_restore_checkpoint = lambda: init_from_checkpoint(init_from_dir, init_map) - - if init_from_dir is not None and init_map is None: - raise ValueError("Need to provide init_map when init_from_dir is provided.") - - if not tf.io.gfile.exists(self._save_dir): - # so tensorboard can point to a directory that exists - tf.io.gfile.mkdir(self._save_dir) - - self._estimator = tf.estimator.Estimator( - model_fn=self._model_fn, - params=self._params, # HParams - config=run_config, # RunConfig - warm_start_from=warm_start_from, - model_dir=self._save_dir, # By this point it is same as run_config.model_dir - ) - - # Log parameters that are used to construct trainer. This allows people to see default values. - logging.info("Trainer constructed using the following parameters: ") - pp_params = pp.pformat(self._params.values()) - logging.info(pp_params) - - # Start TensorBoard - if self._params.get('disable_tensorboard', False): - logging.info("Skipping launching TensorBoard [--disable_tensorboard is set]") - elif "tensorboard_port" in self._params.values() and self._params.tensorboard_port is not None: - self.start_tensorboard(self._params.tensorboard_port) - - # Export gauge that will track whether a model was exported - self.stats_exporter = StatsExporter("twml.trainer") - self.export_gauge = AtomicGauge('export_model') - self.stats_exporter.register_metrics(self.export_gauge) - - def _hogwild_setup(self): - """ - Setup the parameters required for hogwild. 
- """ - self._num_workers = self._params.get('num_workers') or 1 - logging.info("NUM_WORKERS: %d", self._num_workers) - if self._num_workers <= 1: - self._ports = None - return - - # a hogwild job is considered distributed - if 'distributed' in self._params: - self._params.set_hparam('distributed', True) - else: - self._params.add_hparam('distributed', True) - - ports = os.environ.get('TWML_HOGWILD_PORTS') - if ports: - self._ports = [int(port) for port in ports.strip().split(",")] - if (self._num_workers + 1!= len(self._ports)): - raise ValueError("Number of (workers + PS) and ports need to match") - else: - if self._num_workers > 1: - raise ValueError("TWML_HOGWILD_PORTS needs to be set to use hogwild training") - - # Split the number of data threads across multiple workers - num_threads = self._params.get('num_threads') - num_threads_per_worker = int(math.ceil(float(num_threads) / self._num_workers)) - self._params.set_hparam('num_threads', num_threads_per_worker) - - hogwild_task_type = os.environ.get('TWML_HOGWILD_TASK_TYPE') - hogwild_task_id = int(os.environ.get('TWML_HOGWILD_TASK_ID')) - os.environ['TF_CONFIG'] = self._get_cluster_config(hogwild_task_type, hogwild_task_id) - - def _tf_random_seed(self): - """ Returns user set seed and deal with Hogwild multiple seeds """ - tf_random_seed = self._params.get('tf_random_seed', None) - if tf_random_seed is None: - return None - elif self.using_hogwild and os.environ.get('TWML_HOGWILD_TASK_TYPE') == 'worker': - # chief (tf_random_seed), worker_0 (tf_random_seed + 1), worker_1 (tf_random_seed + 2)... - return tf_random_seed + 1 + int(os.environ.get('TWML_HOGWILD_TASK_ID')) - else: - return tf_random_seed - - def check_params(self): - """ Verify that params has the correct key,values """ - param_values = self._params.values() - - if 'train_batch_size' in param_values: - if not isinstance(self._params.train_batch_size, int): - raise ValueError("Expecting params.train_batch_size to be an integer.") - if self._params.train_batch_size <= 0: - raise ValueError("train_batch_size needs to be positive") - else: - raise ValueError("train_batch_size needs to be present in params") - - if 'eval_batch_size' in param_values: - if not isinstance(self._params.eval_batch_size, int): - raise ValueError("Expecting params.eval_batch_size to be an integer.") - if self._params.eval_batch_size <= 0: - raise ValueError("eval_batch_size needs to be positive.") - else: - self._params.add_hparam('eval_batch_size', self._params.train_batch_size) - - if (self._params.get('distributed_training_cleanup') and - not self._params.get('distributed')): - # we only need to support training discontinuation for distributed training - # bc we are still using TSDs on GKE for distributed training - raise ValueError( - "Expecting params.distributed to be set if " - "params.distributed_training_cleanup is set." 
- ) - - def _get_cluster_config(self, name, index): - """Create a tensorflow cluster config from ports, name and index""" - host = '"localhost:%d"' - ps = host % self._ports[0] - chief = host % self._ports[1] - workers = ", ".join([host % port for port in self._ports[2:]]) - config = _CLUSTER_TEMPLATE.substitute( - PS=ps, - CHIEF=chief, - WORKER=workers, - TYPE=name, - INDEX=index, - ) - return config - - @property - def current_estimator_spec(self): - """ - returns the current estimator (warning: often reset) - """ - return self._current_estimator_spec - - @property - def estimator(self): - """ returns estimator encapsulated by Trainer """ - return self._estimator - - @property - def num_workers(self): - """ returns number of workers """ - return self._estimator.config.num_worker_replicas - - @property - def worker_index(self): - """ - returns index of worker in the cluster - chief has index 0 - non-chief workers have indices 1 through (num_workers - 1) - """ - return self._estimator.config.global_id_in_cluster - - @property - def using_hogwild(self): - """ returns a bool indicating whether hogwild is being used """ - return self._using_hogwild - - def set_estimator(self, estimator): - """ sets the estimator used internally by Trainer """ - if not isinstance(estimator, tf.estimator.Estimator): - raise ValueError("Expecting tf.estimator.Estimator") - self._estimator = estimator - self._params = self.estimator.params - - @property - def params(self): - """ - returns the hyper-parameters passed to the constructor. - """ - return self._params - - @staticmethod - def add_parser_arguments(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - return twml.argument_parser.get_trainer_parser() - - @staticmethod - def get_train_op(params, loss): - """ - Return a training Op, that is, a `twml.optimizers.optimize_loss - `_ - instance given params and loss. - This method can be overwritten by passing the optimize_loss_fn to the Trainer - constructor. - - Args: - params: - tensorflow.contrib.training.HParams instance. Recognizes the optimizer, optimizer_summaries, - gradient_noise_scale, clip_gradients and learning_rate_decay (including - other learning rate decay arguments). - loss: - scalar Op returned by the build_graph that specifies the training loss to - be minimized. 
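For reference, the `_get_cluster_config` helper above expands `_CLUSTER_TEMPLATE` (defined elsewhere in this file, so its exact text is an assumption here) into a standard `TF_CONFIG` document. A hedged sketch of the equivalent structure for the three-port layout, built directly with `json`:

```python
import json
import os

ports = [2222, 2223, 2224]  # parsed from TWML_HOGWILD_PORTS
tf_config = {
    "cluster": {
        "ps": ["localhost:%d" % ports[0]],
        "chief": ["localhost:%d" % ports[1]],
        "worker": ["localhost:%d" % p for p in ports[2:]],
    },
    # type/index come from TWML_HOGWILD_TASK_TYPE / TWML_HOGWILD_TASK_ID.
    "task": {"type": "worker", "index": 0},
}
os.environ["TF_CONFIG"] = json.dumps(tf_config)
```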
- """ - optimizer = params.get('optimizer') - - if not optimizer: - optimizer = 'SGD' - - if optimizer == 'LazyAdam': - optimizer = LazyAdamOptimizer - - if optimizer == 'DGC': - optimizer = DeepGradientCompressionOptimizer( - learning_rate=params.learning_rate, - use_locking=False, - name="Sparse", - density=params.get('dgc_density'), - density_decay=params.get('dgc_density_decay'), - density_decay_steps=params.get('dgc_density_decay_steps'), - density_decay_rate=params.get('dgc_density_decay_rate'), - min_density=params.get('dgc_min_density'), - accumulation=params.get('dgc_accumulation') - ) - - summaries = ['loss'] - if params.get('show_optimizer_summaries'): - summaries = OPTIMIZER_SUMMARIES - - train_op = optimize_loss( - loss=loss, - global_step=tf.train.get_global_step(), - optimizer=optimizer, - learning_rate=params.learning_rate, - summaries=summaries, - colocate_gradients_with_ops=True, - gradient_noise_scale=params.get('gradient_noise_scale'), - clip_gradients=params.get('clip_gradients'), - learning_rate_decay_fn=twml.learning_rate_decay.get_learning_rate_decay_fn(params) - ) - return train_op - - def export_model_effects(self, export_path, feature_spec=None, log_features=True): - - # DO NOT CHANGE THE ORDER. - # This needs to be done before registering the model. - if feature_spec: - if log_features: - features = feature_spec['features'] - feature_names = ['.'.join(features[fid]['featureName'].split('.')[1:]) for fid in features.keys()] - features_to_log = ','.join(feature_names) - try: - model_hash = self.experiment_tracker.compute_model_hash(export_path) - metrics.log_usage('dbv2', 'export_model_effects', 'v1', custom_attrs=[model_hash, "feature config present", features_to_log]) - except: # noqa: T803 - logging.info("Failed to log Feature Config features") - - twml.contrib.export.export_fn.export_feature_spec(export_path, feature_spec) - export_start_time = time.time() - self.experiment_tracker.export_feature_spec(feature_spec) - logging.info("Exported feature spec to ML Metastore in %s seconds.", time.time() - export_start_time) - - self.experiment_tracker.register_model(str(export_path)) - self.export_gauge.increment() - - @property - def best_or_latest_checkpoint(self): - if self._is_early_stopping: - best_checkpoint_path = os.path.join(self._save_dir, "best_checkpoint") - checkpoint_path = tf.train.latest_checkpoint(best_checkpoint_path) - # Return best checkpoint if necessary - if checkpoint_path: - return checkpoint_path - else: - raise ValueError("Best checkpoint not found at %s." % best_checkpoint_path) - else: # Fallback to latest checkpoint from save directory - return self.latest_checkpoint - - @property - def latest_checkpoint(self): - return self.estimator.latest_checkpoint() - - def export_model(self, serving_input_receiver_fn, - export_output_fn=None, - export_dir=None, checkpoint_path=None, - feature_spec=None, - log_features=True): - """ - Export the model for prediction. Typically, the exported model - will later be run in production servers. This method is called - by the user to export the PREDICTgraph to disk. - - Internally, this method calls `tf.estimator.Estimator.export_savedmodel - `_. - - Note that a valid self._export_output_fn is required. - If export_ouput_fn is provided, it is used to set the self._export_output_fn. - - Args: - serving_input_receiver_fn: - function preparing the model for inference requests. - This funtion returns the ``features`` dict passed to ``build_graph``. 
- export_dir: - directory to export a SavedModel for prediction servers. - Defaults to ``[save_dir]/exported_models``. - checkpoint_path: - the checkpoint path to export. If None (the default), the most recent checkpoint - found within the model directory is chosen. - export_output_fn: - Function to export the graph_output (output of build_graph) for - prediction. Takes a graph_output dict as sole argument and returns - the export_output_fns dict. - Defaults to `twml.export_output_fns.default_output_fn`. - - Return: - returns a string path to exported directory. - - # set the export output function - """ - if not self.is_chief(): - logging.info("Trainer.export_model ignored due to the process not being chief.") - return - - self._export_output_fn = export_output_fn or twml.export_output_fns.default_output_fn - - if not callable(self._export_output_fn): - raise RuntimeError( - "Expecting export_output_fn function. Got %s." - % type(self._export_output_fn).__name__) - - if export_dir: - export_dir = sanitize_hdfs_path(export_dir) - - if checkpoint_path: - checkpoint_path = sanitize_hdfs_path(checkpoint_path) - else: - checkpoint_path = self.best_or_latest_checkpoint - - # actually export the model using the Estimator API - export_path = self._estimator.export_savedmodel( - export_dir_base=export_dir or os.path.join(self._save_dir, 'exported_models'), - serving_input_receiver_fn=serving_input_receiver_fn, - checkpoint_path=checkpoint_path) - - # export_path is bytes, need to convert to string for python3 to work. - logging.info("The exported model path is: " + str(export_path)) - - self.export_model_effects(export_path, feature_spec, log_features) - - return export_path - - def _model_fn(self, features, labels, mode, params, config=None): - """ - returns tf.estimator.EstimatorSpec that can be used with tf.estimator.Estimators. - You would probably never need to modify this method. - Instead, you should override build_graph, which this method calls. - - Args: - features: - Dict of input tensors. - labels: - Tensor of target labels. - mode: - an instance of tf.estimator.ModeKeys. - Typically used to toggle TRAINing or EVALuation. - params: - HParams object containing hyper-parameters. - """ - # pylint: disable=too-many-branches - if isinstance(features, dict): - weights = features.get('weights', None) - else: - weights = None - - with tf.variable_scope(self._name + '/model'): - graph_output = self._build_graph_fn(features, labels, mode, params, config) - loss = graph_output['loss'] if 'loss' in graph_output else None - - self._maybe_restore_checkpoint() - - with tf.variable_scope(self._name + '/optim'): - train_op = None - if mode == tf.estimator.ModeKeys.TRAIN: - if 'train_op' in graph_output: - train_op = graph_output['train_op'] - graph_output['train_op'] = None # remove from preds to prevent error - elif loss is not None: - train_op = self._optimize_loss_fn(params, loss) - - if params.get('train_log_metrics') and self._metric_fn: - metric_ops = self._metric_fn(graph_output=graph_output, labels=labels, weights=weights) - for metric_name in metric_ops: - tf.summary.scalar( - name="training_metric_" + metric_name, - tensor=metric_ops[metric_name][1]) # index 0 contains value_op, 1 contains update_op - - if mode == tf.estimator.ModeKeys.PREDICT and self._export_output_fn is not None: - # note that this is ignored by the predict method. - # Estimator only uses export_output_fn for export_model. 
- export_outputs = self._export_output_fn(graph_output) - else: - export_outputs = None - - if mode == tf.estimator.ModeKeys.EVAL and self._metric_fn: - eval_metric_ops = self._metric_fn(graph_output=graph_output, labels=labels, weights=weights) - else: - eval_metric_ops = None - - # None and loss (scalar, not sliceable by TFMA) should be removed from the graph_output - preds = {key: graph_output[key] for key in graph_output if (graph_output[key] is not None) and (key is not 'loss')} - - init_feed_dict = twml.contrib.initializers.get_init_feed_dict() - scaffold = tf.train.Scaffold(init_feed_dict=init_feed_dict) - - # Clear the init feed collection to avoid serializing the initializers. - twml.contrib.initializers.clear_init_feed_collection() - - # save estimator for use by later methods and hooks (warning: often reset) - self._current_estimator_spec = tf.estimator.EstimatorSpec( - mode=mode, - predictions=preds, - export_outputs=export_outputs, - loss=loss, - train_op=train_op, - eval_metric_ops=eval_metric_ops, - scaffold=scaffold, - ) - - return self._current_estimator_spec - - def get_train_hooks(self): - """Return SessionRunHooks used during training. - - By default training uses one hooks `tf.train.StepCounterHook` for monitoring step speed. - - If self._profiler_steps is set then we also use the ProfilerHook `tf.train.ProfilerHook` - for monitoring the profile. - - """ - # Instead of having every_n_steps be a constant number, - # change it dynamically based on batch size. - # Ideally we should be using every_n_secs, but that seems buggy as of 1.7. - # The every_n_steps = 20K / batch_size - every_n_steps = ((2048 * 100) // self._params.train_batch_size) - step_counter = tf.train.StepCounterHook( - every_n_steps=every_n_steps, output_dir=self._save_dir - ) - train_hooks = [step_counter] - - if self._profiler_steps is not None: - if not self._params.get('distributed') or self._estimator.config.is_chief: - profiler = tf.train.ProfilerHook( - save_steps=self._profiler_steps, - output_dir=self._save_dir - ) - train_hooks.append(profiler) - - return train_hooks - - def is_task_type(self, name): - """ - Helper function to specify if the current process is of the given worker type. - Note: This an only be called *after* self._hogwild_setup() is called in __init__() - """ - if os.environ.get('TF_CONFIG'): - if self._estimator.config.task_type == name: - return True - else: - return False - return True - - def is_evaluator(self): - """ - Helper function to let you know if the worker is evaluator. - Note: This an only be called *after* self._hogwild_setup() is called in __init__() - """ - return self.is_task_type("evaluator") - - def is_chief(self): - """ - Helper function to let you know if the worker is chief. - Note: This an only be called *after* self._hogwild_setup() is called in __init__() - """ - return self.is_task_type("chief") or self.is_task_type("master") - - def is_ps(self): - """ - Helper function to let you know if the task is parameter server. - """ - if os.environ.get('TF_CONFIG') and self._estimator.config.task_type == 'ps': - return True - return False - - def _exit_ps_after_training_complete(self): - """ - Helper function to shutdown parameter server after training job complete (either succeed or failed). 
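As a quick worked example of the step-counter interval chosen in `get_train_hooks` above (`every_n_steps = (2048 * 100) // train_batch_size`, i.e. a log line roughly every ~200K examples; the batch sizes below are illustrative):

```python
(2048 * 100) // 64    # -> 3200 steps between StepCounterHook log lines
(2048 * 100) // 512   # -> 400
(2048 * 100) // 4096  # -> 50
```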
- """ - if not self.is_ps(): - return - - # No need to exit ps if on the same machine - if os.environ.get('TWML_HOGWILD_PORTS'): - return - - if self._params.get('disable_auto_ps_shutdown', False): - logging.info("Skip shutting down parameter server after training complete [--disable_auto_ps_shutdown is set]") - return - - # checking job status is different on gke vs aurora - if self._is_on_gke(): - get_job_status = functools.partial( - k8s_status.get_training_job_status, - cluster=None, - namespace=os.environ['TWML_JOB_ROLE'], - environment=os.environ['TWML_JOB_ENV'], - job_name=os.environ['TWML_JOB_NAME'], - using_tsd=True) - else: - get_job_status = functools.partial( - get_distributed_training_job_path, - base_job_path=get_distributed_training_job_path() - ) - - def wait_complete_then_exit(): - retry_max = 60 - retry = 0 - while True: - try: - training_status = get_job_status() - if training_status == TrainingJobStatus.FINISHED: - logging.info("Distributed training job succeed, shutting down parameter server.") - os._exit(0) - elif training_status == TrainingJobStatus.FAILED: - logging.info("Distributed training job failed, shutting down parameter server.") - os._exit(0) - elif training_status == TrainingJobStatus.NOT_FOUND: - raise Exception("Distributed training job status not found.") - else: - poke_interval = random.randrange(60, 90) # prevent spike QPS to aurora endpoint - time.sleep(poke_interval) - retry = 0 - except Exception as e: - if retry >= retry_max: - raise e # only exception in this thread, won't fail parameter server thread - retry += 1 - poke_interval = random.randrange(60, 90) + retry * 10 - logging.warn("Error getting distributed training job status, will retry after %s seconds." % poke_interval) - time.sleep(poke_interval) - Thread(target=wait_complete_then_exit).start() - - def get_eval_hooks(self): # pylint: disable=no-self-use - """ Return SessionRunHooks used during evaluation.""" - return None - - def get_predict_hooks(self): - """ Return hooks used during prediction. - If profiler_steps is set in the constructor to the Trainer, - we pass a tf.Train.ProfilerHook to the estimator's predict function. - """ - hooks = [] - if self._profiler_steps is not None: - profiler = tf.train.ProfilerHook( - save_steps=self._profiler_steps, - output_dir=self._save_dir - ) - hooks.append(profiler) - return hooks - - def learn(self, train_input_fn=None, eval_input_fn=None, - train_max_steps=None, - train_steps=None, eval_steps=None, - train_hooks=None, eval_hooks=None, - early_stop_metric=None, early_stop_patience=-1, - early_stop_minimize=True, early_stop_tolerance=0, start_epoch=0, - exporters=None, export_output_fn=None, max_duration=None): - """ - Train and evaluate the estimator for ``train_max_steps`` steps. - Each epoch involves ``train_steps`` training steps followed - by ``eval_steps`` evaluation steps. Note that each step - is a ``session.run()``, that is, each batch is a step. - - Args: - train_max_steps: - maximum number of global steps of training to run. - Defaults to params.train_max_steps. - None-values cause learn() to terminate after *one* call to train() and evaluate(), - which is usually useful when using train_steps=-1 - Non-positive values trains indefinitely in a loop (use with caution), - which is usually useful when used with early stopping. - train_steps: - number of training steps per epoch. For example, 100 means each - training epoch will end after processing 100 batches. - Defaults to params.train_steps. 
- Non-positive values and None-values go through the entire training set each epoch. - eval_steps: - number of evaluation steps per epoch. - Defaults to params.eval_steps. - Non-positive values and None-values go through the entire evaluation set each epoch. - train_input_fn: - Function to iterate through training set. It is passed to estimator.train. - eval_input_fn: - Function to iterate through evaluation set. It is passed to estimator.evaluate. - train_hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - eval_hooks: - List of SessionRunHooks uses for evaluation. Defaults to self.get_eval_hooks() - start_epoch: - The epoch from which to start learn. If you want to do training and evaluation - for N epochs, you can call ``learn()`` in a loop as follows: - exporters: - List of exporters called at the end of each evaluation run. - Defaults to none. - export_output_fn: - The output format to use for exported models. - Only used if exporters is not None. - - .. code-block:: python - - for epoch in range(1,max_epoch): - trainer.learn(start_epoch=epoch) - - Early-stopping arguments: - early_stop_metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - early_stop_patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - Defaults to -1 (that is, no early-stopping). - early_stop_minimize: - Set this to True (the default) for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - early_stop_tolerance: - A non-negative tolerance for comparing early_stop_metric. - E.g. when maximizing the condition is current_metric > best_metric + tolerance. - Defaults to 0. - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - Returns: - The directory where the checkpoints were saved. - That is, save_dir. - You can point TensorBoard to this directory to get metrics, - or pass it to another Trainer via ``init_from_dir`` when doing - multi-phase training. 
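Putting these arguments together, a hedged end-to-end sketch, assuming a `Trainer` instance named `trainer` was constructed earlier: a toy `tf.data` input function (real twml jobs build these from DataRecord readers) feeding a `learn()` call driven by early stopping. All names and values are illustrative:

```python
import numpy as np
import tensorflow.compat.v1 as tf

def make_input_fn(num_examples):
  def input_fn():
    features = {"input": np.random.rand(num_examples, 16).astype(np.float32)}
    labels = np.random.randint(0, 2, size=(num_examples, 1)).astype(np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    return dataset.shuffle(num_examples).repeat().batch(32)
  return input_fn

save_dir = trainer.learn(
    train_input_fn=make_input_fn(10000),
    eval_input_fn=make_input_fn(1000),
    train_max_steps=-1,        # non-positive: loop until early stopping kicks in
    train_steps=1000,          # training steps per epoch
    eval_steps=200,
    early_stop_metric="loss",  # must exist in the eval metric ops
    early_stop_patience=5,
    early_stop_minimize=True,
)
```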
- """ - # pylint: disable=too-many-branches - - if not callable(train_input_fn): - raise ValueError("Expecting callable train_input_fn function") - if not callable(eval_input_fn): - raise ValueError("Expecting callable eval_input_fn function") - - if os.environ.get('TF_CONFIG'): - raise ValueError("trainer.learn() can not be used with distributed / hogwild setups") - - if exporters and export_output_fn: - self._export_output_fn = export_output_fn - - train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks - eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks - eval_hooks = [] if eval_hooks is None else eval_hooks - - if train_max_steps is None: - train_max_steps = self.params.get('train_max_steps') - - if train_steps is None: - train_steps = self.params.train_steps - if train_steps <= 0: - train_steps = None - - if eval_steps is None: - eval_steps = self.params.eval_steps - if eval_steps <= 0: - eval_steps = None - - if early_stop_patience > 0: - assert train_max_steps is not None, "Early stopping and max_steps=None are not compatible." - # prepare early stopping hook (which also handles logic here) - self._is_early_stopping = True - early_stop_hook = twml.hooks.EarlyStopHook( - metric=early_stop_metric, - checkpoint_dir=self._save_dir, - patience=early_stop_patience, - minimize=early_stop_minimize, - tolerance=early_stop_tolerance, - get_estimator_spec_fn=lambda: self.current_estimator_spec, - start_epoch=start_epoch) - # add early stop hook to eval hooks - eval_hooks.append(early_stop_hook) - - if max_duration is not None: - train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=True, - ) - train_hooks.append(train_early_stop_duration_hook) - - eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=True, - ) - eval_hooks.append(eval_early_stop_duration_hook) - - if not self._is_early_stopping: - if (train_max_steps is not None) and (train_max_steps <= 0): - if ((max_duration is not None) and (max_duration < 0)) or (max_duration is None): - logging.warn("train.max_steps is non-positive, and no early or duration stopping is configured. " - "Training job will loop forever.") - - if train_max_steps is not None and train_max_steps > 0: - # we can't pass max_steps AND steps to estimator.train. - # so we pass steps to estimator.train and max_steps to this hook instead... - stop_at_step_hook = twml.hooks.StopAtStepHook(last_step=train_max_steps) - train_hooks.append(stop_at_step_hook) - - with self.experiment_tracker.track_experiment(eval_hooks, - lambda: self.current_estimator_spec): - # alternate training and evaluation epochs - epoch = start_epoch - while True: - logging.info("Training epoch %d", epoch) - self._estimator.train(train_input_fn, steps=train_steps, hooks=train_hooks) - - logging.info("Evaluating epoch %d", epoch) - eval_result = self._estimator.evaluate( - eval_input_fn, steps=eval_steps, hooks=eval_hooks) - - if exporters: - checkpoint_path = self.estimator.latest_checkpoint() - for exporter in exporters: - export_path = os.path.join(self._save_dir, "export", exporter.name) - exporter.export( - estimator=self.estimator, export_path=export_path, - checkpoint_path=checkpoint_path, eval_result=eval_result, - is_the_final_export=False) - - # If train_max_step is none. Terminate after one loop. 
- if train_max_steps is None: - break - - # If stop_at_step_hook requested a stop, break - if train_max_steps > 0 and stop_at_step_hook.stop_requested: - break - - # early-stopping logic is handled internally by the hook - if early_stop_patience > 0 and early_stop_hook.should_stop: - # but we still need to break here - break - epoch += 1 - - self.write_state_to_disk(save_dir=self._save_dir, filename='_SUCCESS') - - return self._save_dir - - def get_train_spec(self, input_fn, max_steps=None, hooks=None): - """Get the TrainSpec used by ``tf.train.train_and_evaluate``.""" - if not callable(input_fn): - raise ValueError("Expecting callable train_input_fn") - - if max_steps is None: - max_steps = self.params.train_max_steps - - if max_steps is not None and max_steps <= 0: - max_steps = None - - hooks = self.get_train_hooks() if hooks is None else hooks - - return tf.estimator.TrainSpec(input_fn=input_fn, - max_steps=max_steps, - hooks=hooks) - - def get_eval_spec(self, input_fn, steps=None, delay=None, period=None, - hooks=None, exporters=None): - """Get the EvalSpec used by ``tf.train.train_and_evaluate``.""" - if not callable(input_fn): - raise ValueError("Expecting callable eval_input_fn") - - if steps is None: - steps = self.params.eval_steps - - if steps <= 0: - steps = None - - if delay is None: - delay = self.params.eval_delay - - if period is None: - period = self.params.eval_period - - hooks = self.get_eval_hooks() if hooks is None else hooks - - eval_name = self.params.get("eval_name", None) - - return tf.estimator.EvalSpec(input_fn=input_fn, - steps=steps, - name=eval_name, - start_delay_secs=delay, - throttle_secs=period, - hooks=hooks, - exporters=exporters) - - def train_and_evaluate(self, train_input_fn=None, eval_input_fn=None, - train_max_steps=None, eval_steps=None, - eval_delay=None, eval_period=None, - train_hooks=None, eval_hooks=None, - early_stop_metric=None, early_stop_patience=-1, - early_stop_minimize=True, early_stop_tolerance=0, exporters=None, - export_output_fn=None, max_duration=None): - """ - Train and evaluate the estimator for ``train_max_steps`` - using ``tf.estimator.train_and_evaluate``. - With a cluster configuration provided in the ``TF_CONFIG`` environment variable, this method - can be used for distributed training (multi-node or multi-process). - Unlike the ``learn`` method, training is continuous with ``train_max_steps``. - For distributed use case, evaluation happens periodically. - That is, after ``eval_delay`` seconds, an evaluation epoch of ``eval_step`` steps - occurs every ``eval_period`` seconds. Evaluation happens on the most recent checkpoint. - TF defaults to saving checkpoints every 10 mins. - For local use case, training occurs for train_max_steps epochs followed by a - single evaluation. For local use case we therefore recommend using learn() instead - as it provides early-stopping and multiple evaluations. - - ``train_and_evaluate`` will evaluate for ``eval_steps`` every ``eval_period`` seconds. - It will stop after ``train_steps`` is reached. - - You must ensure that all workers/servers are assigned the same `save_dir`. - - .. Note:: - - If the TF_CONFIG environment variable is set, this function assumes its running a distribute job. - - Args: - train_input_fn: - Function to iterate through training set. It is passed to estimator.train_and_evalute - eval_input_fn: - Function to iterate through evaluation set. It is passed to estimator.train_and_evalute. - train_max_steps: - maximum number of global steps of training to run. 
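For the distributed path, a hedged usage sketch reusing `make_input_fn` from the sketch above; the cluster layout itself comes from the `TF_CONFIG` environment variable set by the launcher, every task must share the same `save_dir`, and the values below are illustrative:

```python
save_dir = trainer.train_and_evaluate(
    train_input_fn=make_input_fn(10000),
    eval_input_fn=make_input_fn(1000),
    train_max_steps=1000000,
    eval_steps=500,
    eval_delay=120,    # seconds before the first evaluation
    eval_period=600,   # seconds between evaluations
)
```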
- Defaults to params.train_max_steps. - Non-positive values and None-values train indefinitely (use with caution). - eval_steps: - number of steps per evaluation. - Defaults to params.eval_steps. - Non-positive values and None-values go through - the entire evaluation set for each evaluation. - Note that the number of eval_steps should be high enough to minimize noise. - This is especially true for early-stopping. - eval_delay: - Start the first evaluation after eval_delay. Defaults to params.eval_delay or 2*60s. - eval_period: - Run an evaluation every eval_period seconds. Defaults to params.eval_period or 10*60s. - exporters: - List of exporters called at the end of each evaluation run. - Defaults to none. - export_output_fn: - The output format to use for exported models. - Only used if exporters is not None. - - Early-stopping arguments: - early_stop_metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - early_stop_patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - Defaults to -1 (that is, no early-stopping). - early_stop_minimize: - Set this to True (the default) for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - early_stop_tolerance: - A non-negative tolerance for comparing early_stop_metric. - E.g. when maximizing the condition is current_metric > best_metric + tolerance. - Defaults to 0. - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - Returns: - The directory where the checkpoints were saved. - """ - - logging.info("WARNING: Trainer.train_and_evaluate is an EXPERIMENTAL API.") - logging.info("Trainer.train_and_evaluate may change or be removed in future versions.") - - if not callable(train_input_fn): - raise ValueError("Expecting callable train_input_fn function") - if not callable(eval_input_fn): - raise ValueError("Expecting callable eval_input_fn function") - - self._exit_ps_after_training_complete() - - # Maybe export in eval processes. - if self.is_evaluator(): - if self.params.get("eval_name") is not None: - # Do not export if running special eval. - exporters = None - export_output_fn = None - elif exporters and export_output_fn: - self._export_output_fn = export_output_fn - else: - # Default option. 
- self._export_output_fn = None - - train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks - train_hooks = [] if train_hooks is None else train_hooks - - eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks - eval_hooks = [] if eval_hooks is None else eval_hooks - - if train_max_steps is None: - train_max_steps = self.params.get('train_max_steps') - - if eval_steps is None: - eval_steps = self.params.eval_steps - if eval_steps <= 0: - eval_steps = None - - if eval_delay is None: - eval_delay = self.params.eval_delay - if eval_period is None: - eval_period = self.params.eval_period - - if early_stop_patience > 0: - # when training hooks detect this file, they request a stop to training - early_stop_path = os.path.join(self._save_dir, 'earlystop_now.txt') - # prepare early stopping hook (which also handles logic here) - - self._is_early_stopping = True - - eval_early_stop_hook = twml.hooks.EarlyStopHook( - metric=early_stop_metric, - checkpoint_dir=self._save_dir, - patience=early_stop_patience, - minimize=early_stop_minimize, - tolerance=early_stop_tolerance, - get_estimator_spec_fn=lambda: self.current_estimator_spec, - file_path=early_stop_path, - exit_on_end=os.environ.get('TF_CONFIG') is not None) # only exit for distributed jobs - # add early stop hook to eval hooks - eval_hooks.append(eval_early_stop_hook) - - # prepare the commensurate training hook - train_early_stop_hook = twml.hooks.StopIfExistsHook(early_stop_path) - train_hooks.append(train_early_stop_hook) - - if max_duration is not None: - train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=self.is_chief() - ) - eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=os.environ.get('TF_CONFIG') is not None, - save_dir=self._save_dir, - overwrite=False - ) # only exit for distributed jobs - - train_hooks.append(train_early_stop_duration_hook) - eval_hooks.append(eval_early_stop_duration_hook) - - with self.experiment_tracker.track_experiment(eval_hooks, lambda: self.current_estimator_spec): - train_spec = self.get_train_spec(train_input_fn, train_max_steps, train_hooks) - eval_spec = self.get_eval_spec(eval_input_fn, eval_steps, - eval_delay, eval_period, - eval_hooks, exporters) - self._train_and_evaluate(train_spec, eval_spec) - - if self.is_chief(): - self.write_state_to_disk(save_dir=self._save_dir, filename='_SUCCESS') - - return self._save_dir - - def _train_and_evaluate(self, train_spec, eval_spec): - """ - Private method that calls - ``tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)``. - """ - try: - tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec) - except twml.errors.EarlyStopError: - # Ignore the exception if on evaluator. - if self.is_evaluator(): - pass - else: - raise - - def train(self, input_fn=None, steps=None, hooks=None): - """ - Train the estimator for `steps` training steps. - - Args: - steps: - number of steps for which to perform training. For example, 100 means each - evaluation will end after processing 100 batches. - Defaults to None. i.e. trains on the entire dataset a single time. - Non-positive values and None-values go through the entire training set each epoch. - input_fn: - Function to iterate through training set. It is passed to estimator.train. - hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). 
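When only a single ad-hoc local pass is needed (no evaluation loop, no distribution), the call reduces to the following; `make_input_fn` is reused from the sketch above:

```python
# One local training pass over `steps` batches; returns the Trainer itself.
trainer.train(input_fn=make_input_fn(10000), steps=500)
```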
- """ - if os.environ.get('TF_CONFIG') and "is_calibrating" not in self.params: - raise ValueError("trainer.train() can not be used with distributed / hogwild setups") - - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") - - if self._is_early_stopping: - raise ValueError("Can not call train() after learn() when using early stopping.") - - hooks = self.get_train_hooks() if hooks is None else hooks - self._estimator.train(input_fn, steps=steps, hooks=hooks) - return self - - def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): - """ - Evaluate the estimator for `steps` evaluation steps. - - Args: - steps: - number of steps for which to perform evaluation. For example, 100 means each - evaluation will end after processing 100 batches. - Defaults to None. i.e. evaluates on the entire dataset a single time. - Negative values and None-values go through the entire training set each epoch. - input_fn: - Function to iterate through evaluation set. It is passed to estimator.evaluate. - hooks: - List of SessionRunHooks used for evaluation. Defaults to None. - Note that, unlike learn(), hooks defaults to None instead of self.get_eval_hooks() - as the latter may implement early-stopping, which isn't necessarilty the desired - behavior when calling evaluate() on its own. - name: - Name of the evaluation if user needs to run multiple evaluations on different data sets. - Metrics for different evaluations are saved in separate folders, - and appear separately in tensorboard. - - Returns: - If `is_evaluator()`, returns a dict containing the evaluation metrics specified - in `metric_fn` keyed by name, as well as an entry `global_step` that contains - the value of the global step for which this evaluation was performed. - Otherwise (i.e. `is_evaluator() == False`), returns None. - """ - if not self.is_evaluator(): - return None - - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") - - hooks = self.get_eval_hooks() if hooks is None else hooks - hooks = [] if hooks is None else hooks - - # for consistency with train/learn - eval_steps = None if steps is not None and steps < 0 else steps - - with self.experiment_tracker.track_experiment(hooks, lambda: self.current_estimator_spec, name=name): - checkpoint = self.best_or_latest_checkpoint - computed_metrics = self._estimator.evaluate( - input_fn, - steps=eval_steps, - hooks=hooks, - checkpoint_path=checkpoint, - name=name - ) - - return computed_metrics - - def start_tensorboard(self, port=None): - """ - Start tensorboard process to visualize logs in save_dir. - """ - logging.info("Starting tensorboard.") - if self._tensorboard_handle: - logging.warn("Tensorboard already running. 
Nothing done.") - return - - if port is None: - if 'tensorboard_port' not in self.params.values(): - raise ValueError('You must specify a port for tensorboard to run on.') - elif self.params.tensorboard_port is None: - return - else: - port = self.params.tensorboard_port - - mldash_path = 'experiments' - if self.experiment_tracker.path: - mldash_path += '/%s' % encode_url(self.experiment_tracker.experiment_id) - tensorboard_args = ['--logdir=%s' % self._save_dir, '--port=%d' % port] - - try: - args = ['email_and_launch_tensorboard', mldash_path, '--'] + tensorboard_args - self._tensorboard_handle = subprocess.Popen(args) - except OSError: - try: - self._tensorboard_handle = subprocess.Popen(['tensorboard'] + tensorboard_args) - except OSError: - try: - # this will work with Twitter internal pants build when run locally - args = ['./pants', 'run', 'twml:tensorboard', '--'] + tensorboard_args - self._tensorboard_handle = subprocess.Popen(args) - except OSError: - logging.error("No tensorboard installed, won't able to visualize training in tensorboard.") - - def stop_tensorboard(self): - """ - Shutdown this Trainer's associated Tensorboard. - """ - if self._tensorboard_handle: - logging.info("Shutting down tensorboard.") - self._tensorboard_handle.kill() - else: - logging.warn("No known tensorboard process. Nothing done.") - - def calibrate(self, - calibrator, - steps=None, - input_fn=None, - save_calibrator=True, - hooks=None): - """ - Calibrate the calibrator for `steps` calibration steps using the estimator.train method. - The build_graph passed to the Trainer constructor should - call calibrator.accumulate using something like tf.py_func. - That way, when this method calls estimator.train the calibrator will - accumulate one epoch of samples. After which, this method calls calibrator.calibrate(). - It is up to the user to then call calibrator.save() to save the calibrated Layer - and other information to disk for multi-phase training. - - Args: - calibrator: - a twml.Calibrator instance or a dict of the form {name(str): twml.Calibrator}. - steps: - Maximum steps to accumulate examples for calibration. Optional. - If not specified, examples will be accumulated until all downsampled parts are processed. - input_fn: - Function to iterate through training set. It is passed to estimator.train. - hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - save_calibrator: - Boolean (default: True). If set to True it will save the calibrator layer. - """ - - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") - - # making everything a dict to avoid multiple ifs - if isinstance(calibrator, twml.contrib.calibrators.Calibrator): - calibrator = {"default": calibrator} - - # This is a dummy call to train, since we cannot predict without training - # from the Estimator API - self._estimator.train(input_fn, steps=1) - max_steps = steps if steps is not None else -1 - for name, clbrt in sorted(calibrator.items(), key=itemgetter(0)): - count = 0 - for out in self._estimator.predict(input_fn, hooks=hooks, yield_single_examples=False): - if max_steps > 0 and count > max_steps: - break - clbrt.accumulate_feature(out) - count += 1 - clbrt.calibrate() - - # this step is done to allow us to keep the current phases event file for - # visualization on Tensorboard. It removes all files that - # are not event files. 
This piece of code should be deprecated when - # we deprecate the MDL calibrator (CX-12329) - for fname in tf.io.gfile.listdir(self._save_dir): - if not fname.startswith("events"): - tf.io.gfile.remove(os.path.join(self._save_dir, fname)) - - if save_calibrator: - # If we only have one calibrator, the calibrator signature - # will be set to default - if len(calibrator) == 1: - calibrator = calibrator['default'] - calibrator.save( - self.params.save_dir, - name=calibrator.name, - verbose=True - ) - else: - for name, clbrt in calibrator.items(): - clbrt.save( - self.params.save_dir, - name=clbrt.name + str(name), - verbose=True - ) - - def predict(self, *args, **kwargs): - """ - Wrapper over the tensorflow `Estimator.predict - `_. - method. See that documentation for description of arguments accepted. - - If hooks is passed as an argument, the specified hooks are used. - Else when profiler_steps is specified in the constructor of the Trainer, a - tf.train.ProfilerHook is passed to the predict interface. - Otherwise, hooks is set to an empty list. - """ - if 'hooks' not in kwargs and len(args) < 3: - # If hooks is not specified as a keyword argument, nor as a positional argument - # add hooks as a keyword argument. - kwargs['hooks'] = self.get_predict_hooks() - - return self.estimator.predict(*args, **kwargs) - - def hub_export(self, - name, - serving_input_receiver_fn, - export_dir=None, - checkpoint_path=None, - export_task_type_overrider=None): - """ - Exports registered modules into a save directory. - - This method creates a directory under export_path with the save TF Hub. - One sub-directory (named export_name) per module registered via register_module_for_export. - - Arguments: - name: - unique name of the module to export. - serving_input_receiver_fn: - A function with no arguments that returns a ServingInputReceiver. - This is used with the estimator passed to export() to build the graph (in PREDICT mode) - that registers the modules for export. The model in that graph is never run, - so the actual data provided by this input fn does not matter. - export_dir: - A string containing a directory where to write the export directories. - Defaults to the save_dir. - checkpoint_path: - The checkpoint path to export. Defaults to the latest. - export_task_type_overrider: - Specifies the task type that will override the default task type used for export - (hogwild training defaults to evaluator, otherwise, defaults to chief) - """ - if export_task_type_overrider: - if not self.is_task_type(export_task_type_overrider): - logging.info( - f"Trainer.hub_export ignored due to process not being {export_task_type_overrider}") - return - else: - if self._using_hogwild: - if not self.is_evaluator(): - logging.info("Trainer.hub_export ignored due to the process not being evaluator.") - return - else: - if not self.is_chief(): - logging.info("Trainer.hub_export ignored due to the process not being chief.") - return - - if export_dir: - export_dir = sanitize_hdfs_path(export_dir) - - if checkpoint_path: - checkpoint_path = sanitize_hdfs_path(checkpoint_path) - else: - checkpoint_path = self.best_or_latest_checkpoint - - export_dir = export_dir if export_dir is not None else self._save_dir - exporter = hub.LatestModuleExporter(name, serving_input_receiver_fn) - # The path_exporter by default contains a timestamp directory in its path. 
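`hub_export` assumes the module was registered while the graph was being built. A hedged sketch of that pairing using the TF1 `tensorflow_hub` API; the module handle, feature key, and loss are illustrative and not part of this codebase:

```python
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub

def build_graph(features, labels, mode, params, config=None):
  # Wrap an encoder as a TF1 hub module and register it so that
  # hub_export() can later find and export it (possibly fine-tuned).
  encoder = hub.Module("https://tfhub.dev/google/nnlm-en-dim128/1",
                       trainable=True)
  embeddings = encoder(features["text"])
  hub.register_module_for_export(encoder, export_name="text_encoder")

  logits = tf.layers.dense(embeddings, units=1)
  loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
      labels=tf.reshape(tf.cast(labels, tf.float32), [-1, 1]), logits=logits))
  return {"output": tf.nn.sigmoid(logits), "loss": loss}

# On the exporting task (evaluator for hogwild jobs, chief otherwise):
# trainer.hub_export("text_encoder", serving_input_receiver_fn)
```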
- path_exporter = exporter.export(estimator=self.estimator, - export_path=export_dir, - checkpoint_path=checkpoint_path) - - # LatestModuleExporter.export() returns a binary string on Cloud ML Engine - # but tf.io.gfile.listdir() does not; this is an issue when joining paths - if isinstance(path_exporter, bytes): - path_exporter = path_exporter.decode() - - # Copying the saved hub module to export_dir so we don't need to specify - # the timestamp when loading the module. - # This is a workaround due to the current implementation of hub.LatestModuleExporter. - # This works for multiple hub modules. - hub_exported_modules = tf.io.gfile.listdir(path_exporter) - - backup_dir = os.path.join(export_dir, "backups", - datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) - - for folder in hub_exported_modules: - hub_module_oldpath = os.path.join(path_exporter, folder) - hub_module_newpath = os.path.join(export_dir, folder) - - # If the destination already exists, move to backup - if tf.io.gfile.exists(hub_module_newpath): - # Ensure backup_dir exists - tf.io.gfile.makedirs(backup_dir) - hub_module_backup = os.path.join(backup_dir, folder) - tf.io.gfile.rename(hub_module_newpath, hub_module_backup) - - tf.io.gfile.rename(hub_module_oldpath, hub_module_newpath) - - # Since the timestamped folder exists but is empty, we can delete it. - tf.io.gfile.rmtree(path_exporter) - - def _is_on_gke(self) -> bool: - """Returns True if running on gke.""" - cluster = os.environ.get('TWML_JOB_CLUSTER') - if not cluster or cluster in {'smf1', 'atla'}: - return False - return True - - def _maybe_del_tsd_exit(self, state_files) -> None: - """Handle potential early exit and TwitterSetDeployment deletion. - - If: - - distributed training - - running GKE - - training is finished (all state_files exists) - we will exit early and not restart work - - If --distributed_training_cleanup = True then we will also handle - cleaning up the TwitterSetDeployments. - - Args: - state_files: A python list indicate state files to determine the finish - state of the job. - """ - # job type that is responsible for experiment tracking will remain alive - # until it marks the experiment as finished. - if self.experiment_tracker._env_eligible_for_recording_experiment: - exp_status = self.experiment_tracker.get_run_status() - if exp_status and exp_status not in {'Success', 'Failed'}: - logging.info( - f"Not exiting early because experiment is still {exp_status}." 
- ) - return - - # do not bother if we are on prem - if not self._is_on_gke(): - logging.info("No need to exit early because running on prem.") - return - - states = [ - twml.util.file_exist_in_dir(self._save_dir, state_file) for state_file in state_files] - do_not_restart = (self._params.get('distributed') and all(states)) - if not do_not_restart: - return - - logging.info( - f"Exiting early because a _SUCCESS file already exists in {self._save_dir}") - if self._params.get('distributed_training_cleanup'): - resource_name = '-'.join([ - os.environ['TWML_JOB_NAME'], - os.environ['TWML_DISTRIBUTED_JOB_TYPE'], - os.environ['TWML_JOB_ENV'], - ]) - logging.info(f"Deleting TwitterSetDeployment {resource_name}") - # each job type will manage its own deletion so that deletion happens - # in the trainer init call for every job type - # otherwise we may kill another job type during an important - # process like experiment tracking management (handled by the evaluator - kubectl_delete_by_name( - zone=None, - namespace=os.environ['TWML_JOB_ROLE'], - resource_type=Resource.TWITTERSETDEPLOYMENTS.value, - resource_name=resource_name, - wait=False, - ) - sys.exit(0) - - def write_state_to_disk(self, save_dir, filename='_SUCCESS') -> None: - """Write state file to disk to indicate the state of training process. This is usually used - to mark the state of training progress and determine the start when job restarts/resumes. - Args: - save_dir: A str of local/gcs/hdfs dir to write the state file. - file_name: A str indicate the state file. Default to `_SUCCESS`. - """ - file_path = os.path.join(save_dir, filename) - if tf.io.gfile.exists(file_path): - tf.logging.warn(f'{file_path} already exist.') - return - - with tf.io.gfile.GFile(file_path, 'w') as f: - f.write('') \ No newline at end of file diff --git a/twml/twml/util.docx b/twml/twml/util.docx new file mode 100644 index 000000000..5c772fd17 Binary files /dev/null and b/twml/twml/util.docx differ diff --git a/twml/twml/util.py b/twml/twml/util.py deleted file mode 100644 index cd7679a6f..000000000 --- a/twml/twml/util.py +++ /dev/null @@ -1,942 +0,0 @@ -""" -This module contains utility functions for twml. -""" - -import argparse -from datetime import datetime -import itertools -import json -import logging as _logging -import os -import re - -from twitter.ml.common.resources import AuroraPath -from twitter.deepbird.hparam import HParams -from twitter.deepbird.io.util import ( - _get_feature_id, # noqa: F401 - feature_id, # noqa: F401 - preprocess_feature_regex, # noqa: F401 - preprocess_path, # noqa: F401 - sanitize_hdfs_path, # noqa: F401 - is_string, # noqa: F401 - list_files, # noqa: F401 - match_files, # noqa: F401 -) -from twitter.deepbird.io.legacy.util import ( - batch_apply, # noqa: F401 - boolean_mask, # noqa: F401 - fixed_length_tensor, # noqa: F401 -) -from twitter.deepbird.sparse.util import ( - convert_to_sparse, # noqa: F401 - limit_bits, # noqa: F401 -) - -from dateutil import rrule -from joblib import delayed, Parallel -from six import string_types - -from absl import logging -from libtwml import CLIB, OPLIB # noqa: F401 -import tensorflow.compat.v1 as tf -from tensorflow.python.platform import tf_logging -import twml -from twml.feature_config import FeatureConfigBuilder - - -# big_prime is less than 2**32 -# This just needs to be co-prime with powers of 2 -# any large prime is sufficient, but it's not necessary. 
-HASHING_PRIME = 2479700537 - - -def multiplicative_hash(input, hash_constant=HASHING_PRIME): - return input * hash_constant - - -def _return_tensors_from_checkpoint_folder(init_dir, model_name=None): - """Returns tensors list from a checkpoint folder - - Args: - init_dir: Name of the checkpoint directory. - model_name: the model which we will use to obtain the checkpoint - (e.g. model.ckpt-50000) if set to None it will default to the - latest model saved in the checkpont file. - - """ - if model_name is None: - # gets the most recently generated model.cpkt file - model_path = tf.train.latest_checkpoint(init_dir) - if model_path is None: - raise ValueError("Could not find a valid model checkpoint inside the directory") - else: - model_path = os.path.join(init_dir, model_name) - reader = tf.train.NewCheckpointReader(model_path) - try: - return (reader.debug_string().decode("utf-8")) - except OSError: - logging.error('Could not decode the string') - - -def get_scope_dict(init_dir, incoming_scope_name, current_scope_name, model_name=None): - """Returns tensors map from a checkpoint file. - - Args: - file_name: - Name of the checkpoint directory. - incoming_scope_name: - scope name of the previous phase - current_scope_name: - scope name of current phase - model_name: - the model which we will use to obtain the checkpoint - (e.g. model.ckpt-50000) if set to None it will default - to the latest model saved in the checkpoint file. - Returns: - init_map: - init_map which will be inputted to the checkpoint - """ - init_map = {} - reader_dump = _return_tensors_from_checkpoint_folder(init_dir=init_dir, - model_name=model_name).splitlines() - for member in reader_dump: - # remove global_step since it is not necessary - if 'global_step' not in member: - saved_variables = str(member.split(" ")[0]) - saved_scope = saved_variables.rsplit('/', 1)[0] + "/" - new_scope = saved_scope.replace(incoming_scope_name, current_scope_name, 1) - # create key in init_map - if saved_scope not in init_map.keys(): # pylint: disable=dict-keys-not-iterating - init_map[saved_scope] = new_scope - return init_map - - -def get_init_map( - init_from_dir, - exclude_var_names=None, - exclude_name_scopes=None, - name_scope_to_remove=None, - name_scope_to_prepend=None): - """ - Builds a map for initializing from a checkpoint (see tf.train.init_from_checkpoint). - - It assumes that the latter part of the variable names are consistent between the checkpoint and - the new model, but their name_scopes may be different. If the checkpoint model has variable names - of the form old/scope/var/foo, and the corresponding variable names for the new model should be - my/new/scope/var/foo, then you should set name_scope_to_remove = 'old/' and - name_scope_to_prepend = 'my/new/'. - - This function can be used to - - 1. Generate an ``init_map`` map that can be passed to the ``Trainer`` init or - 2. Used to generate an ``init_map`` directly inside ``build_graph_fn``, in - which case it should be passed directly to ``tf.train.init_from_checkpoint`` inside - ``build_graph_fn``, in which case you do not also need to specify the ``init_map`` argument to - the trainer. - - Parameters - ---------- - init_from_dir: Directory containing checkpoint - exclude_var_names: list[str] - List of variables in the checkpoint that should be excluded from the map. - exclude_name_scopes: list[str] - List of name_scopes in the checkpoint model that should be excluded from the map. 
- name_scope_to_remove: str - portion of name_scope for checkpoint variables that should not be included in variable names - for new model. - name_scope_to_prepend: str - name_scope to prepend to variable names in checkpoint to give variable names for new model. - - Returns - ------- - dict - keys are variable names in the checkpoint and values are variable names in the new model, - into which the checkpoint parameters should be loaded. - """ - vars_to_restore = get_checkpoint_variable_names( - init_from_dir, - exclude_var_names=exclude_var_names, - exclude_scopes=exclude_name_scopes, - ) - - if name_scope_to_prepend is not None: - if not name_scope_to_prepend.endswith('/'): - name_scope_to_prepend += '/' - - if name_scope_to_remove is not None: - if not name_scope_to_remove.endswith('/'): - name_scope_to_remove += '/' - - init_map = {} - - for var_name in vars_to_restore: - var_name_checkpoint = var_name - - if name_scope_to_remove is not None: - var_name = var_name.replace(name_scope_to_remove, '') - - var_name_new_model = var_name - - if name_scope_to_prepend is not None: - var_name_new_model = name_scope_to_prepend + var_name_new_model - - init_map[var_name_checkpoint] = var_name_new_model - - return init_map - - -def get_checkpoint_variable_names(model_dir, exclude_var_names=None, exclude_scopes=None): - """ - Gets a list of variable names from the latest checkpoint in model_dir. - Removes variables with scope defined by exclude_scopes, and/or with names defined by - exclude_var_names. - - Args: - model_dir (str): Directory containing checkpoint file for the pre-trained model - exclude_var_names (list): Optional variable names to exclude (can include full/partial scope) - exclude_scopes (list): Optional scopes to exclude - - Returns: - list: variable names - """ - checkpoint_path = tf.train.latest_checkpoint(model_dir) - variables_and_shapes = tf.train.list_variables(checkpoint_path) - - def _keep(name): - if exclude_scopes and any(name.startswith(exc_scope) for exc_scope in exclude_scopes): - return False - if exclude_var_names and any(name.endswith(exc_var) for exc_var in exclude_var_names): - return False - return True - - names = [x[0] for x in variables_and_shapes if _keep(x[0])] - - return names - - -def to_snake_case(name): - """ - Changes name to snake case - """ - intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name) - insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() - # If the class is private the name starts with "_" which is not secure - # for creating scopes. We prefix the name with "private" in this case. - if insecure[0] != '_': - return insecure - return 'private' + insecure - - -def copy_phase_inputs(init_dir, dest_dir): - """Automatically copies the .json.tf from the init_dir to save_dir - so we can load multiple parameters at the same time. - - Args: - init_dir: - Name of the checkpoint directory. - dest_dir: - Name of the output directory. 
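Following the old/scope → my/new/scope example in the `get_init_map` docstring above, a hedged usage sketch (the checkpoint path and scope names are illustrative):

```python
import tensorflow.compat.v1 as tf
import twml

init_map = twml.util.get_init_map(
    init_from_dir="/user/me/phase1/save_dir",
    exclude_name_scopes=["old/optim"],   # e.g. skip optimizer slots
    name_scope_to_remove="old/",
    name_scope_to_prepend="my/new/",
)
# e.g. {"old/scope/var/foo": "my/new/scope/var/foo", ...}

# Either pass init_map (with init_from_dir) to the Trainer constructor,
# or apply it directly inside build_graph:
tf.train.init_from_checkpoint("/user/me/phase1/save_dir", init_map)
```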
- """ - if init_dir is not None: - # we are using tf.io.gfile so we can use it with both local and hdfs paths - for files in tf.io.gfile.listdir(init_dir): - if files.endswith(".json.tf"): - src_file = os.path.join(init_dir, files) - dest_file = os.path.join(dest_dir, files) - if not tf.io.gfile.exists(dest_dir): - # creates the folder - try: - tf.io.gfile.makedirs(dest_dir) - # to prevent racing condition - except OSError: - if not tf.io.gfile.isdir(dest_dir): - raise - # dest_file may be old if it exists and - # dest_file gets copied several times in distributed training - tf.io.gfile.copy(src_file, dest_file, overwrite=True) - - -def rehash_sparse_features_nbits(sp_a, nbits, hash_fn=multiplicative_hash): - """ - Rehash the feature ids of the sparse tensor, - and limit the output to n bits. - - This is useful for making the distribution of - feature_ids more uniform, which may improve performance - in some situations. - - This would typically be used on the output of - PercentileDiscretizer, since it assigns many - bins to low-valued output feature ids. - - Input feature IDs should take values less than 2**32, - and nbits should be less than 32 - - Args: - sp_a: - a tf.SparseTensor object - nbits: - integer number of bits to mask output feature_ids - hash_fn: - Function that takes integer values and returns hashes of these values. - The output does not need to be masked to the desired number of bits, - as this masking will be taken care of. Default value = multiplicative_hash. - - Returns: - a new tf.SparseTensor - """ - - feature_ids = sp_a.indices[:, 1] - feature_ids = hash_fn(feature_ids) - - sample_ids = sp_a.indices[:, 0] - values = sp_a.values - dense_shape = sp_a.dense_shape - - indices = tf.stack([sample_ids, feature_ids], axis=1) - - sp_a = tf.SparseTensor(indices, values, dense_shape) - - # note - we need 2**nbits >= batch size - # otherwise, sample_ids will be squashed by the mask. - return limit_sparse_tensor_size(sp_a, nbits) - - -def convert_to_hparams(opt): - """ - Converts argparse.Namespace object to twitter.deepbird.hparam.hparam.HParams. - Note that tensorflow.contrib.training.HParams is gone in TF 2.x, and we forward ported - tensorflow.contrib.training.HParams to twitter.deepbird.hparam.hapram.HParams. - - NOTE: If you are using estimators, please don't call this method and directly pass python dict - to TensorFlow estimator. Starting TensorFlow 2.0, Estimator will only accept dicts. - """ - - # Convert to dict so we can iterate through it cleanly. - if isinstance(opt, argparse.Namespace): - params_dict = vars(opt) - elif isinstance(opt, dict): - params_dict = opt - elif isinstance(opt, HParams): - logging.warning('If you are using Estimator, please pass python dict directly to Estimator.') - params_dict = opt.values() - else: - raise ValueError("Input can not be of type %s. " - "It can be one of { argparse.Namespace, dict, " - "twitter.deepbird.hparam.HParams}." - % type(opt)) - - params = HParams() - # Hack to convert all parameters from hdfs:/// format to hdfs://default/ - # Note: .items() makes a copy in python 2.7, but that is fine since the performance isn't critical. - for key, val in params_dict.items(): - val = params_dict[key] - # Fix the path if the value is a string - if isinstance(val, str): - params.add_hparam(key, sanitize_hdfs_path(val)) - else: - params.add_hparam(key, val) - - return params - - -def dynamic_partition(features, partitions, num_partitions=2, name=None): - """ - Partitions each of the tensor in features using the provided mask. 
- - Args: - features: - A single tensor or an iterable of tensors (list, tuple, dict) - partitions: - A bool or integer tensor representing the partitions. - - Returns partitioned outputs as a list. Each element of the list is the same type as features. - - This uses tf.dynamic_partition but adds the following niceties: - - features can be a list or dict of different tensor types. - - only a partition tensor is used to partition all the feature tensors recursively. - - the partition tensor is automatically converted into an integer tensor. - - defaults to num_partitions == 2 - """ - - if not isinstance(features, (dict, list, tuple, tf.Tensor)): - raise AssertionError("features container must be a dict, list, or tuple, tf.Tensor") - - if isinstance(partitions, tf.Tensor): - partitions = tf.cast(partitions, tf.int32) - - if isinstance(features, tf.Tensor): - return tf.dynamic_partition(features, partitions, num_partitions, name) - - outputs = [] - for _ in range(num_partitions): - if isinstance(features, (tuple, list)): - # Create an empty list of lists first, will be converted to right type afterwards. - outputs.append([None for _ in range(len(features))]) - else: - outputs.append(dict()) - - iterable = features.items() if isinstance(features, dict) else enumerate(features) - - # Handling partitions of nested classes handled here: - # Recursively call dynamic_partition for containers - for key, feature in iterable: - name_key = None if name is None else name + "_" + str(key) - if isinstance(partitions, tf.Tensor): - results = tf.dynamic_partition(feature, partitions, num_partitions, name_key) - else: - results = tf.dynamic_partition(feature, partitions[key], num_partitions[key], name_key) - # Append the result to the proper output container - for idx, result in enumerate(results): - outputs[idx][key] = result - - # if input is tuple, convert list of lists back to list of tuples - if isinstance(features, tuple): - outputs = [type(features)(output) for output in outputs] - - return outputs - - -def write_file(filename, contents, encode=False): - ''' - Optionally encodes contents and writes contents to a file. - - Arguments: - filename: - path to file where the contents will be saved. - Accepts HDFS and local paths. - contents: - contents to save to the file. - Must be a string when encode is False. - encode: - False | 'json'. When encode='json', contents is encoded - with json.dumps. - ''' - if encode == 'json': - contents = json.dumps(contents) - elif not is_string(contents): - raise ValueError("Expecting string for encode=False") - - graph = tf.Graph() - with graph.as_default(): - write = tf.write_file(filename, contents) - - with tf.Session(graph=graph) as sess: - sess.run(write) - - -def read_file(filename, decode=False): - ''' - Reads contents from a file and optionally decodes it. - - Arguments: - filename: - path to file where the contents will be loaded from. - Accepts HDFS and local paths. - decode: - False | 'json'. When decode='json', contents is decoded - with json.loads. When False, contents is returned as is. 
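[Editor's note] A minimal sketch of dynamic_partition above, splitting a dict of hypothetical feature tensors with a boolean mask; partition 1 receives the rows where the mask is True:

    import tensorflow.compat.v1 as tf

    features = {
        "x": tf.constant([[1.0], [2.0], [3.0]]),
        "weight": tf.constant([0.1, 0.2, 0.3]),
    }
    is_test = tf.constant([False, True, False])  # cast to int32 internally

    train_part, test_part = dynamic_partition(features, is_test, num_partitions=2)
    # train_part["x"] holds rows 0 and 2 (mask == False); test_part["x"] holds row 1 (mask == True).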
- - Returns: - contents - ''' - graph = tf.Graph() - with graph.as_default(): - read = tf.read_file(filename) - - with tf.Session(graph=graph) as sess: - contents = (sess.run(read)) - # particular version of TF and/or Python may or may not perform decoding step from utf-8 to str - if not isinstance(contents, str): - contents = contents.decode() - - if decode == 'json': - contents = json.loads(contents) - - return contents - -def setup_tf_logging_formatter(): - formatter = _logging.Formatter( - '%(asctime)s [%(levelname)s] %(name)s: %(message)s', - None) - # Setting up absl logging verbosity - logging.set_verbosity('info') - logging.set_stderrthreshold('info') - logging.get_absl_handler().setFormatter(formatter) - tf.logging.set_verbosity(tf.logging.INFO) - # Set tensorflow logging handler format - if len(tf_logging.get_logger().handlers) > 0: - tf_logging.get_logger().handlers[0].setFormatter(formatter) - - -def set_tensorflow_log_level(log_level): - """ - Sets tensorflow's default logging level. - - 0. all logs are shown. - 1. filter out INFO logs. - 2. filter out WARNINGs and INFOs. - 3. filter out ERRORs, WARNINGs, and INFOs. - - Note that tf.Print output are INFO logs, so setting log_level above 0 would hide - output from tf.Print. - """ - assert isinstance(log_level, int) and log_level >= 0 and log_level <= 3 - os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(log_level) - - -def weighted_average(values, weights): - """ - Compute a weighted average using the given values and weights. - E.g. this is usually used to compute a weighted loss given sample weights. - """ - return tf.reduce_sum(tf.multiply(values, weights)) / tf.reduce_sum(weights) - - -def backup_checkpoint(checkpoint_path_prefix, - backup_path='backup', - empty_backup=True): - """ - Creates a backup copy of a checkpoint in backup_dir. - This function is used by the Trainer for early-stopping. - - Arguments: - checkpoint_path_prefix: - Prefix of the path to the checkpoint files. - backup_path: - path to a directory where checkpoint files will be backed up. - empty_backup: - When True (the default), the current contents of the backup directory - are removed before the backup is performed. - - Returns: - The number of backed up files. - """ - checkpoint_file_prefix = os.path.basename(checkpoint_path_prefix) - - if tf.io.gfile.exists(backup_path) and empty_backup: - tf.io.gfile.rmtree(backup_path) - - tf.io.gfile.mkdir(backup_path) - - n_backup = 0 - # copy all checkpoint files to backup directory (TODO use gfile.glob instead) - try: - checkpoint_files = tf.io.gfile.glob(checkpoint_path_prefix + "*") - if len(checkpoint_files) == 0: - raise twml.errors.CheckpointNotFoundError("%s not found" % checkpoint_path_prefix) - for filename in checkpoint_files: - n_backup += 1 - tf.io.gfile.copy( - src=filename, - dst=os.path.join(backup_path, os.path.basename(filename)) - ) - except tf.errors.OpError as ex: - raise twml.errors.CheckpointNotFoundError( - f"{str(ex)}\n {checkpoint_path_prefix} not found." - ) - - # tf.train.latest_checkpoint needs the 'checkpoint' file. - with tf.io.gfile.GFile(os.path.join(backup_path, 'checkpoint'), 'w') as f: - f.write('model_checkpoint_path: "%s"\n' % checkpoint_file_prefix) - - return n_backup - - -def set_only_checkpoint(source_path, dest_path, remove_source=True): - """ - Removes the checkpoint and model.ckpt* files from dest_path. - Moves the latest checkpoint from source_path to dest_path. - - Arguments: - source_path: - path to directory containing the latest checkpoint. 
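[Editor's note] A minimal round-trip sketch for write_file / read_file above, using a hypothetical HDFS path and encode/decode='json':

    # Hypothetical path; both helpers accept HDFS and local paths.
    path = "hdfs://default/user/example/params.json"
    write_file(path, {"learning_rate": 0.01}, encode="json")   # json.dumps before writing
    params = read_file(path, decode="json")                    # json.loads after reading
    assert params == {"learning_rate": 0.01}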
- Should contain a valid checkpoint file and model.ckpt files. - For early-stopping, this should be the save_dir/best_checkpoint dir. - dest_path: - path to directory where the latest checkpoint files will be moved. - All its checkpoint and model.ckpt* files will be removed. - For early-stopping, this should be the save_dir. - remove_source: - When True (the default), deletes the source directory. - Note that even when False, its checkpoint files are moved to - dest_path anyway. - This deletes the source directory (and any remaining contents). - """ - # make it so that source_path checkpoint is the only checkpoint - source_path_prefix = tf.train.latest_checkpoint(source_path) - if source_path_prefix is not None: - # remove intermediate checkpoints - for filename in tf.io.gfile.listdir(dest_path): - if filename.startswith("model.ckpt"): - tf.io.gfile.Remove(os.path.join(dest_path, filename)) - # move contents of source_path to dest_path - for filename in tf.io.gfile.listdir(source_path): - tf.io.gfile.rename( - oldname=os.path.join(source_path, filename), - newname=os.path.join(dest_path, filename), - overwrite=True) # overwrite "checkpoint" file - # delete the source_path dir - if remove_source: - tf.io.gfile.rmtree(source_path) - - -def list_files_by_datetime( - base_path, - start_datetime, - end_datetime=None, - datetime_prefix_format='%Y/%m/%d/%H', - extension='lzo', - parallelism=1, - hour_resolution=1, - sort=False -): - """List files matching `base_path/dt_prefix_format/*.extension` for the requested datetime range. - - Args: - base_path: - The base path. If `None`, returns `None`. - start_datetime: - A `datetime.datetime` or string representing the start of the range (inclusive). - If `None`, it returns `list_files(base_path, extension, sort)`. - end_datetime: - A `datetime.datetime` or string representing the end of the range (inclusive). - If `None`, assumed to be the same as start_datetime. - datetime_prefix_format: - Format compatible with `datetime.datetime.strftime` - (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior). - extension: - The extension of the files composing the dataset (e.g. 'lzo'). - parallelism: - The number of threads used to process list patterns (this is mostly useful - when dealing with filesystems such as HDFS in which listing files is a potentially expensive - operation). - hour_resolution: - The separation between consecutive hours. The default value is 1. - sort: - bool, whether to return a sorted list of files. Default False. - - Returns: - A list with all the matching files. - - Raises: - errors.OpError: If there are filesystem / directory listing errors. - """ - if hour_resolution is None: - hour_resolution = 1 - - if base_path is None: - return None - - if start_datetime is None: - return list_files(base_path, extension, sort) - - # Do this in case people want to use a single day for training. 
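[Editor's note] A minimal sketch of list_files_by_datetime above, listing one hypothetical day of hourly LZO partitions under an HDFS base path:

    # All paths and dates below are hypothetical.
    files = list_files_by_datetime(
        base_path="hdfs://default/user/example/dataset",
        start_datetime="2021/01/01/00",
        end_datetime="2021/01/01/23",
        datetime_prefix_format="%Y/%m/%d/%H",
        extension="lzo",
        parallelism=4,
        sort=True,
    )
    # Globs hdfs://default/user/example/dataset/2021/01/01/<HH>/*.lzo for each hour of the day.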
- if end_datetime is None: - end_datetime = start_datetime - - assert parallelism > 0 - assert start_datetime <= end_datetime - - if isinstance(start_datetime, str): - start_datetime = datetime.strptime(start_datetime, datetime_prefix_format) - - if isinstance(end_datetime, str): - end_datetime = datetime.strptime(end_datetime, datetime_prefix_format) - - assert isinstance(start_datetime, datetime) - assert isinstance(end_datetime, datetime) - - base_path = preprocess_path(base_path) - - def _handle_missing_globs(pattern): - try: - return tf.io.gfile.glob(pattern) - except tf.errors.NotFoundError as e: - tf.logging.warning(e.message) - return [] - - # a set is used because there might be some repeated globs depending on dt_prefix_format - globs = { - os.path.join(base_path, dt.strftime(datetime_prefix_format), '*.%s' % extension) - for dt in rrule.rrule( - freq=rrule.HOURLY, interval=hour_resolution, dtstart=start_datetime, until=end_datetime) - } - nested_files = Parallel(n_jobs=parallelism, backend='threading')( - delayed(_handle_missing_globs)(p) for p in globs - ) - flattened_files = list(itertools.chain.from_iterable(nested_files)) - - if not flattened_files: - error_msg = "Files list is empty: base_path={base_path}, start_datetime={start_datetime}, end_datetime={end_datetime}".format( - base_path=base_path, start_datetime=start_datetime, end_datetime=end_datetime - ) - raise OSError(error_msg) - - if sort: - flattened_files = sorted(flattened_files) - - return flattened_files - - -def limit_sparse_tensor_size(sparse_tf, input_size_bits, mask_indices=True): - """ - Returns a ``tf.SparseTensor`` which is the input SparseTensor - limited to the specified input_size_bits - - Args: - sparse_tf: - twml.SparseTensor or tf.SparseTensor - input_size_bits: - The number of bits allocated to the input size. - Input size will be power(2,input_size_bits). - Note that twml.limit_bits truncates any feature keys that - exceed the input size. - mask_indices: - If mask indices is False; only the shape is changed. Defaults to True. - """ - if isinstance(sparse_tf, twml.SparseTensor): - sparse_tf = sparse_tf.to_tf() - if not isinstance(sparse_tf, tf.SparseTensor): - raise TypeError('Input argument `sparse_tf` should either be of type' - 'twml.SparseTensor of tf.SparseTensor. Found type: {}'. - format(type(sparse_tf))) - if mask_indices: - indices = twml.limit_bits(sparse_tf.indices, input_size_bits) - else: - indices = sparse_tf.indices - dense_shape = tf.stack([sparse_tf.dense_shape[0], 1 << input_size_bits]) - return tf.SparseTensor(indices=indices, values=sparse_tf.values, - dense_shape=dense_shape) - - -def create_module_spec(mlp_fn, mode, params, drop_collections=None): - """ - Creates a standard tags_and_args which should be passed to the create_module_spec - spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args). - - Args: - module_fn: - a function to build a graph for the Module. - mode: - mode in which the Estimator is run - params: - parameters passed to the Estimator - """ - import tensorflow_hub as hub # noqa: F402 - tags_and_args = [(set(), {"params": params, "mode": mode}), # serving graph - ({"train"}, {"params": params, "mode": mode}) # training graph - ] - spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args, drop_collections=drop_collections) - return spec - - -def change_name_scope_from_dir(init_scope_name, final_scope_name, save_dir): - """ - Changes the name of the saved scope to the desired name and saves it - to the same save_dir. 
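[Editor's note] A minimal sketch of limit_sparse_tensor_size above, capping a hypothetical sparse input to 2**18 columns:

    import tensorflow.compat.v1 as tf

    sp = tf.SparseTensor(indices=[[0, 5], [1, 300000]],
                         values=[1.0, 1.0],
                         dense_shape=[2, 1 << 20])
    limited = limit_sparse_tensor_size(sp, input_size_bits=18)
    # dense_shape becomes [2, 1 << 18]; with mask_indices=True (the default) the
    # feature indices are truncated to 18 bits via twml.limit_bits.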
- - Args: - init_scope_name: - initial scope name - final_scope_name: - desired (final) scope name - save_dir: - directory which the scopes are saved - - In the follwing section we: - - Read all the variables from the latest checkpoint. - - Make a copy of the variables with new name scope. - - Store both sets of variables into the latest checkpoint. - This essentially doubles up the size of the checkpoint. - But when a job is restarted after this part is done, the checkpoint size doubles again. - To avoid doing this, we create a copy in backup if a backup isn't found. - This allows us always read (from backup) and write same sized checkpoint files. - """ - - # Create a backup_checkpoints dir - backup_dir = os.path.join(save_dir, "change_name_scope_backups") - tf.io.gfile.makedirs(backup_dir) - - latest_checkpoint = tf.train.latest_checkpoint(save_dir) - - if latest_checkpoint is None: - raise OSError("No checkpoints found in save_dir: %s" % save_dir) - - latest_backup_checkpoint = tf.train.latest_checkpoint(backup_dir) - - if (latest_backup_checkpoint is None or - (os.path.basename(latest_checkpoint) != - os.path.basename(latest_backup_checkpoint))): - backup_checkpoint(latest_checkpoint, backup_dir, empty_backup=False) - - variables = tf.train.list_variables(backup_dir) - with tf.Graph().as_default(), tf.Session().as_default() as sess: - new_variables = [] - for name, _ in variables: - var = tf.train.load_variable(backup_dir, name) - # Append both the rename and the original variable - new_variables.append( - tf.Variable(var, name=name.replace(init_scope_name, final_scope_name))) - new_variables.append(tf.Variable(var, name=name)) - # Save this to the checkpoint in the save_dir - saver = tf.train.Saver(new_variables) - sess.run(tf.global_variables_initializer()) - saver.save(sess, latest_checkpoint) # pylint: disable=no-member - - -def hub_import(input, module, module_name, trainable=False): - """ - Loads exported hub module. - - Args: - input: - input to hub module - module: - module path - module_name: - signature of the exported hub module - """ - import tensorflow_hub as hub # noqa: F402 - hub_module = hub.Module(module, trainable=trainable) - output = hub_module(input, signature=module_name) - return output - - -def _extract_hash_space_bits(feature_config): - """ - Extract Sparse Shapes for contrib.FeatureConfig. - Arguments: - feature_config: - Feature Configuration of the type contrib.FeatureConfig - Returns: - Dictionary of tensor names and hash space bits. - """ - if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig): - fc_type = type(feature_config) - raise TypeError(f"Feature config must be of type contrib.FeatureConfig: {fc_type}") - sparse_shapes_dict = {} - for config in feature_config.sparse_extraction_configs: - sparse_shapes_dict[config.output_name] = config.hash_space_bits - return sparse_shapes_dict - - -def fix_shape_sparse(features, feature_config): - """ - Modifies the shape of features which are extracted using the hashing trick. - Features itself is changed by this function. 
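[Editor's note] A minimal sketch of hub_import above; the module path and signature name are hypothetical:

    # Load an exported TF-Hub module and apply its "mlp" signature to some features.
    output = hub_import(
        input=features,               # hypothetical input tensor(s)
        module="hdfs://default/user/example/exported_hub_module",
        module_name="mlp",            # hypothetical exported signature name
        trainable=False,
    )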
- Arguments: - features: - Feature dictionary extracted by the feature config - feature_config: - Feature Configuration of the type contrib.FeatureConfig - """ - if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig): - raise TypeError(f"Feature config must be of type contrib.FeatureConfig, currently of {type(feature_config)}") - sparse_shape = _extract_hash_space_bits(feature_config) - if not isinstance(features, dict): - raise TypeError(f"features must be of dictionary type, it is of {type(features)} type") - for key in set(features) & set(sparse_shape): - features[key] = limit_sparse_tensor_size(features[key], sparse_shape[key], mask_indices=False) - - -def touch_file_in_dir(directory, filename): - """ - Creates a file named filename in directory. - - Arguments: - filename: (str) - directory: (str) - """ - file_path = os.path.join(directory, filename) - with tf.io.gfile.GFile(file_path, "w") as f: - f.write("") - - -def file_exist_in_dir(directory: str, filename: str) -> bool: - file_path = os.path.join(directory, filename) - return tf.io.gfile.exists(file_path) - - -def copy_to_local(remote, local, filename, overwrite=False): - """Function to file from remote directory to local directory.""" - assert "hdfs://" not in local - tf.io.gfile.makedirs(local) - return tf.io.gfile.copy( - os.path.join(remote, filename), - os.path.join(local, filename), - overwrite=overwrite, - ) - - -def copy_recursive(src, dst, overwrite=False): - """ - Function to copy a directory recursively. - - Arguments: - src: Source directory. - dst: Destination directory. - overwrite: Specifies if files are to be overwritten if they exist. - """ - - src = src.rstrip("/") - dst = dst.rstrip("/") - - for dirname, subdirs, files in tf.io.gfile.walk(src): - dst_dirname = dirname.replace(src, dst) - tf.io.gfile.makedirs(dst_dirname) - - for f in files: - src_f = os.path.join(dirname, f) - dst_f = os.path.join(dst_dirname, f) - - tf.logging.info(f"Copying {src_f} to {dst_f}") - tf.io.gfile.copy(src_f, dst_f, overwrite=overwrite) - - -def delete_file_or_dir(path): - """ - Delete the file or directory given by `path` - Arguments: - path: - string indicating path of file or directory to remove - """ - if tf.io.gfile.isdir(path): - tf.io.gfile.rmtree(path) - else: - tf.io.gfile.remove(path) - - -def get_distributed_training_job_path(): - """ - Function to get distributed training job path. - Note: distributed training has three jobs, one parameter server job, - one worker job and one evaluator job. All of these three jobs' name - share a common base job name. - """ - job_path = AuroraPath(dc=os.environ.get("TWML_JOB_CLUSTER"), - role=os.environ.get("TWML_JOB_ROLE"), - env=os.environ.get("TWML_JOB_ENV"), - job_name=os.environ.get("TWML_DISTRIBUTED_BASE_JOBNAME")) - return job_path - -def do_every_n_steps(action, num_steps): - """ - Execute a sequence of TensorFlow operations only once in a while. - Specifically, `action` is performed if `global_step` is a - multiple of `num_steps` - - Args: - action: callable to be performed at regular intervals. This callable - must return a TF op with no output tensors. - num_steps: period of performing the action, as measured - in number of training steps - - Returns: - A TensorFlow op with no output tensors, like a tf.print() or tf.no_op(). - You must use tf.control_dependencies() to execute the op. 
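[Editor's note] A minimal sketch of do_every_n_steps above, assuming a hypothetical training setup; the returned op only takes effect under tf.control_dependencies, as the docstring notes:

    import tensorflow.compat.v1 as tf

    print_op = do_every_n_steps(
        action=lambda: tf.print("global step hit a multiple of 100"),
        num_steps=100,
    )
    with tf.control_dependencies([print_op]):
        train_op = tf.no_op(name="train_with_periodic_print")  # stand-in for a real train op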
- - """ - global_step = tf.train.get_or_create_global_step() - condition = tf.math.equal(tf.math.floormod(global_step, num_steps), 0) - return tf.cond(condition, action, lambda: tf.no_op()) diff --git a/twml/twml_common/__init__.docx b/twml/twml_common/__init__.docx new file mode 100644 index 000000000..1205dbb1d Binary files /dev/null and b/twml/twml_common/__init__.docx differ diff --git a/twml/twml_common/__init__.py b/twml/twml_common/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/twml/twml_common/initializer.docx b/twml/twml_common/initializer.docx new file mode 100644 index 000000000..f5ed5ccba Binary files /dev/null and b/twml/twml_common/initializer.docx differ diff --git a/twml/twml_common/initializer.py b/twml/twml_common/initializer.py deleted file mode 100644 index 7a9c734c7..000000000 --- a/twml/twml_common/initializer.py +++ /dev/null @@ -1,14 +0,0 @@ -import tensorflow.compat.v1 as tf - - -class PartitionInitializer(tf.keras.initializers.Initializer): - """Required to initialize partitioned weight with numpy array for tests""" - - def __init__(self, np_array): - self.np_array = np_array - - def __call__(self, shape, dtype=None, partition_info=None): - offset = partition_info.var_offset - ix0, ix1 = offset[0], offset[0] + shape[0] - iy0, iy1 = offset[1], offset[1] + shape[1] - return self.np_array[ix0:ix1, iy0:iy1] diff --git a/twml/twml_common/serialize.docx b/twml/twml_common/serialize.docx new file mode 100644 index 000000000..e060de421 Binary files /dev/null and b/twml/twml_common/serialize.docx differ diff --git a/twml/twml_common/serialize.py b/twml/twml_common/serialize.py deleted file mode 100644 index 36c53881e..000000000 --- a/twml/twml_common/serialize.py +++ /dev/null @@ -1,16 +0,0 @@ -from thrift.protocol import TBinaryProtocol -from thrift.transport import TTransport - - -def serialize(obj): - tbuf = TTransport.TMemoryBuffer() - iproto = TBinaryProtocol.TBinaryProtocol(tbuf) - obj.write(iproto) - return tbuf.getvalue() - - -def deserialize(record, bytes): - tbuf = TTransport.TMemoryBuffer(bytes) - iproto = TBinaryProtocol.TBinaryProtocol(tbuf) - record.read(iproto) - return record diff --git a/twml/twml_common/sparse_inputs.docx b/twml/twml_common/sparse_inputs.docx new file mode 100644 index 000000000..85195590c Binary files /dev/null and b/twml/twml_common/sparse_inputs.docx differ diff --git a/twml/twml_common/sparse_inputs.py b/twml/twml_common/sparse_inputs.py deleted file mode 100644 index b8f7939e5..000000000 --- a/twml/twml_common/sparse_inputs.py +++ /dev/null @@ -1,24 +0,0 @@ -import numpy as np -import tensorflow.compat.v1 as tf - - -def create_sparse_tensor(batch_size, input_size, num_values, dtype=tf.float32): - random_indices = np.sort(np.random.randint(batch_size * input_size, size=num_values)) - test_indices_i = random_indices // input_size - test_indices_j = random_indices % input_size - test_indices = np.stack([test_indices_i, test_indices_j], axis=1) - test_values = np.random.random(num_values).astype(dtype.as_numpy_dtype) - - return tf.SparseTensor(indices=tf.constant(test_indices), - values=tf.constant(test_values), - dense_shape=(batch_size, input_size)) - - -def create_reference_input(sparse_input, use_binary_values): - if use_binary_values: - sp_a = tf.SparseTensor(indices=sparse_input.indices, - values=tf.ones_like(sparse_input.values), - dense_shape=sparse_input.dense_shape) - else: - sp_a = sparse_input - return sp_a diff --git a/unified_user_actions/.gitignore b/unified_user_actions/.gitignore deleted 
file mode 100644 index e98c1bb78..000000000 --- a/unified_user_actions/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -.DS_Store -CONFIG.ini -PROJECT -docs diff --git a/unified_user_actions/.gitignore.docx b/unified_user_actions/.gitignore.docx new file mode 100644 index 000000000..0cc9db827 Binary files /dev/null and b/unified_user_actions/.gitignore.docx differ diff --git a/unified_user_actions/BUILD.bazel b/unified_user_actions/BUILD.bazel deleted file mode 100644 index 1624a57d4..000000000 --- a/unified_user_actions/BUILD.bazel +++ /dev/null @@ -1 +0,0 @@ -# This prevents SQ query from grabbing //:all since it traverses up once to find a BUILD diff --git a/unified_user_actions/BUILD.docx b/unified_user_actions/BUILD.docx new file mode 100644 index 000000000..b64ed63b8 Binary files /dev/null and b/unified_user_actions/BUILD.docx differ diff --git a/unified_user_actions/README.docx b/unified_user_actions/README.docx new file mode 100644 index 000000000..4fb7a9210 Binary files /dev/null and b/unified_user_actions/README.docx differ diff --git a/unified_user_actions/README.md b/unified_user_actions/README.md deleted file mode 100644 index 4211e7ade..000000000 --- a/unified_user_actions/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Unified User Actions (UUA) - -**Unified User Actions** (UUA) is a centralized, real-time stream of user actions on Twitter, consumed by various product, ML, and marketing teams. UUA reads client-side and server-side event streams that contain the user's actions and generates a unified real-time user actions Kafka stream. The Kafka stream is replicated to HDFS, GCP Pubsub, GCP GCS, GCP BigQuery. The user actions include public actions such as favorites, retweets, replies and implicit actions like bookmark, impression, video view. - -## Components - -- adapter: transform the raw inputs to UUA Thrift output -- client: Kafka client related utils -- kafka: more specific Kafka utils like customized serde -- service: deployment, modules and services \ No newline at end of file diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/AbstractAdapter.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/AbstractAdapter.docx new file mode 100644 index 000000000..2a0564655 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/AbstractAdapter.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/AbstractAdapter.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/AbstractAdapter.scala deleted file mode 100644 index 385a3d23d..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/AbstractAdapter.scala +++ /dev/null @@ -1,19 +0,0 @@ -package com.twitter.unified_user_actions.adapter - -import com.twitter.finagle.stats.NullStatsReceiver -import com.twitter.finagle.stats.StatsReceiver - -trait AbstractAdapter[INPUT, OUTK, OUTV] extends Serializable { - - /** - * The basic input -> seq[output] adapter which concrete adapters should extend from - * @param input a single INPUT - * @return A list of (OUTK, OUTV) tuple. The OUTK is the output key mainly for publishing to Kafka (or Pubsub). - * If other processing, e.g. 
offline batch processing, doesn't require the output key then it can drop it - * like source.adaptOneToKeyedMany.map(_._2) - */ - def adaptOneToKeyedMany( - input: INPUT, - statsReceiver: StatsReceiver = NullStatsReceiver - ): Seq[(OUTK, OUTV)] -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/BUILD b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/BUILD deleted file mode 100644 index a6ef069c4..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/BUILD +++ /dev/null @@ -1,11 +0,0 @@ -scala_library( - name = "base", - sources = [ - "AbstractAdapter.scala", - ], - compiler_option_sets = ["fatal_warnings"], - tags = ["bazel-compatible"], - dependencies = [ - "util/util-stats/src/main/scala/com/twitter/finagle/stats", - ], -) diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/BUILD.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/BUILD.docx new file mode 100644 index 000000000..c4c75f2ab Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/BUILD.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagement.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagement.docx new file mode 100644 index 000000000..ad26ab833 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagement.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagement.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagement.scala deleted file mode 100644 index 41db74b4b..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagement.scala +++ /dev/null @@ -1,125 +0,0 @@ -package com.twitter.unified_user_actions.adapter.ads_callback_engagements - -import com.twitter.ads.spendserver.thriftscala.SpendServerEvent -import com.twitter.unified_user_actions.thriftscala._ - -object AdsCallbackEngagement { - object PromotedTweetFav extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetFav) - - object PromotedTweetUnfav extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetUnfav) - - object PromotedTweetReply extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetReply) - - object PromotedTweetRetweet - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetRetweet) - - object PromotedTweetBlockAuthor - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetBlockAuthor) - - object PromotedTweetUnblockAuthor - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetUnblockAuthor) - - object PromotedTweetComposeTweet - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetComposeTweet) - - object PromotedTweetClick extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetClick) - - object PromotedTweetReport extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetReport) - - object PromotedProfileFollow - extends 
ProfileAdsCallbackEngagement(ActionType.ServerPromotedProfileFollow) - - object PromotedProfileUnfollow - extends ProfileAdsCallbackEngagement(ActionType.ServerPromotedProfileUnfollow) - - object PromotedTweetMuteAuthor - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetMuteAuthor) - - object PromotedTweetClickProfile - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetClickProfile) - - object PromotedTweetClickHashtag - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetClickHashtag) - - object PromotedTweetOpenLink - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetOpenLink) { - override def getItem(input: SpendServerEvent): Option[Item] = { - input.engagementEvent.flatMap { e => - e.impressionData.flatMap { i => - getPromotedTweetInfo( - i.promotedTweetId, - i.advertiserId, - tweetActionInfoOpt = Some( - TweetActionInfo.ServerPromotedTweetOpenLink( - ServerPromotedTweetOpenLink(url = e.url)))) - } - } - } - } - - object PromotedTweetCarouselSwipeNext - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetCarouselSwipeNext) - - object PromotedTweetCarouselSwipePrevious - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetCarouselSwipePrevious) - - object PromotedTweetLingerImpressionShort - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetLingerImpressionShort) - - object PromotedTweetLingerImpressionMedium - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetLingerImpressionMedium) - - object PromotedTweetLingerImpressionLong - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetLingerImpressionLong) - - object PromotedTweetClickSpotlight - extends BaseTrendAdsCallbackEngagement(ActionType.ServerPromotedTweetClickSpotlight) - - object PromotedTweetViewSpotlight - extends BaseTrendAdsCallbackEngagement(ActionType.ServerPromotedTweetViewSpotlight) - - object PromotedTrendView - extends BaseTrendAdsCallbackEngagement(ActionType.ServerPromotedTrendView) - - object PromotedTrendClick - extends BaseTrendAdsCallbackEngagement(ActionType.ServerPromotedTrendClick) - - object PromotedTweetVideoPlayback25 - extends BaseVideoAdsCallbackEngagement(ActionType.ServerPromotedTweetVideoPlayback25) - - object PromotedTweetVideoPlayback50 - extends BaseVideoAdsCallbackEngagement(ActionType.ServerPromotedTweetVideoPlayback50) - - object PromotedTweetVideoPlayback75 - extends BaseVideoAdsCallbackEngagement(ActionType.ServerPromotedTweetVideoPlayback75) - - object PromotedTweetVideoAdPlayback25 - extends BaseVideoAdsCallbackEngagement(ActionType.ServerPromotedTweetVideoAdPlayback25) - - object PromotedTweetVideoAdPlayback50 - extends BaseVideoAdsCallbackEngagement(ActionType.ServerPromotedTweetVideoAdPlayback50) - - object PromotedTweetVideoAdPlayback75 - extends BaseVideoAdsCallbackEngagement(ActionType.ServerPromotedTweetVideoAdPlayback75) - - object TweetVideoAdPlayback25 - extends BaseVideoAdsCallbackEngagement(ActionType.ServerTweetVideoAdPlayback25) - - object TweetVideoAdPlayback50 - extends BaseVideoAdsCallbackEngagement(ActionType.ServerTweetVideoAdPlayback50) - - object TweetVideoAdPlayback75 - extends BaseVideoAdsCallbackEngagement(ActionType.ServerTweetVideoAdPlayback75) - - object PromotedTweetDismissWithoutReason - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetDismissWithoutReason) - - object PromotedTweetDismissUninteresting - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetDismissUninteresting) - - object PromotedTweetDismissRepetitive - 
extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetDismissRepetitive) - - object PromotedTweetDismissSpam - extends BaseAdsCallbackEngagement(ActionType.ServerPromotedTweetDismissSpam) -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagementsAdapter.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagementsAdapter.docx new file mode 100644 index 000000000..de08c6788 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagementsAdapter.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagementsAdapter.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagementsAdapter.scala deleted file mode 100644 index f59ee9e48..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/AdsCallbackEngagementsAdapter.scala +++ /dev/null @@ -1,28 +0,0 @@ -package com.twitter.unified_user_actions.adapter.ads_callback_engagements - -import com.twitter.finagle.stats.NullStatsReceiver -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.finatra.kafka.serde.UnKeyed -import com.twitter.unified_user_actions.adapter.AbstractAdapter -import com.twitter.ads.spendserver.thriftscala.SpendServerEvent -import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction - -class AdsCallbackEngagementsAdapter - extends AbstractAdapter[SpendServerEvent, UnKeyed, UnifiedUserAction] { - - import AdsCallbackEngagementsAdapter._ - - override def adaptOneToKeyedMany( - input: SpendServerEvent, - statsReceiver: StatsReceiver = NullStatsReceiver - ): Seq[(UnKeyed, UnifiedUserAction)] = - adaptEvent(input).map { e => (UnKeyed, e) } -} - -object AdsCallbackEngagementsAdapter { - def adaptEvent(input: SpendServerEvent): Seq[UnifiedUserAction] = { - val baseEngagements: Seq[BaseAdsCallbackEngagement] = - EngagementTypeMappings.getEngagementMappings(Option(input).flatMap(_.engagementEvent)) - baseEngagements.flatMap(_.getUUA(input)) - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BUILD b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BUILD deleted file mode 100644 index e945f872a..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -scala_library( - sources = [ - "*.scala", - ], - compiler_option_sets = ["fatal_warnings"], - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "kafka/finagle-kafka/finatra-kafka/src/main/scala", - "src/thrift/com/twitter/ads/billing/spendserver:spendserver_thrift-scala", - "src/thrift/com/twitter/ads/eventstream:eventstream-scala", - "unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter:base", - "unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/common", - "unified_user_actions/thrift/src/main/thrift/com/twitter/unified_user_actions:unified_user_actions-scala", - ], -) diff --git 
a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BUILD.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BUILD.docx new file mode 100644 index 000000000..fdb1333a1 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BUILD.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseAdsCallbackEngagement.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseAdsCallbackEngagement.docx new file mode 100644 index 000000000..5dbca4139 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseAdsCallbackEngagement.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseAdsCallbackEngagement.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseAdsCallbackEngagement.scala deleted file mode 100644 index 2cefd7af3..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseAdsCallbackEngagement.scala +++ /dev/null @@ -1,68 +0,0 @@ -package com.twitter.unified_user_actions.adapter.ads_callback_engagements - -import com.twitter.ads.spendserver.thriftscala.SpendServerEvent -import com.twitter.unified_user_actions.adapter.common.AdapterUtils -import com.twitter.unified_user_actions.thriftscala.ActionType -import com.twitter.unified_user_actions.thriftscala.AuthorInfo -import com.twitter.unified_user_actions.thriftscala.EventMetadata -import com.twitter.unified_user_actions.thriftscala.Item -import com.twitter.unified_user_actions.thriftscala.SourceLineage -import com.twitter.unified_user_actions.thriftscala.TweetInfo -import com.twitter.unified_user_actions.thriftscala.TweetActionInfo -import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction -import com.twitter.unified_user_actions.thriftscala.UserIdentifier - -abstract class BaseAdsCallbackEngagement(actionType: ActionType) { - - protected def getItem(input: SpendServerEvent): Option[Item] = { - input.engagementEvent.flatMap { e => - e.impressionData.flatMap { i => - getPromotedTweetInfo(i.promotedTweetId, i.advertiserId) - } - } - } - - protected def getPromotedTweetInfo( - promotedTweetIdOpt: Option[Long], - advertiserId: Long, - tweetActionInfoOpt: Option[TweetActionInfo] = None - ): Option[Item] = { - promotedTweetIdOpt.map { promotedTweetId => - Item.TweetInfo( - TweetInfo( - actionTweetId = promotedTweetId, - actionTweetAuthorInfo = Some(AuthorInfo(authorId = Some(advertiserId))), - tweetActionInfo = tweetActionInfoOpt) - ) - } - } - - def getUUA(input: SpendServerEvent): Option[UnifiedUserAction] = { - val userIdentifier: UserIdentifier = - UserIdentifier( - userId = input.engagementEvent.flatMap(e => e.clientInfo.flatMap(_.userId64)), - guestIdMarketing = input.engagementEvent.flatMap(e => e.clientInfo.flatMap(_.guestId)), - ) - - getItem(input).map { item => - UnifiedUserAction( - userIdentifier = userIdentifier, - item = item, - actionType = actionType, - eventMetadata = getEventMetadata(input), - ) - } - } - - protected def getEventMetadata(input: SpendServerEvent): 
EventMetadata = - EventMetadata( - sourceTimestampMs = input.engagementEvent - .map { e => e.engagementEpochTimeMilliSec }.getOrElse(AdapterUtils.currentTimestampMs), - receivedTimestampMs = AdapterUtils.currentTimestampMs, - sourceLineage = SourceLineage.ServerAdsCallbackEngagements, - language = input.engagementEvent.flatMap { e => e.clientInfo.flatMap(_.languageCode) }, - countryCode = input.engagementEvent.flatMap { e => e.clientInfo.flatMap(_.countryCode) }, - clientAppId = - input.engagementEvent.flatMap { e => e.clientInfo.flatMap(_.clientId) }.map { _.toLong }, - ) -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseTrendAdsCallbackEngagement.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseTrendAdsCallbackEngagement.docx new file mode 100644 index 000000000..a1e93e44d Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseTrendAdsCallbackEngagement.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseTrendAdsCallbackEngagement.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseTrendAdsCallbackEngagement.scala deleted file mode 100644 index 494e2ba10..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseTrendAdsCallbackEngagement.scala +++ /dev/null @@ -1,18 +0,0 @@ -package com.twitter.unified_user_actions.adapter.ads_callback_engagements - -import com.twitter.ads.spendserver.thriftscala.SpendServerEvent -import com.twitter.unified_user_actions.thriftscala._ - -abstract class BaseTrendAdsCallbackEngagement(actionType: ActionType) - extends BaseAdsCallbackEngagement(actionType = actionType) { - - override protected def getItem(input: SpendServerEvent): Option[Item] = { - input.engagementEvent.flatMap { e => - e.impressionData.flatMap { i => - i.promotedTrendId.map { promotedTrendId => - Item.TrendInfo(TrendInfo(actionTrendId = promotedTrendId)) - } - } - } - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseVideoAdsCallbackEngagement.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseVideoAdsCallbackEngagement.docx new file mode 100644 index 000000000..8a2360037 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseVideoAdsCallbackEngagement.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseVideoAdsCallbackEngagement.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseVideoAdsCallbackEngagement.scala deleted file mode 100644 index 8fead0888..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/BaseVideoAdsCallbackEngagement.scala +++ /dev/null @@ -1,54 +0,0 @@ -package com.twitter.unified_user_actions.adapter.ads_callback_engagements - -import com.twitter.ads.spendserver.thriftscala.SpendServerEvent -import 
com.twitter.unified_user_actions.thriftscala.ActionType -import com.twitter.unified_user_actions.thriftscala.AuthorInfo -import com.twitter.unified_user_actions.thriftscala.TweetVideoWatch -import com.twitter.unified_user_actions.thriftscala.Item -import com.twitter.unified_user_actions.thriftscala.TweetActionInfo -import com.twitter.unified_user_actions.thriftscala.TweetInfo - -abstract class BaseVideoAdsCallbackEngagement(actionType: ActionType) - extends BaseAdsCallbackEngagement(actionType = actionType) { - - override def getItem(input: SpendServerEvent): Option[Item] = { - input.engagementEvent.flatMap { e => - e.impressionData.flatMap { i => - getTweetInfo(i.promotedTweetId, i.organicTweetId, i.advertiserId, input) - } - } - } - - private def getTweetInfo( - promotedTweetId: Option[Long], - organicTweetId: Option[Long], - advertiserId: Long, - input: SpendServerEvent - ): Option[Item] = { - val actionedTweetIdOpt: Option[Long] = - if (promotedTweetId.isEmpty) organicTweetId else promotedTweetId - actionedTweetIdOpt.map { actionTweetId => - Item.TweetInfo( - TweetInfo( - actionTweetId = actionTweetId, - actionTweetAuthorInfo = Some(AuthorInfo(authorId = Some(advertiserId))), - tweetActionInfo = Some( - TweetActionInfo.TweetVideoWatch( - TweetVideoWatch( - isMonetizable = Some(true), - videoOwnerId = input.engagementEvent - .flatMap(e => e.cardEngagement).flatMap(_.amplifyDetails).flatMap(_.videoOwnerId), - videoUuid = input.engagementEvent - .flatMap(_.cardEngagement).flatMap(_.amplifyDetails).flatMap(_.videoUuid), - prerollOwnerId = input.engagementEvent - .flatMap(e => e.cardEngagement).flatMap(_.amplifyDetails).flatMap( - _.prerollOwnerId), - prerollUuid = input.engagementEvent - .flatMap(_.cardEngagement).flatMap(_.amplifyDetails).flatMap(_.prerollUuid) - )) - ) - ), - ) - } - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/EngagementTypeMappings.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/EngagementTypeMappings.docx new file mode 100644 index 000000000..d31e0cfcb Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/EngagementTypeMappings.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/EngagementTypeMappings.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/EngagementTypeMappings.scala deleted file mode 100644 index 9700a1ef1..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/EngagementTypeMappings.scala +++ /dev/null @@ -1,69 +0,0 @@ -package com.twitter.unified_user_actions.adapter.ads_callback_engagements - -import com.twitter.ads.eventstream.thriftscala.EngagementEvent -import com.twitter.adserver.thriftscala.EngagementType -import com.twitter.unified_user_actions.adapter.ads_callback_engagements.AdsCallbackEngagement._ - -object EngagementTypeMappings { - - /** - * Ads could be Tweets or non-Tweets. Since UUA explicitly sets the item type, it is - * possible that one Ads Callback engagement type maps to multiple UUA action types. 
- */ - def getEngagementMappings( - engagementEvent: Option[EngagementEvent] - ): Seq[BaseAdsCallbackEngagement] = { - val promotedTweetId: Option[Long] = - engagementEvent.flatMap(_.impressionData).flatMap(_.promotedTweetId) - engagementEvent - .map(event => - event.engagementType match { - case EngagementType.Fav => Seq(PromotedTweetFav) - case EngagementType.Unfav => Seq(PromotedTweetUnfav) - case EngagementType.Reply => Seq(PromotedTweetReply) - case EngagementType.Retweet => Seq(PromotedTweetRetweet) - case EngagementType.Block => Seq(PromotedTweetBlockAuthor) - case EngagementType.Unblock => Seq(PromotedTweetUnblockAuthor) - case EngagementType.Send => Seq(PromotedTweetComposeTweet) - case EngagementType.Detail => Seq(PromotedTweetClick) - case EngagementType.Report => Seq(PromotedTweetReport) - case EngagementType.Follow => Seq(PromotedProfileFollow) - case EngagementType.Unfollow => Seq(PromotedProfileUnfollow) - case EngagementType.Mute => Seq(PromotedTweetMuteAuthor) - case EngagementType.ProfilePic => Seq(PromotedTweetClickProfile) - case EngagementType.ScreenName => Seq(PromotedTweetClickProfile) - case EngagementType.UserName => Seq(PromotedTweetClickProfile) - case EngagementType.Hashtag => Seq(PromotedTweetClickHashtag) - case EngagementType.Url => Seq(PromotedTweetOpenLink) - case EngagementType.CarouselSwipeNext => Seq(PromotedTweetCarouselSwipeNext) - case EngagementType.CarouselSwipePrevious => Seq(PromotedTweetCarouselSwipePrevious) - case EngagementType.DwellShort => Seq(PromotedTweetLingerImpressionShort) - case EngagementType.DwellMedium => Seq(PromotedTweetLingerImpressionMedium) - case EngagementType.DwellLong => Seq(PromotedTweetLingerImpressionLong) - case EngagementType.SpotlightClick => Seq(PromotedTweetClickSpotlight) - case EngagementType.SpotlightView => Seq(PromotedTweetViewSpotlight) - case EngagementType.TrendView => Seq(PromotedTrendView) - case EngagementType.TrendClick => Seq(PromotedTrendClick) - case EngagementType.VideoContentPlayback25 => Seq(PromotedTweetVideoPlayback25) - case EngagementType.VideoContentPlayback50 => Seq(PromotedTweetVideoPlayback50) - case EngagementType.VideoContentPlayback75 => Seq(PromotedTweetVideoPlayback75) - case EngagementType.VideoAdPlayback25 if promotedTweetId.isDefined => - Seq(PromotedTweetVideoAdPlayback25) - case EngagementType.VideoAdPlayback25 if promotedTweetId.isEmpty => - Seq(TweetVideoAdPlayback25) - case EngagementType.VideoAdPlayback50 if promotedTweetId.isDefined => - Seq(PromotedTweetVideoAdPlayback50) - case EngagementType.VideoAdPlayback50 if promotedTweetId.isEmpty => - Seq(TweetVideoAdPlayback50) - case EngagementType.VideoAdPlayback75 if promotedTweetId.isDefined => - Seq(PromotedTweetVideoAdPlayback75) - case EngagementType.VideoAdPlayback75 if promotedTweetId.isEmpty => - Seq(TweetVideoAdPlayback75) - case EngagementType.DismissRepetitive => Seq(PromotedTweetDismissRepetitive) - case EngagementType.DismissSpam => Seq(PromotedTweetDismissSpam) - case EngagementType.DismissUninteresting => Seq(PromotedTweetDismissUninteresting) - case EngagementType.DismissWithoutReason => Seq(PromotedTweetDismissWithoutReason) - case _ => Nil - }).toSeq.flatten - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/ProfileAdsCallbackEngagement.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/ProfileAdsCallbackEngagement.docx new file mode 100644 index 000000000..85863e9ab 
Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/ProfileAdsCallbackEngagement.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/ProfileAdsCallbackEngagement.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/ProfileAdsCallbackEngagement.scala deleted file mode 100644 index 86633d3db..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/ads_callback_engagements/ProfileAdsCallbackEngagement.scala +++ /dev/null @@ -1,26 +0,0 @@ -package com.twitter.unified_user_actions.adapter.ads_callback_engagements - -import com.twitter.ads.spendserver.thriftscala.SpendServerEvent -import com.twitter.unified_user_actions.thriftscala.ActionType -import com.twitter.unified_user_actions.thriftscala.Item -import com.twitter.unified_user_actions.thriftscala.ProfileInfo - -abstract class ProfileAdsCallbackEngagement(actionType: ActionType) - extends BaseAdsCallbackEngagement(actionType) { - - override protected def getItem(input: SpendServerEvent): Option[Item] = { - input.engagementEvent.flatMap { e => - e.impressionData.flatMap { i => - getProfileInfo(i.advertiserId) - } - } - } - - protected def getProfileInfo(advertiserId: Long): Option[Item] = { - Some( - Item.ProfileInfo( - ProfileInfo( - actionProfileId = advertiserId - ))) - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BUILD b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BUILD deleted file mode 100644 index e8f741e78..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -scala_library( - sources = [ - "*.scala", - ], - tags = ["bazel-compatible"], - dependencies = [ - "common-internal/analytics/client-analytics-data-layer/src/main/scala", - "kafka/finagle-kafka/finatra-kafka/src/main/scala", - "src/scala/com/twitter/loggedout/analytics/common", - "src/thrift/com/twitter/clientapp/gen:clientapp-scala", - "twadoop_config/configuration/log_categories/group/scribelib:client_event-scala", - "unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter:base", - "unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/common", - "unified_user_actions/thrift/src/main/thrift/com/twitter/unified_user_actions:unified_user_actions-scala", - ], -) diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BUILD.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BUILD.docx new file mode 100644 index 000000000..eb9494c3a Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BUILD.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCTAClientEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCTAClientEvent.docx new file mode 100644 index 000000000..e435bcea1 Binary files /dev/null and 
b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCTAClientEvent.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCTAClientEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCTAClientEvent.scala deleted file mode 100644 index d1a47db26..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCTAClientEvent.scala +++ /dev/null @@ -1,46 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.logbase.thriftscala.LogBase -import com.twitter.unified_user_actions.thriftscala.ActionType -import com.twitter.unified_user_actions.thriftscala.Item -import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction -import com.twitter.unified_user_actions.thriftscala._ -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} - -abstract class BaseCTAClientEvent(actionType: ActionType) - extends BaseClientEvent(actionType = actionType) { - - override def toUnifiedUserAction(logEvent: LogEvent): Seq[UnifiedUserAction] = { - val logBase: Option[LogBase] = logEvent.logBase - val userIdentifier: UserIdentifier = UserIdentifier( - userId = logBase.flatMap(_.userId), - guestIdMarketing = logBase.flatMap(_.guestIdMarketing)) - val uuaItem: Item = Item.CtaInfo(CTAInfo()) - val eventTimestamp = logBase.flatMap(getSourceTimestamp).getOrElse(0L) - val ceItem = LogEventItem.unsafeEmpty - - val productSurface: Option[ProductSurface] = ProductSurfaceUtils - .getProductSurface(logEvent.eventNamespace) - - val eventMetaData: EventMetadata = ClientEventCommonUtils - .getEventMetadata( - eventTimestamp = eventTimestamp, - logEvent = logEvent, - ceItem = ceItem, - productSurface = productSurface - ) - - Seq( - UnifiedUserAction( - userIdentifier = userIdentifier, - item = uuaItem, - actionType = actionType, - eventMetadata = eventMetaData, - productSurface = productSurface, - productSurfaceInfo = - ProductSurfaceUtils.getProductSurfaceInfo(productSurface, ceItem, logEvent) - )) - } - -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCardClientEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCardClientEvent.docx new file mode 100644 index 000000000..05849481d Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCardClientEvent.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCardClientEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCardClientEvent.scala deleted file mode 100644 index 63235304e..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseCardClientEvent.scala +++ /dev/null @@ -1,26 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.clientapp.thriftscala.ItemType -import com.twitter.unified_user_actions.thriftscala.ActionType -import com.twitter.unified_user_actions.thriftscala.CardInfo 
-import com.twitter.unified_user_actions.thriftscala.Item - -abstract class BaseCardClientEvent(actionType: ActionType) - extends BaseClientEvent(actionType = actionType) { - - override def isItemTypeValid(itemTypeOpt: Option[ItemType]): Boolean = - ItemTypeFilterPredicates.ignoreItemType(itemTypeOpt) - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = Some( - Item.CardInfo( - CardInfo( - id = ceItem.id, - itemType = ceItem.itemType, - actionTweetAuthorInfo = ClientEventCommonUtils.getAuthorInfo(ceItem), - )) - ) -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseClientEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseClientEvent.docx new file mode 100644 index 000000000..d7e2b25a0 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseClientEvent.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseClientEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseClientEvent.scala deleted file mode 100644 index a2df60aab..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseClientEvent.scala +++ /dev/null @@ -1,68 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.ItemType -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.logbase.thriftscala.ClientEventReceiver -import com.twitter.logbase.thriftscala.LogBase -import com.twitter.unified_user_actions.thriftscala._ - -abstract class BaseClientEvent(actionType: ActionType) { - def toUnifiedUserAction(logEvent: LogEvent): Seq[UnifiedUserAction] = { - val logBase: Option[LogBase] = logEvent.logBase - - for { - ed <- logEvent.eventDetails.toSeq - items <- ed.items.toSeq - ceItem <- items - eventTimestamp <- logBase.flatMap(getSourceTimestamp) - uuaItem <- getUuaItem(ceItem, logEvent) - if isItemTypeValid(ceItem.itemType) - } yield { - val userIdentifier: UserIdentifier = UserIdentifier( - userId = logBase.flatMap(_.userId), - guestIdMarketing = logBase.flatMap(_.guestIdMarketing)) - - val productSurface: Option[ProductSurface] = ProductSurfaceUtils - .getProductSurface(logEvent.eventNamespace) - - val eventMetaData: EventMetadata = ClientEventCommonUtils - .getEventMetadata( - eventTimestamp = eventTimestamp, - logEvent = logEvent, - ceItem = ceItem, - productSurface = productSurface - ) - - UnifiedUserAction( - userIdentifier = userIdentifier, - item = uuaItem, - actionType = actionType, - eventMetadata = eventMetaData, - productSurface = productSurface, - productSurfaceInfo = - ProductSurfaceUtils.getProductSurfaceInfo(productSurface, ceItem, logEvent) - ) - } - } - - def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = for (actionTweetId <- ceItem.id) - yield Item.TweetInfo( - ClientEventCommonUtils - .getBasicTweetInfo(actionTweetId, ceItem, logEvent.eventNamespace)) - - // default implementation filters items of type tweet - // override in the subclass implementation to filter items of other types - def isItemTypeValid(itemTypeOpt: Option[ItemType]): Boolean = - ItemTypeFilterPredicates.isItemTypeTweet(itemTypeOpt) - - def 
getSourceTimestamp(logBase: LogBase): Option[Long] = - logBase.clientEventReceiver match { - case Some(ClientEventReceiver.CesHttp) | Some(ClientEventReceiver.CesThrift) => - logBase.driftAdjustedEventCreatedAtMs - case _ => Some(logBase.driftAdjustedEventCreatedAtMs.getOrElse(logBase.timestamp)) - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseFeedbackSubmitClientEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseFeedbackSubmitClientEvent.docx new file mode 100644 index 000000000..0245f47a2 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseFeedbackSubmitClientEvent.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseFeedbackSubmitClientEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseFeedbackSubmitClientEvent.scala deleted file mode 100644 index 83388bd0d..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseFeedbackSubmitClientEvent.scala +++ /dev/null @@ -1,46 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.ItemType -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.unified_user_actions.thriftscala._ - -abstract class BaseFeedbackSubmitClientEvent(actionType: ActionType) - extends BaseClientEvent(actionType = actionType) { - - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = { - logEvent.eventNamespace.flatMap(_.page).flatMap { - case "search" => - val searchInfoUtil = new SearchInfoUtils(ceItem) - searchInfoUtil.getQueryOptFromItem(logEvent).flatMap { query => - val isRelevant: Boolean = logEvent.eventNamespace - .flatMap(_.element) - .contains("is_relevant") - logEvent.eventNamespace.flatMap(_.component).flatMap { - case "relevance_prompt_module" => - for (actionTweetId <- ceItem.id) - yield Item.FeedbackPromptInfo( - FeedbackPromptInfo( - feedbackPromptActionInfo = FeedbackPromptActionInfo.TweetRelevantToSearch( - TweetRelevantToSearch( - searchQuery = query, - tweetId = actionTweetId, - isRelevant = Some(isRelevant))))) - case "did_you_find_it_module" => - Some( - Item.FeedbackPromptInfo(FeedbackPromptInfo(feedbackPromptActionInfo = - FeedbackPromptActionInfo.DidYouFindItSearch( - DidYouFindItSearch(searchQuery = query, isRelevant = Some(isRelevant)))))) - } - } - case _ => None - } - - } - - override def isItemTypeValid(itemTypeOpt: Option[ItemType]): Boolean = - ItemTypeFilterPredicates.isItemTypeForSearchResultsPageFeedbackSubmit(itemTypeOpt) -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseNotificationTabClientEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseNotificationTabClientEvent.docx new file mode 100644 index 000000000..9b3811cec Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseNotificationTabClientEvent.docx differ diff --git 
a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseNotificationTabClientEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseNotificationTabClientEvent.scala deleted file mode 100644 index 37737f017..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseNotificationTabClientEvent.scala +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.ItemType -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.unified_user_actions.thriftscala._ - -abstract class BaseNotificationTabClientEvent(actionType: ActionType) - extends BaseClientEvent(actionType = actionType) { - - // itemType is `None` for Notification Tab events - override def isItemTypeValid(itemTypeOpt: Option[ItemType]): Boolean = - ItemTypeFilterPredicates.ignoreItemType(itemTypeOpt) - - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = for { - notificationTabDetails <- ceItem.notificationTabDetails - clientEventMetadata <- notificationTabDetails.clientEventMetadata - notificationId <- NotificationClientEventUtils.getNotificationIdForNotificationTab(ceItem) - } yield { - clientEventMetadata.tweetIds match { - // if `tweetIds` contain more than one Tweet id, create `MultiTweetNotification` - case Some(tweetIds) if tweetIds.size > 1 => - Item.NotificationInfo( - NotificationInfo( - actionNotificationId = notificationId, - content = NotificationContent.MultiTweetNotification( - MultiTweetNotification(tweetIds = tweetIds)) - )) - // if `tweetIds` contain exactly one Tweet id, create `TweetNotification` - case Some(tweetIds) if tweetIds.size == 1 => - Item.NotificationInfo( - NotificationInfo( - actionNotificationId = notificationId, - content = - NotificationContent.TweetNotification(TweetNotification(tweetId = tweetIds.head)))) - // if `tweetIds` are missing, create `UnknownNotification` - case _ => - Item.NotificationInfo( - NotificationInfo( - actionNotificationId = notificationId, - content = NotificationContent.UnknownNotification(UnknownNotification()) - )) - } - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseProfileClientEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseProfileClientEvent.docx new file mode 100644 index 000000000..091927cb5 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseProfileClientEvent.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseProfileClientEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseProfileClientEvent.scala deleted file mode 100644 index 35e122dcd..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseProfileClientEvent.scala +++ /dev/null @@ -1,25 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.ItemType -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import 
com.twitter.unified_user_actions.adapter.client_event.ClientEventCommonUtils.getProfileIdFromUserItem -import com.twitter.unified_user_actions.thriftscala.ActionType -import com.twitter.unified_user_actions.thriftscala.Item -import com.twitter.unified_user_actions.thriftscala.ProfileInfo - -abstract class BaseProfileClientEvent(actionType: ActionType) - extends BaseClientEvent(actionType = actionType) { - override def isItemTypeValid(itemTypeOpt: Option[ItemType]): Boolean = - ItemTypeFilterPredicates.isItemTypeProfile(itemTypeOpt) - - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = - getProfileIdFromUserItem(ceItem).map { id => - Item.ProfileInfo( - ProfileInfo(actionProfileId = id) - ) - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BasePushNotificationClientEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BasePushNotificationClientEvent.docx new file mode 100644 index 000000000..3811928f4 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BasePushNotificationClientEvent.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BasePushNotificationClientEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BasePushNotificationClientEvent.scala deleted file mode 100644 index be3af9dde..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BasePushNotificationClientEvent.scala +++ /dev/null @@ -1,22 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.unified_user_actions.thriftscala._ - -abstract class BasePushNotificationClientEvent(actionType: ActionType) - extends BaseClientEvent(actionType = actionType) { - - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = for { - itemId <- ceItem.id - notificationId <- NotificationClientEventUtils.getNotificationIdForPushNotification(logEvent) - } yield { - Item.NotificationInfo( - NotificationInfo( - actionNotificationId = notificationId, - content = NotificationContent.TweetNotification(TweetNotification(tweetId = itemId)))) - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseSearchTypeaheadEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseSearchTypeaheadEvent.docx new file mode 100644 index 000000000..28a76b663 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseSearchTypeaheadEvent.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseSearchTypeaheadEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseSearchTypeaheadEvent.scala deleted file mode 100644 index b00745d7f..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseSearchTypeaheadEvent.scala +++ /dev/null @@ -1,87 +0,0 @@ -package 
com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.ItemType -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.logbase.thriftscala.LogBase -import com.twitter.unified_user_actions.adapter.client_event.ClientEventCommonUtils.getProfileIdFromUserItem -import com.twitter.unified_user_actions.thriftscala.ActionType -import com.twitter.unified_user_actions.thriftscala.EventMetadata -import com.twitter.unified_user_actions.thriftscala.Item -import com.twitter.unified_user_actions.thriftscala.ProductSurface -import com.twitter.unified_user_actions.thriftscala.TopicQueryResult -import com.twitter.unified_user_actions.thriftscala.TypeaheadActionInfo -import com.twitter.unified_user_actions.thriftscala.TypeaheadInfo -import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction -import com.twitter.unified_user_actions.thriftscala.UserIdentifier -import com.twitter.unified_user_actions.thriftscala.UserResult - -abstract class BaseSearchTypeaheadEvent(actionType: ActionType) - extends BaseClientEvent(actionType = actionType) { - - override def toUnifiedUserAction(logEvent: LogEvent): Seq[UnifiedUserAction] = { - val logBase: Option[LogBase] = logEvent.logBase - - for { - ed <- logEvent.eventDetails.toSeq - targets <- ed.targets.toSeq - ceTarget <- targets - eventTimestamp <- logBase.flatMap(getSourceTimestamp) - uuaItem <- getUuaItem(ceTarget, logEvent) - if isItemTypeValid(ceTarget.itemType) - } yield { - val userIdentifier: UserIdentifier = UserIdentifier( - userId = logBase.flatMap(_.userId), - guestIdMarketing = logBase.flatMap(_.guestIdMarketing)) - - val productSurface: Option[ProductSurface] = ProductSurfaceUtils - .getProductSurface(logEvent.eventNamespace) - - val eventMetaData: EventMetadata = ClientEventCommonUtils - .getEventMetadata( - eventTimestamp = eventTimestamp, - logEvent = logEvent, - ceItem = ceTarget, - productSurface = productSurface - ) - - UnifiedUserAction( - userIdentifier = userIdentifier, - item = uuaItem, - actionType = actionType, - eventMetadata = eventMetaData, - productSurface = productSurface, - productSurfaceInfo = - ProductSurfaceUtils.getProductSurfaceInfo(productSurface, ceTarget, logEvent) - ) - } - } - override def isItemTypeValid(itemTypeOpt: Option[ItemType]): Boolean = - ItemTypeFilterPredicates.isItemTypeTypeaheadResult(itemTypeOpt) - - override def getUuaItem( - ceTarget: LogEventItem, - logEvent: LogEvent - ): Option[Item] = - logEvent.searchDetails.flatMap(_.query).flatMap { query => - ceTarget.itemType match { - case Some(ItemType.User) => - getProfileIdFromUserItem(ceTarget).map { profileId => - Item.TypeaheadInfo( - TypeaheadInfo( - actionQuery = query, - typeaheadActionInfo = - TypeaheadActionInfo.UserResult(UserResult(profileId = profileId)))) - } - case Some(ItemType.Search) => - ceTarget.name.map { name => - Item.TypeaheadInfo( - TypeaheadInfo( - actionQuery = query, - typeaheadActionInfo = TypeaheadActionInfo.TopicQueryResult( - TopicQueryResult(suggestedTopicQuery = name)))) - } - case _ => None - } - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseTopicClientEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseTopicClientEvent.docx new file mode 100644 index 000000000..877579d8b Binary files /dev/null and 
b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseTopicClientEvent.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseTopicClientEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseTopicClientEvent.scala deleted file mode 100644 index b74a56ace..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseTopicClientEvent.scala +++ /dev/null @@ -1,23 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.ItemType -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.unified_user_actions.thriftscala.ActionType -import com.twitter.unified_user_actions.thriftscala.Item -import com.twitter.unified_user_actions.thriftscala.TopicInfo - -abstract class BaseTopicClientEvent(actionType: ActionType) - extends BaseClientEvent(actionType = actionType) { - override def isItemTypeValid(itemTypeOpt: Option[ItemType]): Boolean = - ItemTypeFilterPredicates.isItemTypeTopic(itemTypeOpt) - - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = - for (actionTopicId <- ClientEventCommonUtils.getTopicId( - ceItem = ceItem, - ceNamespaceOpt = logEvent.eventNamespace)) - yield Item.TopicInfo(TopicInfo(actionTopicId = actionTopicId)) -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseUASClientEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseUASClientEvent.docx new file mode 100644 index 000000000..4e4c400da Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseUASClientEvent.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseUASClientEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseUASClientEvent.scala deleted file mode 100644 index de16de786..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseUASClientEvent.scala +++ /dev/null @@ -1,62 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.logbase.thriftscala.LogBase -import com.twitter.unified_user_actions.thriftscala.ActionType -import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction -import com.twitter.unified_user_actions.thriftscala._ - -abstract class BaseUASClientEvent(actionType: ActionType) - extends BaseClientEvent(actionType = actionType) { - - override def toUnifiedUserAction(logEvent: LogEvent): Seq[UnifiedUserAction] = { - val logBase: Option[LogBase] = logEvent.logBase - val ceItem = LogEventItem.unsafeEmpty - - val uuaOpt: Option[UnifiedUserAction] = for { - eventTimestamp <- logBase.flatMap(getSourceTimestamp) - uuaItem <- getUuaItem(ceItem, logEvent) - } yield { - val userIdentifier: UserIdentifier = UserIdentifier( - userId = logBase.flatMap(_.userId), - guestIdMarketing = logBase.flatMap(_.guestIdMarketing)) - - val productSurface: 
Option[ProductSurface] = ProductSurfaceUtils - .getProductSurface(logEvent.eventNamespace) - - val eventMetaData: EventMetadata = ClientEventCommonUtils - .getEventMetadata( - eventTimestamp = eventTimestamp, - logEvent = logEvent, - ceItem = ceItem, - productSurface = productSurface - ) - - UnifiedUserAction( - userIdentifier = userIdentifier, - item = uuaItem, - actionType = actionType, - eventMetadata = eventMetaData, - productSurface = productSurface, - productSurfaceInfo = - ProductSurfaceUtils.getProductSurfaceInfo(productSurface, ceItem, logEvent) - ) - } - - uuaOpt match { - case Some(uua) => Seq(uua) - case _ => Nil - } - } - - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = for { - performanceDetails <- logEvent.performanceDetails - duration <- performanceDetails.durationMs - } yield { - Item.UasInfo(UASInfo(timeSpentMs = duration)) - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseVideoClientEvent.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseVideoClientEvent.docx new file mode 100644 index 000000000..8029e8612 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseVideoClientEvent.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseVideoClientEvent.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseVideoClientEvent.scala deleted file mode 100644 index 7d6cdbb2e..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/BaseVideoClientEvent.scala +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.unified_user_actions.thriftscala._ - -abstract class BaseVideoClientEvent(actionType: ActionType) - extends BaseClientEvent(actionType = actionType) { - - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = for { - actionTweetId <- ceItem.id - clientMediaEvent <- ceItem.clientMediaEvent - sessionState <- clientMediaEvent.sessionState - mediaIdentifier <- sessionState.contentVideoIdentifier - mediaId <- VideoClientEventUtils.videoIdFromMediaIdentifier(mediaIdentifier) - mediaDetails <- ceItem.mediaDetailsV2 - mediaItems <- mediaDetails.mediaItems - videoMetadata <- VideoClientEventUtils.getVideoMetadata( - mediaId, - mediaItems, - ceItem.cardDetails.flatMap(_.amplifyDetails)) - } yield { - Item.TweetInfo( - ClientEventCommonUtils - .getBasicTweetInfo( - actionTweetId = actionTweetId, - ceItem = ceItem, - ceNamespaceOpt = logEvent.eventNamespace) - .copy(tweetActionInfo = Some(videoMetadata))) - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventAdapter.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventAdapter.docx new file mode 100644 index 000000000..9e1c34cc6 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventAdapter.docx differ diff --git 
a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventAdapter.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventAdapter.scala deleted file mode 100644 index 3bfde0c36..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventAdapter.scala +++ /dev/null @@ -1,272 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.finagle.stats.NullStatsReceiver -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.clientapp.thriftscala.EventNamespace -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.finatra.kafka.serde.UnKeyed -import com.twitter.unified_user_actions.adapter.AbstractAdapter -import com.twitter.unified_user_actions.adapter.client_event.ClientEventImpression._ -import com.twitter.unified_user_actions.adapter.client_event.ClientEventEngagement._ -import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction -import scala.util.matching.Regex - -class ClientEventAdapter extends AbstractAdapter[LogEvent, UnKeyed, UnifiedUserAction] { - import ClientEventAdapter._ - - override def adaptOneToKeyedMany( - input: LogEvent, - statsReceiver: StatsReceiver = NullStatsReceiver - ): Seq[(UnKeyed, UnifiedUserAction)] = - adaptEvent(input).map { e => (UnKeyed, e) } -} - -object ClientEventAdapter { - // Refer to go/cme-scribing and go/interaction-event-spec for details - def isVideoEvent(element: String): Boolean = Seq[String]( - "gif_player", - "periscope_player", - "platform_amplify_card", - "video_player", - "vine_player").contains(element) - - /** - * Tweet clicks on the Notification Tab on iOS are a special case because the `element` is different - * from Tweet clicks everywhere else on the platform. - * - * For Notification Tab on iOS, `element` could be one of `user_mentioned_you`, - * `user_mentioned_you_in_a_quote_tweet`, `user_replied_to_your_tweet`, or `user_quoted_your_tweet`. - * - * In other places, `element` = `tweet`. 
- */ - def isTweetClickEvent(element: String): Boolean = - Seq[String]( - "tweet", - "user_mentioned_you", - "user_mentioned_you_in_a_quote_tweet", - "user_replied_to_your_tweet", - "user_quoted_your_tweet" - ).contains(element) - - final val validUASIosClientIds = Seq[Long]( - 129032L, // Twitter for iPhone - 191841L // Twitter for iPad - ) - // Twitter for Android - final val validUASAndroidClientIds = Seq[Long](258901L) - - def adaptEvent(inputLogEvent: LogEvent): Seq[UnifiedUserAction] = - Option(inputLogEvent).toSeq - .filterNot { logEvent: LogEvent => - shouldIgnoreClientEvent(logEvent.eventNamespace) - } - .flatMap { logEvent: LogEvent => - val actionTypesPerEvent: Seq[BaseClientEvent] = logEvent.eventNamespace.toSeq.flatMap { - name => - (name.page, name.section, name.component, name.element, name.action) match { - case (_, _, _, _, Some("favorite")) => Seq(TweetFav) - case (_, _, _, _, Some("unfavorite")) => Seq(TweetUnfav) - case (_, _, Some("stream"), Some("linger"), Some("results")) => - Seq(TweetLingerImpression) - case (_, _, Some("stream"), None, Some("results")) => - Seq(TweetRenderImpression) - case (_, _, _, _, Some("send_reply")) => Seq(TweetReply) - // Different clients may have different actions of the same "send quote" - // but it turns out that both send_quote and retweet_with_comment should correspond to - // "send quote" - case (_, _, _, _, Some("send_quote_tweet")) | - (_, _, _, _, Some("retweet_with_comment")) => - Seq(TweetQuote) - case (_, _, _, _, Some("retweet")) => Seq(TweetRetweet) - case (_, _, _, _, Some("unretweet")) => Seq(TweetUnretweet) - case (_, _, _, _, Some("reply")) => Seq(TweetClickReply) - case (_, _, _, _, Some("quote")) => Seq(TweetClickQuote) - case (_, _, _, Some(element), Some("playback_start")) if isVideoEvent(element) => - Seq(TweetVideoPlaybackStart) - case (_, _, _, Some(element), Some("playback_complete")) if isVideoEvent(element) => - Seq(TweetVideoPlaybackComplete) - case (_, _, _, Some(element), Some("playback_25")) if isVideoEvent(element) => - Seq(TweetVideoPlayback25) - case (_, _, _, Some(element), Some("playback_50")) if isVideoEvent(element) => - Seq(TweetVideoPlayback50) - case (_, _, _, Some(element), Some("playback_75")) if isVideoEvent(element) => - Seq(TweetVideoPlayback75) - case (_, _, _, Some(element), Some("playback_95")) if isVideoEvent(element) => - Seq(TweetVideoPlayback95) - case (_, _, _, Some(element), Some("play_from_tap")) if isVideoEvent(element) => - Seq(TweetVideoPlayFromTap) - case (_, _, _, Some(element), Some("video_quality_view")) if isVideoEvent(element) => - Seq(TweetVideoQualityView) - case (_, _, _, Some(element), Some("video_view")) if isVideoEvent(element) => - Seq(TweetVideoView) - case (_, _, _, Some(element), Some("video_mrc_view")) if isVideoEvent(element) => - Seq(TweetVideoMrcView) - case (_, _, _, Some(element), Some("view_threshold")) if isVideoEvent(element) => - Seq(TweetVideoViewThreshold) - case (_, _, _, Some(element), Some("cta_url_click")) if isVideoEvent(element) => - Seq(TweetVideoCtaUrlClick) - case (_, _, _, Some(element), Some("cta_watch_click")) if isVideoEvent(element) => - Seq(TweetVideoCtaWatchClick) - case (_, _, _, Some("platform_photo_card"), Some("click")) => Seq(TweetPhotoExpand) - case (_, _, _, Some("platform_card"), Some("click")) => Seq(CardClick) - case (_, _, _, _, Some("open_app")) => Seq(CardOpenApp) - case (_, _, _, _, Some("install_app")) => Seq(CardAppInstallAttempt) - case (_, _, _, Some("platform_card"), Some("vote")) | - (_, _, _, 
Some("platform_forward_card"), Some("vote")) => - Seq(PollCardVote) - case (_, _, _, Some("mention"), Some("click")) | - (_, _, _, _, Some("mention_click")) => - Seq(TweetClickMentionScreenName) - case (_, _, _, Some(element), Some("click")) if isTweetClickEvent(element) => - Seq(TweetClick) - case // Follow from the Topic page (or so-called landing page) - (_, _, _, Some("topic"), Some("follow")) | - // Actually not sure how this is generated ... but saw quite some events in BQ - (_, _, _, Some("social_proof"), Some("follow")) | - // Click on Tweet's caret menu of "Follow (the topic)", it needs to be: - // 1) user follows the Topic already, 2) and clicked on the "Unfollow Topic" first. - (_, _, _, Some("feedback_follow_topic"), Some("click")) => - Seq(TopicFollow) - case (_, _, _, Some("topic"), Some("unfollow")) | - (_, _, _, Some("social_proof"), Some("unfollow")) | - (_, _, _, Some("feedback_unfollow_topic"), Some("click")) => - Seq(TopicUnfollow) - case (_, _, _, Some("topic"), Some("not_interested")) | - (_, _, _, Some("feedback_not_interested_in_topic"), Some("click")) => - Seq(TopicNotInterestedIn) - case (_, _, _, Some("topic"), Some("un_not_interested")) | - (_, _, _, Some("feedback_not_interested_in_topic"), Some("undo")) => - Seq(TopicUndoNotInterestedIn) - case (_, _, _, Some("feedback_givefeedback"), Some("click")) => - Seq(TweetNotHelpful) - case (_, _, _, Some("feedback_givefeedback"), Some("undo")) => - Seq(TweetUndoNotHelpful) - case (_, _, _, Some("report_tweet"), Some("click")) | - (_, _, _, Some("report_tweet"), Some("done")) => - Seq(TweetReport) - case (_, _, _, Some("feedback_dontlike"), Some("click")) => - Seq(TweetNotInterestedIn) - case (_, _, _, Some("feedback_dontlike"), Some("undo")) => - Seq(TweetUndoNotInterestedIn) - case (_, _, _, Some("feedback_notabouttopic"), Some("click")) => - Seq(TweetNotAboutTopic) - case (_, _, _, Some("feedback_notabouttopic"), Some("undo")) => - Seq(TweetUndoNotAboutTopic) - case (_, _, _, Some("feedback_notrecent"), Some("click")) => - Seq(TweetNotRecent) - case (_, _, _, Some("feedback_notrecent"), Some("undo")) => - Seq(TweetUndoNotRecent) - case (_, _, _, Some("feedback_seefewer"), Some("click")) => - Seq(TweetSeeFewer) - case (_, _, _, Some("feedback_seefewer"), Some("undo")) => - Seq(TweetUndoSeeFewer) - // Only when action = "submit" we get all fields in ReportDetails, such as reportType - // See https://confluence.twitter.biz/pages/viewpage.action?spaceKey=HEALTH&title=Understanding+ReportDetails - case (Some(page), _, _, Some("ticket"), Some("submit")) - if page.startsWith("report_") => - Seq(TweetReportServer) - case (Some("profile"), _, _, _, Some("block")) => - Seq(ProfileBlock) - case (Some("profile"), _, _, _, Some("unblock")) => - Seq(ProfileUnblock) - case (Some("profile"), _, _, _, Some("mute_user")) => - Seq(ProfileMute) - case (Some("profile"), _, _, _, Some("report")) => - Seq(ProfileReport) - case (Some("profile"), _, _, _, Some("show")) => - Seq(ProfileShow) - case (_, _, _, Some("follow"), Some("click")) => Seq(TweetFollowAuthor) - case (_, _, _, _, Some("follow")) => Seq(TweetFollowAuthor, ProfileFollow) - case (_, _, _, Some("unfollow"), Some("click")) => Seq(TweetUnfollowAuthor) - case (_, _, _, _, Some("unfollow")) => Seq(TweetUnfollowAuthor) - case (_, _, _, Some("block"), Some("click")) => Seq(TweetBlockAuthor) - case (_, _, _, Some("unblock"), Some("click")) => Seq(TweetUnblockAuthor) - case (_, _, _, Some("mute"), Some("click")) => Seq(TweetMuteAuthor) - case (_, _, _, Some(element), Some("click")) 
if isTweetClickEvent(element) => - Seq(TweetClick) - case (_, _, _, _, Some("profile_click")) => Seq(TweetClickProfile, ProfileClick) - case (_, _, _, _, Some("share_menu_click")) => Seq(TweetClickShare) - case (_, _, _, _, Some("copy_link")) => Seq(TweetShareViaCopyLink) - case (_, _, _, _, Some("share_via_dm")) => Seq(TweetClickSendViaDirectMessage) - case (_, _, _, _, Some("bookmark")) => Seq(TweetShareViaBookmark, TweetBookmark) - case (_, _, _, _, Some("unbookmark")) => Seq(TweetUnbookmark) - case (_, _, _, _, Some("hashtag_click")) | - // This scribe is triggered on mobile platforms (android/iphone) when user click on hashtag in a tweet. - (_, _, _, Some("hashtag"), Some("search")) => - Seq(TweetClickHashtag) - case (_, _, _, _, Some("open_link")) => Seq(TweetOpenLink) - case (_, _, _, _, Some("take_screenshot")) => Seq(TweetTakeScreenshot) - case (_, _, _, Some("feedback_notrelevant"), Some("click")) => - Seq(TweetNotRelevant) - case (_, _, _, Some("feedback_notrelevant"), Some("undo")) => - Seq(TweetUndoNotRelevant) - case (_, _, _, _, Some("follow_attempt")) => Seq(ProfileFollowAttempt) - case (_, _, _, _, Some("favorite_attempt")) => Seq(TweetFavoriteAttempt) - case (_, _, _, _, Some("retweet_attempt")) => Seq(TweetRetweetAttempt) - case (_, _, _, _, Some("reply_attempt")) => Seq(TweetReplyAttempt) - case (_, _, _, _, Some("login")) => Seq(CTALoginClick) - case (Some("login"), _, _, _, Some("show")) => Seq(CTALoginStart) - case (Some("login"), _, _, _, Some("success")) => Seq(CTALoginSuccess) - case (_, _, _, _, Some("signup")) => Seq(CTASignupClick) - case (Some("signup"), _, _, _, Some("success")) => Seq(CTASignupSuccess) - case // Android app running in the background - (Some("notification"), Some("status_bar"), None, _, Some("background_open")) | - // Android app running in the foreground - (Some("notification"), Some("status_bar"), None, _, Some("open")) | - // iOS app running in the background - (Some("notification"), Some("notification_center"), None, _, Some("open")) | - // iOS app running in the foreground - (None, Some("toasts"), Some("social"), Some("favorite"), Some("open")) | - // m5 - (Some("app"), Some("push"), _, _, Some("open")) => - Seq(NotificationOpen) - case (Some("ntab"), Some("all"), Some("urt"), _, Some("navigate")) => - Seq(NotificationClick) - case (Some("ntab"), Some("all"), Some("urt"), _, Some("see_less_often")) => - Seq(NotificationSeeLessOften) - case (Some("notification"), Some("status_bar"), None, _, Some("background_dismiss")) | - (Some("notification"), Some("status_bar"), None, _, Some("dismiss")) | ( - Some("notification"), - Some("notification_center"), - None, - _, - Some("dismiss") - ) => - Seq(NotificationDismiss) - case (_, _, _, Some("typeahead"), Some("click")) => Seq(TypeaheadClick) - case (Some("search"), _, Some(component), _, Some("click")) - if component == "relevance_prompt_module" || component == "did_you_find_it_module" => - Seq(FeedbackPromptSubmit) - case (Some("app"), Some("enter_background"), _, _, Some("become_inactive")) - if logEvent.logBase - .flatMap(_.clientAppId) - .exists(validUASIosClientIds.contains(_)) => - Seq(AppExit) - case (Some("app"), _, _, _, Some("become_inactive")) - if logEvent.logBase - .flatMap(_.clientAppId) - .exists(validUASAndroidClientIds.contains(_)) => - Seq(AppExit) - case (_, _, Some("gallery"), Some("photo"), Some("impression")) => - Seq(TweetGalleryImpression) - case (_, _, _, _, _) - if TweetDetailsImpression.isTweetDetailsImpression(logEvent.eventNamespace) => - Seq(TweetDetailsImpression) 
- case _ => Nil - } - } - actionTypesPerEvent.map(_.toUnifiedUserAction(logEvent)) - }.flatten - - def shouldIgnoreClientEvent(eventNamespace: Option[EventNamespace]): Boolean = - eventNamespace.exists { name => - (name.page, name.section, name.component, name.element, name.action) match { - case (Some("ddg"), _, _, _, Some("experiment")) => true - case (Some("qig_ranker"), _, _, _, _) => true - case (Some("timelinemixer"), _, _, _, _) => true - case (Some("timelineservice"), _, _, _, _) => true - case (Some("tweetconvosvc"), _, _, _, _) => true - case _ => false - } - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventCommonUtils.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventCommonUtils.docx new file mode 100644 index 000000000..dfc6a29cb Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventCommonUtils.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventCommonUtils.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventCommonUtils.scala deleted file mode 100644 index f81060ad9..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventCommonUtils.scala +++ /dev/null @@ -1,169 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.EventNamespace -import com.twitter.clientapp.thriftscala.Item -import com.twitter.clientapp.thriftscala.ItemType.User -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.unified_user_actions.adapter.common.AdapterUtils -import com.twitter.unified_user_actions.thriftscala.AuthorInfo -import com.twitter.unified_user_actions.thriftscala.ClientEventNamespace -import com.twitter.unified_user_actions.thriftscala.EventMetadata -import com.twitter.unified_user_actions.thriftscala.ProductSurface -import com.twitter.unified_user_actions.thriftscala.SourceLineage -import com.twitter.unified_user_actions.thriftscala.TweetAuthorFollowClickSource -import com.twitter.unified_user_actions.thriftscala.TweetAuthorUnfollowClickSource -import com.twitter.unified_user_actions.thriftscala.TweetInfo - -/** - * Comprises helper methods that: - * 1. need not be overridden by subclasses of `BaseClientEvent` - * 2. need not be invoked by instances of subclasses of `BaseClientEvent` - * 3. 
need to be accessible to subclasses of `BaseClientEvent` and other utils - */ -object ClientEventCommonUtils { - - def getBasicTweetInfo( - actionTweetId: Long, - ceItem: LogEventItem, - ceNamespaceOpt: Option[EventNamespace] - ): TweetInfo = TweetInfo( - actionTweetId = actionTweetId, - actionTweetTopicSocialProofId = getTopicId(ceItem, ceNamespaceOpt), - retweetingTweetId = ceItem.tweetDetails.flatMap(_.retweetingTweetId), - quotedTweetId = ceItem.tweetDetails.flatMap(_.quotedTweetId), - inReplyToTweetId = ceItem.tweetDetails.flatMap(_.inReplyToTweetId), - quotingTweetId = ceItem.tweetDetails.flatMap(_.quotingTweetId), - // only set AuthorInfo when authorId is present - actionTweetAuthorInfo = getAuthorInfo(ceItem), - retweetingAuthorId = ceItem.tweetDetails.flatMap(_.retweetAuthorId), - quotedAuthorId = ceItem.tweetDetails.flatMap(_.quotedAuthorId), - inReplyToAuthorId = ceItem.tweetDetails.flatMap(_.inReplyToAuthorId), - tweetPosition = ceItem.position, - promotedId = ceItem.promotedId - ) - - def getTopicId( - ceItem: LogEventItem, - ceNamespaceOpt: Option[EventNamespace] = None, - ): Option[Long] = - ceNamespaceOpt.flatMap { - TopicIdUtils.getTopicId(item = ceItem, _) - } - - def getAuthorInfo( - ceItem: LogEventItem, - ): Option[AuthorInfo] = - ceItem.tweetDetails.flatMap(_.authorId).map { authorId => - AuthorInfo( - authorId = Some(authorId), - isFollowedByActingUser = ceItem.isViewerFollowsTweetAuthor, - isFollowingActingUser = ceItem.isTweetAuthorFollowsViewer, - ) - } - - def getEventMetadata( - eventTimestamp: Long, - logEvent: LogEvent, - ceItem: LogEventItem, - productSurface: Option[ProductSurface] = None - ): EventMetadata = EventMetadata( - sourceTimestampMs = eventTimestamp, - receivedTimestampMs = AdapterUtils.currentTimestampMs, - sourceLineage = SourceLineage.ClientEvents, - // Client UI language or from Gizmoduck, which is what the user set in the Twitter App. - // Please see more at https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/finatra-internal/international/src/main/scala/com/twitter/finatra/international/LanguageIdentifier.scala - // The format should be ISO 639-1. - language = logEvent.logBase.flatMap(_.language).map(AdapterUtils.normalizeLanguageCode), - // Country code could be IP address (geoduck) or User registration country (gizmoduck) and the former takes precedence. - // We don’t know exactly which one is applied, unfortunately, - // see https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/finatra-internal/international/src/main/scala/com/twitter/finatra/international/CountryIdentifier.scala - // The format should be ISO_3166-1_alpha-2. - countryCode = logEvent.logBase.flatMap(_.country).map(AdapterUtils.normalizeCountryCode), - clientAppId = logEvent.logBase.flatMap(_.clientAppId), - clientVersion = logEvent.clientVersion, - clientEventNamespace = logEvent.eventNamespace.map(en => toClientEventNamespace(en)), - traceId = getTraceId(productSurface, ceItem), - requestJoinId = getRequestJoinId(productSurface, ceItem), - clientEventTriggeredOn = logEvent.eventDetails.flatMap(_.triggeredOn) - ) - - def toClientEventNamespace(eventNamespace: EventNamespace): ClientEventNamespace = - ClientEventNamespace( - page = eventNamespace.page, - section = eventNamespace.section, - component = eventNamespace.component, - element = eventNamespace.element, - action = eventNamespace.action - ) - - /** - * Get the profileId from Item.id, whose itemType = 'USER'. - * - * The profileId can also be found in the event_details.profile_id. 
- * However, the item.id is more reliable than event_details.profile_id, - * in particular, 45% of the client events with USER items have - * Null for event_details.profile_id while 0.13% of item.id is Null. - * As such, we only use item.id to populate the profile_id. - */ - def getProfileIdFromUserItem(item: Item): Option[Long] = - if (item.itemType.contains(User)) - item.id - else None - - /** - * TraceId is going to be deprecated and replaced by requestJoinId. - * - * Get the traceId from LogEventItem based on productSurface. - * - * The traceId is hydrated in controller data from backend. Different product surfaces - * populate different controller data. Thus, the product surface is checked first to decide - * which controller data should be read to get the traceId. - */ - def getTraceId(productSurface: Option[ProductSurface], ceItem: LogEventItem): Option[Long] = - productSurface match { - case Some(ProductSurface.HomeTimeline) => HomeInfoUtils.getTraceId(ceItem) - case Some(ProductSurface.SearchResultsPage) => { new SearchInfoUtils(ceItem) }.getTraceId - case _ => None - } - - /** - * Get the requestJoinId from LogEventItem based on productSurface. - * - * The requestJoinId is hydrated in controller data from backend. Different product surfaces - * populate different controller data. Thus, the product surface is checked first to decide - * which controller data should be read to get the requestJoinId. - * - * Supports Home / Home_latest / SearchResults for now; other surfaces can be added based on requirements. - */ - def getRequestJoinId(productSurface: Option[ProductSurface], ceItem: LogEventItem): Option[Long] = - productSurface match { - case Some(ProductSurface.HomeTimeline) => HomeInfoUtils.getRequestJoinId(ceItem) - case Some(ProductSurface.SearchResultsPage) => { - new SearchInfoUtils(ceItem) - }.getRequestJoinId - case _ => None - } - - def getTweetAuthorFollowSource( - eventNamespace: Option[EventNamespace] - ): TweetAuthorFollowClickSource = { - eventNamespace - .map(ns => (ns.element, ns.action)).map { - case (Some("follow"), Some("click")) => TweetAuthorFollowClickSource.CaretMenu - case (_, Some("follow")) => TweetAuthorFollowClickSource.ProfileImage - case _ => TweetAuthorFollowClickSource.Unknown - }.getOrElse(TweetAuthorFollowClickSource.Unknown) - } - - def getTweetAuthorUnfollowSource( - eventNamespace: Option[EventNamespace] - ): TweetAuthorUnfollowClickSource = { - eventNamespace - .map(ns => (ns.element, ns.action)).map { - case (Some("unfollow"), Some("click")) => TweetAuthorUnfollowClickSource.CaretMenu - case (_, Some("unfollow")) => TweetAuthorUnfollowClickSource.ProfileImage - case _ => TweetAuthorUnfollowClickSource.Unknown - }.getOrElse(TweetAuthorUnfollowClickSource.Unknown) - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventEngagement.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventEngagement.docx new file mode 100644 index 000000000..55df5fe12 Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventEngagement.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventEngagement.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventEngagement.scala deleted file mode 100644 index 0a2e59e0e..000000000 
--- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventEngagement.scala +++ /dev/null @@ -1,687 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.ItemType -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.unified_user_actions.thriftscala._ - -object ClientEventEngagement { - object TweetFav extends BaseClientEvent(ActionType.ClientTweetFav) - - /** - * This is fired when a user unlikes a liked (favorited) tweet - */ - object TweetUnfav extends BaseClientEvent(ActionType.ClientTweetUnfav) - - /** - * This is the "Send Reply" event to indicate publishing of a reply Tweet as opposed to clicking - * on the reply button to initiate a reply Tweet (captured in ClientTweetClickReply). - * The differences between this and ServerTweetReply are: - * 1) ServerTweetReply already has the new Tweet Id, 2) A sent reply may be lost during transfer - * over the wire and thus may not end up with a follow-up ServerTweetReply. - */ - object TweetReply extends BaseClientEvent(ActionType.ClientTweetReply) - - /** - * This is the "send quote" event to indicate publishing of a quote tweet as opposed to clicking - * on the quote button to initiate a quote tweet (captured in ClientTweetClickQuote). - * The differences between this and ServerTweetQuote are: - * 1) ServerTweetQuote already has the new Tweet Id, 2) A sent quote may be lost during transfer - * over the wire and thus may not end up with a follow-up ServerTweetQuote. - */ - object TweetQuote extends BaseClientEvent(ActionType.ClientTweetQuote) - - /** - * This is the "retweet" event to indicate publishing of a retweet. - */ - object TweetRetweet extends BaseClientEvent(ActionType.ClientTweetRetweet) - - /** - * "action = reply" indicates that a user expressed the intention to reply to a Tweet by clicking - * the reply button. No new tweet is created in this event. - */ - object TweetClickReply extends BaseClientEvent(ActionType.ClientTweetClickReply) - - /** - * Please note that the "action == quote" is NOT the create quote Tweet event like what - * we can get from TweetyPie. - * It is just a click on "quote tweet" (after clicking on the retweet button there are 2 options, - * one is "retweet" and the other is "quote tweet") - * - * Also checked the CE (BQ Table), the `item.tweet_details.quoting_tweet_id` is always NULL but - * `item.tweet_details.retweeting_tweet_id`, `item.tweet_details.in_reply_to_tweet_id`, `item.tweet_details.quoted_tweet_id` - * could be NON-NULL and UUA would just include these NON-NULL fields as is. This is also checked in the unit test. - */ - object TweetClickQuote extends BaseClientEvent(ActionType.ClientTweetClickQuote) - - /** - * Refer to go/cme-scribing and go/interaction-event-spec for details. - * Fired on the first tick of a track regardless of where in the video it is playing. - * For looping playback, this is only fired once and does not reset at loop boundaries. - */ - object TweetVideoPlaybackStart - extends BaseVideoClientEvent(ActionType.ClientTweetVideoPlaybackStart) - - /** - * Refer to go/cme-scribing and go/interaction-event-spec for details. - * Fired when playback reaches 100% of total track duration. - * Not valid for live videos. - * For looping playback, this is only fired once and does not reset at loop boundaries. 
- */ - object TweetVideoPlaybackComplete - extends BaseVideoClientEvent(ActionType.ClientTweetVideoPlaybackComplete) - - /** - * Refer to go/cme-scribing and go/interaction-event-spec for details. - * This is fired when playback reaches 25% of total track duration. Not valid for live videos. - * For looping playback, this is only fired once and does not reset at loop boundaries. - */ - object TweetVideoPlayback25 extends BaseVideoClientEvent(ActionType.ClientTweetVideoPlayback25) - object TweetVideoPlayback50 extends BaseVideoClientEvent(ActionType.ClientTweetVideoPlayback50) - object TweetVideoPlayback75 extends BaseVideoClientEvent(ActionType.ClientTweetVideoPlayback75) - object TweetVideoPlayback95 extends BaseVideoClientEvent(ActionType.ClientTweetVideoPlayback95) - - /** - * Refer to go/cme-scribing and go/interaction-event-spec for details. - * This is fired when the video has been played in non-preview - * (i.e. not autoplaying in the timeline) mode, and was not started via auto-advance. - * For looping playback, this is only fired once and does not reset at loop boundaries. - */ - object TweetVideoPlayFromTap extends BaseVideoClientEvent(ActionType.ClientTweetVideoPlayFromTap) - - /** - * Refer to go/cme-scribing and go/interaction-event-spec for details. - * This is fired when 50% of the video has been on-screen and playing for 10 consecutive seconds - * or 95% of the video duration, whichever comes first. - * For looping playback, this is only fired once and does not reset at loop boundaries. - */ - object TweetVideoQualityView extends BaseVideoClientEvent(ActionType.ClientTweetVideoQualityView) - - object TweetVideoView extends BaseVideoClientEvent(ActionType.ClientTweetVideoView) - object TweetVideoMrcView extends BaseVideoClientEvent(ActionType.ClientTweetVideoMrcView) - object TweetVideoViewThreshold - extends BaseVideoClientEvent(ActionType.ClientTweetVideoViewThreshold) - object TweetVideoCtaUrlClick extends BaseVideoClientEvent(ActionType.ClientTweetVideoCtaUrlClick) - object TweetVideoCtaWatchClick - extends BaseVideoClientEvent(ActionType.ClientTweetVideoCtaWatchClick) - - /** - * This is fired when a user clicks on "Undo retweet" after re-tweeting a tweet - * - */ - object TweetUnretweet extends BaseClientEvent(ActionType.ClientTweetUnretweet) - - /** - * This is fired when a user clicks on a photo attached to a tweet and the photo expands to fit - * the screen. - */ - object TweetPhotoExpand extends BaseClientEvent(ActionType.ClientTweetPhotoExpand) - - /** - * This is fired when a user clicks on a card; a card could be a photo or a video, for example - */ - object CardClick extends BaseCardClientEvent(ActionType.ClientCardClick) - object CardOpenApp extends BaseCardClientEvent(ActionType.ClientCardOpenApp) - object CardAppInstallAttempt extends BaseCardClientEvent(ActionType.ClientCardAppInstallAttempt) - object PollCardVote extends BaseCardClientEvent(ActionType.ClientPollCardVote) - - /** - * This is fired when a user clicks on a profile mention inside a tweet. 
- */ - object TweetClickMentionScreenName - extends BaseClientEvent(ActionType.ClientTweetClickMentionScreenName) { - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = - ( - ceItem.id, - logEvent.eventDetails.flatMap( - _.targets.flatMap(_.find(_.itemType.contains(ItemType.User))))) match { - case (Some(tweetId), Some(target)) => - (target.id, target.name) match { - case (Some(profileId), Some(profileHandle)) => - Some( - Item.TweetInfo( - ClientEventCommonUtils - .getBasicTweetInfo(tweetId, ceItem, logEvent.eventNamespace) - .copy(tweetActionInfo = Some( - TweetActionInfo.ClientTweetClickMentionScreenName( - ClientTweetClickMentionScreenName( - actionProfileId = profileId, - handle = profileHandle - )))))) - case _ => None - } - case _ => None - } - } - - /** - * These are fired when a user follows/unfollows a Topic. Please see the comment on the - * ClientEventAdapter namespace matching for the subtle details. - */ - object TopicFollow extends BaseTopicClientEvent(ActionType.ClientTopicFollow) - object TopicUnfollow extends BaseTopicClientEvent(ActionType.ClientTopicUnfollow) - - /** - * This is fired when the user clicks the "x" icon next to the topic on their timeline, - * and clicks "Not interested in {TOPIC}" in the pop-up prompt - * Alternatively, they can also click the "See more" button to visit the topic page, and click "Not interested" there. - */ - object TopicNotInterestedIn extends BaseTopicClientEvent(ActionType.ClientTopicNotInterestedIn) - - /** - * This is fired when the user clicks the "Undo" button after clicking "x" or "Not interested" on a Topic, - * which is captured in ClientTopicNotInterestedIn - */ - object TopicUndoNotInterestedIn - extends BaseTopicClientEvent(ActionType.ClientTopicUndoNotInterestedIn) - - /** - * This is fired when a user clicks on the "This Tweet's not helpful" flow in the caret menu - * of a Tweet result on the Search Results Page - */ - object TweetNotHelpful extends BaseClientEvent(ActionType.ClientTweetNotHelpful) - - /** - * This is fired when a user clicks Undo after clicking on - * the "This Tweet's not helpful" flow in the caret menu of a Tweet result on the Search Results Page - */ - object TweetUndoNotHelpful extends BaseClientEvent(ActionType.ClientTweetUndoNotHelpful) - - object TweetReport extends BaseClientEvent(ActionType.ClientTweetReport) { - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = { - for { - actionTweetId <- ceItem.id - } yield { - Item.TweetInfo( - ClientEventCommonUtils - .getBasicTweetInfo( - actionTweetId = actionTweetId, - ceItem = ceItem, - ceNamespaceOpt = logEvent.eventNamespace) - .copy(tweetActionInfo = Some( - TweetActionInfo.ClientTweetReport( - ClientTweetReport( - isReportTweetDone = - logEvent.eventNamespace.flatMap(_.action).exists(_.contains("done")), - reportFlowId = logEvent.reportDetails.flatMap(_.reportFlowId) - ) - )))) - } - } - } - - /** - * Not Interested In (Do Not Like) event - */ - object TweetNotInterestedIn extends BaseClientEvent(ActionType.ClientTweetNotInterestedIn) - object TweetUndoNotInterestedIn extends BaseClientEvent(ActionType.ClientTweetUndoNotInterestedIn) - - /** - * This is fired when a user FIRST clicks the "Not interested in this Tweet" button in the caret menu of a Tweet - * then clicks "This Tweet is not about {TOPIC}" in the subsequent prompt - * Note: this button is hidden unless a user clicks "Not interested in this Tweet" first. 
- */
- object TweetNotAboutTopic extends BaseClientEvent(ActionType.ClientTweetNotAboutTopic)
-
- /**
- * This is fired when a user clicks "Undo" immediately after clicking "This Tweet is not about {TOPIC}",
- * which is captured in TweetNotAboutTopic
- */
- object TweetUndoNotAboutTopic extends BaseClientEvent(ActionType.ClientTweetUndoNotAboutTopic)
-
- /**
- * This is fired when a user FIRST clicks the "Not interested in this Tweet" button in the caret menu of a Tweet
- * then clicks "This Tweet isn't recent" in the subsequent prompt
- * Note: this button is hidden unless a user clicks "Not interested in this Tweet" first.
- */
- object TweetNotRecent extends BaseClientEvent(ActionType.ClientTweetNotRecent)
-
- /**
- * This is fired when a user clicks "Undo" immediately after clicking "This Tweet isn't recent",
- * which is captured in TweetNotRecent
- */
- object TweetUndoNotRecent extends BaseClientEvent(ActionType.ClientTweetUndoNotRecent)
-
- /**
- * This is fired when a user clicks the "Not interested in this Tweet" button in the caret menu of a Tweet
- * then clicks "Show fewer tweets from" in the subsequent prompt
- * Note: this button is hidden unless a user clicks "Not interested in this Tweet" first.
- */
- object TweetSeeFewer extends BaseClientEvent(ActionType.ClientTweetSeeFewer)
-
- /**
- * This is fired when a user clicks "Undo" immediately after clicking "Show fewer tweets from",
- * which is captured in TweetSeeFewer
- */
- object TweetUndoSeeFewer extends BaseClientEvent(ActionType.ClientTweetUndoSeeFewer)
-
- /**
- * This is fired when a user clicks "Submit" at the end of a "Report Tweet" flow
- * ClientTweetReport = 1041 is scribed by the HealthClient team, on the client side
- * This is scribed by spamacaw, on the server side
- * They can be joined on reportFlowId
- * See https://confluence.twitter.biz/pages/viewpage.action?spaceKey=HEALTH&title=Understanding+ReportDetails
- */
- object TweetReportServer extends BaseClientEvent(ActionType.ServerTweetReport) {
- override def getUuaItem(
- ceItem: LogEventItem,
- logEvent: LogEvent
- ): Option[Item] =
- for {
- actionTweetId <- ceItem.id
- } yield Item.TweetInfo(
- ClientEventCommonUtils
- .getBasicTweetInfo(
- actionTweetId = actionTweetId,
- ceItem = ceItem,
- ceNamespaceOpt = logEvent.eventNamespace)
- .copy(tweetActionInfo = Some(
- TweetActionInfo.ServerTweetReport(
- ServerTweetReport(
- reportFlowId = logEvent.reportDetails.flatMap(_.reportFlowId),
- reportType = logEvent.reportDetails.flatMap(_.reportType)
- )
- ))))
- }
-
- /**
- * This is fired when a user clicks Block in a Profile page
- * A Profile can also be blocked when a user clicks Block in the menu of a Tweet, which
- * is captured in ClientTweetBlockAuthor
- */
- object ProfileBlock extends BaseProfileClientEvent(ActionType.ClientProfileBlock)
-
- /**
- * This is fired when a user clicks unblock in a pop-up prompt right after blocking a profile
- * in the profile page or clicks unblock in a drop-down menu in the profile page.
- */ - object ProfileUnblock extends BaseProfileClientEvent(ActionType.ClientProfileUnblock) - - /** - * This is fired when a user clicks Mute in a Profile page - * A Profile can also be muted when a user clicks Mute in the menu of a Tweet, which - * is captured in ClientTweetMuteAuthor - */ - object ProfileMute extends BaseProfileClientEvent(ActionType.ClientProfileMute) - - /* - * This is fired when a user clicks "Report User" action from user profile page - * */ - object ProfileReport extends BaseProfileClientEvent(ActionType.ClientProfileReport) - - // This is fired when a user profile is open in a Profile page - object ProfileShow extends BaseProfileClientEvent(ActionType.ClientProfileShow) - - object ProfileClick extends BaseProfileClientEvent(ActionType.ClientProfileClick) { - - /** - * ClientTweetClickProfile would emit 2 events, 1 with item type Tweet and 1 with item type User - * Both events will go to both actions (the actual classes). For ClientTweetClickProfile, - * item type of Tweet will filter out the event with item type User. But for ClientProfileClick, - * because we need to include item type of User, then we will also include the event of TweetClickProfile - * if we don't do anything here. This override ensures we don't include tweet author clicks events in ProfileClick - */ - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = - if (logEvent.eventDetails - .flatMap(_.items).exists(items => items.exists(_.itemType.contains(ItemType.Tweet)))) { - None - } else { - super.getUuaItem(ceItem, logEvent) - } - } - - /** - * This is fired when a user follows a profile from the - * profile page / people module and people tab on the Search Results Page / sidebar on the Home page - * A Profile can also be followed when a user clicks follow in the - * caret menu of a Tweet / follow button on hovering on profile avatar, - * which is captured in ClientTweetFollowAuthor - */ - object ProfileFollow extends BaseProfileClientEvent(ActionType.ClientProfileFollow) { - - /** - * ClientTweetFollowAuthor would emit 2 events, 1 with item type Tweet and 1 with item type User - * Both events will go to both actions (the actual classes). For ClientTweetFollowAuthor, - * item type of Tweet will filter out the event with item type User. But for ClientProfileFollow, - * because we need to include item type of User, then we will also include the event of TweetFollowAuthor - * if we don't do anything here. This override ensures we don't include tweet author follow events in ProfileFollow - */ - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = - if (logEvent.eventDetails - .flatMap(_.items).exists(items => items.exists(_.itemType.contains(ItemType.Tweet)))) { - None - } else { - super.getUuaItem(ceItem, logEvent) - } - } - - /** - * This is fired when a user clicks Follow in the caret menu of a Tweet or hovers on the avatar of the tweet author - * and clicks on the Follow button. A profile can also be followed by clicking the Follow button on the Profile - * page and confirm, which is captured in ClientProfileFollow. 
- * The event emits two items, one of user type and another of tweet type, since the default implementation of - * BaseClientEvent only looks for Tweet type, the other item is dropped which is the expected behaviour - */ - object TweetFollowAuthor extends BaseClientEvent(ActionType.ClientTweetFollowAuthor) { - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = { - for { - actionTweetId <- ceItem.id - } yield { - Item.TweetInfo( - ClientEventCommonUtils - .getBasicTweetInfo( - actionTweetId = actionTweetId, - ceItem = ceItem, - ceNamespaceOpt = logEvent.eventNamespace) - .copy(tweetActionInfo = Some( - TweetActionInfo.ClientTweetFollowAuthor( - ClientTweetFollowAuthor( - ClientEventCommonUtils.getTweetAuthorFollowSource(logEvent.eventNamespace)) - )))) - } - } - } - - /** - * This is fired when a user clicks Unfollow in the caret menu of a Tweet or hovers on the avatar of the tweet author - * and clicks on the Unfollow button. A profile can also be unfollowed by clicking the Unfollow button on the Profile - * page and confirm, which will be captured in ClientProfileUnfollow. - * The event emits two items, one of user type and another of tweet type, since the default implementation of - * BaseClientEvent only looks for Tweet type, the other item is dropped which is the expected behaviour - */ - object TweetUnfollowAuthor extends BaseClientEvent(ActionType.ClientTweetUnfollowAuthor) { - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = { - for { - actionTweetId <- ceItem.id - } yield { - Item.TweetInfo( - ClientEventCommonUtils - .getBasicTweetInfo( - actionTweetId = actionTweetId, - ceItem = ceItem, - ceNamespaceOpt = logEvent.eventNamespace) - .copy(tweetActionInfo = Some( - TweetActionInfo.ClientTweetUnfollowAuthor( - ClientTweetUnfollowAuthor( - ClientEventCommonUtils.getTweetAuthorUnfollowSource(logEvent.eventNamespace)) - )))) - } - } - } - - /** - * This is fired when a user clicks Block in the caret menu of a Tweet to block the profile - * that authors this Tweet. A profile can also be blocked in the Profile page, which is captured - * in ClientProfileBlock - */ - object TweetBlockAuthor extends BaseClientEvent(ActionType.ClientTweetBlockAuthor) - - /** - * This is fired when a user clicks unblock in a pop-up prompt right after blocking an author - * in the drop-down menu of a tweet - */ - object TweetUnblockAuthor extends BaseClientEvent(ActionType.ClientTweetUnblockAuthor) - - /** - * This is fired when a user clicks Mute in the caret menu of a Tweet to mute the profile - * that authors this Tweet. A profile can also be muted in the Profile page, which is captured - * in ClientProfileMute - */ - object TweetMuteAuthor extends BaseClientEvent(ActionType.ClientTweetMuteAuthor) - - /** - * This is fired when a user clicks on a Tweet to open the Tweet details page. Note that for - * Tweets in the Notification Tab product surface, a click can be registered differently - * depending on whether the Tweet is a rendered Tweet (a click results in ClientTweetClick) - * or a wrapper Notification (a click results in ClientNotificationClick). - */ - object TweetClick extends BaseClientEvent(ActionType.ClientTweetClick) - - /** - * This is fired when a user clicks to view the profile page of another user from a Tweet - */ - object TweetClickProfile extends BaseClientEvent(ActionType.ClientTweetClickProfile) - - /** - * This is fired when a user clicks on the "share" icon on a Tweet to open the share menu. 
- * The user may or may not proceed and finish sharing the Tweet.
- */
- object TweetClickShare extends BaseClientEvent(ActionType.ClientTweetClickShare)
-
- /**
- * This is fired when a user clicks "Copy link to Tweet" in the menu that appears after hitting
- * the "share" icon on a Tweet OR when a user selects share_via -> copy_link after long-clicking
- * a link inside a tweet on a mobile device
- */
- object TweetShareViaCopyLink extends BaseClientEvent(ActionType.ClientTweetShareViaCopyLink)
-
- /**
- * This is fired when a user clicks "Send via Direct Message" after
- * clicking on the "share" icon on a Tweet to open the share menu.
- * The user may or may not proceed and finish sending the DM.
- */
- object TweetClickSendViaDirectMessage
- extends BaseClientEvent(ActionType.ClientTweetClickSendViaDirectMessage)
-
- /**
- * This is fired when a user clicks "Bookmark" after
- * clicking on the "share" icon on a Tweet to open the share menu.
- */
- object TweetShareViaBookmark extends BaseClientEvent(ActionType.ClientTweetShareViaBookmark)
-
- /**
- * This is fired when a user clicks "Remove Tweet from Bookmarks" after
- * clicking on the "share" icon on a Tweet to open the share menu.
- */
- object TweetUnbookmark extends BaseClientEvent(ActionType.ClientTweetUnbookmark)
-
- /**
- * This event is fired when the user clicks on a hashtag in a Tweet.
- */
- object TweetClickHashtag extends BaseClientEvent(ActionType.ClientTweetClickHashtag) {
- override def getUuaItem(
- ceItem: LogEventItem,
- logEvent: LogEvent
- ): Option[Item] = for {
- actionTweetId <- ceItem.id
- } yield Item.TweetInfo(
- ClientEventCommonUtils
- .getBasicTweetInfo(
- actionTweetId = actionTweetId,
- ceItem = ceItem,
- ceNamespaceOpt = logEvent.eventNamespace)
- .copy(tweetActionInfo = logEvent.eventDetails
- .map(
- _.targets.flatMap(_.headOption.flatMap(_.name))
- ) // fetch the first target in the details; its name carries the hashtag value including the '#' sign
- .map { hashtagOpt =>
- TweetActionInfo.ClientTweetClickHashtag(
- ClientTweetClickHashtag(hashtag = hashtagOpt)
- )
- }))
- }
-
- /**
- * This is fired when a user clicks "Bookmark" after clicking on the "share" icon on a Tweet to
- * open the share menu, or when a user clicks on the 'bookmark' icon on a Tweet (the bookmark icon
- * is available on iOS only as of March 2023).
- * TweetBookmark and TweetShareViaBookmark log the same events but serve individual use cases.
- */
- object TweetBookmark extends BaseClientEvent(ActionType.ClientTweetBookmark)
-
- /**
- * This is fired when a user clicks on a link in a tweet.
- * The link could be displayed as a URL or embedded
- * in a component such as an image or a card in a tweet.
- */
- object TweetOpenLink extends BaseClientEvent(ActionType.ClientTweetOpenLink) {
- override def getUuaItem(
- ceItem: LogEventItem,
- logEvent: LogEvent
- ): Option[Item] =
- for {
- actionTweetId <- ceItem.id
- } yield Item.TweetInfo(
- ClientEventCommonUtils
- .getBasicTweetInfo(
- actionTweetId = actionTweetId,
- ceItem = ceItem,
- ceNamespaceOpt = logEvent.eventNamespace)
- .copy(tweetActionInfo = Some(
- TweetActionInfo.ClientTweetOpenLink(
- ClientTweetOpenLink(url = logEvent.eventDetails.flatMap(_.url))
- ))))
- }
-
- /**
- * This is fired when a user takes a screenshot.
- * This is only available on mobile clients.
- */
- object TweetTakeScreenshot extends BaseClientEvent(ActionType.ClientTweetTakeScreenshot) {
- override def getUuaItem(
- ceItem: LogEventItem,
- logEvent: LogEvent
- ): Option[Item] =
- for {
- actionTweetId <- ceItem.id
- } yield Item.TweetInfo(
- ClientEventCommonUtils
- .getBasicTweetInfo(
- actionTweetId = actionTweetId,
- ceItem = ceItem,
- ceNamespaceOpt = logEvent.eventNamespace)
- .copy(tweetActionInfo = Some(
- TweetActionInfo.ClientTweetTakeScreenshot(
- ClientTweetTakeScreenshot(percentVisibleHeight100k = ceItem.percentVisibleHeight100k)
- ))))
- }
-
- /**
- * This is fired when a user clicks the "This Tweet isn't relevant" button in a prompt displayed
- * after clicking "This Tweet's not helpful" in the search results page or "Not Interested in this Tweet"
- * in the home timeline page.
- * Note: this button is hidden unless a user clicks "This Tweet's not helpful" or
- * "Not Interested in this Tweet" first
- */
- object TweetNotRelevant extends BaseClientEvent(ActionType.ClientTweetNotRelevant)
-
- /**
- * This is fired when a user clicks "Undo" immediately after clicking "This Tweet isn't relevant",
- * which is captured in TweetNotRelevant
- */
- object TweetUndoNotRelevant extends BaseClientEvent(ActionType.ClientTweetUndoNotRelevant)
-
- /**
- * This is fired when a user is logged out and follows a profile from the
- * profile page / people module from web.
- * One can only try to follow from web; iOS and Android do not support logged out browsing
- */
- object ProfileFollowAttempt extends BaseProfileClientEvent(ActionType.ClientProfileFollowAttempt)
-
- /**
- * This is fired when a user is logged out and favourites a tweet from web.
- * One can only try to favourite from web; iOS and Android do not support logged out browsing
- */
- object TweetFavoriteAttempt extends BaseClientEvent(ActionType.ClientTweetFavoriteAttempt)
-
- /**
- * This is fired when a user is logged out and Retweets a tweet from web.
- * One can only try to Retweet from web; iOS and Android do not support logged out browsing
- */
- object TweetRetweetAttempt extends BaseClientEvent(ActionType.ClientTweetRetweetAttempt)
-
- /**
- * This is fired when a user is logged out and replies to a tweet from web.
- * One can only try to reply from web; iOS and Android do not support logged out browsing
- */
- object TweetReplyAttempt extends BaseClientEvent(ActionType.ClientTweetReplyAttempt)
-
- /**
- * This is fired when a user is logged out and clicks on the login button.
- * Currently seems to be generated only on [m5, LiteNativeWrapper] as of Jan 2023.
- */
- object CTALoginClick extends BaseCTAClientEvent(ActionType.ClientCTALoginClick)
-
- /**
- * This is fired when a user is logged out and the login window is shown.
- */
- object CTALoginStart extends BaseCTAClientEvent(ActionType.ClientCTALoginStart)
-
- /**
- * This is fired when a user is logged out and login is successful.
- */
- object CTALoginSuccess extends BaseCTAClientEvent(ActionType.ClientCTALoginSuccess)
-
- /**
- * This is fired when a user is logged out and clicks on the signup button.
- */
- object CTASignupClick extends BaseCTAClientEvent(ActionType.ClientCTASignupClick)
-
- /**
- * This is fired when a user is logged out and signup is successful.
- */
- object CTASignupSuccess extends BaseCTAClientEvent(ActionType.ClientCTASignupSuccess)
-
- /**
- * This is fired when a user opens a Push Notification.
- * Refer to https://confluence.twitter.biz/pages/viewpage.action?pageId=161811800 - * for Push Notification scribe details - */ - object NotificationOpen extends BasePushNotificationClientEvent(ActionType.ClientNotificationOpen) - - /** - * This is fired when a user clicks on a notification in the Notification Tab. - * Refer to go/ntab-urt-scribe for Notification Tab scribe details. - */ - object NotificationClick - extends BaseNotificationTabClientEvent(ActionType.ClientNotificationClick) - - /** - * This is fired when a user taps the "See Less Often" caret menu item of a notification in - * the Notification Tab. - * Refer to go/ntab-urt-scribe for Notification Tab scribe details. - */ - object NotificationSeeLessOften - extends BaseNotificationTabClientEvent(ActionType.ClientNotificationSeeLessOften) - - /** - * This is fired when a user closes or swipes away a Push Notification. - * Refer to https://confluence.twitter.biz/pages/viewpage.action?pageId=161811800 - * for Push Notification scribe details - */ - object NotificationDismiss - extends BasePushNotificationClientEvent(ActionType.ClientNotificationDismiss) - - /** - * This is fired when a user clicks on a typeahead suggestion(queries, events, topics, users) - * in a drop-down menu of a search box or a tweet compose box. - */ - object TypeaheadClick extends BaseSearchTypeaheadEvent(ActionType.ClientTypeaheadClick) - - /** - * This is a generic event fired when the user submits feedback on a prompt. - * Some examples include Did You Find It Prompt and Tweet Relevance on Search Results Page. - */ - object FeedbackPromptSubmit - extends BaseFeedbackSubmitClientEvent(ActionType.ClientFeedbackPromptSubmit) - - object AppExit extends BaseUASClientEvent(ActionType.ClientAppExit) -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventImpression.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventImpression.docx new file mode 100644 index 000000000..909a489ce Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventImpression.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventImpression.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventImpression.scala deleted file mode 100644 index e0315015f..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/ClientEventImpression.scala +++ /dev/null @@ -1,207 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.EventNamespace -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.logbase.thriftscala.LogBase -import com.twitter.unified_user_actions.thriftscala._ -import com.twitter.unified_user_actions.thriftscala.Item.TweetInfo - -object ClientEventImpression { - object TweetLingerImpression extends BaseClientEvent(ActionType.ClientTweetLingerImpression) { - override def getUuaItem( - ceItem: LogEventItem, - logEvent: LogEvent - ): Option[Item] = { - for { - actionTweetId <- ceItem.id - impressionDetails <- ceItem.impressionDetails - lingerStartTimestampMs <- impressionDetails.visibilityStart - lingerEndTimestampMs <- 
impressionDetails.visibilityEnd - } yield { - Item.TweetInfo( - ClientEventCommonUtils - .getBasicTweetInfo(actionTweetId, ceItem, logEvent.eventNamespace) - .copy(tweetActionInfo = Some( - TweetActionInfo.ClientTweetLingerImpression( - ClientTweetLingerImpression( - lingerStartTimestampMs = lingerStartTimestampMs, - lingerEndTimestampMs = lingerEndTimestampMs - ) - )))) - } - } - } - - /** - * To make parity with iesource's definition, render impression for quoted Tweets would emit - * 2 events: 1 for the quoting Tweet and 1 for the original Tweet!!! - */ - object TweetRenderImpression extends BaseClientEvent(ActionType.ClientTweetRenderImpression) { - override def toUnifiedUserAction(logEvent: LogEvent): Seq[UnifiedUserAction] = { - - val logBase: Option[LogBase] = logEvent.logBase - - val raw = for { - ed <- logEvent.eventDetails.toSeq - items <- ed.items.toSeq - ceItem <- items - eventTimestamp <- logBase.flatMap(getSourceTimestamp) - uuaItem <- getUuaItem(ceItem, logEvent) - if isItemTypeValid(ceItem.itemType) - } yield { - val userIdentifier: UserIdentifier = UserIdentifier( - userId = logBase.flatMap(_.userId), - guestIdMarketing = logBase.flatMap(_.guestIdMarketing)) - - val productSurface: Option[ProductSurface] = ProductSurfaceUtils - .getProductSurface(logEvent.eventNamespace) - - val eventMetaData: EventMetadata = ClientEventCommonUtils - .getEventMetadata( - eventTimestamp = eventTimestamp, - logEvent = logEvent, - ceItem = ceItem, - productSurface = productSurface - ) - - UnifiedUserAction( - userIdentifier = userIdentifier, - item = uuaItem, - actionType = ActionType.ClientTweetRenderImpression, - eventMetadata = eventMetaData, - productSurface = productSurface, - productSurfaceInfo = - ProductSurfaceUtils.getProductSurfaceInfo(productSurface, ceItem, logEvent) - ) - } - - raw.flatMap { e => - e.item match { - case TweetInfo(t) => - // If it is an impression toward quoted Tweet we emit 2 impressions, 1 for quoting Tweet - // and 1 for the original Tweet. 
- if (t.quotedTweetId.isDefined) { - val originalItem = t.copy( - actionTweetId = t.quotedTweetId.get, - actionTweetAuthorInfo = t.quotedAuthorId.map(id => AuthorInfo(authorId = Some(id))), - quotingTweetId = Some(t.actionTweetId), - quotedTweetId = None, - inReplyToTweetId = None, - replyingTweetId = None, - retweetingTweetId = None, - retweetedTweetId = None, - quotedAuthorId = None, - retweetingAuthorId = None, - inReplyToAuthorId = None - ) - val original = e.copy(item = TweetInfo(originalItem)) - Seq(original, e) - } else Seq(e) - case _ => Nil - } - } - } - } - - object TweetGalleryImpression extends BaseClientEvent(ActionType.ClientTweetGalleryImpression) - - object TweetDetailsImpression extends BaseClientEvent(ActionType.ClientTweetDetailsImpression) { - - case class EventNamespaceInternal( - client: String, - page: String, - section: String, - component: String, - element: String, - action: String) - - def isTweetDetailsImpression(eventNamespaceOpt: Option[EventNamespace]): Boolean = - eventNamespaceOpt.exists { eventNamespace => - val eventNamespaceInternal = EventNamespaceInternal( - client = eventNamespace.client.getOrElse(""), - page = eventNamespace.page.getOrElse(""), - section = eventNamespace.section.getOrElse(""), - component = eventNamespace.component.getOrElse(""), - element = eventNamespace.element.getOrElse(""), - action = eventNamespace.action.getOrElse(""), - ) - - isIphoneAppOrMacAppOrIpadAppClientTweetDetailsImpression( - eventNamespaceInternal) || isAndroidAppClientTweetDetailsImpression( - eventNamespaceInternal) || isWebClientTweetDetailImpression( - eventNamespaceInternal) || isTweetDeckAppClientTweetDetailsImpression( - eventNamespaceInternal) || isOtherAppClientTweetDetailsImpression(eventNamespaceInternal) - } - - private def isWebClientTweetDetailImpression( - eventNamespace: EventNamespaceInternal - ): Boolean = { - val eventNameSpaceStr = - eventNamespace.client + ":" + eventNamespace.page + ":" + eventNamespace.section + ":" + eventNamespace.component + ":" + eventNamespace.element + ":" + eventNamespace.action - eventNameSpaceStr.equalsIgnoreCase("m5:tweet::::show") || eventNameSpaceStr.equalsIgnoreCase( - "m5:tweet:landing:::show") || eventNameSpaceStr - .equalsIgnoreCase("m2:tweet::::impression") || eventNameSpaceStr.equalsIgnoreCase( - "m2:tweet::tweet::impression") || eventNameSpaceStr - .equalsIgnoreCase("LiteNativeWrapper:tweet::::show") || eventNameSpaceStr.equalsIgnoreCase( - "LiteNativeWrapper:tweet:landing:::show") - } - - private def isOtherAppClientTweetDetailsImpression( - eventNamespace: EventNamespaceInternal - ): Boolean = { - val excludedClients = Set( - "web", - "m5", - "m2", - "LiteNativeWrapper", - "iphone", - "ipad", - "mac", - "android", - "android_tablet", - "deck") - (!excludedClients.contains(eventNamespace.client)) && eventNamespace.page - .equalsIgnoreCase("tweet") && eventNamespace.section - .equalsIgnoreCase("") && eventNamespace.component - .equalsIgnoreCase("tweet") && eventNamespace.element - .equalsIgnoreCase("") && eventNamespace.action.equalsIgnoreCase("impression") - } - - private def isTweetDeckAppClientTweetDetailsImpression( - eventNamespace: EventNamespaceInternal - ): Boolean = - eventNamespace.client - .equalsIgnoreCase("deck") && eventNamespace.page - .equalsIgnoreCase("tweet") && eventNamespace.section - .equalsIgnoreCase("") && eventNamespace.component - .equalsIgnoreCase("tweet") && eventNamespace.element - .equalsIgnoreCase("") && eventNamespace.action.equalsIgnoreCase("impression") - - private def 
isAndroidAppClientTweetDetailsImpression( - eventNamespace: EventNamespaceInternal - ): Boolean = - (eventNamespace.client - .equalsIgnoreCase("android") || eventNamespace.client - .equalsIgnoreCase("android_tablet")) && eventNamespace.page - .equalsIgnoreCase("tweet") && eventNamespace.section.equalsIgnoreCase( - "") && (eventNamespace.component - .equalsIgnoreCase("tweet") || eventNamespace.component - .matches("^suggest.*_tweet.*$") || eventNamespace.component - .equalsIgnoreCase("")) && eventNamespace.element - .equalsIgnoreCase("") && eventNamespace.action.equalsIgnoreCase("impression") - - private def isIphoneAppOrMacAppOrIpadAppClientTweetDetailsImpression( - eventNamespace: EventNamespaceInternal - ): Boolean = - (eventNamespace.client - .equalsIgnoreCase("iphone") || eventNamespace.client - .equalsIgnoreCase("ipad") || eventNamespace.client - .equalsIgnoreCase("mac")) && eventNamespace.page.equalsIgnoreCase( - "tweet") && eventNamespace.section - .equalsIgnoreCase("") && (eventNamespace.component - .equalsIgnoreCase("tweet") || eventNamespace.component - .matches("^suggest.*_tweet.*$")) && eventNamespace.element - .equalsIgnoreCase("") && eventNamespace.action.equalsIgnoreCase("impression") - } -} diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/HomeInfoUtils.docx b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/HomeInfoUtils.docx new file mode 100644 index 000000000..5d4d4e0cc Binary files /dev/null and b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/HomeInfoUtils.docx differ diff --git a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/HomeInfoUtils.scala b/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/HomeInfoUtils.scala deleted file mode 100644 index 276908f02..000000000 --- a/unified_user_actions/adapter/src/main/scala/com/twitter/unified_user_actions/adapter/client_event/HomeInfoUtils.scala +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.unified_user_actions.adapter.client_event - -import com.twitter.clientapp.thriftscala.{Item => LogEventItem} -import com.twitter.suggests.controller_data.home_tweets.thriftscala.HomeTweetsControllerData -import com.twitter.suggests.controller_data.home_tweets.thriftscala.HomeTweetsControllerDataAliases.V1Alias -import com.twitter.suggests.controller_data.thriftscala.ControllerData -import com.twitter.suggests.controller_data.v2.thriftscala.{ControllerData => ControllerDataV2} - -object HomeInfoUtils { - - def getHomeTweetControllerDataV1(ceItem: LogEventItem): Option[V1Alias] = { - ceItem.suggestionDetails - .flatMap(_.decodedControllerData) - .flatMap(_ match { - case ControllerData.V2( - ControllerDataV2.HomeTweets( - HomeTweetsControllerData.V1(homeTweetsControllerDataV1) - )) => - Some(homeTweetsControllerDataV1) - case _ => None - }) - } - - def getTraceId(ceItem: LogEventItem): Option[Long] = - getHomeTweetControllerDataV1(ceItem).flatMap(_.traceId) - - def getSuggestType(ceItem: LogEventItem): Option[String] = - ceItem.suggestionDetails.flatMap(_.suggestionType) - - def getRequestJoinId(ceItem: LogEventItem): Option[Long] = - getHomeTweetControllerDataV1(ceItem).flatMap(_.requestJoinId) -}
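Editor's note: the deleted adapters above repeatedly follow one shape: an event object extends a base class, and when it needs action-specific details it overrides getUuaItem to copy the basic tweet info and attach a payload. The following is a minimal, self-contained Scala sketch of that shape; it is illustrative only and is not part of the diff. The case classes and names below (LogEventItem, TweetInfo, TweetOpenLinkSketch, ...) are invented stand-ins for the thrift types and ClientEventCommonUtils helpers referenced in the deleted file.

// Sketch only: models the getUuaItem enrichment pattern with simplified types.
object ClientEventAdapterSketch {

  // Invented stand-ins for the thrift structures used by the real adapters.
  final case class LogEventItem(id: Option[Long], url: Option[String] = None)
  final case class LogEvent(items: Seq[LogEventItem])
  final case class TweetInfo(actionTweetId: Long, openedUrl: Option[String] = None)

  // Default behaviour: keep only the basic tweet info; drop items without a tweet id.
  abstract class BaseClientEvent(val actionType: String) {
    def getUuaItem(ceItem: LogEventItem, logEvent: LogEvent): Option[TweetInfo] =
      ceItem.id.map(TweetInfo(_))
  }

  // Analogue of TweetOpenLink: same basic info, enriched with the clicked URL.
  object TweetOpenLinkSketch extends BaseClientEvent("ClientTweetOpenLink") {
    override def getUuaItem(ceItem: LogEventItem, logEvent: LogEvent): Option[TweetInfo] =
      for {
        actionTweetId <- ceItem.id // drop items that carry no tweet id
      } yield TweetInfo(actionTweetId).copy(openedUrl = ceItem.url)
  }

  def main(args: Array[String]): Unit = {
    val event = LogEvent(Seq(LogEventItem(Some(123L), Some("https://example.com"))))
    // One UUA item per qualifying client-event item.
    println(event.items.flatMap(TweetOpenLinkSketch.getUuaItem(_, event)))
  }
}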
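Editor's note: the trickiest logic in the deleted ClientEventImpression.scala is TweetRenderImpression's handling of quoted Tweets, where one impression fans out into two events. Below is a self-contained sketch of that fan-out under simplified, invented types (TweetInfo, UnifiedUserAction, expandQuotedImpression); it is not the production code, only an illustration of the control flow.

// Sketch only: one render impression on a quoting Tweet yields two UUA events,
// one for the quoting Tweet and one for the quoted (original) Tweet.
object RenderImpressionSketch {

  final case class TweetInfo(
    actionTweetId: Long,
    quotedTweetId: Option[Long] = None,
    quotingTweetId: Option[Long] = None)

  final case class UnifiedUserAction(actionType: String, item: TweetInfo)

  def expandQuotedImpression(e: UnifiedUserAction): Seq[UnifiedUserAction] =
    e.item.quotedTweetId match {
      case Some(quotedId) =>
        // Event for the original (quoted) Tweet: it becomes the action Tweet and
        // records which Tweet quoted it; its own quote reference is cleared.
        val original = e.copy(item = TweetInfo(
          actionTweetId = quotedId,
          quotingTweetId = Some(e.item.actionTweetId)))
        Seq(original, e) // plus the unchanged event for the quoting Tweet
      case None => Seq(e)
    }

  def main(args: Array[String]): Unit = {
    val impression =
      UnifiedUserAction("ClientTweetRenderImpression", TweetInfo(1L, quotedTweetId = Some(2L)))
    expandQuotedImpression(impression).foreach(println) // prints two events: ids 2 and 1
  }
}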