diff --git a/twml/BUILD b/twml/BUILD deleted file mode 100644 index c339f6fae..000000000 --- a/twml/BUILD +++ /dev/null @@ -1,186 +0,0 @@ -twml_sources = [ - "twml/**/*.py", -] - -twml_deps = [ - "3rdparty/python/cherrypy:default", - "3rdparty/python/pyyaml:default", - "3rdparty/python/absl-py:default", - "3rdparty/python/joblib:default", - "3rdparty/python/kazoo:default", - "3rdparty/python/python-dateutil:default", - "3rdparty/python/pytz:default", - "cortex/ml-metastore/src/main/python/com/twitter/mlmetastore/modelrepo/client", - "src/python/twitter/common/app", - "src/python/twitter/common/app/modules:vars", - "src/python/twitter/common/metrics", - "src/python/twitter/deepbird/compat/v1/optimizers", - "src/python/twitter/deepbird/compat/v1/rnn", - "src/python/twitter/deepbird/hparam", - "src/python/twitter/deepbird/io", - "src/python/twitter/deepbird/io/legacy", - "src/python/twitter/deepbird/logging", - "src/python/twitter/deepbird/sparse", - "src/python/twitter/deepbird/stats_server", - "src/python/twitter/deepbird/util:simple-data-record-handler", - "src/python/twitter/deepbird/util/hashing", - "src/python/twitter/ml/api/dal", - "src/python/twitter/ml/common:metrics", - "src/python/twitter/ml/common/kubernetes", - "src/python/twitter/ml/common:resources", - "src/python/twitter/ml/twml/kubernetes", - "src/python/twitter/ml/twml:status", - "src/thrift/com/twitter/dal:dal_no_constants-python", - "src/thrift/com/twitter/statebird:compiled-v2-python", -] - -python3_library( - name = "twml-test-common-deps", - tags = ["no-mypy"], - dependencies = [ - "src/python/twitter/deepbird/util:inference", - "src/python/twitter/deepbird/util/data", - "src/thrift/com/twitter/ml/api:data-python", - "twml/tests/data:resources", - ], -) - -python3_library( - name = "twml_packer_deps_no_tf", - tags = [ - "bazel-compatible", - "no-mypy", - ], - dependencies = [ - "3rdparty/python/numpy:default", - "3rdparty/python/pandas:default", - "3rdparty/python/pyyaml:default", - "3rdparty/python/requests:default", - "3rdparty/python/scikit-learn:default", - "3rdparty/python/scipy:default", - "3rdparty/python/tensorflow-hub:default", - "3rdparty/python/thriftpy2:default", - ], -) - -python3_library( - name = "twml_packer_deps_no_tf_py3", - tags = [ - "known-to-fail-jira:CX-20246", - "no-mypy", - ], - dependencies = [ - ":twml_packer_deps_no_tf", - "3rdparty/python/tensorflow-model-analysis", - ], -) - -alias( - name = "twml-test-shared", - target = ":twml_common", -) - -python3_library( - name = "twml_common", - sources = ["twml_common/**/*.py"], - tags = [ - "bazel-compatible", - "no-mypy", - ], -) - -# Alias twml-dev to twml to avoid breaking user targets. -alias( - name = "twml-dev", - target = "twml", -) - -python3_library( - name = "twml-test-dev-deps", - tags = [ - "bazel-compatible", - "no-mypy", - ], - dependencies = [ - ":twml", - ":twml-test-common-deps", - ":twml-test-shared", - "3rdparty/python/freezegun:default", - "src/python/twitter/deepbird/keras/layers", - "src/thrift/com/twitter/ml/api:data-python", - "src/thrift/com/twitter/ml/prediction_service:prediction_service-python", - ], -) - -python3_library( - name = "twml-dev-python", - sources = twml_sources, - tags = [ - "bazel-compatible", - "no-mypy", - ], - dependencies = twml_deps + [ - ":twml_packer_deps_no_tf", - "3rdparty/python/tensorflow", - "3rdparty/python/twml:libtwml-universal", - "twml/libtwml:libtwml-python", - ], -) - -# Build a smaller .pex file that models can depend on. 
-# Tensorflow and other dependencies are downloaded from Packer on Aurora. -# Note: This gets the C++ ops through 3rdparty artifacts. -python3_library( - name = "twml-nodeps", - sources = twml_sources, - tags = [ - "bazel-compatible", - "no-mypy", - ], - dependencies = twml_deps + [ - "3rdparty/python/twml:libtwml-universal", - ], -) - -python3_library( - name = "twml", - tags = [ - "bazel-compatible", - "no-mypy", - ], - dependencies = [ - ":twml-nodeps", - ":twml_packer_deps_no_tf", - "3rdparty/python/tensorflow", - ], -) - -python37_binary( - name = "tensorboard", - source = "twml/tensorboard/__main__.py", - dependencies = [ - "3rdparty/python/_closures/twml:tensorboard", - "3rdparty/python/tensorflow", - ], -) - -python37_binary( - name = "saved_model_cli", - source = "twml/saved_model_cli/__main__.py", - dependencies = [ - "3rdparty/python/_closures/twml:saved_model_cli", - "3rdparty/python/tensorflow", - ], -) - -# This target is added so twml can be used regardless of the Tensorflow version: -# This target does not pull in TensorFlow 1.x or the related libtwml compiled using TF 1.x. -python3_library( - name = "twml-py-source-only", - sources = twml_sources, - tags = [ - "known-to-fail-jira:CX-23416", - "no-mypy", - ], - dependencies = twml_deps, -) diff --git a/twml/README.md b/twml/README.md deleted file mode 100644 index b2b315b45..000000000 --- a/twml/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# TWML - ---- -Note: `twml` is no longer under development. Much of the code here is out of date and unused. -It is included here for completeness, because `twml` is still used to train the light ranker models -(see `src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/README.md`) ---- - -TWML is one of Twitter's machine learning frameworks, which uses Tensorflow under the hood. While it is mostly -deprecated, -it is still currently used to train the Earlybird light ranking models ( -see `src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py`). -The most relevant part of this is the `DataRecordTrainer` class, which is where the core training logic resides. 
diff --git a/twml/libtwml/BUILD b/twml/libtwml/BUILD deleted file mode 100644 index c80b64b3b..000000000 --- a/twml/libtwml/BUILD +++ /dev/null @@ -1,8 +0,0 @@ -python3_library( - name = "libtwml-python", - sources = ["libtwml/**/*.py"], - tags = [ - "no-mypy", - "bazel-compatible", - ], -) diff --git a/twml/libtwml/include/twml.h b/twml/libtwml/include/twml.h deleted file mode 100644 index 9d88cdc7b..000000000 --- a/twml/libtwml/include/twml.h +++ /dev/null @@ -1,21 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include diff --git a/twml/libtwml/include/twml/BatchPredictionRequest.h b/twml/libtwml/include/twml/BatchPredictionRequest.h deleted file mode 100644 index 6070ec045..000000000 --- a/twml/libtwml/include/twml/BatchPredictionRequest.h +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#ifdef __cplusplus - -#include -#include -#include - -namespace twml { - -template -class GenericBatchPredictionRequest { - static_assert(std::is_same::value || - std::is_same::value, - "RecordType has to be HashedDatarecord or DataRecord"); - public: - typedef typename RecordType::Reader Reader; - GenericBatchPredictionRequest(int numOfLabels=0, int numOfWeights=0): - m_common_features(), m_requests(), - num_labels(numOfLabels), num_weights(numOfWeights) - {} - - void decode(Reader &reader); - - std::vector& requests() { - return m_requests; - } - - RecordType& common() { - return m_common_features; - } - - private: - RecordType m_common_features; - std::vector m_requests; - int num_labels; - int num_weights; -}; - -using HashedBatchPredictionRequest = GenericBatchPredictionRequest; -using BatchPredictionRequest = GenericBatchPredictionRequest; - -} - -#endif diff --git a/twml/libtwml/include/twml/BatchPredictionResponse.h b/twml/libtwml/include/twml/BatchPredictionResponse.h deleted file mode 100644 index b7e709464..000000000 --- a/twml/libtwml/include/twml/BatchPredictionResponse.h +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace twml { - - // Encodes a batch of model predictions as a list of Thrift DataRecord - // objects inside a Thrift BatchPredictionResponse object. Prediction - // values are continousFeatures inside each DataRecord. - // - // The BatchPredictionResponseWriter TensorFlow operator uses this class - // to determine the size of the output tensor to allocate. The operator - // then allocates memory for the output tensor and uses this class to - // write binary Thrift to the output tensor. - // - class BatchPredictionResponse { - private: - uint64_t batch_size_; - const Tensor &keys_; - const Tensor &values_; // prediction values (batch_size * num_keys) - const Tensor &dense_keys_; - const std::vector &dense_values_; - - inline uint64_t getBatchSize() { return batch_size_; } - inline bool hasContinuous() { return keys_.getNumDims() > 0; } - inline bool hasDenseTensors() { return dense_keys_.getNumDims() > 0; } - - inline uint64_t getPredictionSize() { - return values_.getNumDims() > 1 ? 
values_.getDim(1) : 1; - }; - - void encode(twml::ThriftWriter &thrift_writer); - - template - void serializePredictions(twml::ThriftWriter &thrift_writer); - - public: - // keys: 'continuousFeatures' prediction keys - // values: 'continuousFeatures' prediction values (batch_size * num_keys) - // dense_keys: 'tensors' prediction keys - // dense_values: 'tensors' prediction values (batch_size * num_keys) - BatchPredictionResponse( - const Tensor &keys, const Tensor &values, - const Tensor &dense_keys, const std::vector &dense_values); - - // Calculate the size of the Thrift encoded output (but do not encode). - // The BatchPredictionResponseWriter TensorFlow operator uses this value - // to allocate the output tensor. - uint64_t encodedSize(); - - // Write the BatchPredictionResponse as binary Thrift. The - // BatchPredictionResponseWriter operator uses this method to populate - // the output tensor. - void write(Tensor &result); - }; -} diff --git a/twml/libtwml/include/twml/BlockFormatReader.h b/twml/libtwml/include/twml/BlockFormatReader.h deleted file mode 100644 index 4c68458ba..000000000 --- a/twml/libtwml/include/twml/BlockFormatReader.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace twml { -class BlockFormatReader { - private: - int record_size_; - long block_pos_; - long block_end_; - char classname_[1024]; - - int read_one_record_size(); - int read_int(); - int consume_marker(int scan); - int unpack_varint_i32(); - int unpack_tag_and_wiretype(uint32_t *tag, uint32_t *wiretype); - int unpack_string(char *out, uint64_t max_out_len); - - public: - BlockFormatReader(); - bool next(); - uint64_t current_size() const { return record_size_; } - - virtual uint64_t read_bytes(void *dest, int size, int count) = 0; -}; -} diff --git a/twml/libtwml/include/twml/BlockFormatWriter.h b/twml/libtwml/include/twml/BlockFormatWriter.h deleted file mode 100644 index b9c496f40..000000000 --- a/twml/libtwml/include/twml/BlockFormatWriter.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include - -#ifndef PATH_MAX -#define PATH_MAX (8096) -#endif - -#ifdef __cplusplus -extern "C" { -#endif - - struct block_format_writer__; - typedef block_format_writer__ * block_format_writer; - -#ifdef __cplusplus -} -#endif - - -#ifdef __cplusplus -namespace twml { - class BlockFormatWriter { - private: - const char *file_name_; - FILE *outputfile_; - char temp_file_name_[PATH_MAX]; - int record_index_; - int records_per_block_; - - int pack_tag_and_wiretype(FILE *file, uint32_t tag, uint32_t wiretype); - int pack_varint_i32(FILE *file, int value); - int pack_string(FILE *file, const char *in, size_t in_len); - int write_int(FILE *file, int value); - - public: - BlockFormatWriter(const char *file_name, int record_per_block); - ~BlockFormatWriter(); - int write(const char *class_name, const char *record, int record_len) ; - int flush(); - block_format_writer getHandle(); - }; - - BlockFormatWriter *getBlockFormatWriter(block_format_writer w); -} //twml namespace -#endif - -#ifdef __cplusplus -extern "C" { -#endif -twml_err block_format_writer_create(block_format_writer *w, const char *file_name, int records_per_block); -twml_err block_format_write(block_format_writer w, const char *class_name, const char *record, int record_len); -twml_err block_format_flush(block_format_writer w); -twml_err block_format_writer_delete(const block_format_writer w); -#ifdef __cplusplus -} -#endif diff --git 
a/twml/libtwml/include/twml/DataRecord.h b/twml/libtwml/include/twml/DataRecord.h deleted file mode 100644 index f39f1158b..000000000 --- a/twml/libtwml/include/twml/DataRecord.h +++ /dev/null @@ -1,108 +0,0 @@ -#pragma once -#ifdef __cplusplus - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace twml { - -class DataRecordReader; - -class TWMLAPI DataRecord : public TensorRecord { -public: - typedef std::vector> SparseContinuousValueType; - typedef std::vector SparseBinaryValueType; - typedef Set BinaryFeatures; - typedef Map ContinuousFeatures; - typedef Map DiscreteFeatures; - typedef Map StringFeatures; - typedef Map SparseBinaryFeatures; - typedef Map SparseContinuousFeatures; - typedef Map> BlobFeatures; - -private: - BinaryFeatures m_binary; - ContinuousFeatures m_continuous; - DiscreteFeatures m_discrete; - StringFeatures m_string; - SparseBinaryFeatures m_sparsebinary; - SparseContinuousFeatures m_sparsecontinuous; - BlobFeatures m_blob; - - - std::vector m_labels; - std::vector m_weights; - - void addLabel(int64_t id, double label = 1); - void addWeight(int64_t id, double value); - -public: - typedef DataRecordReader Reader; - - DataRecord(int num_labels=0, int num_weights=0): - m_binary(), - m_continuous(), - m_discrete(), - m_string(), - m_sparsebinary(), - m_sparsecontinuous(), - m_blob(), - m_labels(num_labels, std::nanf("")), - m_weights(num_weights) { -#ifdef USE_DENSE_HASH - m_binary.set_empty_key(0); - m_continuous.set_empty_key(0); - m_discrete.set_empty_key(0); - m_string.set_empty_key(0); - m_sparsebinary.set_empty_key(0); - m_sparsecontinuous.set_empty_key(0); -#endif - m_binary.max_load_factor(0.5); - m_continuous.max_load_factor(0.5); - m_discrete.max_load_factor(0.5); - m_string.max_load_factor(0.5); - m_sparsebinary.max_load_factor(0.5); - m_sparsecontinuous.max_load_factor(0.5); - } - - const BinaryFeatures &getBinary() const { return m_binary; } - const ContinuousFeatures &getContinuous() const { return m_continuous; } - const DiscreteFeatures &getDiscrete() const { return m_discrete; } - const StringFeatures &getString() const { return m_string; } - const SparseBinaryFeatures &getSparseBinary() const { return m_sparsebinary; } - const SparseContinuousFeatures &getSparseContinuous() const { return m_sparsecontinuous; } - const BlobFeatures &getBlob() const { return m_blob; } - - const std::vector &labels() const { return m_labels; } - const std::vector &weights() const { return m_weights; } - - // used by DataRecordWriter - template - void addContinuous(std::vector feature_ids, std::vector values) { - for (size_t i = 0; i < feature_ids.size(); ++i){ - m_continuous[feature_ids[i]] = values[i]; - } - } - - template - void addContinuous(const int64_t *keys, uint64_t num_keys, T *values) { - for (size_t i = 0; i < num_keys; ++i){ - m_continuous[keys[i]] = values[i]; - } - } - - void decode(DataRecordReader &reader); - void clear(); - friend class DataRecordReader; -}; - -} -#endif diff --git a/twml/libtwml/include/twml/DataRecordReader.h b/twml/libtwml/include/twml/DataRecordReader.h deleted file mode 100644 index 0ef8e64ff..000000000 --- a/twml/libtwml/include/twml/DataRecordReader.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once -#ifdef __cplusplus - -#include -#include -#include -#include - -#include - -#include -#include -#include - -namespace twml { - -class TWMLAPI DataRecordReader : public TensorRecordReader { - -private: - typedef Map KeyMap_t; - KeyMap_t *m_keep_map; - KeyMap_t *m_labels_map; - KeyMap_t 
*m_weights_map; - -public: - bool keepKey (const int64_t &key, int64_t &code); - bool isLabel (const int64_t &key, int64_t &code); - bool isWeight (const int64_t &key, int64_t &code); - void readBinary (const int feature_type , DataRecord *record); - void readContinuous (const int feature_type , DataRecord *record); - void readDiscrete (const int feature_type , DataRecord *record); - void readString (const int feature_type , DataRecord *record); - void readSparseBinary (const int feature_type , DataRecord *record); - void readSparseContinuous (const int feature_type , DataRecord *record); - void readBlob (const int feature_type , DataRecord *record); - - DataRecordReader() : - TensorRecordReader(nullptr), - m_keep_map(nullptr), - m_labels_map(nullptr), - m_weights_map(nullptr) - {} - - // Using a template instead of int64_t because tensorflow implements int64 based on compiler. - void setKeepMap(KeyMap_t *keep_map) { - m_keep_map = keep_map; - } - - void setLabelsMap(KeyMap_t *labels_map) { - m_labels_map = labels_map; - } - - void setWeightsMap(KeyMap_t *weights_map) { - m_weights_map = weights_map; - } - - void setDecodeMode(int64_t mode) {} -}; - -} -#endif diff --git a/twml/libtwml/include/twml/DataRecordWriter.h b/twml/libtwml/include/twml/DataRecordWriter.h deleted file mode 100644 index 6b330d323..000000000 --- a/twml/libtwml/include/twml/DataRecordWriter.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once -#ifdef __cplusplus - -#include -#include -#include - -namespace twml { - -// Encodes DataRecords as binary Thrift. BatchPredictionResponse -// uses this class to encode prediction responses through our -// TensorFlow response writer operator. -class TWMLAPI DataRecordWriter { - private: - uint32_t m_records_written; - twml::ThriftWriter &m_thrift_writer; - twml::TensorRecordWriter m_tensor_writer; - - void writeBinary(twml::DataRecord &record); - void writeContinuous(twml::DataRecord &record); - void writeDiscrete(twml::DataRecord &record); - void writeString(twml::DataRecord &record); - void writeSparseBinaryFeatures(twml::DataRecord &record); - void writeSparseContinuousFeatures(twml::DataRecord &record); - void writeBlobFeatures(twml::DataRecord &record); - void writeDenseTensors(twml::DataRecord &record); - - public: - DataRecordWriter(twml::ThriftWriter &thrift_writer): - m_records_written(0), - m_thrift_writer(thrift_writer), - m_tensor_writer(twml::TensorRecordWriter(thrift_writer)) { } - - uint32_t getRecordsWritten(); - uint64_t write(twml::DataRecord &record); -}; - -} -#endif diff --git a/twml/libtwml/include/twml/Error.h b/twml/libtwml/include/twml/Error.h deleted file mode 100644 index 89307d214..000000000 --- a/twml/libtwml/include/twml/Error.h +++ /dev/null @@ -1,48 +0,0 @@ -#pragma once -#include - -#ifdef __cplusplus -#include -#include -#include -#include - -namespace twml { - -class Error : public std::runtime_error { - private: - twml_err m_err; - public: - Error(twml_err err, const std::string &msg) : - std::runtime_error(msg), m_err(err) - { - } - - twml_err err() const - { - return m_err; - } -}; - -class ThriftInvalidField: public twml::Error { - public: - ThriftInvalidField(int16_t field_id, const std::string& func) : - Error(TWML_ERR_THRIFT, - "Found invalid field (" + std::to_string(field_id) - + ") while reading thrift [" + func + "]") - { - } -}; - -class ThriftInvalidType: public twml::Error { - public: - ThriftInvalidType(uint8_t type_id, const std::string& func, const std::string type) : - Error(TWML_ERR_THRIFT, - "Found invalid type (" + 
std::to_string(type_id) + - ") while reading thrift [" + func + "::" + type + "]") - { - } -}; - -} -#endif diff --git a/twml/libtwml/include/twml/HashedDataRecord.h b/twml/libtwml/include/twml/HashedDataRecord.h deleted file mode 100644 index de63c4dc7..000000000 --- a/twml/libtwml/include/twml/HashedDataRecord.h +++ /dev/null @@ -1,70 +0,0 @@ -#pragma once -#ifdef __cplusplus - -#include -#include - -#include -#include -#include - -namespace twml { - -class HashedDataRecordReader; - -class TWMLAPI HashedDataRecord : public TensorRecord { - public: - typedef HashedDataRecordReader Reader; - - HashedDataRecord(int num_labels=0, int num_weights=0): - m_keys(), - m_transformed_keys(), - m_values(), - m_codes(), - m_types(), - m_labels(num_labels, std::nanf("")), - m_weights(num_weights) {} - - void decode(HashedDataRecordReader &reader); - - const std::vector &keys() const { return m_keys; } - const std::vector &transformed_keys() const { return m_transformed_keys; } - const std::vector &values() const { return m_values; } - const std::vector &codes() const { return m_codes; } - const std::vector &types() const { return m_types; } - - const std::vector &labels() const { return m_labels; } - const std::vector &weights() const { return m_weights; } - - void clear(); - - uint64_t totalSize() const { return m_keys.size(); } - - void extendSize(int delta_size) { - int count = m_keys.size() + delta_size; - m_keys.reserve(count); - m_transformed_keys.reserve(count); - m_values.reserve(count); - m_codes.reserve(count); - m_types.reserve(count); - } - - private: - std::vector m_keys; - std::vector m_transformed_keys; - std::vector m_values; - std::vector m_codes; - std::vector m_types; - - std::vector m_labels; - std::vector m_weights; - - void addKey(int64_t key, int64_t transformed_key, int64_t code, uint8_t type, double value=1); - void addLabel(int64_t id, double value = 1); - void addWeight(int64_t id, double value); - - friend class HashedDataRecordReader; -}; - -} -#endif \ No newline at end of file diff --git a/twml/libtwml/include/twml/HashedDataRecordReader.h b/twml/libtwml/include/twml/HashedDataRecordReader.h deleted file mode 100644 index 5470eb5c8..000000000 --- a/twml/libtwml/include/twml/HashedDataRecordReader.h +++ /dev/null @@ -1,70 +0,0 @@ -#pragma once -#ifdef __cplusplus - -#include -#include -#include -#include - -#include - -#include -#include -#include - -namespace twml { - -enum class DecodeMode: int64_t -{ - hash_valname = 0, - hash_fname_and_valname = 1, -}; - -class TWMLAPI HashedDataRecordReader : public TensorRecordReader { -private: - typedef Map KeyMap_t; - KeyMap_t *m_keep_map; - KeyMap_t *m_labels_map; - KeyMap_t *m_weights_map; - DecodeMode m_decode_mode; - -public: - bool keepId (const int64_t &key, int64_t &code); - bool isLabel (const int64_t &key, int64_t &code); - bool isWeight (const int64_t &key, int64_t &code); - void readBinary (const int feature_type , HashedDataRecord *record); - void readContinuous (const int feature_type , HashedDataRecord *record); - void readDiscrete (const int feature_type , HashedDataRecord *record); - void readString (const int feature_type , HashedDataRecord *record); - void readSparseBinary (const int feature_type , HashedDataRecord *record); - void readSparseContinuous (const int feature_type , HashedDataRecord *record); - void readBlob (const int feature_type , HashedDataRecord *record); - - HashedDataRecordReader() : - TensorRecordReader(nullptr), - m_keep_map(nullptr), - m_labels_map(nullptr), - m_weights_map(nullptr), - 
m_decode_mode(DecodeMode::hash_valname) - {} - - // Using a template instead of int64_t because tensorflow implements int64 based on compiler. - void setKeepMap(KeyMap_t *keep_map) { - m_keep_map = keep_map; - } - - void setLabelsMap(KeyMap_t *labels_map) { - m_labels_map = labels_map; - } - - void setWeightsMap(KeyMap_t *weights_map) { - m_weights_map = weights_map; - } - - void setDecodeMode(int64_t mode) { - m_decode_mode = static_cast(mode); - } -}; - -} -#endif diff --git a/twml/libtwml/include/twml/Hashmap.h b/twml/libtwml/include/twml/Hashmap.h deleted file mode 100644 index 59314236b..000000000 --- a/twml/libtwml/include/twml/Hashmap.h +++ /dev/null @@ -1,110 +0,0 @@ -#pragma once -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - typedef void * twml_hashmap; - typedef int64_t tw_hash_key_t; - typedef int64_t tw_hash_val_t; -#ifdef __cplusplus -} -#endif - -#ifdef __cplusplus -namespace twml { - - typedef tw_hash_key_t HashKey_t; - typedef tw_hash_val_t HashVal_t; - - class HashMap { - private: - twml_hashmap m_hashmap; - - public: - HashMap(); - ~HashMap(); - - // Disable copy constructor and assignment - // TODO: Fix this after retain and release are added to twml_hashmap - HashMap(const HashMap &other) = delete; - HashMap& operator=(const HashMap &other) = delete; - - void clear(); - uint64_t size() const; - int8_t insert(const HashKey_t key); - int8_t insert(const HashKey_t key, const HashVal_t val); - void remove(const HashKey_t key); - int8_t get(HashVal_t &val, const HashKey_t key) const; - - void insert(Tensor &mask, const Tensor keys); - void insert(Tensor &mask, const Tensor keys, const Tensor vals); - void remove(const Tensor keys); - void get(Tensor &mask, Tensor &vals, const Tensor keys) const; - - void getInplace(Tensor &mask, Tensor &keys_vals) const; - void toTensors(Tensor &keys, Tensor &vals) const; - }; -} -#endif - -#ifdef __cplusplus -extern "C" { -#endif - - - TWMLAPI twml_err twml_hashmap_create(twml_hashmap *hashmap); - - TWMLAPI twml_err twml_hashmap_clear(const twml_hashmap hashmap); - - TWMLAPI twml_err twml_hashmap_get_size(uint64_t *size, const twml_hashmap hashmap); - - TWMLAPI twml_err twml_hashmap_delete(const twml_hashmap hashmap); - - // insert, get, remove single key / value - TWMLAPI twml_err twml_hashmap_insert_key(int8_t *mask, - const twml_hashmap hashmap, - const tw_hash_key_t key); - - TWMLAPI twml_err twml_hashmap_insert_key_and_value(int8_t *mask, twml_hashmap hashmap, - const tw_hash_key_t key, - const tw_hash_val_t val); - - TWMLAPI twml_err twml_hashmap_remove_key(const twml_hashmap hashmap, - const tw_hash_key_t key); - - TWMLAPI twml_err twml_hashmap_get_value(int8_t *mask, tw_hash_val_t *val, - const twml_hashmap hashmap, - const tw_hash_key_t key); - - TWMLAPI twml_err twml_hashmap_insert_keys(twml_tensor masks, - const twml_hashmap hashmap, - const twml_tensor keys); - - // insert, get, remove tensors of keys / values - TWMLAPI twml_err twml_hashmap_insert_keys_and_values(twml_tensor masks, - twml_hashmap hashmap, - const twml_tensor keys, - const twml_tensor vals); - - TWMLAPI twml_err twml_hashmap_remove_keys(const twml_hashmap hashmap, - const twml_tensor keys); - - TWMLAPI twml_err twml_hashmap_get_values(twml_tensor masks, - twml_tensor vals, - const twml_hashmap hashmap, - const twml_tensor keys); - - TWMLAPI twml_err twml_hashmap_get_values_inplace(twml_tensor masks, - twml_tensor keys_vals, - const twml_hashmap hashmap); - - TWMLAPI twml_err twml_hashmap_to_tensors(twml_tensor keys, - 
twml_tensor vals, - const twml_hashmap hashmap); -#ifdef __cplusplus -} -#endif diff --git a/twml/libtwml/include/twml/RawTensor.h b/twml/libtwml/include/twml/RawTensor.h deleted file mode 100644 index 571966743..000000000 --- a/twml/libtwml/include/twml/RawTensor.h +++ /dev/null @@ -1,92 +0,0 @@ -#pragma once -#include -#include - -#ifdef __cplusplus -namespace twml { - -// This class contains the raw pointers to tensors coming from thrift object. -class TWMLAPI RawTensor : public Tensor -{ -private: - bool m_is_big_endian; - uint64_t m_raw_length; -public: - - RawTensor() {} - - RawTensor(void *data, const std::vector &dims, - const std::vector &strides, twml_type type, bool is_big_endian, uint64_t length) - : Tensor(data, dims, strides, type), m_is_big_endian(is_big_endian), m_raw_length(length) {} - - bool is_big_endian() const { - return m_is_big_endian; - } - - uint64_t getRawLength() const { - return m_raw_length; - } - - // Extracts a slice from a tensor at idx0 along dimension 0 - // Used in BatchPredictionResponse to write each slice in separate records - RawTensor getSlice(uint64_t idx0) const { - void *slice = nullptr; - uint64_t raw_length = 0; - - if (getType() == TWML_TYPE_STRING) { - raw_length = getStride(0); - std::string *data = const_cast(static_cast(getData())); - slice = static_cast(data + raw_length * idx0); - } else { - raw_length = getStride(0) * getSizeOf(getType()); - char *data = const_cast(static_cast(getData())); - slice = static_cast(data + raw_length * idx0); - } - - std::vector dims, strides; - for (int i = 1; i < getNumDims(); i++) { - dims.push_back(getDim(i)); - strides.push_back(getStride(i)); - } - - return RawTensor(slice, dims, strides, getType(), m_is_big_endian, raw_length); - } -}; - -// Wrapper class around RawTensor to hold sparse tensors. 
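// COO layout: an indices tensor (required to be int64 by the constructor below),
// a parallel values tensor, and the dense shape; TensorRecordReader materializes
// these via readCOOSparseTensor.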
-class TWMLAPI RawSparseTensor -{ -private: - RawTensor m_indices; - RawTensor m_values; - std::vector m_dense_shape; - -public: - - RawSparseTensor() { - } - - RawSparseTensor(const RawTensor &indices_, const RawTensor &values_, - const std::vector &dense_shape_) : - m_indices(indices_), m_values(values_), m_dense_shape(dense_shape_) - { - if (m_indices.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "Indices of Sparse Tensor must be of type int64"); - } - } - - const RawTensor &indices() const { - return m_indices; - } - - const RawTensor &values() const { - return m_values; - } - - const std::vector& denseShape() const { - return m_dense_shape; - } -}; - -} -#endif diff --git a/twml/libtwml/include/twml/Tensor.h b/twml/libtwml/include/twml/Tensor.h deleted file mode 100644 index 774474403..000000000 --- a/twml/libtwml/include/twml/Tensor.h +++ /dev/null @@ -1,82 +0,0 @@ -#pragma once -#include - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - - struct twml_tensor__; - typedef twml_tensor__ * twml_tensor; - -#ifdef __cplusplus -} -#endif - -#ifdef __cplusplus -namespace twml { - -class TWMLAPI Tensor -{ -private: - twml_type m_type; - void *m_data; - std::vector m_dims; - std::vector m_strides; - -public: - Tensor() {} - Tensor(void *data, int ndims, const uint64_t *dims, const uint64_t *strides, twml_type type); - Tensor(void *data, const std::vector &dims, const std::vector &strides, twml_type type); - - const std::vector& getDims() const { - return m_dims; - } - - int getNumDims() const; - uint64_t getDim(int dim) const; - uint64_t getStride(int dim) const; - uint64_t getNumElements() const; - twml_type getType() const; - - twml_tensor getHandle(); - const twml_tensor getHandle() const; - - template T *getData(); - template const T *getData() const; -}; - -TWMLAPI std::string getTypeName(twml_type type); -TWMLAPI const Tensor *getConstTensor(const twml_tensor t); -TWMLAPI Tensor *getTensor(twml_tensor t); -TWMLAPI uint64_t getSizeOf(twml_type type); - -} -#endif - -#ifdef __cplusplus -extern "C" { -#endif - TWMLAPI twml_err twml_tensor_create(twml_tensor *tensor, void *data, - int ndims, uint64_t *dims, - uint64_t *strides, twml_type type); - - TWMLAPI twml_err twml_tensor_delete(const twml_tensor tensor); - - TWMLAPI twml_err twml_tensor_get_type(twml_type *type, const twml_tensor tensor); - - TWMLAPI twml_err twml_tensor_get_data(void **data, const twml_tensor tensor); - - TWMLAPI twml_err twml_tensor_get_dim(uint64_t *dim, const twml_tensor tensor, int id); - - TWMLAPI twml_err twml_tensor_get_num_dims(int *ndims, const twml_tensor tensor); - - TWMLAPI twml_err twml_tensor_get_num_elements(uint64_t *nelements, const twml_tensor tensor); - - TWMLAPI twml_err twml_tensor_get_stride(uint64_t *stride, const twml_tensor tensor, int id); -#ifdef __cplusplus -} -#endif diff --git a/twml/libtwml/include/twml/TensorRecord.h b/twml/libtwml/include/twml/TensorRecord.h deleted file mode 100644 index d128cfdce..000000000 --- a/twml/libtwml/include/twml/TensorRecord.h +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once -#ifdef __cplusplus - -#include -#include - -#include -#include - -namespace twml { - -class TensorRecordReader; - -// A class containing the data from TensorRecord. -// - This serves as the base class from which DataRecord and HashedDataRecord are inherited. 
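A small, hedged sketch of how the Tensor view type defined above wraps existing memory without copying, before moving on to the TensorRecord container below. The include paths, and the assumption that strides are element counts in row-major order (as RawTensor::getSlice suggests), are illustrative rather than confirmed:

    #include <twml/Tensor.h>
    #include <twml/defines.h>
    #include <cstdint>
    #include <vector>

    int main() {
      // View a 2x3 float buffer as a twml::Tensor; no data is copied.
      float data[6] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
      std::vector<uint64_t> dims = {2, 3};
      std::vector<uint64_t> strides = {3, 1};  // row-major, counted in elements (assumed)

      twml::Tensor view(data, dims, strides, TWML_TYPE_FLOAT);

      const float *ptr = view.getData<float>();  // points back into `data`
      uint64_t count = view.getNumElements();    // 6
      return (ptr == data && count == 6) ? 0 : 1;
    }

TensorRecord, declared next, holds RawTensor views like this one (with added endianness and raw-length metadata) keyed by an int64 id.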
-class TWMLAPI TensorRecord { -public: - typedef std::unordered_map RawTensors; - typedef std::unordered_map RawSparseTensors; - -private: - RawTensors m_tensors; - RawSparseTensors m_sparse_tensors; - -public: - - const RawTensors &getRawTensors() { - return m_tensors; - } - - const RawTensor& getRawTensor(int64_t id) const { - return m_tensors.at(id); - } - - const RawSparseTensor& getRawSparseTensor(int64_t id) const { - return m_sparse_tensors.at(id); - } - - void addRawTensor(int64_t id, const RawTensor &tensor) { - m_tensors.emplace(id, tensor); - } - - friend class TensorRecordReader; -}; - -} -#endif diff --git a/twml/libtwml/include/twml/TensorRecordReader.h b/twml/libtwml/include/twml/TensorRecordReader.h deleted file mode 100644 index 3a62bd885..000000000 --- a/twml/libtwml/include/twml/TensorRecordReader.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once -#ifdef __cplusplus - -#include -#include -#include - -#include - -#include -#include -#include - -namespace twml { - -// Class that parses the thrift objects as defined in tensor.thrift -class TWMLAPI TensorRecordReader : public ThriftReader { - - std::vector readShape(); - template RawTensor readTypedTensor(); - RawTensor readRawTypedTensor(); - RawTensor readStringTensor(); - RawTensor readGeneralTensor(); - RawSparseTensor readCOOSparseTensor(); - -public: - void readTensor(const int feature_type, TensorRecord *record); - void readSparseTensor(const int feature_type, TensorRecord *record); - - TensorRecordReader(const uint8_t *buffer) : ThriftReader(buffer) {} -}; - -} -#endif diff --git a/twml/libtwml/include/twml/TensorRecordWriter.h b/twml/libtwml/include/twml/TensorRecordWriter.h deleted file mode 100644 index d8b7c3dbf..000000000 --- a/twml/libtwml/include/twml/TensorRecordWriter.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once -#ifdef __cplusplus - -#include -#include - -namespace twml { - -// Encodes tensors as DataRecord/TensorRecord-compatible Thrift. -// DataRecordWriter relies on this class to encode the tensor fields. 
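A hedged sketch of the caller pattern for the writer declared just below: the caller emits the struct field header for the general-tensor field, lets TensorRecordWriter serialize the record's raw tensors, and then terminates the struct. TTYPE_MAP and DR_GENERAL_TENSOR are constants from libtwml's internal thrift header, and the buffer is assumed to have been sized beforehand (for example with a dry-run ThriftWriter):

    #include <twml/ThriftWriter.h>
    #include <twml/TensorRecordWriter.h>
    #include <twml/TensorRecord.h>
    #include "internal/thrift.h"  // TTYPE_MAP, DR_GENERAL_TENSOR (internal header)
    #include <cstddef>
    #include <cstdint>

    // Write only the general-tensor field of a DataRecord-style struct.
    void writeTensorField(uint8_t *buffer, size_t buffer_size, twml::TensorRecord &record) {
      twml::ThriftWriter thrift_writer(buffer, buffer_size);
      twml::TensorRecordWriter tensor_writer(thrift_writer);

      // The caller must write the struct field header first ...
      thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_GENERAL_TENSOR);
      // ... then the writer emits every raw tensor held by the record.
      tensor_writer.write(record);

      // Close the enclosing struct.
      thrift_writer.writeStructStop();
    }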
-class TWMLAPI TensorRecordWriter { - -private: - uint32_t m_records_written; - twml::ThriftWriter &m_thrift_writer; - - void writeTensor(const RawTensor &tensor); - void writeRawTensor(const RawTensor &tensor); - -public: - TensorRecordWriter(twml::ThriftWriter &thrift_writer): - m_records_written(0), - m_thrift_writer(thrift_writer) { } - - uint32_t getRecordsWritten(); - - // Caller (usually DataRecordWriter) must precede with struct header field - // like thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_GENERAL_TENSOR) - // - // All tensors written as RawTensors except for StringTensors - uint64_t write(twml::TensorRecord &record); -}; - -} -#endif diff --git a/twml/libtwml/include/twml/ThriftReader.h b/twml/libtwml/include/twml/ThriftReader.h deleted file mode 100644 index 25c83ea29..000000000 --- a/twml/libtwml/include/twml/ThriftReader.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#ifdef __cplusplus - -#include -#include -#include -#include - -namespace twml { - -class ThriftReader { - protected: - const uint8_t *m_buffer; - - public: - - ThriftReader(const uint8_t *buffer): m_buffer(buffer) {} - - const uint8_t *getBuffer() { return m_buffer; } - - void setBuffer(const uint8_t *buffer) { m_buffer = buffer; } - - template T readDirect() { - T val; - memcpy(&val, m_buffer, sizeof(T)); - m_buffer += sizeof(T); - return val; - } - - template void skip() { - m_buffer += sizeof(T); - } - - void skipLength(size_t length) { - m_buffer += length; - } - - uint8_t readByte(); - int16_t readInt16(); - int32_t readInt32(); - int64_t readInt64(); - double readDouble(); - - template inline - int32_t getRawBuffer(const uint8_t **begin) { - int32_t length = readInt32(); - *begin = m_buffer; - skipLength(length * sizeof(T)); - return length; - } - -}; - -} -#endif diff --git a/twml/libtwml/include/twml/ThriftWriter.h b/twml/libtwml/include/twml/ThriftWriter.h deleted file mode 100644 index 1216415b0..000000000 --- a/twml/libtwml/include/twml/ThriftWriter.h +++ /dev/null @@ -1,59 +0,0 @@ -#pragma once - -#ifdef __cplusplus - -#include -#include -#include -#include - -namespace twml { - -// A low-level binary Thrift writer that can also compute output size -// in dry run mode without copying memory. See also https://git.io/vNPiv -// -// WARNING: Users of this class are responsible for generating valid Thrift -// by following the Thrift binary protocol (https://git.io/vNPiv). -class TWMLAPI ThriftWriter { - protected: - bool m_dry_run; - uint8_t *m_buffer; - size_t m_buffer_size; - size_t m_bytes_written; - - template inline uint64_t write(T val); - - public: - // buffer: Memory to write the binary Thrift to. - // buffer_size: Length of the buffer. - // dry_run: If true, just count bytes 'written' but do not copy memory. - // If false, write binary Thrift to the buffer normally. - // Useful to determine output size for TensorFlow allocations. 
- ThriftWriter(uint8_t *buffer, size_t buffer_size, bool dry_run = false) : - m_dry_run(dry_run), - m_buffer(buffer), - m_buffer_size(buffer_size), - m_bytes_written(0) {} - - // total bytes written to the buffer since object creation - uint64_t getBytesWritten(); - - // encode headers and values into the buffer - uint64_t writeStructFieldHeader(int8_t field_type, int16_t field_id); - uint64_t writeStructStop(); - uint64_t writeListHeader(int8_t element_type, int32_t num_elems); - uint64_t writeMapHeader(int8_t key_type, int8_t val_type, int32_t num_elems); - uint64_t writeDouble(double val); - uint64_t writeInt8(int8_t val); - uint64_t writeInt16(int16_t val); - uint64_t writeInt32(int32_t val); - uint64_t writeInt64(int64_t val); - uint64_t writeBinary(const uint8_t *bytes, int32_t num_bytes); - // clients expect UTF-8-encoded strings per the Thrift protocol - // (often this is just used to send bytes, not real strings though) - uint64_t writeString(std::string str); - uint64_t writeBool(bool val); -}; - -} -#endif diff --git a/twml/libtwml/include/twml/Type.h b/twml/libtwml/include/twml/Type.h deleted file mode 100644 index 8b460c812..000000000 --- a/twml/libtwml/include/twml/Type.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once -#include -#include -#include - -#ifdef __cplusplus -namespace twml { - - template struct Type; - - template<> struct Type - { - enum { - type = TWML_TYPE_FLOAT, - }; - }; - - template<> struct Type - { - enum { - type = TWML_TYPE_STRING, - }; - }; - - template<> struct Type - { - enum { - type = TWML_TYPE_DOUBLE, - }; - }; - - template<> struct Type - { - enum { - type = TWML_TYPE_INT64, - }; - }; - - template<> struct Type - { - enum { - type = TWML_TYPE_INT32, - }; - }; - - template<> struct Type - { - enum { - type = TWML_TYPE_INT8, - }; - }; - - template<> struct Type - { - enum { - type = TWML_TYPE_UINT8, - }; - }; - - - template<> struct Type - { - enum { - type = TWML_TYPE_BOOL, - }; - }; - -} -#endif diff --git a/twml/libtwml/include/twml/common.h b/twml/libtwml/include/twml/common.h deleted file mode 100644 index c3a2e9aee..000000000 --- a/twml/libtwml/include/twml/common.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef TWML_LIBTWML_INCLUDE_TWML_COMMON_H_ -#define TWML_LIBTWML_INCLUDE_TWML_COMMON_H_ - -#define USE_ABSEIL_HASH 1 - -#if defined(USE_ABSEIL_HASH) -#include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" -#elif defined(USE_DENSE_HASH) -#include -#include -#else -#include -#include -#endif // USE_ABSEIL_HASH - - -namespace twml { -#if defined(USE_ABSEIL_HASH) - template - using Map = absl::flat_hash_map; - - template - using Set = absl::flat_hash_set; -#elif defined(USE_DENSE_HASH) -// Do not use this unless an proper empty key can be found. 
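// google::dense_hash_map / dense_hash_set require set_empty_key() to be called
// with a sentinel key that will never be inserted before any other use.
// DataRecord's constructor calls set_empty_key(0) when USE_DENSE_HASH is
// defined, so feature id 0 must never occur as a real key in that case.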
- template - using Map = google::dense_hash_map; - - template - using Set = google::dense_hash_set; -#else - template - using Map = std::unordered_map; - - template - using Set = std::unordered_set; -#endif // USE_DENSE_HASH - -} // namespace twml - -#endif // TWML_LIBTWML_INCLUDE_TWML_COMMON_H_ \ No newline at end of file diff --git a/twml/libtwml/include/twml/defines.h b/twml/libtwml/include/twml/defines.h deleted file mode 100644 index e7f7d138d..000000000 --- a/twml/libtwml/include/twml/defines.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once -#include -#ifdef __cplusplus -extern "C" { -#endif - typedef enum { - TWML_TYPE_FLOAT32 = 1, - TWML_TYPE_FLOAT64 = 2, - TWML_TYPE_INT32 = 3, - TWML_TYPE_INT64 = 4, - TWML_TYPE_INT8 = 5, - TWML_TYPE_UINT8 = 6, - TWML_TYPE_BOOL = 7, - TWML_TYPE_STRING = 8, - TWML_TYPE_FLOAT = TWML_TYPE_FLOAT32, - TWML_TYPE_DOUBLE = TWML_TYPE_FLOAT64, - TWML_TYPE_UNKNOWN = -1, - } twml_type; - - typedef enum { - TWML_ERR_NONE = 1000, - TWML_ERR_SIZE = 1001, - TWML_ERR_TYPE = 1002, - TWML_ERR_THRIFT = 1100, - TWML_ERR_IO = 1200, - TWML_ERR_UNKNOWN = 1999, - } twml_err; -#ifdef __cplusplus -} -#endif - -#define TWMLAPI __attribute__((visibility("default"))) - -#ifndef TWML_INDEX_BASE -#define TWML_INDEX_BASE 0 -#endif diff --git a/twml/libtwml/include/twml/discretizer_impl.h b/twml/libtwml/include/twml/discretizer_impl.h deleted file mode 100644 index 587bde458..000000000 --- a/twml/libtwml/include/twml/discretizer_impl.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once -#include -#include -#include - -#ifdef __cplusplus -namespace twml { - TWMLAPI void discretizerInfer( - Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_ids, - const Tensor &input_vals, - const Tensor &bin_ids, - const Tensor &bin_vals, - const Tensor &feature_offsets, - int output_bits, - const Map &ID_to_index, - int start_compute, - int end_compute, - int output_start); -} // namespace twml -#endif diff --git a/twml/libtwml/include/twml/functions.h b/twml/libtwml/include/twml/functions.h deleted file mode 100644 index c23680cac..000000000 --- a/twml/libtwml/include/twml/functions.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include -#include - -#ifdef __cplusplus -namespace twml { - - // Adding these as an easy way to test the wrappers - TWMLAPI void add1(Tensor &output, const Tensor input); - TWMLAPI void copy(Tensor &output, const Tensor input); - TWMLAPI int64_t featureId(const std::string &feature); -} -#endif - -#ifdef __cplusplus -extern "C" { -#endif - - // Adding these as an easy way to test the wrappers - TWMLAPI twml_err twml_add1(twml_tensor output, const twml_tensor input); - TWMLAPI twml_err twml_copy(twml_tensor output, const twml_tensor input); - TWMLAPI twml_err twml_get_feature_id(int64_t *result, const uint64_t len, const char *str); - -#ifdef __cplusplus -} -#endif diff --git a/twml/libtwml/include/twml/hashing_discretizer_impl.h b/twml/libtwml/include/twml/hashing_discretizer_impl.h deleted file mode 100644 index a04efb7e0..000000000 --- a/twml/libtwml/include/twml/hashing_discretizer_impl.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once -#include -#include -#include -#include - -#ifdef __cplusplus -namespace twml { - TWMLAPI void hashDiscretizerInfer( - Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_ids, - const Tensor &input_vals, - int n_bin, - const Tensor &bin_vals, - int output_bits, - const Map &ID_to_index, - int start_compute, - int end_compute, - int64_t options); -} // namespace twml -#endif diff --git 
a/twml/libtwml/include/twml/io/IOError.h b/twml/libtwml/include/twml/io/IOError.h deleted file mode 100644 index 867ab44df..000000000 --- a/twml/libtwml/include/twml/io/IOError.h +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include - -namespace twml { -namespace io { - -class IOError : public twml::Error { - public: - enum Status { - OUT_OF_RANGE = 1, - WRONG_MAGIC = 2, - WRONG_HEADER = 3, - ERROR_HEADER_CHECKSUM = 4, - INVALID_METHOD = 5, - USING_RESERVED = 6, - ERROR_HEADER_EXTRA_FIELD_CHECKSUM = 7, - CANT_FIT_OUTPUT = 8, - SPLIT_FILE = 9, - BLOCK_SIZE_TOO_LARGE = 10, - SOURCE_LARGER_THAN_DESTINATION = 11, - DESTINATION_LARGER_THAN_CAPACITY = 12, - HEADER_FLAG_MISMATCH = 13, - NOT_ENOUGH_INPUT = 14, - ERROR_SOURCE_BLOCK_CHECKSUM = 15, - COMPRESSED_DATA_VIOLATION = 16, - ERROR_DESTINATION_BLOCK_CHECKSUM = 17, - EMPTY_RECORD = 18, - MALFORMED_MEMORY_RECORD = 19, - UNSUPPORTED_OUTPUT_TYPE = 20, - OTHER_ERROR - }; - - IOError(Status status); - - Status status() const { - return m_status; - } - - private: - Status m_status; -}; - -} -} diff --git a/twml/libtwml/include/twml/optim.h b/twml/libtwml/include/twml/optim.h deleted file mode 100644 index d0a2df4ef..000000000 --- a/twml/libtwml/include/twml/optim.h +++ /dev/null @@ -1,51 +0,0 @@ -#pragma once -#include -#include - -#ifdef __cplusplus -namespace twml { - TWMLAPI void linearInterpolation( - Tensor output, - const Tensor input, - const Tensor xs, - const Tensor ys); - - TWMLAPI void nearestInterpolation( - Tensor output, - const Tensor input, - const Tensor xs, - const Tensor ys); - - TWMLAPI void mdlInfer( - Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_keys, - const Tensor &input_vals, - const Tensor &bin_ids, - const Tensor &bin_vals, - const Tensor &feature_offsets, - bool return_bin_indices = false); -} -#endif - -#ifdef __cplusplus -extern "C" { -#endif - TWMLAPI twml_err twml_optim_nearest_interpolation( - twml_tensor output, - const twml_tensor input, - const twml_tensor xs, - const twml_tensor ys); - - TWMLAPI twml_err twml_optim_mdl_infer( - twml_tensor output_keys, - twml_tensor output_vals, - const twml_tensor input_keys, - const twml_tensor input_vals, - const twml_tensor bin_ids, - const twml_tensor bin_vals, - const twml_tensor feature_offsets, - const bool return_bin_indices = false); -#ifdef __cplusplus -} -#endif diff --git a/twml/libtwml/include/twml/utilities.h b/twml/libtwml/include/twml/utilities.h deleted file mode 100644 index a30b44aff..000000000 --- a/twml/libtwml/include/twml/utilities.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once -#ifdef __cplusplus -namespace twml { - -inline int64_t mixDiscreteIdAndValue(int64_t key, int64_t value) { - key ^= ((17LL + value) * 2654435761LL); - return key; -} - -inline int64_t mixStringIdAndValue(int64_t key, int32_t str_len, const uint8_t *str) { - int32_t hash = 0; - for (int32_t i = 0; i < str_len; i++) { - hash = (31 * hash) + (int32_t)str[i]; - } - return key ^ hash; -} -} -#endif \ No newline at end of file diff --git a/twml/libtwml/setup.cfg b/twml/libtwml/setup.cfg deleted file mode 100644 index d5253c179..000000000 --- a/twml/libtwml/setup.cfg +++ /dev/null @@ -1,9 +0,0 @@ -[bdist_wheel] -universal=1 - -[build] -build-lib=build_dir -build-temp=build_dir - -[bdist] -bdist-base=build_dir diff --git a/twml/libtwml/setup.py b/twml/libtwml/setup.py deleted file mode 100644 index 2dcfa105d..000000000 --- a/twml/libtwml/setup.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -libtwml setup.py module -""" -from setuptools import setup, find_packages - -setup( 
- name='libtwml', - version='2.0', - description="Tensorflow C++ ops for twml", - packages=find_packages(), - data_files=[('', ['libtwml_tf.so'])], -) diff --git a/twml/libtwml/src/lib/BatchPredictionRequest.cpp b/twml/libtwml/src/lib/BatchPredictionRequest.cpp deleted file mode 100644 index cca8d6545..000000000 --- a/twml/libtwml/src/lib/BatchPredictionRequest.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "internal/thrift.h" -#include "internal/error.h" - -#include -#include -#include -#include - -#include -#include -#include - -namespace twml { - -template -void GenericBatchPredictionRequest::decode(Reader &reader) { - uint8_t feature_type = reader.readByte(); - while (feature_type != TTYPE_STOP) { - int16_t field_id = reader.readInt16(); - - switch (field_id) { - case 1: { - CHECK_THRIFT_TYPE(feature_type, TTYPE_LIST, "list"); - CHECK_THRIFT_TYPE(reader.readByte(), TTYPE_STRUCT, "list_element"); - - int32_t length = reader.readInt32(); - m_requests.resize(length, RecordType(this->num_labels, this->num_weights)); - for (auto &request : m_requests) { - request.decode(reader); - } - - break; - } - case 2: { - CHECK_THRIFT_TYPE(feature_type, TTYPE_STRUCT, "commonFeatures"); - m_common_features.decode(reader); - break; - } - default: throw ThriftInvalidField(field_id, __func__); - } - - feature_type = reader.readByte(); - } - return; -} - - -// Instantiate decoders. -template void GenericBatchPredictionRequest::decode(HashedDataRecordReader &reader); -template void GenericBatchPredictionRequest::decode(DataRecordReader &reader); - -} // namespace twml diff --git a/twml/libtwml/src/lib/BatchPredictionResponse.cpp b/twml/libtwml/src/lib/BatchPredictionResponse.cpp deleted file mode 100644 index 2a17d3605..000000000 --- a/twml/libtwml/src/lib/BatchPredictionResponse.cpp +++ /dev/null @@ -1,125 +0,0 @@ -#include "internal/endianutils.h" -#include "internal/error.h" -#include "internal/thrift.h" - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -// When the number of predictions is very high, as some cases that Ads wants, the generic thrift -// encoder becomes super expensive because we have to deal with lua tables. -// This function is a special operation to efficiently write a batch prediction responses based on -// tensors. 
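A minimal usage sketch of the two-pass pattern described in the header comment above: dry-run to get the encoded size, allocate, then write. The feature ids and buffers are illustrative, the umbrella twml.h include is assumed, and a plain byte vector stands in for the TensorFlow op's output allocation:

    #include <twml.h>
    #include <cstdint>
    #include <vector>

    int main() {
      // Two prediction keys per record, batch of three records.
      int64_t key_data[2] = {1001, 1002};
      float value_data[3][2] = {{0.1f, 0.9f}, {0.4f, 0.6f}, {0.8f, 0.2f}};

      std::vector<uint64_t> key_dims = {2}, key_strides = {1};
      std::vector<uint64_t> val_dims = {3, 2}, val_strides = {2, 1};
      twml::Tensor keys(key_data, key_dims, key_strides, TWML_TYPE_INT64);
      twml::Tensor values(&value_data[0][0], val_dims, val_strides, TWML_TYPE_FLOAT);

      twml::Tensor dense_keys;                    // no dense tensor predictions
      std::vector<twml::RawTensor> dense_values;

      twml::BatchPredictionResponse response(keys, values, dense_keys, dense_values);

      // Pass 1: dry-run encode to learn the Thrift output size in bytes.
      uint64_t num_bytes = response.encodedSize();

      // Pass 2: allocate that many bytes and write the real binary Thrift.
      std::vector<uint8_t> out(num_bytes);
      std::vector<uint64_t> out_dims = {num_bytes}, out_strides = {1};
      twml::Tensor result(out.data(), out_dims, out_strides, TWML_TYPE_UINT8);
      response.write(result);
      return 0;
    }

The encodedSize() implementation below performs exactly the first pass: it runs the encoder against a ThriftWriter constructed in dry-run mode, which counts bytes without copying memory.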
-namespace twml { - -BatchPredictionResponse::BatchPredictionResponse( - const Tensor &keys, const Tensor &values, - const Tensor &dense_keys, const std::vector &dense_values -) : keys_(keys), values_(values), dense_keys_(dense_keys), dense_values_(dense_values) { - // determine batch size - if (values_.getNumDims() > 0) { - batch_size_ = values_.getDim(0); - } else if (dense_keys_.getNumElements() < 1) { - throw twml::Error(TWML_ERR_TYPE, "Continuous values and dense tensors are both empty"); - } else if (dense_keys_.getNumElements() != dense_values_.size()) { - throw twml::Error(TWML_ERR_TYPE, "Number of tensors not equal to number of keys"); - } else { - // dim 0 for each tensor indexes batch elements - std::vector batch_sizes; - batch_sizes.reserve(dense_values_.size()); - - for (int i = 0; i < dense_values_.size(); i++) - batch_sizes.push_back(dense_values_.at(i).getDim(0)); - - if (std::adjacent_find( - batch_sizes.begin(), - batch_sizes.end(), - std::not_equal_to()) != batch_sizes.end()) - throw twml::Error(TWML_ERR_TYPE, "Batch size (dim 0) for all tensors must be the same"); - - batch_size_ = dense_values.at(0).getDim(0); - } -} - -void BatchPredictionResponse::encode(twml::ThriftWriter &thrift_writer) { - if (hasContinuous()) { - switch (values_.getType()) { - case TWML_TYPE_FLOAT: - serializePredictions(thrift_writer); - break; - case TWML_TYPE_DOUBLE: - serializePredictions(thrift_writer); - break; - default: - throw twml::Error(TWML_ERR_TYPE, "Predictions must be float or double."); - } - } else { - // dense tensor predictions - serializePredictions(thrift_writer); - } -} - -template -void BatchPredictionResponse::serializePredictions(twml::ThriftWriter &thrift_writer) { - twml::DataRecordWriter record_writer = twml::DataRecordWriter(thrift_writer); - - // start BatchPredictionResponse - thrift_writer.writeStructFieldHeader(TTYPE_LIST, BPR_PREDICTIONS); - thrift_writer.writeListHeader(TTYPE_STRUCT, getBatchSize()); - - for (int i = 0; i < getBatchSize(); i++) { - twml::DataRecord record = twml::DataRecord(); - - if (hasContinuous()) { - const T *values = values_.getData(); - const int64_t *local_keys = keys_.getData(); - const T *local_values = values + (i * getPredictionSize()); - record.addContinuous(local_keys, getPredictionSize(), local_values); - } - - if (hasDenseTensors()) { - const int64_t *local_dense_keys = dense_keys_.getData(); - - for (int j = 0; j < dense_keys_.getNumElements(); j++) { - const RawTensor &dense_value = dense_values_.at(j).getSlice(i); - record.addRawTensor(local_dense_keys[j], dense_value); - } - } - - record_writer.write(record); - } - - // end BatchPredictionResponse - thrift_writer.writeStructStop(); -} - -// calculate expected binary Thrift size (no memory is copied) -uint64_t BatchPredictionResponse::encodedSize() { - bool dry_mode = true; - twml::ThriftWriter dry_writer = twml::ThriftWriter(nullptr, 0, dry_mode); - encode(dry_writer); - return dry_writer.getBytesWritten(); -} - -void BatchPredictionResponse::write(Tensor &result) { - size_t result_size = result.getNumElements(); - uint8_t *result_data = result.getData(); - - if (result_size != this->encodedSize()) { - throw twml::Error(TWML_ERR_SIZE, "Sizes do not match"); - } - - twml::ThriftWriter writer = twml::ThriftWriter(result_data, result_size); - encode(writer); -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/BlockFormatReader.cpp b/twml/libtwml/src/lib/BlockFormatReader.cpp deleted file mode 100644 index 98f49ac4f..000000000 --- 
a/twml/libtwml/src/lib/BlockFormatReader.cpp +++ /dev/null @@ -1,145 +0,0 @@ -#include -#include -#include - -#define OFFSET_CHUNK (32768) -#define RECORDS_PER_BLOCK (100) - -#define WIRE_TYPE_VARINT (0) -#define WIRE_TYPE_64BIT (1) -#define WIRE_TYPE_LENGTH_PREFIXED (2) - -/* - This was all extracted from the ancient elephant bird scrolls - https://github.com/twitter/elephant-bird/blob/master/core/src/main/java/com/twitter/elephantbird/mapreduce/io/BinaryBlockReader.java -*/ - -#define MARKER_SIZE (16) -static uint8_t _marker[MARKER_SIZE] = { - 0x29, 0xd8, 0xd5, 0x06, 0x58, 0xcd, 0x4c, 0x29, - 0xb2, 0xbc, 0x57, 0x99, 0x21, 0x71, 0xbd, 0xff -}; - - -namespace twml { -BlockFormatReader::BlockFormatReader(): - record_size_(0), block_pos_(0), block_end_(0) { - memset(classname_, 0, sizeof(classname_)); -} - - -bool BlockFormatReader::next() { - record_size_ = read_one_record_size(); - if (record_size_ < 0) { - record_size_ = 0; - return false; - } - return true; -} - -int BlockFormatReader::read_int() { - uint8_t buff[4]; - if (read_bytes(buff, 1, 4) != 4) - return -1; - return static_cast(buff[0]) - | (static_cast(buff[1] << 8)) - | (static_cast(buff[2] << 16)) - | (static_cast(buff[3] << 24)); -} - -int BlockFormatReader::consume_marker(int scan) { - uint8_t buff[MARKER_SIZE]; - if (read_bytes(buff, 1, MARKER_SIZE) != MARKER_SIZE) - return 0; - - while (memcmp(buff, _marker, MARKER_SIZE) != 0) { - if (!scan) return 0; - memmove(buff, buff + 1, MARKER_SIZE - 1); - if (read_bytes(buff + MARKER_SIZE - 1, 1, 1) != 1) - return 0; - } - return 1; -} - -int BlockFormatReader::unpack_varint_i32() { - int value = 0; - for (int i = 0; i < 10; i++) { - uint8_t x; - if (read_bytes(&x, 1, 1) != 1) - return -1; - block_pos_++; - value |= (static_cast(x & 0x7F)) << (i * 7); - if ((x & 0x80) == 0) break; - } - return value; -} - - -int BlockFormatReader::unpack_tag_and_wiretype(uint32_t *tag, uint32_t *wiretype) { - uint8_t x; - if (read_bytes(&x, 1, 1) != 1) - return -1; - - block_pos_++; - *tag = (x & 0x7f) >> 3; - *wiretype = x & 7; - if ((x & 0x80) == 0) - return 0; - - return -1; -} - -int BlockFormatReader::unpack_string(char *out, uint64_t max_out_len) { - int len = unpack_varint_i32(); - if (len < 0) return -1; - uint64_t slen = len; - if (slen + 1 > max_out_len) return -1; - uint64_t n = read_bytes(out, 1, slen); - if (n != slen) return -1; - block_pos_ += n; - out[n] = 0; - return 0; -} - -int BlockFormatReader::read_one_record_size() { - for (int i = 0; i < 2; i++) { - if (block_end_ == 0) { - while (consume_marker(1)) { - int block_size = read_int(); - if (block_size > 0) { - block_pos_ = 0; - block_end_ = block_size; - uint32_t tag, wiretype; - if (unpack_tag_and_wiretype(&tag, &wiretype)) - throw std::invalid_argument("unsupported tag and wiretype"); - if (tag != 1 && wiretype != WIRE_TYPE_VARINT) - throw std::invalid_argument("unexpected tag and wiretype"); - int version = unpack_varint_i32(); - if (version != 1) - throw std::invalid_argument("unsupported version"); - if (unpack_tag_and_wiretype(&tag, &wiretype)) - throw std::invalid_argument("unsupported tag and wiretype"); - if (tag != 2 && wiretype != WIRE_TYPE_LENGTH_PREFIXED) - throw std::invalid_argument("unexpected tag and wiretype"); - if (unpack_string(classname_, sizeof(classname_)-1)) - throw std::invalid_argument("unsupported class name"); - break; - } - } - } - if (block_pos_ < block_end_) { - uint32_t tag, wiretype; - if (unpack_tag_and_wiretype(&tag, &wiretype)) - throw std::invalid_argument("unsupported tag and wiretype"); - 
if (tag != 3 && wiretype != WIRE_TYPE_LENGTH_PREFIXED) - throw std::invalid_argument("unexpected tag and wiretype"); - int record_size = unpack_varint_i32(); - block_pos_ += record_size; - return record_size; - } else { - block_end_ = 0; - } - } - return -1; -} -} // namespace twml diff --git a/twml/libtwml/src/lib/BlockFormatWriter.cpp b/twml/libtwml/src/lib/BlockFormatWriter.cpp deleted file mode 100644 index d66e17351..000000000 --- a/twml/libtwml/src/lib/BlockFormatWriter.cpp +++ /dev/null @@ -1,163 +0,0 @@ -#include "internal/error.h" -#include -#include -#include - -#define WIRE_TYPE_LENGTH_PREFIXED (2) -#define WIRE_TYPE_VARINT (0) - -#ifndef PATH_MAX -#define PATH_MAX (8096) -#endif - -#define MARKER_SIZE (16) -static uint8_t _marker[MARKER_SIZE] = { - 0x29, 0xd8, 0xd5, 0x06, 0x58, 0xcd, 0x4c, 0x29, - 0xb2, 0xbc, 0x57, 0x99, 0x21, 0x71, 0xbd, 0xff -}; -namespace twml { - - BlockFormatWriter::BlockFormatWriter(const char *file_name, int record_per_block) : - file_name_(file_name), record_index_(0), records_per_block_(record_per_block) { - snprintf(temp_file_name_, PATH_MAX, "%s.block", file_name); - outputfile_ = fopen(file_name_, "a"); - } - - BlockFormatWriter::~BlockFormatWriter() { - fclose(outputfile_); - } - // TODO: use fstream - int BlockFormatWriter::pack_tag_and_wiretype(FILE *buffer, uint32_t tag, uint32_t wiretype) { - uint8_t x = ((tag & 0x0f) << 3) | (wiretype & 0x7); - size_t n = fwrite(&x, 1, 1, buffer); - if (n != 1) { - return -1; - } - return 0; - } - - int BlockFormatWriter::pack_varint_i32(FILE *buffer, int value) { - for (int i = 0; i < 10; i++) { - uint8_t x = value & 0x7F; - value = value >> 7; - if (value != 0) x |= 0x80; - size_t n = fwrite(&x, 1, 1, buffer); - if (n != 1) { - return -1; - } - if (value == 0) break; - } - return 0; - } - - int BlockFormatWriter::pack_string(FILE *buffer, const char *in, size_t in_len) { - if (pack_varint_i32(buffer, in_len)) return -1; - size_t n = fwrite(in, 1, in_len, buffer); - if (n != in_len) return -1; - return 0; - } - - int BlockFormatWriter::write_int(FILE *buffer, int value) { - uint8_t buff[4]; - buff[0] = value & 0xff; - buff[1] = (value >> 8) & 0xff; - buff[2] = (value >> 16) & 0xff; - buff[3] = (value >> 24) & 0xff; - size_t n = fwrite(buff, 1, 4, buffer); - if (n != 4) { - return -1; - } - return 0; - } - - int BlockFormatWriter::write(const char *class_name, const char *record, int record_len) { - if (record) { - record_index_++; - // The buffer holds max records_per_block_ of records (block). 
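// On disk (see flush() below) each block is laid out as: a 16-byte sync marker
// (_marker), the block size as a little-endian int32 (write_int), then the
// buffered block itself, which is a protobuf-style message:
//   field 1 (varint):          version, always 1
//   field 2 (length-prefixed): record class name, written once per block
//   field 3 (length-prefixed): one entry per serialized record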
- FILE *buffer = fopen(temp_file_name_, "a"); - if (!buffer) return -1; - if (ftell(buffer) == 0) { - if (pack_tag_and_wiretype(buffer, 1, WIRE_TYPE_VARINT)) - throw std::invalid_argument("Error writting tag and wiretype"); - if (pack_varint_i32(buffer, 1)) - throw std::invalid_argument("Error writting varint_i32"); - if (pack_tag_and_wiretype(buffer, 2, WIRE_TYPE_LENGTH_PREFIXED)) - throw std::invalid_argument("Error writting tag and wiretype"); - if (pack_string(buffer, class_name, strlen(class_name))) - throw std::invalid_argument("Error writting class name"); - } - if (pack_tag_and_wiretype(buffer, 3, WIRE_TYPE_LENGTH_PREFIXED)) - throw std::invalid_argument("Error writtig tag and wiretype"); - if (pack_string(buffer, record, record_len)) - throw std::invalid_argument("Error writting record"); - fclose(buffer); - } - - if ((record_index_ % records_per_block_) == 0) { - flush(); - } - return 0; - } - - int BlockFormatWriter::flush() { - // Flush the records in the buffer to outputfile - FILE *buffer = fopen(temp_file_name_, "r"); - if (buffer) { - fseek(buffer, 0, SEEK_END); - int64_t block_size = ftell(buffer); - fseek(buffer, 0, SEEK_SET); - - if (fwrite(_marker, sizeof(_marker), 1, outputfile_) != 1) return 1; - if (write_int(outputfile_, block_size)) return 1; - uint8_t buff[4096]; - while (1) { - size_t n = fread(buff, 1, sizeof(buff), buffer); - if (n) { - size_t x = fwrite(buff, 1, n, outputfile_); - if (x != n) return 1; - } - if (n != sizeof(buff)) break; - } - fclose(buffer); - // Remove the buffer - if (remove(temp_file_name_)) return 1; - } - return 0; - } - - block_format_writer BlockFormatWriter::getHandle() { - return reinterpret_cast(this); - } - - BlockFormatWriter *getBlockFormatWriter(block_format_writer w) { - return reinterpret_cast(w); - } - -} // namespace twml - -twml_err block_format_writer_create(block_format_writer *w, const char *file_name, int records_per_block) { - HANDLE_EXCEPTIONS( - twml::BlockFormatWriter *writer = new twml::BlockFormatWriter(file_name, records_per_block); - *w = reinterpret_cast(writer);); - return TWML_ERR_NONE; -} - -twml_err block_format_write(block_format_writer w, const char *class_name, const char *record, int record_len) { - HANDLE_EXCEPTIONS( - twml::BlockFormatWriter *writer = twml::getBlockFormatWriter(w); - writer->write(class_name, record, record_len);); - return TWML_ERR_NONE; -} - -twml_err block_format_flush(block_format_writer w) { - HANDLE_EXCEPTIONS( - twml::BlockFormatWriter *writer = twml::getBlockFormatWriter(w); - writer->flush();); - return TWML_ERR_NONE; -} - -twml_err block_format_writer_delete(const block_format_writer w) { - HANDLE_EXCEPTIONS( - delete twml::getBlockFormatWriter(w);); - return TWML_ERR_NONE; -} diff --git a/twml/libtwml/src/lib/CMakeLists.txt b/twml/libtwml/src/lib/CMakeLists.txt deleted file mode 100644 index 6bf2a6e7c..000000000 --- a/twml/libtwml/src/lib/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}) -cmake_minimum_required(VERSION 2.8 FATAL_ERROR) -cmake_policy(VERSION 2.8) - - -set(TWML_VERSION "2.0.0") -string(REPLACE "." 
";" TWML_VERSION_LIST ${TWML_VERSION}) -list(GET TWML_VERSION_LIST 0 TWML_SOVERSION) - -execute_process( - COMMAND - $ENV{LIBTWML_HOME}/src/ops/scripts/get_inc.sh - RESULT_VARIABLE - TF_RES - OUTPUT_VARIABLE - TF_INC) - -file(GLOB_RECURSE sources *.cpp) - -set (CMAKE_CXX_FLAGS "-Wall -std=c++11 ${CMAKE_CXX_FLAGS} -fPIC") - -add_library(twml STATIC ${sources}) - -target_include_directories( - twml - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/../../include - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - ${TF_INC} # Absail dependency from tensorflow - ) - -set_target_properties(twml PROPERTIES - VERSION "${TWML_VERSION}" - SOVERSION "${TWML_SOVERSION}" - ) diff --git a/twml/libtwml/src/lib/CPPLINT.cfg b/twml/libtwml/src/lib/CPPLINT.cfg deleted file mode 100644 index dfe873a9d..000000000 --- a/twml/libtwml/src/lib/CPPLINT.cfg +++ /dev/null @@ -1 +0,0 @@ -exclude_files=murmur_hash3.cpp \ No newline at end of file diff --git a/twml/libtwml/src/lib/DataRecord.cpp b/twml/libtwml/src/lib/DataRecord.cpp deleted file mode 100644 index 766422063..000000000 --- a/twml/libtwml/src/lib/DataRecord.cpp +++ /dev/null @@ -1,72 +0,0 @@ -#include "internal/thrift.h" -#include "internal/error.h" - -#include -#include -#include -#include - -#include -#include - -namespace twml { - -void DataRecord::decode(DataRecordReader &reader) { - uint8_t feature_type = reader.readByte(); - while (feature_type != TTYPE_STOP) { - int16_t field_id = reader.readInt16(); - switch (field_id) { - case DR_BINARY: - reader.readBinary(feature_type, this); - break; - case DR_CONTINUOUS: - reader.readContinuous(feature_type, this); - break; - case DR_DISCRETE: - reader.readDiscrete(feature_type, this); - break; - case DR_STRING: - reader.readString(feature_type, this); - break; - case DR_SPARSE_BINARY: - reader.readSparseBinary(feature_type, this); - break; - case DR_SPARSE_CONTINUOUS: - reader.readSparseContinuous(feature_type, this); - break; - case DR_BLOB: - reader.readBlob(feature_type, this); - break; - case DR_GENERAL_TENSOR: - reader.readTensor(feature_type, dynamic_cast(this)); - break; - case DR_SPARSE_TENSOR: - reader.readSparseTensor(feature_type, dynamic_cast(this)); - break; - default: - throw ThriftInvalidField(field_id, "DataRecord::decode"); - } - feature_type = reader.readByte(); - } -} - -void DataRecord::addLabel(int64_t id, double label) { - m_labels[id] = label; -} - -void DataRecord::addWeight(int64_t id, double val) { - m_weights[id] = val; -} - -void DataRecord::clear() { - std::fill(m_labels.begin(), m_labels.end(), std::nanf("")); - std::fill(m_weights.begin(), m_weights.end(), 0.0); - m_binary.clear(); - m_continuous.clear(); - m_discrete.clear(); - m_string.clear(); - m_sparsebinary.clear(); - m_sparsecontinuous.clear(); -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/DataRecordReader.cpp b/twml/libtwml/src/lib/DataRecordReader.cpp deleted file mode 100644 index f151e07a7..000000000 --- a/twml/libtwml/src/lib/DataRecordReader.cpp +++ /dev/null @@ -1,230 +0,0 @@ -#include "internal/thrift.h" -#include "internal/error.h" -#include -#include - -#include - -namespace twml { - -inline std::string bufferToString(int32_t str_len, const uint8_t *str) { - return std::string(str, str + str_len); -} - - -bool DataRecordReader::keepKey(const int64_t &key, int64_t &code) { - auto it = m_keep_map->find(key); - if (it == m_keep_map->end()) return false; - code = it->second; - return true; -} - -bool DataRecordReader::isLabel(const int64_t &key, int64_t &code) { - if (m_labels_map == nullptr) return false; - auto it = 
m_labels_map->find(key); - if (it == m_labels_map->end()) return false; - code = it->second; - return true; -} - -bool DataRecordReader::isWeight(const int64_t &key, int64_t &code) { - if (m_weights_map == nullptr) return false; - auto it = m_weights_map->find(key); - if (it == m_weights_map->end()) return false; - code = it->second; - return true; -} - - -void DataRecordReader::readBinary( - const int feature_type, - DataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_SET, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - int32_t length = readInt32(); - int64_t id, code; -#ifdef USE_DENSE_HASH - record->m_binary.resize(2 * length); -#else - record->m_binary.reserve(2 * length); -#endif - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - record->m_binary.insert(id); - if (isLabel(id, code)) { - record->addLabel(code); - } - } -} - -void DataRecordReader::readContinuous( - const int feature_type, - DataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_DOUBLE, "value_type"); - - int32_t length = readInt32(); - int64_t id, code; -#ifdef USE_DENSE_HASH - record->m_continuous.resize(2 * length); -#else - record->m_continuous.reserve(2 * length); -#endif - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - double val = readDouble(); - if (!std::isnan(val)) { - record->m_continuous[id] = val; - } - if (isLabel(id, code)) { - record->addLabel(code, val); - } else if (isWeight(id, code)) { - record->addWeight(code, val); - } - } -} - -void DataRecordReader::readDiscrete( - const int feature_type, - DataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "value_type"); - - int32_t length = readInt32(); - int64_t id; -#ifdef USE_DENSE_HASH - record->m_discrete.resize(2 * length); -#else - record->m_discrete.reserve(2 * length); -#endif - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - record->m_discrete[id] = readInt64(); - } -} - -void DataRecordReader::readString( - const int feature_type, - DataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "value_type"); - int32_t length = readInt32(); - int64_t id; - -#ifdef USE_DENSE_HASH - record->m_string.resize(2 * length); -#else - record->m_string.reserve(2 * length); -#endif - - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - const uint8_t *begin = nullptr; - int32_t str_len = getRawBuffer(&begin); - record->m_string[id] = bufferToString(str_len, begin); - } -} - -void DataRecordReader::readSparseBinary( - const int feature_type, - DataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_SET, "value_type"); - - int32_t length = readInt32(); - int64_t id, code; - -#ifdef USE_DENSE_HASH - record->m_sparsebinary.resize(2 * length); -#else - record->m_sparsebinary.reserve(2 * length); -#endif - - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "set:key_type"); - int32_t set_length = readInt32(); - if (keepKey(id, code)) { - record->m_sparsebinary[id].reserve(set_length); - for (int32_t j = 0; j < set_length; j++) { - const uint8_t *begin = nullptr; - int32_t 
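// Editor's aside -- illustrative and standalone, not part of the original file:
// keepKey(), isLabel() and isWeight() above all follow the same pattern: an
// optional id -> code map decides whether a feature id is decoded, routed to
// the labels or weights, or skipped over on the wire. A minimal sketch:
#include <cstdint>
#include <unordered_map>

using FeatureCodeMap = std::unordered_map<int64_t, int64_t>;   // hypothetical name

// Returns true and fills `code` if `key` is present; a null map matches nothing.
bool lookupFeature(const FeatureCodeMap *map, int64_t key, int64_t &code) {
  if (map == nullptr) return false;
  auto it = map->find(key);
  if (it == map->end()) return false;
  code = it->second;
  return true;
}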
str_len = getRawBuffer(&begin); - record->m_sparsebinary[id].push_back(bufferToString(str_len, begin)); - } - } else { - for (int32_t j = 0; j < set_length; j++) { - int32_t str_len = readInt32(); - skipLength(str_len); - } - } - } -} - -void DataRecordReader::readSparseContinuous( - const int feature_type, - DataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_MAP, "value_type"); - - int32_t length = readInt32(); - int64_t id, code; - -#ifdef USE_DENSE_HASH - record->m_sparsecontinuous.resize(2 * length); -#else - record->m_sparsecontinuous.reserve(2 * length); -#endif - - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "map::key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_DOUBLE, "map::value_type"); - int32_t map_length = readInt32(); - if (keepKey(id, code)) { - record->m_sparsecontinuous[id].reserve(map_length); - for (int32_t j = 0; j < map_length; j++) { - const uint8_t *begin = nullptr; - int32_t str_len = getRawBuffer(&begin); - double val = readDouble(); - if (!std::isnan(val)) { - record->m_sparsecontinuous[id].push_back({bufferToString(str_len, begin), val}); - } - } - } else { - for (int32_t j = 0; j < map_length; j++) { - int32_t str_len = readInt32(); - skipLength(str_len); - skip(); - } - } - } -} - -void DataRecordReader::readBlob( - const int feature_type, - DataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "value_type"); - - int32_t length = readInt32(); - int64_t id, code; - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - if (keepKey(id, code)) { - const uint8_t *begin = nullptr; - int32_t blob_len = getRawBuffer(&begin); - record->m_blob[id] = std::vector(begin, begin + blob_len); - } else { - int32_t str_len = readInt32(); - skipLength(str_len); - } - } -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/DataRecordWriter.cpp b/twml/libtwml/src/lib/DataRecordWriter.cpp deleted file mode 100644 index e12a50d48..000000000 --- a/twml/libtwml/src/lib/DataRecordWriter.cpp +++ /dev/null @@ -1,162 +0,0 @@ -#include "internal/error.h" -#include "internal/thrift.h" - -#include -#include -#include -#include -#include - -using namespace twml::io; - -namespace twml { - -void DataRecordWriter::writeBinary(twml::DataRecord &record) { - const DataRecord::BinaryFeatures bin_features = record.getBinary(); - - if (bin_features.size() > 0) { - m_thrift_writer.writeStructFieldHeader(TTYPE_SET, DR_BINARY); - m_thrift_writer.writeListHeader(TTYPE_I64, bin_features.size()); - - for (const auto &it : bin_features) { - m_thrift_writer.writeInt64(it); - } - } -} - -void DataRecordWriter::writeContinuous(twml::DataRecord &record) { - const DataRecord::ContinuousFeatures cont_features = record.getContinuous(); - - if (cont_features.size() > 0) { - m_thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_CONTINUOUS); - m_thrift_writer.writeMapHeader(TTYPE_I64, TTYPE_DOUBLE, cont_features.size()); - - for (const auto &it : cont_features) { - m_thrift_writer.writeInt64(it.first); - m_thrift_writer.writeDouble(it.second); - } - } -} - -void DataRecordWriter::writeDiscrete(twml::DataRecord &record) { - const DataRecord::DiscreteFeatures disc_features = record.getDiscrete(); - - if (disc_features.size() > 0) { - m_thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_DISCRETE); - 
m_thrift_writer.writeMapHeader(TTYPE_I64, TTYPE_I64, disc_features.size()); - - for (const auto &it : disc_features) { - m_thrift_writer.writeInt64(it.first); - m_thrift_writer.writeInt64(it.second); - } - } -} - -void DataRecordWriter::writeString(twml::DataRecord &record) { - const DataRecord::StringFeatures str_features = record.getString(); - - if (str_features.size() > 0) { - m_thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_STRING); - m_thrift_writer.writeMapHeader(TTYPE_I64, TTYPE_STRING, str_features.size()); - - - for (const auto &it : str_features) { - m_thrift_writer.writeInt64(it.first); - m_thrift_writer.writeString(it.second); - } - } -} - -// convert from internal representation list<(i64, string)> -// to Thrift representation map> -void DataRecordWriter::writeSparseBinaryFeatures(twml::DataRecord &record) { - const DataRecord::SparseBinaryFeatures sp_bin_features = record.getSparseBinary(); - - // write map> as Thrift - if (sp_bin_features.size() > 0) { - m_thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_SPARSE_BINARY); - m_thrift_writer.writeMapHeader(TTYPE_I64, TTYPE_SET, sp_bin_features.size()); - - for (auto key_vals : sp_bin_features) { - m_thrift_writer.writeInt64(key_vals.first); - m_thrift_writer.writeListHeader(TTYPE_STRING, key_vals.second.size()); - - for (auto name : key_vals.second) - m_thrift_writer.writeString(name); - } - } -} - -// convert from internal representation list<(i64, string, double)> -// to Thrift representation map> -void DataRecordWriter::writeSparseContinuousFeatures(twml::DataRecord &record) { - const DataRecord::SparseContinuousFeatures sp_cont_features = record.getSparseContinuous(); - - // write map> as Thrift - if (sp_cont_features.size() > 0) { - m_thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_SPARSE_CONTINUOUS); - m_thrift_writer.writeMapHeader(TTYPE_I64, TTYPE_MAP, sp_cont_features.size()); - - for (auto key_vals : sp_cont_features) { - m_thrift_writer.writeInt64(key_vals.first); - - if (key_vals.second.size() == 0) - throw IOError(IOError::MALFORMED_MEMORY_RECORD); - - m_thrift_writer.writeMapHeader(TTYPE_STRING, TTYPE_DOUBLE, key_vals.second.size()); - - for (auto map_str_double : key_vals.second) { - m_thrift_writer.writeString(map_str_double.first); - m_thrift_writer.writeDouble(map_str_double.second); - } - } - } -} - -void DataRecordWriter::writeBlobFeatures(twml::DataRecord &record) { - const DataRecord::BlobFeatures blob_features = record.getBlob(); - - if (blob_features.size() > 0) { - m_thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_BLOB); - m_thrift_writer.writeMapHeader(TTYPE_I64, TTYPE_STRING, blob_features.size()); - - for (const auto &it : blob_features) { - m_thrift_writer.writeInt64(it.first); - std::vector value = it.second; - m_thrift_writer.writeBinary(value.data(), value.size()); - } - } -} - -void DataRecordWriter::writeDenseTensors(twml::DataRecord &record) { - TensorRecord::RawTensors raw_tensors = record.getRawTensors(); - if (raw_tensors.size() > 0) { - m_thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_GENERAL_TENSOR); - m_tensor_writer.write(record); - } -} - -TWMLAPI uint32_t DataRecordWriter::getRecordsWritten() { - return m_records_written; -} - -TWMLAPI uint64_t DataRecordWriter::write(twml::DataRecord &record) { - uint64_t bytes_written_before = m_thrift_writer.getBytesWritten(); - - writeBinary(record); - writeContinuous(record); - writeDiscrete(record); - writeString(record); - writeSparseBinaryFeatures(record); - writeSparseContinuousFeatures(record); - writeBlobFeatures(record); - 
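// Editor's aside -- illustrative sketch, not from the original source: the
// sparse continuous group handled by readSparseContinuous() and
// writeSparseContinuousFeatures() is a map<i64, map<string, double>> on the
// wire; in memory the reader keeps it as feature id -> list of (name, value)
// pairs and silently drops NaN values. A standalone version of that container:
#include <cmath>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using SparseContinuous =
    std::unordered_map<int64_t, std::vector<std::pair<std::string, double>>>;

void addSparseContinuous(SparseContinuous &features, int64_t id,
                         const std::string &name, double value) {
  if (std::isnan(value)) return;   // mirrors the reader's NaN filtering
  features[id].push_back({name, value});
}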
writeDenseTensors(record); - // TODO add sparse tensor field - - m_thrift_writer.writeStructStop(); - m_records_written++; - - return m_thrift_writer.getBytesWritten() - bytes_written_before; -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/HashedDataRecord.cpp b/twml/libtwml/src/lib/HashedDataRecord.cpp deleted file mode 100644 index 6bbecee70..000000000 --- a/twml/libtwml/src/lib/HashedDataRecord.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include "internal/thrift.h" -#include "internal/error.h" - -#include -#include -#include - -#include -#include -#include - -namespace twml { - -void HashedDataRecord::decode(HashedDataRecordReader &reader) { - uint8_t feature_type = reader.readByte(); - while (feature_type != TTYPE_STOP) { - int16_t field_id = reader.readInt16(); - switch (field_id) { - case DR_BINARY: - reader.readBinary(feature_type, this); - break; - case DR_CONTINUOUS: - reader.readContinuous(feature_type, this); - break; - case DR_DISCRETE: - reader.readDiscrete(feature_type, this); - break; - case DR_STRING: - reader.readString(feature_type, this); - break; - case DR_SPARSE_BINARY: - reader.readSparseBinary(feature_type, this); - break; - case DR_SPARSE_CONTINUOUS: - reader.readSparseContinuous(feature_type, this); - break; - case DR_BLOB: - reader.readBlob(feature_type, this); - break; - case DR_GENERAL_TENSOR: - reader.readTensor(feature_type, dynamic_cast(this)); - break; - case DR_SPARSE_TENSOR: - reader.readSparseTensor(feature_type, dynamic_cast(this)); - break; - default: - throw ThriftInvalidField(field_id, "HashedDataRecord::readThrift"); - } - feature_type = reader.readByte(); - } -} - -void HashedDataRecord::addKey(int64_t key, int64_t transformed_key, - int64_t code, uint8_t type, double value) { - m_keys.push_back(key); - m_transformed_keys.push_back(transformed_key); - m_values.push_back(value); - m_codes.push_back(code); - m_types.push_back(type); -} - -void HashedDataRecord::addLabel(int64_t id, double label) { - m_labels[id] = label; -} - -void HashedDataRecord::addWeight(int64_t id, double val) { - m_weights[id] = val; -} - -void HashedDataRecord::clear() { - std::fill(m_labels.begin(), m_labels.end(), std::nanf("")); - std::fill(m_weights.begin(), m_weights.end(), 0.0); - m_keys.clear(); - m_transformed_keys.clear(); - m_values.clear(); - m_codes.clear(); - m_types.clear(); -} - -} // namespace twml \ No newline at end of file diff --git a/twml/libtwml/src/lib/HashedDataRecordReader.cpp b/twml/libtwml/src/lib/HashedDataRecordReader.cpp deleted file mode 100644 index 93c86001b..000000000 --- a/twml/libtwml/src/lib/HashedDataRecordReader.cpp +++ /dev/null @@ -1,218 +0,0 @@ -#include "internal/thrift.h" -#include "internal/error.h" - -#include -#include -#include -#include - -namespace twml { - -bool HashedDataRecordReader::keepId(const int64_t &key, int64_t &code) { - auto it = m_keep_map->find(key); - if (it == m_keep_map->end()) return false; - code = it->second; - return true; -} - -bool HashedDataRecordReader::isLabel(const int64_t &key, int64_t &code) { - if (m_labels_map == nullptr) return false; - auto it = m_labels_map->find(key); - if (it == m_labels_map->end()) return false; - code = it->second; - return true; -} - -bool HashedDataRecordReader::isWeight(const int64_t &key, int64_t &code) { - if (m_weights_map == nullptr) return false; - auto it = m_weights_map->find(key); - if (it == m_weights_map->end()) return false; - code = it->second; - return true; -} - -void HashedDataRecordReader::readBinary( - const int feature_type, - HashedDataRecord 
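// Editor's aside -- illustrative only, not part of the original file: a
// HashedDataRecord keeps its decoded features as parallel arrays, and addKey()
// above appends one entry to each of them per kept feature. A plain-struct
// sketch of the same layout (the default value of 1.0 is an assumption):
#include <cstdint>
#include <vector>

struct HashedFeatureColumns {
  std::vector<int64_t> keys, transformed_keys, codes;
  std::vector<double>  values;
  std::vector<uint8_t> types;

  void add(int64_t key, int64_t transformed_key, int64_t code,
           uint8_t type, double value = 1.0) {
    keys.push_back(key);
    transformed_keys.push_back(transformed_key);
    values.push_back(value);
    codes.push_back(code);
    types.push_back(type);
  }
};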
*record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_SET, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - - int32_t length = readInt32(); - record->extendSize(length); - int64_t id, code; - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - if (keepId(id, code)) { - record->addKey(id, id, code, DR_BINARY); - } else if (isLabel(id, code)) { - record->addLabel(code); - } - } -} - -void HashedDataRecordReader::readContinuous( - const int feature_type, - HashedDataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_DOUBLE, "value_type"); - - int32_t length = readInt32(); - record->extendSize(length); - int64_t id, code; - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - if (keepId(id, code)) { - double value = readDouble(); - if (!std::isnan(value)) { - record->addKey(id, id, code, DR_CONTINUOUS, value); - } - } else if (isLabel(id, code)) { - record->addLabel(code, readDouble()); - } else if (isWeight(id, code)) { - record->addWeight(code, readDouble()); - } else { - skip(); - } - } -} - -void HashedDataRecordReader::readDiscrete( - const int feature_type, - HashedDataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "value_type"); - - int32_t length = readInt32(); - record->extendSize(length); - int64_t id, code; - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - if (keepId(id, code)) { - int64_t transformed_key = mixDiscreteIdAndValue(id, readInt64()); - record->addKey(id, transformed_key, code, DR_DISCRETE); - } else { - skip(); - } - } -} - -void HashedDataRecordReader::readString( - const int feature_type, - HashedDataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "value_type"); - - int32_t length = readInt32(); - record->extendSize(length); - int64_t id, code; - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - if (keepId(id, code)) { - const uint8_t *begin = nullptr; - int32_t str_len = getRawBuffer(&begin); - int64_t transformed_key = mixStringIdAndValue(id, str_len, begin); - record->addKey(id, transformed_key, code, DR_STRING); - } else { - int32_t str_len = readInt32(); - skipLength(str_len); - } - } -} - -void HashedDataRecordReader::readSparseBinary( - const int feature_type, - HashedDataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_SET, "value_type"); - - int32_t length = readInt32(); - record->extendSize(length); - int64_t id, code; - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - if (keepId(id, code)) { - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "set:key_type"); - int32_t set_length = readInt32(); - for (int32_t j = 0; j < set_length; j++) { - const uint8_t *begin = nullptr; - int32_t str_len = getRawBuffer(&begin); - int64_t transformed_key = mixStringIdAndValue(id, str_len, begin); - record->addKey(id, transformed_key, code, DR_SPARSE_BINARY); - } - } else { - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "set:key_type"); - int32_t set_length = readInt32(); - for (int32_t j = 0; j < set_length; j++) { - int32_t str_len = readInt32(); - skipLength(str_len); - } - } - } -} - -void HashedDataRecordReader::readSparseContinuous( - 
const int feature_type, - HashedDataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_MAP, "value_type"); - - int32_t length = readInt32(); - record->extendSize(length); - int64_t id, code; - for (int32_t i = 0; i < length; i++) { - id = readInt64(); - if (keepId(id, code)) { - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "map::key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_DOUBLE, "map::value_type"); - int32_t map_length = readInt32(); - for (int32_t j = 0; j < map_length; j++) { - const uint8_t *begin = nullptr; - int32_t str_len = getRawBuffer(&begin); - int64_t transformed_key = 0; - switch(m_decode_mode) { - case DecodeMode::hash_fname_and_valname: - transformed_key = mixStringIdAndValue(id, str_len, begin); - break; - default: // m_decode_mode == DecodeMode::hash_valname == 0 is default - twml_get_feature_id(&transformed_key, str_len, reinterpret_cast(begin)); - } - double value = readDouble(); - if (!std::isnan(value)) { - record->addKey(id, transformed_key, code, DR_SPARSE_CONTINUOUS, value); - } - } - } else { - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "map::key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_DOUBLE, "map::value_type"); - int32_t map_length = readInt32(); - for (int32_t j = 0; j < map_length; j++) { - int32_t str_len = readInt32(); - skipLength(str_len); - skip(); - } - } - } -} - -void HashedDataRecordReader::readBlob( - const int feature_type, - HashedDataRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "value_type"); - - int32_t length = readInt32(); - int64_t id; - for (int32_t i = 0; i < length; i++) { - // Skips the BlobFeatures if they are defined or not in the FeatureConfig - id = readInt64(); - int32_t str_len = readInt32(); - skipLength(str_len); - } -} -} // namespace twml \ No newline at end of file diff --git a/twml/libtwml/src/lib/Hashmap.cpp b/twml/libtwml/src/lib/Hashmap.cpp deleted file mode 100644 index 4086e8a16..000000000 --- a/twml/libtwml/src/lib/Hashmap.cpp +++ /dev/null @@ -1,380 +0,0 @@ -#include "internal/khash.h" -#include "internal/error.h" -#include -#include -#include - -namespace twml { - HashMap::HashMap() : - m_hashmap(nullptr) { - TWML_CHECK(twml_hashmap_create(&m_hashmap), "Failed to create HashMap"); - } - - HashMap::~HashMap() { - // Do not throw exceptions from the destructor - twml_hashmap_delete(m_hashmap); - } - - void HashMap::clear() { - TWML_CHECK(twml_hashmap_clear(m_hashmap), "Failed to clear HashMap"); - } - - uint64_t HashMap::size() const { - uint64_t size; - TWML_CHECK(twml_hashmap_get_size(&size, m_hashmap), "Failed to get HashMap size"); - return size; - } - - int8_t HashMap::insert(const HashKey_t key) { - int8_t result; - TWML_CHECK(twml_hashmap_insert_key(&result, m_hashmap, key), - "Failed to insert key"); - return result; - } - - int8_t HashMap::insert(const HashKey_t key, const HashKey_t val) { - int8_t result; - TWML_CHECK(twml_hashmap_insert_key_and_value(&result, m_hashmap, key, val), - "Failed to insert key"); - return result; - } - - int8_t HashMap::get(HashVal_t &val, const HashKey_t key) const { - int8_t result; - TWML_CHECK(twml_hashmap_get_value(&result, &val, m_hashmap, key), - "Failed to insert key,value pair"); - return result; - } - - void HashMap::insert(Tensor &mask, const Tensor keys) { - TWML_CHECK(twml_hashmap_insert_keys(mask.getHandle(), m_hashmap, 
keys.getHandle()), - "Failed to insert keys tensor"); - } - - void HashMap::insert(Tensor &mask, const Tensor keys, const Tensor vals) { - TWML_CHECK(twml_hashmap_insert_keys_and_values(mask.getHandle(), m_hashmap, - keys.getHandle(), vals.getHandle()), - "Failed to insert keys,values tensor pair"); - } - - void HashMap::remove(const Tensor keys) { - TWML_CHECK(twml_hashmap_remove_keys(m_hashmap, keys.getHandle()), - "Failed to remove keys tensor"); - } - - void HashMap::get(Tensor &mask, Tensor &vals, const Tensor keys) const { - TWML_CHECK(twml_hashmap_get_values(mask.getHandle(), vals.getHandle(), - m_hashmap, keys.getHandle()), - "Failed to get values tensor"); - } - - void HashMap::getInplace(Tensor &mask, Tensor &keys_vals) const { - TWML_CHECK(twml_hashmap_get_values_inplace(mask.getHandle(), - keys_vals.getHandle(), - m_hashmap), - "Failed to get values tensor"); - } - - void HashMap::toTensors(Tensor &keys, Tensor &vals) const { - TWML_CHECK(twml_hashmap_to_tensors(keys.getHandle(), - vals.getHandle(), - m_hashmap), - "Failed to get keys,values tensors from HashMap"); - } -} // namespace twml - -using twml::HashKey_t; -using twml::HashVal_t; - -KHASH_MAP_INIT_INT64(HashKey_t, HashVal_t); -typedef khash_t(HashKey_t)* hash_map_t; - - -twml_err twml_hashmap_create(twml_hashmap *hashmap) { - hash_map_t *h = reinterpret_cast(hashmap); - *h = kh_init(HashKey_t); - return TWML_ERR_NONE; -} - -twml_err twml_hashmap_clear(const twml_hashmap hashmap) { - hash_map_t h = (hash_map_t)hashmap; - kh_clear(HashKey_t, h); - return TWML_ERR_NONE; -} - -twml_err twml_hashmap_get_size(uint64_t *size, const twml_hashmap hashmap) { - hash_map_t h = (hash_map_t)hashmap; - *size = kh_size(h); - return TWML_ERR_NONE; -} - - -twml_err twml_hashmap_delete(const twml_hashmap hashmap) { - hash_map_t h = (hash_map_t)hashmap; - kh_destroy(HashKey_t, h); - return TWML_ERR_NONE; -} - -// insert, remove, get single key / value -twml_err twml_hashmap_insert_key(int8_t *mask, - const twml_hashmap hashmap, - const HashKey_t key) { - hash_map_t h = (hash_map_t)hashmap; - int ret = 0; - khiter_t k = kh_put(HashKey_t, h, key, &ret); - *mask = ret >= 0; - if (*mask) { - HashVal_t v = kh_size(h); - kh_value(h, k) = v; - } - return TWML_ERR_NONE; -} - -twml_err twml_hashmap_insert_key_and_value(int8_t *mask, twml_hashmap hashmap, - const HashKey_t key, const HashVal_t val) { - hash_map_t h = (hash_map_t)hashmap; - int ret = 0; - khiter_t k = kh_put(HashKey_t, h, key, &ret); - *mask = ret >= 0; - if (*mask) { - kh_value(h, k) = val; - } - return TWML_ERR_NONE; -} - -twml_err twml_hashmap_remove_key(const twml_hashmap hashmap, - const HashKey_t key) { - hash_map_t h = (hash_map_t)hashmap; - khiter_t k = kh_get(HashKey_t, h, key); - if (k != kh_end(h)) { - kh_del(HashKey_t, h, k); - } - return TWML_ERR_NONE; -} - -twml_err twml_hashmap_get_value(int8_t *mask, HashVal_t *val, - const twml_hashmap hashmap, const HashKey_t key) { - hash_map_t h = (hash_map_t)hashmap; - khiter_t k = kh_get(HashKey_t, h, key); - if (k == kh_end(h)) { - *mask = false; - } else { - *val = kh_value(h, k); - *mask = true; - } - return TWML_ERR_NONE; -} - -// insert, get, remove tensors of keys / values -twml_err twml_hashmap_insert_keys(twml_tensor masks, - const twml_hashmap hashmap, - const twml_tensor keys) { - auto masks_tensor = twml::getTensor(masks); - auto keys_tensor = twml::getConstTensor(keys); - - if (masks_tensor->getType() != TWML_TYPE_INT8) { - return TWML_ERR_TYPE; - } - - if (keys_tensor->getType() != TWML_TYPE_INT64) { - return 
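// Editor's aside -- an illustrative usage sketch, not part of the original
// file: the C functions above wrap a khash int64 -> int64 table behind an
// opaque twml_hashmap handle. Assuming the twml header that declares them is
// on the include path (its name is not shown here), scalar usage looks like:
#include <cstdint>
#include <cstdio>

void hashmap_example() {
  twml_hashmap h = nullptr;
  twml_hashmap_create(&h);                       // each call returns a twml_err (checks omitted)

  int8_t inserted = 0;
  twml_hashmap_insert_key_and_value(&inserted, h, /*key=*/42, /*val=*/7);

  int8_t found = 0;
  twml::HashVal_t value = 0;
  twml_hashmap_get_value(&found, &value, h, /*key=*/42);
  if (found) std::printf("42 -> %lld\n", static_cast<long long>(value));

  twml_hashmap_delete(h);                        // frees the underlying khash table
}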
TWML_ERR_TYPE; - } - - if (keys_tensor->getNumElements() != masks_tensor->getNumElements()) { - return TWML_ERR_SIZE; - } - - int8_t *mptr = masks_tensor->getData(); - const HashKey_t *kptr = keys_tensor->getData(); - - uint64_t num_elements = keys_tensor->getNumElements(); - - hash_map_t h = (hash_map_t)hashmap; - for (uint64_t i = 0; i < num_elements; i++) { - int ret = 0; - khiter_t k = kh_put(HashKey_t, h, kptr[i], &ret); - mptr[i] = ret >= 0; - if (mptr[i]) { - HashVal_t v = kh_size(h); - kh_value(h, k) = v; - } - } - return TWML_ERR_NONE; -} - -twml_err twml_hashmap_insert_keys_and_values(twml_tensor masks, - twml_hashmap hashmap, - const twml_tensor keys, - const twml_tensor vals) { - auto masks_tensor = twml::getTensor(masks); - auto keys_tensor = twml::getConstTensor(keys); - auto vals_tensor = twml::getConstTensor(vals); - - if (masks_tensor->getType() != TWML_TYPE_INT8) { - return TWML_ERR_TYPE; - } - - if (keys_tensor->getType() != TWML_TYPE_INT64) { - return TWML_ERR_TYPE; - } - - if (vals_tensor->getType() != TWML_TYPE_INT64) { - return TWML_ERR_TYPE; - } - - if (keys_tensor->getNumElements() != vals_tensor->getNumElements() || - keys_tensor->getNumElements() != masks_tensor->getNumElements()) { - return TWML_ERR_SIZE; - } - - int8_t *mptr = masks_tensor->getData(); - const HashKey_t *kptr = keys_tensor->getData(); - const HashVal_t *vptr = twml::getConstTensor(vals)->getData(); - - uint64_t num_elements = keys_tensor->getNumElements(); - - hash_map_t h = (hash_map_t)hashmap; - for (uint64_t i = 0; i < num_elements; i++) { - int ret = 0; - khiter_t k = kh_put(HashKey_t, h, kptr[i], &ret); - mptr[i] = ret >= 0; - if (mptr[i]) { - kh_value(h, k) = vptr[i]; - } - } - return TWML_ERR_NONE; -} - -twml_err twml_hashmap_remove_keys(const twml_hashmap hashmap, - const twml_tensor keys) { - auto keys_tensor = twml::getConstTensor(keys); - - if (keys_tensor->getType() != TWML_TYPE_INT64) { - return TWML_ERR_TYPE; - } - - const HashKey_t *kptr = keys_tensor->getData(); - uint64_t num_elements = keys_tensor->getNumElements(); - - hash_map_t h = (hash_map_t)hashmap; - for (uint64_t i = 0; i < num_elements; i++) { - khiter_t k = kh_get(HashKey_t, h, kptr[i]); - if (k != kh_end(h)) { - kh_del(HashKey_t, h, kptr[i]); - } - } - return TWML_ERR_NONE; -} - -twml_err twml_hashmap_get_values(twml_tensor masks, twml_tensor vals, - const twml_hashmap hashmap, const twml_tensor keys) { - auto masks_tensor = twml::getTensor(masks); - auto vals_tensor = twml::getTensor(vals); - auto keys_tensor = twml::getConstTensor(keys); - - if (masks_tensor->getType() != TWML_TYPE_INT8) { - return TWML_ERR_TYPE; - } - - if (keys_tensor->getType() != TWML_TYPE_INT64) { - return TWML_ERR_TYPE; - } - - if (vals_tensor->getType() != TWML_TYPE_INT64) { - return TWML_ERR_TYPE; - } - - if (keys_tensor->getNumElements() != vals_tensor->getNumElements() || - keys_tensor->getNumElements() != masks_tensor->getNumElements()) { - return TWML_ERR_SIZE; - } - - int8_t *mptr = masks_tensor->getData(); - HashVal_t *vptr = vals_tensor->getData(); - const HashKey_t *kptr = keys_tensor->getData(); - - uint64_t num_elements = keys_tensor->getNumElements(); - - hash_map_t h = (hash_map_t)hashmap; - for (uint64_t i = 0; i < num_elements; i++) { - khiter_t k = kh_get(HashKey_t, h, kptr[i]); - if (k == kh_end(h)) { - mptr[i] = false; - } else { - mptr[i] = true; - vptr[i] = kh_value(h, k); - } - } - return TWML_ERR_NONE; -} - -twml_err twml_hashmap_get_values_inplace(twml_tensor masks, twml_tensor keys_vals, - const twml_hashmap hashmap) 
{ - auto masks_tensor = twml::getTensor(masks); - auto keys_tensor = twml::getTensor(keys_vals); - - if (masks_tensor->getType() != TWML_TYPE_INT8) { - return TWML_ERR_TYPE; - } - - if (keys_tensor->getType() != TWML_TYPE_INT64) { - return TWML_ERR_TYPE; - } - - if (keys_tensor->getNumElements() != masks_tensor->getNumElements()) { - return TWML_ERR_SIZE; - } - - int8_t *mptr = masks_tensor->getData(); - HashKey_t *kptr = keys_tensor->getData(); - - uint64_t num_elements = keys_tensor->getNumElements(); - - hash_map_t h = (hash_map_t)hashmap; - for (uint64_t i = 0; i < num_elements; i++) { - khiter_t k = kh_get(HashKey_t, h, kptr[i]); - if (k == kh_end(h)) { - mptr[i] = false; - } else { - mptr[i] = true; - kptr[i] = kh_value(h, k); - } - } - return TWML_ERR_NONE; -} - -twml_err twml_hashmap_to_tensors(twml_tensor keys, twml_tensor vals, - const twml_hashmap hashmap) { - hash_map_t h = (hash_map_t)hashmap; - const uint64_t size = kh_size(h); - - auto keys_tensor = twml::getTensor(keys); - auto vals_tensor = twml::getTensor(vals); - - if (keys_tensor->getType() != TWML_TYPE_INT64) { - return TWML_ERR_TYPE; - } - - if (vals_tensor->getType() != TWML_TYPE_INT64) { - return TWML_ERR_TYPE; - } - - if (size != keys_tensor->getNumElements() || - size != vals_tensor->getNumElements()) { - return TWML_ERR_SIZE; - } - - HashKey_t *kptr = keys_tensor->getData(); - HashVal_t *vptr = vals_tensor->getData(); - - HashKey_t key, i = 0; - HashKey_t val; - - kh_foreach(h, key, val, { - kptr[i] = key; - vptr[i] = val; - i++; - }); - - return TWML_ERR_NONE; -} diff --git a/twml/libtwml/src/lib/Tensor.cpp b/twml/libtwml/src/lib/Tensor.cpp deleted file mode 100644 index d610d9316..000000000 --- a/twml/libtwml/src/lib/Tensor.cpp +++ /dev/null @@ -1,191 +0,0 @@ -#include "internal/error.h" -#include -#include -#include -#include -#include - -namespace twml { - -using std::vector; - -Tensor::Tensor(void *data, int ndims, const uint64_t *dims, const uint64_t *strides, twml_type type) : - m_type(type), m_data(data), - m_dims(dims, dims + ndims), - m_strides(strides, strides + ndims) { -} - -Tensor::Tensor(void *data, - const vector &dims, - const vector &strides, - twml_type type) : - m_type(type), m_data(data), - m_dims(dims.begin(), dims.end()), - m_strides(strides.begin(), strides.end()) { - if (dims.size() != strides.size()) { - throw twml::Error(TWML_ERR_SIZE, "The number size of dims and strides don't match"); - } -} - -int Tensor::getNumDims() const { - return static_cast(m_dims.size()); -} - -uint64_t Tensor::getDim(int id) const { - if (id >= this->getNumDims()) { - throw twml::Error(TWML_ERR_SIZE, "Requested dimension exceeds tensor dimension"); - } - return m_dims[id]; -} - -uint64_t Tensor::getStride(int id) const { - if (id >= this->getNumDims()) { - throw twml::Error(TWML_ERR_SIZE, "Requested dimension exceeds tensor dimension"); - } - return m_strides[id]; -} - -uint64_t Tensor::getNumElements() const { - return std::accumulate(m_dims.begin(), m_dims.end(), 1, std::multiplies()); -} - -twml_type Tensor::getType() const { - return m_type; -} - -twml_tensor Tensor::getHandle() { - return reinterpret_cast(this); -} - -const twml_tensor Tensor::getHandle() const { - return reinterpret_cast(const_cast(this)); -} - -const Tensor *getConstTensor(const twml_tensor t) { - return reinterpret_cast(t); -} - -Tensor *getTensor(twml_tensor t) { - return reinterpret_cast(t); -} - -#define INSTANTIATE(T) \ - template<> TWMLAPI T *Tensor::getData() { \ - if ((twml_type)Type::type != m_type) { \ - throw 
twml::Error(TWML_ERR_TYPE, \ - "Requested invalid type"); \ - } \ - return reinterpret_cast(m_data); \ - } \ - template<> TWMLAPI const T *Tensor::getData() const { \ - if ((twml_type)Type::type != m_type) { \ - throw twml::Error(TWML_ERR_TYPE, \ - "Requested invalid type"); \ - } \ - return (const T *)m_data; \ - } \ - -INSTANTIATE(int32_t) -INSTANTIATE(int64_t) -INSTANTIATE(int8_t) -INSTANTIATE(uint8_t) -INSTANTIATE(float) -INSTANTIATE(double) -INSTANTIATE(bool) -INSTANTIATE(std::string) - -// This is used for the C api. No checks needed for void. -template<> TWMLAPI void *Tensor::getData() { - return m_data; -} -template<> TWMLAPI const void *Tensor::getData() const { - return (const void *)m_data; -} - -std::string getTypeName(twml_type type) { - switch (type) { - case TWML_TYPE_FLOAT32 : return "float32"; - case TWML_TYPE_FLOAT64 : return "float64"; - case TWML_TYPE_INT32 : return "int32"; - case TWML_TYPE_INT64 : return "int64"; - case TWML_TYPE_INT8 : return "int8"; - case TWML_TYPE_UINT8 : return "uint8"; - case TWML_TYPE_BOOL : return "bool"; - case TWML_TYPE_STRING : return "string"; - case TWML_TYPE_UNKNOWN : return "Unknown type"; - } - throw twml::Error(TWML_ERR_TYPE, "Uknown type"); -} - -uint64_t getSizeOf(twml_type dtype) { - switch (dtype) { - case TWML_TYPE_FLOAT : return 4; - case TWML_TYPE_DOUBLE : return 8; - case TWML_TYPE_INT64 : return 8; - case TWML_TYPE_INT32 : return 4; - case TWML_TYPE_UINT8 : return 1; - case TWML_TYPE_BOOL : return 1; - case TWML_TYPE_INT8 : return 1; - case TWML_TYPE_STRING : - throw twml::Error(TWML_ERR_THRIFT, "getSizeOf not supported for strings"); - case TWML_TYPE_UNKNOWN: - throw twml::Error(TWML_ERR_THRIFT, "Can't get size of unknown types"); - } - throw twml::Error(TWML_ERR_THRIFT, "Invalid twml_type"); -} - -} // namespace twml - -twml_err twml_tensor_create(twml_tensor *t, void *data, int ndims, uint64_t *dims, - uint64_t *strides, twml_type type) { - HANDLE_EXCEPTIONS( - twml::Tensor *res = new twml::Tensor(data, ndims, dims, strides, type); - *t = reinterpret_cast(res);); - return TWML_ERR_NONE; -} - -twml_err twml_tensor_delete(const twml_tensor t) { - HANDLE_EXCEPTIONS( - delete twml::getConstTensor(t);); - return TWML_ERR_NONE; -} - -twml_err twml_tensor_get_type(twml_type *type, const twml_tensor t) { - HANDLE_EXCEPTIONS( - *type = twml::getConstTensor(t)->getType();); - return TWML_ERR_NONE; -} - -twml_err twml_tensor_get_data(void **data, const twml_tensor t) { - HANDLE_EXCEPTIONS( - *data = twml::getTensor(t)->getData();); - return TWML_ERR_NONE; -} - -twml_err twml_tensor_get_dim(uint64_t *dim, const twml_tensor t, int id) { - HANDLE_EXCEPTIONS( - const twml::Tensor *tensor = twml::getConstTensor(t); - *dim = tensor->getDim(id);); - return TWML_ERR_NONE; -} - -twml_err twml_tensor_get_stride(uint64_t *stride, const twml_tensor t, int id) { - HANDLE_EXCEPTIONS( - const twml::Tensor *tensor = twml::getConstTensor(t); - *stride = tensor->getStride(id);); - return TWML_ERR_NONE; -} - -twml_err twml_tensor_get_num_dims(int *ndim, const twml_tensor t) { - HANDLE_EXCEPTIONS( - const twml::Tensor *tensor = twml::getConstTensor(t); - *ndim = tensor->getNumDims();); - return TWML_ERR_NONE; -} - -twml_err twml_tensor_get_num_elements(uint64_t *nelements, const twml_tensor t) { - HANDLE_EXCEPTIONS( - const twml::Tensor *tensor = twml::getConstTensor(t); - *nelements = tensor->getNumElements();); - return TWML_ERR_NONE; -} diff --git a/twml/libtwml/src/lib/TensorRecordReader.cpp b/twml/libtwml/src/lib/TensorRecordReader.cpp deleted file 
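// Editor's aside -- illustrative sketch, not from the original source:
// twml::Tensor is a non-owning view over caller-managed memory, described by a
// data pointer, per-dimension sizes and strides, and a twml_type. Assuming the
// twml Tensor header (not shown here) is available, wrapping a 2x3 row-major
// float array looks like:
#include <cstdint>
#include <vector>

void tensor_view_example() {
  float data[6] = {0, 1, 2, 3, 4, 5};
  std::vector<uint64_t> dims    = {2, 3};
  std::vector<uint64_t> strides = {3, 1};   // row-major: 3 floats per row step
  twml::Tensor view(data, dims, strides, TWML_TYPE_FLOAT);
  // view.getNumElements() == 6, view.getDim(1) == 3, view.getStride(0) == 3,
  // and view.getData<float>() returns the original pointer after a type check.
}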
mode 100644 index 3ffb1b98a..000000000 --- a/twml/libtwml/src/lib/TensorRecordReader.cpp +++ /dev/null @@ -1,323 +0,0 @@ -#include "internal/thrift.h" -#include "internal/error.h" -#include - -#include -#include - -namespace twml { - -template struct TensorTraits; - -#define INSTANTIATE(TYPE, THRIFT_TYPE, TWML_TYPE) \ - template<> struct TensorTraits { \ - static const TTYPES ThriftType = THRIFT_TYPE; \ - static const twml_type TwmlType = TWML_TYPE; \ - }; \ - -INSTANTIATE(int64_t, TTYPE_I64, TWML_TYPE_INT64) -INSTANTIATE(int32_t, TTYPE_I32, TWML_TYPE_INT32) -INSTANTIATE(double, TTYPE_DOUBLE, TWML_TYPE_DOUBLE) -INSTANTIATE(bool, TTYPE_BOOL, TWML_TYPE_BOOL) - -static -std::vector calcStrides(const std::vector &shape) { - int ndims = static_cast(shape.size()); - std::vector strides(ndims); - uint64_t stride = 1; - for (int i = ndims-1; i >= 0; i--) { - strides[i] = stride; - stride *= shape[i]; - } - return strides; -} - -static twml_type getTwmlType(int dtype) { - // Convert tensor.thrift enum to twml enum - switch (dtype) { - case DATA_TYPE_FLOAT: - return TWML_TYPE_FLOAT; - case DATA_TYPE_DOUBLE: - return TWML_TYPE_DOUBLE; - case DATA_TYPE_INT64: - return TWML_TYPE_INT64; - case DATA_TYPE_INT32: - return TWML_TYPE_INT32; - case DATA_TYPE_UINT8: - return TWML_TYPE_UINT8; - case DATA_TYPE_STRING: - return TWML_TYPE_STRING; - case DATA_TYPE_BOOL: - return TWML_TYPE_BOOL; - } - return TWML_TYPE_UNKNOWN; -} - -std::vector TensorRecordReader::readShape() { - int32_t length = readInt32(); - - std::vector shape; - shape.reserve(length); - for (int32_t i = 0; i < length; i++) { - shape.push_back(static_cast(readInt64())); - } - - return shape; -} - -template -RawTensor TensorRecordReader::readTypedTensor() { - std::vector shape; - int32_t length = 0; - const uint8_t *data = nullptr; - uint64_t raw_length = 0; - uint8_t field_type = TTYPE_STOP; - - while ((field_type = readByte()) != TTYPE_STOP) { - int16_t field_id = readInt16(); - switch (field_id) { - case 1: - CHECK_THRIFT_TYPE(field_type, TTYPE_LIST, "data"); - CHECK_THRIFT_TYPE(readByte(), TensorTraits::ThriftType, "data_type"); - length = getRawBuffer(&data); - raw_length = length * sizeof(T); - break; - case 2: - CHECK_THRIFT_TYPE(field_type, TTYPE_LIST, "shape"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "shape_type"); - shape = readShape(); - break; - default: - throw ThriftInvalidField(field_id, "TensorRecordReader::readTypedTensor"); - } - } - - // data is required - if (data == nullptr) { - throw twml::Error(TWML_ERR_THRIFT, "data field not found for TypedTensor"); - } - - // shape is optional - if (shape.size() == 0) { - shape.push_back((uint64_t)length); - } - - // TODO: Try avoiding stride calculation - std::vector strides = calcStrides(shape); - // FIXME: Try to use const void * in Tensors. 
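// Editor's aside -- a standalone restatement of calcStrides() above, not part
// of the original file: strides are derived right-to-left from the shape, so a
// shape of {2, 3, 4} yields strides {12, 4, 1} (row-major, measured in
// elements rather than bytes).
#include <cstdint>
#include <vector>

std::vector<uint64_t> rowMajorStrides(const std::vector<uint64_t> &shape) {
  std::vector<uint64_t> strides(shape.size());
  uint64_t stride = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; i--) {
    strides[i] = stride;     // stride of dim i = product of all later dims
    stride *= shape[i];
  }
  return strides;
}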
- return RawTensor(const_cast(static_cast(data)), - shape, strides, (twml_type)TensorTraits::TwmlType, true, raw_length); -} - -RawTensor TensorRecordReader::readRawTypedTensor() { - std::vector shape; - const uint8_t *data = nullptr; - twml_type type = TWML_TYPE_UNKNOWN; - uint64_t raw_length = 0; - uint8_t field_type = TTYPE_STOP; - - while ((field_type = readByte()) != TTYPE_STOP) { - int16_t field_id = readInt16(); - switch (field_id) { - case 1: - CHECK_THRIFT_TYPE(field_type, TTYPE_I32, "DataType"); - type = getTwmlType(readInt32()); - break; - case 2: - CHECK_THRIFT_TYPE(field_type, TTYPE_STRING, "content"); - raw_length = getRawBuffer(&data); - break; - case 3: - CHECK_THRIFT_TYPE(field_type, TTYPE_LIST, "shape"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "shape_type"); - shape = readShape(); - break; - default: - throw ThriftInvalidField(field_id, "TensorRecordReader::readRawTypedTensor"); - } - } - - // data type is required - if (type == TWML_TYPE_UNKNOWN) { - throw twml::Error(TWML_ERR_THRIFT, "DataType is a required field for RawTypedTensor"); - } - - // data is required - if (data == nullptr) { - throw twml::Error(TWML_ERR_THRIFT, "content is a required field for RawTypedTensor"); - } - - // shape is optional in the thrift file, but it is really required for string types. - if (shape.size() == 0) { - if (type == TWML_TYPE_STRING) { - throw twml::Error(TWML_ERR_THRIFT, "shape required for string types in RawTypedTensor"); - } - shape.push_back((uint64_t)(raw_length / getSizeOf(type))); - } - - // TODO: Try avoiding stride calculation - std::vector strides = calcStrides(shape); - // FIXME: Try to use const void * data inside Tensors. - return RawTensor(const_cast(static_cast(data)), - shape, strides, type, false, raw_length); -} - -RawTensor TensorRecordReader::readStringTensor() { - std::vector shape; - int32_t length = 0; - const uint8_t *data = nullptr; - uint64_t raw_length = 0; - uint8_t field_type = TTYPE_STOP; - const uint8_t *dummy = nullptr; - - while ((field_type = readByte()) != TTYPE_STOP) { - int16_t field_id = readInt16(); - switch (field_id) { - case 1: - CHECK_THRIFT_TYPE(field_type, TTYPE_LIST, "data"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRING, "data_type"); - length = readInt32(); - // Store the current location of the byte stream. - // Use this at to "deocde strings" at a later point. - data = getBuffer(); - for (int32_t i = 0; i < length; i++) { - // Skip reading the strings - getRawBuffer(&dummy); - } - raw_length = length; - break; - case 2: - CHECK_THRIFT_TYPE(field_type, TTYPE_LIST, "shape"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "shape_type"); - shape = readShape(); - break; - default: - throw ThriftInvalidField(field_id, "TensorRecordReader::readTypedTensor"); - } - } - - // data is required - if (data == nullptr) { - throw twml::Error(TWML_ERR_THRIFT, "data field not found for TypedTensor"); - } - - // shape is optional - if (shape.size() == 0) { - shape.push_back((uint64_t)length); - } - - // TODO: Try avoiding stride calculation - std::vector strides = calcStrides(shape); - // FIXME: Try to use const void * in Tensors. - return RawTensor(const_cast(static_cast(data)), - shape, strides, TWML_TYPE_UINT8, false, raw_length); -} - -RawTensor TensorRecordReader::readGeneralTensor() { - // No loop is required because GeneralTensor is union. It is going to contain one field only. 
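// Editor's aside -- illustrative, not part of the original source: a
// RawTypedTensor carries a DataType enum (field 1), the raw content bytes
// (field 2) and an optional shape (field 3); when the shape is missing,
// readRawTypedTensor() above falls back to a flat 1-D shape computed from the
// byte length and getSizeOf(type). A standalone sketch of that fallback
// (string tensors must instead carry an explicit shape):
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<uint64_t> inferFlatShape(uint64_t raw_length, uint64_t element_size) {
  if (element_size == 0 || raw_length % element_size != 0)
    throw std::runtime_error("content length is not a multiple of element size");
  return {raw_length / element_size};
}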
- // All the fields are structs - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRUCT, "type"); - int16_t field_id = readInt16(); - RawTensor output; - - switch (field_id) { - case GT_RAW: - output = readRawTypedTensor(); - break; - case GT_STRING: - output = readStringTensor(); - break; - case GT_INT32: - output = readTypedTensor(); - break; - case GT_INT64: - output = readTypedTensor(); - break; - case GT_FLOAT: - case GT_DOUBLE: - // Store both FloatTensor and DoubleTensor as double tensor as both are list of doubles. - output = readTypedTensor(); - break; - case GT_BOOL: - output = readTypedTensor(); - break; - default: - throw ThriftInvalidField(field_id, "TensorRecordReader::readGeneralTensor()"); - } - - CHECK_THRIFT_TYPE(readByte(), TTYPE_STOP, "stop"); - return output; -} - -RawSparseTensor TensorRecordReader::readCOOSparseTensor() { - std::vector shape; - uint8_t field_type = TTYPE_STOP; - RawTensor indices, values; - - while ((field_type = readByte()) != TTYPE_STOP) { - int16_t field_id = readInt16(); - switch (field_id) { - case 1: - CHECK_THRIFT_TYPE(field_type, TTYPE_LIST, "shape"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "shape_type"); - shape = readShape(); - break; - case 2: - indices = readTypedTensor(); - break; - case 3: - values = readGeneralTensor(); - break; - default: - throw twml::Error(TWML_ERR_THRIFT, "Invalid field when deocidng COOSparseTensor"); - } - } - - return RawSparseTensor(indices, values, shape); -} - -void TensorRecordReader::readTensor(const int feature_type, TensorRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRUCT, "value_type"); - - int32_t length = readInt32(); - for (int32_t i = 0; i < length; i++) { - int64_t id = readInt64(); - record->m_tensors.emplace(id, readGeneralTensor()); - } -} - -void TensorRecordReader::readSparseTensor(const int feature_type, TensorRecord *record) { - CHECK_THRIFT_TYPE(feature_type, TTYPE_MAP, "type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_I64, "key_type"); - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRUCT, "value_type"); - - int32_t length = readInt32(); - for (int32_t i = 0; i < length; i++) { - int64_t id = readInt64(); - - // No loop is required because SparseTensor is union. It is going to contain one field only. - // All the fields are structs - CHECK_THRIFT_TYPE(readByte(), TTYPE_STRUCT, "field"); - int16_t field_id = readInt16(); - RawSparseTensor output; - - // Only COOSparsetensor is supported. - switch (field_id) { - case SP_COO: - output = readCOOSparseTensor(); - break; - default: - throw ThriftInvalidField(field_id, "TensorRecordReader::readSparseTensor()"); - } - - // Read the last byte of the struct. - CHECK_THRIFT_TYPE(readByte(), TTYPE_STOP, "stop"); - - // Add to the map. 
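// Editor's aside -- illustrative, not from the original file: a COOSparseTensor
// (the only SparseTensor variant supported here) pairs a dense shape with two
// aligned tensors: the indices of the non-zero entries and their values. A
// plain-struct sketch of the same idea; the exact index layout is whatever the
// producer wrote, so treat this as schematic:
#include <cstdint>
#include <vector>

struct CooSparseSketch {
  std::vector<uint64_t> shape;     // dense shape, e.g. {3, 4}
  std::vector<int64_t>  indices;   // positions of the non-zero entries
  std::vector<double>   values;    // one value per entry in `indices`
};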
- record->m_sparse_tensors.emplace(id, output); - } -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/TensorRecordWriter.cpp b/twml/libtwml/src/lib/TensorRecordWriter.cpp deleted file mode 100644 index b1fe98e64..000000000 --- a/twml/libtwml/src/lib/TensorRecordWriter.cpp +++ /dev/null @@ -1,162 +0,0 @@ -#include "internal/error.h" -#include "internal/thrift.h" - -#include -#include -#include -#include - -using namespace twml::io; - -namespace twml { - -static int32_t getRawThriftType(twml_type dtype) { - // convert twml enum to tensor.thrift enum - switch (dtype) { - case TWML_TYPE_FLOAT: - return DATA_TYPE_FLOAT; - case TWML_TYPE_DOUBLE: - return DATA_TYPE_DOUBLE; - case TWML_TYPE_INT64: - return DATA_TYPE_INT64; - case TWML_TYPE_INT32: - return DATA_TYPE_INT32; - case TWML_TYPE_UINT8: - return DATA_TYPE_UINT8; - case TWML_TYPE_STRING: - return DATA_TYPE_STRING; - case TWML_TYPE_BOOL: - return DATA_TYPE_BOOL; - default: - throw IOError(IOError::UNSUPPORTED_OUTPUT_TYPE); - } -} - -void TensorRecordWriter::writeTensor(const RawTensor &tensor) { - if (tensor.getType() == TWML_TYPE_INT32) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_INT32); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_I32, tensor.getNumElements()); - - const int32_t *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeInt32(data[i]); - - } else if (tensor.getType() == TWML_TYPE_INT64) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_INT64); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_I64, tensor.getNumElements()); - - const int64_t *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeInt64(data[i]); - - } else if (tensor.getType() == TWML_TYPE_FLOAT) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_FLOAT); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_DOUBLE, tensor.getNumElements()); - - const float *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeDouble(static_cast(data[i])); - - } else if (tensor.getType() == TWML_TYPE_DOUBLE) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_DOUBLE); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_DOUBLE, tensor.getNumElements()); - - const double *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeDouble(data[i]); - - } else if (tensor.getType() == TWML_TYPE_STRING) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_STRING); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_STRING, tensor.getNumElements()); - - const std::string *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeString(data[i]); - - } else if (tensor.getType() == TWML_TYPE_BOOL) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_BOOL); - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 1); - m_thrift_writer.writeListHeader(TTYPE_BOOL, tensor.getNumElements()); - - const bool *data = tensor.getData(); - - for (uint64_t i = 0; i < tensor.getNumElements(); i++) - m_thrift_writer.writeBool(data[i]); - - } else { - throw IOError(IOError::UNSUPPORTED_OUTPUT_TYPE); - } - - // write tensor shape field - 
m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 2); - m_thrift_writer.writeListHeader(TTYPE_I64, tensor.getNumDims()); - - for (uint64_t i = 0; i < tensor.getNumDims(); i++) - m_thrift_writer.writeInt64(tensor.getDim(i)); - - m_thrift_writer.writeStructStop(); - m_thrift_writer.writeStructStop(); -} - -void TensorRecordWriter::writeRawTensor(const RawTensor &tensor) { - m_thrift_writer.writeStructFieldHeader(TTYPE_STRUCT, GT_RAW); - - // dataType field - m_thrift_writer.writeStructFieldHeader(TTYPE_I32, 1); - m_thrift_writer.writeInt32(getRawThriftType(tensor.getType())); - - // content field - uint64_t type_size = getSizeOf(tensor.getType()); - m_thrift_writer.writeStructFieldHeader(TTYPE_STRING, 2); - const uint8_t *data = reinterpret_cast(tensor.getData()); - m_thrift_writer.writeBinary(data, tensor.getNumElements() * type_size); - - // shape field - m_thrift_writer.writeStructFieldHeader(TTYPE_LIST, 3); - m_thrift_writer.writeListHeader(TTYPE_I64, tensor.getNumDims()); - - for (uint64_t i = 0; i < tensor.getNumDims(); i++) - m_thrift_writer.writeInt64(tensor.getDim(i)); - - m_thrift_writer.writeStructStop(); - m_thrift_writer.writeStructStop(); -} - -TWMLAPI uint32_t TensorRecordWriter::getRecordsWritten() { - return m_records_written; -} - -// Caller (usually DataRecordWriter) must precede with struct header field -// like thrift_writer.writeStructFieldHeader(TTYPE_MAP, DR_GENERAL_TENSOR) -TWMLAPI uint64_t TensorRecordWriter::write(twml::TensorRecord &record) { - uint64_t bytes_written_before = m_thrift_writer.getBytesWritten(); - - m_thrift_writer.writeMapHeader(TTYPE_I64, TTYPE_STRUCT, record.getRawTensors().size()); - - for (auto id_tensor_pairs : record.getRawTensors()) { - m_thrift_writer.writeInt64(id_tensor_pairs.first); - - // all tensors written as RawTensor Thrift except for StringTensors - // this avoids the overhead of converting little endian to big endian - if (id_tensor_pairs.second.getType() == TWML_TYPE_STRING) - writeTensor(id_tensor_pairs.second); - else - writeRawTensor(id_tensor_pairs.second); - } - - m_records_written++; - - return m_thrift_writer.getBytesWritten() - bytes_written_before; -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/ThriftReader.cpp b/twml/libtwml/src/lib/ThriftReader.cpp deleted file mode 100644 index bceb74c13..000000000 --- a/twml/libtwml/src/lib/ThriftReader.cpp +++ /dev/null @@ -1,33 +0,0 @@ -#include "internal/endianutils.h" - -#include -#include - -#include - -namespace twml { - -uint8_t ThriftReader::readByte() { - return readDirect(); -} - -int16_t ThriftReader::readInt16() { - return betoh16(readDirect()); -} - -int32_t ThriftReader::readInt32() { - return betoh32(readDirect()); -} - -int64_t ThriftReader::readInt64() { - return betoh64(readDirect()); -} - -double ThriftReader::readDouble() { - double val; - int64_t *val_proxy = reinterpret_cast(&val); - *val_proxy = readInt64(); - return val; -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/ThriftWriter.cpp b/twml/libtwml/src/lib/ThriftWriter.cpp deleted file mode 100644 index 4f298a154..000000000 --- a/twml/libtwml/src/lib/ThriftWriter.cpp +++ /dev/null @@ -1,91 +0,0 @@ -#include "internal/endianutils.h" -#include "internal/error.h" -#include "internal/thrift.h" - -#include -#include -#include - -#include - -using namespace twml::io; - -namespace twml { - -template inline -uint64_t ThriftWriter::write(T val) { - if (!m_dry_run) { - if (m_bytes_written + sizeof(T) > m_buffer_size) - throw IOError(IOError::DESTINATION_LARGER_THAN_CAPACITY); - 
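// Editor's aside -- illustrative, standalone, not part of the original source:
// the Thrift binary protocol is big-endian, so ThriftReader above converts
// integers with betoh16/32/64 and moves doubles through an int64, and the
// writeInt*/writeDouble overloads later in this file mirror that. A sketch of
// decoding a big-endian double on a little-endian host:
#include <cstdint>
#include <cstring>

static uint64_t byteswap64(uint64_t v) {
  uint64_t out = 0;
  for (int i = 0; i < 8; i++) out = (out << 8) | ((v >> (8 * i)) & 0xFF);
  return out;
}

double decodeBigEndianDouble(const uint8_t bytes[8]) {
  uint64_t wire;
  std::memcpy(&wire, bytes, sizeof(wire));       // raw on-wire word
  uint64_t host = byteswap64(wire);              // assumes a little-endian host
  double value;
  std::memcpy(&value, &host, sizeof(value));     // reinterpret the bits as a double
  return value;
}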
memcpy(m_buffer, &val, sizeof(T)); - m_buffer += sizeof(T); - } - m_bytes_written += sizeof(T); - return sizeof(T); -} - -TWMLAPI uint64_t ThriftWriter::getBytesWritten() { - return m_bytes_written; -} - -TWMLAPI uint64_t ThriftWriter::writeStructFieldHeader(int8_t field_type, int16_t field_id) { - return writeInt8(field_type) + writeInt16(field_id); -} - -TWMLAPI uint64_t ThriftWriter::writeStructStop() { - return writeInt8(static_cast(TTYPE_STOP)); -} - -TWMLAPI uint64_t ThriftWriter::writeListHeader(int8_t element_type, int32_t num_elems) { - return writeInt8(element_type) + writeInt32(num_elems); -} - -TWMLAPI uint64_t ThriftWriter::writeMapHeader(int8_t key_type, int8_t val_type, int32_t num_elems) { - return writeInt8(key_type) + writeInt8(val_type) + writeInt32(num_elems); -} - -TWMLAPI uint64_t ThriftWriter::writeDouble(double val) { - int64_t bin_value; - memcpy(&bin_value, &val, sizeof(int64_t)); - return writeInt64(bin_value); -} - -TWMLAPI uint64_t ThriftWriter::writeInt8(int8_t val) { - return write(val); -} - -TWMLAPI uint64_t ThriftWriter::writeInt16(int16_t val) { - return write(betoh16(val)); -} - -TWMLAPI uint64_t ThriftWriter::writeInt32(int32_t val) { - return write(betoh32(val)); -} - -TWMLAPI uint64_t ThriftWriter::writeInt64(int64_t val) { - return write(betoh64(val)); -} - -TWMLAPI uint64_t ThriftWriter::writeBinary(const uint8_t *bytes, int32_t num_bytes) { - writeInt32(num_bytes); - - if (!m_dry_run) { - if (m_bytes_written + num_bytes > m_buffer_size) - throw IOError(IOError::DESTINATION_LARGER_THAN_CAPACITY); - memcpy(m_buffer, bytes, num_bytes); - m_buffer += num_bytes; - } - m_bytes_written += num_bytes; - - return 4 + num_bytes; -} - -TWMLAPI uint64_t ThriftWriter::writeString(std::string str) { - return writeBinary(reinterpret_cast(str.data()), str.length()); -} - -TWMLAPI uint64_t ThriftWriter::writeBool(bool val) { - return write(val); -} - -} // namespace twml diff --git a/twml/libtwml/src/lib/discretizer_impl.cpp b/twml/libtwml/src/lib/discretizer_impl.cpp deleted file mode 100644 index 3f161341e..000000000 --- a/twml/libtwml/src/lib/discretizer_impl.cpp +++ /dev/null @@ -1,167 +0,0 @@ -#include "internal/interpolate.h" -#include "internal/error.h" -#include -#include - -namespace twml { - // it is assumed that start_compute and end_compute are valid - template - void discretizerInfer(Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_ids, - const Tensor &input_vals, - const Tensor &bin_ids, - const Tensor &bin_vals, - const Tensor &feature_offsets, - int output_bits, - const Map &ID_to_index, - int64_t start_compute, - int64_t end_compute, - int64_t output_start) { - auto out_keysData = output_keys.getData(); - auto out_valsData = output_vals.getData(); - uint64_t out_keysStride = output_keys.getStride(0); - uint64_t out_valsStride = output_vals.getStride(0); - - auto in_idsData = input_ids.getData(); - auto in_valsData = input_vals.getData(); - uint64_t in_idsStride = input_ids.getStride(0); - uint64_t in_valsStride = input_vals.getStride(0); - - auto xsData = bin_vals.getData(); - auto ysData = bin_ids.getData(); - uint64_t xsStride = bin_vals.getStride(0); - uint64_t ysStride = bin_ids.getStride(0); - - auto offsetData = feature_offsets.getData(); - - uint64_t total_bins = bin_ids.getNumElements(); - uint64_t fsize = feature_offsets.getNumElements(); - - uint64_t output_size = (1 << output_bits); - - for (uint64_t i = start_compute; i < end_compute; i++) { - int64_t feature_ID = in_idsData[i * in_idsStride]; - T val = in_valsData[i * 
in_valsStride]; - - auto iter = ID_to_index.find(feature_ID); - if (iter == ID_to_index.end()) { - // feature not calibrated - // modulo add operation for new key from feature ID - int64_t ikey = feature_ID % (output_size - total_bins) + total_bins; - out_keysData[(i + output_start - start_compute) * out_keysStride] = ikey; - out_valsData[(i + output_start - start_compute) * out_valsStride] = val; - continue; - } - - int64_t ikey = iter->second; - - // Perform interpolation - uint64_t offset = offsetData[ikey]; - uint64_t next_offset = (ikey == (int64_t)(fsize - 1)) ? total_bins : offsetData[ikey + 1]; - uint64_t mainSize = next_offset - offset; - - const T *lxsData = xsData + offset; - const int64_t *lysData = ysData + offset; - int64_t okey; - okey = interpolation(lxsData, xsStride, - lysData, ysStride, - val, mainSize, - NEAREST, 0); - out_keysData[(i + output_start - start_compute) * out_keysStride] = okey; - out_valsData[(i + output_start - start_compute) * out_valsStride] = 1; - } - } - - void discretizerInfer(Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_ids, - const Tensor &input_vals, - const Tensor &bin_ids, - const Tensor &bin_vals, - const Tensor &feature_offsets, - int output_bits, - const Map &ID_to_index, - int start_compute, - int end_compute, - int output_start) { - if (input_ids.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "input_ids must be a Long Tensor"); - } - - if (output_keys.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor"); - } - - if (bin_ids.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "bin_ids must be a Long Tensor"); - } - - if (feature_offsets.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "bin_ids must be a Long Tensor"); - } - - if (input_vals.getType() != bin_vals.getType()) { - throw twml::Error(TWML_ERR_TYPE, - "Data type of input_vals does not match type of bin_vals"); - } - - if (bin_vals.getNumDims() != 1) { - throw twml::Error(TWML_ERR_SIZE, - "bin_vals must be 1 Dimensional"); - } - - if (bin_ids.getNumDims() != 1) { - throw twml::Error(TWML_ERR_SIZE, - "bin_ids must be 1 Dimensional"); - } - - if (bin_vals.getNumElements() != bin_ids.getNumElements()) { - throw twml::Error(TWML_ERR_SIZE, - "Dimensions of bin_vals and bin_ids do not match"); - } - - if (feature_offsets.getStride(0) != 1) { - throw twml::Error(TWML_ERR_SIZE, - "feature_offsets must be contiguous"); - } - - uint64_t size = input_ids.getDim(0); - if (end_compute == -1) { - end_compute = size; - } - - if (start_compute < 0 || start_compute >= size) { - throw twml::Error(TWML_ERR_SIZE, - "start_compute out of range"); - } - - if (end_compute < -1 || end_compute > size) { - throw twml::Error(TWML_ERR_SIZE, - "end_compute out of range"); - } - - if (start_compute > end_compute && end_compute != -1) { - throw twml::Error(TWML_ERR_SIZE, - "must have start_compute <= end_compute, or end_compute==-1"); - } - - switch (input_vals.getType()) { - case TWML_TYPE_FLOAT: - twml::discretizerInfer(output_keys, output_vals, - input_ids, input_vals, - bin_ids, bin_vals, feature_offsets, output_bits, ID_to_index, - start_compute, end_compute, output_start); - break; - case TWML_TYPE_DOUBLE: - twml::discretizerInfer(output_keys, output_vals, - input_ids, input_vals, - bin_ids, bin_vals, feature_offsets, output_bits, ID_to_index, - start_compute, end_compute, output_start); - break; - default: - throw twml::Error(TWML_ERR_TYPE, - "Unsupported datatype for 
discretizerInfer"); - } - } -} // namespace twml diff --git a/twml/libtwml/src/lib/functions.cpp b/twml/libtwml/src/lib/functions.cpp deleted file mode 100644 index b7af3c0ac..000000000 --- a/twml/libtwml/src/lib/functions.cpp +++ /dev/null @@ -1,158 +0,0 @@ -#include "internal/error.h" -#include "internal/murmur_hash3.h" -#include "internal/utf_converter.h" -#include -#include -#include - -namespace twml { - - template - void add1(Tensor &output, const Tensor input) { - T *odata = output.getData(); - const T *idata = input.getData(); - const uint64_t num_elements = input.getNumElements(); - - for (uint64_t i = 0; i < num_elements; i++) { - odata[i] = idata[i] + 1; - } - } - - template - void copy(Tensor &output, const Tensor input) { - T *odata = output.getData(); - const T *idata = input.getData(); - const uint64_t num_elements = input.getNumElements(); - - for (uint64_t i = 0; i < num_elements; i++) { - odata[i] = idata[i]; - } - } - - void add1(Tensor &output, const Tensor input) { - auto type = input.getType(); - if (output.getType() != type) { - throw twml::Error(TWML_ERR_TYPE, "Output type does not match input type"); - } - - if (output.getNumElements() != input.getNumElements()) { - throw twml::Error(TWML_ERR_SIZE, "Output size does not match input size"); - } - - // TODO: Implement an easier dispatch function - switch (type) { - case TWML_TYPE_FLOAT: - twml::add1(output, input); - break; - case TWML_TYPE_DOUBLE: - twml::add1(output, input); - break; - default: - throw twml::Error(TWML_ERR_TYPE, "add1 only supports float and double tensors"); - } - } - - void copy(Tensor &output, const Tensor input) { - auto type = input.getType(); - if (output.getType() != type) { - throw twml::Error(TWML_ERR_TYPE, "Output type does not match input type"); - } - - if (output.getNumElements() != input.getNumElements()) { - throw twml::Error(TWML_ERR_SIZE, "Output size does not match input size"); - } - - // TODO: Implement an easier dispatch function - switch (type) { - case TWML_TYPE_FLOAT: - twml::copy(output, input); - break; - case TWML_TYPE_DOUBLE: - twml::copy(output, input); - break; - default: - throw twml::Error(TWML_ERR_TYPE, "copy only supports float and double tensors"); - } - } - - int64_t featureId(const std::string &feature) { - const char *str = feature.c_str(); - uint64_t len = feature.size(); - int64_t id = 0; - TWML_CHECK(twml_get_feature_id(&id, len, str), "Error getting featureId"); - return id; - } -} // namespace twml - -twml_err twml_add1(twml_tensor output, const twml_tensor input) { - HANDLE_EXCEPTIONS( - auto out = twml::getTensor(output); - auto in = twml::getConstTensor(input); - twml::add1(*out, *in);); - return TWML_ERR_NONE; -} - -twml_err twml_copy(twml_tensor output, const twml_tensor input) { - HANDLE_EXCEPTIONS( - auto out = twml::getTensor(output); - auto in = twml::getConstTensor(input); - twml::copy(*out, *in);); - return TWML_ERR_NONE; -} - -inline twml_err twml_get_feature_id_internal(int64_t *result, - uint64_t out_size, uint16_t *out, - uint64_t out2_size, uint16_t *out2, - const uint64_t len, const char *str) { - uint64_t k = 0; - for (uint64_t i = 0; i < len; i++) { - if (str[i] == '#') { - k = i; - break; - } - } - - uint8_t hash[16]; - if (k != 0) { - ssize_t n = utf8_to_utf16((const uint8_t *) str, k, out, out_size); - if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16"); - - MurmurHash3_x64_128(out, n * sizeof(uint16_t), 0, out2); - n = utf8_to_utf16((const uint8_t *) (str + k + 1), len - k - 1, &out2[4], out2_size - 
8); - if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16"); - - MurmurHash3_x64_128(out2, (n * sizeof(uint16_t)) + 8, 0, hash); - } else { - ssize_t n = utf8_to_utf16((const uint8_t *)str, len, out, out_size); - if (n < 0) throw std::invalid_argument("error while converting from utf8 to utf16"); - MurmurHash3_x64_128(out, n * sizeof(uint16_t), 0, hash); - } - int64_t id; - memcpy(&id, hash, sizeof(int64_t)); - *result = id; - - return TWML_ERR_NONE; -} - -static const int UTF16_STR_MAX_SIZE = 1024; - -twml_err twml_get_feature_id(int64_t *result, const uint64_t len, const char *str) { - try { - uint16_t out[UTF16_STR_MAX_SIZE]; - uint16_t out2[UTF16_STR_MAX_SIZE]; - return twml_get_feature_id_internal(result, - UTF16_STR_MAX_SIZE, out, - UTF16_STR_MAX_SIZE, out2, - len, str); - } catch(const std::invalid_argument &ex) { - // If the space on the stack is not enough, try using the heap. - // len + 1 is needed because a null terminating character is added at the end. - std::vector out(len + 1); - std::vector out2(len + 1); - return twml_get_feature_id_internal(result, - len + 1, out.data(), - len + 1, out2.data(), - len, str); - - } -} diff --git a/twml/libtwml/src/lib/hashing_discretizer_impl.cpp b/twml/libtwml/src/lib/hashing_discretizer_impl.cpp deleted file mode 100644 index 166242ffb..000000000 --- a/twml/libtwml/src/lib/hashing_discretizer_impl.cpp +++ /dev/null @@ -1,241 +0,0 @@ -#include "internal/linear_search.h" -#include "internal/error.h" -#include -#include -#include - -namespace twml { - template - static int64_t lower_bound_search(const Tx *data, const Tx val, const int64_t buf_size) { - auto index_temp = std::lower_bound(data, data + buf_size, val); - return static_cast(index_temp - data); - } - - template - static int64_t upper_bound_search(const Tx *data, const Tx val, const int64_t buf_size) { - auto index_temp = std::upper_bound(data, data + buf_size, val); - return static_cast(index_temp - data); - } - - template - using search_method = int64_t (*)(const Tx *, const Tx, const int64_t); - - typedef uint64_t (*hash_signature)(uint64_t, int64_t, uint64_t); - - // uint64_t integer_multiplicative_hashing() - // - // A function to hash discretized feature_ids into one of 2**output_bits buckets. - // This function hashes the feature_ids to achieve a uniform distribution of - // IDs, so the hashed IDs are with high probability far apart - // Then, bucket_indices can simply be added, resulting in unique new IDs with high probability - // We integer hash again to again spread out the new IDs - // Finally we take the upper - // Required args: - // feature_id: - // The feature id of the feature to be hashed. - // bucket_index: - // The bucket index of the discretized feature value - // output_bits: - // The number of bits of output space for the features to be hashed into. - // - // Note - feature_ids may have arbitrary distribution within int32s - // Note - 64 bit feature_ids can be processed with this, but the upper - // 32 bits have no effect on the output - // e.g. all feature ids 0 through 255 exist in movie-lens. - // this hashing constant is good for 32 LSBs. will use N=32. 
(can use N<32 also) - // this hashing constant is co-prime with 2**32, therefore we have that - // a != b, a and b in [0,2**32) - // implies - // f(a) != f(b) where f(x) = (hashing_constant * x) % (2**32) - // note that we are mostly ignoring the upper 32 bits, using modulo 2**32 arithmetic - uint64_t integer_multiplicative_hashing(uint64_t feature_id, - int64_t bucket_index, - uint64_t output_bits) { - // possibly use 14695981039346656037 for 64 bit unsigned?? - // = 20921 * 465383 * 1509404459 - // alternatively, 14695981039346656039 is prime - // We would also need to use N = 64 - const uint64_t hashing_constant = 2654435761; - const uint64_t N = 32; - // hash once to prevent problems from anomalous input id distributions - feature_id *= hashing_constant; - feature_id += bucket_index; - // this hash enables the following right shift operation - // without losing the bucket information (lower bits) - feature_id *= hashing_constant; - // output size is a power of 2 - feature_id >>= N - output_bits; - uint64_t mask = (1 << output_bits) - 1; - return mask & feature_id; - } - - uint64_t integer64_multiplicative_hashing(uint64_t feature_id, - int64_t bucket_index, - uint64_t output_bits) { - const uint64_t hashing_constant = 14695981039346656039UL; - const uint64_t N = 64; - // hash once to prevent problems from anomalous input id distributions - feature_id *= hashing_constant; - feature_id += bucket_index; - // this hash enables the following right shift operation - // without losing the bucket information (lower bits) - feature_id *= hashing_constant; - // output size is a power of 2 - feature_id >>= N - output_bits; - uint64_t mask = (1 << output_bits) - 1; - return mask & feature_id; - } - - int64_t option_bits(int64_t options, int64_t high, int64_t low) { - options >>= low; - options &= (1 << (high - low + 1)) - 1; - return options; - } - - // it is assumed that start_compute and end_compute are valid - template - void hashDiscretizerInfer(Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_ids, - const Tensor &input_vals, - const Tensor &bin_vals, - int output_bits, - const Map &ID_to_index, - int64_t start_compute, - int64_t end_compute, - int64_t n_bin, - int64_t options) { - auto output_keys_data = output_keys.getData(); - auto output_vals_data = output_vals.getData(); - - auto input_ids_data = input_ids.getData(); - auto input_vals_data = input_vals.getData(); - - auto bin_vals_data = bin_vals.getData(); - - // The function pointer implementation removes the option_bits - // function call (might be inlined) and corresponding branch from - // the hot loop, but it prevents inlining these functions, so - // there will be function call overhead. Uncertain which would - // be faster, testing needed. Also, code optimizers do weird things... 
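Before that dispatch (which continues just below), it may help to see the hash on its own. The following standalone sketch reproduces `integer_multiplicative_hashing` exactly as defined above and prints the bucketed keys for a few (feature_id, bucket_index) pairs; the example ids and `output_bits = 20` are illustrative only:

```cpp
// Standalone copy of the 32-bit multiplicative hash above, plus a tiny driver.
// Example feature ids and output_bits are made up for illustration.
#include <cstdint>
#include <cstdio>

uint64_t integer_multiplicative_hashing(uint64_t feature_id,
                                        int64_t bucket_index,
                                        uint64_t output_bits) {
  const uint64_t hashing_constant = 2654435761;  // co-prime with 2**32, see comment above
  const uint64_t N = 32;
  feature_id *= hashing_constant;   // spread out the raw ids
  feature_id += bucket_index;       // mix the bin index into the low bits
  feature_id *= hashing_constant;   // re-hash so the shift below keeps that info
  feature_id >>= N - output_bits;   // keep the top output_bits of the low 32 bits
  uint64_t mask = (1 << output_bits) - 1;
  return mask & feature_id;
}

int main() {
  const uint64_t output_bits = 20;  // 2**20 output buckets (illustrative)
  const int64_t ids[] = {42, 43};
  for (int64_t id : ids)
    for (int64_t bin = 0; bin < 3; ++bin)
      std::printf("id=%lld bin=%lld -> key=%llu\n",
                  (long long)id, (long long)bin,
                  (unsigned long long)integer_multiplicative_hashing(id, bin, output_bits));
  return 0;
}
```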
- hash_signature hash_fn = integer_multiplicative_hashing; - switch (option_bits(options, 4, 2)) { - case 0: - hash_fn = integer_multiplicative_hashing; - break; - case 1: - hash_fn = integer64_multiplicative_hashing; - break; - default: - hash_fn = integer_multiplicative_hashing; - } - - search_method search_fn = lower_bound_search; - switch (option_bits(options, 1, 0)) { - case 0: - search_fn = lower_bound_search; - break; - case 1: - search_fn = linear_search; - break; - case 2: - search_fn = upper_bound_search; - break; - default: - search_fn = lower_bound_search; - } - - for (uint64_t i = start_compute; i < end_compute; i++) { - int64_t id = input_ids_data[i]; - T val = input_vals_data[i]; - - auto iter = ID_to_index.find(id); - if (iter != ID_to_index.end()) { - int64_t feature_idx = iter->second; - const T *bin_vals_start = bin_vals_data + feature_idx * n_bin; - int64_t out_bin_idx = search_fn(bin_vals_start, val, n_bin); - output_keys_data[i] = hash_fn(id, out_bin_idx, output_bits); - output_vals_data[i] = 1; - } else { - // feature not calibrated - output_keys_data[i] = id & ((1 << output_bits) - 1); - output_vals_data[i] = val; - } - } - } - - void hashDiscretizerInfer(Tensor &output_keys, - Tensor &output_vals, - const Tensor &input_ids, - const Tensor &input_vals, - int n_bin, - const Tensor &bin_vals, - int output_bits, - const Map &ID_to_index, - int start_compute, - int end_compute, - int64_t options) { - if (input_ids.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "input_ids must be a Long Tensor"); - } - - if (output_keys.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor"); - } - - if (input_vals.getType() != bin_vals.getType()) { - throw twml::Error(TWML_ERR_TYPE, - "Data type of input_vals does not match type of bin_vals"); - } - - if (bin_vals.getNumDims() != 1) { - throw twml::Error(TWML_ERR_SIZE, - "bin_vals must be 1 Dimensional"); - } - - uint64_t size = input_ids.getDim(0); - if (end_compute == -1) { - end_compute = size; - } - - if (start_compute < 0 || start_compute >= size) { - throw twml::Error(TWML_ERR_SIZE, - "start_compute out of range"); - } - - if (end_compute < -1 || end_compute > size) { - throw twml::Error(TWML_ERR_SIZE, - "end_compute out of range"); - } - - if (start_compute > end_compute && end_compute != -1) { - throw twml::Error(TWML_ERR_SIZE, - "must have start_compute <= end_compute, or end_compute==-1"); - } - - if (output_keys.getStride(0) != 1 || output_vals.getStride(0) != 1 || - input_ids.getStride(0) != 1 || input_vals.getStride(0) != 1 || - bin_vals.getStride(0) != 1) { - throw twml::Error(TWML_ERR_SIZE, - "All Strides must be 1."); - } - - switch (input_vals.getType()) { - case TWML_TYPE_FLOAT: - twml::hashDiscretizerInfer(output_keys, output_vals, - input_ids, input_vals, - bin_vals, output_bits, ID_to_index, - start_compute, end_compute, n_bin, options); - break; - case TWML_TYPE_DOUBLE: - twml::hashDiscretizerInfer(output_keys, output_vals, - input_ids, input_vals, - bin_vals, output_bits, ID_to_index, - start_compute, end_compute, n_bin, options); - break; - default: - throw twml::Error(TWML_ERR_TYPE, - "Unsupported datatype for hashDiscretizerInfer"); - } - } -} // namespace twml diff --git a/twml/libtwml/src/lib/internal/endianutils.h b/twml/libtwml/src/lib/internal/endianutils.h deleted file mode 100644 index 3b27797d7..000000000 --- a/twml/libtwml/src/lib/internal/endianutils.h +++ /dev/null @@ -1,137 +0,0 @@ -// -// endian_fix.h -// ImageCore -// -// For OSes 
that use glibc < 2.9 (like RHEL5) -// -#pragma once - -#ifdef __APPLE__ -#include -#define htobe16(x) OSSwapHostToBigInt16(x) -#define htole16(x) OSSwapHostToLittleInt16(x) -#define betoh16(x) OSSwapBigToHostInt16(x) -#define letoh16(x) OSSwapLittleToHostInt16(x) -#define htobe32(x) OSSwapHostToBigInt32(x) -#define htole32(x) OSSwapHostToLittleInt32(x) -#define betoh32(x) OSSwapBigToHostInt32(x) -#define letoh32(x) OSSwapLittleToHostInt32(x) -#define htobe64(x) OSSwapHostToBigInt64(x) -#define htole64(x) OSSwapHostToLittleInt64(x) -#define betoh64(x) OSSwapBigToHostInt64(x) -#define letoh64(x) OSSwapLittleToHostInt64(x) -#else -#include -#ifdef __USE_BSD -/* Conversion interfaces. */ -#include - -#if __BYTE_ORDER == __LITTLE_ENDIAN -#ifndef htobe16 -#define htobe16(x) __bswap_16(x) -#endif -#ifndef htole16 -#define htole16(x) (x) -#endif -#ifndef betoh16 -#define betoh16(x) __bswap_16(x) -#endif -#ifndef letoh16 -#define letoh16(x) (x) -#endif - -#ifndef htobe32 -#define htobe32(x) __bswap_32(x) -#endif -#ifndef htole32 -#define htole32(x) (x) -#endif -#ifndef betoh32 -#define betoh32(x) __bswap_32(x) -#endif -#ifndef letoh32 -#define letoh32(x) (x) -#endif - -#ifndef htobe64 -#define htobe64(x) __bswap_64(x) -#endif -#ifndef htole64 -#define htole64(x) (x) -#endif -#ifndef betoh64 -#define betoh64(x) __bswap_64(x) -#endif -#ifndef letoh64 -#define letoh64(x) (x) -#endif - -#else /* __BYTE_ORDER == __LITTLE_ENDIAN */ -#ifndef htobe16 -#define htobe16(x) (x) -#endif -#ifndef htole16 -#define htole16(x) __bswap_16(x) -#endif -#ifndef be16toh -#define be16toh(x) (x) -#endif -#ifndef le16toh -#define le16toh(x) __bswap_16(x) -#endif - -#ifndef htobe32 -#define htobe32(x) (x) -#endif -#ifndef htole32 -#define htole32(x) __bswap_32(x) -#endif -#ifndef betoh32 -#define betoh32(x) (x) -#endif -#ifndef letoh32 -#define letoh32(x) __bswap_32(x) -#endif - -#ifndef htobe64 -#define htobe64(x) (x) -#endif -#ifndef htole64 -#define htole64(x) __bswap_64(x) -#endif -#ifndef betoh64 -#define betoh64(x) (x) -#endif -#ifndef letoh64 -#define letoh64(x) __bswap_64(x) -#endif - -#endif /* __BYTE_ORDER == __LITTLE_ENDIAN */ - -#else /* __USE_BSD */ -#ifndef betoh16 -#define betoh16 be16toh -#endif - -#ifndef betoh32 -#define betoh32 be32toh -#endif - -#ifndef betoh64 -#define betoh64 be64toh -#endif - -#ifndef letoh16 -#define letoh16 le16toh -#endif - -#ifndef letoh32 -#define letoh32 le32toh -#endif - -#ifndef letoh64 -#define letoh64 le64toh -#endif - -#endif /* __USE_BSD */ -#endif /* __APPLE__ */ diff --git a/twml/libtwml/src/lib/internal/error.h b/twml/libtwml/src/lib/internal/error.h deleted file mode 100644 index 3d1bc5441..000000000 --- a/twml/libtwml/src/lib/internal/error.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once -#include -#include - -#define HANDLE_EXCEPTIONS(fn) do { \ - try { \ - fn \ - } catch(const twml::Error &e) { \ - std::cerr << e.what() << std::endl; \ - return e.err(); \ - } catch(...) 
{ \ - std::cerr << "Unknown error\n"; \ - return TWML_ERR_UNKNOWN; \ - } \ - } while(0) - -#define TWML_CHECK(fn, msg) do { \ - twml_err err = fn; \ - if (err == TWML_ERR_NONE) break; \ - throw twml::Error(err, msg); \ - } while(0) - - -#define CHECK_THRIFT_TYPE(real_type, expected_type, type) do { \ - int real_type_val = real_type; \ - if (real_type_val != expected_type) { \ - throw twml::ThriftInvalidType(real_type_val, __func__, type); \ - } \ - } while(0) diff --git a/twml/libtwml/src/lib/internal/interpolate.h b/twml/libtwml/src/lib/internal/interpolate.h deleted file mode 100644 index 3e1daf53e..000000000 --- a/twml/libtwml/src/lib/internal/interpolate.h +++ /dev/null @@ -1,74 +0,0 @@ -#pragma once - -#ifdef __cplusplus -#include -namespace twml { - - enum InterpolationMode {LINEAR, NEAREST}; - - template - static Tx interpolation(const Tx *xsData, const int64_t xsStride, - const Ty *ysData, const int64_t ysStride, - const Tx val, const int64_t mainSize, - const InterpolationMode mode, - const int64_t lowest, - const bool return_local_index = false) { - int64_t left = 0; - int64_t right = mainSize-1; - - if (val <= xsData[0]) { - right = 0; - } else if (val >= xsData[right*xsStride]) { - left = right; - } else { - while (left < right) { - int64_t middle = (left+right)/2; - - if (middle < mainSize - 1 && - val >= xsData[middle*xsStride] && - val <= xsData[(middle+1)*xsStride]) { - left = middle; - right = middle + 1; - break; - } else if (val > xsData[middle*xsStride]) { - left = middle; - } else { - right = middle; - } - } - if (lowest) { - while (left > 0 && - val >= xsData[(left - 1) * xsStride] && - val == xsData[left * xsStride]) { - left--; - right--; - } - } - } - - Ty out = 0; - if (return_local_index) { - out = left; - } else if (mode == NEAREST) { - out = ysData[left*ysStride]; - } else { - int64_t leftys = left*ysStride; - int64_t rightys = right*ysStride; - int64_t leftxs = left*xsStride; - int64_t rightxs = right*xsStride; - if (right != left+1 || - xsData[leftxs] == xsData[rightxs]) { - out = ysData[leftys]; - } else { - Tx xLeft = xsData[leftxs]; - Tx xRight = xsData[rightxs]; - Tx yLeft = ysData[leftys]; - Tx ratio = (val - xLeft) / (xRight - xLeft); - out = ratio*(ysData[rightys] - yLeft) + yLeft; - } - } - return out; - } - -} // namespace twml -#endif diff --git a/twml/libtwml/src/lib/internal/khash.h b/twml/libtwml/src/lib/internal/khash.h deleted file mode 100644 index c9075cbbc..000000000 --- a/twml/libtwml/src/lib/internal/khash.h +++ /dev/null @@ -1,627 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - An example: - -#include "khash.h" -KHASH_MAP_INIT_INT(32, char) -int main() { - int ret, is_missing; - khiter_t k; - khash_t(32) *h = kh_init(32); - k = kh_put(32, h, 5, &ret); - kh_value(h, k) = 10; - k = kh_get(32, h, 10); - is_missing = (k == kh_end(h)); - k = kh_get(32, h, 5); - kh_del(32, h, k); - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k)) kh_value(h, k) = 1; - kh_destroy(32, h); - return 0; -} -*/ - -/* - 2013-05-02 (0.2.8): - - * Use quadratic probing. When the capacity is power of 2, stepping function - i*(i+1)/2 guarantees to traverse each bucket. It is better than double - hashing on cache performance and is more robust than linear probing. - - In theory, double hashing should be more robust than quadratic probing. - However, my implementation is probably not for large hash tables, because - the second hash function is closely tied to the first hash function, - which reduce the effectiveness of double hashing. - - Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php - - 2011-12-29 (0.2.7): - - * Minor code clean up; no actual effect. - - 2011-09-16 (0.2.6): - - * The capacity is a power of 2. This seems to dramatically improve the - speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - - - http://code.google.com/p/ulib/ - - http://nothings.org/computer/judy/ - - * Allow to optionally use linear probing which usually has better - performance for random input. Double hashing is still the default as it - is more robust to certain non-random input. - - * Added Wang's integer hash function (not used by default). This hash - function is more robust to certain non-random input. - - 2011-02-14 (0.2.5): - - * Allow to declare global functions. - - 2009-09-26 (0.2.4): - - * Improve portability - - 2008-09-19 (0.2.3): - - * Corrected the example - * Improved interfaces - - 2008-09-11 (0.2.2): - - * Improved speed a little in kh_put() - - 2008-09-10 (0.2.1): - - * Added kh_clear() - * Fixed a compiling error - - 2008-09-02 (0.2.0): - - * Changed to token concatenation which increases flexibility. - - 2008-08-31 (0.1.2): - - * Fixed a bug in kh_get(), which has not been tested previously. - - 2008-08-31 (0.1.1): - - * Added destructor -*/ - - -#ifndef __AC_KHASH_H -#define __AC_KHASH_H - -/*! - @header - - Generic hash table library. 
- */ - -#define AC_VERSION_KHASH_H "0.2.8" - -#include -#include -#include - -/* compiler specific configuration */ - -#if UINT_MAX == 0xffffffffu -typedef unsigned int khint32_t; -#elif ULONG_MAX == 0xffffffffu -typedef unsigned long khint32_t; -#endif - -#if ULONG_MAX == ULLONG_MAX -typedef unsigned long khint64_t; -#else -typedef uint64_t khint64_t; -#endif - -#ifndef kh_inline -#ifdef _MSC_VER -#define kh_inline __inline -#else -#define kh_inline inline -#endif -#endif /* kh_inline */ - -#ifndef klib_unused -#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) -#define klib_unused __attribute__ ((__unused__)) -#else -#define klib_unused -#endif -#endif /* klib_unused */ - -typedef khint32_t khint_t; -typedef khint_t khiter_t; - -#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) -#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) -#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) -#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) -#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) -#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) -#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) - -#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#ifndef kcalloc -#define kcalloc(N,Z) calloc(N,Z) -#endif -#ifndef kmalloc -#define kmalloc(Z) malloc(Z) -#endif -#ifndef krealloc -#define krealloc(P,Z) realloc(P,Z) -#endif -#ifndef kfree -#define kfree(P) free(P) -#endif - -static const double __ac_HASH_UPPER = 0.77; - -#define __KHASH_TYPE(name, khkey_t, khval_t) \ - typedef struct kh_##name##_s { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; - -#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ - extern kh_##name##_t *kh_init_##name(void); \ - extern void kh_destroy_##name(kh_##name##_t *h); \ - extern void kh_clear_##name(kh_##name##_t *h); \ - extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ - extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ - extern void kh_del_##name(kh_##name##_t *h, khint_t x); - -#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ - } \ - SCOPE void kh_destroy_##name(kh_##name##_t *h) \ - { \ - if (h) { \ - kfree((void *)h->keys); kfree(h->flags); \ - kfree((void *)h->vals); \ - kfree(h); \ - } \ - } \ - SCOPE void kh_clear_##name(kh_##name##_t *h) \ - { \ - if (h && h->flags) { \ - memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ - h->size = h->n_occupied = 0; \ - } \ - } \ - SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ - { \ - if (h->n_buckets) { \ - khint_t k, i, last, mask, step = 0; \ - mask = h->n_buckets - 1; \ - k = __hash_func(key); i = k & mask; \ - last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + (++step)) & mask; \ - if (i == last) return h->n_buckets; \ - } \ - return __ac_iseither(h->flags, i)? 
h->n_buckets : i; \ - } else return 0; \ - } \ - SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ - khint32_t *new_flags = 0; \ - khint_t j = 1; \ - { \ - kroundup32(new_n_buckets); \ - if (new_n_buckets < 4) new_n_buckets = 4; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ - else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (!new_flags) return -1; \ - memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { /* expand */ \ - khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (!new_keys) { kfree(new_flags); return -1; } \ - h->keys = new_keys; \ - if (kh_is_map) { \ - khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - if (!new_vals) { kfree(new_flags); return -1; } \ - h->vals = new_vals; \ - } \ - } /* otherwise shrink */ \ - } \ - } \ - if (j) { /* rehashing is needed */ \ - for (j = 0; j != h->n_buckets; ++j) { \ - if (__ac_iseither(h->flags, j) == 0) { \ - khkey_t key = h->keys[j]; \ - khval_t val; \ - khint_t new_mask; \ - new_mask = new_n_buckets - 1; \ - if (kh_is_map) val = h->vals[j]; \ - __ac_set_isdel_true(h->flags, j); \ - while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khint_t k, i, step = 0; \ - k = __hash_func(key); \ - i = k & new_mask; \ - while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ - __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ - { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ - if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ - } else { /* write the element and jump out of the loop */ \ - h->keys[i] = key; \ - if (kh_is_map) h->vals[i] = val; \ - break; \ - } \ - } \ - } \ - } \ - if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - kfree(h->flags); /* free the working space */ \ - h->flags = new_flags; \ - h->n_buckets = new_n_buckets; \ - h->n_occupied = h->size; \ - h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ - } \ - return 0; \ - } \ - SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ - { \ - khint_t x; \ - if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) { \ - if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ - *ret = -1; return h->n_buckets; \ - } \ - } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ - *ret = -1; return h->n_buckets; \ - } \ - } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ - { \ - khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ - x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ - if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ - else { \ - last = i; \ - while (!__ac_isempty(h->flags, i) && 
(__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + (++step)) & mask; \ - if (i == last) { x = site; break; } \ - } \ - if (x == h->n_buckets) { \ - if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ - else x = i; \ - } \ - } \ - } \ - if (__ac_isempty(h->flags, x)) { /* not present at all */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; ++h->n_occupied; \ - *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; \ - *ret = 2; \ - } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ - return x; \ - } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ - { \ - if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ - __ac_set_isdel_true(h->flags, x); \ - --h->size; \ - } \ - } - -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_PROTOTYPES(name, khkey_t, khval_t) - -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - -/* --- BEGIN OF HASH FUNCTIONS --- */ - -/*! @function - @abstract Integer hash function - @param key The integer [khint32_t] - @return The hash value [khint_t] - */ -#define kh_int_hash_func(key) (khint32_t)(key) -/*! @function - @abstract Integer comparison function - */ -#define kh_int_hash_equal(a, b) ((a) == (b)) -/*! @function - @abstract 64-bit integer hash function - @param key The integer [khint64_t] - @return The hash value [khint_t] - */ -#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) -/*! @function - @abstract 64-bit integer comparison function - */ -#define kh_int64_hash_equal(a, b) ((a) == (b)) -/*! @function - @abstract const char* hash function - @param s Pointer to a null terminated string - @return The hash value - */ -static kh_inline khint_t __ac_X31_hash_string(const char *s) -{ - khint_t h = (khint_t)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; - return h; -} -/*! @function - @abstract Another interface to const char* hash function - @param key Pointer to a null terminated string [const char*] - @return The hash value [khint_t] - */ -#define kh_str_hash_func(key) __ac_X31_hash_string(key) -/*! @function - @abstract Const char* comparison function - */ -#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) - -static kh_inline khint_t __ac_Wang_hash(khint_t key) -{ - key += ~(key << 15); - key ^= (key >> 10); - key += (key << 3); - key ^= (key >> 6); - key += ~(key << 11); - key ^= (key >> 16); - return key; -} -#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) - -/* --- END OF HASH FUNCTIONS --- */ - -/* Other convenient macros... */ - -/*! - @abstract Type of the hash table. - @param name Name of the hash table [symbol] - */ -#define khash_t(name) kh_##name##_t - -/*! @function - @abstract Initiate a hash table. - @param name Name of the hash table [symbol] - @return Pointer to the hash table [khash_t(name)*] - */ -#define kh_init(name) kh_init_##name() - -/*! @function - @abstract Destroy a hash table. 
- @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -#define kh_destroy(name, h) kh_destroy_##name(h) - -/*! @function - @abstract Reset a hash table without deallocating memory. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -#define kh_clear(name, h) kh_clear_##name(h) - -/*! @function - @abstract Resize a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param s New size [khint_t] - */ -#define kh_resize(name, h, s) kh_resize_##name(h, s) - -/*! @function - @abstract Insert a key to the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @param r Extra return code: -1 if the operation failed; - 0 if the key is present in the hash table; - 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] - @return Iterator to the inserted element [khint_t] - */ -#define kh_put(name, h, k, r) kh_put_##name(h, k, r) - -/*! @function - @abstract Retrieve a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] - */ -#define kh_get(name, h, k) kh_get_##name(h, k) - -/*! @function - @abstract Remove a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Iterator to the element to be deleted [khint_t] - */ -#define kh_del(name, h, k) kh_del_##name(h, k) - -/*! @function - @abstract Test whether a bucket contains data. - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return 1 if containing data; 0 otherwise [int] - */ -#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) - -/*! @function - @abstract Get key given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Key [type of keys] - */ -#define kh_key(h, x) ((h)->keys[x]) - -/*! @function - @abstract Get value given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Value [type of values] - @discussion For hash sets, calling this results in segfault. - */ -#define kh_val(h, x) ((h)->vals[x]) - -/*! @function - @abstract Alias of kh_val() - */ -#define kh_value(h, x) ((h)->vals[x]) - -/*! @function - @abstract Get the start iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The start iterator [khint_t] - */ -#define kh_begin(h) (khint_t)(0) - -/*! @function - @abstract Get the end iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The end iterator [khint_t] - */ -#define kh_end(h) ((h)->n_buckets) - -/*! @function - @abstract Get the number of elements in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of elements in the hash table [khint_t] - */ -#define kh_size(h) ((h)->size) - -/*! @function - @abstract Get the number of buckets in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of buckets in the hash table [khint_t] - */ -#define kh_n_buckets(h) ((h)->n_buckets) - -/*! 
@function - @abstract Iterate over the entries in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param kvar Variable to which key will be assigned - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (kvar) = kh_key(h,__i); \ - (vvar) = kh_val(h,__i); \ - code; \ - } } - -/*! @function - @abstract Iterate over the values in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -#define kh_foreach_value(h, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (vvar) = kh_val(h,__i); \ - code; \ - } } - -/* More conenient interfaces */ - -/*! @function - @abstract Instantiate a hash set containing integer keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 64-bit integer keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 64-bit integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - -typedef const char *kh_cstr_t; -/*! @function - @abstract Instantiate a hash map containing const char* keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_STR(name) \ - KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) - -/*! 
@function - @abstract Instantiate a hash map containing const char* keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_STR(name, khval_t) \ - KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) - -#endif /* __AC_KHASH_H */ diff --git a/twml/libtwml/src/lib/internal/linear_search.h b/twml/libtwml/src/lib/internal/linear_search.h deleted file mode 100644 index a3d294853..000000000 --- a/twml/libtwml/src/lib/internal/linear_search.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#ifdef __cplusplus -#include -namespace twml { - - template - static int64_t linear_search(const Tx *xsData, const Tx val, const int64_t mainSize) { - int64_t left = 0; - int64_t right = mainSize-1; - while(left <= right && val > xsData[left]) - left++; - return left; - } - -} // namespace twml -#endif diff --git a/twml/libtwml/src/lib/internal/murmur_hash3.h b/twml/libtwml/src/lib/internal/murmur_hash3.h deleted file mode 100644 index 3bdfbe486..000000000 --- a/twml/libtwml/src/lib/internal/murmur_hash3.h +++ /dev/null @@ -1,37 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -#ifndef _MURMURHASH3_H_ -#define _MURMURHASH3_H_ - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) && (_MSC_VER < 1600) - -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned __int64 uint64_t; - -// Other compilers - -#else // defined(_MSC_VER) - -#include - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH3_H_ diff --git a/twml/libtwml/src/lib/internal/thrift.h b/twml/libtwml/src/lib/internal/thrift.h deleted file mode 100644 index 4e4786219..000000000 --- a/twml/libtwml/src/lib/internal/thrift.h +++ /dev/null @@ -1,69 +0,0 @@ -// For details of how to encode and decode thrift, check -// https://github.com/apache/thrift/blob/master/doc/specs/thrift-binary-protocol.md - -// Definitions of the thrift binary format -typedef enum { - TTYPE_STOP = 0, - TTYPE_VOID = 1, - TTYPE_BOOL = 2, - TTYPE_BYTE = 3, - TTYPE_DOUBLE = 4, - TTYPE_I16 = 6, - TTYPE_I32 = 8, - TTYPE_I64 = 10, - TTYPE_STRING = 11, - TTYPE_STRUCT = 12, - TTYPE_MAP = 13, - TTYPE_SET = 14, - TTYPE_LIST = 15, - TTYPE_ENUM = 16, -} TTYPES; - -// Fields of a batch prediction response -typedef enum { - BPR_DUMMY , - BPR_PREDICTIONS, -} BPR_FIELDS; - -// Fields of a datarecord -typedef enum { - DR_CROSS , // fake field for crosses - DR_BINARY , - DR_CONTINUOUS , - DR_DISCRETE , - DR_STRING , - DR_SPARSE_BINARY , - DR_SPARSE_CONTINUOUS , - DR_BLOB , - DR_GENERAL_TENSOR , - DR_SPARSE_TENSOR , -} DR_FIELDS; - -// Fields for General tensor -typedef enum { - GT_DUMMY , // dummy field - GT_RAW , - GT_STRING , - GT_INT32 , - GT_INT64 , - GT_FLOAT , - GT_DOUBLE , - GT_BOOL , -} GT_FIELDS; - -typedef enum { - SP_DUMMY , // dummy field - SP_COO , -} 
SP_FIELDS; - -// Enum values from tensor.thrift -typedef enum { - DATA_TYPE_FLOAT , - DATA_TYPE_DOUBLE , - DATA_TYPE_INT32 , - DATA_TYPE_INT64 , - DATA_TYPE_UINT8 , - DATA_TYPE_STRING , - DATA_TYPE_BYTE , - DATA_TYPE_BOOL , -} DATA_TYPES; diff --git a/twml/libtwml/src/lib/internal/utf_converter.h b/twml/libtwml/src/lib/internal/utf_converter.h deleted file mode 100644 index b0b38fb11..000000000 --- a/twml/libtwml/src/lib/internal/utf_converter.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _UTF_CONVERTER_H_ -#define _UTF_CONVERTER_H_ - -#include -#include -#include - -ssize_t utf8_to_utf16(const uint8_t *in, uint64_t in_len, uint16_t *out, uint64_t max_out); - -#endif diff --git a/twml/libtwml/src/lib/io/IOError.cpp b/twml/libtwml/src/lib/io/IOError.cpp deleted file mode 100644 index e0a661c13..000000000 --- a/twml/libtwml/src/lib/io/IOError.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include - - -namespace twml { -namespace io { - -namespace { - std::string messageFromStatus(IOError::Status status) { - switch (status) { - case IOError::OUT_OF_RANGE: - return "failed to read enough input"; - case IOError::WRONG_MAGIC: - return "wrong magic in stream"; - case IOError::WRONG_HEADER: - return "wrong header in stream"; - case IOError::ERROR_HEADER_CHECKSUM: - return "header checksum doesn't match"; - case IOError::INVALID_METHOD: - return "using invalid method"; - case IOError::USING_RESERVED: - return "using reserved flag"; - case IOError::ERROR_HEADER_EXTRA_FIELD_CHECKSUM: - return "extra header field checksum doesn't match"; - case IOError::CANT_FIT_OUTPUT: - return "can't fit output in the given space"; - case IOError::SPLIT_FILE: - return "split files aren't supported"; - case IOError::BLOCK_SIZE_TOO_LARGE: - return "block size is too large"; - case IOError::SOURCE_LARGER_THAN_DESTINATION: - return "source is larger than destination"; - case IOError::DESTINATION_LARGER_THAN_CAPACITY: - return "destination buffer is too small to fit uncompressed result"; - case IOError::HEADER_FLAG_MISMATCH: - return "failed to match flags for compressed and decompressed data"; - case IOError::NOT_ENOUGH_INPUT: - return "not enough input to proceed with decompression"; - case IOError::ERROR_SOURCE_BLOCK_CHECKSUM: - return "source block checksum doesn't match"; - case IOError::COMPRESSED_DATA_VIOLATION: - return "error occurred while decompressing the data"; - case IOError::ERROR_DESTINATION_BLOCK_CHECKSUM: - return "destination block checksum doesn't match"; - case IOError::EMPTY_RECORD: - return "can't write an empty record"; - case IOError::MALFORMED_MEMORY_RECORD: - return "can't write malformed record"; - case IOError::UNSUPPORTED_OUTPUT_TYPE: - return "output data type is not supported"; - case IOError::OTHER_ERROR: - default: - return "unknown error occurred"; - } - } -} // namespace - -IOError::IOError(Status status): twml::Error(TWML_ERR_IO, "Found error while processing stream: " + - messageFromStatus(status)), m_status(status) {} - -} // namespace io -} // namespace twml diff --git a/twml/libtwml/src/lib/murmur_hash3.cpp b/twml/libtwml/src/lib/murmur_hash3.cpp deleted file mode 100644 index 89c9c1fc1..000000000 --- a/twml/libtwml/src/lib/murmur_hash3.cpp +++ /dev/null @@ -1,335 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. 
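This is the hash behind `twml_get_feature_id` in functions.cpp above: the feature string is converted to UTF-16, run through `MurmurHash3_x64_128`, and the first 8 bytes of the 128-bit digest become the int64 feature id. A minimal driver against the declaration from internal/murmur_hash3.h; note this sketch hashes the raw UTF-8 bytes and skips the UTF-16 conversion and the `name#value` splitting, so its output differs from what featureId() would return:

```cpp
// Minimal driver for MurmurHash3_x64_128 as declared in internal/murmur_hash3.h.
// The input string is illustrative; link against murmur_hash3.cpp above.
#include <cstdint>
#include <cstdio>
#include <cstring>

void MurmurHash3_x64_128(const void *key, const int len, const uint32_t seed, void *out);

int main() {
  const char *text = "example_feature";           // illustrative input
  uint8_t hash[16];
  MurmurHash3_x64_128(text, (int)std::strlen(text), 0, hash);

  int64_t id;                                     // same extraction step as twml_get_feature_id
  std::memcpy(&id, hash, sizeof(id));
  std::printf("hashed id: %lld\n", (long long)id);
  return 0;
}
```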
- -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. - -#include "internal/murmur_hash3.h" - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) - -#define FORCE_INLINE __forceinline - -#include - -#define ROTL32(x,y) _rotl(x,y) -#define ROTL64(x,y) _rotl64(x,y) - -#define BIG_CONSTANT(x) (x) - -// Other compilers - -#else // defined(_MSC_VER) - -#define FORCE_INLINE inline __attribute__((always_inline)) - -FORCE_INLINE uint32_t rotl32 ( uint32_t x, int8_t r ) -{ - return (x << r) | (x >> (32 - r)); -} - -FORCE_INLINE uint64_t rotl64 ( uint64_t x, int8_t r ) -{ - return (x << r) | (x >> (64 - r)); -} - -#define ROTL32(x,y) rotl32(x,y) -#define ROTL64(x,y) rotl64(x,y) - -#define BIG_CONSTANT(x) (x##LLU) - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- -// Block read - if your platform needs to do endian-swapping or can only -// handle aligned reads, do the conversion here - -FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) -{ - return p[i]; -} - -FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) -{ - return p[i]; -} - -//----------------------------------------------------------------------------- -// Finalization mix - force all bits of a hash block to avalanche - -FORCE_INLINE uint32_t fmix32 ( uint32_t h ) -{ - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - - return h; -} - -//---------- - -FORCE_INLINE uint64_t fmix64 ( uint64_t k ) -{ - k ^= k >> 33; - k *= BIG_CONSTANT(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - - return k; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 4; - - uint32_t h1 = seed; - - const uint32_t c1 = 0xcc9e2d51; - const uint32_t c2 = 0x1b873593; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); - - for(int i = -nblocks; i; i++) - { - uint32_t k1 = getblock32(blocks,i); - - k1 *= c1; - k1 = ROTL32(k1,15); - k1 *= c2; - - h1 ^= k1; - h1 = ROTL32(h1,13); - h1 = h1*5+0xe6546b64; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*4); - - uint32_t k1 = 0; - - switch(len & 3) - { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; - case 1: k1 ^= tail[0]; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; - - h1 = fmix32(h1); - - *(uint32_t*)out = h1; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_128 ( const void * key, const int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint32_t h1 = seed; - uint32_t h2 = seed; - uint32_t h3 = seed; - uint32_t h4 = seed; - - const uint32_t c1 = 0x239b961b; - const uint32_t c2 = 0xab0e9789; - const uint32_t c3 = 0x38b34ae5; - const uint32_t c4 = 0xa1e38b93; - - //---------- - // body - - const uint32_t * blocks = (const 
uint32_t *)(data + nblocks*16); - - for(int i = -nblocks; i; i++) - { - uint32_t k1 = getblock32(blocks,i*4+0); - uint32_t k2 = getblock32(blocks,i*4+1); - uint32_t k3 = getblock32(blocks,i*4+2); - uint32_t k4 = getblock32(blocks,i*4+3); - - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - - h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; - - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; - - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; - - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint32_t k1 = 0; - uint32_t k2 = 0; - uint32_t k3 = 0; - uint32_t k4 = 0; - - switch(len & 15) - { - case 15: k4 ^= tail[14] << 16; - case 14: k4 ^= tail[13] << 8; - case 13: k4 ^= tail[12] << 0; - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - case 12: k3 ^= tail[11] << 24; - case 11: k3 ^= tail[10] << 16; - case 10: k3 ^= tail[ 9] << 8; - case 9: k3 ^= tail[ 8] << 0; - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - case 8: k2 ^= tail[ 7] << 24; - case 7: k2 ^= tail[ 6] << 16; - case 6: k2 ^= tail[ 5] << 8; - case 5: k2 ^= tail[ 4] << 0; - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - case 4: k1 ^= tail[ 3] << 24; - case 3: k1 ^= tail[ 2] << 16; - case 2: k1 ^= tail[ 1] << 8; - case 1: k1 ^= tail[ 0] << 0; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - h1 = fmix32(h1); - h2 = fmix32(h2); - h3 = fmix32(h3); - h4 = fmix32(h4); - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - ((uint32_t*)out)[0] = h1; - ((uint32_t*)out)[1] = h2; - ((uint32_t*)out)[2] = h3; - ((uint32_t*)out)[3] = h4; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x64_128 ( const void * key, const int len, - const uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint64_t h1 = seed; - uint64_t h2 = seed; - - const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); - const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); - - //---------- - // body - - const uint64_t * blocks = (const uint64_t *)(data); - - for(int i = 0; i < nblocks; i++) - { - uint64_t k1 = getblock64(blocks,i*2+0); - uint64_t k2 = getblock64(blocks,i*2+1); - - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - - h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; - - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint64_t k1 = 0; - uint64_t k2 = 0; - - switch(len & 15) - { - case 15: k2 ^= ((uint64_t)tail[14]) << 48; - case 14: k2 ^= ((uint64_t)tail[13]) << 40; - case 13: k2 ^= ((uint64_t)tail[12]) << 32; - case 12: k2 ^= ((uint64_t)tail[11]) << 24; - case 11: k2 ^= ((uint64_t)tail[10]) << 16; - case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; - case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; - case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; - case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; - case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; - case 4: k1 ^= 
((uint64_t)tail[ 3]) << 24; - case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; - case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; - case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - ((uint64_t*)out)[0] = h1; - ((uint64_t*)out)[1] = h2; -} - -//----------------------------------------------------------------------------- - diff --git a/twml/libtwml/src/lib/optim.cpp b/twml/libtwml/src/lib/optim.cpp deleted file mode 100644 index 7db36c26d..000000000 --- a/twml/libtwml/src/lib/optim.cpp +++ /dev/null @@ -1,274 +0,0 @@ -#include "internal/interpolate.h" -#include "internal/error.h" -#include - -namespace twml { - template - void mdlInfer(Tensor &output_keys, Tensor &output_vals, - const Tensor &input_keys, const Tensor &input_vals, - const Tensor &bin_ids, - const Tensor &bin_vals, - const Tensor &feature_offsets, - bool return_bin_indices) { - auto okeysData = output_keys.getData(); - auto ovalsData = output_vals.getData(); - uint64_t okeysStride = output_keys.getStride(0); - uint64_t ovaluesStride = output_vals.getStride(0); - - auto ikeysData = input_keys.getData(); - auto ivalsData = input_vals.getData(); - uint64_t ikeysStride = input_keys.getStride(0); - uint64_t ivaluesStride = input_vals.getStride(0); - - auto xsData = bin_vals.getData(); - auto ysData = bin_ids.getData(); - uint64_t xsStride = bin_vals.getStride(0); - uint64_t ysStride = bin_ids.getStride(0); - - auto offsetData = feature_offsets.getData(); - - uint64_t size = input_keys.getDim(0); - uint64_t total_bins = bin_ids.getNumElements(); - uint64_t fsize = feature_offsets.getNumElements(); - - for (uint64_t i = 0; i < size; i++) { - int64_t ikey = ikeysData[i * ikeysStride] - TWML_INDEX_BASE; - T val = ivalsData[i * ivaluesStride]; - if (ikey == -1) { - ovalsData[i * ovaluesStride] = val; - continue; - } - - // Perform interpolation - uint64_t offset = offsetData[ikey]; - uint64_t next_offset = (ikey == (int64_t)(fsize - 1)) ? 
total_bins : offsetData[ikey + 1]; - uint64_t mainSize = next_offset - offset; - - const T *lxsData = xsData + offset; - const int64_t *lysData = ysData + offset; - int64_t okey = interpolation(lxsData, xsStride, - lysData, ysStride, - val, mainSize, NEAREST, 0, - return_bin_indices); - okeysData[i * okeysStride] = okey + TWML_INDEX_BASE; - ovalsData[i * ovaluesStride] = 1; - } - } - - void mdlInfer(Tensor &output_keys, Tensor &output_vals, - const Tensor &input_keys, const Tensor &input_vals, - const Tensor &bin_ids, - const Tensor &bin_vals, - const Tensor &feature_offsets, - bool return_bin_indices) { - if (input_keys.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "input_keys must be a Long Tensor"); - } - - if (output_keys.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "output_keys must be a Long Tensor"); - } - - if (bin_ids.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "bin_ids must be a Long Tensor"); - } - - if (feature_offsets.getType() != TWML_TYPE_INT64) { - throw twml::Error(TWML_ERR_TYPE, "bin_ids must be a Long Tensor"); - } - - if (input_vals.getType() != bin_vals.getType()) { - throw twml::Error(TWML_ERR_TYPE, - "Data type of input_vals does not match type of bin_vals"); - } - - if (bin_vals.getNumDims() != 1) { - throw twml::Error(TWML_ERR_SIZE, - "bin_vals must be 1 Dimensional"); - } - - if (bin_ids.getNumDims() != 1) { - throw twml::Error(TWML_ERR_SIZE, - "bin_ids must be 1 Dimensional"); - } - - if (bin_vals.getNumElements() != bin_ids.getNumElements()) { - throw twml::Error(TWML_ERR_SIZE, - "Dimensions of bin_vals and bin_ids do not match"); - } - - if (feature_offsets.getStride(0) != 1) { - throw twml::Error(TWML_ERR_SIZE, - "feature_offsets must be contiguous"); - } - - switch (input_vals.getType()) { - case TWML_TYPE_FLOAT: - twml::mdlInfer(output_keys, output_vals, - input_keys, input_vals, - bin_ids, bin_vals, feature_offsets, - return_bin_indices); - break; - case TWML_TYPE_DOUBLE: - twml::mdlInfer(output_keys, output_vals, - input_keys, input_vals, - bin_ids, bin_vals, feature_offsets, - return_bin_indices); - break; - default: - throw twml::Error(TWML_ERR_TYPE, - "Unsupported datatype for mdlInfer"); - } - } - - const int DEFAULT_INTERPOLATION_LOWEST = 0; - /** - * @param output tensor to hold linear or nearest interpolation output. - * This function does not allocate space. - * The output tensor must have space allcoated. - * @param input input tensor; size must match output. - * input is assumed to have size [batch_size, number_of_labels]. - * @param xs the bins. - * @param ys the values for the bins. - * @param mode: linear or nearest InterpolationMode. - * linear is used for isotonic calibration. - * nearest is used for MDL calibration and MDL inference. - * - * @return Returns nothing. Output is stored into the output tensor. - * - * This is used by IsotonicCalibration inference. - */ - template - void interpolation( - Tensor output, - const Tensor input, - const Tensor xs, - const Tensor ys, - const InterpolationMode mode) { - // Sanity check: input and output should have two dims. - if (input.getNumDims() != 2 || output.getNumDims() != 2) { - throw twml::Error(TWML_ERR_TYPE, - "input and output should have 2 dimensions."); - } - - // Sanity check: input and output size should match. 
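The interpolation template above reduces to a one-dimensional lookup against sorted calibration bins: the nearest bin's value for MDL calibration and inference, a linear blend of the two surrounding bins for isotonic calibration. A minimal standalone sketch of that idea only (not the twml implementation, which also handles strides, the `lowest` argument, and the bin-index return path):

```cpp
#include <cstddef>
#include <cstdio>

enum class Mode { Nearest, Linear };

// Minimal sketch: xs are sorted bin boundaries, ys the per-bin values.
// Nearest returns the closest bin's value (MDL-style); Linear blends the two
// surrounding bins (isotonic-style). Hypothetical helper, not the twml API.
double interpolate1d(const double* xs, const double* ys, std::size_t n,
                     double val, Mode mode) {
  if (val <= xs[0]) return ys[0];
  if (val >= xs[n - 1]) return ys[n - 1];
  std::size_t hi = 1;
  while (xs[hi] < val) ++hi;                // first boundary >= val
  const std::size_t lo = hi - 1;
  if (mode == Mode::Nearest) {
    return (val - xs[lo] <= xs[hi] - val) ? ys[lo] : ys[hi];
  }
  const double t = (val - xs[lo]) / (xs[hi] - xs[lo]);
  return ys[lo] + t * (ys[hi] - ys[lo]);    // linear blend
}

int main() {
  const double xs[] = {0.0, 0.5, 1.0};
  const double ys[] = {0.1, 0.6, 0.9};
  std::printf("nearest(0.7) = %.2f\n", interpolate1d(xs, ys, 3, 0.7, Mode::Nearest));  // 0.60
  std::printf("linear(0.7)  = %.2f\n", interpolate1d(xs, ys, 3, 0.7, Mode::Linear));   // 0.72
}
```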
- for (int i = 0; i < input.getNumDims(); i++) { - if (input.getDim(i) != output.getDim(i)) { - throw twml::Error(TWML_ERR_TYPE, - "input and output mismatch in size."); - } - } - - // Sanity check: number of labels in input should match - // number of labels in xs / ys. - if (input.getDim(1) != xs.getDim(0) - || input.getDim(1) != ys.getDim(0)) { - throw twml::Error(TWML_ERR_TYPE, - "input, xs, ys should have the same number of labels."); - } - - const uint64_t inputStride0 = input.getStride(0); - const uint64_t inputStride1 = input.getStride(1); - const uint64_t outputStride0 = output.getStride(0); - const uint64_t outputStride1 = output.getStride(1); - const uint64_t xsStride0 = xs.getStride(0); - const uint64_t xsStride1 = xs.getStride(1); - const uint64_t ysStride0 = ys.getStride(0); - const uint64_t ysStride1 = ys.getStride(1); - const uint64_t mainSize = xs.getDim(1); - - // for each value in the input matrix, compute output value by - // calling interpolation. - auto inputData = input.getData(); - auto outputData = output.getData(); - auto xsData = xs.getData(); - auto ysData = ys.getData(); - - for (uint64_t i = 0; i < input.getDim(0); i++) { - for (uint64_t j = 0; j < input.getDim(1); j++) { - const T val = inputData[i * inputStride0 + j * inputStride1]; - const T *lxsData = xsData + j * xsStride0; - const T *lysData = ysData + j * ysStride0; - const T res = interpolation( - lxsData, xsStride1, - lysData, ysStride1, - val, - mainSize, - mode, - DEFAULT_INTERPOLATION_LOWEST); - outputData[i * outputStride0 + j * outputStride1] = res; - } - } - } - - void linearInterpolation( - Tensor output, - const Tensor input, - const Tensor xs, - const Tensor ys) { - switch (input.getType()) { - case TWML_TYPE_FLOAT: - twml::interpolation(output, input, xs, ys, LINEAR); - break; - case TWML_TYPE_DOUBLE: - twml::interpolation(output, input, xs, ys, LINEAR); - break; - default: - throw twml::Error(TWML_ERR_TYPE, - "Unsupported datatype for linearInterpolation."); - } - } - - void nearestInterpolation( - Tensor output, - const Tensor input, - const Tensor xs, - const Tensor ys) { - switch (input.getType()) { - case TWML_TYPE_FLOAT: - twml::interpolation(output, input, xs, ys, NEAREST); - break; - case TWML_TYPE_DOUBLE: - twml::interpolation(output, input, xs, ys, NEAREST); - break; - default: - throw twml::Error(TWML_ERR_TYPE, - "Unsupported datatype for nearestInterpolation."); - } - } -} // namespace twml - -twml_err twml_optim_mdl_infer(twml_tensor output_keys, - twml_tensor output_vals, - const twml_tensor input_keys, - const twml_tensor input_vals, - const twml_tensor bin_ids, - const twml_tensor bin_vals, - const twml_tensor feature_offsets, - bool return_bin_indices) { - HANDLE_EXCEPTIONS( - using namespace twml; - mdlInfer(*getTensor(output_keys), - *getTensor(output_vals), - *getConstTensor(input_keys), - *getConstTensor(input_vals), - *getConstTensor(bin_ids), - *getConstTensor(bin_vals), - *getConstTensor(feature_offsets), - return_bin_indices);); - return TWML_ERR_NONE; -} - -twml_err twml_optim_nearest_interpolation( - twml_tensor output, - const twml_tensor input, - const twml_tensor xs, - const twml_tensor ys) { - HANDLE_EXCEPTIONS( - using namespace twml; - nearestInterpolation(*getTensor(output), - *getConstTensor(input), - *getConstTensor(xs), - *getConstTensor(ys));); - return TWML_ERR_NONE; -} diff --git a/twml/libtwml/src/lib/utf_converter.cpp b/twml/libtwml/src/lib/utf_converter.cpp deleted file mode 100644 index 5c943f3e3..000000000 --- 
a/twml/libtwml/src/lib/utf_converter.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include "internal/utf_converter.h" - -ssize_t utf8_to_utf16(const uint8_t *in, uint64_t in_len, uint16_t *out, uint64_t max_out) { - uint64_t num_out = 0; - uint64_t num_in = 0; - while (num_in < in_len) { - uint32_t uni; - uint64_t todo; - uint8_t ch = in[num_in]; - num_in++; - if (ch <= 0x7F) { - uni = ch; - todo = 0; - } else if (ch <= 0xBF) { - return -1; - } else if (ch <= 0xDF) { - uni = ch & 0x1F; - todo = 1; - } else if (ch <= 0xEF) { - uni = ch & 0x0F; - todo = 2; - } else if (ch <= 0xF7) { - uni = ch & 0x07; - todo = 3; - } else { - return -1; - } - for (uint64_t j = 0; j < todo; ++j) { - if (num_in == in_len) return -1; - uint8_t ch = in[num_in]; - num_in++; - if (ch < 0x80 || ch > 0xBF) return -1; - uni <<= 6; - uni += ch & 0x3F; - } - if (uni >= 0xD800 && uni <= 0xDFFF) return -1; - if (uni > 0x10FFFF) return -1; - if (uni <= 0xFFFF) { - if (num_out == max_out) return -1; - out[num_out] = uni; - num_out++; - } else { - uni -= 0x10000; - if (num_out + 1 >= max_out) return -1; - out[num_out] = (uni >> 10) + 0xD800; - out[num_out + 1] = (uni & 0x3FF) + 0xDC00; - num_out += 2; - } - } - if (num_out == max_out) return -1; - out[num_out] = 0; - return num_out; -} diff --git a/twml/libtwml/src/ops/CMakeLists.txt b/twml/libtwml/src/ops/CMakeLists.txt deleted file mode 100644 index e2feaff23..000000000 --- a/twml/libtwml/src/ops/CMakeLists.txt +++ /dev/null @@ -1,79 +0,0 @@ -set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}) -cmake_minimum_required(VERSION 2.8 FATAL_ERROR) -cmake_policy(VERSION 2.8) -set(CMAKE_MACOSX_RPATH 1) - -file(GLOB_RECURSE sources *.cpp) - -set (CMAKE_CXX_FLAGS "-Wall -std=c++11 -fno-stack-protector ${CMAKE_CXX_FLAGS}") - -execute_process( - COMMAND - $ENV{LIBTWML_HOME}/src/ops/scripts/get_inc.sh - RESULT_VARIABLE - TF_RES - OUTPUT_VARIABLE - TF_INC) - -if (NOT (${TF_RES} EQUAL "0")) - message(${TF_RES}) - message(FATAL_ERROR "Failed to get include path for tensorflow") -endif() - -execute_process( - COMMAND - $ENV{LIBTWML_HOME}/src/ops/scripts/get_lib.sh - RESULT_VARIABLE - TF_RES - OUTPUT_VARIABLE - TF_LIB) - -if (NOT (${TF_RES} EQUAL "0")) - message(${TF_RES}) - message(FATAL_ERROR "Failed to get lib path for tensorflow") -endif() - -find_path( - TWML_INC - NAMES "twml.h" - PATHS $ENV{LIBTWML_HOME}/include) - -add_library(twml_tf MODULE ${sources}) - -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "$ENV{LIBTWML_HOME}/cmake") - -if (UNIX) - if (APPLE) - set (CMAKE_CXX_FLAGS "-undefined dynamic_lookup -stdlib=libc++ ${CMAKE_CXX_FLAGS}") - # -Wl,-all_load ensures symbols not used by twml_tf are also included. - # -Wl,-noall_load limits the scope of the previous flag. - set (LINK_ALL_OPTION "-Wl,-all_load") - set (NO_LINK_ALL_OPTION "-Wl,-noall_load") - set(TF_FRAMEWORK_LIB ${TF_LIB}/libtensorflow_framework.1.dylib) - else() - # -Wl,--whole-archive ensures symbols not used by twml_tf are also included. - # -Wl,--no-whole-archive limits the scope of the previous flag. - set (LINK_ALL_OPTION "-Wl,--whole-archive") - set (NO_LINK_ALL_OPTION "-Wl,--no-whole-archive") - set(TF_FRAMEWORK_LIB ${TF_LIB}/libtensorflow_framework.so.1) - endif() -endif() - - -target_include_directories( - twml_tf - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - ${TWML_INC} - # TF_INC needs to be the last to avoid some weird white-spacing issues with generated Makefile. - ${TF_INC} # Needed because of some header files auto-generated during build time. 
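The `utf8_to_utf16` routine from utf_converter.cpp above returns the number of UTF-16 code units written (and NUL-terminates the output), or -1 on malformed input or an undersized output buffer. A hedged usage sketch, assuming it is compiled and linked together with that source file; the prototype is repeated from the deleted header:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>
#include <sys/types.h>  // ssize_t
#include <vector>

// Prototype of the routine defined in the deleted utf_converter.cpp above.
ssize_t utf8_to_utf16(const uint8_t *in, uint64_t in_len, uint16_t *out, uint64_t max_out);

int main() {
  // "héllo" in UTF-8: 'h', 0xC3 0xA9, 'l', 'l', 'o' (6 bytes, 5 code points).
  const std::string utf8 = "h\xc3\xa9llo";
  // One UTF-16 code unit per input byte plus the trailing NUL is a safe upper bound.
  std::vector<uint16_t> utf16(utf8.size() + 1);
  const ssize_t n = utf8_to_utf16(reinterpret_cast<const uint8_t*>(utf8.data()),
                                  utf8.size(), utf16.data(), utf16.size());
  if (n < 0) {
    std::fprintf(stderr, "invalid UTF-8 or output buffer too small\n");
    return 1;
  }
  std::printf("decoded %zd UTF-16 code units\n", n);  // 5
  return 0;
}
```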
- ${TF_INC}/external/nsync/public/ - ) - -target_link_libraries(twml_tf - PUBLIC - # Since we are using twml_tf as the "one" dynamic library, - # we want it to have the C function symbols needed for other functions as well. - ${LINK_ALL_OPTION} twml ${NO_LINK_ALL_OPTION} - ${TF_FRAMEWORK_LIB} - ) diff --git a/twml/libtwml/src/ops/add1.cpp b/twml/libtwml/src/ops/add1.cpp deleted file mode 100644 index 66281841a..000000000 --- a/twml/libtwml/src/ops/add1.cpp +++ /dev/null @@ -1,92 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -using namespace tensorflow; - -REGISTER_OP("Add1") -.Attr("T: {float, double, int32}") -.Input("input1: T") -.Output("output: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }); - - -template -class Add1 : public OpKernel { - public: - explicit Add1(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - - // Create an output tensor - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(), - &output_tensor)); - auto output_flat = output_tensor->flat(); - - // Add 1 to input and assign to output - const int N = input.size(); - for (int i = 0; i < N; i++) { - output_flat(i) = input(i) + 1; - } - } -}; - - -REGISTER_OP("Add1Grad") -.Attr("T: {float, double, int32}") -.Input("grad_output: T") -.Output("grad_input: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }); - -template -class Add1Grad : public OpKernel { - public: - explicit Add1Grad(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& grad_output_tensor = context->input(0); - auto grad_output = grad_output_tensor.flat(); - - // Create an grad_input tensor - Tensor* grad_input_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, grad_output_tensor.shape(), - &grad_input_tensor)); - - auto grad_input_flat = grad_input_tensor->flat(); - - // Copy from grad_output to grad_input - const int N = grad_output.size(); - for (int i = 0; i < N; i++) { - grad_input_flat(i) = grad_output(i); - } - } -}; - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("Add1") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - Add1); \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("Add1Grad") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - Add1Grad); \ - -REGISTER(float); -REGISTER(double); -REGISTER(int32); diff --git a/twml/libtwml/src/ops/batch_prediction_request.cpp b/twml/libtwml/src/ops/batch_prediction_request.cpp deleted file mode 100644 index a83c3ebcf..000000000 --- a/twml/libtwml/src/ops/batch_prediction_request.cpp +++ /dev/null @@ -1,183 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -REGISTER_OP("DecodeAndHashBatchPredictionRequest") -.Input("input_bytes: uint8") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("decode_mode: int = 0") -.Output("hashed_data_record_handle: resource") 
-.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that decodes batch prediction request and creates a handle to the batch of hashed data records. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - decode_mode: integer, indicates which decoding method to use. Let a sparse continuous - have a feature_name and a dict of {name: value}. 0 indicates feature_ids are computed - as hash(name). 1 indicates feature_ids are computed as hash(feature_name, name) - shared_name: name used by the resource handle inside the resource manager. - container: name used by the container of the resources. - -shared_name and container are required when inheriting from ResourceOpKernel. - -Input - input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest. - -Outputs - hashed_data_record_handle: A resource handle to the HashedDataRecordResource containing batch of HashedDataRecords. -)doc"); - -class DecodeAndHashBatchPredictionRequest : public OpKernel { - public: - explicit DecodeAndHashBatchPredictionRequest(OpKernelConstruction* context) - : OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode)); - - OP_REQUIRES(context, keep_features.size() == keep_codes.size(), - errors::InvalidArgument("keep keys and values must have same size.")); - -#ifdef USE_DENSE_HASH - m_keep_map.set_empty_key(0); -#endif // USE_DENSE_HASH - - for (uint64_t i = 0; i < keep_features.size(); i++) { - m_keep_map[keep_features[i]] = keep_codes[i]; - } - } - - private: - twml::Map m_keep_map; - int64 m_decode_mode; - - void Compute(OpKernelContext* context) override { - try { - HashedDataRecordResource *resource = nullptr; - OP_REQUIRES_OK(context, makeResourceHandle(context, 0, &resource)); - - // Store the input bytes in the resource so it isnt freed before the resource. - // This is necessary because we are not copying the contents for tensors. - resource->input = context->input(0); - const uint8_t *input_bytes = resource->input.flat().data(); - twml::HashedDataRecordReader reader; - twml::HashedBatchPredictionRequest bpr; - reader.setKeepMap(&m_keep_map); - reader.setBuffer(input_bytes); - reader.setDecodeMode(m_decode_mode); - bpr.decode(reader); - - resource->common = std::move(bpr.common()); - resource->records = std::move(bpr.requests()); - - // Each datarecord has a copy of common features. 
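The `decode_mode` attribute documented above chooses how feature ids are derived for sparse continuous features: mode 0 hashes only the inner key name, while mode 1 hashes the enclosing feature name together with the inner key, so identical inner keys under different features no longer collide. A rough sketch of that distinction, with `std::hash` standing in for twml's real hashing; the helpers below are illustrative, not the library API:

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <string>

// Stand-in 64-bit hash; the real reader uses twml's own hashing utilities.
uint64_t h64(const std::string& s) { return std::hash<std::string>{}(s); }

// decode_mode == 0: the id depends only on the inner key name.
uint64_t feature_id_mode0(const std::string& /*feature_name*/, const std::string& name) {
  return h64(name);
}

// decode_mode == 1: the id also folds in the enclosing feature name.
uint64_t feature_id_mode1(const std::string& feature_name, const std::string& name) {
  return h64(feature_name + "\x1f" + name);  // any injective combination works for the sketch
}

int main() {
  // The same inner key "age" under two different sparse continuous features:
  const bool collide0 = feature_id_mode0("user", "age") == feature_id_mode0("author", "age");
  const bool collide1 = feature_id_mode1("user", "age") == feature_id_mode1("author", "age");
  std::printf("mode 0 collides: %d, mode 1 collides: %d\n", collide0, collide1);  // 1, 0
}
```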
- // Initialize total_size by common_size * num_records - int64 common_size = static_cast(resource->common.totalSize()); - int64 num_records = static_cast(resource->records.size()); - int64 total_size = common_size * num_records; - for (const auto &record : resource->records) { - total_size += static_cast(record.totalSize()); - } - - resource->total_size = total_size; - resource->num_labels = 0; - resource->num_weights = 0; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("DecodeAndHashBatchPredictionRequest").Device(DEVICE_CPU), - DecodeAndHashBatchPredictionRequest); - -REGISTER_OP("DecodeBatchPredictionRequest") -.Input("input_bytes: uint8") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Output("data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that decodes batch prediction request and creates a handle to the batch of data records. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - shared_name: name used by the resource handle inside the resource manager. - container: name used by the container of the resources. - -shared_name and container are required when inheriting from ResourceOpKernel. - -Input - input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest. - -Outputs - data_record_handle: A resource handle to the DataRecordResource containing batch of DataRecords. -)doc"); - -class DecodeBatchPredictionRequest : public OpKernel { - public: - explicit DecodeBatchPredictionRequest(OpKernelConstruction* context) - : OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - - OP_REQUIRES(context, keep_features.size() == keep_codes.size(), - errors::InvalidArgument("keep keys and values must have same size.")); - -#ifdef USE_DENSE_HASH - m_keep_map.set_empty_key(0); -#endif // USE_DENSE_HASH - - for (uint64_t i = 0; i < keep_features.size(); i++) { - m_keep_map[keep_features[i]] = keep_codes[i]; - } - } - - private: - twml::Map m_keep_map; - - void Compute(OpKernelContext* context) override { - try { - DataRecordResource *resource = nullptr; - OP_REQUIRES_OK(context, makeResourceHandle(context, 0, &resource)); - - // Store the input bytes in the resource so it isnt freed before the resource. - // This is necessary because we are not copying the contents for tensors. 
- resource->input = context->input(0); - const uint8_t *input_bytes = resource->input.flat().data(); - twml::DataRecordReader reader; - twml::BatchPredictionRequest bpr; - reader.setKeepMap(&m_keep_map); - reader.setBuffer(input_bytes); - bpr.decode(reader); - - resource->common = std::move(bpr.common()); - resource->records = std::move(bpr.requests()); - - resource->num_weights = 0; - resource->num_labels = 0; - resource->keep_map = &m_keep_map; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("DecodeBatchPredictionRequest").Device(DEVICE_CPU), - DecodeBatchPredictionRequest); diff --git a/twml/libtwml/src/ops/batch_prediction_request_v2.cpp b/twml/libtwml/src/ops/batch_prediction_request_v2.cpp deleted file mode 100644 index 3e89c9a0a..000000000 --- a/twml/libtwml/src/ops/batch_prediction_request_v2.cpp +++ /dev/null @@ -1,224 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -#include - -template -class DecodeBatchPredictionRequestKernel : public OpKernel { - public: - explicit DecodeBatchPredictionRequestKernel(OpKernelConstruction* context) - : OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - - std::vector label_features; - std::vector weight_features; - - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - - OP_REQUIRES_OK(context, context->GetAttr("label_features", &label_features)); - OP_REQUIRES_OK(context, context->GetAttr("weight_features", &weight_features)); - OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode)); - - OP_REQUIRES(context, keep_features.size() == keep_codes.size(), - errors::InvalidArgument("keep keys and values must have same size.")); - -#ifdef USE_DENSE_HASH - m_keep_map.set_empty_key(0); - m_labels_map.set_empty_key(0); - m_weights_map.set_empty_key(0); -#endif // USE_DENSE_HASH - - for (uint64_t i = 0; i < keep_features.size(); i++) { - m_keep_map[keep_features[i]] = keep_codes[i]; - } - - for (uint64_t i = 0; i < label_features.size(); i++) { - m_labels_map[label_features[i]] = i; - } - - for (uint64_t i = 0; i < weight_features.size(); i++) { - m_weights_map[weight_features[i]] = i; - } - } - - protected: - twml::Map m_keep_map; - twml::Map m_labels_map; - twml::Map m_weights_map; - int64 m_decode_mode; - - template - void Decode(OpKernelContext* context, ResourceType *resource) { - resource->input = context->input(0); - const uint8_t *input_bytes = getInputBytes(resource->input, 0); - int num_labels = static_cast(m_labels_map.size()); - int num_weights = static_cast(m_weights_map.size()); - - typename RecordType::Reader reader; - twml::GenericBatchPredictionRequest bpr(num_labels, num_weights); - - reader.setKeepMap(&m_keep_map); - reader.setLabelsMap(&m_labels_map); - reader.setBuffer(input_bytes); - reader.setDecodeMode(m_decode_mode); - // Do not set weight map if it is empty. This will take a faster path. 
- if (num_weights != 0) { - reader.setWeightsMap(&m_weights_map); - } - bpr.decode(reader); - - resource->common = std::move(bpr.common()); - resource->records = std::move(bpr.requests()); - - resource->num_labels = num_labels; - resource->num_weights = num_weights; - } -}; - - -REGISTER_OP("DecodeAndHashBatchPredictionRequestV2") -.Attr("InputType: {uint8, string}") -.Input("input_bytes: InputType") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("label_features: list(int)") -.Attr("weight_features: list(int) = []") -.Attr("decode_mode: int = 0") -.Output("hashed_data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that decodes a list/batch of data records and creates a handle to the batch of hashed data records. - -Compared to DecodeAndHashBatchPredictionRequest, DecodeAndHashBatchPredictionRequestV2 is used for training instead -of serving. Thus label_features and weight_features[optional] must be passed, and labels and weights are extracted in -the output. -DecodeAndHashBatchPredictionRequestV2 controls what DataRecords we want to process together in a batch in training. -For instance, we can put all instances for a query in the same batch when training a ranking model. -Notice that this OP was added separately to make sure we would not break the API for DecodeAndHashBatchPredictionRequest. -It requires some discussions if we merge the two ops into a single .cpp file in a future API revision. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - label_features: list of feature ids representing the labels. - weight_features: list of feature ids representing the weights. Defaults to empty list. - decode_mode: integer, indicates which decoding method to use. Let a sparse continuous - have a feature_name and a dict of {name: value}. 0 indicates feature_ids are computed - as hash(name). 1 indicates feature_ids are computed as hash(feature_name, name) - -Input - input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest. - -Outputs - hashed_data_record_handle: A resource handle to the HashedDataRecordResource containing batch of HashedDataRecords. -)doc"); - -template -class DecodeAndHashBatchPredictionRequestV2 : - public DecodeBatchPredictionRequestKernel { - -public: - DecodeAndHashBatchPredictionRequestV2(OpKernelConstruction *context) - : DecodeBatchPredictionRequestKernel(context) { - } - - private: - void Compute(OpKernelContext* context) override { - try { - HashedDataRecordResource *resource = nullptr; - OP_REQUIRES_OK( - context, - makeResourceHandle(context, 0, &resource)); - - this->Decode(context, resource); - - // Each datarecord has a copy of common features. 
- // Initialize total_size by common_size * num_records - int64 common_size = static_cast(resource->common.totalSize()); - int64 num_records = static_cast(resource->records.size()); - int64 total_size = common_size * num_records; - for (const auto &record : resource->records) { - total_size += static_cast(record.totalSize()); - } - - resource->total_size = total_size; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("DecodeBatchPredictionRequestV2") -.Attr("InputType: {uint8, string}") -.Input("input_bytes: InputType") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("label_features: list(int)") -.Attr("weight_features: list(int) = []") -.Attr("decode_mode: int = 0") -.Output("data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that decodes batch prediction request and creates a handle to the batch of data records. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - shared_name: name used by the resource handle inside the resource manager. - label_features: list of feature ids representing the labels. - weight_features: list of feature ids representing the weights. Defaults to empty list. - decode_mode: reserved, do not use. - -Input - input_bytes: Input tensor containing the serialized batch of BatchPredictionRequest. - -Outputs - data_record_handle: A resource handle to the DataRecordResource containing batch of DataRecords. -)doc"); - - -template -class DecodeBatchPredictionRequestV2 : - public DecodeBatchPredictionRequestKernel { -public: - DecodeBatchPredictionRequestV2(OpKernelConstruction *context) - : DecodeBatchPredictionRequestKernel(context) { - } - -private: - void Compute(OpKernelContext* context) override { - try { - DataRecordResource *resource = nullptr; - OP_REQUIRES_OK( - context, - makeResourceHandle(context, 0, &resource)); - this->Decode(context, resource); - resource->keep_map = &(this->m_keep_map); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -#define REGISTER_DECODE_OPS(InputType) \ - REGISTER_KERNEL_BUILDER( \ - Name("DecodeAndHashBatchPredictionRequestV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("InputType"), \ - DecodeAndHashBatchPredictionRequestV2); \ - REGISTER_KERNEL_BUILDER( \ - Name("DecodeBatchPredictionRequestV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("InputType"), \ - DecodeBatchPredictionRequestV2); \ - -REGISTER_DECODE_OPS(uint8) -REGISTER_DECODE_OPS(string) diff --git a/twml/libtwml/src/ops/batch_prediction_response_writer.cpp b/twml/libtwml/src/ops/batch_prediction_response_writer.cpp deleted file mode 100644 index 4876dd48a..000000000 --- a/twml/libtwml/src/ops/batch_prediction_response_writer.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("BatchPredictionResponseWriter") -.Attr("T: {float, double}") -.Input("keys: int64") -.Input("values: T") -.Output("result: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that packages keys and values into a BatchPredictionResponse. - -values: input feature value. 
(float/double) -keys: feature ids from the original BatchPredictionRequest. (int64) - -Outputs - bytes: output BatchPredictionRequest serialized using Thrift into a uint8 tensor. -)doc"); - -template -class BatchPredictionResponseWriter : public OpKernel { - public: - explicit BatchPredictionResponseWriter(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - const Tensor& keys = context->input(0); - const Tensor& values = context->input(1); - - try { - // Ensure the inner dimension matches. - if (values.dim_size(values.dims() - 1) != keys.dim_size(keys.dims() - 1)) { - throw std::runtime_error("The sizes of keys and values need to match"); - } - - // set inputs as twml::Tensor - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - const twml::Tensor in_values_ = TFTensor_to_twml_tensor(values); - // no tensors in this op - const twml::Tensor dummy_dense_keys_; - const std::vector dummy_dense_values_; - - // call constructor BatchPredictionResponse - twml::BatchPredictionResponse tempResult( - in_keys_, in_values_, dummy_dense_keys_, dummy_dense_values_); - - // determine the length of the result - int len = tempResult.encodedSize(); - TensorShape result_shape = {1, len}; - - // Create an output tensor, the size is determined by the content of input. - Tensor* result = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, result_shape, - &result)); - twml::Tensor out_result = TFTensor_to_twml_tensor(*result); - - // Call writer of BatchPredictionResponse - tempResult.write(out_result); - } catch(const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("BatchPredictionResponseWriter") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - BatchPredictionResponseWriter); \ - -REGISTER(float); -REGISTER(double); diff --git a/twml/libtwml/src/ops/batch_prediction_tensor_response_writer.cpp b/twml/libtwml/src/ops/batch_prediction_tensor_response_writer.cpp deleted file mode 100644 index b98d23206..000000000 --- a/twml/libtwml/src/ops/batch_prediction_tensor_response_writer.cpp +++ /dev/null @@ -1,81 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("BatchPredictionTensorResponseWriter") -.Attr("T: list({string, int32, int64, float, double})") -.Input("keys: int64") -.Input("values: T") -.Output("result: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that packages keys and dense tensors into a BatchPredictionResponse. - -values: list of tensors -keys: feature ids from the original BatchPredictionRequest. (int64) - -Outputs - bytes: output BatchPredictionRequest serialized using Thrift into a uint8 tensor. 
-)doc"); - -class BatchPredictionTensorResponseWriter : public OpKernel { - public: - explicit BatchPredictionTensorResponseWriter(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - const Tensor& keys = context->input(0); - - try { - // set keys as twml::Tensor - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - - // check sizes - uint64_t num_keys = in_keys_.getNumElements(); - uint64_t num_values = context->num_inputs() - 1; - - OP_REQUIRES(context, num_values % num_keys == 0, - errors::InvalidArgument("Number of dense tensors not multiple of dense keys")); - - // set dense tensor values - std::vector in_values_; - for (int i = 1; i < context->num_inputs(); i++) { - in_values_.push_back(TFTensor_to_twml_raw_tensor(context->input(i))); - } - - // no continuous predictions in this op, only tensors - const twml::Tensor dummy_cont_keys_; - const twml::Tensor dummy_cont_values_; - - // call constructor BatchPredictionResponse - twml::BatchPredictionResponse tempResult( - dummy_cont_keys_, dummy_cont_values_, in_keys_, in_values_); - - // determine the length of the result - int len = tempResult.encodedSize(); - TensorShape result_shape = {1, len}; - - // Create an output tensor, the size is determined by the content of input. - Tensor* result = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, result_shape, - &result)); - twml::Tensor out_result = TFTensor_to_twml_tensor(*result); - - // Call writer of BatchPredictionResponse - tempResult.write(out_result); - } catch(const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("BatchPredictionTensorResponseWriter").Device(DEVICE_CPU), - BatchPredictionTensorResponseWriter); diff --git a/twml/libtwml/src/ops/binary_sparse_dense_matmul.cpp b/twml/libtwml/src/ops/binary_sparse_dense_matmul.cpp deleted file mode 100644 index 0a7f02af3..000000000 --- a/twml/libtwml/src/ops/binary_sparse_dense_matmul.cpp +++ /dev/null @@ -1,330 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// TWML modified to optimize binary features: -// - Sparse tensor values are assumed to be binary, so only add operation is done -// rather than mul-add; -// - In house version of vectorization is used instead of Eigen; -// - Enable sharding and multithreading. - -#define EIGEN_USE_THREADS - -#include "binary_sparse_dense_matmul.h" -#include "binary_sparse_dense_matmul_impl.h" - -#include "tensorflow/core/framework/bounds_check.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/shape_inference.h" - -namespace tensorflow { - -namespace shape_inference { -// TODO: The `a_value` is supposed to be all ones. 
-// Users should not call this op directly but to use it from `sparse_op` python library. -// To make it consistent with original op, the signature remains the same currently, -// we will think a better way to contrain correct use of this op. -// CX-18174 -REGISTER_OP("BinarySparseTensorDenseMatMul") - .Input("a_indices: Tindices") - .Input("a_values: T") - .Input("a_shape: int64") - .Input("b: T") - .Output("product: T") - .Attr("T: type") - .Attr("Tindices: {int32,int64} = DT_INT64") - .Attr("adjoint_a: bool = false") - .Attr("adjoint_b: bool = false") - .SetShapeFn([](InferenceContext* c) { - DimensionHandle unused_dim; - ShapeHandle unused; - ShapeHandle b; - ShapeHandle a_shape; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused)); // a_indices - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused)); // a_values - TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &a_shape)); - TF_RETURN_IF_ERROR(c->WithRank(a_shape, 2, &a_shape)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &b)); - - bool adjoint_a; - bool adjoint_b; - TF_RETURN_IF_ERROR(c->GetAttr("adjoint_a", &adjoint_a)); - TF_RETURN_IF_ERROR(c->GetAttr("adjoint_b", &adjoint_b)); - - DimensionHandle output_right = c->Dim(b, adjoint_b ? 0 : 1); - DimensionHandle output_left = c->Dim(a_shape, adjoint_a ? 1 : 0); - DimensionHandle inner_left = c->Dim(a_shape, adjoint_a ? 0 : 1); - DimensionHandle inner_right = c->Dim(b, adjoint_b ? 1 : 0); - TF_RETURN_IF_ERROR(c->Merge(inner_left, inner_right, &unused_dim)); - c->set_output(0, c->Matrix(output_left, output_right)); - return Status::OK(); - }); -} // namespace shape_inference - - -typedef Eigen::ThreadPoolDevice CPUDevice; - -template -class BinarySparseTensorDenseMatMulOp : public OpKernel { - public: - explicit BinarySparseTensorDenseMatMulOp(OpKernelConstruction* ctx) - : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint_a", &adjoint_a_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint_b", &adjoint_b_)); - } - - void Compute(OpKernelContext* ctx) override { - const Tensor* a_indices; - const Tensor* a_values; - const Tensor* a_shape; - const Tensor* b; - OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices)); - OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values)); - OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape)); - OP_REQUIRES_OK(ctx, ctx->input("b", &b)); - - // Check that the dimensions of the two matrices are valid. - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b->shape()), - errors::InvalidArgument("Tensor 'b' is not a matrix")); - - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()), - errors::InvalidArgument("Tensor 'a_shape' is not a vector")); - - OP_REQUIRES( - ctx, a_shape->NumElements() == 2, - errors::InvalidArgument("Tensor 'a_shape' must have 2 elements")); - - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values->shape()), - errors::InvalidArgument("Tensor 'a_values' is not a vector")); - - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()), - errors::InvalidArgument("Tensor 'a_indices' is not a matrix")); - - const int64 nnz = a_indices->shape().dim_size(0); - OP_REQUIRES(ctx, nnz == a_values->NumElements(), - errors::InvalidArgument("Number of rows of a_indices does not " - "match number of entries in a_values")); - - OP_REQUIRES( - ctx, a_indices->shape().dim_size(1) == a_shape->NumElements(), - errors::InvalidArgument("Number of columns of a_indices does not match " - "number of entries in a_shape")); - - auto a_shape_t = a_shape->vec(); - const int64 outer_left = (adjoint_a_) ? 
a_shape_t(1) : a_shape_t(0); - const int64 outer_right = - (adjoint_b_) ? b->shape().dim_size(0) : b->shape().dim_size(1); - const int64 inner_left = (adjoint_a_) ? a_shape_t(0) : a_shape_t(1); - const int64 inner_right = - (adjoint_b_) ? b->shape().dim_size(1) : b->shape().dim_size(0); - - OP_REQUIRES( - ctx, inner_right == inner_left, - errors::InvalidArgument( - "Cannot multiply A and B because inner dimension does not match: ", - inner_left, " vs. ", inner_right, - ". Did you forget a transpose? " - "Dimensions of A: [", - a_shape_t(0), ", ", a_shape_t(1), - "). Dimensions of B: ", b->shape().DebugString())); - - TensorShape out_shape({outer_left, outer_right}); - Tensor* out = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); - - if (out->NumElements() == 0) { - // If a has shape [0, x] or b has shape [x, 0], the output shape - // is a 0-element matrix, so there is nothing to do. - return; - } - - if (a_values->NumElements() == 0 || b->NumElements() == 0) { - // If a has shape [x, 0] and b has shape [0, y], the - // output shape is [x, y] where x and y are non-zero, so we fill - // the output with zeros. - out->flat().device(ctx->eigen_device()) = - out->flat().constant(T(0)); - return; - } - -#define MAYBE_ADJOINT(ADJ_A, ADJ_B) \ - if (adjoint_a_ == ADJ_A && adjoint_b_ == ADJ_B) { \ - Status functor_status = functor::SparseTensorDenseMatMulFunctor< \ - Device, T, Tindices, ADJ_A, \ - ADJ_B>::Compute(ctx, a_indices, a_values, a_shape, b, out); \ - OP_REQUIRES_OK(ctx, functor_status); \ - } - - MAYBE_ADJOINT(false, false); - MAYBE_ADJOINT(false, true); - MAYBE_ADJOINT(true, false); - MAYBE_ADJOINT(true, true); - -#undef MAYBE_ADJOINT - } - - private: - bool adjoint_a_; - bool adjoint_b_; -}; - -#define REGISTER_CPU(TypeT, TypeIndex) \ - REGISTER_KERNEL_BUILDER( \ - Name("BinarySparseTensorDenseMatMul") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices") \ - .HostMemory("a_shape"), \ - BinarySparseTensorDenseMatMulOp); - -#define REGISTER_KERNELS_CPU(T) \ - REGISTER_CPU(T, int64); \ - REGISTER_CPU(T, int32) - -REGISTER_KERNELS_CPU(float); -REGISTER_KERNELS_CPU(double); -REGISTER_KERNELS_CPU(int32); -REGISTER_KERNELS_CPU(complex64); -REGISTER_KERNELS_CPU(complex128); - -namespace functor { - -namespace { -Status KOutOfBoundsError(int64 k, std::size_t i, int rhs_index_a, - std::size_t lhs_right) { - return errors::InvalidArgument("k (", k, ") from index[", i, ",", rhs_index_a, - "] out of bounds (>=", lhs_right, ")"); -} - -Status MOutOfBoundsError(int64 m, std::size_t i, int lhs_index_a, - int64 out_dim0) { - return errors::InvalidArgument("m (", m, ") from index[", i, ",", lhs_index_a, - "] out of bounds (>=", out_dim0, ")"); -} - -} // namespace - - -// The general functor just borrows the code from tf except that add is computed -// instead of mul-add. -template -struct SparseTensorDenseMatMulFunctor { - // Vectorize certain operations above this size. 
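The MAYBE_ADJOINT macro just above converts the two runtime adjoint flags into one of four compile-time instantiations of the functor. A stripped-down sketch of that dispatch idiom, detached from TensorFlow:

```cpp
#include <cstdio>

// Two runtime booleans select one of four compile-time specializations,
// mirroring what MAYBE_ADJOINT expands to around the functor's Compute call.
template <bool ADJ_A, bool ADJ_B>
void compute_impl() { std::printf("ADJ_A=%d ADJ_B=%d\n", ADJ_A, ADJ_B); }

void compute(bool adjoint_a, bool adjoint_b) {
  if (!adjoint_a && !adjoint_b)      compute_impl<false, false>();
  else if (!adjoint_a && adjoint_b)  compute_impl<false, true>();
  else if (adjoint_a && !adjoint_b)  compute_impl<true, false>();
  else                               compute_impl<true, true>();
}

int main() { compute(false, true); }  // prints ADJ_A=0 ADJ_B=1
```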
- static const std::size_t kNumVectorize = 32; - - static Status Compute(OpKernelContext* ctx, - const Tensor *a_indices, - const Tensor *a_values, - const Tensor *a_shape, - const Tensor *b, - Tensor *out) { - return EigenCompute(ctx->eigen_device(), out->matrix(), - a_indices->matrix(), a_values->vec(), - b->matrix()); - } - - static Status EigenCompute(const CPUDevice& d, typename TTypes::Matrix out, - typename TTypes::ConstMatrix a_indices, - typename TTypes::ConstVec a_values, - typename TTypes::ConstMatrix b) { - const std::size_t nnz = a_values.size(); - const std::size_t rhs_right = (ADJ_B ? b.dimension(0) : b.dimension(1)); - const std::size_t lhs_right = (ADJ_B ? b.dimension(1) : b.dimension(0)); - const int lhs_index_a = ADJ_A ? 1 : 0; - const int rhs_index_a = ADJ_A ? 0 : 1; - - out.setZero(); - - if (rhs_right < kNumVectorize) { - // Disable vectorization if the RHS of output is too small - auto maybe_adjoint_b = MaybeAdjoint(b); - - for (std::size_t i = 0; i < nnz; ++i) { - const Tindices m = internal::SubtleMustCopy(a_indices(i, lhs_index_a)); - const Tindices k = internal::SubtleMustCopy(a_indices(i, rhs_index_a)); - if (!FastBoundsCheck(k, lhs_right)) { - return KOutOfBoundsError(k, i, rhs_index_a, lhs_right); - } - if (!FastBoundsCheck(m, out.dimension(0))) { - return MOutOfBoundsError(m, i, lhs_index_a, out.dimension(0)); - } - for (std::size_t n = 0; n < rhs_right; ++n) { - const T b_value = maybe_adjoint_b(k, n); - out(m, n) += b_value; - } - } - } else { - // Vectorization via Eigen. - const int b_chip_index = ADJ_B ? 1 : 0; - -#define LOOP_NNZ(b_passed) \ - for (std::size_t i = 0; i < nnz; ++i) { \ - const Tindices m = internal::SubtleMustCopy(a_indices(i, lhs_index_a)); \ - const Tindices k = internal::SubtleMustCopy(a_indices(i, rhs_index_a)); \ - if (!FastBoundsCheck(k, lhs_right)) { \ - return KOutOfBoundsError(k, i, rhs_index_a, lhs_right); \ - } \ - if (!FastBoundsCheck(m, out.dimension(0))) { \ - return MOutOfBoundsError(m, i, lhs_index_a, out.dimension(0)); \ - } \ - out.template chip<0>(m) += b_passed.template chip(k); \ - } - - - if (ADJ_B) { - // Perform transpose and conjugation on B once, since we chip out B's - // columns in the nnz loop. - Eigen::array shuffle; // preserve dimension order - shuffle[0] = 1; shuffle[1] = 0; - Eigen::Tensor col_major_conj_b = - b.swap_layout().shuffle(shuffle).conjugate(); - LOOP_NNZ(col_major_conj_b); - } else { - LOOP_NNZ(b); - } -#undef LOOP_NNZ - } - return Status::OK(); - } -}; - - -// We have only specified and optimised the case with no matrix transpose, -// since it is the most typical usage in productions. 
-template -struct SparseTensorDenseMatMulFunctor { - static Status Compute(OpKernelContext* ctx, - const Tensor *a_indices, - const Tensor *a_values, - const Tensor *a_shape, - const Tensor *b, - Tensor *out) { - auto a_indices_ptr = a_indices->flat().data(); - auto b_ptr = b->flat().data(); - auto out_ptr = out->flat().data(); - const int64 nnz = a_indices->shape().dim_size(0); - const int64 outer_left = a_shape->vec()(0); - const int64 outer_right = b->shape().dim_size(1); - ParallelLookupAndSegmentSum(ctx, a_indices_ptr, b_ptr, nnz, - outer_left, outer_right, out_ptr); - return Status::OK(); - } -}; - -} // namespace functor - -} // namespace tensorflow diff --git a/twml/libtwml/src/ops/binary_sparse_dense_matmul.h b/twml/libtwml/src/ops/binary_sparse_dense_matmul.h deleted file mode 100644 index 92494af52..000000000 --- a/twml/libtwml/src/ops/binary_sparse_dense_matmul.h +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// TWML modified to optimize binary features -#ifndef TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_OP_H_ -#define TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_OP_H_ - -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/errors.h" - -namespace tensorflow { - -namespace functor { - -template -struct SparseTensorDenseMatMulFunctor { - static EIGEN_ALWAYS_INLINE Status Compute( - const Device& d, typename TTypes::Matrix out, - typename TTypes::ConstMatrix a_indices, - typename TTypes::ConstVec a_values, typename TTypes::ConstMatrix b); -}; - -template -class MaybeAdjoint; - -template -class MaybeAdjoint { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaybeAdjoint(MATRIX m) : m_(m) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename MATRIX::Scalar operator()( - const typename MATRIX::Index i, const typename MATRIX::Index j) const { - return m_(i, j); - } - - private: - const MATRIX m_; -}; - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T MaybeConj(T v) { - return v; -} - -template -class MaybeAdjoint { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaybeAdjoint(MATRIX m) : m_(m) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename MATRIX::Scalar operator()( - const typename MATRIX::Index i, const typename MATRIX::Index j) const { - return Eigen::numext::conj(m_(j, i)); - } - - private: - const MATRIX m_; -}; - -} // end namespace functor -} // end namespace tensorflow - -#endif // TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_OP_H_ diff --git a/twml/libtwml/src/ops/binary_sparse_dense_matmul_impl.h b/twml/libtwml/src/ops/binary_sparse_dense_matmul_impl.h deleted file mode 100644 index db61647cb..000000000 --- a/twml/libtwml/src/ops/binary_sparse_dense_matmul_impl.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef 
TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_IMPL_H_ -#define TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_IMPL_H_ - -#include - -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/lib/core/blocking_counter.h" -#include "tensorflow/core/lib/core/threadpool.h" - -namespace tensorflow { -namespace functor { - -// `ConservativeShard` is adopted rather than `Shard` in tensorflow because the -// original `Shard` may generate number of shards more than the number of -// threads, which is not ideal for this case, as it may cause too much overhead. -static void ConservativeShard(int max_parallelism, thread::ThreadPool *workers, - int64 total, int64 cost_per_unit, - std::function work) { - if (total == 0) { - return; - } - max_parallelism = std::min(max_parallelism, workers->NumThreads()); - if (max_parallelism <= 1) { - // Just inline the whole work since we only have 1 thread (core). - work(0, total); - return; - } - cost_per_unit = std::max(1LL, cost_per_unit); - // We shard [0, total) into "num_shards" shards. - // 1 <= num_shards <= num worker threads - // - // If total * cost_per_unit is small, it is not worth shard too - // much. Let us assume each cost unit is 1ns, kMinCostPerShard=10000 - // is 10us. - static const int64 kMinCostPerShard = 10000; - const int num_shards = - std::max(1, std::min(static_cast(max_parallelism), - total * cost_per_unit / kMinCostPerShard)); - - // Each shard contains up to "block_size" units. [0, total) is sharded - // into: - // [0, block_size), [block_size, 2*block_size), ... - // The 1st shard is done by the caller thread and the other shards - // are dispatched to the worker threads. The last shard may be smaller than - // block_size. - const int64 block_size = (total + num_shards - 1) / num_shards; - if (block_size >= total) { - work(0, total); - return; - } - const int num_shards_used = (total + block_size - 1) / block_size; - BlockingCounter counter(num_shards_used - 1); - for (int64 start = block_size; start < total; start += block_size) { - auto limit = std::min(start + block_size, total); - workers->Schedule([&work, &counter, start, limit]() { - work(start, limit); // Compute the shard. - counter.DecrementCount(); // The shard is done. - }); - } - - // Inline execute the 1st shard. - work(0, std::min(block_size, total)); - counter.Wait(); -} - -static inline void VectorSum(float *a, const float *b, int n) { - for (int i = 0; i < n; ++i) { - a[i] += b[i]; - } -} - -// This func is to vectorize the computation of segment sum. -template -static void LookupAndSegmentSum(const Tindices *a_indices, const float *b, - int nnz, int outer_right, float *output) { - for (std::size_t i = 0; i < nnz; ++i) { - const Tindices m = a_indices[i * 2]; - const Tindices k = a_indices[i * 2 + 1]; - auto output_row_m = output + m * outer_right; - auto b_row_k = b + k * outer_right; - VectorSum(output_row_m, b_row_k, outer_right); - } -} - -// This func enables sharding and multithreading, it comes with an overhead of -// duplicating output buffer to achieve lock free output. So there should not -// be too many threads. 
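`ParallelLookupAndSegmentSum`, which follows, avoids locking by giving every worker its own scratch copy of the output and summing the copies once all shards finish. A simplified standalone sketch of that pattern using `std::thread`; the real code runs through TensorFlow's thread pool via `ConservativeShard` and pads the scratch buffers to the allocator alignment:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

// Each worker accumulates into its own scratch buffer; one reduction at the
// end folds the buffers into the shared output, so the hot loop needs no locks.
void parallel_accumulate(const std::vector<float>& items, int num_threads,
                         std::vector<float>& out) {
  std::vector<std::vector<float>> scratch(num_threads,
                                          std::vector<float>(out.size(), 0.0f));
  std::vector<std::thread> workers;
  const std::size_t block = (items.size() + num_threads - 1) / num_threads;
  for (int t = 0; t < num_threads; ++t) {
    workers.emplace_back([&, t] {
      const std::size_t start = t * block;
      const std::size_t limit = std::min(items.size(), start + block);
      // All contributions land in slot 0 in this toy example.
      for (std::size_t i = start; i < limit; ++i) scratch[t][0] += items[i];
    });
  }
  for (auto& w : workers) w.join();
  for (int t = 0; t < num_threads; ++t)              // final reduction
    for (std::size_t j = 0; j < out.size(); ++j) out[j] += scratch[t][j];
}

int main() {
  const std::vector<float> items(1000, 1.0f);
  std::vector<float> out(1, 0.0f);
  parallel_accumulate(items, 4, out);
  std::printf("sum = %g\n", out[0]);  // 1000
}
```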
-template -static void ParallelLookupAndSegmentSum(OpKernelContext *ctx, - const Tindices *a_indices, - const float *b, int nnz, int outer_left, - int outer_right, float *output) { - auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); - int out_size = outer_left * outer_right; - if (worker_threads.num_threads <= 1) { - memset(output, 0, out_size * sizeof(float)); - LookupAndSegmentSum(a_indices, b, - nnz, outer_right, - output); - return; - } - - // this is to make buffer align with kAllocatorAlignment - int padded_out_size = (out_size + (Allocator::kAllocatorAlignment - 1)) & - ~(Allocator::kAllocatorAlignment - 1); - std::size_t num_bytes = - (worker_threads.num_threads - 1) * padded_out_size * sizeof(float); - auto buffer = std::unique_ptr(reinterpret_cast( - port::AlignedMalloc(num_bytes, Allocator::kAllocatorAlignment))); - float *temp_out = buffer.get(); - - std::atomic thread_index(0); - - auto task = [&](int64 start, int64 limit) { - int local_thread_index = thread_index++; - float *buf_ptr = nullptr; - if (local_thread_index == 0) { - buf_ptr = output; - } else { - buf_ptr = temp_out + (local_thread_index - 1) * padded_out_size; - } - memset(buf_ptr, 0, out_size * sizeof(float)); - - LookupAndSegmentSum(a_indices + start * 2, b, - limit - start, outer_right, - buf_ptr); - }; - - int cost_per_unit = outer_right; - - // We don't use tensorflow shard func as tf may create more shards than - // number of threads. - ConservativeShard(worker_threads.num_threads, worker_threads.workers, nnz, - static_cast(cost_per_unit), task); - - for (int i = 1; i < thread_index; ++i) { - VectorSum(output, temp_out + (i - 1) * padded_out_size, out_size); - } -} - -} // namespace functor - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_KERNELS_BINARY_SPARSE_TENSOR_DENSE_MATMUL_IMPL_H_ \ No newline at end of file diff --git a/twml/libtwml/src/ops/block_format_dataset.cpp b/twml/libtwml/src/ops/block_format_dataset.cpp deleted file mode 100644 index fdf4a9543..000000000 --- a/twml/libtwml/src/ops/block_format_dataset.cpp +++ /dev/null @@ -1,243 +0,0 @@ -#include "block_format_reader.h" - -#include "tensorflow/core/framework/dataset.h" -#include "tensorflow/core/framework/partial_tensor_shape.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/lib/io/random_inputstream.h" - -#if !defined(DISABLE_ZLIB) -#include "tensorflow/core/lib/io/zlib_inputstream.h" -#endif - -#include - -#include -#include -#include - -using namespace tensorflow; - - -inline std::string stripPath(std::string const &file_name) { - const auto pos = file_name.find_last_of("/"); - if (pos == std::string::npos) return file_name; - return file_name.substr(pos + 1); -} - -inline std::string getExtension(std::string const &file_name) { - const auto stripped_file_name = stripPath(file_name); - const auto pos = stripPath(stripped_file_name).find_last_of("."); - if (pos == std::string::npos) return ""; - return stripped_file_name.substr(pos + 1); -} - -REGISTER_OP("BlockFormatDatasetV2") -.Input("filenames: string") -.Input("compression_type: string") -.Input("buffer_size: int64") -.Output("handle: variant") -.SetIsStateful() -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( - -Creates a dataset for streaming BlockFormat data in compressed (e.g. gzip), uncompressed formats. -This op also has the ability stream a dataset containing files from multiple formats mentioned above. - -filenames: A scalar or vector containing the name(s) of the file(s) to be read. 
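With `compression_type == "auto"` (documented just below), the choice between the Zlib/GZIP path and the raw path is made from the file-name extension via the `getExtension` helper defined near the top of this file. A small usage sketch; the two helpers are copied from the deleted source so the snippet compiles on its own:

```cpp
#include <cstdio>
#include <string>

// Copies of the helpers above: strip the directory part, then take the text
// after the last '.' of the remaining file name ("" when there is none).
std::string stripPath(const std::string& file_name) {
  const auto pos = file_name.find_last_of('/');
  return pos == std::string::npos ? file_name : file_name.substr(pos + 1);
}

std::string getExtension(const std::string& file_name) {
  const auto stripped = stripPath(file_name);
  const auto pos = stripped.find_last_of('.');
  return pos == std::string::npos ? std::string() : stripped.substr(pos + 1);
}

int main() {
  // With compression_type == "auto", "gz" selects the ZlibInputStream path and
  // anything else falls through to the uncompressed reader.
  std::printf("'%s'\n", getExtension("hdfs/data/part-00000.lzo.gz").c_str());  // 'gz'
  std::printf("'%s'\n", getExtension("local/part-00000").c_str());             // ''
}
```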
-compression_type: A scalar string denoting the compression type. Can be 'none', 'zlib', 'auto'. -buffer_size: A scalar denoting the buffer size to use during decompression. - -Outputs - handle: A handle to the dataset. This handle is later used to create an iterator to stream the data from the dataset. - -)doc"); - - -class BlockFormatDatasetV2 : public DatasetOpKernel { - public: - using DatasetOpKernel::DatasetOpKernel; - - void MakeDataset(OpKernelContext* ctx, DatasetBase **output) override { - const Tensor* filenames_tensor; - OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor)); - OP_REQUIRES( - ctx, filenames_tensor->dims() <= 1, - errors::InvalidArgument("`filenames` must be a scalar or a vector.")); - - const auto filenames_flat = filenames_tensor->flat(); - const int64 num_files = filenames_tensor->NumElements(); - std::vector filenames; - filenames.reserve(num_files); - std::copy(filenames_flat.data(), - filenames_flat.data() + num_files, - std::back_inserter(filenames)); - - string compression_type; - OP_REQUIRES_OK( - ctx, tensorflow::data::ParseScalarArgument( - ctx, "compression_type", &compression_type)); - - int64 buffer_size = -1; - OP_REQUIRES_OK( - ctx, tensorflow::data::ParseScalarArgument( - ctx, "buffer_size", &buffer_size)); - - OP_REQUIRES(ctx, buffer_size >= 0, - errors::InvalidArgument( - "`buffer_size` must be >= 0 (0 == no buffering)")); - - OP_REQUIRES(ctx, - compression_type == "auto" || - compression_type == "gz" || - compression_type == "", - errors::InvalidArgument("Unknown extension: ", compression_type)); - - *output = new Dataset(ctx, std::move(filenames), compression_type, buffer_size); - } - - private: - class Dataset : public DatasetBase { - public: - Dataset(OpKernelContext* ctx, - std::vector filenames, - std::string compression_type, - int64 buffer_size) - : DatasetBase(DatasetContext(ctx)), - compression_type_(compression_type), - buffer_size_(buffer_size), - filenames_(std::move(filenames)) - {} - - const DataTypeVector& output_dtypes() const override { - static DataTypeVector* dtypes = new DataTypeVector({DT_STRING}); - return *dtypes; - } - - const std::vector& output_shapes() const override { - static std::vector* shapes = - new std::vector({{}}); - return *shapes; - } - - string DebugString() const override { return "BlockFormatDatasetV2::Dataset"; } - - protected: - Status AsGraphDefInternal(SerializationContext* ctx, - DatasetGraphDefBuilder* b, - Node** output) const override { - Node* filenames = nullptr; - Node* compression_type = nullptr; - Node* buffer_size = nullptr; - TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames)); - TF_RETURN_IF_ERROR(b->AddScalar(compression_type_, &compression_type)); - TF_RETURN_IF_ERROR( - b->AddScalar(buffer_size_, &buffer_size)); - TF_RETURN_IF_ERROR(b->AddDataset( - this, {filenames, compression_type, buffer_size}, output)); - return Status::OK(); - } - - private: - std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { - return std::unique_ptr( - new Iterator({this, strings::StrCat(prefix, "::BlockFormat")})); - } - - class Iterator : public DatasetIterator { - public: - explicit Iterator(const Params ¶ms) - : DatasetIterator(params) {} - - Status GetNextInternal(IteratorContext* ctx, - std::vector* out_tensors, - bool* end_of_sequence) override { - mutex_lock l(mu_); - do { - // We are currently processing a file, so try to read the next record. 
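The `GetNextInternal` loop that starts here follows a common multi-file streaming pattern: read from the current reader until it reports out-of-range, then advance to the next file, and report end-of-sequence only once the file list is exhausted. A simplified standalone sketch of that control flow, with text lines standing in for block-format records and hypothetical file names:

```cpp
#include <cstddef>
#include <cstdio>
#include <fstream>
#include <string>
#include <utility>
#include <vector>

// Read records across a list of files: drain the current file, advance to the
// next one, and signal end-of-sequence only when every file is exhausted.
// A text line stands in for one block-format record.
class MultiFileLineIterator {
 public:
  explicit MultiFileLineIterator(std::vector<std::string> filenames)
      : filenames_(std::move(filenames)) {}

  // Returns false once all files have been consumed; files that fail to open
  // are simply skipped in this sketch.
  bool Next(std::string* record) {
    while (true) {
      if (stream_.is_open()) {
        if (std::getline(stream_, *record)) return true;  // got a record
        stream_.close();                                   // current file exhausted
      }
      if (index_ == filenames_.size()) return false;       // end of sequence
      stream_.open(filenames_[index_++]);                   // advance to next file
    }
  }

 private:
  std::vector<std::string> filenames_;
  std::ifstream stream_;
  std::size_t index_ = 0;
};

int main() {
  MultiFileLineIterator it({"part-00000.txt", "part-00001.txt"});  // hypothetical names
  std::string line;
  while (it.Next(&line)) std::printf("%s\n", line.c_str());
}
```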
- if (reader_) { - Tensor result_tensor(cpu_allocator(), DT_STRING, {}); - Status s = reader_->ReadNext(&result_tensor.scalar()()); - if (s.ok()) { - out_tensors->emplace_back(std::move(result_tensor)); - *end_of_sequence = false; - return Status::OK(); - } else if (!errors::IsOutOfRange(s)) { - return s; - } - - // We have reached the end of the current file, so maybe - // move on to next file. - reader_.reset(); - ++current_file_index_; - } - - // Iteration ends when there are no more files to process. - if (current_file_index_ == dataset()->filenames_.size()) { - *end_of_sequence = true; - return Status::OK(); - } - - // Actually move on to next file. - const string& next_filename = - dataset()->filenames_[current_file_index_]; - - auto compression_type = dataset()->compression_type_; - int64 buffer_size = dataset()->buffer_size_; - - if (compression_type == "auto") { - compression_type = getExtension(next_filename); - } - - if (compression_type != "gz" && compression_type != "") { - return errors::InvalidArgument("Unknown extension: ", compression_type); - } - - tensorflow::Env* env = tensorflow::Env::Default(); - TF_CHECK_OK(env->NewRandomAccessFile(next_filename, &file_)); - - // RandomAccessInputstream defaults the second param to "false". - // The second parameter "false" is the key issue. - // "false" assumes the ownership of the file is elsewhere. - // But making that "true" causes segfaults down the line. - // So keep the ownership of "file_" in this class and clean up properly. - file_stream_.reset(new tensorflow::io::RandomAccessInputStream(file_.get(), false)); - - if (compression_type == "gz") { - // unpack_stream does not take ownership of file_stream_ -#if !defined(DISABLE_ZLIB) - unpack_stream_.reset(new tensorflow::io::ZlibInputStream( - file_stream_.get(), - buffer_size, - buffer_size, - tensorflow::io::ZlibCompressionOptions::GZIP())); - reader_.reset(new BlockFormatReader(unpack_stream_.get())); -#else - return errors::InvalidArgument("libtwml compiled without zlib support"); -#endif - } else { - unpack_stream_.reset(nullptr); - reader_.reset(new BlockFormatReader(file_stream_.get())); - } - } while (true); - } - - private: - mutex mu_; - uint64_t current_file_index_ GUARDED_BY(mu_) = 0; - std::unique_ptr file_; - std::unique_ptr file_stream_; - std::unique_ptr unpack_stream_; - std::unique_ptr reader_ GUARDED_BY(mu_); - }; - - const std::string compression_type_; - const int64 buffer_size_; - const std::vector filenames_; - }; -}; - -REGISTER_KERNEL_BUILDER( - Name("BlockFormatDatasetV2") - .Device(DEVICE_CPU), - BlockFormatDatasetV2); diff --git a/twml/libtwml/src/ops/block_format_reader.h b/twml/libtwml/src/ops/block_format_reader.h deleted file mode 100644 index 29450cc03..000000000 --- a/twml/libtwml/src/ops/block_format_reader.h +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/lib/io/random_inputstream.h" - -#include - -#include - -using tensorflow::int64; -using tensorflow::Status; -using std::string; - -class BlockFormatReader : twml::BlockFormatReader { - public: - explicit BlockFormatReader(tensorflow::io::InputStreamInterface *stream) - : twml::BlockFormatReader() , stream_(stream) { - } - - // Read the next record. 
- // Returns OK on success, - // Returns OUT_OF_RANGE for end of file, or something else for an error. - Status ReadNext(string* record) { - if (this->next()) { - return stream_->ReadNBytes(this->current_size(), record); - } - return tensorflow::errors::OutOfRange("eof"); - } - - uint64_t read_bytes(void *dest, int size, int count) { - uint64_t bytesToRead = size * count; - std::string current; - // TODO: Try to merge ReadNBytes and the memcpy below - // ReadNBytes performs a memory copy already. - Status status = stream_->ReadNBytes(bytesToRead, ¤t); - if (!status.ok()) { - return 0; - } - memcpy(dest, current.c_str(), bytesToRead); - return count; - } - - private: - tensorflow::io::InputStreamInterface *stream_; - TF_DISALLOW_COPY_AND_ASSIGN(BlockFormatReader); -}; diff --git a/twml/libtwml/src/ops/compress_sample_ids.cpp b/twml/libtwml/src/ops/compress_sample_ids.cpp deleted file mode 100644 index 3053de471..000000000 --- a/twml/libtwml/src/ops/compress_sample_ids.cpp +++ /dev/null @@ -1,138 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include // std::fill_n - -using namespace tensorflow; - -REGISTER_OP("CompressSampleIds") -.Attr("T: {int32}") -.Input("input: T") -.Output("output: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->Vector(c->kUnknownDim)); - return Status::OK(); - }); - - -template -class CompressSampleIds : public OpKernel { - public: - explicit CompressSampleIds(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - const int N = input.size(); - - // Check for improper input - bool error = (N > 0 && input(0) < 0); - for (int i = 1; !error && i < N; i++) { - error = input(i - 1) > input(i); - } - - OP_REQUIRES( - context, !error, - errors::InvalidArgument( - "Error in CompressSampleIds. 
SampleIds must be non-negative and non-decreasing" - ) - ); - - // choose output size, either last input element + 1, or 0 - int output_size = 0; - if (N > 0) { - output_size = input(N - 1) + 1; - } - - // Create an output tensor - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output(0, TensorShape({output_size}), &output_tensor) - ); - auto output_flat = output_tensor->flat(); - - // Zero-initialize output - for (int i = 0; i < output_size; i++) { - output_flat(i) = 0; - } - - // count how many of each input element - for (int i = 0; i < N; i++) { - output_flat(input(i)) ++; - } - } -}; - -REGISTER_OP("DecompressSampleIds") -.Attr("T: {int32}") -.Input("input: T") -.Output("output: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->Vector(c->kUnknownDim)); - return Status::OK(); - }); - - -template -class DecompressSampleIds : public OpKernel { - public: - explicit DecompressSampleIds(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - const int N = input.size(); - - // Check for improper input - bool error = false; - int output_size = 0; - for (int i = 0; !error && i < N; i++) { - error = input(i) < 0; - output_size += input(i); - } - - OP_REQUIRES( - context, !error, - errors::InvalidArgument( - "Error in DecompressSampleIds. Inputs must be non-negative." - ) - ); - - // Create an output tensor - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output(0, TensorShape({output_size}),&output_tensor) - ); - auto output_flat = output_tensor->flat(); - - T *output_data = output_flat.data(); - for (int current_sample = 0; current_sample < N; current_sample++) { - std::fill_n(output_data, input(current_sample), current_sample); - output_data += input(current_sample); - } - } -}; - - - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("CompressSampleIds") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - CompressSampleIds); \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("DecompressSampleIds") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - DecompressSampleIds); \ - \ - -REGISTER(int32); diff --git a/twml/libtwml/src/ops/contrib/get_substrings.cpp b/twml/libtwml/src/ops/contrib/get_substrings.cpp deleted file mode 100644 index 8cd167e65..000000000 --- a/twml/libtwml/src/ops/contrib/get_substrings.cpp +++ /dev/null @@ -1,116 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "../tensorflow_utils.h" -#include "../resource_utils.h" - -#include -#include - -using std::string; - -void join(const std::set& v, char c, string& s) { - s.clear(); - std::set::iterator it = v.begin(); - while (it != v.end()) { - s += *it; - it++; - if (it != v.end()) s+= c; - } -} - -// cpp function that computes substrings of a given word -std::string computeSubwords(std::string word, int32_t minn, int32_t maxn) { - std::string word2 = "<" + word + ">"; - std::set ngrams; - std::string s; - ngrams.insert(word); - ngrams.insert(word2); - for (size_t i = 0; i < word2.size(); i++) { - if ((word2[i] & 0xC0) == 0x80) continue; - for (size_t j = minn; i+j <= word2.size() && j <= maxn; j++) { - ngrams.insert(word2.substr(i, j)); - } - } - join(ngrams, ';', s); - ngrams.clear(); - return s; -} - -// tf-op 
function that computes substrings for a given tensor of words -template< typename ValueType> - -void ComputeSubStringsTensor(OpKernelContext *context, int32 min_n, int32 max_n) { - try { - const Tensor& values = context->input(0); - - auto values_flat = values.flat(); - - // batch_size from input_size : - const int batch_size = values_flat.size(); - - // define the output tensor - Tensor* substrings = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, values.shape(), &substrings)); - - auto substrings_flat = substrings->flat(); - // compute substrings for the given tensor values - for (int64 i = 0; i < batch_size; i++) { - substrings_flat(i) = computeSubwords(values_flat(i), min_n, max_n); - } - } - catch (const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } -} - -REGISTER_OP("GetSubstrings") -.Attr("ValueType: {string}") -.Attr("min_n: int") -.Attr("max_n: int") -.Input("values: ValueType") -.Output("substrings: ValueType") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP to convert word to substrings of length between min_n and max_n. - -Attr - min_n,max_n: The size of the substrings. - -Input - values: 1D input tensor containing the values. - -Outputs - substrings: A string tensor where substrings are joined by ";". -)doc"); - -template -class GetSubstrings : public OpKernel { - public: - explicit GetSubstrings(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("min_n", &min_n)); - OP_REQUIRES_OK(context, context->GetAttr("max_n", &max_n)); - } - - private: - int32 min_n; - int32 max_n; - void Compute(OpKernelContext *context) override { - ComputeSubStringsTensor(context, min_n, max_n); - } -}; - - -#define REGISTER_SUBSTRINGS(ValueType) \ - REGISTER_KERNEL_BUILDER( \ - Name("GetSubstrings") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("ValueType"), \ - GetSubstrings); \ - -REGISTER_SUBSTRINGS(string) diff --git a/twml/libtwml/src/ops/data_record.cpp b/twml/libtwml/src/ops/data_record.cpp deleted file mode 100644 index 71ea72ac4..000000000 --- a/twml/libtwml/src/ops/data_record.cpp +++ /dev/null @@ -1,1891 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -#include - -using std::string; - -REGISTER_OP("DecodeDataRecord") -.Attr("InputType: {uint8, string}") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("label_features: list(int)") -.Attr("weight_features: list(int) = []") -.Input("input_bytes: InputType") -.Output("data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that creates a handle for the datarecord. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - label_features: list of feature ids representing the labels. - weight_features: list of feature ids representing the weights. Defaults to empty list. - shared_name: name used by the resource handle inside the resource manager. - container: name used by the container of the resources. - -shared_name and container are required when inheriting from ResourceOpKernel. - -Input - input_bytes: Input tensor containing the serialized batch of HashedDataRecords. 
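As a worked example of the substring expansion implemented by computeSubwords in get_substrings.cpp above (values illustrative): with min_n = 2 and max_n = 3, the word "cat" is wrapped as "<cat>", every 2- and 3-byte window of "<cat>" is collected together with "cat" and "<cat>" themselves, and the set is joined with ';' in lexicographic order.

// computeSubwords("cat", 2, 3)
//   windows of "<cat>": "<c", "<ca", "ca", "cat", "at", "at>", "t>"
//   plus the word and the wrapped word: "cat", "<cat>"
//   returned string: "<c;<ca;<cat>;at;at>;ca;cat;t>"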
- -Outputs - data_record_handle: A resource handle to the DataRecord struct. -)doc"); - -template -class DecodeDataRecord : public OpKernel { - public: - explicit DecodeDataRecord(OpKernelConstruction* context) - : OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - - std::vector label_features; - std::vector weight_features; - - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - OP_REQUIRES_OK(context, context->GetAttr("label_features", &label_features)); - OP_REQUIRES_OK(context, context->GetAttr("weight_features", &weight_features)); - - OP_REQUIRES(context, keep_features.size() == keep_codes.size(), - errors::InvalidArgument("keep keys and values must have same size.")); - -#ifdef USE_DENSE_HASH - m_keep_map.set_empty_key(0); - m_labels_map.set_empty_key(0); - m_weights_map.set_empty_key(0); -#endif // USE_DENSE_HASH - - for (uint64_t i = 0; i < keep_features.size(); i++) { - m_keep_map[keep_features[i]] = keep_codes[i]; - } - - for (uint64_t i = 0; i < label_features.size(); i++) { - m_labels_map[label_features[i]] = i; - } - - for (uint64_t i = 0; i < weight_features.size(); i++) { - m_weights_map[weight_features[i]] = i; - } - } - - private: - twml::Map m_keep_map; - twml::Map m_labels_map; - twml::Map m_weights_map; - - void Compute(OpKernelContext* context) override { - try { - DataRecordResource *resource = nullptr; - OP_REQUIRES_OK(context, makeResourceHandle(context, 0, &resource)); - - // Store the input bytes in the resource so it isnt freed before the resource. - // This is necessary because we are not copying the contents for tensors. - resource->input = context->input(0); - int batch_size = getBatchSize(resource->input); - int num_labels = static_cast(m_labels_map.size()); - int num_weights = static_cast(m_weights_map.size()); - - twml::DataRecordReader reader; - reader.setKeepMap(&m_keep_map); - reader.setLabelsMap(&m_labels_map); - - // Do not set weight map if it is empty. This will take a faster path. - if (num_weights != 0) { - reader.setWeightsMap(&m_weights_map); - } - - resource->records.clear(); - resource->records.reserve(batch_size); - for (int i = 0; i < batch_size; i++) { - resource->records.emplace_back(num_labels, num_weights); - } - - for (int64 id = 0; id < batch_size; id++) { - const uint8_t *input_bytes = getInputBytes(resource->input, id); - reader.setBuffer(input_bytes); - // decode the reader - resource->records[id].decode(reader); - } - // This should be fine because m_keep_map should never go out of scope. 
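// To spell out the borrowing: the decoded records point into the serialized
// bytes held by resource->input, which is why the input tensor is stashed in
// the resource above instead of being copied, and keep_map (assigned just
// below) points at this kernel's member map, which the comment above assumes
// stays alive at least as long as the resource.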
- resource->keep_map = &m_keep_map; - resource->num_weights = num_weights; - resource->num_labels = num_labels; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -int64_t count_if_exists(const twml::DataRecord::BinaryFeatures &set, - const twml::Map *const keep_map) { - int64_t count = 0; - for (const auto &key : set) { - if (keep_map->find(key) == keep_map->end()) continue; - count++; - } - return count; -} - -// This works for continuous, discrete, and string features -template -int64_t count_if_exists(const twml::Map &map, - const twml::Map *const keep_map) { - int64_t count = 0; - for (const auto &elem : map) { - if (keep_map->find(elem.first) == keep_map->end()) continue; - count++; - } - return count; -} - -int64_t count_if_exists(const twml::DataRecord::SparseBinaryFeatures &map, - const twml::Map *const keep_map) { - int64_t count = 0; - for (const auto &elem : map) { - if (keep_map->find(elem.first) == keep_map->end()) continue; - count += elem.second.size(); - } - return count; -} - -int64_t count_if_exists(const twml::DataRecord::SparseContinuousFeatures &map, - const twml::Map *const keep_map) { - int64_t count = 0; - for (const auto &elem : map) { - if (keep_map->find(elem.first) == keep_map->end()) continue; - count += elem.second.size(); - } - return count; -} - -REGISTER_OP("GetBinaryFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads binary features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - values: always set to 1 (float) -)doc"); - -class GetBinaryFeatures : public OpKernel { - public: - explicit GetBinaryFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_binary_size = count_if_exists(common.getBinary(), handle->keep_map); - int64 total_binary_size = records.size() * common_binary_size; - for (int id = 0; id < records.size(); id++) { - total_binary_size += count_if_exists(handle->records[id].getBinary(), handle->keep_map); - } - const int total_size = static_cast(total_binary_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* ids = nullptr; - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto ids_flat = ids->flat(); - auto values_flat = values->flat(); - - for (int64 id = 0; id < records.size(); id++) { - for (const auto &it : common.getBinary()) { - if (handle->keep_map->find(it) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it; - offset++; - } - for (const auto &it : records[id].getBinary()) { - if (handle->keep_map->find(it) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it; - offset++; - } - } - // All the values for binary features are 1. 
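// Layout illustration for the (ids, keys, values) outputs of this op, with
// hypothetical feature ids: if there are no common features, record 0 holds
// kept binary features {10, 11} and record 1 holds {11}, then
//   ids    = [0, 0, 1]       // index of the record within the batch
//   keys   = [10, 11, 11]    // feature ids
//   values = [1.0, 1.0, 1.0] // always 1 for binary features
// One COO-style row per (record, feature) pair; within a record the feature
// order follows the underlying set and is not guaranteed. The other
// Get*Features ops below reuse this layout, filling values/names accordingly.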
- std::fill(values_flat.data(), values_flat.data() + total_size, 1); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetContinuousFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads continuous features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: Datarecord keys (int64) - values: Datarecord values(float) -)doc"); - -class GetContinuousFeatures : public OpKernel { - public: - explicit GetContinuousFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_continuous_size = count_if_exists(common.getContinuous(), handle->keep_map); - int64 total_continuous_size = records.size() * common_continuous_size; - for (int id = 0; id < records.size(); id++) { - total_continuous_size += count_if_exists(handle->records[id].getContinuous(), - handle->keep_map); - } - const int total_size = static_cast(total_continuous_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* values = nullptr; - Tensor* ids = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto values_flat = values->flat(); - auto ids_flat = ids->flat(); - - for (int64 id = 0; id < records.size(); id++) { - for (const auto &it : common.getContinuous()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - values_flat(offset) = it.second; - offset++; - } - for (const auto &it : records[id].getContinuous()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - values_flat(offset) = it.second; - offset++; - } - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetDiscreteFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads discrete features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - values: DataRecord values(int64) -)doc"); - -class GetDiscreteFeatures : public OpKernel { - public: - explicit GetDiscreteFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_discrete_size = count_if_exists(common.getDiscrete(), handle->keep_map); - int64 total_discrete_size = records.size() * common_discrete_size; - for (int id 
= 0; id < records.size(); id++) { - total_discrete_size += count_if_exists(handle->records[id].getDiscrete(), - handle->keep_map); - } - const int total_size = static_cast(total_discrete_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* values = nullptr; - Tensor* ids = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto values_flat = values->flat(); - auto ids_flat = ids->flat(); - - for (int64 id = 0; id < records.size(); id++) { - for (const auto &it : common.getDiscrete()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - values_flat(offset) = it.second; - offset++; - } - for (const auto &it : records[id].getDiscrete()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - values_flat(offset) = it.second; - offset++; - } - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetStringFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("names: string") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads string features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - names: DataRecord values(string) - values: always set to 1 (float) -)doc"); - -class GetStringFeatures : public OpKernel { - public: - explicit GetStringFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_string_size = count_if_exists(common.getString(), handle->keep_map); - int64 total_string_size = records.size() * common_string_size; - for (int id = 0; id < records.size(); id++) { - total_string_size += count_if_exists(handle->records[id].getString(), - handle->keep_map); - } - const int total_size = static_cast(total_string_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* names = nullptr; - Tensor* ids = nullptr; - Tensor*values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names)); - OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto names_flat = names->flat(); - auto ids_flat = ids->flat(); - auto values_flat = values->flat(); - - std::fill(values_flat.data(), values_flat.data() + total_size, 1); - for (int64 id = 0; id < records.size(); id++) { - for (const auto &it : common.getString()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it.second; - offset++; - } - for (const auto &it : records[id].getString()) { - if 
(handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it.second; - offset++; - } - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetSparseBinaryFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("names: string") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads sparse binary features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - names: DataRecord values(string) - values: always set to 1 (float) -)doc"); - -class GetSparseBinaryFeatures : public OpKernel { - public: - explicit GetSparseBinaryFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_sparse_binary_size = count_if_exists(common.getSparseBinary(), handle->keep_map); - int64 total_sparse_binary_size = records.size() * common_sparse_binary_size; - for (int id = 0; id < records.size(); id++) { - total_sparse_binary_size += count_if_exists(handle->records[id].getSparseBinary(), - handle->keep_map); - } - const int total_size = static_cast(total_sparse_binary_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* names = nullptr; - Tensor* ids = nullptr; - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names)); - OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto names_flat = names->flat(); - auto ids_flat = ids->flat(); - auto values_flat = values->flat(); - - // All the values for sparse binary features are 1. 
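// Fan-out illustration with hypothetical values: if record 0 carries kept
// sparse-binary feature 42 whose name set is {"en", "es"}, it contributes two
// rows:
//   ids = [0, 0], keys = [42, 42], names = ["en", "es"], values = [1.0, 1.0]
// Each (feature id, name) pair becomes its own row, unlike the plain binary
// case earlier where a feature contributes exactly one row.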
- std::fill(values_flat.data(), values_flat.data() + total_size, 1); - for (int64 id = 0; id < records.size(); id++) { - for (const auto &it : common.getSparseBinary()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - for (const auto &it_inner : it.second) { - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it_inner; - offset++; - } - } - for (const auto &it : records[id].getSparseBinary()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - for (const auto &it_inner : it.second) { - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it_inner; - offset++; - } - } - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetSparseContinuousFeatures") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: float") -.Output("names: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that reads sparse continuous features -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - values: DataRecord values(float) - names: DataRecord values(string) -)doc"); - -class GetSparseContinuousFeatures : public OpKernel { - public: - explicit GetSparseContinuousFeatures(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - - int64 common_sparse_continuous_size = count_if_exists(common.getSparseContinuous(), - handle->keep_map); - int64 total_sparse_continuous_size = records.size() * common_sparse_continuous_size; - for (int id = 0; id < records.size(); id++) { - total_sparse_continuous_size += count_if_exists(handle->records[id].getSparseContinuous(), - handle->keep_map); - } - const int total_size = static_cast(total_sparse_continuous_size); - - TensorShape shape = {total_size}; - Tensor* keys = nullptr; - Tensor* values = nullptr; - Tensor* names = nullptr; - Tensor* ids = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values)); - OP_REQUIRES_OK(context, context->allocate_output(3, shape, &names)); - - uint64_t offset = 0; - auto keys_flat = keys->flat(); - auto values_flat = values->flat(); - auto names_flat = names->flat(); - auto ids_flat = ids->flat(); - - for (int64 id = 0; id < records.size(); id++) { - // copying the contents of the maps of maps - for (const auto &it : common.getSparseContinuous()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - // for each id; iterate through the number of maps corresponding to that id - for (const auto &it_inner : it.second) { - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it_inner.first; - values_flat(offset) = it_inner.second; - offset++; - } - } - // copying the contents of the maps of maps - for (const auto &it : records[id].getSparseContinuous()) { - if (handle->keep_map->find(it.first) == handle->keep_map->end()) continue; - // for each id; iterate through the number of maps corresponding to that id - for 
(const auto &it_inner : it.second) { - ids_flat(offset) = id; - keys_flat(offset) = it.first; - names_flat(offset) = it_inner.first; - values_flat(offset) = it_inner.second; - offset++; - } - } - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetBatchSizeFromDataRecord") -.Input("data_record_handle: resource") -.Output("batch_size: int64") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that returns batch size from the data record. -Input - data_record_handle: Resource handle to DataRecord - -Outputs - batch_size: Number of records held in the handle. -)doc"); - -class GetBatchSizeFromDataRecord : public OpKernel { - public: - explicit GetBatchSizeFromDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - Tensor *output; - OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output)); - output->scalar()() = handle->records.size(); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetLabelsFromDataRecord") -.Input("data_record_handle: resource") -.Output("labels: float") -.Attr("default_label: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns labels from the data record. - -Attr - default_label: The value used when a label is absent in a data record. - -Input - data_record_handle: Resource handle to DataRecord - -Outputs - labels: A 2D tensor of size [batch_size, num_labels] containing the label values. -)doc"); - -class GetLabelsFromDataRecord : public OpKernel { - private: - float default_label; - - public: - explicit GetLabelsFromDataRecord(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("default_label", &default_label)); - } - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const int num_labels = static_cast(handle->num_labels); - TensorShape shape = {static_cast(handle->records.size()), num_labels}; - - Tensor *labels; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &labels)); - - // The default value of label is not present in data record is std::nanf - // For continuous labels, change that to a default_label or label. - auto func = [this](float label) -> float { - return std::isnan(label) ? default_label : label; - }; - - auto labels_data = labels->flat().data(); - for (const auto &record : records) { - const auto& rec_labels = record.labels(); - labels_data = std::transform(rec_labels.begin(), rec_labels.end(), labels_data, func); - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetWeightsFromDataRecord") -.Input("data_record_handle: resource") -.Output("weights: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns weights from the data record. -Input - data_record_handle: Resource handle to DataRecord - -Outputs - weights: A 2D tensor of size [batch_size, num_weights] containing the weight values. 
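For reference, the NaN-to-default_label substitution that GetLabelsFromDataRecord above applies amounts to the following standalone sketch (function name and values are illustrative, not part of this file):

#include <algorithm>
#include <cmath>
#include <vector>

// Labels absent from a record are stored as NaN by the decoder; replace them
// with default_label and leave present labels untouched.
std::vector<float> FillMissingLabels(const std::vector<float> &raw_labels,
                                     float default_label) {
  std::vector<float> out(raw_labels.size());
  std::transform(raw_labels.begin(), raw_labels.end(), out.begin(),
                 [default_label](float label) {
                   return std::isnan(label) ? default_label : label;
                 });
  return out;
}
// e.g. FillMissingLabels({1.0f, NAN, 0.0f}, 0.0f) -> {1.0f, 0.0f, 0.0f}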
-)doc"); - -class GetWeightsFromDataRecord : public OpKernel { - public: - explicit GetWeightsFromDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const int num_weights = static_cast(handle->num_weights); - TensorShape shape = {static_cast(handle->records.size()), num_weights}; - - Tensor *weights; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &weights)); - - auto weights_data = weights->flat().data(); - for (const auto &record : records) { - const auto& rec_weights = record.weights(); - weights_data = std::copy(rec_weights.begin(), rec_weights.end(), weights_data); - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -template -void SetValueGroup( -const FeatureType& type, -const int64& feature_id, -const int64& id, -const ValueType& default_value, -TensorType values_flat) { - auto it = type.find(feature_id); - values_flat(id) = (it == type.end()) ? default_value : it->second; -} - -template -// overloading for BinaryFeatures; as it needs to set a value of 1 -void SetValueGroup( -const twml::DataRecord::BinaryFeatures& type, -const int64& feature_id, -const int64& id, -const ValueType& default_value, -TensorType values_flat) { - auto it = type.find(feature_id); - values_flat(id) = (it == type.end()) ? default_value : 1; -} - -// Helper for Group Extraction of Dense Features -template -void ComputeHelperGroupFeaturesAsTensors( -OpKernelContext* context, -const std::vector& feature_ids, -ValueType& default_value, -std::function f) { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - // Output shape is 2D; where the first dimension corresponds to the batch_size - // and the second corresponds to the number of features passed to the TF Op. - const int batch_size = static_cast(handle->records.size()); - const int num_feature_ids = static_cast(feature_ids.size()); - TensorShape shape = {batch_size, num_feature_ids}; - - // Define the output - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &values)); - auto values_flat = values->flat(); - - for (int64 id = 0; id < records.size(); id++) { - const auto &type = f(records[id]); - const auto id_offset = id * feature_ids.size(); - for (int64 fid = 0; fid < feature_ids.size(); fid++) { - auto feature_id = feature_ids[fid]; - // The value is set to default if it does not exist in the current DataRecord - SetValueGroup(type, feature_id, id_offset + fid, default_value, values_flat); - } - } -} - -// Helper for Single Extraction of Dense Features -template -void ComputeHelperFeaturesAsTensors( -OpKernelContext* context, -ValueType& default_value, -int64 feature_id, -std::function f) { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - // Output shape is 2D; where the first dimension corresponds to the batch_size - // and the second corresponds to the number of features passed to the TF Op. 
- const int total_size = static_cast(handle->records.size()); - TensorShape shape = {total_size}; - - // Define the output - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &values)); - auto values_flat = values->flat(); - for (int64 id = 0; id < records.size(); id++) { - const auto &type = f(records[id]); - SetValueGroup(type, feature_id, id, default_value, values_flat); - } -} - -REGISTER_OP("GetBinaryAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Attr("default_value: float") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. -Outputs - values: A Tensor corresponding to the value of the feature_id across multiple DataRecords -)doc"); - -class GetBinaryAsTensor : public OpKernel { - private: - int64 feature_id; - float default_value; - - public: - explicit GetBinaryAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::BinaryFeatures& { return record.getBinary(); }; - ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetContinuousAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Attr("default_value: float") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. 
-Outputs - values: A Tensor corresponding to the value of the feature_id across multiple DataRecords -)doc"); - -class GetContinuousAsTensor : public OpKernel { - private: - int64 feature_id; - float default_value; - - public: - explicit GetContinuousAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::ContinuousFeatures& { return record.getContinuous(); }; - ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetDiscreteAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Attr("default_value: int") -.Output("values: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. -Outputs - values: A Tensor corresponding to the value of the feature_id across multiple DataRecords -)doc"); - -class GetDiscreteAsTensor : public OpKernel { - private: - int64 feature_id; - int64 default_value; - - public: - explicit GetDiscreteAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::DiscreteFeatures& { return record.getDiscrete(); }; - ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetStringAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Attr("default_value: string") -.Output("names: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. 
-Outputs - names: A Tensor corresponding to the value of the feature_id across multiple DataRecords -)doc"); - -class GetStringAsTensor : public OpKernel { - private: - int64 feature_id; - string default_value; - - public: - explicit GetStringAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::StringFeatures& { return record.getString(); }; - ComputeHelperFeaturesAsTensors(context, default_value, feature_id, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - - -REGISTER_OP("GetBinaryGroupAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_ids: list(int)") -.Attr("default_value: float") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_ids: List of ids representing the features whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. -Outputs - values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords -)doc"); - - -class GetBinaryGroupAsTensor : public OpKernel { - private: - float default_value; - std::vector feature_ids; - - public: - explicit GetBinaryGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::BinaryFeatures& { return record.getBinary(); }; - ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - - -REGISTER_OP("GetContinuousGroupAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_ids: list(int)") -.Attr("default_value: float") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_ids: List of ids representing the features whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. 
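All of the Group variants in this file share the dense layout built by ComputeHelperGroupFeaturesAsTensors above: one row per record, one column per entry of feature_ids, with default_value filling the gaps. A hypothetical illustration:

// feature_ids = [5, 9], default_value = 0.0, batch of two records where
// record 0 carries {5: 1.5} and record 1 carries {9: 2.0}:
//   values = [[1.5, 0.0],
//             [0.0, 2.0]]   // shape [batch_size, len(feature_ids)]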
-Outputs - values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords -)doc"); - -class GetContinuousGroupAsTensor : public OpKernel { - private: - float default_value; - std::vector feature_ids; - - public: - explicit GetContinuousGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::ContinuousFeatures& { return record.getContinuous(); }; - ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetDiscreteGroupAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_ids: list(int)") -.Attr("default_value: int") -.Output("values: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_ids: List of ids representing the features whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. -Outputs - values: A Tensor corresponding to the values of the feature_ids across multiple DataRecords -)doc"); - -class GetDiscreteGroupAsTensor : public OpKernel { - private: - std::vector feature_ids; - int64 default_value; - - public: - explicit GetDiscreteGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::DiscreteFeatures& { return record.getDiscrete(); }; - ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetStringGroupAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_ids: list(int)") -.Attr("default_value: string") -.Output("names: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns a Dense Tensor with the values of a particular feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_ids: List of ids representing the features whose values will be extracted. - default_value: default_value to be inputted if the values are missing from the current DataRecord. 
-Outputs - names: A Tensor corresponding to the values of the feature_ids across multiple DataRecords -)doc"); - -class GetStringGroupAsTensor : public OpKernel { - private: - std::vector feature_ids; - string default_value; - - public: - explicit GetStringGroupAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_ids", &feature_ids)); - OP_REQUIRES_OK(context, context->GetAttr("default_value", &default_value)); - } - - void Compute(OpKernelContext* context) override { - try { - std::function f = - [](const twml::DataRecord& record) ->const twml::DataRecord::StringFeatures& { return record.getString(); }; - ComputeHelperGroupFeaturesAsTensors(context, feature_ids, default_value, f); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetSparseBinaryAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Output("ids: int64") -.Output("keys: int64") -.Output("names: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns tensors corresponding to the ids, keys and names of a particular -feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - names: DataRecord values(string) -)doc"); -class GetSparseBinaryAsTensor : public OpKernel { - private: - int64 feature_id; - - public: - explicit GetSparseBinaryAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - } - - void Compute(OpKernelContext* context) override { - try { - // We need two passes to the data: - // 1 to compute the output size of the tensor - // 2 to copy the values to the tensor - auto handle = getHandle(context, 0); - const auto &records = handle->records; - - // Creating a vector we increment every time a key is found - std::vector temp_names; - std::vector temp_ids; - - for (int64 id = 0; id < records.size(); id++) { - const auto &sparse_binary = records[id].getSparseBinary(); - auto it = sparse_binary.find(feature_id); - // Find all instances of key in DataRecord - if (it != sparse_binary.end()) { - // insert to temp_names all the values in the dictionary value - temp_names.insert(temp_names.end(), it->second.begin(), it->second.end()); - temp_ids.insert(temp_ids.end(), it->second.size(), id); - } - } - - // The total_size will be the that of the saved vector - const int total_size = static_cast(temp_names.size()); - TensorShape shape = {total_size}; - Tensor* ids = nullptr; - Tensor* keys = nullptr; - Tensor* names = nullptr; - - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names)); - - auto keys_flat = keys->flat(); - auto names_flat = names->flat(); - auto ids_flat = ids->flat(); - - // The feature id value will always be the same - std::fill(keys_flat.data(), keys_flat.data() + total_size, feature_id); - std::copy(temp_names.begin(), temp_names.end(), names_flat.data()); - std::copy(temp_ids.begin(), temp_ids.end(), ids_flat.data()); - } catch (const std::exception &e) { - 
context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetSparseContinuousAsTensor") -.Input("data_record_handle: resource") -.Attr("feature_id: int") -.Output("ids: int64") -.Output("keys: int64") -.Output("names: string") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns tensors corresponding to the ids, keys, names and values of a particular -feature_id. -Input - data_record_handle: Resource handle to DataRecord -Attr - feature_id: Id representing the feature whose values will be extracted. -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) - keys: DataRecord keys (int64) - names: DataRecord values(string) - values: DataRecord values(float) -)doc"); -class GetSparseContinuousAsTensor : public OpKernel { - private: - int64 feature_id; - - public: - explicit GetSparseContinuousAsTensor(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - } - - void Compute(OpKernelContext* context) override { - try { - // We need two passes to the data: - // 1 to compute the output size of the tensor - // 2 to copy the values to the tensor - auto handle = getHandle(context, 0); - const auto &records = handle->records; - - // Creating a vector we increment every time a key is found - std::vector temp_names; - std::vector temp_values; - std::vector temp_ids; - - for (int64 id = 0; id < records.size(); id++) { - const auto &sparse_continuous = records[id].getSparseContinuous(); - auto it = sparse_continuous.find(feature_id); - // Find all instances of key in DataRecord - if (it != sparse_continuous.end()) { - // insert to temp_names all the values in the dictionary value - auto value_map = it->second; - for (auto& elem : value_map) { - temp_names.push_back(elem.first); - temp_values.push_back(elem.second); - temp_ids.push_back(id); - } - } - } - - // The total_size will be the that of the saved vector - const int total_size = static_cast(temp_names.size()); - TensorShape shape = {total_size}; - Tensor* ids = nullptr; - Tensor* keys = nullptr; - Tensor* names = nullptr; - Tensor* values = nullptr; - - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &names)); - OP_REQUIRES_OK(context, context->allocate_output(3, shape, &values)); - - auto keys_flat = keys->flat(); - auto names_flat = names->flat(); - auto ids_flat = ids->flat(); - auto values_flat = values->flat(); - - // The feature id value will always be the same - std::fill(keys_flat.data(), keys_flat.data() + total_size, feature_id); - std::copy(temp_names.begin(), temp_names.end(), names_flat.data()); - std::copy(temp_ids.begin(), temp_ids.end(), ids_flat.data()); - std::copy(temp_values.begin(), temp_values.end(), values_flat.data()); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -// Helper function to add ids, keys and values to common vector -inline void addIdsKeysValuesToVectors( - const int64 id, - const int64 key, - const double value, - std::vector& ids, - std::vector& keys, - std::vector& values) { - ids.push_back(id); - keys.push_back(key); - values.push_back(value); -} - -struct KeepFeatures { - KeepFeatures() : vec(), set() {} - template - KeepFeatures(const std::vector 
&keep_features, - const ContainerType *const container) { - vec.reserve(keep_features.size()); -#ifdef USE_DENSE_HASH - set.resize(keep_features.size()); - set.set_empty_key(0); -#else - set.reserve(keep_features.size()); -#endif // USE_DENSE_HASH - set.max_load_factor(0.5); - for (const auto &elem : keep_features) { - if (container->find(elem) == container->end()) continue; - vec.push_back(elem); - set.insert(elem); - } - } - size_t size() const { - return vec.size(); - } - std::vector vec; - twml::Set set; -}; - -// Helper Function to Filter and Hash Feature for Binary Features -void filterAndHashFeature( - const twml::DataRecord::BinaryFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - addIdsKeysValuesToVectors(current_id, *iter, 1, ids, keys, values); - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem) == keep_features.set.end()) continue; - addIdsKeysValuesToVectors(current_id, elem, 1, ids, keys, values); - } - } -} - -// Helper Function to Filter and Hash Feature for Continuous Features -void filterAndHashFeature( - const twml::DataRecord::ContinuousFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - addIdsKeysValuesToVectors(current_id, iter->first, iter->second, ids, keys, values); - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - addIdsKeysValuesToVectors(current_id, elem.first, elem.second, ids, keys, values); - } - } -} - -// Helper Function to Filter and Hash Feature for Discrete Features -void filterAndHashFeature( - const twml::DataRecord::DiscreteFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - int64_t key = twml::mixDiscreteIdAndValue(iter->first, iter->second); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - int64_t key = twml::mixDiscreteIdAndValue(elem.first, elem.second); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } -} - -// Helper Function to Filter and Hash Feature for String Features -void filterAndHashFeature( - const twml::DataRecord::StringFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - int64_t key = twml::mixStringIdAndValue( - iter->first, - iter->second.size(), - reinterpret_cast(iter->second.c_str())); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } 
else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - int64_t key = twml::mixStringIdAndValue( - elem.first, - elem.second.size(), - reinterpret_cast(elem.second.c_str())); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } -} - -// Helper Function to Filter and Hash Feature for Sparse Binary Features -void filterAndHashFeature( - const twml::DataRecord::SparseBinaryFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - for (const auto &name : iter->second) { - int64_t key = twml::mixStringIdAndValue(iter->first, name.size(), - reinterpret_cast(name.c_str())); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - for (const auto &name : elem.second) { - int64_t key = twml::mixStringIdAndValue(elem.first, name.size(), - reinterpret_cast(name.c_str())); - addIdsKeysValuesToVectors(current_id, key, 1, ids, keys, values); - } - } - } -} - -// Helper Function to Filter and Hash Feature for Sparse Continuous Features -void filterAndHashFeature( - const twml::DataRecord::SparseContinuousFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - for (const auto &map : iter->second) { - int64_t key = twml::mixStringIdAndValue( - iter->first, - map.first.size(), - reinterpret_cast(map.first.c_str())); - addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values); - } - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - for (const auto &map : elem.second) { - int64_t key = twml::mixStringIdAndValue( - elem.first, - map.first.size(), - reinterpret_cast(map.first.c_str())); - addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values); - } - } - } -} - -// Helper Function to Filter and Hash Feature for Sparse Continuous Features -void filterAndHashFeatureCompat( - const twml::DataRecord::SparseContinuousFeatures& features, - const int64 current_id, - const KeepFeatures &keep_features, - std::vector& ids, - std::vector& keys, - std::vector& values) { - if (keep_features.size() < 2 * features.size()) { - for (const auto &f : keep_features.vec) { - const auto &iter = features.find(f); - if (iter == features.end()) continue; - for (const auto &map : iter->second) { - int64_t key = twml::featureId(map.first); - addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values); - } - } - } else { - for (const auto &elem : features) { - if (keep_features.set.find(elem.first) == keep_features.set.end()) continue; - for (const auto &map : elem.second) { - int64_t key = twml::featureId(map.first); - addIdsKeysValuesToVectors(current_id, key, map.second, ids, keys, values); - } - } - } -} - -void copy_if_exists(std::vector& out, - const std::vector& in, - const twml::Map *const map) { - out.reserve(in.size()); - for (const 
auto &elem : in) { - if (map->find(elem) == map->end()) continue; - out.push_back(elem); - } -} - -void ComputeHashedFeaturesAsTensor(OpKernelContext* context, - const DataRecordResource *const handle, - const KeepFeatures &binary_keep_features, - const KeepFeatures &continuous_keep_features, - const KeepFeatures &discrete_keep_features, - const KeepFeatures &string_keep_features, - const KeepFeatures &sparse_binary_keep_features, - const KeepFeatures &sparse_continuous_keep_features, - bool sparse_continuous_compatibility) { - - const auto &records = handle->records; - uint64_t estimated_size = (binary_keep_features.size() + continuous_keep_features.size() + - discrete_keep_features.size() + string_keep_features.size() + - sparse_binary_keep_features.size() + - sparse_continuous_keep_features.size()); - // Construct temporary vectors for common features - std::vector common_ids, common_keys, temp_ids, temp_keys; - std::vector common_values, temp_values; - common_ids.reserve(estimated_size); - common_keys.reserve(estimated_size); - common_values.reserve(estimated_size); - - const auto &common_binary = handle->common.getBinary(); - const auto &common_continuous = handle->common.getContinuous(); - const auto &common_discrete = handle->common.getDiscrete(); - const auto &common_string = handle->common.getString(); - const auto &common_sparse_binary = handle->common.getSparseBinary(); - const auto &common_sparse_continuous = handle->common.getSparseContinuous(); - - filterAndHashFeature(common_binary, 0, binary_keep_features, - common_ids, common_keys, common_values); - filterAndHashFeature(common_continuous, 0, continuous_keep_features, - common_ids, common_keys, common_values); - filterAndHashFeature(common_discrete, 0, discrete_keep_features, - common_ids, common_keys, common_values); - filterAndHashFeature(common_string, 0, string_keep_features, - common_ids, common_keys, common_values); - filterAndHashFeature(common_sparse_binary, 0, sparse_binary_keep_features, - common_ids, common_keys, common_values); - if (sparse_continuous_compatibility) { - filterAndHashFeatureCompat(common_sparse_continuous, 0, sparse_continuous_keep_features, - common_ids, common_keys, common_values); - } else { - filterAndHashFeature(common_sparse_continuous, 0, sparse_continuous_keep_features, - common_ids, common_keys, common_values); - } - common_ids.clear(); - // Construct temporary vectors for all features - estimated_size = (estimated_size + common_keys.size()) * records.size(); - temp_ids.reserve(estimated_size); - temp_keys.reserve(estimated_size); - temp_values.reserve(estimated_size); - - for (int64 id = 0; id < records.size(); id++) { - temp_ids.insert(temp_ids.end(), common_keys.size(), id); - temp_keys.insert(temp_keys.end(), common_keys.begin(), common_keys.end()); - temp_values.insert(temp_values.end(), common_values.begin(), common_values.end()); - const auto &binary = records[id].getBinary(); - const auto &continuous = records[id].getContinuous(); - const auto &discrete = records[id].getDiscrete(); - const auto &str = records[id].getString(); - const auto &sparse_binary = records[id].getSparseBinary(); - const auto &sparse_continuous = records[id].getSparseContinuous(); - - filterAndHashFeature(binary, id, binary_keep_features, - temp_ids, temp_keys, temp_values); - filterAndHashFeature(continuous, id, continuous_keep_features, - temp_ids, temp_keys, temp_values); - filterAndHashFeature(discrete, id, discrete_keep_features, - temp_ids, temp_keys, temp_values); - filterAndHashFeature(str, id, 
string_keep_features, - temp_ids, temp_keys, temp_values); - filterAndHashFeature(sparse_binary, id, sparse_binary_keep_features, - temp_ids, temp_keys, temp_values); - if (sparse_continuous_compatibility) { - filterAndHashFeatureCompat(sparse_continuous, id, sparse_continuous_keep_features, - temp_ids, temp_keys, temp_values); - } else { - filterAndHashFeature(sparse_continuous, id, sparse_continuous_keep_features, - temp_ids, temp_keys, temp_values); - } - } - - // Copy the temporary vectors into the output Tensors - TensorShape shape = {static_cast(temp_ids.size())}; - Tensor* ids = nullptr; - Tensor* keys = nullptr; - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape, &values)); - auto ids_flat = ids->flat(); - auto keys_flat = keys->flat(); - auto values_flat = values->flat(); - std::copy(temp_ids.begin(), temp_ids.end(), ids_flat.data()); - std::copy(temp_keys.begin(), temp_keys.end(), keys_flat.data()); - std::copy(temp_values.begin(), temp_values.end(), values_flat.data()); -} - -REGISTER_OP("GetHashedFeaturesAsSparseTensor") -.Input("data_record_handle: resource") -.Attr("binary_keep_features: list(int)") -.Attr("continuous_keep_features: list(int)") -.Attr("discrete_keep_features: list(int)") -.Attr("string_keep_features: list(int)") -.Attr("sparse_binary_keep_features: list(int)") -.Attr("sparse_continuous_keep_features: list(int)") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); -}).Doc(R"doc( -A tensorflow OP for returning required features of different type as -a single sparse tensor. Hashing trick is applied. - -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records in the batch (int64) - keys: DataRecord keys (int64) - values: DataRecord values (float) -)doc"); - -class GetHashedFeaturesAsSparseTensor: public OpKernel { - public: - explicit GetHashedFeaturesAsSparseTensor(OpKernelConstruction* context): OpKernel(context) { - // Get the list of features to keep for each feature type - OP_REQUIRES_OK(context, context->GetAttr("binary_keep_features", &binary_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("continuous_keep_features", &continuous_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("discrete_keep_features", &discrete_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("string_keep_features", &string_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("sparse_binary_keep_features", &sparse_binary_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("sparse_continuous_keep_features", &sparse_continuous_keep_features_)); - } - - private: - std::vector binary_keep_features_, continuous_keep_features_, discrete_keep_features_; - std::vector string_keep_features_, sparse_binary_keep_features_, sparse_continuous_keep_features_; - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - // Create a new list of keep features based on the original keep_set. - // This is to ensure compatibility with existing behavior such as: - // - Ensure no new features are decoded in this op. - // - Ensure labels or weights dont get included here. - // TODO: Should we return features requested by user here even if they are labels / weights? 
- KeepFeatures binary_keep_features(binary_keep_features_, handle->keep_map); - KeepFeatures continuous_keep_features(continuous_keep_features_, handle->keep_map); - KeepFeatures discrete_keep_features(discrete_keep_features_, handle->keep_map); - KeepFeatures string_keep_features(string_keep_features_, handle->keep_map); - KeepFeatures sparse_binary_keep_features(sparse_binary_keep_features_, handle->keep_map); - KeepFeatures sparse_continuous_keep_features(sparse_continuous_keep_features_, handle->keep_map); - ComputeHashedFeaturesAsTensor(context, handle.get(), - binary_keep_features, - continuous_keep_features, - discrete_keep_features, - string_keep_features, - sparse_binary_keep_features, - sparse_continuous_keep_features, - false); - } catch(const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetHashedFeaturesAsSparseTensorV2") -.Input("data_record_handle: resource") -.Attr("binary_keep_features: list(int)") -.Attr("continuous_keep_features: list(int)") -.Attr("discrete_keep_features: list(int)") -.Attr("string_keep_features: list(int)") -.Attr("sparse_binary_keep_features: list(int)") -.Attr("sparse_continuous_keep_features: list(int)") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("decode_mode: int = 0") -.Output("ids: int64") -.Output("keys: int64") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); -}).Doc(R"doc( -A tensorflow OP for returning required features of different type as -a single sparse tensor. Hashing trick is applied. - -Input - data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records in the batch (int64) - keys: DataRecord keys (int64) - values: DataRecord values (float) -)doc"); - -class GetHashedFeaturesAsSparseTensorV2: public OpKernel { - public: - explicit GetHashedFeaturesAsSparseTensorV2(OpKernelConstruction* context): OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - std::vector binary_keep_features_, continuous_keep_features_, discrete_keep_features_; - std::vector string_keep_features_, sparse_binary_keep_features_, sparse_continuous_keep_features_; - - // Get the list of features to keep for each feature type - OP_REQUIRES_OK(context, context->GetAttr("binary_keep_features", &binary_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("continuous_keep_features", &continuous_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("discrete_keep_features", &discrete_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("string_keep_features", &string_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("sparse_binary_keep_features", &sparse_binary_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("sparse_continuous_keep_features", &sparse_continuous_keep_features_)); - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode)); - - twml::Map keep_map; -#ifdef USE_DENSE_HASH - keep_map.set_empty_key(0); -#endif // USE_DENSE_HASH - for (uint64_t i = 0; i < keep_features.size(); i++) { - keep_map[keep_features[i]] = keep_codes[i]; - } - - - binary_keep_features = KeepFeatures(binary_keep_features_, &keep_map); - continuous_keep_features = KeepFeatures(continuous_keep_features_, &keep_map); - discrete_keep_features = 
KeepFeatures(discrete_keep_features_, &keep_map); - string_keep_features = KeepFeatures(string_keep_features_, &keep_map); - sparse_binary_keep_features = KeepFeatures(sparse_binary_keep_features_, &keep_map); - sparse_continuous_keep_features = KeepFeatures(sparse_continuous_keep_features_, &keep_map); - - } - - private: - KeepFeatures binary_keep_features, continuous_keep_features, discrete_keep_features; - KeepFeatures string_keep_features, sparse_binary_keep_features, sparse_continuous_keep_features; - int64 m_decode_mode; - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - // Create a new list of keep features based on the original keep_set. - // This is to ensure compatibility with existing behavior such as: - // - Ensure no new features are decoded in this op. - // - Ensure labels or weights dont get included here. - // TODO: Should we return features requested by user here even if they are labels / weights? - ComputeHashedFeaturesAsTensor(context, handle.get(), - binary_keep_features, - continuous_keep_features, - discrete_keep_features, - string_keep_features, - sparse_binary_keep_features, - sparse_continuous_keep_features, - m_decode_mode == 0); - } catch(const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - - -#define REGISTER_DECODE_DATA_RECORD(InputType) \ - REGISTER_KERNEL_BUILDER( \ - Name("DecodeDataRecord") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("InputType"), \ - DecodeDataRecord); \ - -REGISTER_DECODE_DATA_RECORD(uint8) -REGISTER_DECODE_DATA_RECORD(string) - -#define REGISTER_GETTER(FIELD) \ - REGISTER_KERNEL_BUILDER( \ - Name("Get" #FIELD "Features") \ - .Device(DEVICE_CPU), \ - Get##FIELD##Features); \ - -#define REGISTER_GETTER_FROM_DR(FIELD) \ - REGISTER_KERNEL_BUILDER( \ - Name("Get" #FIELD "FromDataRecord") \ - .Device(DEVICE_CPU), \ - Get##FIELD##FromDataRecord); \ - -#define REGISTER_GETTER_AS_TENSOR(FIELD) \ - REGISTER_KERNEL_BUILDER( \ - Name("Get" #FIELD "AsTensor") \ - .Device(DEVICE_CPU), \ - Get##FIELD##AsTensor); \ - - -#define REGISTER_GETTER_GROUP_AS_TENSOR(FIELD) \ - REGISTER_KERNEL_BUILDER( \ - Name("Get" #FIELD "GroupAsTensor") \ - .Device(DEVICE_CPU), \ - Get##FIELD##GroupAsTensor); \ - -REGISTER_GETTER(Binary) -REGISTER_GETTER(Continuous) -REGISTER_GETTER(Discrete) -REGISTER_GETTER(String) -REGISTER_GETTER(SparseBinary) -REGISTER_GETTER(SparseContinuous) -REGISTER_GETTER_FROM_DR(BatchSize) -REGISTER_GETTER_FROM_DR(Labels) -REGISTER_GETTER_FROM_DR(Weights) -REGISTER_GETTER_AS_TENSOR(Binary) -REGISTER_GETTER_AS_TENSOR(Continuous) -REGISTER_GETTER_AS_TENSOR(Discrete) -REGISTER_GETTER_AS_TENSOR(String) -REGISTER_GETTER_AS_TENSOR(SparseBinary) -REGISTER_GETTER_AS_TENSOR(SparseContinuous) -REGISTER_GETTER_GROUP_AS_TENSOR(Binary) -REGISTER_GETTER_GROUP_AS_TENSOR(Continuous) -REGISTER_GETTER_GROUP_AS_TENSOR(Discrete) -REGISTER_GETTER_GROUP_AS_TENSOR(String) -REGISTER_KERNEL_BUILDER( - Name("GetHashedFeaturesAsSparseTensor") - .Device(DEVICE_CPU), - GetHashedFeaturesAsSparseTensor); -REGISTER_KERNEL_BUILDER( - Name("GetHashedFeaturesAsSparseTensorV2") - .Device(DEVICE_CPU), - GetHashedFeaturesAsSparseTensorV2); diff --git a/twml/libtwml/src/ops/data_record_tensor_writer.cpp b/twml/libtwml/src/ops/data_record_tensor_writer.cpp deleted file mode 100644 index 9368c870e..000000000 --- a/twml/libtwml/src/ops/data_record_tensor_writer.cpp +++ /dev/null @@ -1,81 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include 
"tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("DataRecordTensorWriter") -.Attr("T: list({string, int32, int64, float, double, bool})") -.Input("keys: int64") -.Input("values: T") -.Output("result: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that packages keys and dense tensors into a DataRecord. - -values: list of tensors -keys: feature ids from the original DataRecord (int64) - -Outputs - bytes: output DataRecord serialized using Thrift into a uint8 tensor. -)doc"); - -class DataRecordTensorWriter : public OpKernel { - public: - explicit DataRecordTensorWriter(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - const Tensor& keys = context->input(0); - - try { - // set keys as twml::Tensor - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - - // check sizes - uint64_t num_keys = in_keys_.getNumElements(); - uint64_t num_values = context->num_inputs() - 1; - - OP_REQUIRES(context, num_keys == num_values, - errors::InvalidArgument("Number of dense keys and dense tensors do not match")); - - // populate DataRecord object - const int64_t *keys = in_keys_.getData(); - twml::DataRecord record = twml::DataRecord(); - - for (int i = 1; i < context->num_inputs(); i++) { - const twml::RawTensor& value = TFTensor_to_twml_raw_tensor(context->input(i)); - record.addRawTensor(keys[i-1], value); - } - - // determine the length of the encoded result (no memory is copied) - twml::ThriftWriter thrift_dry_writer = twml::ThriftWriter(nullptr, 0, true); - twml::DataRecordWriter record_dry_writer = twml::DataRecordWriter(thrift_dry_writer); - record_dry_writer.write(record); - int len = thrift_dry_writer.getBytesWritten(); - TensorShape result_shape = {1, len}; - - // allocate output tensor - Tensor* result = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, result_shape, &result)); - twml::Tensor out_result = TFTensor_to_twml_tensor(*result); - - // write to output tensor - uint8_t *buffer = out_result.getData(); - twml::ThriftWriter thrift_writer = twml::ThriftWriter(buffer, len, false); - twml::DataRecordWriter record_writer = twml::DataRecordWriter(thrift_writer); - record_writer.write(record); - } catch(const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("DataRecordTensorWriter").Device(DEVICE_CPU), - DataRecordTensorWriter); diff --git a/twml/libtwml/src/ops/discretizer.cpp b/twml/libtwml/src/ops/discretizer.cpp deleted file mode 100644 index 10d1b3c78..000000000 --- a/twml/libtwml/src/ops/discretizer.cpp +++ /dev/null @@ -1,293 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - - -void ComputeDiscretizers(OpKernelContext* context, const bool return_bin_indices = false) { - const Tensor& keys = context->input(0); - const Tensor& vals = context->input(1); - const Tensor& bin_ids = context->input(2); - const Tensor& bin_vals = context->input(3); - const Tensor& feature_offsets = context->input(4); - - Tensor* new_keys = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, keys.shape(), - &new_keys)); - Tensor* new_vals = 
nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, keys.shape(), - &new_vals)); - - try { - twml::Tensor out_keys_ = TFTensor_to_twml_tensor(*new_keys); - twml::Tensor out_vals_ = TFTensor_to_twml_tensor(*new_vals); - - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - const twml::Tensor in_vals_ = TFTensor_to_twml_tensor(vals); - const twml::Tensor bin_ids_ = TFTensor_to_twml_tensor(bin_ids); - const twml::Tensor bin_vals_ = TFTensor_to_twml_tensor(bin_vals); - const twml::Tensor feature_offsets_ = TFTensor_to_twml_tensor(feature_offsets); - twml::mdlInfer(out_keys_, out_vals_, - in_keys_, in_vals_, - bin_ids_, bin_vals_, - feature_offsets_, - return_bin_indices); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } -} - -REGISTER_OP("MDL") -.Attr("T: {float, double}") -.Input("keys: int64") -.Input("vals: T") -.Input("bin_ids: int64") -.Input("bin_vals: T") -.Input("feature_offsets: int64") -.Output("new_keys: int64") -.Output("new_vals: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // TODO: check sizes - c->set_output(0, c->input(0)); - c->set_output(1, c->input(0)); - return Status::OK(); -}).Doc(R"doc( - -This operation discretizes a tensor containing continuous features. - -Input - keys: A tensor containing feature ids. - vals: A tensor containing values at corresponding feature ids. - bin_ids: A tensor containing the discretized feature id for a given bin. - bin_vals: A tensor containing the bin boundaries for value at a given feature id. - feature_offsets: Specifies the starting location of bins for a given feature id. - -Expected Sizes: - keys, vals: [N]. - bin_ids, bin_vals: [sum_{n=1}^{n=num_classes} num_bins(n)] - - where - - N is the number of sparse features in the current batch. - - [0, num_classes) represents the range each feature id can take. - - num_bins(n) is the number of bins for a given feature id. - - If num_bins is fixed, then xs, ys are of size [num_classes * num_bins]. - -Expected Types: - keys, bin_ids: int64. - vals: float or double. - bin_vals: same as vals. - -Before using MDL, you should use a hashmap to get the intersection of -input `keys` with the features that MDL knows about: -:: - keys, vals # keys can be in range [0, 1 << 63) - mdl_keys = hashmap.find(keys) # mdl_keys are now in range [0, num_classes_from_calibration) - mdl_keys = where (mdl_keys != -1) # Ignore keys not found - - -Inside MDL, the following is happening: -:: - start = offsets[key[i]] - end = offsets[key[i] + 1] - idx = binary_search for val[i] in [bin_vals[start], bin_vals[end]] - - result_keys[i] = bin_ids[idx] - val[i] = 1 # binary feature value - -Outputs - new_keys: The discretized feature ids with same shape and size as keys. - new_vals: The discretized values with the same shape and size as vals. 
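For illustration, the bucket lookup described above can be written as the following standalone sketch (plain C++ with std::lower_bound; the function name, signature, and the clamp to the last bin are assumptions of this sketch, not the twml implementation):
::
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Standalone sketch of the per-feature bucket lookup described above.
    // Assumes feature_offsets has num_classes + 1 entries, so the bins for
    // feature f live in bin_vals[feature_offsets[f], feature_offsets[f + 1]).
    void discretize(const std::vector<int64_t> &keys,
                    const std::vector<double> &vals,
                    const std::vector<int64_t> &bin_ids,
                    const std::vector<double> &bin_vals,
                    const std::vector<int64_t> &feature_offsets,
                    std::vector<int64_t> &new_keys,
                    std::vector<double> &new_vals) {
      new_keys.resize(keys.size());
      new_vals.resize(vals.size());
      for (std::size_t i = 0; i < keys.size(); ++i) {
        const int64_t start = feature_offsets[keys[i]];
        const int64_t end = feature_offsets[keys[i] + 1];
        // Index of the first bin boundary >= vals[i], clamped to the last bin
        // of this feature (the clamp is an assumption of this sketch).
        const int64_t idx = std::min<int64_t>(
            std::lower_bound(bin_vals.begin() + start,
                             bin_vals.begin() + end, vals[i]) - bin_vals.begin(),
            end - 1);
        new_keys[i] = bin_ids[idx];  // discretized feature id
        new_vals[i] = 1.0;           // binary feature value
      }
    }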
- -)doc"); - - -template -class MDL : public OpKernel { - public: - explicit MDL(OpKernelConstruction* context) : OpKernel(context) { - } - - void Compute(OpKernelContext* context) override { - ComputeDiscretizers(context); - } -}; - -REGISTER_OP("PercentileDiscretizer") -.Attr("T: {float, double}") -.Input("keys: int64") -.Input("vals: T") -.Input("bin_ids: int64") -.Input("bin_vals: T") -.Input("feature_offsets: int64") -.Output("new_keys: int64") -.Output("new_vals: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // TODO: check sizes - c->set_output(0, c->input(0)); - c->set_output(1, c->input(0)); - return Status::OK(); -}).Doc(R"doc( - -This operation discretizes a tensor containing continuous features. - -Input - keys: A tensor containing feature ids. - vals: A tensor containing values at corresponding feature ids. - bin_ids: A tensor containing the discretized feature id for a given bin. - bin_vals: A tensor containing the bin boundaries for value at a given feature id. - feature_offsets: Specifies the starting location of bins for a given feature id. - -Expected Sizes: - keys, vals: [N]. - bin_ids, bin_vals: [sum_{n=1}^{n=num_classes} num_bins(n)] - - where - - N is the number of sparse features in the current batch. - - [0, num_classes) represents the range each feature id can take. - - num_bins(n) is the number of bins for a given feature id. - - If num_bins is fixed, then xs, ys are of size [num_classes * num_bins]. - -Expected Types: - keys, bin_ids: int64. - vals: float or double. - bin_vals: same as vals. - -Before using PercentileDiscretizer, you should use a hashmap to get the intersection of -input `keys` with the features that PercentileDiscretizer knows about: -:: - keys, vals # keys can be in range [0, 1 << 63) - percentile_discretizer_keys = hashmap.find(keys) # percentile_discretizer_keys are now in range [0, num_classes_from_calibration) - percentile_discretizer_keys = where (percentile_discretizer_keys != -1) # Ignore keys not found - - -Inside PercentileDiscretizer, the following is happening: -:: - start = offsets[key[i]] - end = offsets[key[i] + 1] - idx = binary_search for val[i] in [bin_vals[start], bin_vals[end]] - - result_keys[i] = bin_ids[idx] - val[i] = 1 # binary feature value - -Outputs - new_keys: The discretized feature ids with same shape and size as keys. - new_vals: The discretized values with the same shape and size as vals. - -)doc"); - -template -class PercentileDiscretizer : public OpKernel { - public: - explicit PercentileDiscretizer(OpKernelConstruction* context) : OpKernel(context) { - } - - void Compute(OpKernelContext* context) override { - ComputeDiscretizers(context); - } -}; - - -REGISTER_OP("PercentileDiscretizerBinIndices") -.Attr("T: {float, double}") -.Input("keys: int64") -.Input("vals: T") -.Input("bin_ids: int64") -.Input("bin_vals: T") -.Input("feature_offsets: int64") -.Output("new_keys: int64") -.Output("new_vals: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // TODO: check sizes - c->set_output(0, c->input(0)); - c->set_output(1, c->input(0)); - return Status::OK(); -}).Doc(R"doc( - -This operation discretizes a tensor containing continuous features. -If the feature id and bin id of the discretized value is the same on multiple runs, they -will always be assigned to the same output key and value, regardless of the bin_id assigned during -calibration. - -Input - keys: A tensor containing feature ids. - vals: A tensor containing values at corresponding feature ids. 
- bin_ids: A tensor containing the discretized feature id for a given bin. - bin_vals: A tensor containing the bin boundaries for value at a given feature id. - feature_offsets: Specifies the starting location of bins for a given feature id. - -Expected Sizes: - keys, vals: [N]. - bin_ids, bin_vals: [sum_{n=1}^{n=num_classes} num_bins(n)] - - where - - N is the number of sparse features in the current batch. - - [0, num_classes) represents the range each feature id can take. - - num_bins(n) is the number of bins for a given feature id. - - If num_bins is fixed, then xs, ys are of size [num_classes * num_bins]. - -Expected Types: - keys, bin_ids: int64. - vals: float or double. - bin_vals: same as vals. - -Before using PercentileDiscretizerBinIndices, you should use a hashmap to get the intersection of -input `keys` with the features that PercentileDiscretizerBinIndices knows about: -:: - keys, vals # keys can be in range [0, 1 << 63) - percentile_discretizer_keys = hashmap.find(keys) # percentile_discretizer_keys are now in range [0, num_classes_from_calibration) - percentile_discretizer_keys = where (percentile_discretizer_keys != -1) # Ignore keys not found - - -Inside PercentileDiscretizerBinIndices, the following is happening: -:: - start = offsets[key[i]] - end = offsets[key[i] + 1] - idx = binary_search for val[i] in [bin_vals[start], bin_vals[end]] - - result_keys[i] = bin_ids[idx] - val[i] = 1 # binary feature value - -Outputs - new_keys: The discretized feature ids with same shape and size as keys. - new_vals: The discretized values with the same shape and size as vals. - -)doc"); - -template -class PercentileDiscretizerBinIndices : public OpKernel { - public: - explicit PercentileDiscretizerBinIndices(OpKernelConstruction* context) : OpKernel(context) { - } - - void Compute(OpKernelContext* context) override { - ComputeDiscretizers(context, true); - } -}; - - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("PercentileDiscretizerBinIndices") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - PercentileDiscretizerBinIndices); \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("PercentileDiscretizer") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - PercentileDiscretizer); \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("MDL") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - MDL); \ - -REGISTER(float); -REGISTER(double); diff --git a/twml/libtwml/src/ops/feature_extractor.cpp b/twml/libtwml/src/ops/feature_extractor.cpp deleted file mode 100644 index 9e0910bae..000000000 --- a/twml/libtwml/src/ops/feature_extractor.cpp +++ /dev/null @@ -1,134 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include -#include - -REGISTER_OP("FeatureExtractor") -.Attr("T: {float, double} = DT_FLOAT") -.Input("mask_in: bool") -.Input("ids_in: int64") -.Input("keys_in: int64") -.Input("values_in: T") -.Input("codes_in: int64") -.Input("types_in: int8") -.Output("ids_out: int64") -.Output("keys_out: int64") -.Output("values_out: T") -.Output("codes_out: int64") -.Output("types_out: int8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that extracts the desired indices of a Tensor based on a mask - -Input - mask_in: boolean Tensor that determines which are the indices to be kept (bool) - ids_in: input indices Tensor (int64) - keys_in: input keys Tensor (int64) 
- values_in: input values Tensor (float/double) - codes_in: input codes Tensor (int64) - types_in: input types Tensor(int8) - -Outputs - ids_out: output indices Tensor (int64) - keys_out: output keys Tensor (int64) - values_out: output values Tensor (float/double) - codes_out: output codes Tensor (int64) - types_out: output types Tensor(int8) - -)doc"); -template -class FeatureExtractor : public OpKernel { - public: - explicit FeatureExtractor(OpKernelConstruction* context) - : OpKernel(context) {} - - template - bool allequal(const A &t, const U &u) { - return t == u; - } - - template - bool allequal(const A &t, const U &u, Others const &... args) { - return (t == u) && allequal(u, args...); - } - - void Compute(OpKernelContext* context) override { - // Get input tensors - const Tensor& input_mask = context->input(0); - const Tensor& input_ids = context->input(1); - const Tensor& input_keys = context->input(2); - const Tensor& input_values = context->input(3); - const Tensor& input_codes = context->input(4); - const Tensor& input_types = context->input(5); - - auto mask = input_mask.flat(); - auto ids = input_ids.flat(); - auto keys = input_keys.flat(); - auto codes = input_codes.flat(); - auto values = input_values.flat(); - auto types = input_types.flat(); - - // Verify that all Tensors have the same size. - OP_REQUIRES(context, allequal(mask.size(), ids.size(), keys.size(), codes.size(), values.size(), types.size()), - errors::InvalidArgument("all input vectors must be the same size.")); - - // Get the size of the output vectors by counting the numbers of trues. - int total_size = 0; - for (int i = 0; i < mask.size(); i++) { - if (mask(i)) - total_size += 1; - } - - // Shape is the number of Trues in the mask Eigen::Tensor - TensorShape shape_out = {total_size}; - - // Create the output tensors - Tensor* output_codes = nullptr; - Tensor* output_ids = nullptr; - Tensor* output_values = nullptr; - Tensor* output_types = nullptr; - Tensor* output_keys = nullptr; - - OP_REQUIRES_OK(context, context->allocate_output(0, shape_out, &output_ids)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape_out, &output_keys)); - OP_REQUIRES_OK(context, context->allocate_output(2, shape_out, &output_values)); - OP_REQUIRES_OK(context, context->allocate_output(3, shape_out, &output_codes)); - OP_REQUIRES_OK(context, context->allocate_output(4, shape_out, &output_types)); - - auto output_ids_ = output_ids->flat(); - auto output_keys_ = output_keys->flat(); - auto output_codes_ = output_codes->flat(); - auto output_values_ = output_values->flat(); - auto output_types_ = output_types->flat(); - - // Iterate through the mask and set values to output Eigen::Tensors - int j = 0; - for (int i = 0; i < mask.size(); i++) { - if (mask(i)) { - output_ids_(j) = ids(i); - output_keys_(j) = keys(i); - output_values_(j) = values(i); - output_codes_(j) = codes(i); - output_types_(j) = types(i); - ++j; - } - } - } -}; - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("FeatureExtractor") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - FeatureExtractor); \ - -REGISTER(float); -REGISTER(double); diff --git a/twml/libtwml/src/ops/feature_id.cpp b/twml/libtwml/src/ops/feature_id.cpp deleted file mode 100644 index 150b5614c..000000000 --- a/twml/libtwml/src/ops/feature_id.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" 
- -using namespace tensorflow; - -REGISTER_OP("FeatureId") -.Attr("feature_names: list(string)") -.Output("output: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that hashes a list of strings into int64. This is used for feature name hashing. - -Attr - feature_names: a list of string feature names (list(string)). - -Outputs - ouput: hashes corresponding to the string feature names (int64). -)doc"); - - -class FeatureId : public OpKernel { - private: - std::vector input_vector; - - public: - explicit FeatureId(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_names", &input_vector)); - } - - void Compute(OpKernelContext* context) override { - // Get size of the input_vector and create TensorShape shape - const int total_size = static_cast(input_vector.size()); - TensorShape shape = {total_size}; - - // Create an output tensor - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, - &output_tensor)); - auto output_flat = output_tensor->flat(); - - // Transform the input tensor into a int64 - for (int i = 0; i < total_size; i++) { - output_flat(i) = twml::featureId(input_vector[i]); - } - } -}; - - -REGISTER_KERNEL_BUILDER( - Name("FeatureId") - .Device(DEVICE_CPU), - FeatureId); diff --git a/twml/libtwml/src/ops/feature_mask.cpp b/twml/libtwml/src/ops/feature_mask.cpp deleted file mode 100644 index fc1498413..000000000 --- a/twml/libtwml/src/ops/feature_mask.cpp +++ /dev/null @@ -1,83 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include -#include -#include - -REGISTER_OP("FeatureMask") -.Attr("T: {int64, int8}") -.Input("keep: T") -.Attr("list_keep: list(int)") -.Output("mask: bool") - -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP that creates a mask of the indices that should be kept. - -Attribute -list_keep: list of values which should be kept(list(int)) - -Input - keep: Tensor for which we will apply the mask (int64, int8) - -Outputs - mask: boolean Tensor. (bool) - -)doc"); -template -class FeatureMask : public OpKernel { - private: - std::set feature_set_keep; - - public: - explicit FeatureMask(OpKernelConstruction* context) - : OpKernel(context) { - std::vector feature_list_keep; - OP_REQUIRES_OK(context, context->GetAttr("list_keep", &feature_list_keep)); - // create set that contains the content of the feature_list_keep, since tensorflow does not allow - // me to directly ouput the contents of list_keep to a set - feature_set_keep = std::set(feature_list_keep.begin(), feature_list_keep.end()); - } - - void Compute(OpKernelContext* context) override { - // Get size of the input_vector and create TensorShape shape - const Tensor& input = context->input(0); - - auto keep = input.flat(); - - // Create an output tensor - Tensor* output_mask = nullptr; - - // Output shape is determined and now we can copy the contents of the vector to the output Tensor. 
- const int total_size_out = static_cast(keep.size()); - - TensorShape shape_out = {total_size_out}; - - OP_REQUIRES_OK(context, context->allocate_output(0, shape_out, &output_mask)); - - auto output_mask_ = output_mask->flat(); - - // Check if value is in set, output is boolean - for (int j = 0; j < keep.size(); j++){ - output_mask_(j) = (feature_set_keep.count(keep(j))); - } - } -}; - - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("FeatureMask") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - FeatureMask); \ - -REGISTER(int64); -REGISTER(int8); diff --git a/twml/libtwml/src/ops/fixed_length_tensor.cpp b/twml/libtwml/src/ops/fixed_length_tensor.cpp deleted file mode 100644 index 876367ad3..000000000 --- a/twml/libtwml/src/ops/fixed_length_tensor.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -#include -using std::string; - -template -void ComputeFixedLengthTensor(OpKernelContext *context, int64 max_length_) { - try { - const Tensor& segment_ids = context->input(0); - const Tensor& values = context->input(1); - const Tensor& pad_value = context->input(2); - - auto indices_flat = segment_ids.flat(); - auto values_flat = values.flat(); - - auto pad_value_scalar = pad_value.scalar()(); - - // Get maximum length from batch if user hasn't specified it. - int64 max_length = max_length_; - if (max_length < 0 && indices_flat.size() > 0) { - int64 current_id = indices_flat(0); - int64 current_length = 1; - - for (int64 i = 1; i < indices_flat.size(); i++) { - if (current_id == indices_flat(i)) { - current_length++; - } else { - current_id = indices_flat(i); - max_length = std::max(max_length, current_length); - current_length = 1; - } - } - // This is needed if the last batch is the longest sequence. - max_length = std::max(max_length, current_length); - } - - int64 batch_size = 0; - if (calc_batch_size) { - if (indices_flat.size() > 0) { - // The last value of segment_ids will have value batch_size 1; - batch_size = 1 + indices_flat(indices_flat.size() - 1); - } else { - batch_size = 0; - } - } else { - const Tensor& batch_size_tensor = context->input(3); - batch_size = batch_size_tensor.flat()(0); - } - - TensorShape output_shape = {batch_size, max_length}; - Tensor* fixed_length = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &fixed_length)); - - auto fixed_length_flat = fixed_length->flat(); - - int64 n = 0; - int64 offset = 0; - for (int64 i = 0; i < batch_size; i++) { - for (int64 j = 0; j < max_length; j++) { - if (n < indices_flat.size() && indices_flat(n) == i) { - // Copy from variable length tensor. - fixed_length_flat(offset + j) = values_flat(n); - n++; - } else { - // Pad to fixed length. - fixed_length_flat(offset + j) = pad_value_scalar; - } - } - // Corner case: truncate to max_length if user specified max_length < current length. 
- while (n < indices_flat.size() && i == indices_flat(n)) n++; - - // Update output pointer - offset += max_length; - } - } catch (const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } -} - -REGISTER_OP("FixedLengthTensor") -.Attr("IndexType: {int64, int32}") -.Attr("ValueType: {int64, int32, string}") -.Attr("max_length: int") -.Input("segment_ids: IndexType") -.Input("values: ValueType") -.Input("pad_value: ValueType") -.Output("fixed_length: ValueType") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP to convert variable length segments into fixed length tensor. - -Attr - max_length: The size of the inner most (i.e. last) dimension. - -Input - segment_ids: 1D input tensor containing the sorted segment_ids. - values: 1D input tensor containing the values. - pad_value: The value used for padding the fixed length tensor. - -Outputs - fixed_length: A fixed length tensor of size [batch_size, max_length]. -)doc"); - -template -class FixedLengthTensor: public OpKernel { - public: - explicit FixedLengthTensor(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("max_length", &max_length_)); - } - - private: - int64 max_length_; - - void Compute(OpKernelContext *context) override { - ComputeFixedLengthTensor(context, max_length_); - } -}; - -REGISTER_OP("FixedLengthTensorV2") -.Attr("IndexType: {int64, int32}") -.Attr("ValueType: {int64, int32, string}") -.Attr("max_length: int") -.Input("segment_ids: IndexType") -.Input("values: ValueType") -.Input("pad_value: ValueType") -.Input("batch_size: int64") -.Output("fixed_length: ValueType") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( - -A tensorflow OP to convert variable length segments into fixed length tensor. - -Attr - max_length: The size of the inner most (i.e. last) dimension. - -Input - segment_ids: 1D input tensor containing the sorted segment_ids. - values: 1D input tensor containing the values. - pad_value: The value used for padding the fixed length tensor. - batch_size: The batch size to use. - -Outputs - fixed_length: A fixed length tensor of size [batch_size, max_length]. 
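The padding and truncation behaviour described above can be sketched in plain C++ as follows (the names and the flattened row-major [batch_size, max_length] output are illustrative assumptions, not the twml API):
::
    #include <cstdint>
    #include <vector>

    // Sketch: expand sorted (segment_id, value) pairs into a row-major
    // [batch_size, max_length] buffer, padding short rows with pad_value and
    // truncating rows longer than max_length.
    std::vector<int64_t> to_fixed_length(const std::vector<int64_t> &segment_ids,
                                         const std::vector<int64_t> &values,
                                         int64_t batch_size,
                                         int64_t max_length,
                                         int64_t pad_value) {
      std::vector<int64_t> out(batch_size * max_length, pad_value);
      std::size_t n = 0;
      for (int64_t i = 0; i < batch_size; ++i) {
        for (int64_t j = 0;
             j < max_length && n < segment_ids.size() && segment_ids[n] == i;
             ++j) {
          out[i * max_length + j] = values[n++];  // copy this segment's values
        }
        // Drop any values beyond max_length for this segment (truncation).
        while (n < segment_ids.size() && segment_ids[n] == i) ++n;
      }
      return out;
    }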
-)doc"); - -template -class FixedLengthTensorV2: public OpKernel { - public: - explicit FixedLengthTensorV2(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("max_length", &max_length_)); - } - - private: - int64 max_length_; - - void Compute(OpKernelContext *context) override { - ComputeFixedLengthTensor(context, max_length_); - } -}; - -#define REGISTER_SPARSE_TO_FIXED_LENGTH(IndexType, ValueType) \ - REGISTER_KERNEL_BUILDER( \ - Name("FixedLengthTensor") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("IndexType") \ - .TypeConstraint("ValueType"), \ - FixedLengthTensor); \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("FixedLengthTensorV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("IndexType") \ - .TypeConstraint("ValueType"), \ - FixedLengthTensorV2); \ - -REGISTER_SPARSE_TO_FIXED_LENGTH(int64, int64) -REGISTER_SPARSE_TO_FIXED_LENGTH(int64, int32) -REGISTER_SPARSE_TO_FIXED_LENGTH(int64, string) -REGISTER_SPARSE_TO_FIXED_LENGTH(int32, int64) -REGISTER_SPARSE_TO_FIXED_LENGTH(int32, int32) -REGISTER_SPARSE_TO_FIXED_LENGTH(int32, string) diff --git a/twml/libtwml/src/ops/hashed_data_record.cpp b/twml/libtwml/src/ops/hashed_data_record.cpp deleted file mode 100644 index ba094c3d9..000000000 --- a/twml/libtwml/src/ops/hashed_data_record.cpp +++ /dev/null @@ -1,520 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -#include - -REGISTER_OP("DecodeAndHashDataRecord") -.Attr("InputType: {uint8, string}") -.Input("input_bytes: InputType") -.Attr("keep_features: list(int)") -.Attr("keep_codes: list(int)") -.Attr("label_features: list(int)") -.Attr("weight_features: list(int) = []") -.Attr("decode_mode: int = 0") -.Output("hashed_data_record_handle: resource") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that creates a handle for the hashed data record. - -Attr - keep_features: a list of int ids to keep. - keep_codes: their corresponding code. - label_features: list of feature ids representing the labels. - weight_features: list of feature ids representing the weights. Defaults to empty list. - decode_mode: integer, indicates which decoding method to use. Let a sparse continuous - have a feature_name and a dict of {name: value}. 0 indicates feature_ids are computed - as hash(name). 1 indicates feature_ids are computed as hash(feature_name, name) - shared_name: name used by the resource handle inside the resource manager. - container: name used by the container of the resources. - -Input - input_bytes: Input tensor containing the serialized batch of HashedDataRecords. - -Outputs - hashed_data_record_handle: A resource handle to batch of HashedDataRecords. 
-)doc"); - -template -class DecodeAndHashDataRecord : public OpKernel { - public: - explicit DecodeAndHashDataRecord(OpKernelConstruction* context) - : OpKernel(context) { - std::vector keep_features; - std::vector keep_codes; - - std::vector label_features; - std::vector weight_features; - - OP_REQUIRES_OK(context, context->GetAttr("keep_features", &keep_features)); - OP_REQUIRES_OK(context, context->GetAttr("keep_codes", &keep_codes)); - OP_REQUIRES_OK(context, context->GetAttr("label_features", &label_features)); - OP_REQUIRES_OK(context, context->GetAttr("weight_features", &weight_features)); - OP_REQUIRES_OK(context, context->GetAttr("decode_mode", &m_decode_mode)); - - OP_REQUIRES(context, keep_features.size() == keep_codes.size(), - errors::InvalidArgument("keep keys and values must have same size.")); - -#ifdef USE_DENSE_HASH - m_keep_map.set_empty_key(0); - m_labels_map.set_empty_key(0); - m_weights_map.set_empty_key(0); -#endif // USE_DENSE_HASH - - for (uint64_t i = 0; i < keep_features.size(); i++) { - m_keep_map[keep_features[i]] = keep_codes[i]; - } - - for (uint64_t i = 0; i < label_features.size(); i++) { - m_labels_map[label_features[i]] = i; - } - - for (uint64_t i = 0; i < weight_features.size(); i++) { - m_weights_map[weight_features[i]] = i; - } - } - - private: - twml::Map m_keep_map; - twml::Map m_labels_map; - twml::Map m_weights_map; - int64 m_decode_mode; - - void Compute(OpKernelContext* context) override { - try { - HashedDataRecordResource *resource = nullptr; - OP_REQUIRES_OK(context, makeResourceHandle(context, 0, &resource)); - - // Store the input bytes in the resource so it isnt freed before the resource. - // This is necessary because we are not copying the contents for tensors. - resource->input = context->input(0); - int batch_size = getBatchSize(resource->input); - int num_labels = static_cast(m_labels_map.size()); - int num_weights = static_cast(m_weights_map.size()); - - twml::HashedDataRecordReader reader; - reader.setKeepMap(&m_keep_map); - reader.setLabelsMap(&m_labels_map); - reader.setDecodeMode(m_decode_mode); - - // Do not set weight map if it is empty. This will take a faster path. - if (num_weights != 0) { - reader.setWeightsMap(&m_weights_map); - } - - resource->records.clear(); - resource->records.reserve(batch_size); - - int64 total_size = 0; - - for (int id = 0; id < batch_size; id++) { - const uint8_t *input_bytes = getInputBytes(resource->input, id); - reader.setBuffer(input_bytes); - resource->records.emplace_back(num_labels, num_weights); - resource->records[id].decode(reader); - total_size += static_cast(resource->records[id].totalSize()); - } - - resource->total_size = total_size; - resource->num_labels = num_labels; - resource->num_weights = num_weights; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetIdsFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("ids: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns unhashed ids from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - ids: ids specifies the index of the records[id] in the batch (int64) -)doc"); - -// This Kernel is used for both training and serving once the resource is created. 
-class GetIdsFromHashedDataRecord : public OpKernel { - public: - explicit GetIdsFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - const int64 common_size = static_cast(common.totalSize()); - const int64 total_size = handle->total_size; - TensorShape shape = {total_size}; - - Tensor *ids; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids)); - - int id = 0; - int64 offset = 0; - auto ids_flat = ids->flat(); - for (const auto &record : records) { - // Since common features are added to each input, add the common_size to the current size. - // For training common_size == 0, for serving it can be a non-zero value. - int64 curr_size = static_cast(record.totalSize()) + common_size; - std::fill(ids_flat.data() + offset, ids_flat.data() + offset + curr_size, id); - offset += curr_size; - id++; - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - - -// OutType: Output Tensor Type. FieldType: The storage type used inside HashedDatarecord. -template -class GetOutputFromHashedDataRecord : public OpKernel { - protected: - using Getter = std::function&(const twml::HashedDataRecord &)>; - Getter getter; - - public: - explicit GetOutputFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const auto &common = handle->common; - const int64 total_size = handle->total_size; - TensorShape shape = {total_size}; - - Tensor *output; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output)); - - const auto &common_output = getter(common); - - auto output_data = output->flat().data(); - for (const auto &record : records) { - // This is does not copy anything during training as common_size == 0 - // It will copy the relevant common features coming from a batch prediction request. - output_data = std::copy(common_output.begin(), common_output.end(), output_data); - - // Copy the current record to output. - const auto& rec_output = getter(record); - output_data = std::copy(rec_output.begin(), rec_output.end(), output_data); - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetUKeysFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("ukeys: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns unhashed keys from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - ukeys: unhased keys / raw feature ids from the original request. 
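The two kernels above share one layout convention: for every record, the request-level (common) features are emitted first, followed by that record's own features; during training the common part is empty. A plain-C++ sketch of that concatenation (names and the use of flat int64 vectors are illustrative assumptions):
::
    #include <cstdint>
    #include <vector>

    // Sketch: emit common features before each record's own features, which is
    // the ordering the getters above rely on when computing per-record offsets.
    std::vector<int64_t> concat_with_common(
        const std::vector<int64_t> &common,
        const std::vector<std::vector<int64_t>> &records) {
      std::vector<int64_t> out;
      for (const auto &record : records) {
        out.insert(out.end(), common.begin(), common.end());  // shared features
        out.insert(out.end(), record.begin(), record.end());  // per-record features
      }
      return out;
    }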
-)doc"); - -class GetUKeysFromHashedDataRecord : public GetOutputFromHashedDataRecord { - public: - explicit GetUKeysFromHashedDataRecord(OpKernelConstruction* context) - : GetOutputFromHashedDataRecord(context){ - getter = [](const twml::HashedDataRecord &record) -> const std::vector & { - return record.keys(); - }; - } -}; - -REGISTER_OP("GetKeysFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("keys: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns keys from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - keys: keys after raw feature ids are hashed with values (int64) -)doc"); - -class GetKeysFromHashedDataRecord : public GetOutputFromHashedDataRecord { - public: - explicit GetKeysFromHashedDataRecord(OpKernelConstruction* context) - : GetOutputFromHashedDataRecord(context){ - getter = [](const twml::HashedDataRecord &record) -> const std::vector & { - return record.transformed_keys(); - }; - } -}; - -REGISTER_OP("GetValuesFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns values from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - values: feature values. -)doc"); - -class GetValuesFromHashedDataRecord : public GetOutputFromHashedDataRecord { - public: - explicit GetValuesFromHashedDataRecord(OpKernelConstruction* context) - : GetOutputFromHashedDataRecord(context){ - getter = [](const twml::HashedDataRecord &record) -> const std::vector & { - return record.values(); - }; - } -}; - -REGISTER_OP("GetCodesFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("codes: int64") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns codes from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - codes: deepbird feature code, usually from A,B,C,D ... in the config. -)doc"); - -class GetCodesFromHashedDataRecord : public GetOutputFromHashedDataRecord { - public: - explicit GetCodesFromHashedDataRecord(OpKernelConstruction* context) - : GetOutputFromHashedDataRecord(context){ - getter = [](const twml::HashedDataRecord &record) -> const std::vector & { - return record.codes(); - }; - } -}; - -REGISTER_OP("GetTypesFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("types: int8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns types from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - types: feature types corresponding to BINARY, DISCRETE, etc. 
-)doc"); - -class GetTypesFromHashedDataRecord : public GetOutputFromHashedDataRecord { - public: - explicit GetTypesFromHashedDataRecord(OpKernelConstruction* context) - : GetOutputFromHashedDataRecord(context){ - getter = [](const twml::HashedDataRecord &record) -> const std::vector & { - return record.types(); - }; - } -}; - -REGISTER_OP("GetBatchSizeFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("batch_size: int64") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that returns batch size from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - batch_size: Number of records held in the handle. -)doc"); - -class GetBatchSizeFromHashedDataRecord : public OpKernel { - public: - explicit GetBatchSizeFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - Tensor *output; - OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output)); - output->scalar()() = handle->records.size(); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetTotalSizeFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("total_size: int64") -.SetShapeFn(shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that returns total size from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - total_size: Total number of keys / values in the batch. -)doc"); - -class GetTotalSizeFromHashedDataRecord : public OpKernel { - public: - explicit GetTotalSizeFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - - Tensor *output; - OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output)); - output->scalar()() = handle->total_size; - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetLabelsFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("labels: float") -.Attr("default_label: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns labels from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - labels: A 2D tensor of size [batch_size, num_labels] containing the label values. -)doc"); - -class GetLabelsFromHashedDataRecord : public OpKernel { - private: - float default_label; - - public: - explicit GetLabelsFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("default_label", &default_label)); - } - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const int num_labels = static_cast(handle->num_labels); - TensorShape shape = {static_cast(handle->records.size()), num_labels}; - - Tensor *labels; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &labels)); - - // The default value of label is not present in data record is std::nanf - // For continuous labels, change that to a default_label or label. 
- auto func = [this](float label) -> float { - return std::isnan(label) ? default_label : label; - }; - - auto labels_data = labels->flat().data(); - for (const auto &record : records) { - const auto& rec_labels = record.labels(); - labels_data = std::transform(rec_labels.begin(), rec_labels.end(), labels_data, func); - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_OP("GetWeightsFromHashedDataRecord") -.Input("hashed_data_record_handle: resource") -.Output("weights: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns weights from the hashed data record. -Input - hashed_data_record_handle: Resource handle to DataRecord - -Outputs - weights: A 2D tensor of size [batch_size, num_weights] containing the weight values. -)doc"); - -class GetWeightsFromHashedDataRecord : public OpKernel { - public: - explicit GetWeightsFromHashedDataRecord(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - auto handle = getHandle(context, 0); - const auto &records = handle->records; - const int num_weights = static_cast(handle->num_weights); - TensorShape shape = {static_cast(handle->records.size()), num_weights}; - - Tensor *weights; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &weights)); - - auto weights_data = weights->flat().data(); - for (const auto &record : records) { - const auto& rec_weights = record.weights(); - weights_data = std::copy(rec_weights.begin(), rec_weights.end(), weights_data); - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - - -#define REGISTER_DECODE_AND_HASH(InputType) \ - REGISTER_KERNEL_BUILDER( \ - Name("DecodeAndHashDataRecord") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("InputType"), \ - DecodeAndHashDataRecord); \ - -REGISTER_DECODE_AND_HASH(uint8) -REGISTER_DECODE_AND_HASH(string) - -#define REGISTER_GETTER(FIELD) \ - REGISTER_KERNEL_BUILDER( \ - Name("Get" #FIELD "FromHashedDataRecord") \ - .Device(DEVICE_CPU), \ - Get##FIELD##FromHashedDataRecord); \ - -REGISTER_GETTER(Ids) -REGISTER_GETTER(UKeys) -REGISTER_GETTER(Keys) -REGISTER_GETTER(Values) -REGISTER_GETTER(Codes) -REGISTER_GETTER(Types) -REGISTER_GETTER(BatchSize) -REGISTER_GETTER(TotalSize) -REGISTER_GETTER(Labels) -REGISTER_GETTER(Weights) diff --git a/twml/libtwml/src/ops/hashing_discretizer.cpp b/twml/libtwml/src/ops/hashing_discretizer.cpp deleted file mode 100644 index 634f6db33..000000000 --- a/twml/libtwml/src/ops/hashing_discretizer.cpp +++ /dev/null @@ -1,260 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/work_sharder.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -void ComputeHashingDiscretizer( - OpKernelContext*, - int64_t, - const twml::Map &, - int64_t, - int64_t, - int64_t); - -REGISTER_OP("HashingDiscretizer") -.Attr("T: {float, double}") -.Input("input_ids: int64") -.Input("input_vals: T") -.Input("bin_vals: T") -.Attr("feature_ids: tensor = { dtype: DT_INT64 }") -.Attr("n_bin: int") -.Attr("output_bits: int") -.Attr("cost_per_unit: int") -.Attr("options: int") -.Output("new_keys: int64") -.Output("new_vals: T") -.SetShapeFn( - [](::tensorflow::shape_inference::InferenceContext* c) { - 
c->set_output(0, c->input(0)); - c->set_output(1, c->input(1)); - return Status::OK(); - } -) -.Doc(R"doc( - -This operation discretizes a tensor containing continuous features (if calibrated). - - note - choice of float or double should be consistent among inputs/output - -Input - input_ids(int64): A tensor containing input feature ids (direct from data record). - input_vals(float/double): A tensor containing input values at corresponding feature ids. - - i.e. input_ids[i] <-> input_vals[i] for each i - bin_vals(float/double): A tensor containing the bin boundaries for values of a given feature. - - float or double, matching input_vals - feature_ids(int64 attr): 1D TensorProto of feature IDs seen during calibration - -> hint: look up make_tensor_proto: - proto_init = np.array(values, dtype=np.int64) - tensor_attr = tf.make_tensor_proto(proto_init) - n_bin(int): The number of bin boundary values per feature - -> hence, n_bin + 1 buckets for each feature - output_bits(int): The maximum number of bits to use for the output IDs. - cost_per_unit(int): An estimate of the number of CPU cycles (or nanoseconds - if not CPU-bound) to complete a unit of work. Overestimating creates too - many shards and CPU time will be dominated by per-shard overhead, such as - Context creation. Underestimating may not fully make use of the specified - parallelism. - options(int): selects behavior of the op. - 0x00 in bits{1:0} for std::lower_bound bucket search. - 0x01 in bits{1:0} for linear bucket search - 0x02 in bits{1:0} for std::upper_bound bucket search - 0x00 in bits{4:2} for integer_multiplicative_hashing - 0x01 in bits{4:2} for integer64_multiplicative_hashing - higher bits/other values are reserved for future extensions - -Outputs - new_keys(int64): The discretized feature ids with same shape and size as keys. - new_vals(float or double): The discretized values with the same shape and size as vals. - -Operation - Note that the discretization operation maps observation vectors to higher dimensional - observation vectors. Here, we describe this mapping. - - Let a calibrated feature observation be given by (F,x), where F is the ID of the - feature, and x is some real value (i.e., continuous feature). This kind of - representation is useful for the representation of sparse vectors, where there - are many zeros. - - For example, for a dense feature vector [1.2, 2.4, 3.6], we might have - (0, 1.2) (1, 2.4) and (2, 3.6), with feature IDs indicating the 0th, 1st, and 2nd - elements of the vector. - - The discretizer performs the following operation: - (F,x) -> (map(x|F),1). - Hence, we have that map(x|F) is a new feature ID, and the value observed for that - feature is 1. We might read map(x|F) as 'the map of x for feature F'. - - For each feature F, we associate a (discrete, finite) set of new feature IDs, newIDs(F). - We will then have that map(x|F) is in the set newIDs(F) for any value of x. Each - set member of newIDs(F) is associated with a 'bin', as defined by the bin - boundaries given in the bin_vals input array. For any two different feature IDs F - and G, we would ideally have that INTERSECT(newIDs(F),newIDs(G)) is the empty set. - However, this is not guaranteed for this discretizer. - - In the case of this hashing discretizer, map(x|F) can actually be written as follows: - let bucket = bucket(x|F) be the bucket index for x, according to the - calibration on F. (This is an integer value in [0,n_bin], inclusive) - F is an integer ID. Here, we have that map(x|F) = hash_fn(F,bucket).
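  As a rough illustration of this mapping, here is a minimal Python sketch (not the libtwml implementation; hash_fn stands in for the multiplicative hash selected via the options attr, and masking down to output_bits is an assumption about how the output ID range is bounded):

    import bisect

    def hashing_discretize(feature_id, value, bin_boundaries, output_bits, hash_fn):
        # bin_boundaries: the n_bin calibrated bin boundaries for this feature_id
        bucket = bisect.bisect_left(bin_boundaries, value)   # integer in [0, n_bin], like std::lower_bound
        new_key = hash_fn(feature_id, bucket) & ((1 << output_bits) - 1)  # assumed truncation to output_bits
        return new_key, 1.0                                  # (map(x|F), 1)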
This has - the desirable property that the new ID depends only on the calibration data - supplied for feature F, and not on any other features in the dataset (e.g., - number of other features present in the calibration data, or order of features - in the dataset). Note that PercentileDiscretizer does NOT have this property. - This comes at the expense of the possibility of output ID collisions, which - we try to minimize through the design of hash_fn. - - Example - consider input vector with a single element, i.e. [x]. - Let's Discretize to one of 2 values, as follows: - Let F=0 for the ID of the single feature in the vector. - Let the bin boundary of feature F=0 be BNDRY(F) = BNDRY(0) since F=0 - bucket = bucket(x|F=0) = 0 if x<=BNDRY(0) else 1 - Let map(x|F) = hash_fn(F=0,bucket=0) if x<=BNDRY(0) else hash_fn(F=0,bucket=1) - If we had another element y in the vector, i.e. [x, y], then we might additionally - Let F=1 for element y. - Let the bin boundary be BNDRY(F) = BNDRY(1) since F=1 - bucket = bucket(x|F=1) = 0 if x<=BNDRY(1) else 1 - Let map(x|F) = hash_fn(F=1,bucket=0) if x<=BNDRY(1) else hash_fn(F=1,bucket=1) - Note how the construction of map(x|F=1) does not depend on whether map(x|F=0) - was constructed. -)doc"); - -template -class HashingDiscretizer : public OpKernel { - public: - explicit HashingDiscretizer(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, - context->GetAttr("n_bin", &n_bin_)); - OP_REQUIRES(context, - n_bin_ > 0, - errors::InvalidArgument("Must have n_bin_ > 0.")); - - OP_REQUIRES_OK(context, - context->GetAttr("output_bits", &output_bits_)); - OP_REQUIRES(context, - output_bits_ > 0, - errors::InvalidArgument("Must have output_bits_ > 0.")); - - OP_REQUIRES_OK(context, - context->GetAttr("cost_per_unit", &cost_per_unit_)); - OP_REQUIRES(context, - cost_per_unit_ >= 0, - errors::InvalidArgument("Must have cost_per_unit >= 0.")); - - OP_REQUIRES_OK(context, - context->GetAttr("options", &options_)); - - // construct the ID_to_index hash map - Tensor feature_IDs; - - // extract the tensors - OP_REQUIRES_OK(context, - context->GetAttr("feature_ids", &feature_IDs)); - - // for access to the data - // int64_t data type is set in to_layer function of the calibrator objects in Python - auto feature_IDs_flat = feature_IDs.flat(); - - // verify proper dimension constraints - OP_REQUIRES(context, - feature_IDs.shape().dims() == 1, - errors::InvalidArgument("feature_ids must be 1D.")); - - // reserve space in the hash map and fill in the values - int64_t num_features = feature_IDs.shape().dim_size(0); -#ifdef USE_DENSE_HASH - ID_to_index_.set_empty_key(0); - ID_to_index_.resize(num_features); -#else - ID_to_index_.reserve(num_features); -#endif // USE_DENSE_HASH - for (int64_t i = 0 ; i < num_features ; i++) { - ID_to_index_[feature_IDs_flat(i)] = i; - } - } - - void Compute(OpKernelContext* context) override { - ComputeHashingDiscretizer( - context, - output_bits_, - ID_to_index_, - n_bin_, - cost_per_unit_, - options_); - } - - private: - twml::Map ID_to_index_; - int n_bin_; - int output_bits_; - int cost_per_unit_; - int options_; -}; - -#define REGISTER(Type) \ - REGISTER_KERNEL_BUILDER( \ - Name("HashingDiscretizer") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - HashingDiscretizer); \ - -REGISTER(float); -REGISTER(double); - -void ComputeHashingDiscretizer( - OpKernelContext* context, - int64_t output_bits, - const twml::Map &ID_to_index, - int64_t n_bin, - int64_t cost_per_unit, - int64_t options) { - const Tensor& keys = 
context->input(0); - const Tensor& vals = context->input(1); - const Tensor& bin_vals = context->input(2); - - const int64 output_size = keys.dim_size(0); - - TensorShape output_shape; - OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(&output_size, 1, &output_shape)); - - Tensor* new_keys = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &new_keys)); - Tensor* new_vals = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, &new_vals)); - - try { - twml::Tensor out_keys_ = TFTensor_to_twml_tensor(*new_keys); - twml::Tensor out_vals_ = TFTensor_to_twml_tensor(*new_vals); - - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - const twml::Tensor in_vals_ = TFTensor_to_twml_tensor(vals); - const twml::Tensor bin_vals_ = TFTensor_to_twml_tensor(bin_vals); - - // retrieve the thread pool from the op context - auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - - // Definition of the computation thread - auto task = [&](int64 start, int64 limit) { - twml::hashDiscretizerInfer(out_keys_, out_vals_, - in_keys_, in_vals_, - n_bin, - bin_vals_, - output_bits, - ID_to_index, - start, limit, - options); - }; - - // let Tensorflow split up the work as it sees fit - Shard(worker_threads.num_threads, - worker_threads.workers, - output_size, - static_cast(cost_per_unit), - task); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } -} - diff --git a/twml/libtwml/src/ops/hashmap.cpp b/twml/libtwml/src/ops/hashmap.cpp deleted file mode 100644 index ce11ff81d..000000000 --- a/twml/libtwml/src/ops/hashmap.cpp +++ /dev/null @@ -1,84 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include - -#include - -using namespace tensorflow; - -REGISTER_OP("Hashmap") -.Input("keys: int64") -.Input("hash_keys: int64") -.Input("hash_values: int64") -.Output("values: int64") -.Output("mask: int8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // TODO: check if the sizes are different in the input - c->set_output(0, c->input(0)); - c->set_output(1, c->input(0)); - return Status::OK(); - }); - - -class Hashmap : public OpKernel { - private: - twml::HashMap hmap; - std::once_flag flag; - - public: - explicit Hashmap(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - try { - // Quick hack - const Tensor& keys = context->input(0); - - std::call_once(this->flag, [this, context](){ - const Tensor& hash_keys = context->input(1); - const Tensor& hash_values = context->input(2); - const auto hash_keys_flat = hash_keys.flat(); - const auto hash_values_flat = hash_values.flat(); - const int64 N = hash_keys_flat.size(); - - for (int64 i = 0; i < N; i++) { - hmap.insert(hash_keys_flat(i), hash_values_flat(i)); - } - }); - - Tensor* values = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, keys.shape(), - &values)); - - Tensor* mask = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, keys.shape(), - &mask)); - - // copy the values without sharing a storage - values->flat() = keys.flat(); - - auto keys_flat = keys.flat(); - auto values_flat = values->flat(); - auto mask_flat = mask->flat(); - - // TODO: use twml tensor - const int64 N = keys_flat.size(); - for (int64 i = 0; i < N; i++) { - // values_flat(i), keys_flat(i) return references to tensorflow::int64. 
- // Using them in hmap.get() was causing issues because of automatic casting. - int64_t val = values_flat(i); - int64_t key = keys_flat(i); - mask_flat(i) = hmap.get(val, key); - values_flat(i) = val; - } - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("Hashmap") - .Device(DEVICE_CPU), - Hashmap); diff --git a/twml/libtwml/src/ops/isotonic_calibration.cpp b/twml/libtwml/src/ops/isotonic_calibration.cpp deleted file mode 100644 index 10a8c22dc..000000000 --- a/twml/libtwml/src/ops/isotonic_calibration.cpp +++ /dev/null @@ -1,81 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("IsotonicCalibration") -.Attr("T: {float, double}") -.Input("input: T") -.Input("xs: T") -.Input("ys: T") -.Output("output: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // output shape should be the same as input shape. - c->set_output(0, c->input(0)); - return Status::OK(); -}).Doc(R"doc( - -This operation calibrates probabilities by fitting to a piece-wise non-decreasing function. - -Input - input: A tensor containing uncalibrated probabilities. - xs: A tensor containing the boundaries of the bins. - ys: A tensor containing calibrated values for the corresponding bins. - -Expected Sizes: - input: [batch_size, num_labels]. - xs, ys: [num_labels, num_bins]. - -Expected Types: - input: float or double. - xs, ys: same as input. - -Outputs - output: A tensor containing calibrated probabilities with same shape and size as input. - -)doc"); - -template -class IsotonicCalibration : public OpKernel { - public: - explicit IsotonicCalibration(OpKernelConstruction* context) - : OpKernel(context) {} - - - void Compute(OpKernelContext* context) override { - const Tensor& input = context->input(0); - const Tensor& xs = context->input(1); - const Tensor& ys = context->input(2); - - Tensor* output = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output(0, input.shape(), &output)); - - try { - const twml::Tensor twml_input = TFTensor_to_twml_tensor(input); - const twml::Tensor twml_xs = TFTensor_to_twml_tensor(xs); - const twml::Tensor twml_ys = TFTensor_to_twml_tensor(ys); - twml::Tensor twml_output = TFTensor_to_twml_tensor(*output); - - twml::linearInterpolation(twml_output, twml_input, twml_xs, twml_ys); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } - } -}; - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("IsotonicCalibration") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - IsotonicCalibration); \ - -REGISTER(float); -REGISTER(double); diff --git a/twml/libtwml/src/ops/num_intra_op_threads.cpp b/twml/libtwml/src/ops/num_intra_op_threads.cpp deleted file mode 100644 index 7e5ef0cbf..000000000 --- a/twml/libtwml/src/ops/num_intra_op_threads.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/common_shape_fns.h" - -using namespace tensorflow; - -REGISTER_OP("NumIntraOpThreads") -.Input("x: float32") -.Output("num_intra_op_threads: int32") -.SetShapeFn(tensorflow::shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that returns the
number of threads in the intra_op_parallelism pool -This is not part of the Tensorflow API as of the date of writing this doc. Hence, -a tensorflow operation is the best resort. -Input - x: Dummy placeholder so that constant folding is not done by TF GraphOptimizer. - Please refer https://github.com/tensorflow/tensorflow/issues/22546 for more - details. -Output - num_intra_op_threads: A scalar tensor corresponding to the number of threads in - the intra_op_parallelism pool -)doc"); - -class NumIntraOpThreads : public OpKernel { - public: - explicit NumIntraOpThreads(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - int num_intra_op_threads = context->device()->tensorflow_cpu_worker_threads()->num_threads; - Tensor* output_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output_tensor)); - auto output_flat = output_tensor->flat(); - output_flat(0) = num_intra_op_threads; - } -}; - -REGISTER_KERNEL_BUILDER(Name("NumIntraOpThreads").Device(DEVICE_CPU), NumIntraOpThreads); diff --git a/twml/libtwml/src/ops/par_add.cpp b/twml/libtwml/src/ops/par_add.cpp deleted file mode 100644 index c03c1ad89..000000000 --- a/twml/libtwml/src/ops/par_add.cpp +++ /dev/null @@ -1,75 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/work_sharder.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/logging.h" -#include - -#include - -using namespace tensorflow; - -REGISTER_OP("ParAdd") - .Input("input_a: float") - .Input("input_b: float") - .Output("a_plus_b: float") - .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }); - - -class ParAddOp : public OpKernel { - public: - explicit ParAddOp(OpKernelConstruction* context) : OpKernel(context) { - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor0 = context->input(0); - auto input_flat0 = input_tensor0.flat(); - const Tensor& input_tensor1 = context->input(1); - auto input_flat1 = input_tensor1.flat(); - - OP_REQUIRES(context, input_tensor0.shape() == input_tensor1.shape(), - errors::InvalidArgument("Input tensors must be identical shape.")); - - // Create an output tensor - Tensor* output_tensor = NULL; - OP_REQUIRES_OK(context, - context->allocate_output(0, - input_tensor0.shape(), - &output_tensor)); - auto output_flat = output_tensor->flat(); - - // PARALLEL ADD - const int N = input_flat0.size(); - - // retrieve the thread pool from the op context - auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - - // Definition of the computation thread - auto task = [=, &input_flat0, &input_flat1, &output_flat](int64 start, int64 limit) { - for (; start < limit; ++start) { - output_flat(start) = input_flat0(start) + input_flat1(start); - } - }; - - // this is a heuristic. 
a high number is likely to be sharded into smaller pieces - int64 cost_per_unit = 1; - - // let Tensorflow split up the work as it sees fit - Shard(worker_threads.num_threads, - worker_threads.workers, - N, - cost_per_unit, - task); - } -}; - -REGISTER_KERNEL_BUILDER(Name("ParAdd").Device(DEVICE_CPU), ParAddOp); - - diff --git a/twml/libtwml/src/ops/partition_sparse_tensor.cpp b/twml/libtwml/src/ops/partition_sparse_tensor.cpp deleted file mode 100644 index 4a210ba7f..000000000 --- a/twml/libtwml/src/ops/partition_sparse_tensor.cpp +++ /dev/null @@ -1,125 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" - -using namespace tensorflow; - -REGISTER_OP("PartitionSparseTensorMod") -.Attr("T: {float, double}") -.Input("indices: int64") -.Input("values: T") -.Output("result: output_types") -.Attr("num_partitions: int") -.Attr("output_types: list({int64, float, double})") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); -}).Doc(R"doc( - -A tensorflow OP that partitions an input batch represented as a sparse tensor -(indices are [ids, keys]) into separate sparse tensors to more optimally place -sparse computations in distributed training. - -Inputs - indices: Indices from sparse tensor ([ids, keys] from the batch). - values: Batch values from the original features dict. - -Attr - num_partitions: Number of partitions to generate. - output_types: A list of types for the output tensors like - [tf.int64, tf.float32, tf.int64, tf.float32, ...] - The length must be 2 * num_partitions (see Outputs below) - -Outputs - List of dense tensors containing for each partition: - - partitioned indices tensor ([ids, keys] from partitioned batch) - - partitioned values tensor - The list length is 2 * num_partitions. Example: - [ [ids_1, keys_1], values_1, [ids_2, keys_2], values_2, ...
] -)doc"); - -template -class PartitionSparseTensorMod : public OpKernel { - private: - int64 num_partitions; - - public: - explicit PartitionSparseTensorMod(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("num_partitions", &num_partitions)); - OP_REQUIRES(context, num_partitions > 0, - errors::InvalidArgument("Number of partitions must be positive")); - } - - void Compute(OpKernelContext* context) override { - // grab input tensors - const Tensor& indices_tensor = context->input(0); // (ids, keys) - const Tensor& values_tensor = context->input(1); - - // check sizes - int64 num_keys = indices_tensor.shape().dim_size(0); - OP_REQUIRES(context, indices_tensor.dims() == 2, - errors::InvalidArgument("Indices tensor must be 2D [ids, keys]")); - OP_REQUIRES(context, indices_tensor.shape().dim_size(1) == 2, - errors::InvalidArgument("Indices tensor must have 2 cols [ids, keys]")); - OP_REQUIRES(context, values_tensor.shape().dim_size(0) == num_keys, - errors::InvalidArgument("Number of values must match number of keys")); - - // grab input vectors - auto indices = indices_tensor.flat(); - auto values = values_tensor.flat(); - - // count the number of features that fall in each partition - std::vector partition_counts(num_partitions); - - for (int i = 0; i < num_keys; i++) { - int64 key = indices(2 * i + 1); - int64 partition_id = key % num_partitions; - partition_counts[partition_id]++; - } - - // allocate outputs for each partition and keep references - std::vector output_indices_partitions; - std::vector output_values_partitions; - output_indices_partitions.reserve(num_partitions); - output_values_partitions.reserve(num_partitions); - - for (int i = 0; i < num_partitions; i++) { - Tensor *output_indices = nullptr, *output_values = nullptr; - TensorShape shape_indices = TensorShape({partition_counts[i], 2}); - TensorShape shape_values = TensorShape({partition_counts[i]}); - - OP_REQUIRES_OK(context, context->allocate_output(2 * i, shape_indices, &output_indices)); - OP_REQUIRES_OK(context, context->allocate_output(2 * i + 1, shape_values, &output_values)); - - output_indices_partitions.push_back(output_indices->flat().data()); - output_values_partitions.push_back(output_values->flat().data()); - } - - // assign a partition id to each feature - // populate tensors for each partition - std::vector partition_indices(num_partitions); - - for (int i = 0; i < num_keys; i++) { - int64 key = indices(2 * i + 1); - int64 pid = key % num_partitions; // partition id - int64 idx = partition_indices[pid]++; - - output_indices_partitions[pid][2 * idx] = indices(2 * i); - output_indices_partitions[pid][2 * idx + 1] = key / num_partitions; - output_values_partitions[pid][idx] = values(i); - } - } -}; - -#define REGISTER(Type) \ - \ - REGISTER_KERNEL_BUILDER( \ - Name("PartitionSparseTensorMod") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - PartitionSparseTensorMod); \ - -REGISTER(float); -REGISTER(double); diff --git a/twml/libtwml/src/ops/percentile_discretizer_v2.cpp b/twml/libtwml/src/ops/percentile_discretizer_v2.cpp deleted file mode 100644 index 2a0dac7d8..000000000 --- a/twml/libtwml/src/ops/percentile_discretizer_v2.cpp +++ /dev/null @@ -1,241 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/work_sharder.h" - -#include -#include "tensorflow_utils.h" - - -using namespace tensorflow; - -void 
CombinedComputeDiscretizers( - OpKernelContext*, - int64_t, - const twml::Map&, - int64_t); - -REGISTER_OP("PercentileDiscretizerV2") -.Attr("T: {float, double}") -.Input("input_ids: int64") -.Input("input_vals: T") -.Input("bin_ids: int64") -.Input("bin_vals: T") -.Input("feature_offsets: int64") -.Input("start_compute: int64") -.Input("end_compute: int64") -.Attr("output_bits: int") -.Attr("feature_ids: tensor = { dtype: DT_INT64 }") -.Attr("feature_indices: tensor = { dtype: DT_INT64 }") -.Attr("cost_per_unit: int") -.Output("new_keys: int64") -.Output("new_vals: T") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - // TODO: check sizes - c->set_output(0, c->input(0)); - c->set_output(1, c->input(0)); - return Status::OK(); -}).Doc(R"doc( - -This operation discretizes a tensor containing continuous features (if calibrated). - - note - choice of float or double should be consistent among inputs/output - -Input - input_ids(int64): A tensor containing input feature ids (direct from data record). - input_vals: A tensor containing input values at corresponding feature ids. - - i.e. input_ids[i] <-> input_vals[i] for each i - - float or double - bin_ids(int64): A tensor containing the discretized feature id for each bin. - bin_vals: A tensor containing the bin boundaries for values of a given feature. - - float or double - feature_offsets(int64): Specifies the starting location of bins for a given feature id. - start_compute(int64 scalar tensor): which index to start the computation at - end_compute(int64 scalar tensor): which index to end the computation right before - -> for example, (start_compute,end_compute)=(0,10) would compute on 0 thru 9 - output_bits(int): The maximum number of bits to use for the output IDs. - -> 2**out_bits must be greater than bin_ids.size - feature_ids(int64): 1D TensorProto of feature IDs seen during calibration - feature_indices(int64): 1D TensorProto of feature indices corresponding with feature_IDs - -> hint: look up make_tensor_proto: - proto_init = np.array(values, dtype=np.int64) - tensor_attr = tf.make_tensor_proto(proto_init) - cost_per_unit(int): An estimate of the number of CPU cycles (or nanoseconds - if not CPU-bound) to complete a unit of work. Overestimating creates too - many shards and CPU time will be dominated by per-shard overhead, such as - Context creation. Underestimating may not fully make use of the specified - parallelism. - -Outputs - new_keys(int64): The discretized feature ids with same shape and size as keys. - new_vals(float or double): The discretized values with the same shape and size as vals. - -Operation - Note that the discretization operation maps observation vectors to higher dimensional - observation vectors. Here, we describe this mapping. - - Let a calibrated feature observation be given by (F,x), where F is the ID of the - feature, and x is some real value (i.e., continuous feature). This kind of - representation is useful for the representation of sparse vectors, where there - are many zeros. - - For example, for a dense feature vector [1.2, 2.4, 3.6], we might have - (0, 1.2) (1, 2.4) and (2, 3.6), with feature IDs indicating the 0th, 1st, and 2nd - elements of the vector. - - The discretizer performs the following operation: - (F,x) -> (map(x|F),1). - Hence, we have that map(x|F) is a new feature ID, and the value observed for that - feature is 1. We might read map(x|F) as 'the map of x for feature F'. - - For each feature F, we associate a (discrete, finite) set of new feature IDs, newIDs(F).
- We will then have that F~(x) is in the set newIDs(F) for any value of x. Each set member - of newIDs(F) is associated with a 'bin', as defined by the bin boundaries given in - the bin_vals input array. For any two different feature IDs F and G, we have that - INTERSECT(newIDs(F),newIDs(G)) is the empty set - - Example - consider input vector with a single element, i.e. [x]. - Let's Discretize to one of 2 values, as follows: - Let F=0 for the ID of the single feature in the vector. - Let the bin boundary of feature F=0 be BNDRY(F) = BNDRY(0) since F=0 - Let newIDs(F) = newIDs(0) = {0,1} - Let map(x|F) = map(x|0) = 0 if x<=BNDRY else 1 - If we had another element y in the vector, i.e. [x, y], then we might additionally - Let F=1 for element y. - Let the bin boundary be BNDRY(F) = BNDRY(1) since F=1 - Let newIDs(F) = newIDs(1) = {2,3} (so as to have empty intersect with newIDs(0)) - Let map(x|F) = map(x|1) = 2 if x<=BNDRY else 3 - Consider vector observation [-0.1, 0.2]. We then represent this as [(0, -0.1), (1, 0.2)] - Let BNDRY(0) = BNDRY(1) = 0. When we discretize the vector, we get: - (0, -0.1) -> (map(-0.1|0), 1) = (0, 1) - (1, 0.2) -> (map( 0.2|1), 1) = (3, 1) - Our output vector is then represented sparsely as [(0, 1), (3, 1)], and the dense - representation of this could be [1, 0, 0, 1] - -)doc"); - -template -class PercentileDiscretizerV2 : public OpKernel { - public: - explicit PercentileDiscretizerV2(OpKernelConstruction* context) : OpKernel(context) { - // get the number of output bits - // for use with features that have not been calibrated - OP_REQUIRES_OK(context, - context->GetAttr("output_bits", &output_bits_)); - OP_REQUIRES_OK(context, - context->GetAttr("cost_per_unit", &cost_per_unit_)); - OP_REQUIRES(context, cost_per_unit_ >= 0, - errors::InvalidArgument("Must have cost_per_unit >= 0.")); - - // construct the ID_to_index hash map - Tensor feature_IDs; - Tensor feature_indices; - - // extract the tensors - OP_REQUIRES_OK(context, - context->GetAttr("feature_ids", &feature_IDs)); - OP_REQUIRES_OK(context, - context->GetAttr("feature_indices", &feature_indices)); - - // for access to the data - // int64_t data type is set in to_layer function of the calibrator objects in Python - auto feature_IDs_flat = feature_IDs.flat(); - auto feature_indices_flat = feature_indices.flat(); - - // verify proper dimension constraints - OP_REQUIRES(context, feature_IDs.shape() == feature_indices.shape(), - errors::InvalidArgument("feature_ids and feature_indices must be identical shape.")); - OP_REQUIRES(context, feature_IDs.shape().dims() == 1, - errors::InvalidArgument("feature_ids and feature_indices must be 1D.")); - - // reserve space in the hash map and fill in the values - int num_features = feature_IDs.shape().dim_size(0); - -#ifdef USE_DENSE_HASH - ID_to_index_.set_empty_key(0); - ID_to_index_.resize(num_features); -#else - ID_to_index_.reserve(num_features); -#endif // USE_DENSE_HASH - for (int i = 0 ; i < num_features ; i++) { - ID_to_index_[feature_IDs_flat(i)] = feature_indices_flat(i); - } - } - - void Compute(OpKernelContext* context) override { - CombinedComputeDiscretizers( - context, - output_bits_, - ID_to_index_, - cost_per_unit_); - } - - private: - twml::Map ID_to_index_; - int output_bits_; - int cost_per_unit_; -}; - -#define REGISTER(Type) \ - REGISTER_KERNEL_BUILDER( \ - Name("PercentileDiscretizerV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - PercentileDiscretizerV2); \ - -REGISTER(float); -REGISTER(double); - -void CombinedComputeDiscretizers( - 
OpKernelContext* context, - int64_t output_bits, - const twml::Map &ID_to_index, - int64_t cost_per_unit) { - const Tensor& keys = context->input(0); - const Tensor& vals = context->input(1); - const Tensor& bin_ids = context->input(2); - const Tensor& bin_vals = context->input(3); - const Tensor& feature_offsets = context->input(4); - - uint64 full_size = keys.dim_size(0); - const int total_size = static_cast(full_size); - TensorShape output_shape = {total_size}; - - Tensor* new_keys = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &new_keys)); - Tensor* new_vals = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, &new_vals)); - - try { - twml::Tensor out_keys_ = TFTensor_to_twml_tensor(*new_keys); - twml::Tensor out_vals_ = TFTensor_to_twml_tensor(*new_vals); - - const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys); - const twml::Tensor in_vals_ = TFTensor_to_twml_tensor(vals); - const twml::Tensor bin_ids_ = TFTensor_to_twml_tensor(bin_ids); - const twml::Tensor bin_vals_ = TFTensor_to_twml_tensor(bin_vals); - const twml::Tensor feature_offsets_ = TFTensor_to_twml_tensor(feature_offsets); - - // retrieve the thread pool from the op context - auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - - // Definition of the computation thread - auto task = [&](int64 start, int64 limit) { - twml::discretizerInfer(out_keys_, out_vals_, - in_keys_, in_vals_, - bin_ids_, bin_vals_, - feature_offsets_, output_bits, - ID_to_index, - start, limit, - start); - }; - - // let Tensorflow split up the work as it sees fit - Shard(worker_threads.num_threads, - worker_threads.workers, - full_size, - static_cast(cost_per_unit), - task); - } catch (const std::exception &e) { - context->CtxFailureWithWarning(errors::InvalidArgument(e.what())); - } -} diff --git a/twml/libtwml/src/ops/resource_utils.h b/twml/libtwml/src/ops/resource_utils.h deleted file mode 100644 index a41fe6845..000000000 --- a/twml/libtwml/src/ops/resource_utils.h +++ /dev/null @@ -1,126 +0,0 @@ -#pragma once - -#include - -#include -#include -#include - -// Add these to make gcc ignore the warnings from tensorflow. -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-compare" - -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/resource_op_kernel.h" - -#pragma GCC diagnostic pop - -#include -#include - -template -void unrefHandle(T *handle) { - handle->Unref(); -} - -template -using unique_handle = std::unique_ptr >; - -// as std::type_index is not abi compatible, we bypass the hash_code checks. 
-// https://github.com/tensorflow/tensorflow/commit/15275d3a14c77e2244ae1155f93243256f08e3ed -#ifdef __APPLE__ -template -Status CreateTwmlResource(OpKernelContext* ctx, const ResourceHandle& p, T* value) { - return ctx->resource_manager()->Create(p.container(), p.name(), value); -} - -template -Status LookupTwmlResource(OpKernelContext* ctx, const ResourceHandle& p, - T** value) { - return ctx->resource_manager()->Lookup(p.container(), p.name(), value); -} -#endif // __APPLE__ - -template -unique_handle getHandle(tensorflow::OpKernelContext* context, int input_idx) { - using namespace tensorflow; - T *ptr = nullptr; -#ifdef __APPLE__ - auto s = LookupTwmlResource(context, HandleFromInput(context, input_idx), &ptr); -#else - auto s = LookupResource(context, HandleFromInput(context, input_idx), &ptr); -#endif // __APPLE__ - - if (!s.ok()) { - throw std::runtime_error("Failed to get resource handle"); - } - return unique_handle(ptr, unrefHandle); -} - -template -const uint8_t *getInputBytes(const Tensor &input, int id) { - return reinterpret_cast(input.flat().data()); -} - -template<> -inline const uint8_t *getInputBytes(const Tensor &input, int id) { - return reinterpret_cast(input.flat()(id).c_str()); -} - -template -const int getBatchSize(const Tensor &input) { - return 1; -} - -template<> -inline const int getBatchSize(const Tensor &input) { - return static_cast(input.NumElements()); -} - -class DataRecordResource : public ResourceBase { - public: - Tensor input; - int64 num_labels; - int64 num_weights; - twml::DataRecord common; - std::vector records; - twml::Map *keep_map; - string DebugString() const override { return "DataRecords resource"; } -}; - -// A thin layer around batch of HashedDataRecords -class HashedDataRecordResource : public ResourceBase { - public: - Tensor input; - int64 total_size; - int64 num_labels; - int64 num_weights; - twml::HashedDataRecord common; - std::vector records; - string DebugString() const override { return "HashedDataRecord Resource"; } -}; - -#define TF_CHECK_STATUS(fn) do { \ - Status s = fn; \ - if (!s.ok()) return s; \ - } while (0) - -template -Status makeResourceHandle(OpKernelContext* context, int out_idx, ResourceType **resource_) { - static std::atomic id; - Tensor* handle_tensor; - TF_CHECK_STATUS(context->allocate_output(out_idx, TensorShape({}), &handle_tensor)); - - ResourceType *resource = new ResourceType(); - const auto resource_name = typeid(ResourceType).name() + std::to_string(id++); - ResourceHandle handle = MakePerStepResourceHandle(context, resource_name); -#ifdef __APPLE__ - TF_CHECK_STATUS(CreateTwmlResource(context, handle, resource)); -#else - TF_CHECK_STATUS(CreateResource(context, handle, resource)); -#endif // __APPLE__ - handle_tensor->scalar()() = handle; - - *resource_ = resource; - return Status::OK(); -} diff --git a/twml/libtwml/src/ops/scripts/get_inc.py b/twml/libtwml/src/ops/scripts/get_inc.py deleted file mode 100644 index c50edfa90..000000000 --- a/twml/libtwml/src/ops/scripts/get_inc.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Gets the path of headers for the current Tensorflow library""" - -import tensorflow.compat.v1 as tf - -print(tf.sysconfig.get_include(), end='') diff --git a/twml/libtwml/src/ops/scripts/get_inc.sh b/twml/libtwml/src/ops/scripts/get_inc.sh deleted file mode 100755 index 5cb064338..000000000 --- a/twml/libtwml/src/ops/scripts/get_inc.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -PEX_INTERPRETER=1 "$PYTHON_ENV" "$LIBTWML_HOME"/src/ops/scripts/get_inc.py diff --git 
a/twml/libtwml/src/ops/scripts/get_lib.py b/twml/libtwml/src/ops/scripts/get_lib.py deleted file mode 100644 index 7150c48b7..000000000 --- a/twml/libtwml/src/ops/scripts/get_lib.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Gets the path of the current Tensorflow library""" - -import tensorflow.compat.v1 as tf - -print(tf.sysconfig.get_lib(), end='') diff --git a/twml/libtwml/src/ops/scripts/get_lib.sh b/twml/libtwml/src/ops/scripts/get_lib.sh deleted file mode 100755 index 1b9d802b6..000000000 --- a/twml/libtwml/src/ops/scripts/get_lib.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -PEX_INTERPRETER=1 "$PYTHON_ENV" "$LIBTWML_HOME"/src/ops/scripts/get_lib.py diff --git a/twml/libtwml/src/ops/scripts/symlink.sh b/twml/libtwml/src/ops/scripts/symlink.sh deleted file mode 100755 index 2ddb76371..000000000 --- a/twml/libtwml/src/ops/scripts/symlink.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh - -#Needed to create a "nice" symlink to _pywrap_tensorflow_internal.so so -#that cmake can link with the library properly. - -#This library is only needed for streaming datasets and is linked with -#libtwml_tf_data.so which will not be used at runtime. - -TF_PYTHON_LIB_DIR=$(PEX_INTERPRETER=1 "$PYTHON_ENV" "$TWML_HOME"/backends/tensorflow/src/scripts/get_lib.py) -TF_INTERNAL_LIB=$TWML_HOME/backends/tensorflow/twml/lib/libtensorflow_internal.so -rm -f "$TF_INTERNAL_LIB" -ln -s "$TF_PYTHON_LIB_DIR"/python/_pywrap_tensorflow_internal.so "$TF_INTERNAL_LIB" diff --git a/twml/libtwml/src/ops/sleep_op.cpp b/twml/libtwml/src/ops/sleep_op.cpp deleted file mode 100644 index dd9a1834c..000000000 --- a/twml/libtwml/src/ops/sleep_op.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include - -using namespace tensorflow; - -REGISTER_OP("Sleep") -.Input("num_milliseconds: int32") -.Output("sleep_time_in_ms: int32") -.SetShapeFn(tensorflow::shape_inference::ScalarShape) -.Doc(R"doc( -A tensorflow OP that sleeps for a specified number of milliseconds. -This is a proxy to determine the size of the inter_op_parallelism pool. -This is not part of the Tensorflow API as of the date of writing this -doc. Hence, a tensorflow operation is the best resort.
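As a usage sketch only (Python, TF 1.x style; the shared-library path below is an assumption, and the generated wrapper name follows TensorFlow's usual CamelCase-to-snake_case convention): running several Sleep ops in a single session.run call and timing the wall clock gives a rough estimate of the inter_op pool size.

  import time
  import tensorflow.compat.v1 as tf

  ops_lib = tf.load_op_library('libtwml_tf.so')   # assumed path to the compiled custom ops
  num_ops, sleep_ms = 8, 100
  sleeps = [ops_lib.sleep(tf.constant(sleep_ms, tf.int32)) for _ in range(num_ops)]
  with tf.Session() as sess:
      start = time.time()
      sess.run(sleeps)   # the ops can run concurrently, bounded by the inter_op pool
      wall_ms = (time.time() - start) * 1000.0
  # with k inter_op threads, wall_ms is roughly ceil(num_ops / k) * sleep_ms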
-Input - num_milliseconds: A scalar tensor corresponding to the number - of milliseconds the operation should sleep for -Output - sleep_time_in_ms: A scalar tensor corresponding to the - actual number of milliseconds for which the operation slept -)doc"); - -class SleepOp : public OpKernel { - public: - explicit SleepOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - - // Sleep for specified milliseconds - auto start = std::chrono::high_resolution_clock::now(); - std::this_thread::sleep_for(std::chrono::milliseconds(input(0))); - auto end = std::chrono::high_resolution_clock::now(); - std::chrono::duration elapsed = end-start; - - // Set the output tensor - Tensor* output_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output_tensor)); - auto output_flat = output_tensor->flat(); - output_flat(0) = elapsed.count(); - } -}; - -REGISTER_KERNEL_BUILDER(Name("Sleep").Device(DEVICE_CPU), SleepOp); diff --git a/twml/libtwml/src/ops/sparse_normalization.cpp b/twml/libtwml/src/ops/sparse_normalization.cpp deleted file mode 100644 index 9b079429c..000000000 --- a/twml/libtwml/src/ops/sparse_normalization.cpp +++ /dev/null @@ -1,378 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -using namespace tensorflow; - -REGISTER_OP("SparseMaxNorm") -.Attr("epsilon: float") -.Input("max_values: Ref(float)") -.Input("indices: int64") -.Input("values: float") -.Input("is_training: bool") -.Output("updated_max_values: Ref(float)") -.Output("normalized_values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that normalizes a batch of sparse inputs based on the current maximum value. - -Input - max_values: float tensor variable representing the max values seen so far. - indices: int64 tensor representing indices representing a feature. - values: float tensor representing values for the current batch. - is_training: bool tensor specifying if the op should be run in training mode or not. - -Outputs - updated_max_values: max_values updated with the current batch. - normalized_values: Input values normalized by the max value seen so far. - -The pseudo code for normalization can be seen below: - - # During training / inference - for i, idx in enumerate(indices): - updated_max_values[idx] = max(max_values[idx], abs(values[i])) - normalized_values[i] = values[i] / updated_max_values[idx] - -)doc"); - -class SparseMaxNorm : public OpKernel { - private: - float epsilon_; - - public: - explicit SparseMaxNorm(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_)); - } - - void Compute(OpKernelContext *context) override { - // We always return the input ref. 
- context->forward_ref_input_to_ref_output(0, 0); - Tensor max_values_tensor = context->mutable_input(0, false); - - OP_REQUIRES(context, max_values_tensor.IsInitialized(), - errors::FailedPrecondition("Attempting to use uninitialized " - "parameters: ", - requested_input(0))); - - const Tensor &indices_tensor = context->input(1); - const Tensor &values_tensor = context->input(2); - const Tensor &is_training_tensor = context->input(3); - - const auto indices = indices_tensor.flat(); - const auto values = values_tensor.flat(); - const bool is_training = is_training_tensor.scalar()(); - - auto max_values = max_values_tensor.flat(); - Tensor *normalized_values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, values_tensor.shape(), - &normalized_values_tensor)); - - auto normalized_values = normalized_values_tensor->flat(); - - const int64 N = indices.size(); - - for (int64 i = 0; i < N; i++) { - int64 idx = indices(i); - float value = values(i); - float max_value = std::max(max_values(idx), std::abs(value)); - - // Guaranteed to be between [-1, 1]. - normalized_values(i) = value / std::max(max_value, epsilon_); - - if (is_training) { - max_values(idx) = max_value; - } - } - } -}; - -REGISTER_OP("SparseBatchNorm") -.Attr("input_size: int") -.Attr("epsilon: float") -.Input("means: Ref(float)") -.Input("variances: Ref(float)") -.Input("indices: int64") -.Input("values: float") -.Input("is_training: bool") -.Output("updated_means: Ref(float)") -.Output("updated_vars: Ref(float)") -.Output("normalized_values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that performs batch normalization. - -Attr - input_size: Size of the inputs. - epsilon: The minimum value of the variance. - -Input - mean: float tensor variable representing the running mean seen so far. - variances: float tensor variable representing the running variance seen so far. - indices: int64 tensor representing indices representing a feature. - values: float tensor representing values for the current batch. - is_training: bool tensor specifying if the op should be run in training mode or not. - -Outputs - updated_means: mean updated with the current batch. - updated_vars: variances updated with the current batch. - normalized_values: Input values normalized by the max value seen so far. - -The pseudo code for normalization can be seen below: - - if is_training: - means, variances = update_metrics(means, variances, values) - - normalized_values = (values - means) / sqrt(variances + epsilon) - return normalized_values * gamma + beta - -)doc"); - -class SparseBatchNorm : public OpKernel { - private: - std::vector counts_; - std::vector m2s_; - float epsilon_; - - public: - explicit SparseBatchNorm(OpKernelConstruction *context) : OpKernel(context) { - int64 input_size; - OP_REQUIRES_OK(context, context->GetAttr("input_size", &input_size)); - OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_)); - counts_.resize(input_size); - m2s_.resize(input_size); - } - - void Compute(OpKernelContext *context) override { - // We always return the input ref. 
- context->forward_ref_input_to_ref_output(0, 0); - context->forward_ref_input_to_ref_output(1, 1); - - Tensor means_tensor = context->mutable_input(0, true); - Tensor variances_tensor = context->mutable_input(1, true); - - OP_REQUIRES(context, means_tensor.IsInitialized(), - errors::FailedPrecondition("Attempting to use uninitialized " - "parameters: ", - requested_input(0))); - - OP_REQUIRES(context, variances_tensor.IsInitialized(), - errors::FailedPrecondition("Attempting to use uninitialized " - "parameters: ", - requested_input(1))); - - const Tensor &indices_tensor = context->input(2); - const Tensor &values_tensor = context->input(3); - const Tensor &is_training_tensor = context->input(4); - - const auto indices = indices_tensor.flat(); - const auto values = values_tensor.flat(); - const bool is_training = is_training_tensor.scalar()(); - - auto means = means_tensor.flat(); - auto variances = variances_tensor.flat(); - Tensor *normalized_values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(2, values_tensor.shape(), - &normalized_values_tensor)); - - auto normalized_values = normalized_values_tensor->flat(); - const int64 N = indices.size(); - - if (is_training) { - // Accumulate, mean, count, sum of squared differences. - // Reference wiki: - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm - // Reference paper: - // https://www.jstor.org/stable/1266577?seq=1#page_scan_tab_contents - for (int64 i = 0; i < N; i++) { - int64 idx = indices(i); - int64 count = counts_[idx] + 1; - - float value = values(i); - float old_mean = means(idx); - float old_delta = value - old_mean; - float new_mean = old_mean + old_delta / count; - float new_delta = value - new_mean; - - counts_[idx] = count; - m2s_[idx] += new_delta * old_delta; - means(idx) = new_mean; - variances(idx) = m2s_[idx] / count; - } - } - - // Normalize the values - for (int64 i = 0; i < N; i++) { - int64 idx = indices(i); - float stdev = std::sqrt(variances(idx) + epsilon_); - normalized_values(i) = (values(i) - means(idx)) / stdev; - } - } -}; - -REGISTER_OP("SparseMaxNormInference") -.Attr("epsilon: float") -.Input("max_values: float") -.Input("indices: int64") -.Input("values: float") -.Output("normalized_values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that normalizes a batch of sparse inputs based on the current maximum value. -This is the inference OP. - -Input - max_values: float tensor representing the max values seen so far. - indices: int64 tensor representing indices representing a feature. - values: float tensor representing values for the current batch. - -Outputs - normalized_values: Input values normalized by the max value seen so far. 
- -The pseudo code for normalization can be seen below: - - # During inference - for i, idx in enumerate(indices): - updated_max_values[idx] = max(max_values[idx], abs(values[i])) - normalized_values[i] = values[i] / updated_max_values[idx] - -)doc"); - -class SparseMaxNormInference : public OpKernel { - private: - float epsilon_; - - public: - explicit SparseMaxNormInference(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_)); - } - - void Compute(OpKernelContext *context) override { - const Tensor &max_values_tensor = context->input(0); - const Tensor &indices_tensor = context->input(1); - const Tensor &values_tensor = context->input(2); - - const auto max_values = max_values_tensor.flat(); - const auto indices = indices_tensor.flat(); - const auto values = values_tensor.flat(); - - Tensor *normalized_values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, values_tensor.shape(), - &normalized_values_tensor)); - - auto normalized_values = normalized_values_tensor->flat(); - - const int64 N = indices.size(); - - for (int64 i = 0; i < N; i++) { - int64 idx = indices(i); - float value = values(i); - float max_value = std::max(max_values(idx), std::abs(value)); - - // Guaranteed to be between [-1, 1]. - normalized_values(i) = value / std::max(max_value, epsilon_); - } - } -}; - -REGISTER_OP("SparseMaxNormTraining") -.Attr("epsilon: float") -.Input("max_values: float") -.Input("indices: int64") -.Input("values: float") -.Output("updated_max_values: float") -.Output("normalized_values: float") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that normalizes a batch of sparse inputs based on the current maximum value. -This is the training OP. - -Input - max_values: float tensor variable representing the max values seen so far. - indices: int64 tensor representing indices representing a feature. - values: float tensor representing values for the current batch. - -Outputs - updated_max_values: max_values updated with the current batch. - normalized_values: Input values normalized by the max value seen so far. 
- -The pseudo code for normalization can be seen below: - - # During training - for i, idx in enumerate(indices): - updated_max_values[idx] = max(max_values[idx], abs(values[i])) - normalized_values[i] = values[i] / updated_max_values[idx] - -)doc"); - -class SparseMaxNormTraining : public OpKernel { - private: - float epsilon_; - - public: - explicit SparseMaxNormTraining(OpKernelConstruction *context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_)); - } - - void Compute(OpKernelContext *context) override { - const Tensor &max_values_tensor = context->input(0); - const Tensor &indices_tensor = context->input(1); - const Tensor &values_tensor = context->input(2); - - const auto max_values = max_values_tensor.flat(); - const auto indices = indices_tensor.flat(); - const auto values = values_tensor.flat(); - - Tensor *updated_max_values_tensor = nullptr; - Tensor *normalized_values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, max_values_tensor.shape(), - &updated_max_values_tensor)); - OP_REQUIRES_OK(context, context->allocate_output(1, values_tensor.shape(), - &normalized_values_tensor)); - - auto updated_max_values = updated_max_values_tensor->flat(); - auto normalized_values = normalized_values_tensor->flat(); - - const int64 N = indices.size(); - - // This copy is needed because the values of updated_max_values are originally garbage. - // Also note that N is not the same as max_values.size() - std::copy(max_values.data(), max_values.data() + max_values.size(), updated_max_values.data()); - - for (int64 i = 0; i < N; i++) { - int64 idx = indices(i); - float value = values(i); - float updated_max_value = std::max(updated_max_values(idx), std::abs(value)); - // Guaranteed to be between [-1, 1]. - normalized_values(i) = value / std::max(updated_max_value, epsilon_); - // Saving the updated_max_values - updated_max_values(idx) = updated_max_value; - } - } -}; - - - - -REGISTER_KERNEL_BUILDER( - Name("SparseMaxNorm") - .Device(DEVICE_CPU), - SparseMaxNorm); - -REGISTER_KERNEL_BUILDER( - Name("SparseBatchNorm") - .Device(DEVICE_CPU), - SparseBatchNorm); - -REGISTER_KERNEL_BUILDER( - Name("SparseMaxNormInference") - .Device(DEVICE_CPU), - SparseMaxNormInference); - -REGISTER_KERNEL_BUILDER( - Name("SparseMaxNormTraining") - .Device(DEVICE_CPU), - SparseMaxNormTraining); diff --git a/twml/libtwml/src/ops/tensor_record.cpp b/twml/libtwml/src/ops/tensor_record.cpp deleted file mode 100644 index ad044e378..000000000 --- a/twml/libtwml/src/ops/tensor_record.cpp +++ /dev/null @@ -1,692 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -#include -#include "tensorflow_utils.h" -#include "resource_utils.h" - -#include -using std::string; - -REGISTER_OP("GetStringTensorsFromDataRecord") -.Attr("feature_id: int") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("strings: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns string tensors from the data record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - ids: A 1D int64 tensor representing the input index in a given batch. - strings: A 1D string tensor representing the decoded strings from the batch. 
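  For intuition, a small Python sketch of how the two outputs line up (the values are made up for illustration and are not produced by any real call):

    ids     = [0, 0, 1, 3]               # batch row each decoded string came from
    strings = [b"a", b"b", b"c", b"d"]   # same length as ids
    per_row = {}
    for row, s in zip(ids, strings):
        per_row.setdefault(row, []).append(s)
    # per_row == {0: [b"a", b"b"], 1: [b"c"], 3: [b"d"]}; rows missing the feature are absent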
-)doc"); - -REGISTER_OP("GetStringTensorsFromHashedDataRecord") -.Attr("feature_id: int") -.Input("hashed_data_record_handle: resource") -.Output("ids: int64") -.Output("strings: string") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns string tensors from the hashed data record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - ids: A 1D int64 tensor representing the input index in a given batch. - strings: A 1D string tensor representing the decoded strings from the batch. -)doc"); - -template -class GetStringTensorsOp : public OpKernel { - private: - int64 feature_id; - - public: - explicit GetStringTensorsOp(OpKernelConstruction *context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - } - - void Compute(OpKernelContext *context) override { - auto handle = getHandle(context, 0); - const int64 batch_size = static_cast(handle->records.size()); - const auto &records = handle->records; - - try { - int64 total_size = 0; - for (const auto &record : records) { - try { - const auto &tensor = record.getRawTensor(feature_id); - total_size += static_cast(tensor.getNumElements()); - } catch(const std::out_of_range &err) { - LOG(WARNING) << "Ignoring missing string tensor with key: " << feature_id << std::endl; - continue; - } - } - - twml::ThriftReader reader(nullptr); - TensorShape shape = {total_size}; - Tensor *strings_tensor = nullptr; - Tensor *ids_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape, &ids_tensor)); - OP_REQUIRES_OK(context, context->allocate_output(1, shape, &strings_tensor)); - - auto strings_data = strings_tensor->flat().data(); - auto ids_data = ids_tensor->flat().data(); - - for (int64 i = 0; i < batch_size; i++) { - const auto &record = records[i]; - try { - const twml::RawTensor &tensor = record.getRawTensor(feature_id); - const uint8_t *buffer = static_cast(tensor.getData()); - const int64 num_strings = static_cast(tensor.getNumElements()); - reader.setBuffer(buffer); - - for (int64 j = 0; j < num_strings; j++) { - const uint8_t *curr_begin = nullptr; - const auto curr_length = reader.getRawBuffer(&curr_begin); - strings_data[j] = std::string(curr_begin, curr_begin + curr_length); - ids_data[j] = i; - } - ids_data += num_strings; - strings_data += num_strings; - } catch(const std::out_of_range &err) { - continue; - } - } - } catch(const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("GetStringTensorsFromDataRecord") - .Device(DEVICE_CPU), - GetStringTensorsOp); - -REGISTER_KERNEL_BUILDER( - Name("GetStringTensorsFromHashedDataRecord") - .Device(DEVICE_CPU), - GetStringTensorsOp); - -REGISTER_OP("GetTensorsFromDataRecord") -.Attr("assert_shape: bool") -.Attr("feature_id: int") -.Input("data_record_handle: resource") -.Output("output: string") -.Output("out_shape: int64") -.Output("out_type: string") -.Output("out_endian: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns tensors from the data record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - output: A 2D byte tensor representing the requested feature. 
- out_shape: A tensor containing [batch_size, thrift_shape]. - out_type: Output type returned as a string tensor of size 1. - out_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. -)doc"); - -REGISTER_OP("GetTensorsFromHashedDataRecord") -.Attr("assert_shape: bool") -.Attr("feature_id: int") -.Input("hashed_data_record_handle: resource") -.Output("output: string") -.Output("out_shape: int64") -.Output("out_type: string") -.Output("out_endian: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that returns decodes and tensors from the hashed data record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - output: A 2D byte tensor representing the requested feature. - out_shape: A tensor containing [batch_size, thrift_shape]. - out_type: Output type returned as a string tensor of size 1. - out_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. -)doc"); - -template -class GetTensorsOp : public OpKernel { - private: - bool assert_shape; - int64 feature_id; - - public: - explicit GetTensorsOp(OpKernelConstruction *context) - : OpKernel(context), assert_shape(true) { - OP_REQUIRES_OK(context, context->GetAttr("assert_shape", &assert_shape)); - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - } - - void Compute(OpKernelContext *context) override { - auto handle = getHandle(context, 0); - uint64 batch_size = handle->records.size(); - const auto &records = handle->records; - - try { - TensorShape raw_shape = {static_cast(batch_size)}; - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, raw_shape, &output_tensor)); - auto output_flat = output_tensor->flat(); - auto output_data = output_flat.data(); - - twml_type type = TWML_TYPE_UNKNOWN; - bool is_big_endian = false; - - std::vector shape(1, batch_size); - uint64 length = 0; - - for (auto record : records) { - const twml::RawTensor tensor = record.getRawTensor(feature_id); - const auto &curr_dims = tensor.getDims(); - const auto curr_type = tensor.getType(); - const bool curr_is_big_endian = tensor.is_big_endian(); - const uint64 curr_length = tensor.getRawLength(); - - // Create the output tensor based on first tensor - if (shape.size() == 1) { - // Push the shape of individual tensors into shape - shape.reserve(curr_dims.size() + 1); - shape.insert(shape.end(), curr_dims.begin(), curr_dims.end()); - type = curr_type; - is_big_endian = curr_is_big_endian; - length = curr_length; - - } else { - if (assert_shape) { - // Assert shape of all tensors is the same. - bool is_same_shape = std::equal(shape.begin() + 1, shape.end(), curr_dims.begin()); - - if (!is_same_shape || length != curr_length) { - throw std::runtime_error("TensorShape mismatch for feature_id: " - + std::to_string(feature_id)); - } - } - - // Assert type and endianness of all tensors is the same. - if (type != curr_type || is_big_endian != curr_is_big_endian) { - throw std::runtime_error("Tensor type mismatch for feature_id: " - + std::to_string(feature_id)); - } - } - - // Copy from datarecord to output - const uint8 *tensor_data = reinterpret_cast(tensor.getData()); - *output_data = std::string(tensor_data, tensor_data + curr_length); - - // Increment it for the next tensor in the batch. 
- output_data++; - } - - Tensor *shape_tensor = nullptr; - TensorShape shape_shape = {static_cast(shape.size())}; - OP_REQUIRES_OK(context, context->allocate_output(1, shape_shape, &shape_tensor)); - auto shape_flat = shape_tensor->flat(); - for (int i = 0; i < static_cast(shape.size()); i++) { - shape_flat(i) = shape[i]; - } - - Tensor* type_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(2, {}, &type_tensor)); - type_tensor->scalar()() = twml::getTypeName(type); - - Tensor* endian_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(3, {}, &endian_tensor)); - endian_tensor->scalar()() = is_big_endian; - } catch(const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("GetTensorsFromDataRecord") - .Device(DEVICE_CPU), - GetTensorsOp); - -REGISTER_KERNEL_BUILDER( - Name("GetTensorsFromHashedDataRecord") - .Device(DEVICE_CPU), - GetTensorsOp); - -REGISTER_OP("GetTensorsWithMissingMaskFromDataRecord") -.Attr("assert_shape: bool") -.Attr("feature_id: int") -.Attr("default_shape: list(int)") -.Attr("dtype_size: int") -.Input("data_record_handle: resource") -.Output("output: string") -.Output("out_type: string") -.Output("out_endian: uint8") -.Output("is_found: bool") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns tensors from the data record. - -Attr - assert_shape: Specifies if the shape needs to be same across the batch. - feature_id: The hashed id of the feature name. - default_shape: Expected shape of output tensor. - dtype_size: expected size of each element. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - output: A 2D byte tensor representing the requested feature. - out_type: A string tensor represnting the type. - out_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. - is_missing: A boolean tensor of length batch_size represnting if the tensor was found for an input. -)doc"); - -REGISTER_OP("GetTensorsWithMissingMaskFromHashedDataRecord") -.Attr("assert_shape: bool") -.Attr("feature_id: int") -.Attr("default_shape: list(int)") -.Attr("dtype_size: int") -.Input("hashed_data_record_handle: resource") -.Output("output: string") -.Output("out_type: string") -.Output("out_endian: uint8") -.Output("is_found: bool") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns tensors from the data record. - -Attr - assert_shape: Specifies if the shape needs to be same across the batch. - feature_id: The hashed id of the feature name. - default_shape: Expected shape of output tensor. - dtype_size: expected size of each element. - -Input - hashed_data_record_handle: Resource handle to HashedDataRecord. - -Outputs - output: A 2D byte tensor representing the requested feature. - out_type: A string tensor represnting the type. - out_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. - is_missing: A boolean tensor of length batch_size represnting if the tensor was found for an input. 
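The GetTensors* ops above hand each feature back as raw bytes plus shape, type-name, and endianness metadata (the WithMissingMask variants documented above additionally return an is_found mask and zero-filled defaults for missing records). A hedged sketch of the client-side decoding, assuming densely packed payloads and an illustrative type-name mapping (the real decoding lives in twml's Python parsers):

```python
# Sketch only: turn the byte outputs of GetTensorsFromDataRecord back into a NumPy array.
import numpy as np

# Assumed mapping from the op's type-name strings to NumPy dtypes (illustrative).
_TYPE_NAMES = {"int32": np.int32, "int64": np.int64, "float": np.float32, "double": np.float64}

def decode_dense_feature(raw_bytes, out_shape, out_type, out_endian):
    """raw_bytes: one bytes object per record; out_shape: [batch_size, *dims]."""
    dtype = np.dtype(_TYPE_NAMES[out_type])
    if out_endian == 1:                      # 1 means the Thrift payload is big endian
        dtype = dtype.newbyteorder(">")
    rows = [np.frombuffer(b, dtype=dtype) for b in raw_bytes]
    return np.stack(rows).reshape(out_shape)

batch = [np.array([1.0, 2.0], dtype=">f8").tobytes(),
         np.array([3.0, 4.0], dtype=">f8").tobytes()]
print(decode_dense_feature(batch, [2, 2], "double", out_endian=1))
# [[1. 2.]
#  [3. 4.]]
```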
-)doc"); - -template -class GetTensorsWithMissingMaskOp : public OpKernel { - private: - bool assert_shape; - int64 feature_id; - int64 dtype_size; - std::vector shape; - - public: - explicit GetTensorsWithMissingMaskOp(OpKernelConstruction *context) - : OpKernel(context), assert_shape(true) { - OP_REQUIRES_OK(context, context->GetAttr("assert_shape", &assert_shape)); - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - OP_REQUIRES_OK(context, context->GetAttr("default_shape", &shape)); - OP_REQUIRES_OK(context, context->GetAttr("dtype_size", &dtype_size)); - } - - void Compute(OpKernelContext *context) override { - auto handle = getHandle(context, 0); - uint64 batch_size = handle->records.size(); - const auto &records = handle->records; - - try { - TensorShape raw_shape = {static_cast(batch_size)}; - Tensor* output_tensor = nullptr; - Tensor* is_found_tensor = nullptr; - - OP_REQUIRES_OK(context, context->allocate_output(0, raw_shape, &output_tensor)); - OP_REQUIRES_OK(context, context->allocate_output(3, raw_shape, &is_found_tensor)); - - auto output_flat = output_tensor->flat(); - auto output_data = output_flat.data(); - auto is_found_data = is_found_tensor->flat().data(); - - twml_type type = TWML_TYPE_UNKNOWN; - bool is_big_endian = false; - - uint64 length = std::accumulate(shape.begin(), shape.end(), dtype_size, std::multiplies()); - for (auto record : records) { - try { - const twml::RawTensor tensor = record.getRawTensor(feature_id); - const auto &curr_dims = tensor.getDims(); - const auto curr_type = tensor.getType(); - const bool curr_is_big_endian = tensor.is_big_endian(); - const uint64 curr_length = tensor.getRawLength(); - - if (type == TWML_TYPE_UNKNOWN) { - type = curr_type; - is_big_endian = curr_is_big_endian; - // FloatTensors are stored as a list of doubles. - // If the requested dtype_size is 4, update the length. - // NOTE: All the missing tensors before this have wrong length, this is fixed at the end. - if (type == TWML_TYPE_DOUBLE && is_big_endian && dtype_size == 4) { - length = length * 2; - } - } else { - // Assert type and endianness of all tensors is the same. - if (type != curr_type || is_big_endian != curr_is_big_endian) { - throw std::runtime_error("Tensor type mismatch for feature_id: " - + std::to_string(feature_id)); - } - } - - // Assert shape of all tensors is the same. - if (assert_shape && type != TWML_TYPE_UNKNOWN) { - // Assert shape of all tensors is the same. - bool is_same_shape = std::equal(shape.begin(), shape.end(), curr_dims.begin()); - - if (!is_same_shape || length != curr_length) { - throw std::runtime_error("TensorShape mismatch for feature_id: " - + std::to_string(feature_id)); - } - } - - // Copy from datarecord to output - const uint8 *tensor_data = reinterpret_cast(tensor.getData()); - *output_data = std::string(tensor_data, tensor_data + curr_length); - *is_found_data = true; - } catch(const std::out_of_range &err) { - *output_data = std::string(); - output_data->resize(length); - *is_found_data = false; - } - - // Increment it for the next tensor in the batch. - output_data++; - is_found_data++; - } - - // Reset pointers to the beginning - output_data = output_flat.data(); - is_found_data = is_found_tensor->flat().data(); - - // Resize any missing tensors before type (and hence true length) was known. 
- if (type == TWML_TYPE_DOUBLE) { - for (int64 i = 0; i < static_cast(records.size()); i++) { - if (!is_found_data[i]) { - output_data[i].resize(length); - } - } - } - - Tensor* type_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, {}, &type_tensor)); - type_tensor->scalar()() = twml::getTypeName(type); - - Tensor* endian_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(2, {}, &endian_tensor)); - endian_tensor->scalar()() = is_big_endian; - } catch(const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("GetTensorsWithMissingMaskFromDataRecord") - .Device(DEVICE_CPU), - GetTensorsWithMissingMaskOp); - -REGISTER_KERNEL_BUILDER( - Name("GetTensorsWithMissingMaskFromHashedDataRecord") - .Device(DEVICE_CPU), - GetTensorsWithMissingMaskOp); - -REGISTER_OP("GetSparseTensorsFromDataRecord") -.Attr("feature_id: int") -.Input("data_record_handle: resource") -.Output("ids: int64") -.Output("indices: string") -.Output("values: string") -.Output("dense_shape: int64") -.Output("values_type: string") -.Output("valueendian: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns tensors from the data record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - ids: A 1D tensor representing which input in the batch the value belongs to. - indices: An string tensor containing indices of the sparse tensor as bytes. - values: An string tensor containing values of the sparse tensor as bytes. - dense_shape: A tensor containing [batch_size, thrift_shape]. - values_type: The data type of value tensor returned as a string tensor of size 1. - values_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. -)doc"); - -REGISTER_OP("GetSparseTensorsFromHashedDataRecord") -.Attr("feature_id: int") -.Input("hashed_data_record_handle: resource") -.Output("ids: int64") -.Output("indices: string") -.Output("values: string") -.Output("dense_shape: int64") -.Output("values_type: string") -.Output("values_endian: uint8") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - return Status::OK(); - }).Doc(R"doc( -A tensorflow OP that decodes and returns tensors from the data record. - -Attr - feature_id: The hashed id of the feature name. - -Input - data_record_handle: Resource handle to DataRecord. - -Outputs - ids: A 1D tensor representing which input in the batch the value belongs to. - indices: An string tensor containing indices of the sparse tensor as bytes. - values: An string tensor containing values of the sparse tensor as bytes. - dense_shape: A tensor containing [batch_size, thrift_shape]. - values_type: The data type of value tensor returned as a string tensor of size 1. - values_endian: Endianness of the bytes returned a tensor of size 1. 0: litte, 1: big. 
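Per the output contract documented above, the sparse ops return flat byte payloads plus an ids vector that records which input each value belongs to. A hedged sketch of assembling those outputs into a tf.SparseTensor, assuming big-endian int64 indices (which the kernel below asserts) and big-endian double values:

```python
# Sketch only: assemble GetSparseTensors* outputs into a tf.SparseTensor.
import numpy as np
import tensorflow.compat.v1 as tf

def to_sparse_tensor(ids, indices_bytes, values_bytes, dense_shape):
    cols = np.frombuffer(indices_bytes, dtype=">i8").astype(np.int64)
    vals = np.frombuffer(values_bytes, dtype=">f8").astype(np.float32)
    # Pair each within-record column index with the record id it came from.
    sp_indices = np.stack([np.asarray(ids, dtype=np.int64), cols], axis=1)
    return tf.SparseTensor(indices=sp_indices, values=vals, dense_shape=dense_shape)

# Two records, dense width 10: record 0 has {3: 1.0}, record 1 has {7: 2.0}.
ids = [0, 1]
idx_bytes = np.array([3, 7], dtype=">i8").tobytes()
val_bytes = np.array([1.0, 2.0], dtype=">f8").tobytes()
sp = to_sparse_tensor(ids, idx_bytes, val_bytes, dense_shape=[2, 10])
```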
-)doc"); - -template -class GetSparseTensorsOp : public OpKernel { - private: - int64 feature_id; - - public: - explicit GetSparseTensorsOp(OpKernelConstruction *context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_id", &feature_id)); - } - - void Compute(OpKernelContext *context) override { - auto handle = getHandle(context, 0); - const int64 batch_size = static_cast(handle->records.size()); - const auto &records = handle->records; - - try { - twml_type type = TWML_TYPE_UNKNOWN; - bool is_big_endian = false; - - std::vector shape(1, batch_size); - - int64 total_length = 0; - std::vector lengths; - lengths.reserve(batch_size); - - int64 total_indices_length = 0; - std::vector indices_raw_lengths; - std::vector indices_data_ptrs; - indices_raw_lengths.reserve(batch_size); - indices_data_ptrs.reserve(batch_size); - - int64 total_values_length = 0; - std::vector values_raw_lengths; - std::vector values_data_ptrs; - values_raw_lengths.reserve(batch_size); - values_data_ptrs.reserve(batch_size); - - for (auto record : records) { - const twml::RawSparseTensor sparse_tensor = record.getRawSparseTensor(feature_id); - const twml::RawTensor indices = sparse_tensor.indices(); - const twml::RawTensor values = sparse_tensor.values(); - const auto &dense_shape = sparse_tensor.denseShape(); - const auto indices_type = indices.getType(); - const auto indices_is_big_endian = indices.is_big_endian(); - const auto values_type = values.getType(); - const bool values_is_big_endian = values.is_big_endian(); - - const uint64 indices_length = indices.getDims().back(); - const uint64 values_length = values.getDims().back(); - - auto indices_raw_length = indices.getRawLength(); - auto values_raw_length = values.getRawLength(); - - auto indices_data_ptr = reinterpret_cast(indices.getData()); - auto values_data_ptr = reinterpret_cast(values.getData()); - - indices_raw_lengths.push_back(indices_raw_length); - values_raw_lengths.push_back(values_raw_length); - - indices_data_ptrs.push_back(indices_data_ptr); - values_data_ptrs.push_back(values_data_ptr); - - total_indices_length += indices_raw_length; - total_values_length += values_raw_length; - - if (shape.size() == 1) { - shape.reserve(dense_shape.size() + 1); - shape.insert(shape.end(), dense_shape.begin(), dense_shape.end()); - type = values_type; - is_big_endian = values_is_big_endian; - } - - // Assert shape of all tensors is the same. - if (!std::equal(shape.begin() + 1, shape.end(), dense_shape.begin())) { - throw std::runtime_error("dense_shape of sparse tensors doesn't match for feature_id: " - + std::to_string(feature_id)); - } - // Assert type of all values tensor is the same. - if (type != values_type || is_big_endian != values_is_big_endian) { - throw std::runtime_error("The type of values do not match for feature_id: " - + std::to_string(feature_id)); - } - // Assert indices tensor is big endian and of type INT64. 
- if (indices_type != TWML_TYPE_INT64 || !indices_is_big_endian) { - throw std::runtime_error("Unexpected type for index tensor for feature_id: " - + std::to_string(feature_id)); - } - - if (indices_length != values_length) { - throw std::runtime_error("The length of values and indices does not match for : " - + std::to_string(feature_id)); - } - - lengths.push_back(indices_length); - total_length += indices_length; - } - - Tensor* ids_tensor = nullptr; - TensorShape ids_shape = {static_cast(total_length)}; - OP_REQUIRES_OK(context, context->allocate_output(0, ids_shape, &ids_tensor)); - auto ids_tensor_flat = ids_tensor->flat(); - auto ids_tensor_data = ids_tensor_flat.data(); - - TensorShape raw_shape = {static_cast(1)}; - - Tensor* indices_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, raw_shape, &indices_tensor)); - auto indices_tensor_flat = indices_tensor->flat(); - auto indices_tensor_string = indices_tensor_flat.data(); - indices_tensor_string->resize(total_indices_length); - auto indices_tensor_iter = indices_tensor_string->begin(); - - Tensor* values_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(2, raw_shape, &values_tensor)); - auto values_tensor_flat = values_tensor->flat(); - auto values_tensor_string = values_tensor_flat.data(); - values_tensor_string->resize(total_values_length); - auto values_tensor_iter = values_tensor_string->begin(); - - for (int64 i = 0; i < batch_size; i++) { - // Fill in the data for id == i for all values in the current input. - std::fill(ids_tensor_data, ids_tensor_data + lengths[i], i); - ids_tensor_data += lengths[i]; - - indices_tensor_iter = std::copy(indices_data_ptrs[i], - indices_data_ptrs[i] + indices_raw_lengths[i], - indices_tensor_iter); - - values_tensor_iter = std::copy(values_data_ptrs[i], - values_data_ptrs[i] + values_raw_lengths[i], - values_tensor_iter); - } - - Tensor *shape_tensor = nullptr; - TensorShape shape_shape = {static_cast(shape.size())}; - OP_REQUIRES_OK(context, context->allocate_output(3, shape_shape, &shape_tensor)); - auto shape_flat = shape_tensor->flat(); - for (int i = 0; i < static_cast(shape.size()); i++) { - shape_flat(i) = shape[i]; - } - - Tensor* type_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(4, {}, &type_tensor)); - type_tensor->scalar()() = twml::getTypeName(type); - - Tensor* endian_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(5, {}, &endian_tensor)); - endian_tensor->scalar()() = is_big_endian; - } catch(const std::exception &err) { - context->CtxFailureWithWarning(errors::InvalidArgument(err.what())); - } - } -}; - -REGISTER_KERNEL_BUILDER( - Name("GetSparseTensorsFromDataRecord") - .Device(DEVICE_CPU), - GetSparseTensorsOp); - -REGISTER_KERNEL_BUILDER( - Name("GetSparseTensorsFromHashedDataRecord") - .Device(DEVICE_CPU), - GetSparseTensorsOp); diff --git a/twml/libtwml/src/ops/tensorflow_utils.cpp b/twml/libtwml/src/ops/tensorflow_utils.cpp deleted file mode 100644 index 95ebc7e4c..000000000 --- a/twml/libtwml/src/ops/tensorflow_utils.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include "tensorflow_utils.h" -#include -#include - -twml::Tensor TFTensor_to_twml_tensor(Tensor &input) { - int ndims = input.dims(); - std::vector dims(ndims); - std::vector strides(ndims); - for (int i = 0; i < ndims; i++) { - dims[i] = input.dim_size(i); - } - uint64_t stride = 1; - for (int i = ndims-1; i >= 0; i--) { - strides[i] = stride; - stride *= dims[i]; - } - - switch (input.dtype()) { - case DT_INT8: - return 
twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_INT8); - case DT_UINT8: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_UINT8); - case DT_INT32: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_INT32); - case DT_INT64: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_INT64); - case DT_FLOAT: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_FLOAT); - case DT_DOUBLE: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_DOUBLE); - case DT_BOOL: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_BOOL); - case DT_STRING: - return twml::Tensor(input.flat().data(), dims, strides, TWML_TYPE_STRING); - default: - throw twml::Error(TWML_ERR_TYPE, "Unknown tensor data type."); - break; - } -} - -const twml::Tensor TFTensor_to_twml_tensor(const Tensor &input) { - // TODO: define some type of constant tensor, which should be used for inputs to force not - // changing - return TFTensor_to_twml_tensor(const_cast(input)); -} - -twml::RawTensor TFTensor_to_twml_raw_tensor(Tensor &input) { - int ndims = input.dims(); - std::vector dims(ndims); - std::vector strides(ndims); - for (int i = 0; i < ndims; i++) { - dims[i] = input.dim_size(i); - } - uint64_t stride = 1; - for (int i = ndims-1; i >= 0; i--) { - strides[i] = stride; - stride *= dims[i]; - } - - switch (input.dtype()) { - case DT_INT8: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_INT8, false, input.flat().size()); - case DT_UINT8: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_UINT8, false, input.flat().size()); - case DT_INT32: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_INT32, false, input.flat().size()); - case DT_INT64: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_INT64, false, input.flat().size()); - case DT_FLOAT: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_FLOAT, false, input.flat().size()); - case DT_DOUBLE: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_DOUBLE, false, input.flat().size()); - case DT_BOOL: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_BOOL, false, input.flat().size()); - case DT_STRING: - return twml::RawTensor(input.flat().data(), dims, strides, TWML_TYPE_STRING, false, input.flat().size()); - default: - throw twml::Error(TWML_ERR_TYPE, "Unknown tensor data type."); - break; - } -} - -const twml::RawTensor TFTensor_to_twml_raw_tensor(const Tensor &input) { - // TODO: define some type of constant tensor, which should be used for inputs to force not - // changing - return TFTensor_to_twml_raw_tensor(const_cast(input)); -} diff --git a/twml/libtwml/src/ops/tensorflow_utils.h b/twml/libtwml/src/ops/tensorflow_utils.h deleted file mode 100644 index 4940f680d..000000000 --- a/twml/libtwml/src/ops/tensorflow_utils.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include - -using namespace tensorflow; -twml::Tensor TFTensor_to_twml_tensor(Tensor &input); -twml::RawTensor TFTensor_to_twml_raw_tensor(Tensor &input); -const twml::Tensor TFTensor_to_twml_tensor(const Tensor &input); -const twml::RawTensor TFTensor_to_twml_raw_tensor(const Tensor &input); - diff --git a/twml/libtwml/src/ops/var_length_reader.cpp b/twml/libtwml/src/ops/var_length_reader.cpp deleted file mode 100644 index 
62b5fc2a1..000000000 --- a/twml/libtwml/src/ops/var_length_reader.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" - -using namespace tensorflow; - -REGISTER_OP("VarLengthReader") -.Input("input1: int32") -.Output("output: int32") -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - ::tensorflow::shape_inference::ShapeHandle input; - // check that input has only 1 dimension. - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input)); - // there's no inference on output shape. - return Status::OK(); - }); - - -class VarLengthReaderOp : public OpKernel { - public: - explicit VarLengthReaderOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - - // get the first element in the input tensor, use it as output shape. - int32 len = input(0); - TensorShape output_shape = {1, len}; - - // Create an output tensor, the size is determined by the content of input. - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor)); - - auto output_flat = output_tensor->flat(); - - // Fill output with ones. - const int N = output_flat.size(); - for (int i = 0; i < N; i++) { - output_flat(i) = 1; - } - } -}; - -REGISTER_KERNEL_BUILDER(Name("VarLengthReader").Device(DEVICE_CPU), VarLengthReaderOp); diff --git a/twml/setup.cfg b/twml/setup.cfg deleted file mode 100644 index d887f33c2..000000000 --- a/twml/setup.cfg +++ /dev/null @@ -1,8 +0,0 @@ -[bdist_wheel] -universal=1 - -[build] -build-lib=build_dir - -[bdist] -bdist-base=build_dir diff --git a/twml/setup.py b/twml/setup.py deleted file mode 100644 index 7e4003bae..000000000 --- a/twml/setup.py +++ /dev/null @@ -1,29 +0,0 @@ -import os - -from setuptools import find_packages, setup - - -THIS_DIR = os.path.dirname(os.path.realpath(__file__)) -TWML_TEST_DATA_DIR = os.path.join(THIS_DIR, 'twml/tests/data') - -data_files = [] -for parent, children, files in os.walk(TWML_TEST_DATA_DIR): - data_files += [os.path.join(parent, f) for f in files] - -setup( - name='twml', - version='2.0', - description="Tensorflow wrapper for twml", - packages=find_packages(exclude=["build"]), - install_requires=[ - 'thriftpy2', - 'numpy', - 'pyyaml', - 'future', - 'scikit-learn', - 'scipy' - ], - package_data={ - 'twml': data_files, - }, -) diff --git a/twml/twml/__init__.py b/twml/twml/__init__.py deleted file mode 100644 index 0c96df68b..000000000 --- a/twml/twml/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -""" Importing the pyton op wrappers """ - -import os - -# Import from twitter.deepbird -from twitter.deepbird.logging.log_level import set_logging_level # noqa: F401 -from twitter.deepbird.sparse import SparseTensor # noqa: F401 -from twitter.deepbird.sparse import sparse_dense_matmul # noqa: F401 - -from .util import dynamic_partition, feature_id, limit_bits, limit_sparse_tensor_size # noqa: F401 -from .util import write_file, fixed_length_tensor, setup_tf_logging_formatter # noqa: F401 -from .array import Array # noqa: F401 - -# Module to parse feature patterns and match them from data_spec.json -from .feature_config import FeatureConfig, FeatureConfigBuilder # noqa: F401 - -# Data record streaming, reading, writing, and parsing. 
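For reference, the VarLengthReader op deleted above simply reads the first element of its int32 input and returns a [1, len] tensor of ones; the output shape is therefore data dependent. A NumPy sketch of that behaviour (illustration only; the real op is a registered TF CPU kernel):

```python
import numpy as np

def var_length_reader(input1):
    length = int(input1[0])                 # the first element drives the output shape
    return np.ones((1, length), dtype=np.int32)

print(var_length_reader(np.array([3, 99, 7], dtype=np.int32)))
# [[1 1 1]]
```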
-from .dataset import * # noqa: T400 -from .readers import * # noqa: T400 -from .block_format_writer import * # noqa: T400 - -# Graph output functions -from .export_output_fns import * # noqa: T400 - -# Input parsers -from .parsers import * # noqa: T400 - -# Input functions -from .input_fns import * # noqa: T400 - -# Feature filter functions -from .filters import * # noqa: T400 - -# Custom argparser for Trainer -from .argument_parser import * # noqa: T400 - -from . import constants # noqa: F401 -from . import errors # noqa: F401 -from . import layers # noqa: F401 -from . import lookup # noqa: F401 -from . import readers # noqa: F401 -from . import summary # noqa: F401 -from . import tensorboard # noqa: F401 - -import tensorflow.compat.v1 as tf # noqa: F402 -tf.disable_eager_execution() - -# TODO: Figure out a better way to deal with this. -if 'OMP_NUM_THREADS' not in os.environ and 'MKL_NUM_THREADS' not in os.environ: - os.environ["OMP_NUM_THREADS"] = '1' - -# Import all custom C++ ops -from libtwml import add1, partition_sparse_tensor, CLIB # noqa: F401 - -# Configure logging levels to info for various frameworks -set_logging_level('INFO') - -from . import contrib # noqa: F401 -from . import hooks # noqa: F401 -from . import trainers # noqa: F401 -from . import metrics # noqa: F401 diff --git a/twml/twml/argument_parser.py b/twml/twml/argument_parser.py deleted file mode 100644 index c771eebdf..000000000 --- a/twml/twml/argument_parser.py +++ /dev/null @@ -1,561 +0,0 @@ -# pylint: disable=protected-access, arguments-differ -""" -Command-line argument parsing for the Trainer. -""" -import argparse -from argparse import ArgumentError -from operator import attrgetter -import tempfile - -import twml -import tensorflow.compat.v1 as tf - - -SERIAL = "serial" -TREE = "tree" -LOG_LEVELS = { - "debug": tf.logging.DEBUG, - "info": tf.logging.INFO, - "warn": tf.logging.WARN, - "error": tf.logging.ERROR} - - -class SortingHelpFormatter(argparse.HelpFormatter): - """ - Used to sort args alphabetically in the help message. - """ - - def add_arguments(self, actions): - actions = sorted(actions, key=attrgetter('option_strings')) - super(SortingHelpFormatter, self).add_arguments(actions) - - -def _set_log_level(level=None): - """Sets the tensorflow log level to the input level.""" - if level is None: - return None - level = level.lower() - if level not in LOG_LEVELS.keys(): - raise ValueError(f"Unexpected log level {level} was given but expected one of {LOG_LEVELS.keys()}.") - tf.logging.set_verbosity(LOG_LEVELS[level]) - tf.logging.info(f"Setting tensorflow logging level to {level} or {LOG_LEVELS[level]}") - return level - - -def get_trainer_parser(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Args: - learning_rate_decay: - Defaults to False. When True, parses learning rate decay arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) - - parser.add_argument( - "--save_dir", type=str, default=tempfile.mkdtemp(), - help="Path to the training result directory." 
- "supports local filesystem path and hdfs://default/ which requires " - "setting HDFS configuration via env variable HADOOP_CONF_DIR ") - parser.add_argument( - "--export_dir", type=str, default=None, - help="Path to the directory to export a SavedModel for prediction servers.") - parser.add_argument( - "--log_aggregation_app_id", type=str, default=None, - help="specify app_id for log aggregation. disabled by default.") - parser.add_argument( - "--train.batch_size", "--train_batch_size", type=int, default=32, - dest='train_batch_size', - help="number of samples per training batch") - parser.add_argument( - "--eval.batch_size", "--eval_batch_size", type=int, default=32, - dest='eval_batch_size', - help="number of samples per cross-validation batch. Defaults to train_batch_size") - parser.add_argument( - "--train.learning_rate", "--learning_rate", type=float, default=0.002, - dest='learning_rate', - help="learning rate. Scales the gradient update.") - parser.add_argument( - "--train.steps", "--train_steps", type=int, default=-1, - dest='train_steps', - help="number of training batches before running evaluation." - "Defaults to -1 (runs through entire dataset). " - "Only used for Trainer.[train,learn]. " - "For Trainer.train_and_evaluate, use train.max_steps instead. ") - parser.add_argument( - "--eval.steps", "--eval_steps", type=int, default=-1, - dest="eval_steps", - help="number of steps per evaluation. Each batch is a step." - "Defaults to -1 (runs through entire dataset). ") - parser.add_argument( - "--eval.period", "--eval_period", type=int, default=600, - dest="eval_period", - help="Trainer.train_and_evaluate waits for this long after each evaluation. " - "Defaults to 600 seconds (evaluate every ten minutes). " - "Note that anything lower than 10*60seconds is probably a bad idea because TF saves " - "checkpoints every 10mins by default. eval.delay is time to wait before doing first eval. " - "eval.period is time between successive evals.") - parser.add_argument( - "--eval.delay", "--eval_delay", type=int, default=120, - dest="eval_delay", - help="Trainer.train_and_evaluate waits for this long before performing the first evaluation" - "Defaults to 120 seconds (evaluate after first 2 minutes of training). " - "eval.delay is time to wait before doing first eval. " - "eval.period is time between successive evals.") - parser.add_argument( - "--train.max_steps", "--train_max_steps", type=int, default=None, - dest="train_max_steps", - help="Stop training after this many global steps. Each training batch is its own step." - "If set to None, step after one train()/evaluate() call. Useful when train.steps=-1." - "If set to a non-positive value, loop forever. Usually useful with early stopping.") - parser.add_argument( - "--train.log_metrics", dest="train_log_metrics", action="store_true", default=False, - help="Set this to true to see metrics during training. " - "WARNING: metrics during training does not represent model performance. " - "WARNING: use for debugging only as this slows down training.") - parser.add_argument( - "--train.early_stop_patience", "--early_stop_patience", type=int, default=-1, - dest="early_stop_patience", - help="max number of evaluations (epochs) to wait for an improvement in the early_stop_metric." - "Defaults to -1 (no early-stopping)." 
- "NOTE: This can not be enabled when --distributed is also set.") - parser.add_argument( - "--train.early_stop_tolerance", "--early_stop_tolerance", type=float, default=0, - dest="early_stop_tolerance", - help="a non-negative tolerance for comparing early_stop_metric." - "e.g. when maximizing the condition is current_metric > best_metric + tolerance." - "Defaults to 0.") - parser.add_argument( - "--train.dataset_shards", "--train_dataset_shards", - dest="train_dataset_shards", - type=int, default=None, - help="An int value that indicates the number of partitions (shards) for the dataset. This is" - " useful for codistillation and other techniques that require each worker to train on disjoint" - " partitions of the dataset.") - parser.add_argument( - "--train.dataset_shard_index", "--train_dataset_shard_index", - dest="train_dataset_shard_index", - type=int, default=None, - help="An int value (starting at zero) that indicates which partition (shard) of the dataset" - " to use if --train.dataset_shards is set.") - parser.add_argument( - "--continue_from_checkpoint", dest="continue_from_checkpoint", action="store_true", - help="DEPRECATED. This option is currently a no-op." - " Continuing from the provided checkpoint is now the default." - " Use --overwrite_save_dir if you would like to override it instead" - " and restart training from scratch.") - parser.add_argument( - "--overwrite_save_dir", dest="overwrite_save_dir", action="store_true", - help="Delete the contents of the current save_dir if it exists") - parser.add_argument( - "--data_threads", "--num_threads", type=int, default=2, - dest="num_threads", - help="Number of threads to use for loading the dataset. " - "num_threads is deprecated and to be removed in future versions. Use data_threads.") - parser.add_argument( - "--max_duration", "--max_duration", type=float, default=None, - dest="max_duration", - help="Maximum duration (in secs) that training/validation will be allowed to run for before being automatically terminated.") - parser.add_argument( - "--num_workers", type=int, default=None, - help="Number of workers to use when training in hogwild manner on a single node.") - parser.add_argument( - "--distributed", dest="distributed", action="store_true", - help="Pass this flag to use train_and_evaluate to train in a distributed fashion" - "NOTE: You can not use early stopping when --distributed is enabled" - ) - parser.add_argument( - "--distributed_training_cleanup", - dest="distributed_training_cleanup", - action="store_true", - help="Set if using distributed training on GKE to stop TwitterSetDeployment" - "from continuing training upon restarts (will be deprecated once we migrate off" - "TwitterSetDeployment for distributed training on GKE)." - ) - parser.add_argument( - "--disable_auto_ps_shutdown", default=False, action="store_true", - help="Disable the functionality of automatically shutting down parameter server after " - "distributed training complete (either succeed or failed)." - ) - parser.add_argument( - "--disable_tensorboard", default=False, action="store_true", - help="Do not start the TensorBoard server." - ) - parser.add_argument( - "--tensorboard_port", type=int, default=None, - help="Port for tensorboard to run on. Ignored if --disable_tensorboard is set.") - parser.add_argument( - "--health_port", type=int, default=None, - help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." - "Not user-facing as it is set automatically by the twml_cli." 
- ) - parser.add_argument( - "--stats_port", type=int, default=None, - help="Port to listen on for stats endpoints" - ) - parser.add_argument( - "--experiment_tracking_path", - dest="experiment_tracking_path", - type=str, default=None, - help="The tracking path of this experiment. Format: \ - user_name:project_name:experiment_name:run_name. The path is used to track and display \ - a record of this experiment on ML Dashboard. Note: this embedded experiment tracking is \ - disabled when the deprecated Model Repo TrackRun is used in your model config. ") - parser.add_argument( - "--disable_experiment_tracking", - dest="disable_experiment_tracking", - action="store_true", - help="Whether experiment tracking should be disabled.") - parser.add_argument( - "--config.save_checkpoints_secs", "--save_checkpoints_secs", type=int, default=600, - dest='save_checkpoints_secs', - help="Configures the tf.estimator.RunConfig.save_checkpoints_secs attribute. " - "Specifies how often checkpoints are saved in seconds. Defaults to 10*60 seconds.") - parser.add_argument( - "--config.keep_checkpoint_max", "--keep_checkpoint_max", type=int, default=20, - dest='keep_checkpoint_max', - help="Configures the tf.estimator.RunConfig.keep_checkpoint_max attribute. " - "Specifies how many checkpoints to keep. Defaults to 20.") - parser.add_argument( - "--config.tf_random_seed", "--tf_random_seed", type=int, default=None, - dest='tf_random_seed', - help="Configures the tf.estimator.RunConfig.tf_random_seed attribute. " - "Specifies the seed to use. Defaults to None.") - parser.add_argument( - "--optimizer", type=str, default='SGD', - help="Optimizer to use: SGD (Default), Adagrad, Adam, Ftrl, Momentum, RMSProp, LazyAdam, DGC.") - parser.add_argument( - "--gradient_noise_scale", type=float, default=None, - help="adds 0-mean normal noise scaled by this value. Defaults to None.") - parser.add_argument( - "--clip_gradients", type=float, default=None, - help="If specified, a global clipping is applied to prevent " - "the norm of the gradient to exceed this value. Defaults to None.") - parser.add_argument( - "--dgc.density", "--dgc_density", type=float, default=0.1, - dest="dgc_density", - help="Specifies gradient density level when using deep gradient compression optimizer." - "E.g., default value being 0.1 means that only top 10%% most significant rows " - "(based on absolute value sums) are kept." - ) - parser.add_argument( - "--dgc.density_decay", "--dgc_density_decay", type=bool, default=True, - dest="dgc_density_decay", - help="Specifies whether to (exponentially) decay the gradient density level when" - " doing gradient compression. If set 'False', the 'density_decay_steps', " - "'density_decay_rate' and 'min_density' arguments will be ignored." - ) - parser.add_argument( - "--dgc.density_decay_steps", "--dgc_density_decay_steps", type=int, default=10000, - dest="dgc_density_decay_steps", - help="Specifies the step interval to perform density decay." - ) - parser.add_argument( - "--dgc.density_decay_rate", "--dgc_density_decay_rate", type=float, default=0.5, - dest="dgc_density_decay_rate", - help="Specifies the decay rate when perfoming density decay." - ) - parser.add_argument( - "--dgc.min_density", "--dgc_min_density", type=float, default=0.1, - dest="dgc_min_density", - help="Specifies the minimum density level when perfoming density decay." 
- ) - parser.add_argument( - "--dgc.accumulation", "--dgc_accumulation", type=bool, default=False, - dest="dgc_accumulation", - help="Specifies whether to accumulate small gradients when using deep gradient compression " - "optimizer." - ) - parser.add_argument( - "--show_optimizer_summaries", dest="show_optimizer_summaries", action="store_true", - help="When specified, displays gradients and learning rate in tensorboard." - "Turning it on has 10-20%% performance hit. Enable for debugging only") - - parser.add_argument( - "--num_mkl_threads", dest="num_mkl_threads", default=1, type=int, - help="Specifies how many threads to use for MKL" - "inter_op_ parallelism_threds is set to TWML_NUM_CPUS / num_mkl_threads." - "intra_op_parallelism_threads is set to num_mkl_threads.") - - parser.add_argument("--verbosity", type=_set_log_level, choices=LOG_LEVELS.keys(), default=None, - help="Sets log level to a given verbosity.") - - parser.add_argument( - "--feature_importance.algorithm", dest="feature_importance_algorithm", - type=str, default=TREE, choices=[SERIAL, TREE], - help=""" - There are two algorithms that the module supports, `serial` and `tree`. - The `serial` algorithm computes feature importances for each feature, and - the `tree` algorithm groups features by feature name prefix, computes feature - importances for groups of features, and then only 'zooms-in' on a group when the - importance is greater than the `--feature_importance.sensitivity` value. The `tree` algorithm - will usually run faster, but for relatively unimportant features it will only compute an - upper bound rather than an exact importance value. We suggest that users generally stick - to the `tree` algorithm, unless if they have a very small number of features or - near-random model performance. - """) - - parser.add_argument( - "--feature_importance.sensitivity", dest="feature_importance_sensitivity", type=float, default=0.03, - help=""" - The maximum amount that permuting a feature group can cause the model performance (determined - by `feature_importance.metric`) to drop before the algorithm decides to not expand the feature - group. This is only used for the `tree` algorithm. - """) - - parser.add_argument( - "--feature_importance.dont_build_tree", dest="dont_build_tree", action="store_true", default=False, - help=""" - If True, don't build the feature trie for the tree algorithm and only use the extra_groups - """) - - parser.add_argument( - "--feature_importance.split_feature_group_on_period", dest="split_feature_group_on_period", action="store_true", default=False, - help="If true, split feature groups by the period rather than the optimal prefix. Only used for the TREE algorithm") - - parser.add_argument( - "--feature_importance.example_count", dest="feature_importance_example_count", type=int, default=10000, - help=""" - The number of examples used to compute feature importance. - Larger values yield more reliable results, but also take longer to compute. - These records are loaded into memory. This number is agnostic to batch size. - """) - - parser.add_argument( - "--feature_importance.data_dir", dest="feature_importance_data_dir", type=str, default=None, - help="Path to the dataset used to compute feature importance." 
- "supports local filesystem path and hdfs://default/ which requires " - "setting HDFS configuration via env variable HADOOP_CONF_DIR " - "Defaults to eval_data_dir") - - parser.add_argument( - "--feature_importance.metric", dest="feature_importance_metric", type=str, default="roc_auc", - help="The metric used to determine when to stop expanding the feature importance tree. This is only used for the `tree` algorithm.") - - parser.add_argument( - "--feature_importance.is_metric_larger_the_better", dest="feature_importance_is_metric_larger_the_better", action="store_true", default=False, - help="If true, interpret `--feature_importance.metric` to be a metric where larger values are better (e.g. ROC_AUC)") - - parser.add_argument( - "--feature_importance.is_metric_smaller_the_better", dest="feature_importance_is_metric_smaller_the_better", action="store_true", default=False, - help="If true, interpret `--feature_importance.metric` to be a metric where smaller values are better (e.g. LOSS)") - - subparsers = parser.add_subparsers(help='Learning Rate Decay Functions. Can only pass 1.' - 'Should be specified after all the optional arguments' - 'and followed by its specific args' - 'e.g. --learning_rate 0.01 inverse_learning_rate_decay_fn' - ' --decay_rate 0.0004 --min_learning_rate 0.001', - dest='learning_rate_decay') - - # Create the parser for the "exponential_learning_rate_decay_fn" - parser_exponential = subparsers.add_parser('exponential_learning_rate_decay', - help='Exponential learning rate decay. ' - 'Exponential decay implements:' - 'decayed_learning_rate = learning_rate * ' - 'exponential_decay_rate ^ ' - '(global_step / decay_steps') - parser_exponential.add_argument( - "--decay_steps", type=float, default=None, - help="Required for 'exponential' learning_rate_decay.") - parser_exponential.add_argument( - "--exponential_decay_rate", type=float, default=None, - help="Required for 'exponential' learning_rate_decay. Must be positive. ") - - # Create the parser for the "polynomial_learning_rate_decay_fn" - parser_polynomial = subparsers.add_parser('polynomial_learning_rate_decay', - help='Polynomial learning rate decay. ' - 'Polynomial decay implements: ' - 'global_step = min(global_step, decay_steps)' - 'decayed_learning_rate = ' - '(learning_rate - end_learning_rate) * ' - '(1 - global_step / decay_steps) ^ ' - '(polynomial_power) + end_learning_rate' - 'So for linear decay you can use a ' - 'polynomial_power=1 (the default)') - parser_polynomial.add_argument( - "--end_learning_rate", type=float, default=0.0001, - help="Required for 'polynomial' learning_rate_decay (ignored otherwise).") - parser_polynomial.add_argument( - "--polynomial_power", type=float, default=0.0001, - help="Required for 'polynomial' learning_rate_decay." - "The power of the polynomial. Defaults to linear, 1.0.") - parser_polynomial.add_argument( - "--decay_steps", type=float, default=None, - help="Required for 'polynomial' learning_rate_decay. ") - - # Create the parser for the "piecewise_constant_learning_rate_decay_fn" - parser_piecewise_constant = subparsers.add_parser('piecewise_constant_learning_rate_decay', - help='Piecewise Constant ' - 'learning rate decay. ' - 'For piecewise_constant, ' - 'consider this example: ' - 'We want to use a learning rate ' - 'that is 1.0 for' - 'the first 100000 steps,' - '0.5 for steps 100001 to 110000, ' - 'and 0.1 for any additional steps. 
' - 'To do so, specify ' - '--piecewise_constant_boundaries=100000,110000' - '--piecewise_constant_values=1.0,0.5,0.1') - parser_piecewise_constant.add_argument( - "--piecewise_constant_values", - action=parse_comma_separated_list(element_type=float), - default=None, - help="Required for 'piecewise_constant_values' learning_rate_decay. " - "A list of comma seperated floats or ints that specifies the values " - "for the intervals defined by boundaries. It should have one more " - "element than boundaries.") - parser_piecewise_constant.add_argument( - "--piecewise_constant_boundaries", - action=parse_comma_separated_list(element_type=int), - default=None, - help="Required for 'piecewise_constant_values' learning_rate_decay. " - "A list of comma seperated integers, with strictly increasing entries.") - - # Create the parser for the "inverse_learning_rate_decay_fn" - parser_inverse = subparsers.add_parser('inverse_learning_rate_decay', - help='Inverse Leaning rate decay. ' - 'Inverse implements:' - 'decayed_lr = max(lr /(1 + decay_rate * ' - 'floor(global_step /decay_step)),' - ' min_learning_rate)' - 'When decay_step=1 this mimics the behaviour' - 'of the default learning rate decay' - 'of DeepBird v1.') - - parser_inverse.add_argument( - "--decay_rate", type=float, default=None, - help="Required for 'inverse' learning_rate_decay. Rate in which we decay the learning rate.") - parser_inverse.add_argument( - "--min_learning_rate", type=float, default=None, - help="Required for 'inverse' learning_rate_decay.Minimum possible learning_rate.") - parser_inverse.add_argument( - "--decay_steps", type=float, default=1, - help="Required for 'inverse' learning_rate_decay.") - - # Create the parser for the "cosine_learning_rate_decay_fn" - parser_cosine = subparsers.add_parser('cosine_learning_rate_decay', - help='Cosine Leaning rate decay. ' - 'Cosine implements:' - 'decayed_lr = 0.5 * (1 + cos(pi *\ - global_step / decay_steps)) * lr' - ) - - parser_cosine.add_argument( - "--alpha", type=float, default=0, - help="A scalar float32 or float64 Tensor or a Python number.\ - Minimum learning rate value as a fraction of learning_rate.") - parser_cosine.add_argument( - "--decay_steps", type=float, - help="Required for 'inverse' learning_rate_decay.") - - # Create the parser for the "cosine_restart_learning_rate_decay_fn" - parser_cosine_restart = subparsers.add_parser('cosine_restarts_learning_rate_decay', - help='Applies cosine decay with restarts \ - to the learning rate' - 'See [Loshchilov & Hutter, ICLR2016],\ - SGDR: Stochastic' - 'Gradient Descent with Warm Restarts.' - 'https://arxiv.org/abs/1608.03983' - ) - parser_cosine_restart.add_argument( - "--first_decay_steps", type=float, - help="Required for 'cosine_restart' learning_rate_decay.") - parser_cosine_restart.add_argument( - "--alpha", type=float, default=0, - help="A scalar float32 or float64 Tensor or a Python number. \ - Minimum learning rate value as a fraction of learning_rate.") - parser_cosine_restart.add_argument( - "--t_mul", type=float, default=2, - help="A scalar float32 or float64 Tensor or a Python number. \ - Used to derive the number of iterations in the i-th period") - parser_cosine_restart.add_argument( - "--m_mul", type=float, default=1, - help="A scalar float32 or float64 Tensor or a Python number. \ - Used to derive the initial learning rate of the i-th period.") - - # Create dummy parser for None, which is the default. 
- parser_default = subparsers.add_parser( - 'no_learning_rate_decay', - help='No learning rate decay') # noqa: F841 - - parser.set_default_subparser('no_learning_rate_decay') - - return parser - - -class DefaultSubcommandArgParse(argparse.ArgumentParser): - """ - Subclass of argparse.ArgumentParser that sets default parser - """ - _DEFAULT_SUBPARSER = None - - def set_default_subparser(self, name): - """ - sets the default subparser - """ - self._DEFAULT_SUBPARSER = name - - def _parse_known_args(self, arg_strings, *args, **kwargs): - """ - Overwrites _parse_known_args - """ - in_args = set(arg_strings) - d_sp = self._DEFAULT_SUBPARSER - if d_sp is not None and not {'-h', '--help'}.intersection(in_args): - for x_val in self._subparsers._actions: - subparser_found = ( - isinstance(x_val, argparse._SubParsersAction) and - in_args.intersection(x_val._name_parser_map.keys()) - ) - if subparser_found: - break - else: - # insert default in first position, this implies no - # global options without a sub_parsers specified - arg_strings = arg_strings + [d_sp] - return super(DefaultSubcommandArgParse, self)._parse_known_args( - arg_strings, *args, **kwargs - ) - - def _check_value(self, action, value): - try: - super(DefaultSubcommandArgParse, self)._check_value( - action, value - ) - except ArgumentError as error: - error.message += ("\nERROR: Deepbird is trying to interpret \"{}\" as a value of {}. If this is not what you expected, " - "then most likely one of the following two things are happening: Either one of your cli arguments are not recognized, " - "probably {} or whichever argument you are passing {} as a value to OR you are passing in an argument after " - "the `learning_rate_decay` argument.\n").format(value, action.dest, value, value) - raise error - - -def parse_comma_separated_list(element_type=str): - """ - Generates an argparse.Action that converts a string representing a comma separated list to a - list and converts each element to a specified type. - """ - - # pylint: disable-msg=too-few-public-methods - class _ParseCommaSeparatedList(argparse.Action): - """ - Converts a string representing a comma separated list to a list and converts each element to a - specified type. - """ - - def __call__(self, parser, namespace, values, option_string=None): - if values is not None: - values = [element_type(v) for v in values.split(',')] - setattr(namespace, self.dest, values) - - return _ParseCommaSeparatedList diff --git a/twml/twml/array.py b/twml/twml/array.py deleted file mode 100644 index a8524a06d..000000000 --- a/twml/twml/array.py +++ /dev/null @@ -1,101 +0,0 @@ -"""Module containing wrapper class to allow numpy arrays to work with twml functions""" - -import ctypes as ct - -from absl import logging -from libtwml import CLIB -import numpy as np - - -_NP_TO_TWML_TYPE = { - 'float32': ct.c_int(1), - 'float64': ct.c_int(2), - 'int32': ct.c_int(3), - 'int64': ct.c_int(4), - 'int8': ct.c_int(5), - 'uint8': ct.c_int(6), -} - - -class Array(object): - """ - Wrapper class to allow numpy arrays to work with twml functions. - """ - - def __init__(self, array): - """ - Wraps numpy array and creates a handle that can be passed to C functions from libtwml. 
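With the subparsers and the DefaultSubcommandArgParse machinery defined above, a typical invocation parses the optional Trainer flags first and a single learning-rate-decay subcommand afterwards. A hedged usage sketch (assuming twml and its dependencies are importable; the flag values are examples only):

```python
import twml.argument_parser

parser = twml.argument_parser.get_trainer_parser()
args = parser.parse_args([
    "--save_dir", "/tmp/twml_run",
    "--train.batch_size", "64",
    "--learning_rate", "0.01",
    "inverse_learning_rate_decay",      # the subcommand must come after the optional flags
    "--decay_rate", "0.0004",
    "--min_learning_rate", "0.001",
])
print(args.train_batch_size, args.learning_rate, args.learning_rate_decay, args.decay_rate)
# 64 0.01 inverse_learning_rate_decay 0.0004
```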
- - array: Numpy array - """ - if not isinstance(array, np.ndarray): - raise TypeError("Input must be a numpy array") - - try: - ttype = _NP_TO_TWML_TYPE[array.dtype.name] - except KeyError as err: - logging.error("Unsupported numpy type") - raise err - - handle = ct.c_void_p(0) - ndim = ct.c_int(array.ndim) - dims = array.ctypes.get_shape() - isize = array.dtype.itemsize - - strides_t = ct.c_size_t * array.ndim - strides = strides_t(*[n // isize for n in array.strides]) - - err = CLIB.twml_tensor_create(ct.pointer(handle), - array.ctypes.get_as_parameter(), - ndim, dims, strides, ttype) - - if err != 1000: - raise RuntimeError("Error from libtwml") - - # Store the numpy array to ensure it isn't deleted before self - self._array = array - - self._handle = handle - - self._type = ttype - - @property - def handle(self): - """ - Return the twml handle - """ - return self._handle - - @property - def shape(self): - """ - Return the shape - """ - return self._array.shape - - @property - def ndim(self): - """ - Return the shape - """ - return self._array.ndim - - @property - def array(self): - """ - Return the numpy array - """ - return self._array - - @property - def dtype(self): - """ - Return numpy dtype - """ - return self._array.dtype - - def __del__(self): - """ - Delete the handle - """ - CLIB.twml_tensor_delete(self._handle) diff --git a/twml/twml/block_format_writer.py b/twml/twml/block_format_writer.py deleted file mode 100644 index 9c4a9b6a8..000000000 --- a/twml/twml/block_format_writer.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Module containing wrapper class to write block format data""" -import ctypes as ct - -from libtwml import CLIB - - -class BlockFormatWriter(object): - """ - Class to write block format file. - """ - - def __init__(self, file_name, records_per_block=100): - file_name = file_name - if not isinstance(file_name, str): - raise ValueError("file_name has to be of type str") - - self.file_name = ct.c_char_p(file_name.encode()) - self.records_per_block = ct.c_int(int(records_per_block)) - handle = ct.c_void_p(0) - err = CLIB.block_format_writer_create(ct.pointer(handle), - self.file_name, - self.records_per_block) - self._handle = None - # 1000 means TWML_ERR_NONE - if err != 1000: - raise RuntimeError("Error from libtwml") - self._handle = handle - - @property - def handle(self): - """ - Return the handle - """ - return self._handle - - def write(self, class_name, record): - """ - Write a record. - - Note: `record` needs to be in a format that can be converted to ctypes.c_char_p. - """ - if not isinstance(class_name, str): - raise ValueError("class_name has to be of type str") - - record_len = len(record) - class_name = ct.c_char_p(class_name.encode()) - record = ct.c_char_p(record) - err = CLIB.block_format_write(self._handle, class_name, record, record_len) - if err != 1000: - raise RuntimeError("Error from libtwml") - - def flush(self): - """ - Flush records in buffer to outputfile. 
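A hedged usage sketch for the BlockFormatWriter above: records are buffered in blocks of records_per_block and flushed explicitly. The path, class name, and payloads below are placeholders; each record must already be serialized to bytes (anything convertible to ctypes.c_char_p):

```python
from twml.block_format_writer import BlockFormatWriter

writer = BlockFormatWriter("/tmp/example_block_file", records_per_block=100)
for record_bytes in [b"\x0c\x00\x01", b"\x0c\x00\x02"]:   # placeholder serialized records
    writer.write("com.twitter.ml.api.ttypes.DataRecord", record_bytes)
writer.flush()   # forces any buffered records out to the file
```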
- """ - err = CLIB.block_format_flush(self._handle) - if err != 1000: - raise RuntimeError("Error from libtwml") - - def __del__(self): - """ - Delete the handle - """ - if self._handle: - CLIB.block_format_writer_delete(self._handle) diff --git a/twml/twml/constants.py b/twml/twml/constants.py deleted file mode 100644 index c6c726eed..000000000 --- a/twml/twml/constants.py +++ /dev/null @@ -1,11 +0,0 @@ -# These should coincide with 'enum class DecodeMode' values in HashedDataRecordReader.h - -from twitter.deepbird.io.legacy.constants import ( - DECODE_MODES, # noqa: F401 - DEFAULT_DECODE_MODE, # noqa: F401 - HASH_FNAME_AND_VALNAME, # noqa: F401 - HASH_VALNAME, # noqa: F401 - HashingDiscretizerOptions, # noqa: F401 - DEFAULT_ZOOKEEPER_BASE_ZNODE, # noqa: F401 - DEFAULT_ZOOKEEPER_HOST, # noqa: F401 -) diff --git a/twml/twml/contrib/__init__.py b/twml/twml/contrib/__init__.py deleted file mode 100644 index 1a5e8efe4..000000000 --- a/twml/twml/contrib/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# pylint: disable=wildcard-import -""" experimental and contributed modules """ - -from . import layers # noqa: F401 -from . import feature_importances # noqa: F401 -from . import calibrators # noqa: F401 -from . import readers # noqa: F401 -from . import utils # noqa: F401 -from . import build_graphs_fns # noqa: F401 -from . import feature_config # noqa: F401 -from . import parsers # noqa: F401 -from . import initializers # noqa: F401 -from . import export # noqa: F401 -from . import feature_config_parsers # noqa: F401 - -# These imports do not work with TF 2.x and are not needed either. -# If you are using TF 2.x, use the modular targets under src/python/twitter/deepbird. -import tensorflow -from . import trainers # noqa: F401 -from . import metrics # noqa: F401 -from . import hooks # noqa: F401 diff --git a/twml/twml/contrib/build_graphs_fns.py b/twml/twml/contrib/build_graphs_fns.py deleted file mode 100644 index 829f61512..000000000 --- a/twml/twml/contrib/build_graphs_fns.py +++ /dev/null @@ -1,32 +0,0 @@ -# pylint: disable=unused-argument, missing-docstring -''' -Common build graphs that can be reused -''' -import tensorflow.compat.v1 as tf - - -def get_saved_modules_graph(input_graph_fn): - """ - Get common graph for stitching different saved modules for export. - This graph is used to save checkpoints; and then export the modules - as a unity. - Args: - features: - model features - params: - model params - input_graph_fn: - main logic for the stitching - Returns: - build_graph - """ - def build_graph(features, label, mode, params, config=None): - output = input_graph_fn(features, params) - # If mode is train, we just need to assign a dummy loss - # and update the train op. This is done to save the graph to save_dir. - if mode == 'train': - loss = tf.constant(1) - train_op = tf.assign_add(tf.train.get_global_step(), 1) - return {'train_op': train_op, 'loss': loss} - return output - return build_graph diff --git a/twml/twml/contrib/calibrators/__init__.py b/twml/twml/contrib/calibrators/__init__.py deleted file mode 100644 index 02181ed12..000000000 --- a/twml/twml/contrib/calibrators/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# pylint: disable=wildcard-import -""" -This module contains classes used for calibration. -Typically, each calibrator defines a ``twml.calibrator.Calibrator`` subclass -and a ``twml.calibrator.CalibrationFeature``. -The latter manages weights and values of individual features. 
-The former manages a set of ``CalibratorFeatures`` -(although some ``Calibrators`` don't use ``CalibrationFeature``). -Ultimately, the ``Calibrator`` should produce an initialized layer via its ``to_layer()`` method. -""" - -from .common_calibrators import calibrate_discretizer_and_export, add_discretizer_arguments # noqa: F401 -from .calibrator import Calibrator # noqa: F401 -from .mdl import MDLCalibrator # noqa: F401 -from .isotonic import IsotonicCalibrator # noqa: F401 -from .percentile_discretizer import PercentileDiscretizerCalibrator # noqa: F401 -from .hashed_percentile_discretizer import HashedPercentileDiscretizerCalibrator # noqa: F401 -from .hashing_discretizer import HashingDiscretizerCalibrator # noqa: F401 \ No newline at end of file diff --git a/twml/twml/contrib/calibrators/calibrator.py b/twml/twml/contrib/calibrators/calibrator.py deleted file mode 100644 index 7408412e0..000000000 --- a/twml/twml/contrib/calibrators/calibrator.py +++ /dev/null @@ -1,157 +0,0 @@ -# pylint: disable=missing-docstring, unused-argument -''' Contains the base classes for CalibrationFeature and Calibrator ''' - - -from collections import defaultdict - -import numpy as np -import tensorflow.compat.v1 as tf -import tensorflow_hub as hub -import twml -import twml.util - - -class CalibrationFeature(object): - ''' - Accumulates values and weights for individual features. - Typically, each unique feature defined in the accumulated SparseTensor or Tensor - would have its own CalibrationFeature instance. - ''' - - def __init__(self, feature_id): - ''' Constructs a CalibrationFeature - - Arguments: - feature_id: - number identifying the feature. - ''' - self.feature_id = feature_id - self._calibrated = False - self._features_dict = defaultdict(list) - - def add_values(self, new_features): - ''' - Extends lists to contain the values in this batch - ''' - for key in new_features: - self._features_dict[key].append(new_features[key]) - - def _concat_arrays(self): - ''' - This class calls this function after you have added all the values. - It creates a dictionary with the concatanated arrays - ''' - self._features_dict.update((k, np.concatenate(v)) for k, v in self._features_dict.items()) - - def calibrate(self, *args, **kwargs): - raise NotImplementedError - - -class Calibrator(object): - ''' - Accumulates features and their respective values for Calibration - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()`` and; - 2. calibrate by calling ``calibrate()``; - 3. convert to a twml.layers layer by calling ``to_layer()``. - - Note you can only use one calibrator per Trainer. - ''' - - def __init__(self, calibrator_name=None, **kwargs): - ''' - Arguments: - calibrator_name. - Default: if set to None it will be the same as the class name. - Please be reminded that if in the model there are many calibrators - of the same type the calibrator_name should be changed to avoid confusion. 
- ''' - self._calibrated = False - if calibrator_name is None: - calibrator_name = twml.util.to_snake_case(self.__class__.__name__) - self._calibrator_name = calibrator_name - self._kwargs = kwargs - - @property - def is_calibrated(self): - return self._calibrated - - @property - def name(self): - return self._calibrator_name - - def accumulate(self, *args, **kwargs): - '''Accumulates features and their respective values for Calibration.''' - raise NotImplementedError - - def calibrate(self): - '''Calibrates after the accumulation has ended.''' - self._calibrated = True - - def to_layer(self, name=None): - ''' - Returns a twml.layers.Layer instance with the result of calibrator. - - Arguments: - name: - name-scope of the layer - ''' - raise NotImplementedError - - def get_layer_args(self): - ''' - Returns layer arguments required to implement multi-phase training. - - Returns: - dictionary of Layer constructor arguments to initialize the - layer Variables. Typically, this should contain enough information - to initialize empty layer Variables of the correct size, which will then - be filled with the right data using init_map. - ''' - raise NotImplementedError - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory. Default (string): "default". - name: - name for the calibrator. - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - inputs = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - output = calibrator_layer(inputs) - # creates the signature to the calibrator module - hub.add_signature(inputs=inputs, outputs=output, name=name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - def write_summary(self, writer, sess=None): - """ - This method is called by save() to write tensorboard summaries to disk. - See MDLCalibrator.write_summary for an example. - By default, the method does nothing. It can be overloaded by child-classes. - - Arguments: - writer: - `tf.summary.FilteWriter - `_ - instance. - The ``writer`` is used to add summaries to event files for inclusion in tensorboard. - sess (optional): - `tf.Session `_ - instance. The ``sess`` is used to produces summaries for the writer. - """ diff --git a/twml/twml/contrib/calibrators/common_calibrators.py b/twml/twml/contrib/calibrators/common_calibrators.py deleted file mode 100644 index 5301901e4..000000000 --- a/twml/twml/contrib/calibrators/common_calibrators.py +++ /dev/null @@ -1,707 +0,0 @@ -# pylint: disable=invalid-name, no-member, unused-argument -""" -This module contains common calibrate and export functions for calibrators. -""" - -# These 3 TODO are encapsulated by CX-11446 -# TODO: many of these functions hardcode datarecords yet don't allow passing a parse_fn. -# TODO: provide more generic (non DataRecord specific) functions -# TODO: many of these functions aren't common at all. -# For example, Discretizer functions should be moved to PercentileDiscretizer. 
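# Illustrative sketch (not from the twml source): the Calibrator base class above
# prescribes a three-step lifecycle -- accumulate() per batch, calibrate() once,
# then convert the result into a layer via to_layer(). The toy class below mimics
# that contract with plain numpy so the control flow is visible; the class name
# and its mean-offset "calibration" are assumptions for illustration only, not
# twml behaviour (a real twml calibrator returns a twml.layers layer).
import numpy as np

class ToyMeanCalibrator:
    def __init__(self):
        self._values = []          # accumulated per-batch predictions
        self._calibrated = False
        self._offset = None

    def accumulate(self, predictions):
        # step 1: stash raw batch values until calibrate() is called
        self._values.append(np.asarray(predictions, dtype=np.float32))

    def calibrate(self):
        # step 2: a single pass over everything accumulated so far
        if self._calibrated:
            raise RuntimeError("Can only calibrate once")
        self._offset = float(np.concatenate(self._values).mean())
        self._calibrated = True

    def to_layer(self):
        # step 3: hand back something usable at inference time
        if not self._calibrated:
            raise RuntimeError("Expecting prior call to calibrate()")
        return lambda preds: np.asarray(preds, dtype=np.float32) - self._offset

# usage: one calibrator per trainer, fed batch by batch
toy = ToyMeanCalibrator()
for batch in (np.array([0.2, 0.4]), np.array([0.9])):
    toy.accumulate(batch)
toy.calibrate()
apply_calibration = toy.to_layer()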
- -import copy -import os -import time - -from absl import logging -import tensorflow.compat.v1 as tf -import tensorflow_hub as hub -import twml -from twml.argument_parser import SortingHelpFormatter -from twml.input_fns import data_record_input_fn -from twml.util import list_files_by_datetime, sanitize_hdfs_path -from twml.contrib.calibrators.isotonic import IsotonicCalibrator - - -def calibrator_arguments(parser): - """ - Calibrator Parameters to add to relevant parameters to the DataRecordTrainerParser. - Otherwise, if alone in a file, it just creates its own default parser. - Arguments: - parser: - Parser with the options to the model - """ - parser.add_argument("--calibrator.save_dir", type=str, - dest="calibrator_save_dir", - help="Path to save or load calibrator calibration") - parser.add_argument("--calibrator_batch_size", type=int, default=128, - dest="calibrator_batch_size", - help="calibrator batch size") - parser.add_argument("--calibrator_parts_downsampling_rate", type=float, default=1, - dest="calibrator_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--calibrator_max_steps", type=int, default=None, - dest="calibrator_max_steps", - help="Max Steps taken by calibrator to accumulate samples") - parser.add_argument("--calibrator_num_bins", type=int, default=22, - dest="calibrator_num_bins", - help="Num bins of calibrator") - parser.add_argument("--isotonic_calibrator", dest='isotonic_calibrator', action='store_true', - help="Isotonic Calibrator present") - parser.add_argument("--calibrator_keep_rate", type=float, default=1.0, - dest="calibrator_keep_rate", - help="Keep rate") - return parser - - -def _generate_files_by_datetime(params): - - files = list_files_by_datetime( - base_path=sanitize_hdfs_path(params.train_data_dir), - start_datetime=params.train_start_datetime, - end_datetime=params.train_end_datetime, - datetime_prefix_format=params.datetime_format, - extension="lzo", - parallelism=1, - hour_resolution=params.hour_resolution, - sort=True) - - return files - - -def get_calibrate_input_fn(parse_fn, params): - """ - Default input function used for the calibrator. - Arguments: - parse_fn: - Parse_fn - params: - Parameters - Returns: - input_fn - """ - - return lambda: data_record_input_fn( - files=_generate_files_by_datetime(params), - batch_size=params.calibrator_batch_size, - parse_fn=parse_fn, - num_threads=1, - repeat=False, - keep_rate=params.calibrator_keep_rate, - parts_downsampling_rate=params.calibrator_parts_downsampling_rate, - shards=None, - shard_index=None, - shuffle=True, - shuffle_files=True, - interleave=True) - - -def get_discretize_input_fn(parse_fn, params): - """ - Default input function used for the calibrator. - Arguments: - parse_fn: - Parse_fn - params: - Parameters - Returns: - input_fn - """ - - return lambda: data_record_input_fn( - files=_generate_files_by_datetime(params), - batch_size=params.discretizer_batch_size, - parse_fn=parse_fn, - num_threads=1, - repeat=False, - keep_rate=params.discretizer_keep_rate, - parts_downsampling_rate=params.discretizer_parts_downsampling_rate, - shards=None, - shard_index=None, - shuffle=True, - shuffle_files=True, - interleave=True) - - -def discretizer_arguments(parser=None): - """ - Discretizer Parameters to add to relevant parameters to the DataRecordTrainerParser. - Otherwise, if alone in a file, it just creates its own default parser. - Arguments: - parser: - Parser with the options to the model. 
Defaults to None - """ - - if parser is None: - parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) - parser.add_argument( - "--overwrite_save_dir", dest="overwrite_save_dir", action="store_true", - help="Delete the contents of the current save_dir if it exists") - parser.add_argument( - "--train.data_dir", "--train_data_dir", type=str, default=None, - dest="train_data_dir", - help="Path to the training data directory." - "Supports local and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--train.start_date", "--train_start_datetime", - type=str, default=None, - dest="train_start_datetime", - help="Starting date for training inside the train data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--train.end_date", "--train_end_datetime", type=str, default=None, - dest="train_end_datetime", - help="Ending date for training inside the train data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--datetime_format", type=str, default="%Y/%m/%d", - help="Date format for training and evaluation datasets." - "Has to be a format that is understood by python datetime." - "e.g. %Y/%m/%d for 2019/01/15." - "Used only if {train/eval}.{start/end}_date are provided.") - parser.add_argument( - "--hour_resolution", type=int, default=None, - help="Specify the hourly resolution of the stored data.") - parser.add_argument( - "--tensorboard_port", type=int, default=None, - help="Port for tensorboard to run on.") - parser.add_argument( - "--stats_port", type=int, default=None, - help="Port for stats server to run on.") - parser.add_argument( - "--health_port", type=int, default=None, - help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." - "Not user-facing as it is set automatically by the twml_cli." - ) - parser.add_argument( - "--data_spec", type=str, default=None, - help="Path to data specification JSON file. This file is used to decode DataRecords") - parser.add_argument("--discretizer.save_dir", type=str, - dest="discretizer_save_dir", - help="Path to save or load discretizer calibration") - parser.add_argument("--discretizer_batch_size", type=int, default=128, - dest="discretizer_batch_size", - help="Discretizer batch size") - parser.add_argument("--discretizer_keep_rate", type=float, default=0.0008, - dest="discretizer_keep_rate", - help="Keep rate") - parser.add_argument("--discretizer_parts_downsampling_rate", type=float, default=0.2, - dest="discretizer_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--discretizer_max_steps", type=int, default=None, - dest="discretizer_max_steps", - help="Max Steps taken by discretizer to accumulate samples") - return parser - - -def calibrate(trainer, params, build_graph, input_fn, debug=False): - """ - Calibrate Isotonic Calibration - Arguments: - trainer: - Trainer - params: - Parameters - build_graph: - Build Graph used to be the input to the calibrator - input_fn: - Input Function specified by the user - debug: - Defaults to False. 
Returns the calibrator - """ - - if trainer._estimator.config.is_chief: - - # overwrite the current save_dir - if params.overwrite_save_dir and tf.io.gfile.exists(params.calibrator_save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % params.calibrator_save_dir) - tf.io.gfile.rmtree(params.calibrator_save_dir) - - calibrator = IsotonicCalibrator(params.calibrator_num_bins) - - # chief trains discretizer - logging.info("Chief training calibrator") - - # Accumulate the features for each calibrator - features, labels = input_fn() - if 'weights' not in features: - raise ValueError("Weights need to be returned as part of the parse_fn") - weights = features.pop('weights') - - preds = build_graph(features=features, label=None, mode='infer', params=params, config=None) - init = tf.global_variables_initializer() - table_init = tf.tables_initializer() - with tf.Session() as sess: - sess.run(init) - sess.run(table_init) - count = 0 - max_steps = params.calibrator_max_steps or -1 - while max_steps <= 0 or count <= max_steps: - try: - weights_vals, labels_vals, preds_vals = sess.run([weights, labels, preds['output']]) - calibrator.accumulate(preds_vals, labels_vals, weights_vals.flatten()) - except tf.errors.OutOfRangeError: - break - count += 1 - - calibrator.calibrate() - calibrator.save(params.calibrator_save_dir) - trainer.estimator._params.isotonic_calibrator = True - - if debug: - return calibrator - - else: - calibrator_save_dir = twml.util.sanitize_hdfs_path(params.calibrator_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(calibrator_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % calibrator_save_dir) - time.sleep(60) - - -def discretize(params, feature_config, input_fn, debug=False): - """ - Discretizes continuous features - Arguments: - params: - Parameters - input_fn: - Input Function specified by the user - debug: - Defaults to False. 
Returns the calibrator - """ - - if (os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" or "num_workers" not in params or - params.num_workers is None): - - # overwrite the current save_dir - if params.overwrite_save_dir and tf.io.gfile.exists(params.discretizer_save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % params.discretizer_save_dir) - tf.io.gfile.rmtree(params.discretizer_save_dir) - - config_map = feature_config() - discretize_dict = config_map['discretize_config'] - - # chief trains discretizer - logging.info("Chief training discretizer") - - batch = input_fn() - # Accumulate the features for each calibrator - with tf.Session() as sess: - count = 0 - max_steps = params.discretizer_max_steps or -1 - while max_steps <= 0 or count <= max_steps: - try: - inputs = sess.run(batch) - for name, clbrt in discretize_dict.items(): - clbrt.accumulate_features(inputs[0], name) - except tf.errors.OutOfRangeError: - break - count += 1 - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - for name, clbrt in discretize_dict.items(): - clbrt.calibrate() - clbrt.add_hub_signatures(name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(params.discretizer_save_dir, session) - - for name, clbrt in discretize_dict.items(): - clbrt.write_summary_json(params.discretizer_save_dir, name) - - if debug: - return discretize_dict - - else: - # wait for the file to be removed (if necessary) - # should be removed after an actual fix applied - time.sleep(60) - discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(discretizer_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) - time.sleep(60) - - -def add_discretizer_arguments(parser): - """ - Add discretizer-specific command-line arguments to a Trainer parser. - - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - - parser.add_argument("--discretizer.save_dir", type=str, - dest="discretizer_save_dir", - help="Path to save or load discretizer calibration") - parser.add_argument("--discretizer.batch_size", type=int, default=128, - dest="discretizer_batch_size", - help="Discretizer batch size") - parser.add_argument("--discretizer.keep_rate", type=float, default=0.0008, - dest="discretizer_keep_rate", - help="Keep rate") - parser.add_argument("--discretizer.parts_downsampling_rate", type=float, default=0.2, - dest="discretizer_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--discretizer.num_bins", type=int, default=20, - dest="discretizer_num_bins", - help="Number of bins per feature") - parser.add_argument("--discretizer.output_size_bits", type=int, default=22, - dest="discretizer_output_size_bits", - help="Number of bits allocated to the output size") - return parser - - -def add_isotonic_calibrator_arguments(parser): - """ - Add discretizer-specific command-line arguments to a Trainer parser. 
- - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - parser.add_argument("--calibrator.num_bins", type=int, - default=25000, dest="calibrator_num_bins", - help="number of bins for isotonic calibration") - parser.add_argument("--calibrator.parts_downsampling_rate", type=float, default=0.1, - dest="calibrator_parts_downsampling_rate", help="Parts downsampling rate") - parser.add_argument("--calibrator.save_dir", type=str, - dest="calibrator_save_dir", help="Path to save or load calibrator output") - parser.add_argument("--calibrator.load_tensorflow_module", type=str, default=None, - dest="calibrator_load_tensorflow_module", - help="Location from where to load a pretrained graph from. \ - Typically, this is where the MLP graph is saved") - parser.add_argument("--calibrator.export_mlp_module_name", type=str, default='tf_hub_mlp', - help="Name for loaded hub signature", - dest="export_mlp_module_name") - parser.add_argument("--calibrator.export_isotonic_module_name", - type=str, default="tf_hub_isotonic", - dest="calibrator_export_module_name", - help="export module name") - parser.add_argument("--calibrator.final_evaluation_steps", type=int, - dest="calibrator_final_evaluation_steps", default=None, - help="number of steps for final evaluation") - parser.add_argument("--calibrator.train_steps", type=int, default=-1, - dest="calibrator_train_steps", - help="number of steps for calibration") - parser.add_argument("--calibrator.batch_size", type=int, default=1024, - dest="calibrator_batch_size", - help="Calibrator batch size") - parser.add_argument("--calibrator.is_calibrating", action='store_true', - dest="is_calibrating", - help="Dummy argument to allow running in chief worker") - return parser - - -def calibrate_calibrator_and_export(name, calibrator, build_graph_fn, params, feature_config, - run_eval=True, input_fn=None, metric_fn=None, - export_task_type_overrider=None): - """ - Pre-set `isotonic calibrator` calibrator. - Args: - name: - scope name used for the calibrator - calibrator: - calibrator that will be calibrated and exported. - build_graph_fn: - build graph function for the calibrator - params: - params passed to the calibrator - feature_config: - feature config which will be passed to the trainer - export_task_type_overrider: - the task type for exporting the calibrator - if specified, this will override the default export task type in trainer.hub_export(..) - """ - - # create calibrator params - params_c = copy.deepcopy(params) - params_c.data_threads = 1 - params_c.num_workers = 1 - params_c.continue_from_checkpoint = True - params_c.overwrite_save_dir = False - params_c.stats_port = None - - # Automatically load from the saved Tensorflow Hub module if not specified. - if params_c.calibrator_load_tensorflow_module is None: - path_saved_tensorflow_model = os.path.join(params.save_dir, params.export_mlp_module_name) - params_c.calibrator_load_tensorflow_module = path_saved_tensorflow_model - - if "calibrator_parts_downsampling_rate" in params_c: - params_c.train_parts_downsampling_rate = params_c.calibrator_parts_downsampling_rate - if "calibrator_save_dir" in params_c: - params_c.save_dir = params_c.calibrator_save_dir - if "calibrator_batch_size" in params_c: - params_c.train_batch_size = params_c.calibrator_batch_size - params_c.eval_batch_size = params_c.calibrator_batch_size - # TODO: Deprecate this option. It is not actually used. 
Calibrator - # simply iterates until the end of input_fn. - if "calibrator_train_steps" in params_c: - params_c.train_steps = params_c.calibrator_train_steps - - if metric_fn is None: - metric_fn = twml.metrics.get_multi_binary_class_metric_fn(None) - - # Common Trainer which will also be used by all workers - trainer = twml.trainers.DataRecordTrainer( - name=name, - params=params_c, - feature_config=feature_config, - build_graph_fn=build_graph_fn, - save_dir=params_c.save_dir, - metric_fn=metric_fn - ) - - if trainer._estimator.config.is_chief: - - # Chief trains calibrator - logging.info("Chief training calibrator") - - # Disregard hogwild config - os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") - os.environ["TWML_HOGWILD_PORTS"] = "" - - hooks = None - if params_c.calibrator_train_steps > 0: - hooks = [twml.hooks.StepProgressHook(params_c.calibrator_train_steps)] - - def parse_fn(input_x): - fc_parse_fn = feature_config.get_parse_fn() - features, labels = fc_parse_fn(input_x) - features['labels'] = labels - return features, labels - - if input_fn is None: - input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) - - # Calibrate stage - trainer.estimator._params.mode = 'calibrate' - trainer.calibrate(calibrator=calibrator, - input_fn=input_fn, - steps=params_c.calibrator_train_steps, - hooks=hooks) - - # Save Checkpoint - # We need to train for 1 step, to save the graph to checkpoint. - # This is done just by the chief. - # We need to set the mode to evaluate to save the graph that will be consumed - # In the final evaluation - trainer.estimator._params.mode = 'evaluate' - trainer.train(input_fn=input_fn, steps=1) - - # Restore hogwild setup - if os_twml_hogwild_ports is not None: - os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports - else: - # Workers wait for calibration to be ready - final_calibrator_path = os.path.join(params_c.calibrator_save_dir, - params_c.calibrator_export_module_name) - - final_calibrator_path = twml.util.sanitize_hdfs_path(final_calibrator_path) - - while not tf.io.gfile.exists(final_calibrator_path + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % final_calibrator_path) - time.sleep(60) - - # Evaluate stage - if run_eval: - trainer.estimator._params.mode = 'evaluate' - # This will allow the Evaluate method to be run in Hogwild - # trainer.estimator._params.continue_from_checkpoint = True - trainer.evaluate(name='test', input_fn=input_fn, steps=params_c.calibrator_final_evaluation_steps) - - trainer.hub_export(name=params_c.calibrator_export_module_name, - export_task_type_overrider=export_task_type_overrider, - serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn()) - - return trainer - - -def calibrate_discretizer_and_export(name, calibrator, build_graph_fn, params, feature_config): - """ - Pre-set percentile discretizer calibrator. - Args: - name: - scope name used for the calibrator - calibrator: - calibrator that will be calibrated and exported. - build_graph_fn: - build graph function for the calibrator - params: - params passed to the calibrator - feature_config: - feature config or input_fn which will be passed to the trainer. 
- """ - - if (os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" or "num_workers" not in params or - params.num_workers is None): - - # chief trains discretizer - logging.info("Chief training discretizer") - - # disregard hogwild config - os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") - os.environ["TWML_HOGWILD_PORTS"] = "" - - # create discretizer params - params_c = copy.deepcopy(params) - params_c.data_threads = 1 - params_c.train_steps = -1 - params_c.train_max_steps = None - params_c.eval_steps = -1 - params_c.num_workers = 1 - params_c.tensorboard_port = None - params_c.stats_port = None - - if "discretizer_batch_size" in params_c: - params_c.train_batch_size = params_c.discretizer_batch_size - params_c.eval_batch_size = params_c.discretizer_batch_size - if "discretizer_keep_rate" in params_c: - params_c.train_keep_rate = params_c.discretizer_keep_rate - if "discretizer_parts_downsampling_rate" in params_c: - params_c.train_parts_downsampling_rate = params_c.discretizer_parts_downsampling_rate - if "discretizer_save_dir" in params_c: - params_c.save_dir = params_c.discretizer_save_dir - - # train discretizer - trainer = twml.trainers.DataRecordTrainer( - name=name, - params=params_c, - build_graph_fn=build_graph_fn, - save_dir=params_c.save_dir, - ) - - if isinstance(feature_config, twml.feature_config.FeatureConfig): - parse_fn = twml.parsers.get_continuous_parse_fn(feature_config) - input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) - elif callable(feature_config): - input_fn = feature_config - else: - got_type = type(feature_config).__name__ - raise ValueError( - "Expecting feature_config to be FeatureConfig or function got %s" % got_type) - - hooks = None - if params_c.train_steps > 0: - hooks = [twml.hooks.StepProgressHook(params_c.train_steps)] - - trainer.calibrate(calibrator=calibrator, input_fn=input_fn, - steps=params_c.train_steps, hooks=hooks) - # restore hogwild setup - if os_twml_hogwild_ports is not None: - os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports - else: - discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(discretizer_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) - time.sleep(60) - - -def build_percentile_discretizer_graph(features, label, mode, params, config=None): - """ - Pre-set Percentile Discretizer Build Graph - Follows the same signature as build_graph - """ - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - weights = tf.reshape(features['weights'], tf.reshape(features['batch_size'], [1])) - if isinstance(sparse_tf, tf.SparseTensor): - indices = sparse_tf.indices[:, 1] - ids = sparse_tf.indices[:, 0] - elif isinstance(sparse_tf, twml.SparseTensor): - indices = sparse_tf.indices - ids = sparse_tf.ids - - # Return weights, feature_ids, feature_values - weights = tf.gather(params=weights, indices=ids) - feature_ids = indices - feature_values = sparse_tf.values - # Update train_op and assign dummy_loss - train_op = tf.assign_add(tf.train.get_global_step(), 1) - loss = tf.constant(1) - if mode == 'train': - return {'train_op': train_op, 'loss': loss} - return {'feature_ids': feature_ids, 'feature_values': feature_values, 'weights': weights} - - -def isotonic_module(mode, params): - """ - Common Isotonic Calibrator module for Hub Export - """ - inputs = tf.sparse_placeholder(tf.float32, name="sparse_input") - 
mlp = hub.Module(params.calibrator_load_tensorflow_module) - logits = mlp(inputs, signature=params.export_mlp_module_name) - isotonic_calibrator = hub.Module(params.save_dir) - output = isotonic_calibrator(logits, signature="isotonic_calibrator") - hub.add_signature(inputs={"sparse_input": inputs}, - outputs={"default": output}, - name=params.calibrator_export_module_name) - - -def build_isotonic_graph_from_inputs(inputs, features, label, mode, params, config=None, isotonic_fn=None): - """ - Helper function to build_isotonic_graph - Pre-set Isotonic Calibrator Build Graph - Follows the same signature as build_graph - """ - if params.mode == 'calibrate': - mlp = hub.Module(params.calibrator_load_tensorflow_module) - logits = mlp(inputs, signature=params.export_mlp_module_name) - weights = tf.reshape(features['weights'], tf.reshape(features['batch_size'], [1])) - # Update train_op and assign dummy_loss - train_op = tf.assign_add(tf.train.get_global_step(), 1) - loss = tf.constant(1) - if mode == 'train': - return {'train_op': train_op, 'loss': loss} - return {'predictions': logits, 'targets': features['labels'], 'weights': weights} - else: - if isotonic_fn is None: - isotonic_spec = twml.util.create_module_spec(mlp_fn=isotonic_module, mode=mode, params=params) - else: - isotonic_spec = twml.util.create_module_spec(mlp_fn=isotonic_fn, mode=mode, params=params) - output_hub = hub.Module(isotonic_spec, - name=params.calibrator_export_module_name) - hub.register_module_for_export(output_hub, params.calibrator_export_module_name) - output = output_hub(inputs, signature=params.calibrator_export_module_name) - output = tf.clip_by_value(output, 0, 1) - loss = tf.reduce_sum(tf.stop_gradient(output)) - train_op = tf.assign_add(tf.train.get_global_step(), 1) - return {'train_op': train_op, 'loss': loss, 'output': output} - - -def build_isotonic_graph(features, label, mode, params, config=None, export_discretizer=True): - """ - Pre-set Isotonic Calibrator Build Graph - Follows the same signature as build_graph - This assumes that MLP already contains all modules (include percentile - discretizer); if export_discretizer is set - then it does not export the MDL phase. - """ - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - if export_discretizer: - return build_isotonic_graph_from_inputs(sparse_tf, features, label, mode, params, config) - discretizer = hub.Module(params.discretizer_path) - - if params.discretizer_signature is None: - discretizer_signature = "percentile_discretizer_calibrator" - else: - discretizer_signature = params.discretizer_signature - input_sparse = discretizer(sparse_tf, signature=discretizer_signature) - return build_isotonic_graph_from_inputs(input_sparse, features, label, mode, params, config) diff --git a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py b/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py deleted file mode 100644 index e14f62303..000000000 --- a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py +++ /dev/null @@ -1,22 +0,0 @@ -# pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains HashedPercentileDiscretizerCalibrator used for calibration ''' -from .percentile_discretizer import PercentileDiscretizerCalibrator - -import twml - - -class HashedPercentileDiscretizerCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for HashedPercentileDiscretizer calibration. 
- This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's - `to_layer` method returns a HashedPercentileDiscretizer instead. - ''' - - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - return twml.contrib.layers.HashedPercentileDiscretizer( - n_feature=n_feature, n_bin=self._n_bin, - name=name, out_bits=self._out_bits, - hash_keys=hash_map_keys, hash_values=hash_map_values, - bin_ids=self._bin_ids.flatten(), bin_values=self._bin_vals.flatten(), - feature_offsets=feature_offsets - ) diff --git a/twml/twml/contrib/calibrators/hashing_discretizer.py b/twml/twml/contrib/calibrators/hashing_discretizer.py deleted file mode 100644 index 965ced934..000000000 --- a/twml/twml/contrib/calibrators/hashing_discretizer.py +++ /dev/null @@ -1,35 +0,0 @@ -# pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains HashedPercentileDiscretizerCalibrator used for calibration ''' -from .percentile_discretizer import PercentileDiscretizerCalibrator - -import numpy as np -import twml - - -class HashingDiscretizerCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for HashingDiscretizer calibration. - This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's - `to_layer` method returns a HashingDiscretizer instead. - ''' - - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - # Need to sort hash_map_keys according to hash_map_values - # just in case they're not in order of being put in the dict - # hash_map_values is already 0 through len(hash_map_values)-1 - hash_map_keys = hash_map_keys.flatten() - # why is this float32 in PercentileDiscretizerCalibrator.to_layer ???? - # need int for indexing - hash_map_values = hash_map_values.flatten().astype(np.int32) - feature_ids = np.zeros((len(hash_map_keys),), dtype=np.int64) - for idx in range(len(hash_map_keys)): - feature_ids[hash_map_values[idx]] = hash_map_keys[idx] - - return twml.contrib.layers.HashingDiscretizer( - feature_ids=feature_ids, - bin_vals=self._bin_vals.flatten(), - n_bin=self._n_bin + 1, # (self._n_bin + 1) bin_vals for each feature_id - out_bits=self._out_bits, - cost_per_unit=500, - name=name - ) diff --git a/twml/twml/contrib/calibrators/isotonic.py b/twml/twml/contrib/calibrators/isotonic.py deleted file mode 100644 index d03a75ff8..000000000 --- a/twml/twml/contrib/calibrators/isotonic.py +++ /dev/null @@ -1,317 +0,0 @@ -# pylint: disable=arguments-differ, unused-argument -''' Contains Isotonic Calibration''' - -from .calibrator import CalibrationFeature, Calibrator - -from absl import logging -import numpy as np -from sklearn.isotonic import isotonic_regression -import tensorflow.compat.v1 as tf -import tensorflow_hub as hub -import twml -import twml.layers - - -DEFAULT_SAMPLE_WEIGHT = 1 - - -def sort_values(inputs, target, weight, ascending=True): - ''' - Sorts arrays based on the first array. - - Arguments: - inputs: - 1D array which will dictate the order which the remainder 2 arrays will be sorted - target: - 1D array - weight: - 1D array - ascending: - Boolean. If set to True (the default), sorts values in ascending order. 
- - Returns: - sorted inputs: - 1D array sorted by the order of `ascending` - sorted targets: - 1D array - sorted weight: - 1D array - ''' - # assert that the length of inputs and target are the same - if len(inputs) != len(target): - raise ValueError('Expecting inputs and target sizes to match') - # assert that the length of inputs and weight are the same - if len(inputs) != len(weight): - raise ValueError('Expecting inputs and weight sizes to match') - inds = inputs.argsort() - if not ascending: - inds = inds[::-1] - return inputs[inds], target[inds], weight[inds] - - -class IsotonicFeature(CalibrationFeature): - ''' - IsotonicFeature adds values, weights and targets to each feature and then runs - isotonic regression by calling `sklearn.isotonic.isotonic_regression - `_ - ''' - - def _get_bin_boundaries(self, n_samples, bins, similar_bins): - """ - Calculates the sample indices that define bin boundaries - - Arguments: - n_samples: - (int) number of samples - bins: - (int) number of bins. Needs to be smaller or equal than n_samples. - similar_bins: - (bool) If True, samples will be distributed in bins of equal size (up to one sample). - If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples. - Note that equal_bins=False can create a last bins with a very large number of samples. - - Returns: - (list[int]) List of sample indices defining bin boundaries - """ - - if bins > n_samples: - raise ValueError( - "The number of bins needs to be less than or equal to the number of samples. " - "Currently bins={0} and n_samples={1}.".format(bins, n_samples) - ) - - step = n_samples // bins - - if similar_bins: - # dtype=int will floor the linspace - bin_boundaries = np.linspace(0, n_samples - step, num=bins, dtype=int) - else: - bin_boundaries = range(0, step * bins, step) - - bin_boundaries = np.append(bin_boundaries, n_samples) - - return bin_boundaries - - def calibrate(self, bins, similar_bins=False, debug=False): - '''Calibrates the IsotonicFeature into calibrated weights and bias. - - 1. Sorts the values of the feature class, based on the order of values - 2. Performs isotonic regression using sklearn.isotonic.isotonic_regression - 3. Performs the binning of the samples, in order to obtain the final weight and bias - which will be used for inference - - Note that this method can only be called once. - - Arguments: - bins: - number of bins. - similar_bins: - If True, samples will be distributed in bins of equal size (up to one sample). - If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples. - Note that equal_bins=False can create a last bins with a very large number of samples. - debug: - Defaults to False. If debug is set to true, output other parameters useful for debugging. 
- - Returns: - [calibrated weight, calibrated bias] - ''' - if self._calibrated: - raise RuntimeError("Can only calibrate once") - # parse through the dict to obtain the targets, weights and values - self._concat_arrays() - feature_targets = self._features_dict['targets'] - feature_values = self._features_dict['values'] - feature_weights = self._features_dict['weights'] - srtd_feature_values, srtd_feature_targets, srtd_feature_weights = sort_values( - inputs=feature_values, - target=feature_targets, - weight=feature_weights - ) - calibrated_feature_values = isotonic_regression( - srtd_feature_targets, sample_weight=srtd_feature_weights) - # create the final outputs for the prediction of each class - bpreds = [] - btargets = [] - bweights = [] - rpreds = [] - - # Create bin boundaries - bin_boundaries = self._get_bin_boundaries( - len(calibrated_feature_values), bins, similar_bins=similar_bins) - - for sidx, eidx in zip(bin_boundaries, bin_boundaries[1:]): - # separate each one of the arrays based on their respective bins - lpreds = srtd_feature_values[int(sidx):int(eidx)] - lrpreds = calibrated_feature_values[int(sidx):int(eidx)] - ltargets = srtd_feature_targets[int(sidx):int(eidx)] - lweights = srtd_feature_weights[int(sidx):int(eidx)] - - # calculate the outputs (including the bpreds and rpreds) - bpreds.append(np.sum(lpreds * lweights) / (np.squeeze(np.sum(lweights)))) - rpreds.append(np.sum(lrpreds * lweights) / (np.squeeze(np.sum(lweights)))) - btargets.append(np.sum(ltargets * lweights) / (np.squeeze(np.sum(lweights)))) - bweights.append(np.squeeze(np.sum(lweights))) - # transposing the bpreds and rpreds which will be used as input to the inference step - bpreds = np.asarray(bpreds).T - rpreds = np.asarray(rpreds).T - btargets = np.asarray(btargets).T - bweights = np.asarray(bweights).T - # setting _calibrated to be True which is necessary in order to prevent it to re-calibrate - self._calibrated = True - if debug: - return bpreds, rpreds, btargets, bweights - return bpreds, rpreds - - -class IsotonicCalibrator(Calibrator): - ''' Accumulates features and their respective values for isotonic calibration. - Internally, each feature's values is accumulated via its own isotonicFeature object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into Isotonic ``bpreds``, ``rpreds`` by calling ``calibrate()``; and - 3. convert to a ``twml.layers.Isotonic`` layer by calling ``to_layer()``. - - ''' - - def __init__(self, n_bin, similar_bins=False, **kwargs): - ''' Constructs an isotonicCalibrator instance. - - Arguments: - n_bin: - the number of bins per feature to use for isotonic. - Note that each feature actually maps to ``n_bin+1`` output IDs. - ''' - super(IsotonicCalibrator, self).__init__(**kwargs) - self._n_bin = n_bin - self._similar_bins = similar_bins - self._ys_input = [] - self._xs_input = [] - self._isotonic_feature_dict = {} - - def accumulate_feature(self, output): - ''' - Wrapper around accumulate for trainer API. - Arguments: - output: output of prediction of build_graph for calibrator - ''' - weights = output['weights'] if 'weights' in output else None - return self.accumulate(output['predictions'], output['targets'], weights) - - def accumulate(self, predictions, targets, weights=None): - ''' - Accumulate a single batch of class predictions, class targets and class weights. - These are accumulated until calibrate() is called. 
- - Arguments: - predictions: - float matrix of class values. Each dimension corresponds to a different class. - Shape is ``[n, d]``, where d is the number of classes. - targets: - float matrix of class targets. Each dimension corresponds to a different class. - Shape ``[n, d]``, where d is the number of classes. - weights: - Defaults to weights of 1. - 1D array containing the weights of each prediction. - ''' - if predictions.shape != targets.shape: - raise ValueError( - 'Expecting predictions.shape == targets.shape, got %s and %s instead' % - (str(predictions.shape), str(targets.shape))) - if weights is not None: - if weights.ndim != 1: - raise ValueError('Expecting 1D weight, got %dD instead' % weights.ndim) - elif weights.size != predictions.shape[0]: - raise ValueError( - 'Expecting predictions.shape[0] == weights.size, got %d != %d instead' % - (predictions.shape[0], weights.size)) - # iterate through the rows of predictions and sets one class to each row - if weights is None: - weights = np.full(predictions.shape[0], fill_value=DEFAULT_SAMPLE_WEIGHT) - for class_key in range(predictions.shape[1]): - # gets the predictions and targets for that class - class_predictions = predictions[:, class_key] - class_targets = targets[:, class_key] - if class_key not in self._isotonic_feature_dict: - isotonic_feature = IsotonicFeature(class_key) - self._isotonic_feature_dict[class_key] = isotonic_feature - else: - isotonic_feature = self._isotonic_feature_dict[class_key] - isotonic_feature.add_values({'values': class_predictions, 'weights': weights, - 'targets': class_targets}) - - def calibrate(self, debug=False): - ''' - Calibrates each IsotonicFeature after accumulation is complete. - Results are stored in ``self._ys_input`` and ``self._xs_input`` - - Arguments: - debug: - Defaults to False. If set to true, returns the ``xs_input`` and ``ys_input``. - ''' - super(IsotonicCalibrator, self).calibrate() - bias_temp = [] - weight_temp = [] - logging.info("Beginning isotonic calibration.") - isotonic_features_dict = self._isotonic_feature_dict - for class_id in isotonic_features_dict: - bpreds, rpreds = isotonic_features_dict[class_id].calibrate(bins=self._n_bin, similar_bins=self._similar_bins) - weight_temp.append(bpreds) - bias_temp.append(rpreds) - # save isotonic results onto a matrix - self._xs_input = np.array(weight_temp, dtype=np.float32) - self._ys_input = np.array(bias_temp, dtype=np.float32) - logging.info("Isotonic calibration finished.") - if debug: - return np.array(weight_temp), np.array(bias_temp) - return None - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory. Default (string): "default". - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - logging.info("You probably do not need to save the isotonic layer. \ - So feel free to set save to False in the Trainer. 
\ - Additionally this only saves the layer not the whole graph.") - - def calibrator_module(): - ''' - Way to save Isotonic layer - ''' - # The input to isotonic is a dense layer - inputs = tf.placeholder(tf.float32) - calibrator_layer = self.to_layer() - output = calibrator_layer(inputs) - # creates the signature to the calibrator module - hub.add_signature(inputs=inputs, outputs=output, name=name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - def to_layer(self): - """ Returns a twml.layers.Isotonic Layer that can be used for feature discretization. - """ - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - isotonic_layer = twml.layers.Isotonic( - n_unit=self._xs_input.shape[0], n_bin=self._xs_input.shape[1], - xs_input=self._xs_input, ys_input=self._ys_input, - **self._kwargs) - - return isotonic_layer - - def get_layer_args(self, name=None): - """ Returns layer args. See ``Calibrator.get_layer_args`` for more detailed documentation """ - return {'n_unit': self._xs_input.shape[0], 'n_bin': self._xs_input.shape[1]} diff --git a/twml/twml/contrib/calibrators/mdl.py b/twml/twml/contrib/calibrators/mdl.py deleted file mode 100644 index 0fe3265a4..000000000 --- a/twml/twml/contrib/calibrators/mdl.py +++ /dev/null @@ -1,118 +0,0 @@ -# pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains MDLFeature and MDLCalibrator used for MDL calibration ''' - - -import os - -from .percentile_discretizer import PercentileDiscretizerCalibrator, PercentileDiscretizerFeature - -from absl import logging -import numpy as np -import tensorflow.compat.v1 as tf -import twml -import twml.layers - - -DEFAULT_SAMPLE_WEIGHT = 1 - - -class MDLFeature(PercentileDiscretizerFeature): - ''' Accumulates and calibrates a single sparse MDL feature. ''' - - -class MDLCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for MDL calibration. - Internally, each feature's values is accumulated via its own ``MDLFeature`` object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into MDL bin_vals by calling ``calibrate()``; and - 3. convert to a twml.layers.MDL layer by calling ``to_layer()``. - - ''' - - def to_layer(self, name=None): - """ - Returns a twml.layers.PercentileDiscretizer Layer - that can be used for feature discretization. - - Arguments: - name: - name-scope of the PercentileDiscretizer layer - """ - n_feature = len(self._discretizer_feature_dict) - max_discretizer_feature = n_feature * (self._n_bin + 1) - - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - if self._bin_ids.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_ids.shape[0] \ - != len(self._discretizer_feature_dict)") - if self._bin_vals.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_vals.shape[0] \ - != len(self._discretizer_feature_dict)") - - # can add at most #features * (n_bin+1) new feature ids - if 2**self._out_bits <= max_discretizer_feature: - raise ValueError("""Maximum number of features created by discretizer is - %d but requested that the output be limited to %d values (%d bits), - which is smaller than that. 
Please ensure the output has enough bits - to represent at least the new features""" - % (max_discretizer_feature, 2**self._out_bits, self._out_bits)) - - # build feature_offsets, hash_map_keys, hash_map_values - feature_offsets = np.arange(0, max_discretizer_feature, - self._n_bin + 1, dtype='int64') - hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) - hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) - - discretizer = twml.layers.MDL( - n_feature=n_feature, n_bin=self._n_bin, - name=name, out_bits=self._out_bits, - hash_keys=hash_map_keys, hash_values=hash_map_values, - bin_ids=self._bin_ids.flatten(), bin_values=self._bin_vals.flatten(), - feature_offsets=feature_offsets, - **self._kwargs - ) - - return discretizer - - def save(self, save_dir, name='calibrator', verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory - name: - name for the graph scope. Passed to to_layer(name=name) to set - scope of layer. - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - layer_args = self.get_layer_args() - - calibrator_filename = os.path.join(save_dir, name + '.json.tf') - calibrator_dict = { - 'layer_args': layer_args, - 'saved_layer_scope': name + '/', - } - twml.write_file(calibrator_filename, calibrator_dict, encode='json') - - if verbose: - logging.info("The layer graph and other information necessary ") - logging.info("for multi-phase training is saved in directory:") - logging.info(save_dir) - logging.info("This directory can be specified as --init_from_dir argument.") - logging.info("") - logging.info("Other information is available in: %s.json.tf", name) - logging.info("This file can be loaded with twml.read_file(decode='json) to obtain ") - logging.info("layer_args, saved_layer_scope and variable_names") - - graph = tf.Graph() - # save graph for tensorboard as well - writer = tf.summary.FileWriter(logdir=save_dir, graph=graph) - - with tf.Session(graph=graph) as sess: - self.write_summary(writer, sess) - writer.flush() diff --git a/twml/twml/contrib/calibrators/percentile_discretizer.py b/twml/twml/contrib/calibrators/percentile_discretizer.py deleted file mode 100644 index eefce62c2..000000000 --- a/twml/twml/contrib/calibrators/percentile_discretizer.py +++ /dev/null @@ -1,577 +0,0 @@ -# pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains PercentileDiscretizerFeature and PercentileDiscretizerCalibrator used \ - for PercentileDiscretizer calibration ''' - - - -from .calibrator import CalibrationFeature, Calibrator - -import os -import numpy as np -import tensorflow.compat.v1 as tf -import tensorflow_hub as hub -import twml -import twml.layers - - -DEFAULT_SAMPLE_WEIGHT = 1 - - -class PercentileDiscretizerFeature(CalibrationFeature): - ''' Accumulates and calibrates a single sparse PercentileDiscretizer feature. ''' - - @staticmethod - def _gather_debug_info(values, indices, bin_vals, bin_counts_buffer): - ''' - Determine how many training values fell into a given bin during calibration. - This is calculated by finding the index of the first appearance of each bin - boundary in values (values may repeat, so that isn't trivially in indices.) - Subtracting each bin boundary index from the next tells you how many values fall in - that bin. - To get this to calculate the last bin correctly, len(values) is appended to the - list of bound indices. 
- - This assumes that ``bin_vals`` excludes np.inf bin boundaries when - PercentileDiscretizer was calibrated - with fewer values than bins. - - Arguments: - values: - 1D ndarray of the PercentileDiscretizerFeature's accumulated values, sorted ascending - indices: - 1D int32 ndarray of the indices (in values) of the bin boundaries - bin_vals: - 1D ndarray containing the bin boundaries - bin_counts_buffer: - ndarray buffer for returning the PercentileDiscretizer histogram - ''' - # np.flatnonzero(np.diff(x)) gives you the indices i in x s.t. x[i] != x[i+1] - # append index of the last bin since that cannot be empty with how - # PercentileDiscretizer is implemented - nonempty_bins = np.append(np.flatnonzero(np.diff(bin_vals)), len(bin_vals) - 1) - bin_start_indices = indices.take(nonempty_bins) - - # if multiples of a bin's lower bound value exist, find the first one - for (i, idx) in enumerate(bin_start_indices): - cur_idx = idx - while cur_idx > 0 and values[cur_idx] == values[cur_idx - 1]: - bin_start_indices[i] = cur_idx = cur_idx - 1 - - # the end of each bin is the start of the next bin, - # until the last, which is the end of the array - # broadcast the counts to the nonempty bins, 0 otherwise - bin_counts_buffer[:] = 0 - bin_counts_buffer[nonempty_bins] = np.diff(np.append(bin_start_indices, values.size)) - - def calibrate( - self, - bin_vals, percentiles, percentile_indices, - bin_counts_buffer=None): - '''Calibrates the PercentileDiscretizerFeature into bin values for - use in PercentileDiscretizerCalibrator. - Note that this method can only be called once. - - Arguments: - bin_vals: - Row in the PercentileDiscretizerCalibrator.bin_vals matrix corresponding to this feature. - Will be updated with the results of the calibration. - A 1D ndarray. - percentiles: - 1D array of size n_bin with values ranging from 0 to 1. - For example, ``percentiles = np.linspace(0, 1, num=self._n_bin+1, dtype=np.float32)`` - percentile_indices: - Empty 1D array of size n_bin used to store intermediate results when - calling twml.twml_optim_nearest_interpolation(). - For example, np.empty(self._n_bin + 1, dtype=np.float32). - bin_counts_buffer: - optional ndarray buffer used for retaining count of values per PercentileDiscretizer - bucket (for debug and feature exploration purposes) - - Returns: - calibrated bin_vals for use by ``PercentileDiscretizerCalibrator`` - ''' - if self._calibrated: - raise RuntimeError("Can only calibrate once") - if bin_vals.ndim != 1: - raise RuntimeError("Expecting bin_vals row") - - # # concatenate values and weights buffers - self._concat_arrays() - feature_values = self._features_dict['values'] - feature_weights = self._features_dict['weights'] - - # get features ready for the bins, order array indices by feature values. 
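    # The remainder of calibrate() turns the accumulated (value, weight) pairs
    # into weighted-percentile bin boundaries:
    #   * sort the values ascending and reorder the weights to match;
    #   * cumulative-sum the weights and normalize to [0, 1], giving an
    #     empirical weighted CDF over the sorted values;
    #   * if there are fewer values than bins, use the values themselves as
    #     boundaries (padding the remaining bins with +inf);
    #   * otherwise, nearest-interpolate the requested percentiles against the
    #     CDF (via twml.CLIB.twml_optim_nearest_interpolation) to find the
    #     sample indices whose values become the bin boundaries.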
- indices = np.argsort(feature_values) - - # get ordered values and weights using array indices - values = feature_values.take(indices) - weights = feature_weights.take(indices) - - # Normalizes the sum of weights to be between 0 and 1 - weights = np.cumsum(weights, out=feature_weights) - weights -= weights[0] - if weights[-1] > 0: # prevent zero-division - weights /= weights[-1] - - # Check if we have less values than bin_vals - if values.size < bin_vals.size: - # Fills all the bins with a value that won't ever be reached - bin_vals.fill(np.inf) - # Forces the first to be -inf - bin_vals[0] = -np.inf - # Copies the values as boundaries - bin_vals[1:values.size + 1] = values - - if bin_counts_buffer is not None: - # slice out bins with +/-np.inf boundary -- their count will be zero anyway - # we can't just assume all other bins will have 1 value since there can be dups - short_indices = np.arange(values.size, dtype=np.int32) - bin_counts_buffer.fill(0) - self._gather_debug_info( - values, short_indices, bin_vals[1:values.size + 1], - bin_counts_buffer[1:values.size + 1]) - - else: - # Gets the indices for the values that define the boundary for the bins - indices_float = np.arange(0, weights.size, dtype=np.float32) - - # Gets things in the correct shape for the linear interpolation - weights = weights.reshape(1, weights.size) - indices_float = indices_float.reshape(1, weights.size) - - # wrap ndarrays into twml.Array - percentiles_tarray = twml.Array(percentiles.reshape(percentiles.size, 1)) - weights_tarray = twml.Array(weights) - indices_float_tarray = twml.Array(indices_float) - percentile_indices_tarray = twml.Array(percentile_indices.reshape(percentiles.size, 1)) - - # Performs the binary search to find the indices corresponding to the percentiles - err = twml.CLIB.twml_optim_nearest_interpolation( - percentile_indices_tarray.handle, percentiles_tarray.handle, # output, input - weights_tarray.handle, indices_float_tarray.handle # xs, ys - ) - if err != 1000: - raise ValueError("""twml.CLIB.twml_optim_nearest_interpolation - caught an error (see previous stdout). Error code: """ % err) - - indices = indices[:bin_vals.size] - indices[:] = percentile_indices - indices[0] = 0 - indices[-1] = weights.size - 1 - - # Gets the values at those indices and copies them into bin_vals - values.take(indices, out=bin_vals) - - # get # of values per bucket - if bin_counts_buffer is not None: - self._gather_debug_info(values, indices, bin_vals, bin_counts_buffer) - - self._calibrated = True - - -class PercentileDiscretizerCalibrator(Calibrator): - ''' Accumulates features and their respective values for PercentileDiscretizer calibration. - Internally, each feature's values is accumulated via its own - ``PercentileDiscretizerFeature`` object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into PercentileDiscretizer bin_vals by calling ``calibrate()``; and - 3. convert to a twml.layers.PercentileDiscretizer layer by calling ``to_layer()``. - - ''' - - def __init__(self, n_bin, out_bits, bin_histogram=True, - allow_empty_calibration=False, **kwargs): - ''' Constructs an PercentileDiscretizerCalibrator instance. - - Arguments: - n_bin: - the number of bins per feature to use for PercentileDiscretizer. - Note that each feature actually maps to n_bin+1 output IDs. - out_bits: - The maximum number of bits to use for the output IDs. - 2**out_bits must be greater than bin_ids.size or an error is raised. 
- bin_histogram: - When True (the default), gathers information during calibration - to build a bin_histogram. - allow_empty_calibration: - allows operation where we might not calibrate any features. - Default False to error out if no features were calibrated. - Typically, values of uncalibrated features pass through discretizers - untouched (though the feature ids will be truncated to obey out_bits). - ''' - super(PercentileDiscretizerCalibrator, self).__init__(**kwargs) - self._n_bin = n_bin - self._out_bits = out_bits - - self._bin_ids = None - self._bin_vals = np.empty(0, dtype=np.float32) # Note changed from 64 (v1) to 32 (v2) - - self._bin_histogram = bin_histogram - self._bin_histogram_dict = None - - self._hash_map_counter = 0 - self._hash_map = {} - - self._discretizer_feature_dict = {} - self._allow_empty_calibration = allow_empty_calibration - - @property - def bin_ids(self): - ''' - Gets bin_ids - ''' - return self._bin_ids - - @property - def bin_vals(self): - ''' - Gets bin_vals - ''' - return self._bin_vals - - @property - def hash_map(self): - ''' - Gets hash_map - ''' - return self._hash_map - - @property - def discretizer_feature_dict(self): - ''' - Gets feature_dict - ''' - return self._discretizer_feature_dict - - def accumulate_features(self, inputs, name): - ''' - Wrapper around accumulate for PercentileDiscretizer. - Arguments: - inputs: - batch that will be accumulated - name: - name of the tensor that will be accumulated - - ''' - sparse_tf = inputs[name] - indices = sparse_tf.indices[:, 1] - ids = sparse_tf.indices[:, 0] - weights = np.take(inputs["weights"], ids) - return self.accumulate(indices, sparse_tf.values, weights) - - def accumulate_feature(self, output): - ''' - Wrapper around accumulate for trainer API. - Arguments: - output: - output of prediction of build_graph for calibrator - ''' - return self.accumulate(output['feature_ids'], output['feature_values'], output['weights']) - - def accumulate(self, feature_keys, feature_vals, weights=None): - '''Accumulate a single batch of feature keys, values and weights. - - These are accumulate until ``calibrate()`` is called. - - Arguments: - feature_keys: - 1D int64 array of feature keys. - feature_vals: - 1D float array of feature values. Each element of this array - maps to the commensurate element in ``feature_keys``. - weights: - Defaults to weights of 1. - 1D array containing the weights of each feature key, value pair. - Typically, this is the weight of each sample (but you still need - to provide one weight per key,value pair). - Each element of this array maps to the commensurate element in feature_keys. 
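As an illustrative sketch of the expected shapes (the arrays below are made up; `PercentileDiscretizerCalibrator` is the class defined in this file and is assumed to be importable from `twml.contrib.calibrators`):

```python
import numpy as np
import twml

calib = twml.contrib.calibrators.PercentileDiscretizerCalibrator(n_bin=4, out_bits=22)

# Three (key, value) pairs spanning two features (ids 11 and 42); one weight per pair.
feature_keys = np.array([11, 42, 11], dtype=np.int64)
feature_vals = np.array([0.3, 1.7, 0.9], dtype=np.float32)
weights = np.array([1.0, 1.0, 2.0], dtype=np.float32)

calib.accumulate(feature_keys, feature_vals, weights)
```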
- ''' - if feature_keys.ndim != 1: - raise ValueError('Expecting 1D feature_keys, got %dD' % feature_keys.ndim) - if feature_vals.ndim != 1: - raise ValueError('Expecting 1D feature_values, got %dD' % feature_vals.ndim) - if feature_vals.size != feature_keys.size: - raise ValueError( - 'Expecting feature_keys.size == feature_values.size, got %d != %d' % - (feature_keys.size, feature_vals.size)) - if weights is not None: - weights = np.squeeze(weights) - if weights.ndim != 1: - raise ValueError('Expecting 1D weights, got %dD' % weights.ndim) - elif weights.size != feature_keys.size: - raise ValueError( - 'Expecting feature_keys.size == weights.size, got %d != %d' % - (feature_keys.size, weights.size)) - if weights is None: - weights = np.full(feature_vals.size, fill_value=DEFAULT_SAMPLE_WEIGHT) - unique_keys = np.unique(feature_keys) - for feature_id in unique_keys: - idx = np.where(feature_keys == feature_id) - if feature_id not in self._discretizer_feature_dict: - self._hash_map[feature_id] = self._hash_map_counter - # unlike v1, the hash_map_counter is incremented AFTER assignment. - # This makes the hash_map features zero-indexed: 0, 1, 2 instead of 1, 2, 3 - self._hash_map_counter += 1 - # creates a new cache if we never saw the feature before - discretizer_feature = PercentileDiscretizerFeature(feature_id) - self._discretizer_feature_dict[feature_id] = discretizer_feature - else: - discretizer_feature = self._discretizer_feature_dict[feature_id] - discretizer_feature.add_values({'values': feature_vals[idx], 'weights': weights[idx]}) - - def calibrate(self, debug=False): - ''' - Calibrates each PercentileDiscretizer feature after accumulation is complete. - - Arguments: - debug: - Boolean to request debug info be returned by the method. - (see Returns section below) - - The calibration results are stored in two matrices: - bin_ids: - 2D array of size number of accumulate ``features x n_bin+1``. - Contains the new IDs generated by PercentileDiscretizer. Each row maps to a feature. - Each row maps to different value bins. The IDs - are in the range ``1 -> bin_ids.size+1`` - bin_vals: - 2D array of the same size as bin_ids. - Each row maps to a feature. Each row contains the bin boundaries. - These boundaries represent feature values. - - Returns: - if debug is True, the method returns - - - 1D int64 array of feature_ids - - 2D float32 array copy of bin_vals (the bin boundaries) for each feature - - 2D int64 array of bin counts corresponding to the bin boundaries - - ''' - n_feature = len(self._discretizer_feature_dict) - if n_feature == 0 and not self._allow_empty_calibration: - raise RuntimeError("Need to accumulate some features for calibration\n" - "Likely, the calibration data is empty. 
This can\n" - "happen if the dataset is small, or if the following\n" - "cli args are set too low:\n" - " --discretizer_keep_rate (default=0.0008)\n" - " --discretizer_parts_downsampling_rate (default=0.2)\n" - "Consider increasing the values of these args.\n" - "To allow empty calibration data (and degenerate discretizer),\n" - "use the allow_empty_calibration input of the constructor.") - - self._bin_ids = np.arange(1, n_feature * (self._n_bin + 1) + 1) - self._bin_ids = self._bin_ids.reshape(n_feature, self._n_bin + 1) - - self._bin_vals.resize(n_feature, self._n_bin + 1) - - # buffers shared by PercentileDiscretizerFeature.calibrate() - percentile_indices = np.empty(self._n_bin + 1, dtype=np.float32) - - # Tensor from 0 to 1 in the number of steps provided - percentiles = np.linspace(0, 1, num=self._n_bin + 1, dtype=np.float32) - - if debug or self._bin_histogram: - debug_feature_ids = np.empty(n_feature, dtype=np.int64) - bin_counts = np.empty((n_feature, self._n_bin + 1), dtype=np.int64) - - # progress bar for calibration phase - progress_bar = tf.keras.utils.Progbar(n_feature) - - discretizer_features_dict = self._discretizer_feature_dict - for i, feature_id in enumerate(discretizer_features_dict): - if debug or self._bin_histogram: - debug_feature_ids[self._hash_map[feature_id]] = feature_id - bin_counts_buffer = bin_counts[self._hash_map[feature_id]] - else: - bin_counts_buffer = None - - # calibrate each PercentileDiscretizer feature (puts results in bin_vals) - discretizer_features_dict[feature_id].calibrate( - self._bin_vals[self._hash_map[feature_id]], # Gets feature-values - percentiles, percentile_indices, - bin_counts_buffer=bin_counts_buffer - ) - - # update progress bar 20 times - if (i % max(1.0, round(n_feature / 20)) == 0) or (i == n_feature - 1): - progress_bar.update(i + 1) - - super(PercentileDiscretizerCalibrator, self).calibrate() - - if self._bin_histogram: - # save bin histogram data for later - self._bin_histogram_dict = { - 'feature_ids': debug_feature_ids, - 'bin_counts': bin_counts, - 'bin_vals': self._bin_vals, - 'out_bits': self._out_bits, - } - - if debug: - return debug_feature_ids, self._bin_vals.copy(), bin_counts - - return None - - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - return twml.layers.PercentileDiscretizer( - n_feature=n_feature, - n_bin=self._n_bin, - out_bits=self._out_bits, - bin_values=self._bin_vals.flatten(), - hash_keys=hash_map_keys, - hash_values=hash_map_values.astype(np.int64), - bin_ids=self._bin_ids.flatten().astype(np.int64), - feature_offsets=feature_offsets, - name=name, - **self._kwargs - ) - - def to_layer(self, name=None): - """ - Returns a twml.layers.PercentileDiscretizer Layer - that can be used for feature discretization. 
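Putting the pieces together, the calibration flow sketched in the class docstring is roughly the following (a hedged sketch; `batches` stands in for whatever iterable of parsed key/value/weight arrays the caller already has):

```python
import twml

calib = twml.contrib.calibrators.PercentileDiscretizerCalibrator(n_bin=10, out_bits=22)

# 1. accumulate (feature_keys, feature_vals, weights) triples from each batch
for keys, vals, weights in batches:
    calib.accumulate(keys, vals, weights)

# 2. compute the percentile bin boundaries for every accumulated feature
calib.calibrate()

# 3. convert the result into a layer usable inside build_graph
discretizer = calib.to_layer(name="percentile_discretizer")
```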
-
-    Arguments:
-      name:
-        name-scope of the PercentileDiscretizer layer
-    """
-    n_feature = len(self._discretizer_feature_dict)
-    max_discretizer_feature = n_feature * (self._n_bin + 1)
-
-    if not self._calibrated:
-      raise RuntimeError("Expecting prior call to calibrate()")
-
-    if self._bin_ids.shape[0] != n_feature:
-      raise RuntimeError("Expecting self._bin_ids.shape[0] \
-        == len(self._discretizer_feature_dict)")
-    if self._bin_vals.shape[0] != n_feature:
-      raise RuntimeError("Expecting self._bin_vals.shape[0] \
-        == len(self._discretizer_feature_dict)")
-
-    # can add at most #features * (n_bin+1) new feature ids
-    if 2**self._out_bits <= max_discretizer_feature:
-      raise ValueError("""Maximum number of features created by discretizer is
-        %d but requested that the output be limited to %d values (%d bits),
-        which is smaller than that. Please ensure the output has enough bits
-        to represent at least the new features"""
-        % (max_discretizer_feature, 2**self._out_bits, self._out_bits))
-
-    # build feature_offsets, hash_map_keys, hash_map_values
-    feature_offsets = np.arange(0, max_discretizer_feature,
-                                self._n_bin + 1, dtype='int64')
-    hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64)
-    hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32)
-
-    discretizer = self._create_discretizer_layer(n_feature, hash_map_keys,
-                                                 hash_map_values, feature_offsets, name)
-
-    return discretizer
-
-  def get_layer_args(self):
-    '''
-    Returns layer arguments required to implement multi-phase training.
-    See twml.calibrator.Calibrator.get_layer_args for more detailed documentation.
-    '''
-    layer_args = {
-      'n_feature': len(self._discretizer_feature_dict),
-      'n_bin': self._n_bin,
-      'out_bits': self._out_bits,
-    }
-
-    return layer_args
-
-  def add_hub_signatures(self, name):
-    """
-    Add Hub Signatures for each calibrator
-
-    Arguments:
-      name:
-        Calibrator name
-    """
-    sparse_tf = tf.sparse_placeholder(tf.float32)
-    calibrator_layer = self.to_layer()
-    hub.add_signature(
-      inputs=sparse_tf,
-      outputs=calibrator_layer(sparse_tf, keep_inputs=False),
-      name=name)
-
-  def write_summary(self, writer, sess=None):
-    """
-    This method is called by save() to write a histogram of
-    PercentileDiscretizer feature bins to disk. A histogram is included for each
-    feature.
-
-    Arguments:
-      writer:
-        tf.summary.FileWriter instance.
-        used to add summaries to event files for inclusion in tensorboard.
-      sess:
-        tf.Session instance. Used to produce summaries for the writer.
-    """
-    bin_counts_ph = tf.placeholder(tf.int64)
-    bin_counts = self._bin_histogram_dict['bin_counts']
-
-    # Record that distribution into a histogram summary
-    histo = tf.summary.histogram("discretizer_feature_bin_counts", bin_counts_ph)
-    for i in range(bin_counts.shape[0]):
-      bin_counts_summary = sess.run(histo, feed_dict={bin_counts_ph: bin_counts[i]})
-      writer.add_summary(bin_counts_summary, global_step=i)
-
-  def write_summary_json(self, save_dir, name="default"):
-    """
-    Export bin information to HDFS.
-
-    Arguments:
-      save_dir:
-        name of the saving directory.
-      name:
-        prefix of the saved hub signature. Default (string): "default".
-    """
-    # Since the size is small: (# of bins) * (# of features), we always dump the file.
- discretizer_export_bin_filename = os.path.join(save_dir, name + '_bin.json') - discretizer_export_bin_dict = { - 'feature_ids': self._bin_histogram_dict['feature_ids'].tolist(), - 'bin_boundaries': self._bin_histogram_dict['bin_vals'].tolist(), - 'output_bits': self._bin_histogram_dict['out_bits'] - } - twml.write_file(discretizer_export_bin_filename, discretizer_export_bin_dict, encode='json') - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory using TF Hub. - Arguments: - save_dir: - name of the saving directory. - name: - prefix of the saved hub signature. Default (string): "default". - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - inputs = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - # creates the signature to the calibrator module - hub.add_signature( - inputs=inputs, - outputs=calibrator_layer(inputs, keep_inputs=False), - name=name) - # and another signature for keep_inputs mode - hub.add_signature( - inputs=inputs, - outputs=calibrator_layer(inputs, keep_inputs=True), - name=name + '_keep_inputs') - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - self.write_summary_json(save_dir, name) diff --git a/twml/twml/contrib/eventbus/input_fn.py b/twml/twml/contrib/eventbus/input_fn.py deleted file mode 100644 index c184d9434..000000000 --- a/twml/twml/contrib/eventbus/input_fn.py +++ /dev/null @@ -1,59 +0,0 @@ -from reader import EventBusPipedBinaryRecordReader -import tensorflow.compat.v1 as tf -import twml - - -""" -This module provides input function for DeepBird v2 training. -The training data records are loaded from an EventBus reader. -""" - - -def get_eventbus_data_record_generator(eventbus_reader): - """ - This module provides a data record generater from EventBus reader. - - Args: - eventbus_reader: EventBus reader - - Returns: - gen: Data record generater - """ - eventbus_reader.initialize() - counter = [0] - - def gen(): - while True: - record = eventbus_reader.read() - if eventbus_reader.debug: - tf.logging.warn("counter: {}".format(counter[0])) - with open('tmp_record_{}.bin'.format(counter[0]), 'wb') as f: - f.write(record) - counter[0] = counter[0] + 1 - yield record - return gen - - -def get_eventbus_data_record_dataset(eventbus_reader, parse_fn, batch_size): - """ - This module generates batch data for training from a data record generator. - """ - dataset = tf.data.Dataset.from_generator( - get_eventbus_data_record_generator(eventbus_reader), tf.string, tf.TensorShape([])) - return dataset.batch(batch_size).map(parse_fn, num_parallel_calls=4).prefetch(buffer_size=10) - - -def get_train_input_fn(feature_config, params, parse_fn=None): - """ - This module provides input function for DeepBird v2 training. - It gets batched training data from data record generator. 
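A sketch of how this input function might be wired up; the `params` attributes mirror the fields read in the function body below, and all values are placeholders:

```python
from argparse import Namespace

params = Namespace(
    jar_file="eventbus_subscriber.jar",  # placeholder jar path
    num_eb_threads=2,
    subscriber_id="my_subscriber",
    filter_str=None,
    debug=False,
    train_batch_size=64,
)

# `feature_config` is a twml FeatureConfig; the returned callable is the input_fn.
input_fn = get_train_input_fn(feature_config, params)
```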
- """ - eventbus_reader = EventBusPipedBinaryRecordReader( - params.jar_file, params.num_eb_threads, params.subscriber_id, - filter_str=params.filter_str, debug=params.debug) - - train_parse_fn = parse_fn or twml.parsers.get_sparse_parse_fn( - feature_config, ["ids", "keys", "values", "batch_size", "weights"]) - - return lambda: get_eventbus_data_record_dataset( - eventbus_reader, train_parse_fn, params.train_batch_size) diff --git a/twml/twml/contrib/eventbus/reader.py b/twml/twml/contrib/eventbus/reader.py deleted file mode 100644 index 2f8e2749e..000000000 --- a/twml/twml/contrib/eventbus/reader.py +++ /dev/null @@ -1,119 +0,0 @@ -import io -import logging -import subprocess -from threading import Lock - -""" -This module provides a binary data record reader for EventBus data. -It starts a EventBus subscriber in a separate process to receive EventBus streaming data. -The subscriber is supposed to outputs received data through PIPE to this module. -This module parses input and output binary data record to serve as a record reader. -""" - - -class BinaryRecordReader(object): - def initialize(self): - pass - - def read(self): - """Read raw bytes for one record - """ - raise NotImplementedError - - def close(self): - pass - - -class ReadableWrapper(object): - def __init__(self, internal): - self.internal = internal - - def __getattr__(self, name): - return getattr(self.internal, name) - - def readable(self): - return True - - -class EventBusPipedBinaryRecordReader(BinaryRecordReader): - - JAVA = '/usr/lib/jvm/java-11-twitter/bin/java' - RECORD_SEPARATOR_HEX = [ - 0x29, 0xd8, 0xd5, 0x06, 0x58, 0xcd, 0x4c, 0x29, - 0xb2, 0xbc, 0x57, 0x99, 0x21, 0x71, 0xbd, 0xff - ] - RECORD_SEPARATOR = ''.join([chr(i) for i in RECORD_SEPARATOR_HEX]) - RECORD_SEPARATOR_LENGTH = len(RECORD_SEPARATOR) - CHUNK_SIZE = 8192 - - def __init__(self, jar_file, num_eb_threads, subscriber_id, - filter_str=None, buffer_size=32768, debug=False): - self.jar_file = jar_file - self.num_eb_threads = num_eb_threads - self.subscriber_id = subscriber_id - self.filter_str = filter_str if filter_str else '""' - self.buffer_size = buffer_size - self.lock = Lock() - self._pipe = None - self._buffered_reader = None - self._bytes_buffer = None - - self.debug = debug - - def initialize(self): - if not self._pipe: - self._pipe = subprocess.Popen( - [ - self.JAVA, '-jar', self.jar_file, - '-subscriberId', self.subscriber_id, - '-numThreads', str(self.num_eb_threads), - '-dataFilter', self.filter_str, - '-debug' if self.debug else '' - ], - stdout=subprocess.PIPE - ) - self._buffered_reader = io.BufferedReader( - ReadableWrapper(self._pipe.stdout), self.buffer_size) - self._bytes_buffer = io.BytesIO() - else: - logging.warning('Already initialized') - - def _find_next_record(self): - tail = [''] - while True: - chunk = tail[0] + self._buffered_reader.read(self.CHUNK_SIZE) - index = chunk.find(self.RECORD_SEPARATOR) - if index < 0: - self._bytes_buffer.write(chunk[:-self.RECORD_SEPARATOR_LENGTH]) - tail[0] = chunk[-self.RECORD_SEPARATOR_LENGTH:] - else: - self._bytes_buffer.write(chunk[:index]) - return chunk[(index + self.RECORD_SEPARATOR_LENGTH):] - - def _read(self): - with self.lock: - remaining = self._find_next_record() - record = self._bytes_buffer.getvalue() - # clean up buffer - self._bytes_buffer.close() - self._bytes_buffer = io.BytesIO() - self._bytes_buffer.write(remaining) - - return record - - def read(self): - while True: - try: - return self._read() - except Exception as e: - logging.error("Error reading bytes for next 
record: {}".format(e)) - if self.debug: - raise - - def close(self): - try: - self._bytes_buffer.close() - self._buffered_reader.close() - self._pipe.terminate() - except Exception as e: - logging.error("Error closing reader: {}".format(e)) diff --git a/twml/twml/contrib/export/__init__.py b/twml/twml/contrib/export/__init__.py deleted file mode 100644 index 99892dcfa..000000000 --- a/twml/twml/contrib/export/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from . import export_fn # noqa: F401 -from . import exporters # noqa: F401 diff --git a/twml/twml/contrib/export/export_fn.py b/twml/twml/contrib/export/export_fn.py deleted file mode 100644 index 6e59fff07..000000000 --- a/twml/twml/contrib/export/export_fn.py +++ /dev/null @@ -1,264 +0,0 @@ -""" -Functions for exporting models for different modes. -""" -from collections import OrderedDict -import os - -import tensorflow.compat.v1 as tf -from tensorflow.python.estimator.export import export -import twml -import yaml - - -def get_sparse_batch_supervised_input_receiver_fn(feature_config, keep_fields=None): - """Gets supervised_input_receiver_fn that decodes a BatchPredictionRequest as sparse tensors - with labels and weights as defined in feature_config. - This input_receiver_fn is required for exporting models with 'train' mode to be trained with - Java API - - Args: - feature_config (FeatureConfig): deepbird v2 feature config object - keep_fields (list): list of fields to keep - - Returns: - supervised_input_receiver_fn: input_receiver_fn used for train mode - """ - def supervised_input_receiver_fn(): - serialized_request = tf.placeholder(dtype=tf.uint8, name='request') - receiver_tensors = {'request': serialized_request} - - bpr = twml.contrib.readers.HashedBatchPredictionRequest(serialized_request, feature_config) - features = bpr.get_sparse_features() if keep_fields is None else bpr.get_features(keep_fields) - features['weights'] = bpr.weights - labels = bpr.labels - features, labels = bpr.apply_filter(features, labels) - - return export.SupervisedInputReceiver(features, labels, receiver_tensors) - - return supervised_input_receiver_fn - - -def update_build_graph_fn_for_train(build_graph_fn): - """Updates a build_graph_fn by inserting in graph output a serialized BatchPredictionResponse - similar to the export_output_fns for serving. - The key difference here is that - 1. We insert serialized BatchPredictionResponse in graph output with key 'prediction' instead of - creating an export_output object. This is because of the way estimators export model in 'train' - mode doesn't take custom export_output - 2. We only do it when `mode == 'train'` to avoid altering the graph when exporting - for 'infer' mode - - Args: - build_graph_fn (Callable): deepbird v2 build graph function - - Returns: - new_build_graph_fn: An updated build_graph_fn that inserts serialized BatchPredictResponse - to graph output when in 'train' mode - """ - def new_build_graph_fn(features, label, mode, params, config=None): - output = build_graph_fn(features, label, mode, params, config) - if mode == tf.estimator.ModeKeys.TRAIN: - output.update( - twml.export_output_fns.batch_prediction_continuous_output_fn(output)[ - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs - ) - return output - return new_build_graph_fn - - -def export_model_for_train_and_infer( - trainer, feature_config, keep_fields, export_dir, as_text=False): - """Function for exporting model with both 'train' and 'infer' mode. 
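To make this concrete, wrapping an existing DeepBird build_graph_fn looks roughly like this (a sketch; the toy model body and the `features["input"]` tensor are made-up stand-ins, not part of the original module):

```python
import tensorflow.compat.v1 as tf

def my_build_graph_fn(features, label, mode, params, config=None):
    # Toy model: a single dense layer over a hypothetical dense 'input' tensor.
    logits = tf.layers.dense(features["input"], 1)
    return {"output": tf.nn.sigmoid(logits)}

# In TRAIN mode the wrapper also inserts the serialized BatchPredictionResponse
# into the graph output, so the exported 'train' meta graph can be used from the Java API.
wrapped_build_graph_fn = update_build_graph_fn_for_train(my_build_graph_fn)
```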
- - This means the exported saved_model.pb will contain two meta graphs, one with tag 'train' - and the other with tag 'serve', and it can be loaded in Java API with either tag depending on - the use case - - Args: - trainer (DataRecordTrainer): deepbird v2 DataRecordTrainer - feature_config (FeatureConfig): deepbird v2 feature config - keep_fields (list of string): list of field keys, e.g. - ('ids', 'keys', 'values', 'batch_size', 'total_size', 'codes') - export_dir (str): a directory (local or hdfs) to export model to - as_text (bool): if True, write 'saved_model.pb' as binary file, else write - 'saved_model.pbtxt' as human readable text file. Default False - """ - train_input_receiver_fn = get_sparse_batch_supervised_input_receiver_fn( - feature_config, keep_fields) - predict_input_receiver_fn = twml.parsers.get_sparse_serving_input_receiver_fn( - feature_config, keep_fields) - trainer._export_output_fn = twml.export_output_fns.batch_prediction_continuous_output_fn - trainer._build_graph_fn = update_build_graph_fn_for_train(trainer._build_graph_fn) - trainer._estimator._export_all_saved_models( - export_dir_base=export_dir, - input_receiver_fn_map={ - tf.estimator.ModeKeys.TRAIN: train_input_receiver_fn, - tf.estimator.ModeKeys.PREDICT: predict_input_receiver_fn - }, - as_text=as_text, - ) - - trainer.export_model_effects(export_dir) - - -def export_all_models_with_receivers(estimator, export_dir, - train_input_receiver_fn, - eval_input_receiver_fn, - predict_input_receiver_fn, - export_output_fn, - export_modes=('train', 'eval', 'predict'), - register_model_fn=None, - feature_spec=None, - checkpoint_path=None, - log_features=True): - """ - Function for exporting a model with train, eval, and infer modes. - - Args: - estimator: - Should be of type tf.estimator.Estimator. - You can get this from trainer using trainer.estimator - export_dir: - Directory to export the model. - train_input_receiver_fn: - Input receiver for train interface. - eval_input_receiver_fn: - Input receiver for eval interface. - predict_input_receiver_fn: - Input receiver for predict interface. - export_output_fn: - export_output_fn to be used for serving. - export_modes: - A list to Specify what modes to export. Can be "train", "eval", "predict". - Defaults to ["train", "eval", "predict"] - register_model_fn: - An optional function which is called with export_dir after models are exported. - Defaults to None. - Returns: - The timestamped directory the models are exported to. - """ - # TODO: Fix for hogwild / distributed training. 
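A usage sketch for `export_model_for_train_and_infer` defined above (the export path is a placeholder; `trainer` and `feature_config` come from the caller's setup):

```python
keep_fields = ("ids", "keys", "values", "batch_size", "total_size", "codes")

export_model_for_train_and_infer(
    trainer=trainer,                # a DataRecordTrainer
    feature_config=feature_config,  # a twml FeatureConfig
    keep_fields=keep_fields,
    export_dir="hdfs:///user/placeholder/train_and_infer_export",
)
```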
- - if export_dir is None: - raise ValueError("export_dir can not be None") - export_dir = twml.util.sanitize_hdfs_path(export_dir) - input_receiver_fn_map = {} - - if "train" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.TRAIN] = train_input_receiver_fn - - if "eval" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.EVAL] = eval_input_receiver_fn - - if "predict" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.PREDICT] = predict_input_receiver_fn - - export_dir = estimator._export_all_saved_models( - export_dir_base=export_dir, - input_receiver_fn_map=input_receiver_fn_map, - checkpoint_path=checkpoint_path, - ) - - if register_model_fn is not None: - register_model_fn(export_dir, feature_spec, log_features) - - return export_dir - - -def export_all_models(trainer, - export_dir, - parse_fn, - serving_input_receiver_fn, - export_output_fn=None, - export_modes=('train', 'eval', 'predict'), - feature_spec=None, - checkpoint=None, - log_features=True): - """ - Function for exporting a model with train, eval, and infer modes. - - Args: - trainer: - An object of type twml.trainers.Trainer. - export_dir: - Directory to export the model. - parse_fn: - The parse function used parse the inputs for train and eval. - serving_input_receiver_fn: - The input receiver function used during serving. - export_output_fn: - export_output_fn to be used for serving. - export_modes: - A list to Specify what modes to export. Can be "train", "eval", "predict". - Defaults to ["train", "eval", "predict"] - feature_spec: - A dictionary obtained from FeatureConfig.get_feature_spec() to serialize - as feature_spec.yaml in export_dir. - Defaults to None - Returns: - The timestamped directory the models are exported to. - """ - # Only export from chief in hogwild or distributed modes. - if trainer.params.get('distributed', False) and not trainer.estimator.config.is_chief: - tf.logging.info("Trainer.export_model ignored due to instance not being chief.") - return - - if feature_spec is None: - if getattr(trainer, '_feature_config') is None: - raise ValueError("feature_spec is set to None." - "Please pass feature_spec=feature_config.get_feature_spec() to the export_all_model function") - else: - feature_spec = trainer._feature_config.get_feature_spec() - - export_dir = twml.util.sanitize_hdfs_path(export_dir) - old_export_output_fn = trainer._export_output_fn - trainer._export_output_fn = export_output_fn - supervised_input_receiver_fn = twml.parsers.convert_to_supervised_input_receiver_fn(parse_fn) - if not checkpoint: - checkpoint = trainer.best_or_latest_checkpoint - - export_dir = export_all_models_with_receivers(estimator=trainer.estimator, - export_dir=export_dir, - train_input_receiver_fn=supervised_input_receiver_fn, - eval_input_receiver_fn=supervised_input_receiver_fn, - predict_input_receiver_fn=serving_input_receiver_fn, - export_output_fn=export_output_fn, - export_modes=export_modes, - register_model_fn=trainer.export_model_effects, - feature_spec=feature_spec, - checkpoint_path=checkpoint, - log_features=log_features) - trainer._export_output_fn = old_export_output_fn - return export_dir - - -def export_feature_spec(dir_path, feature_spec_dict): - """ - Exports a FeatureConfig.get_feature_spec() dict to /feature_spec.yaml. 
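And a corresponding sketch for `export_all_models` above (the export path is a placeholder; the receiver function is built with the same twml.parsers helper used elsewhere in this file):

```python
import twml

export_all_models(
    trainer=trainer,
    export_dir="hdfs:///user/placeholder/all_modes_export",
    parse_fn=feature_config.get_parse_fn(),
    serving_input_receiver_fn=twml.parsers.get_sparse_serving_input_receiver_fn(
        feature_config, keep_fields),
    export_modes=("train", "eval", "predict"),
    feature_spec=feature_config.get_feature_spec(),
)
```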
- """ - def ordered_dict_representer(dumper, data): - return dumper.represent_mapping('tag:yaml.org,2002:map', data.items()) - - try: - # needed for Python 2 - yaml.add_representer(str, yaml.representer.SafeRepresenter.represent_str) - yaml.add_representer(unicode, yaml.representer.SafeRepresenter.represent_unicode) - except NameError: - # 'unicode' type doesn't exist on Python 3 - # PyYAML handles unicode correctly in Python 3 - pass - - yaml.add_representer(OrderedDict, ordered_dict_representer) - - fbase = "feature_spec.yaml" - fname = fbase.encode('utf-8') if type(dir_path) != str else fbase - file_path = os.path.join(dir_path, fname) - with tf.io.gfile.GFile(file_path, mode='w') as f: - yaml.dump(feature_spec_dict, f, default_flow_style=False, allow_unicode=True) - tf.logging.info("Exported feature spec to %s" % file_path) - - return file_path - - -# Keep the alias for compatibility. -get_supervised_input_receiver_fn = twml.parsers.convert_to_supervised_input_receiver_fn diff --git a/twml/twml/contrib/export/exporters.py b/twml/twml/contrib/export/exporters.py deleted file mode 100644 index 122955cbc..000000000 --- a/twml/twml/contrib/export/exporters.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -Wrappers around tf.estimator.Exporters to export models and save checkpoints. -""" -import os - -import tensorflow.compat.v1 as tf -from tensorflow.python.estimator import exporter -import twml - - -class _AllSavedModelsExporter(tf.estimator.Exporter): - """Internal exporter class to be used for exporting models for different modes.""" - - def __init__(self, - name, - input_receiver_fn_map, - backup_checkpoints, - assets_extra=None, - as_text=False): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - assets_extra: Additional assets to be included in the exported model. - as_text: Specifies if the exported model should be in a human readable text format. - """ - self._name = name - self._input_receiver_fn_map = input_receiver_fn_map - self._backup_checkpoints = backup_checkpoints - self._assets_extra = assets_extra - self._as_text = as_text - - @property - def name(self): - return self._name - - def export(self, estimator, export_path, checkpoint_path, eval_result, - is_the_final_export): - del is_the_final_export - - export_path = twml.util.sanitize_hdfs_path(export_path) - checkpoint_path = twml.util.sanitize_hdfs_path(checkpoint_path) - - if self._backup_checkpoints: - backup_path = os.path.join(export_path, "checkpoints") - # Ensure backup_path is created. makedirs passes if dir already exists. - tf.io.gfile.makedirs(backup_path) - twml.util.backup_checkpoint(checkpoint_path, backup_path, empty_backup=False) - - export_result = estimator.experimental_export_all_saved_models( - export_path, - self._input_receiver_fn_map, - assets_extra=self._assets_extra, - as_text=self._as_text, - checkpoint_path=checkpoint_path) - - return export_result - - -class BestExporter(tf.estimator.BestExporter): - """ - This class inherits from tf.estimator.BestExporter with the following differences: - - It also creates a backup of the best checkpoint. - - It can export the model for multiple modes. - - A backup / export is performed everytime the evaluated metric is better - than previous models. 
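For reference, a sketch of constructing this exporter (the two receiver functions are assumed to already exist, e.g. built via twml.parsers):

```python
import tensorflow.compat.v1 as tf

input_receiver_fn_map = {
    tf.estimator.ModeKeys.TRAIN: supervised_input_receiver_fn,
    tf.estimator.ModeKeys.PREDICT: serving_input_receiver_fn,
}

exporter = BestExporter(
    name="best_exporter",
    input_receiver_fn_map=input_receiver_fn_map,
    backup_checkpoints=True,
    exports_to_keep=3,
)
# Typically passed to tf.estimator.EvalSpec(..., exporters=[exporter]).
```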
- """ - - def __init__(self, - name='best_exporter', - input_receiver_fn_map=None, - backup_checkpoints=True, - event_file_pattern='eval/*.tfevents.*', - compare_fn=exporter._loss_smaller, - assets_extra=None, - as_text=False, - exports_to_keep=5): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - - Note: - Check the following documentation for more information about the remaining args: - https://www.tensorflow.org/api_docs/python/tf/estimator/BestExporter - """ - serving_input_receiver_fn = input_receiver_fn_map.get(tf.estimator.ModeKeys.PREDICT) - - super(BestExporter, self).__init__( - name, serving_input_receiver_fn, event_file_pattern, compare_fn, - assets_extra, as_text, exports_to_keep) - - if not hasattr(self, "_saved_model_exporter"): - raise AttributeError( - "_saved_model_exporter needs to exist for this exporter to work." - " This is potentially broken because of an internal change in Tensorflow") - - # Override the saved_model_exporter with SaveAllmodelsexporter - self._saved_model_exporter = _AllSavedModelsExporter( - name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text) - - -class LatestExporter(tf.estimator.LatestExporter): - """ - This class inherits from tf.estimator.LatestExporter with the following differences: - - It also creates a backup of the latest checkpoint. - - It can export the model for multiple modes. - - A backup / export is performed everytime the evaluated metric is better - than previous models. - """ - - def __init__(self, - name='latest_exporter', - input_receiver_fn_map=None, - backup_checkpoints=True, - assets_extra=None, - as_text=False, - exports_to_keep=5): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - - Note: - Check the following documentation for more information about the remaining args: - https://www.tensorflow.org/api_docs/python/tf/estimator/LatestExporter - """ - serving_input_receiver_fn = input_receiver_fn_map.get(tf.estimator.ModeKeys.PREDICT) - - super(LatestExporter, self).__init__( - name, serving_input_receiver_fn, assets_extra, as_text, exports_to_keep) - - if not hasattr(self, "_saved_model_exporter"): - raise AttributeError( - "_saved_model_exporter needs to exist for this exporter to work." - " This is potentially broken because of an internal change in Tensorflow") - - # Override the saved_model_exporter with SaveAllmodelsexporter - self._saved_model_exporter = _AllSavedModelsExporter( - name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text) diff --git a/twml/twml/contrib/feature_config.py b/twml/twml/contrib/feature_config.py deleted file mode 100644 index 833695751..000000000 --- a/twml/twml/contrib/feature_config.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Feature configuration for DeepBird jobs returns dictionary of sparse and dense Features -""" -from twitter.deepbird.io.legacy.contrib import feature_config -import twml - - -class FeatureConfig(feature_config.FeatureConfig): - def get_feature_spec(self): - """ - Generates a serialization-friendly dict representing this FeatureConfig. 
- """ - doc = super(FeatureConfig, self).get_feature_spec() - - # Override the class in the spec. - doc["class"] = "twml.contrib.FeatureConfig" - - return doc - - -class FeatureConfigBuilder(feature_config.FeatureConfigBuilder): - # Overwrite self.build() to return twml.FeatureConfig instead - def build(self): - """ - Returns an instance of FeatureConfig with the features passed to the FeatureConfigBuilder. - """ - - ( - keep_tensors, - keep_sparse_tensors, - feature_map, - features_add, - feature_name_to_feature_parser, - feature_in_bq_name, - ) = self._build() - - discretize_dict = {} - for config in self._sparse_extraction_configs: - if config.discretize_num_bins and config.discretize_output_size_bits: - if config.discretize_type == "percentile": - calibrator = twml.contrib.calibrators.PercentileDiscretizerCalibrator - elif config.discretize_type == "hashed_percentile": - calibrator = twml.contrib.calibrators.HashedPercentileDiscretizerCalibrator - elif config.discretize_type == "hashing": - calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator - else: - raise ValueError("Unsupported discretizer type: " + config.discretize_type) - discretize_dict[config.output_name] = calibrator( - config.discretize_num_bins, - config.discretize_output_size_bits, - allow_empty_calibration=config.allow_empty_calibration, - ) - elif config.discretize_num_bins or config.discretize_output_size_bits: - raise ValueError( - "Discretize_num_bins AND discretize_output_size_bits need to be in the FeatureConfig" - ) - - return FeatureConfig( - features={}, - labels=self._labels, - weight=self._weight, - filters=self._filter_features, - tensor_types=keep_tensors, - sparse_tensor_types=keep_sparse_tensors, - feature_types=feature_map, - sparse_extraction_configs=self._sparse_extraction_configs, - feature_extraction_configs=self._feature_extraction_configs, - feature_group_extraction_configs=self._feature_group_extraction_configs, - image_configs=self._image_configs, - discretize_config=discretize_dict, - feature_ids=features_add, - decode_mode=self._decode_mode, - legacy_sparse=self._legacy_sparse, - feature_name_to_feature_parser=feature_name_to_feature_parser, - feature_in_bq_name=feature_in_bq_name, - ) - - -TensorExtractionConfig = feature_config.TensorExtractionConfig - -FeatureGroupExtractionConfig = feature_config.FeatureGroupExtractionConfig - -ImageExtractionConfig = feature_config.ImageExtractionConfig - -_set_tensor_namedtuple = feature_config._set_tensor_namedtuple diff --git a/twml/twml/contrib/feature_config_parsers.py b/twml/twml/contrib/feature_config_parsers.py deleted file mode 100644 index 83c402e2e..000000000 --- a/twml/twml/contrib/feature_config_parsers.py +++ /dev/null @@ -1,224 +0,0 @@ -"""Utility functions to create FeatureConfig objects from feature_spec.yaml files""" -import os -import re - -import tensorflow.compat.v1 as tf -import yaml -from twml.feature_config import FeatureConfigBuilder -from twml.contrib.feature_config import FeatureConfigBuilder as FeatureConfigBuilderV2 - - -def _get_config_version(config_dict): - doc = config_dict - supported_classes = { - "twml.FeatureConfig": "v1", - "twml.contrib.FeatureConfig": "v2" - } - if "class" not in doc: - raise ValueError("'class' key not found") - if doc["class"] not in supported_classes.keys(): - raise ValueError("Class %s not supported. 
Supported clases are %s" - % (doc["class"], supported_classes.keys())) - return supported_classes[doc["class"]] - - -def _validate_config_dict_v1(config_dict): - """ - Validate spec exported by twml.FeatureConfig - """ - doc = config_dict - - def malformed_error(msg): - raise ValueError("twml.FeatureConfig: Malformed feature_spec. %s" % msg) - - if doc["class"] != "twml.FeatureConfig": - malformed_error("'class' is not twml.FeatureConfig") - if "format" not in doc: - malformed_error("'format' key not found") - - # validate spec exported by twml.FeatureConfig - if doc["format"] == "exported": - dict_keys = ["features", "labels", "weight", "tensors", "sparse_tensors"] - for key in dict_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != dict: - malformed_error("'%s' is not a dict" % key) - if "filters" not in doc: - malformed_error("'filters' key not found") - elif type(doc["filters"]) != list: - malformed_error("'filters' is not a list") - - # validate spec provided by modeler - elif doc["format"] == "manual": - raise NotImplementedError("Manual config support not yet implemented") - else: - malformed_error("'format' must be 'exported' or 'manual'") - - -def _validate_config_dict_v2(config_dict): - """ - Validate spec exported by twml.contrib.FeatureConfig - """ - doc = config_dict - - def malformed_error(msg): - raise ValueError("twml.contrib.FeatureConfig: Malformed feature_spec. %s" % msg) - - if doc["class"] != "twml.contrib.FeatureConfig": - malformed_error("'class' is not twml.contrib.FeatureConfig") - if "format" not in doc: - malformed_error("'format key not found'") - - # validate spec exported by twml.contrib.FeatureConfig (basic validation only) - if doc["format"] == "exported": - dict_keys = ["features", "labels", "weight", "tensors", "sparseTensors", "discretizeConfig"] - for key in dict_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != dict: - malformed_error("'%s' is not a dict" % key) - list_keys = ["sparseFeatureGroups", "denseFeatureGroups", "denseFeatures", "images", "filters"] - for key in list_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != list: - malformed_error("'%s' is not a list" % key) - - # validate spec provided by modeler - elif doc["format"] == "manual": - raise NotImplementedError("Manual config support not yet implemented") - else: - malformed_error("'format' must be 'exported' or 'manual'") - - -def _create_feature_config_v1(config_dict, data_spec_path): - fc_builder = FeatureConfigBuilder(data_spec_path) - - if config_dict["format"] == "exported": - # add features - for feature_info in config_dict["features"].values(): - feature_name = re.escape(feature_info["featureName"]) - feature_group = feature_info["featureGroup"] - fc_builder.add_feature(feature_name, feature_group) - # add labels - labels = [] - for label_info in config_dict["labels"].values(): - labels.append(label_info["featureName"]) - fc_builder.add_labels(labels) - # feature filters - for feature_name in config_dict["filters"]: - fc_builder.add_filter(feature_name) - # weight - if config_dict["weight"]: - weight_feature = list(config_dict["weight"].values())[0]["featureName"] - fc_builder.define_weight(weight_feature) - else: - raise ValueError("Format '%s' not implemented" % config_dict["format"]) - - return fc_builder.build() - - -def _create_feature_config_v2(config_dict, data_spec_path): - fc_builder = FeatureConfigBuilderV2(data_spec_path) - - if 
config_dict["format"] == "exported": - # add sparse group extraction configs - for sparse_group in config_dict["sparseFeatureGroups"]: - fids = sparse_group["features"].keys() - fnames = [sparse_group["features"][fid]["featureName"] for fid in fids] - fc_builder.extract_features_as_hashed_sparse( - feature_regexes=[re.escape(fname) for fname in fnames], - output_tensor_name=sparse_group["outputName"], - hash_space_size_bits=sparse_group["hashSpaceBits"], - discretize_num_bins=sparse_group["discretize"]["numBins"], - discretize_output_size_bits=sparse_group["discretize"]["outputSizeBits"], - discretize_type=sparse_group["discretize"]["type"], - type_filter=sparse_group["filterType"]) - - # add dense group extraction configs - for dense_group in config_dict["denseFeatureGroups"]: - fids = dense_group["features"].keys() - fnames = [dense_group["features"][fid]["featureName"] for fid in fids] - fc_builder.extract_feature_group( - feature_regexes=[re.escape(fname) for fname in fnames], - group_name=dense_group["outputName"], - type_filter=dense_group["filterType"], - default_value=dense_group["defaultValue"]) - - # add dense feature configs - for dense_features in config_dict["denseFeatures"]: - fids = dense_features["features"].keys() - fnames = [dense_features["features"][fid]["featureName"] for fid in fids] - default_value = dense_features["defaultValue"] - if len(fnames) == 1 and type(default_value) != dict: - fc_builder.extract_feature( - feature_name=re.escape(fnames[0]), - expected_shape=dense_features["expectedShape"], - default_value=dense_features["defaultValue"]) - else: - fc_builder.extract_features( - feature_regexes=[re.escape(fname) for fname in fnames], - default_value_map=dense_features["defaultValue"]) - - # add image feature configs - for image in config_dict["images"]: - fc_builder.extract_image( - feature_name=image["featureName"], - preprocess=image["preprocess"], - out_type=tf.as_dtype(image["outType"].lower()), - channels=image["channels"], - default_image=image["defaultImage"], - ) - - # add other tensor features (non-image) - tensor_fnames = [] - image_fnames = [img["featureName"] for img in config_dict["images"]] - for tensor_fname in config_dict["tensors"]: - if tensor_fname not in image_fnames: - tensor_fnames.append(tensor_fname) - for sparse_tensor_fname in config_dict["sparseTensors"]: - tensor_fnames.append(sparse_tensor_fname) - fc_builder.extract_tensors(tensor_fnames) - - # add labels - labels = [] - for label_info in config_dict["labels"].values(): - labels.append(label_info["featureName"]) - fc_builder.add_labels(labels) - - else: - raise ValueError("Format '%s' not implemented" % config_dict["format"]) - - return fc_builder.build() - - -def create_feature_config_from_dict(config_dict, data_spec_path): - """ - Create a FeatureConfig object from a feature spec dict. - """ - config_version = _get_config_version(config_dict) - if config_version == "v1": - _validate_config_dict_v1(config_dict) - feature_config = _create_feature_config_v1(config_dict, data_spec_path) - elif config_version == "v2": - _validate_config_dict_v2(config_dict) - feature_config = _create_feature_config_v2(config_dict, data_spec_path) - else: - raise ValueError("version not supported") - - return feature_config - - -def create_feature_config(config_path, data_spec_path): - """ - Create a FeatureConfig object from a feature_spec.yaml file. 
- """ - _, ext = os.path.splitext(config_path) - if ext not in ['.yaml', '.yml']: - raise ValueError("create_feature_config_from_yaml: Only .yaml/.yml supported") - - with tf.io.gfile.GFile(config_path, mode='r') as fs: - config_dict = yaml.safe_load(fs) - - return create_feature_config_from_dict(config_dict, data_spec_path) diff --git a/twml/twml/contrib/feature_importances/__init__.py b/twml/twml/contrib/feature_importances/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/twml/twml/contrib/feature_importances/feature_importances.py b/twml/twml/contrib/feature_importances/feature_importances.py deleted file mode 100644 index a8bfcc129..000000000 --- a/twml/twml/contrib/feature_importances/feature_importances.py +++ /dev/null @@ -1,414 +0,0 @@ -# checkstyle: noqa - -import time -from collections import defaultdict - -from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient -from com.twitter.mlmetastore.modelrepo.core import FeatureImportance, FeatureNames -from twitter.deepbird.io.util import match_feature_regex_list - -from twml.contrib.feature_importances.helpers import ( - _get_feature_name_from_config, - _get_feature_types_from_records, - _get_metrics_hook, - _expand_prefix, - longest_common_prefix, - write_list_to_hdfs_gfile) -from twml.contrib.feature_importances.feature_permutation import PermutedInputFnFactory -from twml.tracking import ExperimentTracker - -from tensorflow.compat.v1 import logging -from requests.exceptions import HTTPError, RetryError -from queue import Queue - - -SERIAL = "serial" -TREE = "tree" -INDIVIDUAL = "Individual" -GROUP = "Group" -ROC_AUC = "roc_auc" -RCE = "rce" -LOSS = "loss" - - -def _repartition(feature_list_queue, fnames_ftypes, split_feature_group_on_period): - """ - Iterate through letters to partition each feature by prefix, and then put each tuple - (prefix, feature_partition) into the feature_list_queue - Args: - prefix (str): The prefix shared by each feature in list_of_feature_types - feature_list_queue (Queue<(str, list<(str, str)>)>): The queue of feature groups - fnames_ftypes (list<(str, str)>): List of (fname, ftype) pairs. Each fname begins with prefix - split_feature_group_on_period (str): If true, require that feature groups end in a period - Returns: - Updated queue with each group in fnames_ftypes - """ - assert len(fnames_ftypes) > 1 - - split_character = "." if split_feature_group_on_period else None - # Compute the longest prefix of the words - prefix = longest_common_prefix( - strings=[fname for fname, _ in fnames_ftypes], split_character=split_character) - - # Separate the features by prefix - prefix_to_features = defaultdict(list) - for fname, ftype in fnames_ftypes: - assert fname.startswith(prefix) - new_prefix = _expand_prefix(fname=fname, prefix=prefix, split_character=split_character) - prefix_to_features[new_prefix].append((fname, ftype)) - - # Add all of the new partitions to the queue - for new_prefix, fname_ftype_list in prefix_to_features.items(): - extended_new_prefix = longest_common_prefix( - strings=[fname for fname, _ in fname_ftype_list], split_character=split_character) - assert extended_new_prefix.startswith(new_prefix) - feature_list_queue.put((extended_new_prefix, fname_ftype_list)) - return feature_list_queue - - -def _infer_if_is_metric_larger_the_better(stopping_metric): - # Infers whether a metric should be interpreted such that larger numbers are better (e.g. ROC_AUC), as opposed to - # larger numbers being worse (e.g. 
LOSS) - if stopping_metric is None: - raise ValueError("Error: Stopping Metric cannot be None") - elif stopping_metric.startswith(LOSS): - logging.info("Interpreting {} to be a metric where larger numbers are worse".format(stopping_metric)) - is_metric_larger_the_better = False - else: - logging.info("Interpreting {} to be a metric where larger numbers are better".format(stopping_metric)) - is_metric_larger_the_better = True - return is_metric_larger_the_better - - -def _check_whether_tree_should_expand(baseline_performance, computed_performance, sensitivity, stopping_metric, is_metric_larger_the_better): - """ - Returns True if - - the metric is positive (e.g. ROC_AUC) and computed_performance is nontrivially smaller than the baseline_performance - - the metric is negative (e.g. LOSS) and computed_performance is nontrivially larger than the baseline_performance - """ - difference = ((baseline_performance[stopping_metric] - computed_performance[stopping_metric]) / - baseline_performance[stopping_metric]) - - if not is_metric_larger_the_better: - difference = -difference - - logging.info( - "Found a {} difference of {}. Sensitivity is {}.".format("positive" if is_metric_larger_the_better else "negative", difference, sensitivity)) - return difference > sensitivity - - -def _compute_multiple_permuted_performances_from_trainer( - factory, fname_ftypes, trainer, parse_fn, record_count): - """Compute performances with fname and fype permuted - """ - metrics_hook = _get_metrics_hook(trainer) - trainer._estimator.evaluate( - input_fn=factory.get_permuted_input_fn( - batch_size=trainer._params.eval_batch_size, parse_fn=parse_fn, fname_ftypes=fname_ftypes), - steps=(record_count + trainer._params.eval_batch_size) // trainer._params.eval_batch_size, - hooks=[metrics_hook], - checkpoint_path=trainer.best_or_latest_checkpoint) - return metrics_hook.metric_values - - -def _get_extra_feature_group_performances(factory, trainer, parse_fn, extra_groups, feature_to_type, record_count): - """Compute performance differences for the extra feature groups - """ - extra_group_feature_performance_results = {} - for group_name, raw_feature_regex_list in extra_groups.items(): - start = time.time() - fnames = match_feature_regex_list( - features=feature_to_type.keys(), - feature_regex_list=[regex for regex in raw_feature_regex_list], - preprocess=False, - as_dict=False) - - fnames_ftypes = [(fname, feature_to_type[fname]) for fname in fnames] - - logging.info("Extracted extra group {} with features {}".format(group_name, fnames_ftypes)) - extra_group_feature_performance_results[group_name] = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fnames_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - group_name, int(time.time() - start))) - return extra_group_feature_performance_results - - -def _feature_importances_tree_algorithm( - data_dir, trainer, parse_fn, fnames, stopping_metric, file_list=None, datarecord_filter_fn=None, split_feature_group_on_period=True, - record_count=99999, is_metric_larger_the_better=None, sensitivity=0.025, extra_groups=None, dont_build_tree=False): - """Tree algorithm for feature and feature group importances. This algorithm build a prefix tree of - the feature names and then traverses the tree with a BFS. 
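A worked example of the expansion test described above (numbers are made up; this mirrors the relative-difference computation in `_check_whether_tree_should_expand`):

```python
baseline = {"roc_auc": 0.80}   # performance with nothing permuted
permuted = {"roc_auc": 0.76}   # performance with the candidate feature group permuted
sensitivity = 0.025

# For a larger-is-better metric, the relative drop is (baseline - permuted) / baseline.
difference = (baseline["roc_auc"] - permuted["roc_auc"]) / baseline["roc_auc"]
print(difference)                # ~0.05
print(difference > sensitivity)  # True -> keep splitting this feature group
```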
At each node (aka group of features with - a shared prefix) the algorithm computes the performance of the model when we permute all features - in the group. The algorithm only zooms-in on groups that impact the performance by more than - sensitivity. As a result, features that affect the model performance by less than sensitivity will - not have an exact importance. - Args: - data_dir: (str): The location of the training or testing data to compute importances over. - If None, the trainer._eval_files are used - trainer: (DataRecordTrainer): A DataRecordTrainer object - parse_fn: (function): The parse_fn used by eval_input_fn - fnames (list): The list of feature names - stopping_metric (str): The metric to use to determine when to stop expanding trees - file_list (list): The list of filenames. Exactly one of file_list and data_dir should be - provided - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - split_feature_group_on_period (boolean): If true, split feature groups by period rather than on - optimal prefix - record_count (int): The number of records to compute importances over - is_metric_larger_the_better (boolean): If true, assume that stopping_metric is a metric where larger - values are better (e.g. ROC-AUC) - sensitivity (float): The smallest change in performance to continue to expand the tree - extra_groups (dict>): A dictionary mapping the name of extra feature groups to the list of - the names of the features in the group. You should only supply a value for this argument if you have a set - of features that you want to evaluate as a group but don't share a prefix - dont_build_tree (boolean): If True, don't build the tree and only compute the extra_groups importances - Returns: - A dictionary that contains the individual and group feature importances - """ - factory = PermutedInputFnFactory( - data_dir=data_dir, record_count=record_count, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn) - baseline_performance = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=[], - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - out = {"None": baseline_performance} - - if stopping_metric not in baseline_performance: - raise ValueError("The stopping metric '{}' not found in baseline_performance. Metrics are {}".format( - stopping_metric, list(baseline_performance.keys()))) - - is_metric_larger_the_better = ( - is_metric_larger_the_better if is_metric_larger_the_better is not None else _infer_if_is_metric_larger_the_better(stopping_metric)) - logging.info("Using {} as the stopping metric for the tree algorithm".format(stopping_metric)) - - feature_to_type = _get_feature_types_from_records(records=factory.records, fnames=fnames) - all_feature_types = list(feature_to_type.items()) - - individual_feature_performances = {} - feature_group_performances = {} - if dont_build_tree: - logging.info("Not building feature importance trie. 
Will only compute importances for the extra_groups") - else: - logging.info("Building feature importance trie") - # Each element in the Queue will be a tuple of (prefix, list_of_feature_type_pairs) where - # each feature in list_of_feature_type_pairs will have have the prefix "prefix" - feature_list_queue = _repartition( - feature_list_queue=Queue(), fnames_ftypes=all_feature_types, split_feature_group_on_period=split_feature_group_on_period) - - while not feature_list_queue.empty(): - # Pop the queue. We should never have an empty list in the queue - prefix, fnames_ftypes = feature_list_queue.get() - assert len(fnames_ftypes) > 0 - - # Compute performance from permuting all features in fname_ftypes - logging.info( - "\n\nComputing importances for {} ({}...). {} elements left in the queue \n\n".format( - prefix, fnames_ftypes[:5], feature_list_queue.qsize())) - start = time.time() - computed_performance = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fnames_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - prefix, int(time.time() - start))) - if len(fnames_ftypes) == 1: - individual_feature_performances[fnames_ftypes[0][0]] = computed_performance - else: - feature_group_performances[prefix] = computed_performance - # Dig deeper into the features in fname_ftypes only if there is more than one feature in the - # list and the performance drop is nontrivial - logging.info("Checking performance for {} ({}...)".format(prefix, fnames_ftypes[:5])) - check = _check_whether_tree_should_expand( - baseline_performance=baseline_performance, computed_performance=computed_performance, - sensitivity=sensitivity, stopping_metric=stopping_metric, is_metric_larger_the_better=is_metric_larger_the_better) - if len(fnames_ftypes) > 1 and check: - logging.info("Expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) - feature_list_queue = _repartition( - feature_list_queue=feature_list_queue, fnames_ftypes=fnames_ftypes, split_feature_group_on_period=split_feature_group_on_period) - else: - logging.info("Not expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) - - # Baseline performance is grouped in with individual_feature_importance_results - individual_feature_performance_results = dict( - out, **{k: v for k, v in individual_feature_performances.items()}) - group_feature_performance_results = {k: v for k, v in feature_group_performances.items()} - - if extra_groups is not None: - logging.info("Computing performances for extra groups {}".format(extra_groups.keys())) - for group_name, performances in _get_extra_feature_group_performances( - factory=factory, - trainer=trainer, - parse_fn=parse_fn, - extra_groups=extra_groups, - feature_to_type=feature_to_type, - record_count=record_count).items(): - group_feature_performance_results[group_name] = performances - else: - logging.info("Not computing performances for extra groups") - - return {INDIVIDUAL: individual_feature_performance_results, - GROUP: group_feature_performance_results} - - -def _feature_importances_serial_algorithm( - data_dir, trainer, parse_fn, fnames, file_list=None, datarecord_filter_fn=None, factory=None, record_count=99999): - """Serial algorithm for feature importances. This algorithm computes the - importance of each feature. 
- """ - factory = PermutedInputFnFactory( - data_dir=data_dir, record_count=record_count, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn) - feature_to_type = _get_feature_types_from_records(records=factory.records, fnames=fnames) - - out = {} - for fname, ftype in list(feature_to_type.items()) + [(None, None)]: - logging.info("\n\nComputing importances for {}\n\n".format(fname)) - start = time.time() - fname_ftypes = [(fname, ftype)] if fname is not None else [] - out[str(fname)] = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fname_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - fname, int(time.time() - start))) - # The serial algorithm does not compute group feature results. - return {INDIVIDUAL: out, GROUP: {}} - - -def _process_feature_name_for_mldash(feature_name): - # Using a forward slash in the name causes feature importance writing to fail because strato interprets it as - # part of a url - return feature_name.replace("/", "__") - - -def compute_feature_importances( - trainer, data_dir=None, feature_config=None, algorithm=TREE, parse_fn=None, datarecord_filter_fn=None, **kwargs): - """Perform a feature importance analysis on a trained model - Args: - trainer: (DataRecordTrainer): A DataRecordTrainer object - data_dir: (str): The location of the training or testing data to compute importances over. - If None, the trainer._eval_files are used - feature_config (contrib.FeatureConfig): The feature config object. If this is not provided, it - is taken from the trainer - algorithm (str): The algorithm to use - parse_fn: (function): The parse_fn used by eval_input_fn. By default this is - feature_config.get_parse_fn() - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - - # We only use the trainer's eval files if an override data_dir is not provided - if data_dir is None: - logging.info("Using trainer._eval_files (found {} as files)".format(trainer._eval_files)) - file_list = trainer._eval_files - else: - logging.info("data_dir provided. 
Looking at {} for data.".format(data_dir)) - file_list = None - - feature_config = feature_config or trainer._feature_config - out = {} - if not feature_config: - logging.warn("WARN: Not computing feature importance because trainer._feature_config is None") - out = None - else: - parse_fn = parse_fn if parse_fn is not None else feature_config.get_parse_fn() - fnames = _get_feature_name_from_config(feature_config) - logging.info("Computing importances for {}".format(fnames)) - logging.info("Using the {} feature importance computation algorithm".format(algorithm)) - algorithm = { - SERIAL: _feature_importances_serial_algorithm, - TREE: _feature_importances_tree_algorithm}[algorithm] - out = algorithm(data_dir=data_dir, trainer=trainer, parse_fn=parse_fn, fnames=fnames, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn, **kwargs) - return out - - -def write_feature_importances_to_hdfs( - trainer, feature_importances, output_path=None, metric="roc_auc"): - """Publish a feature importance analysis to hdfs as a tsv - Args: - (see compute_feature_importances for other args) - trainer (Trainer) - feature_importances (dict): Dictionary of feature importances - output_path (str): The remote or local file to write the feature importances to. If not - provided, this is inferred to be the trainer save dir - metric (str): The metric to write to tsv - """ - # String formatting appends (Individual) or (Group) to feature name depending on type - perfs = {"{} ({})".format(k, importance_key) if k != "None" else k: v[metric] - for importance_key, importance_value in feature_importances.items() - for k, v in importance_value.items()} - - output_path = ("{}/feature_importances-{}".format( - trainer._save_dir[:-1] if trainer._save_dir.endswith('/') else trainer._save_dir, - output_path if output_path is not None else str(time.time()))) - - if len(perfs) > 0: - logging.info("Writing feature_importances for {} to hdfs".format(perfs.keys())) - entries = [ - { - "name": name, - "drop": perfs["None"] - perfs[name], - "pdrop": 100 * (perfs["None"] - perfs[name]) / (perfs["None"] + 1e-8), - "perf": perfs[name] - } for name in perfs.keys()] - out = ["Name\tPerformance Drop\tPercent Performance Drop\tPerformance"] - for entry in sorted(entries, key=lambda d: d["drop"]): - out.append("{name}\t{drop}\t{pdrop}%\t{perf}".format(**entry)) - logging.info("\n".join(out)) - write_list_to_hdfs_gfile(out, output_path) - logging.info("Wrote feature feature_importances to {}".format(output_path)) - else: - logging.info("Not writing feature_importances to hdfs") - return output_path - - -def write_feature_importances_to_ml_dash(trainer, feature_importances, feature_config=None): - # type: (DataRecordTrainer, FeatureConfig, dict) -> None - """Publish feature importances + all feature names to ML Metastore - Args: - trainer: (DataRecordTrainer): A DataRecordTrainer object - feature_config (contrib.FeatureConfig): The feature config object. 
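For reference, the `drop` and `pdrop` columns written by the TSV writer above are simple differences against the unpermuted baseline. A tiny worked example with made-up metric values (the "(Individual)"/"(Group)" suffixes are illustrative labels):

```python
# Made-up roc_auc values; "None" is the unpermuted baseline run.
perfs = {"None": 0.80, "user.age (Individual)": 0.75, "doc. (Group)": 0.79}

baseline = perfs["None"]
for name, perf in perfs.items():
    drop = baseline - perf                    # absolute metric loss when permuted
    pdrop = 100 * drop / (baseline + 1e-8)    # loss as a percentage of the baseline
    print(f"{name}\t{drop:.3f}\t{pdrop:.2f}%\t{perf:.3f}")
```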
If this is not provided, it - is taken from the trainer - feature_importances (dict, default=None): Dictionary of precomputed feature importances - feature_importance_metric (str, default=None): The metric to write to ML Dashboard - """ - experiment_tracking_path = trainer.experiment_tracker.tracking_path\ - if trainer.experiment_tracker.tracking_path\ - else ExperimentTracker.guess_path(trainer._save_dir) - - logging.info('Computing feature importances for run: {}'.format(experiment_tracking_path)) - - feature_importance_list = [] - for key in feature_importances: - for feature, imps in feature_importances[key].items(): - logging.info('FEATURE NAME: {}'.format(feature)) - feature_name = feature.split(' (').pop(0) - for metric_name, value in imps.items(): - try: - imps[metric_name] = float(value) - logging.info('Wrote feature importance value {} for metric: {}'.format(str(value), metric_name)) - except Exception as ex: - logging.error("Skipping writing metric:{} to ML Metastore due to invalid metric value: {} or value type: {}. Exception: {}".format(metric_name, str(value), type(value), str(ex))) - pass - - feature_importance_list.append(FeatureImportance( - run_id=experiment_tracking_path, - feature_name=_process_feature_name_for_mldash(feature_name), - feature_importance_metrics=imps, - is_group=key == GROUP - )) - -# setting feature config to match the one used in compute_feature_importances - feature_config = feature_config or trainer._feature_config - feature_names = FeatureNames( - run_id=experiment_tracking_path, - names=list(feature_config.features.keys()) - ) - - try: - client = ModelRepoClient() - logging.info('Writing feature importances to ML Metastore') - client.add_feature_importances(feature_importance_list) - logging.info('Writing feature names to ML Metastore') - client.add_feature_names(feature_names) - except (HTTPError, RetryError) as err: - logging.error('Feature importance is not being written due to: ' - 'HTTPError when attempting to write to ML Metastore: \n{}.'.format(err)) diff --git a/twml/twml/contrib/feature_importances/feature_permutation.py b/twml/twml/contrib/feature_importances/feature_permutation.py deleted file mode 100644 index 809f5fde0..000000000 --- a/twml/twml/contrib/feature_importances/feature_permutation.py +++ /dev/null @@ -1,129 +0,0 @@ -from copy import deepcopy -import random -import types - -from twitter.deepbird.util.thrift.simple_converters import ( - bytes_to_thrift_object, thrift_object_to_bytes) - -from tensorflow.compat.v1 import logging -from com.twitter.ml.api.ttypes import DataRecord # pylint: disable=import-error -import tensorflow.compat.v1 as tf -import twml - - -class PermutedInputFnFactory(object): - - def __init__(self, data_dir, record_count, file_list=None, datarecord_filter_fn=None): - """ - Args: - data_dir (str): The location of the records on hdfs - record_count (int): The number of records to process - file_list (list, default=None): The list of data files on HDFS. If provided, use this instead - of data_dir - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - if not (data_dir is None) ^ (file_list is None): - raise ValueError("Exactly one of data_dir and file_list can be provided. 
Got {} for data_dir and {} for file_list".format( - data_dir, file_list)) - - file_list = file_list if file_list is not None else twml.util.list_files(twml.util.preprocess_path(data_dir)) - _next_batch = twml.input_fns.default_input_fn(file_list, 1, lambda x: x, - num_threads=2, shuffle=True, shuffle_files=True) - self.records = [] - # Validate datarecord_filter_fn - if datarecord_filter_fn is not None and not isinstance(datarecord_filter_fn, types.FunctionType): - raise TypeError("datarecord_filter_fn is not function type") - with tf.Session() as sess: - for i in range(record_count): - try: - record = bytes_to_thrift_object(sess.run(_next_batch)[0], DataRecord) - if datarecord_filter_fn is None or datarecord_filter_fn(record): - self.records.append(record) - except tf.errors.OutOfRangeError: - logging.info("Stopping after reading {} records out of {}".format(i, record_count)) - break - if datarecord_filter_fn: - logging.info("datarecord_filter_fn has been applied; keeping {} records out of {}".format(len(self.records), record_count)) - - def _get_record_generator(self): - return (thrift_object_to_bytes(r) for r in self.records) - - def get_permuted_input_fn(self, batch_size, parse_fn, fname_ftypes): - """Get an input function that passes in a preset number of records that have been feature permuted - Args: - parse_fn (function): The function to parse inputs - fname_ftypes: (list<(str, str)>): The names and types of the features to permute - """ - def permuted_parse_pyfn(bytes_array): - out = [] - for b in bytes_array: - rec = bytes_to_thrift_object(b, DataRecord) - if fname_ftypes: - rec = _permutate_features(rec, fname_ftypes=fname_ftypes, records=self.records) - out.append(thrift_object_to_bytes(rec)) - return [out] - - def permuted_parse_fn(bytes_tensor): - parsed_bytes_tensor = parse_fn(tf.py_func(permuted_parse_pyfn, [bytes_tensor], tf.string)) - return parsed_bytes_tensor - - def input_fn(batch_size=batch_size, parse_fn=parse_fn, factory=self): - return (tf.data.Dataset - .from_generator(self._get_record_generator, tf.string) - .batch(batch_size) - .map(permuted_parse_fn, 4) - .make_one_shot_iterator() - .get_next()) - return input_fn - - -def _permutate_features(rec, fname_ftypes, records): - """Replace a feature value with a value from random selected record - Args: - rec: (datarecord): A datarecord returned from DataRecordGenerator - fname_ftypes: (list<(str, str)>): The names and types of the features to permute - records: (list): The records to sample from - Returns: - The record with the feature permuted - """ - rec_new = deepcopy(rec) - rec_replace = random.choice(records) - - # If the replacement datarecord does not have the feature type entirely, add it in - # to make the logic a bit simpler - for fname, feature_type in fname_ftypes: - fid = twml.feature_id(fname)[0] - if rec_replace.__dict__.get(feature_type, None) is None: - rec_replace.__dict__[feature_type] = ( - dict() if feature_type != 'binaryFeatures' else set()) - if rec_new.__dict__.get(feature_type, None) is None: - rec_new.__dict__[feature_type] = ( - dict() if feature_type != 'binaryFeatures' else set()) - - if feature_type != 'binaryFeatures': - if fid not in rec_replace.__dict__[feature_type] and fid in rec_new.__dict__.get(feature_type, dict()): - # If the replacement datarecord does not contain the feature but the original does - del rec_new.__dict__[feature_type][fid] - elif fid in rec_replace.__dict__[feature_type]: - # If the replacement datarecord does contain the feature - if 
rec_new.__dict__[feature_type] is None: - rec_new.__dict__[feature_type] = dict() - rec_new.__dict__[feature_type][fid] = rec_replace.__dict__[feature_type][fid] - else: - # If neither datarecord contains this feature - pass - else: - if fid not in rec_replace.__dict__[feature_type] and fid in rec_new.__dict__.get(feature_type, set()): - # If the replacement datarecord does not contain the feature but the original does - rec_new.__dict__[feature_type].remove(fid) - elif fid in rec_replace.__dict__[feature_type]: - # If the replacement datarecord does contain the feature - if rec_new.__dict__[feature_type] is None: - rec_new.__dict__[feature_type] = set() - rec_new.__dict__[feature_type].add(fid) - # If neither datarecord contains this feature - else: - # If neither datarecord contains this feature - pass - return rec_new diff --git a/twml/twml/contrib/feature_importances/helpers.py b/twml/twml/contrib/feature_importances/helpers.py deleted file mode 100644 index f3f600e8b..000000000 --- a/twml/twml/contrib/feature_importances/helpers.py +++ /dev/null @@ -1,96 +0,0 @@ -import uuid - -from tensorflow.compat.v1 import logging -import twml -import tensorflow.compat.v1 as tf - - -def write_list_to_hdfs_gfile(list_to_write, output_path): - """Use tensorflow gfile to write a list to a location on hdfs""" - locname = "/tmp/{}".format(str(uuid.uuid4())) - with open(locname, "w") as f: - for row in list_to_write: - f.write("%s\n" % row) - tf.io.gfile.copy(locname, output_path, overwrite=False) - - -def decode_str_or_unicode(str_or_unicode): - return str_or_unicode.decode() if hasattr(str_or_unicode, 'decode') else str_or_unicode - - -def longest_common_prefix(strings, split_character): - """ - Args: - string (list): The list of strings to find the longest common prefix of - split_character (str): If not None, require that the return string end in this character or - be the length of the entire string - Returns: - The string corresponding to the longest common prefix - """ - sorted_strings = sorted(strings) - s1, s2 = sorted_strings[0], sorted_strings[-1] - if s1 == s2: - # If the strings are the same, just return the full string - out = s1 - else: - # If the strings are not the same, return the longest common prefix optionally ending in split_character - ix = 0 - for i in range(min(len(s1), len(s2))): - if s1[i] != s2[i]: - break - if split_character is None or s1[i] == split_character: - ix = i + 1 - out = s1[:ix] - return out - - -def _expand_prefix(fname, prefix, split_character): - if len(fname) == len(prefix): - # If the prefix is already the full feature, just take the feature name - out = fname - elif split_character is None: - # Advance the prefix by one character - out = fname[:len(prefix) + 1] - else: - # Advance the prefix to the next instance of split_character or the end of the string - for ix in range(len(prefix), len(fname)): - if fname[ix] == split_character: - break - out = fname[:ix + 1] - return out - - -def _get_feature_types_from_records(records, fnames): - # This method gets the types of the features in fnames by looking at the datarecords themselves. - # The reason why we do this rather than extract the feature types from the feature_config is - # that the feature naming conventions in the feature_config are different from those in the - # datarecords. 
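As a quick illustration of the prefix helpers defined above (the feature names are hypothetical, and the commented outputs are what the code computes as I read it):

```python
# Assumes the helpers above are importable from the module shown in the diff header.
from twml.contrib.feature_importances.helpers import longest_common_prefix, _expand_prefix

# The shared prefix of the "user.*" features, ending at the split character.
print(longest_common_prefix(["user.age", "user.country"], split_character="."))  # "user."

# Growing a prefix one "."-delimited component at a time.
print(_expand_prefix("user.age", "", split_character="."))       # "user."
print(_expand_prefix("user.age", "user.", split_character="."))  # "user.age"
```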
- fids = [twml.feature_id(fname)[0] for fname in fnames] - feature_to_type = {} - for record in records: - for feature_type, values in record.__dict__.items(): - if values is not None: - included_ids = set(values) - for fname, fid in zip(fnames, fids): - if fid in included_ids: - feature_to_type[fname] = feature_type - return feature_to_type - - -def _get_metrics_hook(trainer): - def get_metrics_fn(trainer=trainer): - return {k: v[0]for k, v in trainer.current_estimator_spec.eval_metric_ops.items()} - return twml.hooks.GetMetricsHook(get_metrics_fn=get_metrics_fn) - - -def _get_feature_name_from_config(feature_config): - """Extract the names of the features on a feature config object - """ - decoded_feature_names = [] - for f in feature_config.get_feature_spec()['features'].values(): - try: - fname = decode_str_or_unicode(f['featureName']) - except UnicodeEncodeError as e: - logging.error("Encountered decoding exception when decoding %s: %s" % (f, e)) - decoded_feature_names.append(fname) - return decoded_feature_names diff --git a/twml/twml/contrib/hooks.py b/twml/twml/contrib/hooks.py deleted file mode 100644 index 6d68831fc..000000000 --- a/twml/twml/contrib/hooks.py +++ /dev/null @@ -1,42 +0,0 @@ -import datetime - -from absl import logging -import pytz -import tensorflow.compat.v1 as tf - - -class StopAtTimeHook(tf.train.SessionRunHook): - """ - Hook that stops training at a fixed datetime - """ - - def __init__(self, stop_time): - """ - Arguments: - stop_time: - a datetime.datetime or a datetime.timedelta specifying when to stop. - For naive datetime.datetime objects (with no time zone specified), - UTC time zone is assumed. - """ - if isinstance(stop_time, datetime.timedelta): - self._stop_datetime = pytz.utc.localize(datetime.datetime.utcnow() + stop_time) - elif isinstance(stop_time, datetime.datetime): - if stop_time.tzinfo is None: - self._stop_datetime = pytz.utc.localize(stop_time) - else: - self._stop_datetime = stop_time.astimezone(pytz.UTC) - else: - raise ValueError("Expecting datetime or timedelta for stop_time arg") - self._stop_requested = False - - def after_run(self, run_context, run_values): - delta = self._stop_datetime - pytz.utc.localize(datetime.datetime.utcnow()) - if delta.total_seconds() <= 0: - logging.info("StopAtTimeHook reached stop_time; requesting stop") - run_context.request_stop() - self._stop_requested = True - - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested diff --git a/twml/twml/contrib/initializers.py b/twml/twml/contrib/initializers.py deleted file mode 100644 index 52bad3a19..000000000 --- a/twml/twml/contrib/initializers.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np -import tensorflow.compat.v1 as tf - - -TWML_INIT_FEED_KEY = "TWML_INIT_FEED_COLLECTION" - - -class PartitionConstant(tf.keras.initializers.Constant): - """A constant initializer that supports partitions""" - - def __call__(self, shape, dtype=None, partition_info=None): - if partition_info is not None: - if not isinstance(self.value, np.ndarray): - raise ValueError( - "Currently, PartitionConstant only supports " - "partitioning on np.ndarrays. 
Got {}".format(type(self.value).__name__)) - offsets = partition_info.var_offset - indices = tuple([slice(offset, offset + size) for offset, size in zip(offsets, shape)]) - subset = self.value[indices] - return subset - else: - return self.value - - -partition_constant_initializer = PartitionConstant - - -class PlaceholderInitializer(tf.keras.initializers.Initializer): - """A placeholder initializer that supports partitions""" - - def __init__(self, shape, dtype): - self.dtype = dtype - self.value = tf.placeholder(dtype=dtype, shape=shape) - - def __call__(self, shape, dtype=None, partition_info=None): - if partition_info is not None: - if self.dtype != dtype: - raise ValueError("dtype does not match placeholder dtype") - offsets = partition_info.var_offset - indices = tuple([slice(offset, offset + size) for offset, size in zip(offsets, shape)]) - subset = self.value[indices] - return subset - else: - return self.value - - -def get_init_feed_dict(): - """Get the init feed dictionary to be used when running the init op.""" - # Get the reference to the collection. - init_feed_collection = tf.get_collection(TWML_INIT_FEED_KEY) - init_feed_dict = {} - for d in init_feed_collection: - init_feed_dict.update(d) - return init_feed_dict - - -def clear_init_feed_collection(): - """Clear the init feed collection.""" - init_feed_collection = tf.get_collection_ref(TWML_INIT_FEED_KEY) - while init_feed_collection: - init_feed_collection.pop() diff --git a/twml/twml/contrib/layers/__init__.py b/twml/twml/contrib/layers/__init__.py deleted file mode 100644 index aa6e7d7e4..000000000 --- a/twml/twml/contrib/layers/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# pylint: disable=wildcard-import -""" This module contains all contrib Layers. """ - -from .hashed_percentile_discretizer import HashedPercentileDiscretizer # noqa: F401 -from .hashing_discretizer import HashingDiscretizer # noqa: F401 -from .mask_layer import MaskLayer # noqa: F401 -from .embedding_lookup import EmbeddingLookup # noqa: F401 -from .factorization_machine import FactorizationMachine # noqa: F401 -from .full_dense import full_dense, FullDense # noqa: F401 -from .stacked_rnn import StackedRNN, stacked_rnn # noqa: F401 -from .zscore_normalization import ZscoreNormalization, zscore_normalization # noqa: F401 diff --git a/twml/twml/contrib/layers/embedding_lookup.py b/twml/twml/contrib/layers/embedding_lookup.py deleted file mode 100644 index c83dc7edd..000000000 --- a/twml/twml/contrib/layers/embedding_lookup.py +++ /dev/null @@ -1,419 +0,0 @@ -import os -import re -import time - -from collections import OrderedDict - -from absl import logging -import numpy as np -import tensorflow.compat.v1 as tf -from tensorflow.python.ops.lookup_ops import index_table_from_tensor - -import twml - -# Padding is 0, UNK is 1: -PAD_WORD_ID = 0 -OOV_WORD_ID = 1 - - -def load_initializers_from_csv( - embedding_path, vocab_size=-1, embedding_size=None, separator=None, vocab=None -): - """ - Loads embeddings saved in the `glove format `_. - The glove format is a txt file separated by spaces. - Each line looks like: "word 0.00001 0.2334 ...". - - Arguments: - embedding_path: - path to the embeddings file on HDFS (hdfs://default/...) - or its local_path (/path/to/...). - The embedding_path may also specify a pattern. In which case, the embeddings - are read in the lexical order of the filenames that match the order. - vocab_size: - the maximum size of the vocabulary. The top ``vocab_size`` words in the file - are included in the vocabulary. 
If you specify a positive vocab_size, - the words are expected to be in descending order of frequency. - This allows the embeddings to be easily filtered to top vocab_size words. - Reducing the vocab_size acts as a regularizer, preventing the model to overfit on rarer words. - A negative vocab_size loads all embeddings. - Reducing the vocab_size may also help with memory issues, - allowing the embedding initializers to fit inside the graph. - embedding_size: - Defaults to None. If None, the embedding size is infered from the file name. - For example, ``glove.300d.txt`` and ``glove300d200.txt`` will both infrered - as ``embedding_size=300``. If this can't be done, the ``embedding_size`` is - inferred from the first line in the file. If ``embedding_size`` is provided, - only the last ``embedding_size`` values of each line are considered. This - allows the line parser to recover from partial word parsing errors. - separator: - Specifies the separator to use when splitting each line into values. - Default value is a whitespace (same as glove format). - vocab: - OrderedDict mapping words to np.array embedding vectors. Initializes the vocabulary. - Duplicate words found in the file are ignored. - Defaults to a vocabulary of two words:: - - vocab = OrderedDict() - vocab[''] = np.random.randn(embedding_size) - vocab[''] = np.random.randn(embedding_size) - - Returns: - tuple of (vocab_initializer, weight_initializer, shape) - - vocab_initializer: - A tf.constant_initializer containing a vector of word strings of size vocab_size. - weight_initializer: - A twml.contrib.initializers.partition_constant_initializer containing - the weight matrix of embeddings of size vocab_size x embedding_size. - shape: - A tuple containing of (vocab_size, embedding_size). - - """ - - start = time.time() - - embedding_path = twml.util.sanitize_hdfs_path(embedding_path) - - is_user_vocab = True - if vocab is None: - vocab = OrderedDict() - vocab[''] = True - vocab[''] = True - is_user_vocab = False - elif not isinstance(vocab, OrderedDict): - raise RuntimeError( - "Expecting vocab argument of type OrderedDict or None. " - "Got type %s instead." % type(vocab).__name__ - ) - - if embedding_size is None: - embedding_file = os.path.basename(embedding_path) - match = re.search(r"[^\d]([\d]+)d", embedding_file) - if match is not None: - embedding_size = int(match.group(1)) - - if embedding_size is not None and not isinstance(embedding_size, int): - raise RuntimeError( - "Expecting embedding_size argument of type int or None. " - "Got type %s, instead." % type(embedding_size).__name__ - ) - - embedding_paths = sorted(tf.io.gfile.glob(embedding_path)) - - if len(embedding_paths) > 1: - raise ValueError( - "You are most likely using a the wrong --embedding.path" - ) - - embedding_path = embedding_paths[0] - logging.info("Reading embeddings file from path %s.." % embedding_path) - - with tf.io.gfile.GFile(embedding_path) as f: - lines = f.readlines() - - logging.info("Done reading embeddings file from path %s." % embedding_path) - - logging.info("Parsing vocbulary and embeddings...") - - for line in lines: - # Word and weights separated by space - values = line.strip().split(separator) - # Word is first symbol on each line - word = values[0] - - if word not in vocab: - if embedding_size is None or embedding_size <= 0: - # get all elements after the first one. 
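The glove-format parsing done by this loop can be sketched standalone; the two input lines below are made up:

```python
# Each glove-format line is "word w1 w2 ... wd" separated by whitespace.
import numpy as np

lines = ["the 0.1 -0.2 0.3", "cat 0.0 0.5 -0.1"]

vocab = {}
for line in lines:
    values = line.strip().split()      # default split on whitespace
    word, weights = values[0], values[1:]
    vocab[word] = np.asarray(weights, dtype=np.float32)

print(vocab["cat"])                    # [ 0.   0.5 -0.1]
```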
- word_weights = values[1:] - embedding_size = len(word_weights) - else: - # get the last embedding_size elements - word_weights = values[-min(embedding_size, len(values) - 1) :] - - try: - if len(word_weights) != embedding_size: - raise ValueError - - word_weights = np.asarray(word_weights, dtype=np.float32) - vocab[word] = word_weights - except ValueError: - logging.info("Wasn't able to load embeddings for word '%s'. Ignoring it" % word) - - vocab_len = len(vocab) - if vocab_size > 0 and vocab_len == vocab_size: - # Limit vocabulary to top terms - break - elif (vocab_len % 1000) == 0: - logging.info("Loaded %d words into vocab" % vocab_len) - - else: - logging.info("found duplicate word: %s" % word) - - if not is_user_vocab: - vocab[''] = np.random.randn(embedding_size) - vocab[''] = np.random.randn(embedding_size) - - words = list(vocab.keys()) - weights = list(vocab.values()) - - weights = np.asarray(weights, dtype=np.float32) - assert weights.shape[0] == len(vocab) - assert weights.shape[1] == embedding_size - - vocab_initializer = tf.constant_initializer(words, tf.string) - weight_initializer = twml.contrib.initializers.PartitionConstant(weights, tf.float32) - - logging.info("Loaded %d embeddings in %d seconds." % (len(vocab), time.time() - start)) - return vocab_initializer, weight_initializer, weights.shape - - -def add_parser_arguments(parser): - """ - Adds the embedding.path and embedding.vocab_size command-line arguments to the parser. - These can be used to call an initializer loader function like - the ``load_initializers_from_csv`` function. - - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - - parser.add_argument( - "--embedding.path", - "--embedding_path", - dest="embedding_path", - type=str, - default=None, - help="When specified, loads glove embeddings from .txt glove file", - ) - parser.add_argument( - "--embedding.vocab_size", - "--embedding_vocab_size", - dest="embedding_vocab_size", - type=int, - default=-1, - help="Size of vocabulary. Uses this many of the most frequent terms. Defaults to -1 (use full vocab).", - ) - - return parser - - -class EmbeddingLookup(twml.layers.Layer): - """Layer for looking up embeddings. - Transforms a sequence of strings to a sequence of embeddings. - - Arguments: - vocab_size: - The number of word strings and embeddings in the vocabulary. - output_size: - Long or Integer, dimensionality of the output space. The embedding vector size. - vocab_initializer: - Initializer function for the vocabulary. Required. The initializer should - return a list of strings of size vocab_size. - weight_initializer: - Initializer function for the weight matrix of size vocab_size x output_size. - This argument defaults to zeros_initializer(). - This is valid when the EmbeddingLookup is the first layer of - parameters but should be changed otherwise. - trainable: - Boolean, if `True` adds variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - Defaults to True: trains the embeddings. - num_oov_buckets: - The number of buckets to use for OOV strings. These bucket ids occur after the vocab bucket - ids. Hashing is used to assign OOV strings to these buckets. If `num_oov_buckets` is not - specified, index `OOV_WORD_ID` is used for OOV strings. - name: - String, the name of the layer. 
Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column, does not support yet) - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - dtype: - Defaults to tf.float32. Specifies the dtype of the weights. - use_placeholder: - Defaults to True. - If set to `True`, the initializer is passed via a placeholder. The initializer in this case needs to be of type `keras.initializers.Constant`. - If set to `False`, the initializer becomes part of the graph. This can sometimes be beyond what protobuf clients support. - checkpoint_dir: - Default to None. - If set to the path of a checkpoint, load embedding from the checkpoint. - convert_to_lowercase: - Default to True. - Converting all string inputs to lowercase. - - Notes: If `use_placeholder` is set to `True`, the feed dictionary can be accessed by calling `twml.contrib.initializers.get_init_feed_dict()`. - """ - - def __init__( - self, - vocab_size, - output_size, - vocab_initializer, - weight_initializer=None, - trainable=True, - num_oov_buckets=None, - oov_word_id=None, - name=None, - num_partitions=1, - partition_axis=0, - weight_regularizer=None, - dtype=None, - use_placeholder=True, - checkpoint_dir=None, - convert_to_lowercase=True, - **kwargs, - ): - if dtype is None: - # prevents a bug where the parent class defaults to the type of the first input tensor. - dtype = tf.float32 - super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs) - # Weights initialization is set to 0s. This is safe for full sparse layers because - # you are supposed to learn your embedding from the label. - - is_constant_init = isinstance(weight_initializer, tf.keras.initializers.Constant) - if use_placeholder and (not is_constant_init) and (weight_initializer is not None): - raise ValueError("Weight initializer should be a `Constant` or `None`.") - - if weight_initializer is None: - self.weight_initializer = tf.zeros_initializer() - else: - self.weight_initializer = weight_initializer - self.use_placeholder = use_placeholder - self.checkpoint_dir = checkpoint_dir - self.convert_to_lowercase = convert_to_lowercase - - self.vocab_initializer = vocab_initializer - self.vocab_size = vocab_size - self.output_size = output_size - self.num_partitions = num_partitions - self.partition_axis = partition_axis - self.weight_regularizer = weight_regularizer - self.trainable = trainable - self.oov_word_id = oov_word_id - self.num_oov_buckets = num_oov_buckets - - if self.oov_word_id is not None and self.num_oov_buckets is not None: - raise ValueError("At most one of oov_word_id or num_oov_buckets should be specified") - elif self.oov_word_id is None and self.num_oov_buckets is None: - self.oov_word_id = OOV_WORD_ID # use the default OOV word id - - if partition_axis != 0: - raise NotImplementedError("embedding_lookup only supports partition_axis = 0") - - def build(self, input_shapes): - """ - creates the ``vocab`` and ``weight`` Variables - of shape ``[vocab_size]`` and ``[vocab_size, output_size]`` respectively. 
- """ - partitioner = None - - additional_buckets_for_oov = self.num_oov_buckets if self.num_oov_buckets is not None else 0 - shape = [self.vocab_size + additional_buckets_for_oov, self.output_size] - - if self.use_placeholder: - embedding_weight_initializer = twml.contrib.initializers.PlaceholderInitializer( - shape, self.dtype - ) - tf.add_to_collection( - twml.contrib.initializers.TWML_INIT_FEED_KEY, - {embedding_weight_initializer.value: self.weight_initializer.value}, - ) - else: - embedding_weight_initializer = self.weight_initializer - - if self.num_partitions: - partition_axis = int(self.partition_axis) - partitioner = tf.fixed_size_partitioner(self.num_partitions, axis=partition_axis) - else: - # Regular variables do not like it when you pass both constant tensors and shape - if not callable(self.weight_initializer): - shape = None - - self.vocab = self.add_variable( - 'vocab', - initializer=self.vocab_initializer, - shape=[self.vocab_size], - dtype=tf.string, - trainable=False, - ) - - self.weight = self.add_variable( - 'weight', - initializer=None if self.checkpoint_dir is not None else embedding_weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=self.trainable, - partitioner=partitioner, - ) - if self.checkpoint_dir is not None: - twml.trainers.trainer.init_from_checkpoint(self.checkpoint_dir, {'weight': self.weight.name}) - - self.built = True - - def call( - self, inputs, debug=False, oov_summaries=False, **kwargs - ): # pylint: disable=unused-argument - """Converts word strings to word ids using the vocabulary lookup table. - Then converts the word ids to their commensurate embedding vector. - - Arguments: - inputs: - A tensor of word strings. Typically, of size batch_size x seq_len. - debug: - When True, prints the input strings and their commensurate input_ids. - Defaults to False. - oov_summaries: - When True, log the out-of-vocabulary (OOV) rate to TensorBoard - Defaults to False. - - Returns: - The mapping of input word strings to output embedding vectors. - Given an input of shape ``batch_size x seq_len``, the output has shape - ``batch_size x seq_len x embedding_size``. 
- """ - if self.convert_to_lowercase: - inputs = tf.strings.lower(inputs) - if self.num_oov_buckets is None: - lookup_table = index_table_from_tensor(self.vocab, default_value=self.oov_word_id) - else: - lookup_table = index_table_from_tensor(self.vocab, num_oov_buckets=self.num_oov_buckets) - input_ids = lookup_table.lookup(inputs) - - if oov_summaries: - oov_count = tf.reduce_sum( - tf.cast(tf.math.equal(input_ids, self.oov_word_id), tf.dtypes.float32) - ) - valid_count = tf.reduce_sum( - tf.cast(tf.math.not_equal(input_ids, PAD_WORD_ID), tf.dtypes.float32) - ) - oov_rate = oov_count / valid_count - tf.summary.scalar('OOV_rate', oov_rate) - - if debug: - - def print_debug(): - return tf.print("input_strings:", inputs, "\ninput_ids: ", input_ids, summarize=140) - - with tf.control_dependencies([twml.util.do_every_n_steps(print_debug, 1000)]): - input_ids = tf.identity(input_ids) - - output_embeddings = tf.nn.embedding_lookup( - params=self.weight, ids=input_ids, partition_strategy='div' - ) - - output_shape = inputs.shape.concatenate(tf.TensorShape([self.output_size])) - output_embeddings.set_shape(output_shape) - - return output_embeddings diff --git a/twml/twml/contrib/layers/factorization_machine.py b/twml/twml/contrib/layers/factorization_machine.py deleted file mode 100644 index 3b8adae42..000000000 --- a/twml/twml/contrib/layers/factorization_machine.py +++ /dev/null @@ -1,179 +0,0 @@ -# pylint: disable=no-member, arguments-differ, attribute-defined-outside-init, unused-argument -""" -Implementing factorization Layer -""" - -from twitter.deepbird.sparse.sparse_ops import _pad_empty_outputs - -import tensorflow.compat.v1 as tf -import twml -from twml.layers.layer import Layer - - -class FactorizationMachine(Layer): - """factorization machine layer class. - This layer implements the factorization machine operation. - The paper is "Factorization Machines" by Steffen Rendle. - TDD: go/tf-fm-tdd - - Arguments: - num_latent_variables: - num of latent variables - The number of parameter in this layer is num_latent_variables x n where n is number of - input features. - weight_initializer: - Initializer function for the weight matrix. - This argument defaults to zeros_initializer(). - This is valid when the FullSparse is the first layer of - parameters but should be changed otherwise. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. 
- This parameter can also be a list of binary values if `inputs` passed to `call` a list. - """ - - def __init__(self, - num_latent_variables=10, - weight_initializer=None, - activation=None, - trainable=True, - name=None, - use_sparse_grads=True, - use_binary_values=False, - weight_regularizer=None, - substract_self_cross=True, - **kwargs): - super(FactorizationMachine, self).__init__(trainable=trainable, name=name, **kwargs) - - if weight_initializer is None: - weight_initializer = tf.zeros_initializer() - self.weight_initializer = weight_initializer - self.num_latent_variables = num_latent_variables - self.activation = activation - self.use_sparse_grads = use_sparse_grads - self.use_binary_values = use_binary_values - self.weight_regularizer = weight_regularizer - self.substract_self_cross = substract_self_cross - - def build(self, input_shape): - """ - creates``weight`` Variable of shape``[input_size, num_latent_variables]``. - - """ - - shape = [input_shape[1], self.num_latent_variables] - - # There is a 2GB limitation for each tensor because of protobuf. - # 2**30 is 1GB. 2 * (2**30) is 2GB. - dtype = tf.as_dtype(self.dtype) - requested_size = input_shape[1] * self.num_latent_variables * dtype.size - if (requested_size >= 2**31): - raise ValueError("Weight tensor can not be larger than 2GB. " % - "Requested Dimensions(%d, %d) of type %s (%d bytes total)" - (input_shape[1], self.num_latent_variables, dtype.name)) - - if not callable(self.weight_initializer): - shape = None - - # dense tensor - self.weight = self.add_variable( - 'weight', - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=True, - ) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - A SparseTensor - Returns: - - If `inputs` is `SparseTensor`, then returns a number with cross info - """ - # The following are given: - # - inputs is a sparse tensor, we call it sp_x. - # - The dense_v tensor is a dense matrix, whose row i - # corresponds to the vector V_i. 
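For intuition, the quantity computed in the body below, the squared sum of `v_ik * x_i` over features minus the self-cross term, equals twice the sum of pairwise interactions `x_i * x_j * <v_i, v_j>`. A numpy sketch for a single made-up example (the real layer does this per row of a SparseTensor via segment sums):

```python
import numpy as np

x = np.array([1.0, 0.0, 2.0])        # one sparse example, densified
V = np.array([[0.1, 0.2],            # row i is the latent vector v_i
              [0.3, 0.1],
              [0.2, 0.4]])

vx = V * x[:, None]                              # v_ik * x_i
all_crosses_sq = (vx.sum(axis=0) ** 2).sum()     # sum_k [sum_i v_ik x_i]^2
self_crosses = (vx ** 2).sum()                   # sum_k sum_i (v_ik x_i)^2
print(all_crosses_sq - self_crosses)             # 0.4

# Cross-check against the explicit pairwise form.
print(2 * sum(x[i] * x[j] * V[i] @ V[j]
              for i in range(3) for j in range(i + 1, 3)))   # 0.4
```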
- # weights has shape [num_features, k] - sp_x = inputs - if isinstance(inputs, twml.SparseTensor): - sp_x = inputs.to_tf() - elif not isinstance(sp_x, tf.SparseTensor): - raise TypeError("The sp_x must be of type tf.SparseTensor or twml.SparseTensor") - - indices = sp_x.indices[:, 1] - batch_ids = sp_x.indices[:, 0] - values = tf.reshape(sp_x.values, [-1, 1], name=self.name) - if self.use_sparse_grads: - v = tf.nn.embedding_lookup(self.weight, indices) - # if (self.use_binary_values): - # values = tf.ones(tf.shape(values), dtype=values.dtype) - v_times_x = v * values - # First term: Sum_k [Sum_i (v_ik * x_i)]^2 - all_crosses = tf.segment_sum(v_times_x, batch_ids, name=self.name) - all_crosses_squared = tf.reduce_sum((all_crosses * all_crosses), 1) - - if self.substract_self_cross: - # Second term: Sum_k Sum_i [ (v_ik * x_i)^2 ] - v_times_x_2 = v_times_x**2 - self_crosses = tf.reduce_sum(tf.segment_sum(v_times_x_2, batch_ids, name=self.name), 1) - outputs = all_crosses_squared - self_crosses - else: - outputs = all_crosses_squared - else: - # need to check if prediction is faster with code below - crossTerm = tf.reduce_sum((tf.sparse_tensor_dense_matmul(sp_x, self.weight)**2), 1) - - if self.substract_self_cross: - # compute self-cross term - self_crossTerm = tf.reduce_sum(tf.segment_sum((tf.gather(self.weight, indices) * values)**2, batch_ids), 1) - outputs = crossTerm - self_crossTerm - else: - outputs = crossTerm - - if self.activation is not None: - outputs = self.activation(outputs) - - outputs = tf.reshape(outputs, [-1, 1], name=self.name) - outputs = _pad_empty_outputs(outputs, tf.cast(sp_x.dense_shape[0], tf.int32)) - # set more explicit and static shape to avoid shape inference error - # valueError: The last dimension of the inputs to `Dense` should be defined. Found `None` - outputs.set_shape([None, 1]) - return outputs diff --git a/twml/twml/contrib/layers/full_dense.py b/twml/twml/contrib/layers/full_dense.py deleted file mode 100644 index ad78a91a4..000000000 --- a/twml/twml/contrib/layers/full_dense.py +++ /dev/null @@ -1,380 +0,0 @@ -# pylint: disable=no-member,arguments-differ, attribute-defined-outside-init -""" -Implementing Full Dense Layer -""" -from twml.layers import Layer - -import tensorflow.compat.v1 as tf -from tensorflow.python.layers import core - - -class FullDense(Layer): - """ - Full-connected, Dense input layer class. - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Where ``activation`` is the activation function passed as the ``activation`` - argument (if not ``None``), ``weight`` is a weights matrix created by the layer, - and ``bias`` is a bias vector created by the layer. - - However, this layer breaks up ``weight`` into ``num_partitions`` parts, - for the purpose of even disribution of weights across parameter servers - for distributed training. - - Note - This layer is created to allow distributed training optimizations, - but can also be used for single node training (e.g. hogwild) without - code modification - - Arguments: - output_size: - Integer or Long, dimensionality of the output space. - weight_initializer: - Initializer function for the weight matrix. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. 
used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - num_partitions: - Number of pieces to partition the weights into. This layer does - column partitioning of the weights, which is equivalent to - processing the input tensor with multiple fully connected layers - of smaller output size, and then concatenating these outputs - activation: - Activation function (callable). Set it to None to maintain a linear activation. - use_bias: - Boolean whether to include a bias parameter in the layer - bias_initializer: - Initializer function for the bias. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - - Properties: - output_size: - Python integer, dimensionality of the output space. - activation: - Activation function (callable). - weight_initializer: - Initializer instance (or name) for the weight matrix. - bias_initializer: - Initializer instance (or name) for the bias. - weights: - list of underlying weight and bias matrix components. no guarantee on order of elements - weight_regularizer: - Regularizer instance for the weight matrix (callable) - bias_regularizer: - Regularizer instance for the bias (callable). - activity_regularizer: - Regularizer instance for the output (callable) - weight_constraint: - Constraint function for the weight matrix. - bias_constraint: - Constraint function for the bias. 
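The column partitioning described above is numerically equivalent to a single dense layer: split the weight matrix into column blocks, apply each block, and concatenate. A numpy sketch with made-up shapes (bias and activation omitted):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(5, 8))                    # batch of 5, input size 8
W = rng.normal(size=(8, 10))                   # full weight, output size 10

parts = np.split(W, [3, 6], axis=1)            # three column blocks: 3 + 3 + 4 outputs
partitioned = np.concatenate([x @ p for p in parts], axis=1)

print(np.allclose(partitioned, x @ W))         # True
```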
- """ - - def __init__(self, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=3, - activation=None, - use_bias=True, - bias_initializer=tf.zeros_initializer(), - bias_regularizer=None, - activity_regularizer=None, - trainable=True, - name=None, - **kwargs): - super(FullDense, self).__init__(trainable=trainable, name=name, **kwargs) - self._output_sizes = self._get_output_partition_sizes(output_size, num_partitions) - self._units = output_size - self._activation = activation - self._weight_initializer = weight_initializer - self._bias_initializer = bias_initializer - self._weight_regularizer = weight_regularizer - self._bias_regularizer = bias_regularizer - self._weight_constraint = weight_constraint - self._bias_constraint = bias_constraint - self._use_bias = use_bias - # NOTE - many initializers depend on fan_in and fan_out - # - as such, initialization here may be different than - # - for a non-partitioned FullDense - self._parts = [core.Dense(units=out_size, - activation=activation, - use_bias=use_bias, - kernel_initializer=weight_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=weight_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - **kwargs) for out_size in self._output_sizes] - - @staticmethod - def _get_output_partition_sizes(out_size, num_parts): - """ Returns the appropriate output sizes of the partitions """ - boundaries = [out_size * n // num_parts for n in range(num_parts + 1)] - return [k - j for j, k in zip(boundaries[:], boundaries[1:])] - - def build(self, input_shapes): - """ Create the appropriately sized weights and biases in each layer partition """ - if isinstance(input_shapes, (list, tuple)): - input_shape = input_shapes[0] - is_compatible = True - for other_shape in input_shapes[1:]: - is_compatible &= input_shape.is_compatible_with(other_shape) - if not is_compatible: - raise ValueError("Input shapes %s are not compatible." 
% input_shapes) - else: - input_shape = input_shapes - - for part in self._parts: - part.build(input_shape) - - self.built = True - - @property - def units(self): - """ Returns the number of output units of the layer """ - return self._units - - @property - def output_size(self): - """ Returns the number of output units of the layer """ - return self._units - - @property - def activation(self): - """ Returns the activation function """ - return self._activation - - @property - def weight_initializer(self): - """ Returns the weight_initializer """ - return self._weight_initializer - - @property - def weight_regularizer(self): - """ Returns the weight_regularizer """ - return self._weight_regularizer - - @property - def weight_constraint(self): - """ Returns the weight_constraint """ - return self._weight_constraint - - @property - def bias_initializer(self): - """ Returns the bias_initializer """ - return self._bias_initializer - - @property - def bias_regularizer(self): - """ Returns the bias_regularizer """ - return self._bias_regularizer - - @property - def bias_constraint(self): - """ Returns the bias_constraint """ - return self._bias_constraint - - @property - def use_bias(self): - """ Returns whether a bias is used in the layer """ - return self._use_bias - - @property - def trainable_variables(self): - """ Returns the trainable variables of the layer """ - trainable_vars = [] - for pt in self._parts: - trainable_vars += pt.trainable_variables - return trainable_vars - - @property - def trainable_weights(self): - """ Returns the trainable variables of the layer """ - return self.trainable_variables - - @property - def non_trainable_variables(self): - """ Returns the non-trainable variables of the layer """ - non_trainable_vars = [] - for pt in self._parts: - non_trainable_vars += pt.non_trainable_variables - return non_trainable_vars - - @property - def non_trainable_weights(self): - """ Returns the non-trainable variables of the layer """ - return self.non_trainable_variables - - @property - def variables(self): - """ Returns a list of all weights and biases in this layer """ - layer_vars = [] - for pt in self._parts: - layer_vars += pt.weights - return layer_vars - - @property - def weights(self): - """ Returns a list of all weights and biases in this layer """ - return self.variables - - @property - def dtype(self): - """ Returns the dtype of the layers weights """ - return self._parts[0].dtype - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - A dense Tensor or a list of such. - If `inputs` is a list, all tensors must have same `dense_shape`. - - Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + accumulate_n([sp_a * dense_b for sp_a in inputs])`. 
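Relatedly, the `_get_output_partition_sizes` helper defined earlier spreads the outputs as evenly as integer arithmetic allows; for example, 10 outputs over 3 partitions:

```python
out_size, num_parts = 10, 3
boundaries = [out_size * n // num_parts for n in range(num_parts + 1)]   # [0, 3, 6, 10]
sizes = [k - j for j, k in zip(boundaries[:-1], boundaries[1:])]         # [3, 3, 4]
print(boundaries, sizes)
```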
- """ - if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - outputs = [] - for inp in inputs: - part_outputs = [part(inp) for part in self._parts] - outputs.append(tf.concat(part_outputs, axis=-1)) - - return tf.accumulate_n(outputs) - - -def full_dense(inputs, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=3, - activation=None, - use_bias=True, - bias_initializer=tf.zeros_initializer(), - bias_regularizer=None, - activity_regularizer=None, - trainable=True, - name=None, - reuse=None, - **kwargs): - """Functional interface for the fully-connected dense-input layer. - This layer implements the operation: - `outputs = activation(inputs.weight + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `weight` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). - - However, this layer breaks up ``weight`` into ``num_partitions`` parts, - for the purpose of even disribution of weights across parameter servers - for distributed training. - - Note - This layer is created to allow distributed training optimizations, - but can also be used for single node training (e.g. hogwild) without - code modification - - Arguments: - inputs: Tensor input. - output_size: Integer or Long, dimensionality of the output space. - weight_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.get_variable`. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - num_partitions: - Number of pieces to partition the weights into. This layer does - column partitioning of the weights, which is equivalent to - processing the input tensor with multiple fully connected layers - of smaller output size, and then concatenating these outputs - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - bias_initializer: - Initializer function for the bias. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - trainable: - Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: - String, the name of the layer. - reuse: - Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor with shape `inputs.shape[:-1] + [output_size]`. 
- """ - if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - dtype = inputs[0].dtype.base_dtype - - layer = FullDense(output_size=output_size, - weight_initializer=weight_initializer, - weight_regularizer=weight_regularizer, - weight_constraint=weight_constraint, - bias_constraint=bias_constraint, - num_partitions=num_partitions, - activation=activation, - use_bias=use_bias, - bias_initializer=bias_initializer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - trainable=trainable, - name=name, - dtype=dtype, - _scope=name, - _reuse=reuse, - **kwargs) - - return layer(inputs) diff --git a/twml/twml/contrib/layers/hashed_percentile_discretizer.py b/twml/twml/contrib/layers/hashed_percentile_discretizer.py deleted file mode 100644 index b32c3be8d..000000000 --- a/twml/twml/contrib/layers/hashed_percentile_discretizer.py +++ /dev/null @@ -1,217 +0,0 @@ -# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes -""" -Implementing HashedPercentileDiscretizer Layer -""" - - -from twitter.deepbird.util.hashing import ( - integer_multiplicative_hashing_uniform, - integer_multiplicative_hashing, -) # noqa: F401 - -from libtwml import percentile_discretizer_bin_indices -import numpy as np -import tensorflow.compat.v1 as tf -import twml -from twml.layers.layer import Layer -from twml.layers.partition import Partition -from twml.layers.stitch import Stitch - - -class HashedPercentileDiscretizer(Layer): - """ - HashedPercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator - after accumulating data - and performing minimum description length (PercentileDiscretizer) calibration. - - HashedPercentileDiscretizer takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an HashedPercentileDiscretizer - bin. - Each HashedPercentileDiscretizer input feature is converted to n_bin bins. - Each HashedPercentileDiscretizer calibration tries to find bin delimiters such - that the number of features values - per bin is roughly equal (for each given HashedPercentileDiscretizer feature). - Note that if an input feature is rarely used, so will its associated output bin/features. - The difference between this layer and PercentileDiscretizer is that the - DeterministicPercentileDiscretize always assigns the same output id in the SparseTensor to the - same input feature id + bin. This is useful if you want to user transfer learning on pre-trained - sparse to dense embedding layers, but re-calibrate your discretizer on newer data. - """ - - def __init__(self, n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, - hash_fn=integer_multiplicative_hashing_uniform, **kwargs): - """ - Creates a non-initialized `HashedPercentileDiscretizer` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during HashedPercentileDiscretizer calibration. - This is the number of features in the hash map. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of HashedPercentileDiscretizer bins used for - HashedPercentileDiscretizer calibration. 
Used to initialize bin_values, hash_keys, - hash_values, bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the feature IDs that HashedPercentileDiscretizer discretizes and knows - about. The hash map (hash_keys->hash_values) is used for two reasons: - 1. divide inputs into two feature spaces: - HashedPercentileDiscretizer vs non-HashedPercentileDiscretizer - 2. translate the HashedPercentileDiscretizer features into a hash_feature ID that - HashedPercentileDiscretizer understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for HashedPercentileDiscretizer. - bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the HashedPercentileDiscretizer features will be translated. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, its value bins are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and an input value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). - hash_fn: - a function that takes in `feature_ids`, `bucket_indices` and `output_size` and - hashes the bucketed features into the `output_size` buckets. The default uses Knuth's - multiplicative hashing. - """ - super(HashedPercentileDiscretizer, self).__init__(**kwargs) - - max_discretizer_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - if not self.built: - self.build(input_shape=None) - - # build variables - self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - self._out_bits = out_bits - - hash_keys = hash_keys - if hash_keys is None: - hash_keys = np.empty(n_feature, dtype=np.int64) - - hash_values = hash_values - if hash_values is None: - hash_values = np.empty(n_feature, dtype=np.int64) - - initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) - self.hash_map = tf.lookup.StaticHashTable(initializer, -1) - self.bin_ids = bin_ids - if bin_ids is None: - bin_ids = np.empty(max_discretizer_feature, dtype=np.int64) - - self.bin_values = bin_values - if bin_values is None: - bin_values = np.empty(max_discretizer_feature, dtype=np.float32) - - self.feature_offsets = feature_offsets - if feature_offsets is None: - feature_offsets = np.empty(n_feature, dtype=np.int64) - - self.hash_fn = hash_fn - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer: - hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. - """ - # build layers - self.partition = Partition() - self.stitch = Stitch() - # make sure this is last - self.built = True - - def call(self, inputs, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements HashedPercentileDiscretizer inference where inputs are intersected with a - hash_map. - Part of the inputs are discretized using twml.discretizer - to produce a discretizer_output SparseTensor.
- This SparseTensor is then joined with the original inputs SparseTensor, - but only for the inputs keys that did not get discretized. - - Args: - inputs: A 2D SparseTensor that is input to HashedPercentileDiscretizer for - discretization. It has a dense_shape of [batch_size, input_size] - name: A name for the operation (optional). - Returns: - A `SparseTensor` of the same type as `inputs`. - Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. - """ - if isinstance(inputs, tf.SparseTensor): - inputs = twml.SparseTensor.from_tf(inputs) - - assert(isinstance(inputs, twml.SparseTensor)) - - # sparse column indices - ids = inputs.ids - # sparse row indices - keys = inputs.indices - # sparse values - vals = inputs.values - - hashed_keys = self.hash_map.lookup(keys) - hashed_keys = tf.cast(hashed_keys, tf.int64) - - found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) - partition_ids = tf.cast(found, tf.int32) - - found = tf.reshape(found, [-1]) - continuous_feature_ids = tf.boolean_mask(keys, found) - - vals, key, indices = self.partition(partition_ids, vals, tf.where(found, hashed_keys, keys)) - non_discretizer_keys, discretizer_in_keys = key - non_discretizer_vals, discretizer_in_vals = vals - - non_discretizer_keys = twml.util.limit_bits(non_discretizer_keys, self._out_bits) - self.non_discretizer_keys = non_discretizer_keys - - # run HashedPercentileDiscretizer on the keys/values it knows about - output = percentile_discretizer_bin_indices(discretizer_in_keys, - discretizer_in_vals, - self.bin_ids, - self.bin_values, - self.feature_offsets) - discretizer_bucket_idxs, discretizer_vals = output - new_discretizer_keys = self.hash_fn(continuous_feature_ids, discretizer_bucket_idxs, - self.output_size) - # Stitch the keys and values from discretizer and non discretizer indices back, with help - # of the Stitch Layer - self.discretizer_out_keys = new_discretizer_keys - - concat_data = self.stitch([non_discretizer_vals, discretizer_vals], - [non_discretizer_keys, new_discretizer_keys], - indices) - - concat_vals, concat_keys = concat_data - - # Generate output shape using _compute_output_shape - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self.output_size] - return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() diff --git a/twml/twml/contrib/layers/hashing_discretizer.py b/twml/twml/contrib/layers/hashing_discretizer.py deleted file mode 100644 index 2a8244f4b..000000000 --- a/twml/twml/contrib/layers/hashing_discretizer.py +++ /dev/null @@ -1,156 +0,0 @@ -# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes -""" -Implementing HashingDiscretizer Layer -""" - - -import libtwml -import tensorflow.compat.v1 as tf -import twml -from twml.constants import HashingDiscretizerOptions -from twml.layers.layer import Layer - - -class HashingDiscretizer(Layer): - """A layer that discretizes continuous features, with hashed feature assignments - - HashingDiscretizer converts sparse continuous features into sparse - binary features. Each binary output feature indicates the presence of a - value in a HashingDiscretizer bin. - - Each calibrated HashingDiscretizer input feature is converted to n_bin+1 bins. - - - n_bin bin boundaries for each feature (i.e. 
len(bin_vals[id])==n_bin) defines n_bin+1 bins - - bin assignment = sum(bin_vals 0: - # pass all inputs to the c++ op - # the op determines whether to discretize (when a feature is calibrated), - # or whether to simply limit bits and pass through (when not calibrated) - # NOTE - Hashing is done in C++ - discretizer_keys, discretizer_vals = libtwml.ops.hashing_discretizer( - input_ids=keys, # Input - input_vals=vals, # Input - bin_vals=self._bin_vals, # Input - feature_ids=tf.make_tensor_proto(self._feature_ids), # Attr - n_bin=self._n_bin, # Attr - output_bits=self._out_bits, # Attr - cost_per_unit=self.cost_per_unit, # Attr - options=self._options, # Attr - ) - else: - discretizer_keys = twml.util.limit_bits(keys, self._out_bits) - discretizer_vals = vals - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_size = tf.convert_to_tensor(1 << self._out_bits, tf.int64) - output_shape = [batch_size, output_size] - - return twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf() diff --git a/twml/twml/contrib/layers/mask_layer.py b/twml/twml/contrib/layers/mask_layer.py deleted file mode 100644 index f5e788c7b..000000000 --- a/twml/twml/contrib/layers/mask_layer.py +++ /dev/null @@ -1,29 +0,0 @@ -from twml.contrib.pruning import apply_mask -from twml.layers import Layer - - -class MaskLayer(Layer): - """ - This layer corresponds to `twml.contrib.pruning.apply_mask`. - - It applies a binary mask to mask out channels of a given tensor. The masks can be - optimized using `twml.contrib.trainers.PruningDataRecordTrainer`. - """ - - def call(self, inputs, **kwargs): - """ - Applies a binary mask to the channels of the input. - - Arguments: - inputs: - input tensor - **kwargs: - additional keyword arguments - - Returns: - Masked tensor - """ - return apply_mask(inputs) - - def compute_output_shape(self, input_shape): - return input_shape diff --git a/twml/twml/contrib/layers/stacked_rnn.py b/twml/twml/contrib/layers/stacked_rnn.py deleted file mode 100644 index e05f5d853..000000000 --- a/twml/twml/contrib/layers/stacked_rnn.py +++ /dev/null @@ -1,189 +0,0 @@ - -from twitter.deepbird.compat.v1.rnn import stack_bidirectional_dynamic_rnn - -import tensorflow.compat.v1 as tf -import tensorflow -import twml - - -def _get_rnn_cell_creator(cell_type): - if cell_type == "LSTM": - Cell = tf.nn.rnn_cell.LSTMCell - elif cell_type == "GRU": - Cell = tf.nn.rnn_cell.GRUCell - else: - raise ValueError("cell_type: %s is not supported." - "It should be one of 'LSTM' or 'GRU'." 
% cell_type) - return Cell - - -def _apply_dropout_wrapper(rnn_cells, dropout): - """ Apply dropout wrapper around each cell if necessary """ - if rnn_cells is None: - return None - - cells = [] - for i, dropout_rate in enumerate(dropout): - cell = rnn_cells[i] - if dropout_rate > 0: - cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=(1.0 - dropout_rate)) - cells.append(cell) - return cells - - -def _create_bidirectional_rnn_cell(num_units, dropout, cell_type): - scope_name = "lstm" if cell_type else "gru" - with tf.variable_scope(scope_name): - Cell = _get_rnn_cell_creator(cell_type) - cells_forward = [Cell(output_size) for output_size in num_units] - cells_backward = [Cell(output_size) for output_size in num_units] - cells_forward = _apply_dropout_wrapper(cells_forward, dropout) - cells_backward = _apply_dropout_wrapper(cells_backward, dropout) - - def stacked_rnn_cell(inputs, sequence_lengths): - with tf.variable_scope(scope_name): - outputs, final_states, _ = stack_bidirectional_dynamic_rnn( - cells_fw=cells_forward, cells_bw=cells_backward, inputs=inputs, - sequence_length=sequence_lengths, dtype=inputs.dtype) - return final_states[-1][-1] - - return stacked_rnn_cell - - -def _create_unidirectional_rnn_cell(num_units, dropout, cell_type): - scope_name = "lstm" if cell_type else "gru" - with tf.variable_scope(scope_name): - Cell = _get_rnn_cell_creator(cell_type) - cells = [Cell(output_size) for output_size in num_units] - cells = _apply_dropout_wrapper(cells, dropout) - multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells) - - def stacked_rnn_cell(inputs, sequence_lengths): - with tf.variable_scope(scope_name): - outputs, final_states = tf.nn.static_rnn( - multi_cell, - tf.unstack(inputs, axis=1), - dtype=inputs.dtype, - sequence_length=sequence_lengths) - return final_states[-1].h - - return stacked_rnn_cell - - -def _create_regular_rnn_cell(num_units, dropout, cell_type, is_bidirectional): - if is_bidirectional: - return _create_bidirectional_rnn_cell(num_units, dropout, cell_type) - else: - return _create_unidirectional_rnn_cell(num_units, dropout, cell_type) - - -class StackedRNN(twml.layers.Layer): - """ - Layer for stacking RNN modules. - This layer provides a unified interface for RNN modules that perform well on CPUs and GPUs. - - Arguments: - num_units: - A list specifying the number of units per layer. - dropout: - Dropout applied to the input of each cell. - If list, has to dropout used for each layer. - If number, the same amount of dropout is used everywhere. - Defaults to 0. - is_training: - Flag to specify if the layer is used in training mode or not. - cell_type: - Sepcifies the type of RNN. Can be "LSTM". "GRU" is not yet implemented. - is_bidirectional: - Specifies if the stacked RNN layer is bidirectional. - This is for forward compatibility, this is not yet implemented. - Defaults to False. 
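    Example:
      A minimal usage sketch (batch size, sequence length, embedding size and unit sizes
      are assumed for illustration):

        # inputs: [batch_size=32, max_sequence_length=20, embedding_size=64]
        inputs = tf.placeholder(tf.float32, [32, 20, 64])
        lengths = tf.placeholder(tf.int32, [32])
        rnn = StackedRNN(num_units=[128, 64], dropout=0.2, cell_type="LSTM")
        final_state = rnn(inputs, lengths)  # final hidden state of the last layer, shape [32, 64]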
- """ - - def __init__(self, - num_units, - dropout=0, - is_training=True, - cell_type="LSTM", - is_bidirectional=False, - name="stacked_rnn"): - - super(StackedRNN, self).__init__(name=name) - - if (is_bidirectional): - raise NotImplementedError("Bidirectional RNN is not yet implemented") - - if (cell_type != "LSTM"): - raise NotImplementedError("Only LSTMs are supported") - - if not isinstance(num_units, (list, tuple)): - num_units = [num_units] - else: - num_units = num_units - - self.num_layers = len(num_units) - if not isinstance(dropout, (tuple, list)): - dropout = [dropout] * self.num_layers - else: - dropout = dropout - - self.is_training = is_training - - is_gpu_available = twml.contrib.utils.is_gpu_available() - same_unit_size = all(size == num_units[0] for size in num_units) - same_dropout_rate = any(val == dropout[0] for val in dropout) - - self.stacked_rnn_cell = None - self.num_units = num_units - self.dropout = dropout - self.cell_type = cell_type - self.is_bidirectional = is_bidirectional - - def build(self, input_shape): - self.stacked_rnn_cell = _create_regular_rnn_cell(self.num_units, - self.dropout, - self.cell_type, - self.is_bidirectional) - - def call(self, inputs, sequence_lengths): - """ - Arguments: - inputs: - A tensor of size [batch_size, max_sequence_length, embedding_size]. - sequence_lengths: - The length of each input sequence in the batch. Should be of size [batch_size]. - Returns: - final_output - The output of at the end of sequence_length. - """ - return self.stacked_rnn_cell(inputs, sequence_lengths) - - -def stacked_rnn(inputs, sequence_lengths, num_units, - dropout=0, is_training=True, - cell_type="LSTM", is_bidirectional=False, name="stacked_rnn"): - """Functional interface for StackedRNN - Arguments: - inputs: - A tensor of size [batch_size, max_sequence_length, embedding_size]. - sequence_lengths: - The length of each input sequence in the batch. Should be of size [batch_size]. - num_units: - A list specifying the number of units per layer. - dropout: - Dropout applied to the input of each cell. - If list, has to dropout used for each layer. - If number, the same amount of dropout is used everywhere. - Defaults to 0. - is_training: - Flag to specify if the layer is used in training mode or not. - cell_type: - Sepcifies the type of RNN. Can be "LSTM" or "GRU". - is_bidirectional: - Specifies if the stacked RNN layer is bidirectional. - Defaults to False. - Returns - outputs, state. - """ - rnn = StackedRNN(num_units, dropout, is_training, cell_type, is_bidirectional, name) - return rnn(inputs, sequence_lengths) diff --git a/twml/twml/contrib/layers/zscore_normalization.py b/twml/twml/contrib/layers/zscore_normalization.py deleted file mode 100644 index 8a1064965..000000000 --- a/twml/twml/contrib/layers/zscore_normalization.py +++ /dev/null @@ -1,247 +0,0 @@ -""" -Contains the twml.layers.ZscoreNormalization layer. -""" -from twml.layers.layer import Layer -import tensorflow.compat.v1 as tf - -from tensorflow.python.training import moving_averages - - -# This is copied from tensorflow.contrib.framework.python.ops.add_model_variable in 1.15 -# Not available in 2.x -# TODO: Figure out if this is really necessary. -def _add_model_variable(var): - """Adds a variable to the `GraphKeys.MODEL_VARIABLES` collection. - Args: - var: a variable. 
- """ - if var not in tf.get_collection(tf.GraphKeys.MODEL_VARIABLES): - tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, var) - - -def update_moving_variable(batch_var, moving_var, decay, zero_debias=True, name=None): - update_op = moving_averages.assign_moving_average( - moving_var, batch_var, decay, zero_debias=zero_debias, name=None) - _add_model_variable(moving_var) - with tf.control_dependencies([update_op]): - return tf.identity(moving_var) - - -class ZscoreNormalization(Layer): - """ - Perform z-score normalization using moving mean and std. - Missing values are not included during mean/std calculation - This layer should only be used right after input layer. - - Args: - decay: - using large decay to include longer moving means. - data_type: - use float64 to prevent overflow during variance calculation. - name: - Layer name - Returns: - A layer representing the output of the ZscoreNormalization transformation. - """ - - def __init__( - self, - decay=0.9999, - data_type=tf.float64, - name=None, - **kwargs): - super(ZscoreNormalization, self).__init__(name=name, **kwargs) - self.epsilon = tf.constant(1., data_type) - self.decay = decay - self.data_type = data_type - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the moving_mean and moving_var tf.Variables of the layer.""" - input_dim = input_shape[1] - self.moving_mean = self.add_variable( - '{}_mean/EMA'.format(self.name), - initializer=tf.constant_initializer(), - shape=[input_dim], - dtype=self.data_type, - trainable=False - ) - self.moving_var = self.add_variable( - '{}_variance/EMA'.format(self.name), - initializer=tf.constant_initializer(), - shape=[input_dim], - dtype=self.data_type, - trainable=False - ) - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - """ - - return input_shape - - def _training_pass(self, input, dense_mask, input_dtype, handle_single, zero_debias): - epsilon = self.epsilon - moving_mean, moving_var = self.moving_mean, self.moving_var - # calculate the number of exisiting value for each feature - tensor_batch_num = tf.reduce_sum(tf.cast(dense_mask, self.data_type), axis=0) - mask_ones = tf.cast(tensor_batch_num, tf.bool) - eps_vector = tf.fill(tf.shape(tensor_batch_num), epsilon) - # the following filled 0 with epision - tensor_batch_num_eps = tf.where(mask_ones, - tensor_batch_num, - eps_vector - ) - tensor_batch_num_eps_broacast = tf.expand_dims(tensor_batch_num_eps, 0) - tensor_batch_divided = input / tensor_batch_num_eps_broacast - tensor_batch_mean = tf.reduce_sum(tensor_batch_divided, axis=0) - - # update moving mean here, and use it to calculate the std. - tensor_moving_mean = update_moving_variable(tensor_batch_mean, moving_mean, self.decay, - zero_debias, name="mean_ema_op") - - tensor_batch_sub_mean = input - tf.expand_dims(tensor_moving_mean, 0) - tensor_batch_sub_mean = tf.where(dense_mask, - tensor_batch_sub_mean, - tf.zeros_like(tensor_batch_sub_mean)) - # divided by sqrt(n) before square, and then do summation for numeric stability. 
- broad_sqrt_num_eps = tf.expand_dims(tf.sqrt(tensor_batch_num_eps), 0) - tensor_batch_sub_mean_div = tensor_batch_sub_mean / broad_sqrt_num_eps - tensor_batch_sub_mean_div_square = tf.square(tensor_batch_sub_mean_div) - tensor_batch_var = tf.reduce_sum(tensor_batch_sub_mean_div_square, axis=0) - - # update moving var here, dont replace 0 with eps before updating. - tensor_moving_var = update_moving_variable(tensor_batch_var, moving_var, self.decay, - zero_debias, name="var_ema_op") - - # if std is 0, replace it with epsilon - tensor_moving_std = tf.sqrt(tensor_moving_var) - tensor_moving_std_eps = tf.where(tf.equal(tensor_moving_std, 0), - eps_vector, - tensor_moving_std) - - missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(tensor_moving_std_eps, 0) - - if handle_single: - # if std==0 and value not missing, reset it to 1. - moving_var_mask_zero = tf.math.equal(tensor_moving_var, 0) - moving_var_mask_zero = tf.expand_dims(moving_var_mask_zero, 0) - missing_input_norm = tf.where( - tf.math.logical_and(dense_mask, moving_var_mask_zero), - tf.ones_like(missing_input_norm), - missing_input_norm - ) - if input_dtype != self.data_type: - missing_input_norm = tf.cast(missing_input_norm, input_dtype) - return missing_input_norm - - def _infer_pass(self, input, dense_mask, input_dtype, handle_single): - epsilon = tf.cast(self.epsilon, input_dtype) - testing_moving_mean = tf.cast(self.moving_mean, input_dtype) - tensor_moving_std = tf.cast(tf.sqrt(self.moving_var), input_dtype) - - broad_mean = tf.expand_dims(testing_moving_mean, 0) - tensor_batch_sub_mean = input - broad_mean - - tensor_batch_sub_mean = tf.where(dense_mask, - tensor_batch_sub_mean, - tf.zeros_like(tensor_batch_sub_mean) - ) - tensor_moving_std_eps = tf.where(tf.equal(tensor_moving_std, 0), - tf.fill(tf.shape(tensor_moving_std), epsilon), - tensor_moving_std) - missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(tensor_moving_std_eps, 0) - if handle_single: - # if std==0 and value not missing, reset it to 1. - moving_var_broad = tf.expand_dims(tensor_moving_std, 0) - moving_var_mask_zero = tf.math.logical_not(tf.cast(moving_var_broad, tf.bool)) - - missing_input_norm = tf.where(tf.math.logical_and(dense_mask, moving_var_mask_zero), - tf.ones_like(missing_input_norm), - missing_input_norm - ) - return missing_input_norm - - def call( - self, - input, - is_training, - dense_mask=None, - zero_debias=True, - handle_single=False): - """ - Args: - ----------- - input: B x D : float32/float64 - missing value must be set to 0. - is_training: bool - training phase or testing phase - dense_mask: B x D : bool - missing value should be marked as 0, non-missing as 1. same shape as input - zero_debias: bool - bias correction of the moving average. (biased towards 0 in the beginning. - see adam paper. https://arxiv.org/abs/1412.6980) - handle_single: bool - if std==0, and feature is not missing value, set the value to 1, instead of 0. - This is super rare if input only consists of continous feature. - But if one-hot feature is included, - they will all have same values 1, in that case, make sure to set handle_single to true. 
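        Example:
          A minimal usage sketch (the feature tensor is assumed; missing values must
          already be encoded as 0):

            norm = ZscoreNormalization(decay=0.9999, name="zscore_input")
            # features: [batch_size, num_features], float32/float64
            normalized = norm(features, is_training=True)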
- """ - - if dense_mask is None: - dense_mask = tf.math.logical_not(tf.equal(input, 0)) - input_dtype = input.dtype - - if is_training: - if input_dtype != self.data_type: - input = tf.cast(input, self.data_type) - return self._training_pass(input, dense_mask, input_dtype, handle_single, zero_debias) - else: - return self._infer_pass(input, dense_mask, input_dtype, handle_single) - - -def zscore_normalization( - input, - is_training, - decay=0.9999, - data_type=tf.float64, - name=None, - dense_mask=None, - zero_debias=True, - handle_single=False, **kwargs): - """ - Args: - ------------ - input: B x D : float32/float64 - missing value must be set to 0. - is_training: bool - training phase or testing phase - decay: - using large decay to include longer moving means. - data_type: - use float64 to zprevent overflow during variance calculation. - name: - Layer name - dense_mask: B x D : bool - missing value should be marked as 0, non-missing as 1. same shape as input - zero_debias: bool - bias correction of the moving average. (biased towards 0 in the beginning. - see adam paper. https://arxiv.org/abs/1412.6980) - handle_single: bool - if std==0, and feature is not missing value, set the value to 1, instead of 0. - This is super rare if input only consists of continous feature. - But if one-hot feature is included, - they will all have same values 1, in that case, make sure to set handle_single to true. - """ - - norm_layer = ZscoreNormalization(decay=decay, data_type=data_type, name=name, **kwargs) - return norm_layer(input, - is_training, - dense_mask=dense_mask, - zero_debias=zero_debias, - handle_single=handle_single) diff --git a/twml/twml/contrib/metrics/__init__.py b/twml/twml/contrib/metrics/__init__.py deleted file mode 100644 index 37e6563c9..000000000 --- a/twml/twml/contrib/metrics/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# pylint: disable=wildcard-import -"""This module contains experimental metric(s) for search and ranking""" - -from .search_metrics import get_search_metric_fn, ndcg # noqa: F401 -from .metrics import * # noqa: F401 diff --git a/twml/twml/contrib/metrics/metrics.py b/twml/twml/contrib/metrics/metrics.py deleted file mode 100644 index dea1a5273..000000000 --- a/twml/twml/contrib/metrics/metrics.py +++ /dev/null @@ -1,209 +0,0 @@ -""" -Module containing extra tensorflow metrics used at Twitter. -This module conforms to conventions used by tf.metrics.*. -In particular, each metric constructs two subgraphs: value_op and update_op: - - The value op is used to fetch the current metric value. - - The update_op is used to accumulate into the metric. - -Note: similar to tf.metrics.*, metrics in here do not support multi-label learning. -We will have to write wrapper classes to create one metric per label. - -Note: similar to tf.metrics.*, batches added into a metric via its update_op are cumulative! - -""" - -from collections import OrderedDict - -import tensorflow.compat.v1 as tf -from twml.metrics import get_multi_binary_class_metric_fn - - - -# checkstyle: noqa -def get_partial_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1, predcols=None): - - def get_eval_metric_ops(graph_output, labels, weights): - if predcols is None: - preds = graph_output['output'] - else: - if isinstance(predcols, int): - predcol_list=[predcols] - else: - predcol_list=list(predcols) - for col in predcol_list: - assert 0 <= col < graph_output['output'].shape[class_dim], 'Invalid Prediction Column Index !' 
- preds = tf.gather(graph_output['output'], indices=predcol_list, axis=class_dim) # [batchSz, num_col] - labels = tf.gather(labels, indices=predcol_list, axis=class_dim) # [batchSz, num_col] - - predInfo = {'output': preds} - if 'threshold' in graph_output: - predInfo['threshold'] = graph_output['threshold'] - if 'hard_output' in graph_output: - predInfo['hard_output'] = graph_output['hard_output'] - - metrics_op = get_multi_binary_class_metric_fn(metrics, classes, class_dim) - metrics_op_res = metrics_op(predInfo, labels, weights) - return metrics_op_res - - return get_eval_metric_ops - - - -# Numeric Prediction Performance among TopK Predictions -def mean_numeric_label_topK(labels, predictions, weights, name, topK_id): - top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0) # [topK, 1] - return tf.metrics.mean(values=top_k_labels, name=name) - -def mean_gated_numeric_label_topK(labels, predictions, weights, name, topK_id, bar=2.0): - assert isinstance(bar, int) or isinstance(bar, float), "bar must be int or float" - top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0) # [topK, 1] - gated_top_k_labels = tf.cast(top_k_labels > bar*1.0, tf.int32) - return tf.metrics.mean(values=gated_top_k_labels, name=name) - -SUPPORTED_NUMERIC_METRICS = { - 'mean_numeric_label_topk': mean_numeric_label_topK, - 'mean_gated_numeric_label_topk': mean_gated_numeric_label_topK -} -DEFAULT_NUMERIC_METRICS = ['mean_numeric_label_topk', 'mean_gated_numeric_label_topk'] - - - -def get_metric_topK_fn_helper(targetMetrics, supportedMetrics_op, metrics=None, topK=(5,5,5), predcol=None, labelcol=None): - """ - :param targetMetrics: Target Metric List - :param supportedMetrics_op: Supported Metric Operators Dict - :param metrics: Metric Set to evaluate - :param topK: (topK_min, topK_max, topK_delta) Tuple - :param predcol: Prediction Column Index - :param labelcol: Label Column Index - :return: - """ - # pylint: disable=dict-keys-not-iterating - if targetMetrics is None or supportedMetrics_op is None: - raise ValueError("Invalid Target Metric List/op !") - - targetMetrics = set([m.lower() for m in targetMetrics]) - if metrics is None: - metrics = list(targetMetrics) - else: - metrics = [m.lower() for m in metrics if m.lower() in targetMetrics] - - num_k = int((topK[1]-topK[0])/topK[2]+1) - topK_list = [topK[0]+d*topK[2] for d in range(num_k)] - if 1 not in topK_list: - topK_list = [1] + topK_list - - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. - """ - eval_metric_ops = OrderedDict() - - if predcol is None: - pred = graph_output['output'] - else: - assert 0 <= predcol < graph_output['output'].shape[1], 'Invalid Prediction Column Index !' - assert labelcol is not None - pred = tf.reshape(graph_output['output'][:, predcol], shape=[-1, 1]) - labels = tf.reshape(labels[:, labelcol], shape=[-1, 1]) - numOut = graph_output['output'].shape[1] - pred_score = tf.reshape(graph_output['output'][:, numOut-1], shape=[-1, 1]) - - # add metrics to eval_metric_ops dict - for metric_name in metrics: - metric_name = metric_name.lower() # metric name are case insensitive. 
- - if metric_name in supportedMetrics_op: - metric_factory = supportedMetrics_op.get(metric_name) - - if 'topk' not in metric_name: - value_op, update_op = metric_factory( - labels=labels, - predictions=pred, - weights=weights, - name=metric_name) - eval_metric_ops[metric_name] = (value_op, update_op) - else: - for K in topK_list: - K_min = tf.minimum(K, tf.shape(pred_score)[0]) - topK_id = tf.nn.top_k(tf.reshape(pred_score, shape=[-1]), k=K_min)[1] # [topK] - value_op, update_op = metric_factory( - labels=labels, - predictions=pred, - weights=weights, - name=metric_name+'__k_'+str(K), - topK_id=topK_id) - eval_metric_ops[metric_name+'__k_'+str(K)] = (value_op, update_op) - - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops - - - -def get_numeric_metric_fn(metrics=None, topK=(5,5,5), predcol=None, labelcol=None): - if metrics is None: - metrics = list(DEFAULT_NUMERIC_METRICS) - metrics = list(set(metrics)) - - metric_op = get_metric_topK_fn_helper(targetMetrics=list(DEFAULT_NUMERIC_METRICS), - supportedMetrics_op=SUPPORTED_NUMERIC_METRICS, - metrics=metrics, topK=topK, predcol=predcol, labelcol=labelcol) - return metric_op - - - -def get_single_binary_task_metric_fn(metrics, classnames, topK=(5,5,5), use_topK=False): - """ - graph_output['output']: [BatchSz, 1] [pred_Task1] - labels: [BatchSz, 2] [Task1, NumericLabel] - """ - def get_eval_metric_ops(graph_output, labels, weights): - metric_op_base = get_partial_multi_binary_class_metric_fn(metrics, predcols=0, classes=classnames) - classnames_unw = ['unweighted_'+cs for cs in classnames] - metric_op_unw = get_partial_multi_binary_class_metric_fn(metrics, predcols=0, classes=classnames_unw) - - metrics_base_res = metric_op_base(graph_output, labels, weights) - metrics_unw_res = metric_op_unw(graph_output, labels, None) - metrics_base_res.update(metrics_unw_res) - - if use_topK: - metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=topK, predcol=0, labelcol=1) - metrics_numeric_res = metric_op_numeric(graph_output, labels, weights) - metrics_base_res.update(metrics_numeric_res) - return metrics_base_res - - return get_eval_metric_ops - - -def get_dual_binary_tasks_metric_fn(metrics, classnames, topK=(5,5,5), use_topK=False): - """ - graph_output['output']: [BatchSz, 3] [pred_Task1, pred_Task2, Score] - labels: [BatchSz, 3] [Task1, Task2, NumericLabel] - """ - def get_eval_metric_ops(graph_output, labels, weights): - - metric_op_base = get_partial_multi_binary_class_metric_fn(metrics, predcols=[0, 1], classes=classnames) - classnames_unw = ['unweighted_'+cs for cs in classnames] - metric_op_unw = get_partial_multi_binary_class_metric_fn(metrics, predcols=[0, 1], classes=classnames_unw) - - metrics_base_res = metric_op_base(graph_output, labels, weights) - metrics_unw_res = metric_op_unw(graph_output, labels, None) - metrics_base_res.update(metrics_unw_res) - - if use_topK: - metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=topK, predcol=2, labelcol=2) - metrics_numeric_res = metric_op_numeric(graph_output, labels, weights) - metrics_base_res.update(metrics_numeric_res) - return metrics_base_res - - return get_eval_metric_ops diff --git a/twml/twml/contrib/metrics/search_metrics.py b/twml/twml/contrib/metrics/search_metrics.py deleted file mode 100644 index 7d7a502f1..000000000 --- a/twml/twml/contrib/metrics/search_metrics.py +++ /dev/null @@ -1,292 +0,0 @@ -""" -Module containing extra tensorflow metrics used at Twitter. 
-This module conforms to conventions used by tf.metrics.*. -In particular, each metric constructs two subgraphs: value_op and update_op: - - The value op is used to fetch the current metric value. - - The update_op is used to accumulate into the metric. - -Note: similar to tf.metrics.*, metrics in here do not support multi-label learning. -We will have to write wrapper classes to create one metric per label. - -Note: similar to tf.metrics.*, batches added into a metric via its update_op are cumulative! - -""" - -from collections import OrderedDict -from functools import partial - -import tensorflow.compat.v1 as tf -from tensorflow.python.eager import context -from tensorflow.python.framework import dtypes, ops -from tensorflow.python.ops import array_ops, state_ops -import twml -from twml.contrib.utils import math_fns - - -def ndcg(labels, predictions, - metrics_collections=None, - updates_collections=None, - name=None, - top_k_int=1): - # pylint: disable=unused-argument - """ - Compute full normalized discounted cumulative gain (ndcg) based on predictions - ndcg = dcg_k/idcg_k, where k is a cut-off ranking position - There are a few variants of ndcg - The dcg (discounted cumulative gain) formula used in - twml.contrib.metrics.ndcg is:: - - \\sum_{i=1}^k \\frac{2^{relevance\\_score} - 1}{\\log_{2}(i + 1)} - - k is the length of items to be ranked in a batch/query - Note that whether k should be replaced with a fixed value is still an open question - The scores in predictions are transformed to order and relevance scores to calculate ndcg - A relevance score means how relevant a DataRecord is to a particular query - - Arguments: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Returns: - ndcg: A `Tensor` representing the ndcg score. - update_op: An update operation used to accumulate data into this metric.
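    For intuition, an assumed example: a query with three items whose relevance scores are
    [3, 2, 0] and k=3. Ranking them in that order gives
    dcg = (2^3-1)/log2(2) + (2^2-1)/log2(3) + (2^0-1)/log2(4) ≈ 7 + 1.89 + 0 = 8.89,
    which equals the ideal dcg, so ndcg = 1.0. Ranking them in the reverse order gives
    dcg ≈ 0 + 1.89 + 3.5 = 5.39 and ndcg ≈ 5.39 / 8.89 ≈ 0.61.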
- """ - with tf.variable_scope(name, 'ndcg', (labels, predictions)): - label_scores = tf.to_float(labels, name='label_to_float') - predicted_scores = tf.to_float(predictions, name='predictions_to_float') - - if context.executing_eagerly(): - raise RuntimeError('ndcg is not supported when eager execution ' - 'is enabled.') - - total_ndcg = _metric_variable([], dtypes.float32, name='total_ndcg') - count_query = _metric_variable([], dtypes.float32, name='query_count') - - # actual ndcg cutoff position top_k_int - max_prediction_size = array_ops.size(predicted_scores) - top_k_int = tf.minimum(max_prediction_size, top_k_int) - # the ndcg score of the batch - ndcg = math_fns.cal_ndcg(label_scores, - predicted_scores, top_k_int=top_k_int) - # add ndcg of the current batch to total_ndcg - update_total_op = state_ops.assign_add(total_ndcg, ndcg) - with ops.control_dependencies([ndcg]): - # count_query stores the number of queries - # count_query increases by 1 for each batch/query - update_count_op = state_ops.assign_add(count_query, 1) - - mean_ndcg = math_fns.safe_div(total_ndcg, count_query, 'mean_ndcg') - update_op = math_fns.safe_div(update_total_op, update_count_op, 'update_mean_ndcg_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean_ndcg) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return mean_ndcg, update_op - - -# Copied from metrics_impl.py with minor modifications. -# https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39 -def _metric_variable(shape, dtype, validate_shape=True, name=None): - """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" - - return tf.Variable( - lambda: tf.zeros(shape, dtype), - trainable=False, - collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], - validate_shape=validate_shape, - name=name) - - -# binary metric_name: (metric, requires thresholded output) -SUPPORTED_BINARY_CLASS_METRICS = { - # TWML binary metrics - 'rce': (twml.metrics.rce, False), - 'nrce': (partial(twml.metrics.rce, normalize=True), False), - # CTR measures positive sample ratio. This terminology is inherited from Ads. - 'ctr': (twml.metrics.ctr, False), - # predicted CTR measures predicted positive ratio. - 'predicted_ctr': (twml.metrics.predicted_ctr, False), - # thresholded metrics - 'accuracy': (tf.metrics.accuracy, True), - 'precision': (tf.metrics.precision, True), - 'recall': (tf.metrics.recall, True), - # tensorflow metrics - 'roc_auc': (partial(tf.metrics.auc, curve='ROC'), False), - 'pr_auc': (partial(tf.metrics.auc, curve='PR'), False), -} - -# search metric_name: metric -SUPPORTED_SEARCH_METRICS = { - # TWML search metrics - # ndcg needs the raw prediction scores to sort - 'ndcg': ndcg, -} - - -def get_search_metric_fn(binary_metrics=None, search_metrics=None, - ndcg_top_ks=[1, 3, 5, 10], use_binary_metrics=False): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for ranking. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - The following graph_output keys are recognized: - output: - the raw predictions. Required. 
- threshold: - Only used in SUPPORTED_BINARY_CLASS_METRICS - If the labels are 0s and 1s - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - Only used in SUPPORTED_BINARY_CLASS_METRICS - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Arguments: - only used in pointwise learning-to-rank - - binary_metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - ctr (same as positive sample ratio.) - - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When binary_metrics is None (the default), it defaults to all supported metrics - - search_metrics (list of String): - a list of metrics of interest. E.g. ['ndcg'] - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - ndcg - - NOTE: ndcg works for ranking-related problems. - A batch contains all DataRecords that belong to the same query - If pair_in_batch_mode used in scalding -- a batch contains a pair of DataRecords - that belong to the same query and have different labels -- ndcg does not apply here. - - When search_metrics is None (the default), it defaults to all supported search metrics - currently only 'ndcg' - - ndcg_top_ks (list of integers): - The cut-off ranking positions for a query - When ndcg_top_ks is None or empty (the default), it defaults to [1, 3, 5, 10] - - use_binary_metrics: - False (default) - Only set it to true in pointwise learning-to-rank - """ - # pylint: disable=dict-keys-not-iterating - - if ndcg_top_ks is None or not ndcg_top_ks: - ndcg_top_ks = [1, 3, 5, 10] - - if search_metrics is None: - search_metrics = list(SUPPORTED_SEARCH_METRICS.keys()) - - if binary_metrics is None and use_binary_metrics: - # Added SUPPORTED_BINARY_CLASS_METRICS in twml.metrics as well - # they are only used in pointwise learning-to-rank - binary_metrics = list(SUPPORTED_BINARY_CLASS_METRICS.keys()) - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated with the batch. - weights: - weights of the samples.
- """ - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - # hard_preds is a tensor - # check hard_preds is None and then check if it is empty - if hard_preds is None or tf.equal(tf.size(hard_preds), 0): - hard_preds = tf.greater_equal(preds, threshold) - - # add search metrics to eval_metric_ops dict - for metric_name in search_metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - if metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - search_metric_factory = SUPPORTED_SEARCH_METRICS.get(metric_name) - if search_metric_factory: - if metric_name == 'ndcg': - for top_k in ndcg_top_ks: - # metric name will show as ndcg_1, ndcg_10, ... - metric_name_ndcg_top_k = metric_name + '_' + str(top_k) - top_k_int = tf.constant(top_k, dtype=tf.int32) - # Note: having weights in ndcg does not make much sense - # Because ndcg already has position weights/discounts - # Thus weights are not applied in ndcg metric - value_op, update_op = search_metric_factory( - labels=labels, - predictions=preds, - name=metric_name_ndcg_top_k, - top_k_int=top_k_int) - eval_metric_ops[metric_name_ndcg_top_k] = (value_op, update_op) - else: - raise ValueError('Cannot find the search metric named ' + metric_name) - - if use_binary_metrics: - # add binary metrics to eval_metric_ops dict - for metric_name in binary_metrics: - - if metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - metric_name = metric_name.lower() # metric name are case insensitive. - binary_metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - if binary_metric_factory: - value_op, update_op = binary_metric_factory( - labels=labels, - predictions=(hard_preds if requires_threshold else preds), - weights=weights, - name=metric_name) - eval_metric_ops[metric_name] = (value_op, update_op) - else: - raise ValueError('Cannot find the binary metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops diff --git a/twml/twml/contrib/optimizers/__init__.py b/twml/twml/contrib/optimizers/__init__.py deleted file mode 100644 index 112b2b410..000000000 --- a/twml/twml/contrib/optimizers/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# pylint: disable=wildcard-import -"""This module contains experimental optimizer classes""" -from .deep_gradient_compression_optimizer import DeepGradientCompressionOptimizer # noqa: F401 -from .pruning_optimizer import PruningOptimizer # noqa: F401 diff --git a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py b/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py deleted file mode 100644 index 2c71ed13f..000000000 --- a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -A custom optimizer to implement Deep Gradient Compression. The general idea of -gradient compression is to compress the gradients exchanged across machines, -in order to reduce the communication overhead of distributing computing efforts. -More details in https://arxiv.org/abs/1712.01887 -""" - -# TODO: Test how much communication overhead this DeepGradientCompressionOptimizer can reduce under -# multi-GPU and distributed setting. 
- -import tensorflow.compat.v1 as tf - - -def compute_threshold(grad, density): - """ - A utility function to compute the threshold for gradient sparsification, given the gradient - tensor and the density. - Args: - grad(tf.Tensor): - Gradient tensor for some variable. - density(float): - Density degree when sparsifying gradients. - Returns(float): - Threshold for gradient sparsification. - """ - flat_grad = tf.reshape(grad, [-1]) - abs_flat_grad = tf.abs(flat_grad) - size = tf.shape(abs_flat_grad)[0] - k = tf.maximum(tf.constant(1), - tf.cast(tf.scalar_mul(density, tf.cast(size, tf.float32)), tf.int32)) - topk, _ = tf.nn.top_k(abs_flat_grad, k, False) - return topk[-1] - - -def get_top_row_indices(values, density): - """ - A utility function to get indices of most significant rows, given the density degree. - Args: - values(tf.Tensor): - Gradient or locally accumulated gradient for some variable. - density(float): - Density degree when filtering out rows. - Returns(list(int)): - Indices of most significant rows. - """ - abs_values = tf.abs(values) - - try: - row_num = tf.shape(abs_values)[0] - k = tf.maximum(tf.constant(1), - tf.cast(tf.scalar_mul(density, tf.cast(row_num, tf.float32)), tf.int32)) - row_sums = tf.squeeze(tf.reduce_sum(values, axis=1, keepdims=True)) - _, top_row_indices = tf.nn.top_k(row_sums, k=k, sorted=False) - # print "abs_values", abs_values, "row_sums", row_sums - return top_row_indices - # return tf.range(row_num) - - except ValueError: # if the tensor is 0-D or 1-D - return None - - -class DeepGradientCompressionOptimizer(tf.train.GradientDescentOptimizer): - """ - A custom optimizer to implement Deep Gradient Compression (https://arxiv.org/abs/1712.01887). - """ - - def __init__(self, learning_rate, use_locking=False, name="Sparse", - density=1.0, - density_decay=False, - density_decay_steps=10000, - density_decay_rate=0.5, - min_density=0.1, - accumulation=False): - super(DeepGradientCompressionOptimizer, self).__init__(learning_rate, use_locking, name) - self._initial_density_t = tf.convert_to_tensor(density) - self._density_decay = density_decay - dtype = self._initial_density_t.dtype - self._density_decay_steps_t = tf.convert_to_tensor(density_decay_steps, dtype) - self._density_decay_rate_t = tf.convert_to_tensor(density_decay_rate, dtype) - self._min_density_t = tf.convert_to_tensor(min_density, dtype) - self._accumulation = accumulation - - def _prepare(self): - super(DeepGradientCompressionOptimizer, self)._prepare() - if not self._density_decay: - self._density_t = self._initial_density_t - else: - dtype = self._initial_density_t.dtype - global_step = tf.cast(tf.train.get_global_step(), dtype) - p = tf.floor(tf.divide(global_step, self._density_decay_steps_t)) - decayed_density = tf.multiply(self._initial_density_t, - tf.pow(self._density_decay_rate_t, p)) - self._density_t = tf.maximum(self._min_density_t, decayed_density) - - def _create_slots(self, var_list): - """ - Create a slot variable to accumulate gradients locally for each variable in `var_list`. - Args: - var_list(list(tf.Variable)): - List of variables to accumulate gradients locally for. 
- """ - for var in var_list: - self._zeros_slot(var, "g_buffer", self._name) - - def _apply_dense(self, grad, var): - if not self._accumulation: - top_row_indices = get_top_row_indices(grad, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_dense(grad, var) - - sparsified_values = tf.gather(grad, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - else: - g_buffer = self.get_slot(var, "g_buffer") - - g_buffer = tf.assign_add(g_buffer, grad) - - top_row_indices = get_top_row_indices(g_buffer, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_dense(grad, var) - - sparsified_values = tf.gather(g_buffer, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - update_var = super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - update_g_buffer = tf.scatter_update(g_buffer, sparsified_indices, tf.zeros_like( - sparsified_values)) - - return tf.group(*[update_var, update_g_buffer]) - - def _apply_sparse_duplicate_indices(self, grad, var): - if not self._accumulation: - top_row_indices = get_top_row_indices(grad.values, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices(grad, var) # noqa: E501 - - sparsified_values = tf.gather(grad.values, top_row_indices) - sparsified_indices = tf.gather(grad.indices, top_row_indices) - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - else: - g_buffer = self.get_slot(var, "g_buffer") - - g_buffer = tf.scatter_update(g_buffer, grad.indices, grad.values) - - top_row_indices = get_top_row_indices(g_buffer, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, - self)._apply_sparse_duplicate_indices(grad, var) - - sparsified_values = tf.gather(g_buffer, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - update_var = super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - update_g_buffer = tf.scatter_update(g_buffer, sparsified_indices, tf.zeros_like( - sparsified_values)) - - return tf.group(*[update_var, update_g_buffer]) diff --git a/twml/twml/contrib/optimizers/pruning_optimizer.py b/twml/twml/contrib/optimizers/pruning_optimizer.py deleted file mode 100644 index 2bcd612ed..000000000 --- a/twml/twml/contrib/optimizers/pruning_optimizer.py +++ /dev/null @@ -1,164 +0,0 @@ -""" -Provides a general optimizer for pruning features of a neural network. - -The optimizer estimates the computational cost of features, combines this information with pruning -signals indicating their usefulness, and disables features via binary masks at regular intervals. 
- -To make a layer prunable, use `twml.contrib.pruning.apply_mask`: - - dense1 = tf.layers.dense(inputs=inputs, units=50, activation=tf.nn.relu) - dense1 = apply_mask(dense1) - -To prune the network, apply PruningOptimizer to any cross-entropy loss: - - loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) - - optimizer = PruningOptimizer(learning_rate=0.001, momentum=0.5) - minimize = optimizer.minimize( - loss=loss, - prune_every=10, - burn_in=100, - global_step=tf.train.get_global_step()) -""" - -import tensorflow.compat.v1 as tf - -from twml.contrib.pruning import computational_cost, prune, update_pruning_signals -from twml.contrib.pruning import MASK_COLLECTION - - -class PruningOptimizer(tf.train.MomentumOptimizer): - """ - Updates parameters with SGD and pruning masks using Fisher pruning. - - Arguments: - learning_rate: float - Learning rate of SGD - - momentum: float - Momentum used by SGD - - use_locking: bool - If `True`, use locks for update operations - - name: str - Optional name prefix for the operations created when applying gradients - - use_nesterov: bool - If `True`, use Nesterov momentum - """ - - def __init__( - self, - learning_rate, - momentum=0.9, - use_locking=False, - name="PruningOptimizer", - use_nesterov=False): - super(PruningOptimizer, self).__init__( - learning_rate=learning_rate, - momentum=momentum, - use_locking=use_locking, - name=name, - use_nesterov=use_nesterov) - - def minimize( - self, - loss, - prune_every=100, - burn_in=0, - decay=.96, - flops_weight='AUTO', - flops_target=0, - update_params=None, - method='Fisher', - *args, - **kwargs): - """ - Create operations to minimize loss and to prune features. - - A pruning signal measures the importance of feature maps. This is weighed against the - computational cost of computing a feature map. Features are then iteratively pruned - based on a weighted average of feature importance S and computational cost C (in FLOPs): - - $$S + w * C$$ - - Setting `flops_weight` to 'AUTO' is the most convenient and recommended option, but not - necessarily optimal. 
- - Arguments: - loss: tf.Tensor - The value to minimize - - prune_every: int - One entry of a mask is set to zero only every few update steps - - burn_in: int - Pruning starts only after this many parameter updates - - decay: float - Controls exponential moving average of pruning signals - - flops_weight: float or str - Controls the targeted trade-off between computational complexity and performance - - flops_target: float - Stop pruning when computational complexity is less or this many floating point ops - - update_params: tf.Operation - Optional training operation used instead of MomentumOptimizer to update parameters - - method: str - Method used to compute pruning signal (currently only supports 'Fisher') - - Returns: - A `tf.Operation` updating parameters and pruning masks - - References: - * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 - """ - - # gradient-based updates of parameters - if update_params is None: - update_params = super(PruningOptimizer, self).minimize(loss, *args, **kwargs) - - masks = tf.get_collection(MASK_COLLECTION) - - with tf.variable_scope('pruning_opt', reuse=True): - # estimate computational cost per data point - batch_size = tf.cast(tf.shape(masks[0].tensor), loss.dtype)[0] - cost = tf.divide(computational_cost(loss), batch_size, name='computational_cost') - - tf.summary.scalar('computational_cost', cost) - - if masks: - signals = update_pruning_signals(loss, masks=masks, decay=decay, method=method) - - # estimate computational cost per feature map - costs = tf.gradients(cost, masks) - - # trade off computational complexity and performance - if flops_weight.upper() == 'AUTO': - signals = [s / (c + 1e-6) for s, c in zip(signals, costs)] - elif not isinstance(flops_weight, float) or flops_weight != 0.: - signals = [s - flops_weight * c for s, c in zip(signals, costs)] - - counter = tf.Variable(0, name='pruning_counter') - counter = tf.assign_add(counter, 1, use_locking=True) - - # only prune every so often after a burn-in phase - pruning_cond = tf.logical_and(counter > burn_in, tf.equal(counter % prune_every, 0)) - - # stop pruning after reaching threshold - if flops_target > 0: - pruning_cond = tf.logical_and(pruning_cond, tf.greater(cost, flops_target)) - - update_masks = tf.cond( - pruning_cond, - lambda: prune(signals, masks=masks), - lambda: tf.group(masks)) - - return tf.group([update_params, update_masks]) - - # no masks found - return update_params diff --git a/twml/twml/contrib/parsers.py b/twml/twml/contrib/parsers.py deleted file mode 100644 index a27f2acbd..000000000 --- a/twml/twml/contrib/parsers.py +++ /dev/null @@ -1,21 +0,0 @@ -''' -Contains implementations of functions to parse the contrib.FeatureConfig - -Modelers can use the functions in this module as the the train/eval_parse_fn of -the DataRecordTrainer constructor to customize how to parse their datasets. - -Modelers may also provide custom implementations of train/eval_parse_fn using these as reference. 
-''' - -from twitter.deepbird.io.legacy.contrib.parsers import ( - _convert_to_fixed_length_tensor, # noqa: F401 - _get_input_receiver_fn_feature_dict, # noqa: F401 - _merge_dictionaries, # noqa: F401 - get_features_as_tensor_dict, # noqa: F401 - get_keras_parse_fn, # noqa: F401 - get_serving_input_receiver_fn_feature_dict, # noqa: F401 - get_string_tensor_parse_fn, # noqa: F401 - get_string_tensor_serving_input_receiver_fn, # noqa: F401 - get_supervised_input_receiver_fn_feature_dict, # noqa: F401 - parse_string_tensor, # noqa: F401 -) diff --git a/twml/twml/contrib/pruning.py b/twml/twml/contrib/pruning.py deleted file mode 100644 index b6ddee693..000000000 --- a/twml/twml/contrib/pruning.py +++ /dev/null @@ -1,363 +0,0 @@ -""" -This module implements tools for pruning neural networks. - -In particular, it provides tools for dealing with masks: - - features = apply_mask(features) - -The function `apply_mask` applies a binary mask to the channels of a given tensor. Consider the -following loss: - - logits = tf.matmul(features, weights) - loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) - -Each mask has a corresponding pruning signal. The function `update_pruning_signals` will update and -return these signals: - - signals = update_pruning_signals(loss) - -The pruning operation will zero out the mask entry with the smallest corresponding pruning signal: - - prune(signals) - -The following function allows us to estimate the computational cost of a graph (number of FLOPs): - - cost = computational_cost(loss) - -To compute the cost of each feature per data point, we can do: - - costs = tf.gradients(cost / batch_size, masks) - -The current implementation of `computational_cost` is designed to work with standard feed-forward -and convolutional network architectures only, but may fail with more complicated architectures. -""" - - -import numpy as np -import tensorflow.compat.v1 as tf - -MASK_COLLECTION = 'pruning/masks' -MASK_EXTENDED_COLLECTION = 'pruning/masks_extended' -OP_COLLECTION = 'pruning/ops' - - -def apply_mask(tensor, name='pruning'): - """ - Point-wise multiplies a tensor with a binary mask. - - During training, pruning is simulated by setting entries of the mask to zero. - - Arguments: - tensor: tf.Tensor - A tensor where the last dimension represents channels which will be masked - - Returns: - `tf.Tensor` with same shape as `tensor` - """ - - tensor_shape = tensor.shape - - with tf.variable_scope(name, reuse=True): - # allocate masks and corresponding pruning signals - mask = tf.Variable(tf.ones(tensor.shape.as_list()[-1]), trainable=False, name='mask') - pruning_signal = tf.Variable(tf.zeros_like(mask), trainable=False, name='signal') - - # extending masks is a trick to get a separate gradient for each data point - mask_extended = extend_mask(mask, tensor) - - # store extended mask, pruning signal, and other vars for easy access later - mask.extended = mask_extended - mask.pruning_signal = pruning_signal - mask.tensor = tensor - - # mask tensor - tensor = tf.multiply(tensor, mask_extended) - tensor.set_shape(tensor_shape) - tensor._mask = mask - - tf.add_to_collection(MASK_COLLECTION, mask) - tf.add_to_collection(MASK_EXTENDED_COLLECTION, mask.extended) - tf.add_to_collection(OP_COLLECTION, tensor.op) - - return tensor - - -def extend_mask(mask, tensor): - """ - Repeats the mask for each data point stored in a tensor. - - If `tensor` is AxBxC dimensional and `mask` is C dimensional, returns an Ax1xC dimensional - tensor with A copies or `mask`. 
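    For example (shapes assumed for illustration): if `tensor` has shape [32, 10, 64] and `mask`
    has shape [64], the returned extended mask has shape [32, 1, 64] and broadcasts against `tensor`.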
- - Arguments: - mask: tf.Tensor - The mask which will be extended - - tensor: tf.Tensor - The tensor to which the extended mask will be applied - - Returns: - The extended mask - """ - - batch_size = tf.shape(tensor)[:1] - ones = tf.ones([tf.rank(tensor) - 1], dtype=batch_size.dtype) - multiples = tf.concat([batch_size, ones], 0) - mask_shape = tf.concat([ones, [-1]], 0) - return tf.tile(tf.reshape(mask, mask_shape), multiples) - - -def find_input_mask(tensor): - """ - Find ancestral mask affecting the number of pruned channels of a tensor. - - Arguments: - tensor: tf.Tensor - Tensor for which to identify relevant mask - - Returns: - A `tf.Tensor` or `None` - """ - - if hasattr(tensor, '_mask'): - return tensor._mask - if tensor.op.type in ['MatMul', 'Conv1D', 'Conv2D', 'Conv3D', 'Transpose']: - # op produces a new number of channels, preceding mask therefore irrelevant - return None - if not tensor.op.inputs: - return None - for input in tensor.op.inputs: - mask = find_input_mask(input) - if mask is not None: - return mask - - -def find_output_mask(tensor): - """ - Find mask applied to the tensor or one of its descendants if it affects the tensor's pruned shape. - - Arguments: - tensor: tf.Tensor or tf.Variable - Tensor for which to identify relevant mask - - Returns: - A `tf.Tensor` or `None` - """ - - if isinstance(tensor, tf.Variable): - return find_output_mask(tensor.op.outputs[0]) - if hasattr(tensor, '_mask'): - return tensor._mask - for op in tensor.consumers(): - if len(op.outputs) != 1: - continue - if op.type in ['MatMul', 'Conv1D', 'Conv2D', 'Conv3D']: - # masks of descendants are only relevant if tensor is right-multiplied - if tensor == op.inputs[1]: - return find_output_mask(op.outputs[0]) - return None - mask = find_output_mask(op.outputs[0]) - if mask is not None: - return mask - - -def find_mask(tensor): - """ - Returns masks indicating channels of the tensor that are effectively removed from the graph. - - Arguments: - tensor: tf.Tensor - Tensor for which to compute a mask - - Returns: - A `tf.Tensor` with binary entries indicating disabled channels - """ - - input_mask = find_input_mask(tensor) - output_mask = find_output_mask(tensor) - if input_mask is None: - return output_mask - if output_mask is None: - return input_mask - if input_mask is output_mask: - return input_mask - return input_mask * output_mask - - -def pruned_shape(tensor): - """ - Computes the shape of a tensor after taking into account pruning of channels. - - Note that the shape will only differ in the last dimension, even if other dimensions are also - effectively disabled by pruning masks. - - Arguments: - tensor: tf.Tensor - Tensor for which to compute a pruned shape - - Returns: - A `tf.Tensor[tf.float32]` representing the pruned shape - """ - - mask = find_mask(tensor) - - if mask is None: - return tf.cast(tf.shape(tensor), tf.float32) - - return tf.concat([ - tf.cast(tf.shape(tensor)[:-1], mask.dtype), - tf.reduce_sum(mask, keepdims=True)], 0) - - -def computational_cost(op_or_tensor, _observed=None): - """ - Estimates the computational complexity of a pruned graph (number of floating point operations). - - This function currently only supports sequential graphs such as those of MLPs and - simple CNNs with 2D convolutions in NHWC format. - - Note that the computational cost returned by this function is proportional to batch size. 
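As a back-of-the-envelope illustration of the ``MatMul`` estimate used by ``computational_cost`` (hypothetical layer sizes, assuming no channels have been pruned yet):

    # x: [batch, d_in] times W: [d_in, d_out] costs batch * d_out * (2 * d_in - 1) FLOPs
    batch, d_in, d_out = 32, 128, 64
    flops = batch * d_out * (2 * d_in - 1)   # 522,240 FLOPs for this single dense layer

Pruning channels shrinks the effective ``d_in``/``d_out`` via ``pruned_shape``, which lowers this estimate accordingly.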
- - Arguments: - op_or_tensor: tf.Tensor or tf.Operation - Root node of graph for which to compute computational cost - - Returns: - A `tf.Tensor` representing a number of floating point operations - """ - - cost = tf.constant(0.) - - # exclude cost of computing extended pruning masks - masks_extended = [mask.extended for mask in tf.get_collection(MASK_COLLECTION)] - if op_or_tensor in masks_extended: - return cost - - # convert tensor to op - op = op_or_tensor.op if isinstance(op_or_tensor, (tf.Tensor, tf.Variable)) else op_or_tensor - - # make sure cost of op will not be counted twice - if _observed is None: - _observed = [] - elif op in _observed: - return cost - _observed.append(op) - - # compute cost of computing inputs - for tensor in op.inputs: - cost = cost + computational_cost(tensor, _observed) - - # add cost of operation - if op.op_def is None or op in tf.get_collection(OP_COLLECTION): - # exclude cost of undefined ops and pruning ops - return cost - - elif op.op_def.name == 'MatMul': - shape_a = pruned_shape(op.inputs[0]) - shape_b = pruned_shape(op.inputs[1]) - return cost + shape_a[0] * shape_b[1] * (2. * shape_a[1] - 1.) - - elif op.op_def.name in ['Add', 'Mul', 'BiasAdd']: - return cost + tf.cond( - tf.size(op.inputs[0]) > tf.size(op.inputs[1]), - lambda: tf.reduce_prod(pruned_shape(op.inputs[0])), - lambda: tf.reduce_prod(pruned_shape(op.inputs[1]))) - - elif op.op_def.name in ['Conv2D']: - output_shape = pruned_shape(op.outputs[0]) - input_shape = pruned_shape(op.inputs[0]) - kernel_shape = pruned_shape(op.inputs[1]) - inner_prod_cost = (tf.reduce_prod(kernel_shape[:2]) * input_shape[-1] * 2. - 1.) - return cost + tf.reduce_prod(output_shape) * inner_prod_cost - - return cost - - -def update_pruning_signals(loss, decay=.96, masks=None, method='Fisher'): - """ - For each mask, computes corresponding pruning signals indicating the importance of a feature. - - Arguments: - loss: tf.Tensor - Any cross-entropy loss - - decay: float - Controls exponential moving average of pruning signals - - method: str - Method used to compute pruning signal (currently only supports 'Fisher') - - Returns: - A `list[tf.Tensor]` of pruning signals corresponding to masks - - References: - * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 - """ - - if masks is None: - masks = tf.get_collection(MASK_COLLECTION) - - if method not in ['Fisher']: - raise ValueError('Pruning method \'{0}\' not supported.'.format(method)) - - if not masks: - return [] - - with tf.variable_scope('pruning_opt', reuse=True): - # compute gradients of extended masks (yields separate gradient for each data point) - grads = tf.gradients(loss, [m.extended for m in masks]) - - # estimate Fisher pruning signals from batch - signals_batch = [tf.squeeze(tf.reduce_mean(tf.square(g), 0)) for g in grads] - - # update pruning signals - signals = [m.pruning_signal for m in masks] - signals = [tf.assign(s, decay * s + (1. - decay) * f, use_locking=True) - for s, f in zip(signals, signals_batch)] - - return signals - - -def prune(signals, masks=None): - """ - Prunes a single feature by zeroing the mask entry with the smallest pruning signal. 
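The pruning signal that decides which mask entry gets zeroed is the exponential moving average maintained by ``update_pruning_signals`` above; a toy update with made-up numbers:

    # previous signal 0.5, current batch estimate mean(grad ** 2) = 0.2, decay = 0.96
    decay, prev_signal, batch_signal = 0.96, 0.5, 0.2
    new_signal = decay * prev_signal + (1. - decay) * batch_signal   # 0.488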
- - Arguments: - signals: list[tf.Tensor] - A list of pruning signals - - masks: list[tf.Tensor] - A list of corresponding masks, defaults to `tf.get_collection(MASK_COLLECTION)` - - Returns: - A `tf.Operation` which updates masks - """ - - if masks is None: - masks = tf.get_collection(MASK_COLLECTION) - - with tf.variable_scope('pruning_opt', reuse=True): - # make sure we don't select already pruned units - signals = [tf.where(m > .5, s, tf.zeros_like(s) + np.inf) for m, s in zip(masks, signals)] - - # find units with smallest pruning signal in each layer - min_idx = [tf.argmin(s) for s in signals] - min_signals = [s[i] for s, i in zip(signals, min_idx)] - - # find layer with smallest pruning signal - l = tf.argmin(min_signals) - - # construct pruning operations, one for each mask - updates = [] - for k, i in enumerate(min_idx): - # set mask of layer l to 0 where pruning signal is smallest - updates.append( - tf.cond( - tf.equal(l, k), - lambda: tf.scatter_update( - masks[k], tf.Print(i, [i], message="Pruning layer [{0}] at index ".format(k)), 0.), - lambda: masks[k])) - - updates = tf.group(updates, name='prune') - - return updates diff --git a/twml/twml/contrib/readers/__init__.py b/twml/twml/contrib/readers/__init__.py deleted file mode 100644 index e96cf0449..000000000 --- a/twml/twml/contrib/readers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# pylint: disable=wildcard-import -"""This module contains experimental readers classes""" -from .batch_prediction_request import BatchPredictionRequest # noqa: F401 -from .data_record import DataRecord # noqa: F401 -from .hashed_batch_prediction_request import HashedBatchPredictionRequest # noqa: F401 diff --git a/twml/twml/contrib/readers/batch_prediction_request.py b/twml/twml/contrib/readers/batch_prediction_request.py deleted file mode 100644 index 4408b33b4..000000000 --- a/twml/twml/contrib/readers/batch_prediction_request.py +++ /dev/null @@ -1,8 +0,0 @@ -# pylint: disable=invalid-name -""" -This module implements the reader for BatchPredictionRequest. -""" - -from twitter.deepbird.io.legacy.contrib.readers.batch_prediction_request import ( - BatchPredictionRequest # noqa: F401 -) diff --git a/twml/twml/contrib/readers/data_record.py b/twml/twml/contrib/readers/data_record.py deleted file mode 100644 index ae8cc0b68..000000000 --- a/twml/twml/contrib/readers/data_record.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -This module includes facilities for manipulating data records in DeepBird v2. -This contains a submodule that allows for easy feature access as Tensors. -The result of this subclass methods are dictionaries of Tensors and SparseTensors -""" - -from twitter.deepbird.io.legacy.contrib.readers.data_record import ( - SUPPORTED_DENSE_FEATURE_TYPES, # noqa: F401 - DataRecord, # noqa: F401 -) diff --git a/twml/twml/contrib/readers/hashed_batch_prediction_request.py b/twml/twml/contrib/readers/hashed_batch_prediction_request.py deleted file mode 100644 index 3454f8483..000000000 --- a/twml/twml/contrib/readers/hashed_batch_prediction_request.py +++ /dev/null @@ -1,8 +0,0 @@ -# pylint: disable=invalid-name -""" -This module implements the reader for HashedBatchPredictionRequest. 
-""" - -from twitter.deepbird.io.legacy.contrib.readers.hashed_batch_prediction_request import ( - HashedBatchPredictionRequest # noqa: F401 -) diff --git a/twml/twml/contrib/trainers/__init__.py b/twml/twml/contrib/trainers/__init__.py deleted file mode 100644 index 3226cd805..000000000 --- a/twml/twml/contrib/trainers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# pylint: disable=wildcard-import -"""This module contains experimental trainer classes""" -from .batch_prediction_request_trainer import BatchPredictionRequestTrainer # noqa: F401 -from .pruning_data_record_trainer import PruningDataRecordTrainer # noqa: F401 -from .trainer_utils import build_keras_trainer # noqa: F401 diff --git a/twml/twml/contrib/trainers/batch_prediction_request_trainer.py b/twml/twml/contrib/trainers/batch_prediction_request_trainer.py deleted file mode 100644 index 2effa87ed..000000000 --- a/twml/twml/contrib/trainers/batch_prediction_request_trainer.py +++ /dev/null @@ -1,180 +0,0 @@ -# pylint: disable=arguments-differ, invalid-name -""" -This file contains the DataRecordTrainer class. -""" -import warnings - -import twml -from twml.trainers import DataRecordTrainer - - -class BatchPredictionRequestTrainer(DataRecordTrainer): # pylint: disable=abstract-method - """ - The ``BatchPredictionRequestTrainer`` implementation is intended to satisfy use cases - that input is BatchPredictionRequest at Twitter and also where only the build_graph methods - needs to be overridden. For this reason, ``Trainer.[train,eval]_input_fn`` methods - assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. - - For use-cases that differ from this common Twitter use-case, - further Trainer methods can be overridden. - If that still doesn't provide enough flexibility, the user can always - use the tf.estimator.Esimator or tf.session.run directly. - """ - - def __init__( - self, name, params, - build_graph_fn, - feature_config=None, - **kwargs): - """ - The BatchPredictionRequestTrainer constructor builds a - ``tf.estimator.Estimator`` and stores it in self.estimator. - For this reason, BatchPredictionRequestTrainer accepts the same Estimator constructor arguments. - It also accepts additional arguments to facilitate metric evaluation and multi-phase training - (init_from_dir, init_map). - - Args: - parent arguments: - See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation - for a full list of arguments accepted by the parent class. - name, params, build_graph_fn (and other parent class args): - see documentation for twml.Trainer and twml.DataRecordTrainer doc. - feature_config: - An object of type FeatureConfig describing what features to decode. - Defaults to None. But it is needed in the following cases: - - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` - - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. - - **kwargs: - further kwargs can be specified and passed to the Estimator constructor. 
- """ - - # Check and update train_batch_size and eval_batch_size in params before initialization - # to print correct parameter logs and does not stop running - # This overwrites batch_size parameter constrains in twml.trainers.Trainer.check_params - updated_params = self.check_batch_size_params(params) - super(BatchPredictionRequestTrainer, self).__init__( - name=name, params=updated_params, build_graph_fn=build_graph_fn, **kwargs) - - def check_batch_size_params(self, params): - """ Verify that params has the correct key,values """ - # updated_params is an instance of tensorflow.contrib.training.HParams - updated_params = twml.util.convert_to_hparams(params) - param_values = updated_params.values() - - # twml.trainers.Trainer.check_params already checks other constraints, - # such as being an integer - if 'train_batch_size' in param_values: - if not isinstance(updated_params.train_batch_size, int): - raise ValueError("Expecting params.train_batch_size to be an integer.") - if param_values['train_batch_size'] != 1: - # This can be a bit annoying to force users to pass the batch sizes, - # but it is good to let them know what they actually use in the models - # Use warning instead of ValueError in there to continue the run - # and print out that train_batch_size is changed - warnings.warn('You are processing BatchPredictionRequest data, ' - 'train_batch_size is always 1.\n' - 'The number of DataRecords in a batch is determined by the size ' - 'of each BatchPredictionRequest.\n' - 'If you did not pass train.batch_size or eval.batch_size, and ' - 'the default batch_size 32 was in use,\n' - 'please pass --train.batch_size 1 --eval.batch_size 1') - # If the upper error warning, change/pass --train.batch_size 1 - # so that train_batch_size = 1 - updated_params.train_batch_size = 1 - - if 'eval_batch_size' in param_values: - if not isinstance(updated_params.train_batch_size, int): - raise ValueError('Expecting params.eval_batch_size to be an integer.') - if param_values['eval_batch_size'] != 1: - # This can be a bit annoying to force users to pass the batch sizes, - # but it is good to let them know what they actually use in the models - # Use warning instead of ValueError in there to continue the run - # and print out that eval_batch_size is changed - warnings.warn('You are processing BatchPredictionRequest data, ' - 'eval_batch_size is also always 1.\n' - 'The number of DataRecords in a batch is determined by the size ' - 'of each BatchPredictionRequest.\n' - 'If you did not pass train.batch_size or eval.batch_size, and ' - 'the default batch_size 32 was in use,\n' - 'please pass --train.batch_size 1 --eval.batch_size 1') - # If the upper warning raises, change/pass --eval.batch_size 1 - # so that eval_batch_size = 1 - updated_params.eval_batch_size = 1 - - if 'eval_batch_size' not in param_values: - updated_params.eval_batch_size = 1 - - if not updated_params.eval_batch_size: - updated_params.eval_batch_size = 1 - - return updated_params - - @staticmethod - def add_batch_prediction_request_arguments(): - """ - Add commandline args to parse typically for the BatchPredictionRequestTrainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. 
- """ - parser = super(BatchPredictionRequestTrainer, - BatchPredictionRequestTrainer).add_parser_arguments() - - # mlp arguments - parser.add_argument( - '--model.use_existing_discretizer', action='store_true', - dest="model_use_existing_discretizer", - help='Load a pre-trained calibration or train a new one') - parser.add_argument( - '--model.use_binary_values', action='store_true', - dest='model_use_binary_values', - help='Use the use_binary_values optimization') - - # control hom many featues we keep in sparse tensors - # 12 is enough for learning-to-rank for now - parser.add_argument( - '--input_size_bits', type=int, default=12, - help='Number of bits allocated to the input size') - - parser.add_argument( - '--loss_function', type=str, default='ranknet', - dest='loss_function', - help='Options are pairwise: ranknet (default), lambdarank, ' - 'listnet, listmle, attrank, ' - 'pointwise') - - # whether convert sparse tensors to dense tensor - # in order to use dense normalization methods - parser.add_argument( - '--use_dense_tensor', action='store_true', - dest='use_dense_tensor', - default=False, - help='If use_dense_tensor is False, ' - 'sparse tensor and spare normalization are in use. ' - 'If use_dense_tensor is True, ' - 'dense tensor and dense normalization are in use.') - - parser.add_argument( - '--dense_normalization', type=str, default='mean_max_normalizaiton', - dest='dense_normalization', - help='Options are mean_max_normalizaiton (default), standard_normalizaiton') - - parser.add_argument( - '--sparse_normalization', type=str, default='SparseMaxNorm', - dest='sparse_normalization', - help='Options are SparseMaxNorm (default), SparseBatchNorm') - - # so far only used in pairwise learning-to-rank - parser.add_argument( - '--mask', type=str, default='full_mask', - dest='mask', - help='Options are full_mask (default), diag_mask') - - return parser diff --git a/twml/twml/contrib/trainers/pruning_data_record_trainer.py b/twml/twml/contrib/trainers/pruning_data_record_trainer.py deleted file mode 100644 index 4796e5390..000000000 --- a/twml/twml/contrib/trainers/pruning_data_record_trainer.py +++ /dev/null @@ -1,59 +0,0 @@ -import tensorflow.compat.v1 as tf - -from twml.trainers import DataRecordTrainer -from twml.contrib.optimizers import PruningOptimizer - - -class PruningDataRecordTrainer(DataRecordTrainer): - @staticmethod - def get_train_op(params, loss): - train_op = DataRecordTrainer.get_train_op(params, loss) - - optimizer = PruningOptimizer(learning_rate=params.get('learning_rate')) - - return optimizer.minimize( - loss=loss, - prune_every=params.get('pruning_iter', 5000), - burn_in=params.get('pruning_burn_in', 100000), - decay=params.get('pruning_decay', .9999), - flops_target=params.get('pruning_flops_target', 250000), - update_params=train_op, - global_step=tf.train.get_global_step()) - - def __init__(self, name, params, build_graph_fn, feature_config=None, **kwargs): - kwargs['optimize_loss_fn'] = self.get_train_op - - super(PruningDataRecordTrainer, self).__init__( - name=name, - params=params, - build_graph_fn=build_graph_fn, - feature_config=feature_config, - **kwargs) - - def export_model(self, *args, **kwargs): - # TODO: modify graph before exporting to take into account masks - return super(PruningDataRecordTrainer, self).export_model(*args, **kwargs) - - @staticmethod - def add_parser_arguments(): - parser = DataRecordTrainer.add_parser_arguments() - parser.add_argument( - "--pruning.iter", "--pruning_iter", type=int, default=5000, - dest="pruning_iter", - 
help="A single feature or feature map is pruned every this many iterations") - parser.add_argument( - "--pruning.burn_in", "--pruning_burn_in", type=int, default=100000, - dest="pruning_burn_in", - help="Only start pruning after collecting statistics for this many training steps") - parser.add_argument( - "--pruning.flops_target", "--pruning_flops_target", type=int, default=250000, - dest="pruning_flops_target", - help="Stop pruning when estimated number of floating point operations reached this target. \ - For example, a small feed-forward network might require 250,000 FLOPs to run.") - parser.add_argument( - "--pruning.decay", "--pruning_decay", type=float, default=.9999, - dest="pruning_decay", - help="A float value in [0.0, 1.0) controlling an exponential moving average of pruning \ - signal statistics. A value of 0.9999 can be thought of as averaging statistics over 10,000 \ - steps.") - return parser diff --git a/twml/twml/contrib/trainers/trainer_utils.py b/twml/twml/contrib/trainers/trainer_utils.py deleted file mode 100644 index f279571be..000000000 --- a/twml/twml/contrib/trainers/trainer_utils.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -This is a temporary close gap solution that allows TensorFlow users to do exploration and -experimentation using Keras models, and production training using twml Trainer. - -As of now (Q4 2019), Keras model training using `model.fit()` has various issues, making it unfit -for production training: - 1. `model.fit()` is slow in TF 1.14. This will be fixed with future TensorFlow updates. - 2. `model.fit()` crashes during model saving or in eager mode when the input has SparseTensor. - 3. Models saved using TF 2.0 API cannot be served by TensorFlow's Java API. - -Until MLCE team resolves the above issues, MLCE team recommends the following: - - Please feel free to use Keras models for experimentation and exploration. - - Please stick to twml Trainer for production training & exporting, - especially if you want to serve your model using Twitter's prediction servers. - -This module provide tooling for easily training keras models using twml Trainer. - -This module takes a Keras model that performs binary classification, and returns a -`twml.trainers.Trainer` object performing the same task. -The common way to use the returned Trainer object is to call its -`train`, `evaluate`, `learn`, or `train_and_evaluate` method with an input function. -This input function can be created from the tf.data.Dataset you used with your Keras model. - -.. note: this util handles the most common case. If you have cases not satisfied by this util, - consider writing your own build_graph to wrap your keras models. -""" -from twitter.deepbird.hparam import HParams - -import tensorflow # noqa: F401 -import tensorflow.compat.v2 as tf - -import twml - - -def build_keras_trainer( - name, - model_factory, - save_dir, - loss_fn=None, - metrics_fn=None, - **kwargs): - """ - Compile the given model_factory into a twml Trainer. - - Args: - name: a string name for the returned twml Trainer. - - model_factory: a callable that returns a keras model when called. - This keras model is expected to solve a binary classification problem. - This keras model takes a dict of tensors as input, and outputs a logit or probability. - - save_dir: a directory where the trainer saves data. Can be an HDFS path. - - loss_fn: the loss function to use. Defaults to tf.keras.losses.BinaryCrossentropy. - - metrics_fn: metrics function used by TensorFlow estimators. 
- Defaults to twml.metrics.get_binary_class_metric_fn(). - - **kwargs: for people familiar with twml Trainer's options, they can be passed in here - as kwargs, and they will be forwarded to Trainer as opts. - See https://cgit.twitter.biz/source/tree/twml/twml/argument_parser.py#n43 for available args. - - Returns: - a twml.trainers.Trainer object which can be used for training and exporting models. - """ - build_graph = create_build_graph_fn(model_factory, loss_fn) - - if metrics_fn is None: - metrics_fn = twml.metrics.get_binary_class_metric_fn() - - opts = HParams(**kwargs) - opts.add_hparam('save_dir', save_dir) - - return twml.trainers.Trainer( - name, - opts, - build_graph_fn=build_graph, - save_dir=save_dir, - metric_fn=metrics_fn) - - -def create_build_graph_fn(model_factory, loss_fn=None): - """Create a build graph function from the given keras model.""" - - def build_graph(features, label, mode, params, config=None): - # create model from model factory. - model = model_factory() - - # create loss function if the user didn't specify one. - if loss_fn is None: - build_graph_loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False) - else: - build_graph_loss_fn = loss_fn - - output = model(features) - if mode == 'infer': - loss = None - else: - weights = features.get('weights', None) - loss = build_graph_loss_fn(y_true=label, y_pred=output, sample_weight=weights) - - if isinstance(output, dict): - if loss is None: - return output - else: - output['loss'] = loss - return output - else: - return {'output': output, 'loss': loss} - - return build_graph diff --git a/twml/twml/contrib/utils/__init__.py b/twml/twml/contrib/utils/__init__.py deleted file mode 100644 index 56a083048..000000000 --- a/twml/twml/contrib/utils/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# pylint: disable=wildcard-import -"""This module contains experimental util functions for contrib.""" - -from .math_fns import safe_div, safe_log, cal_ndcg, cal_swapped_ndcg # noqa: F401 -from .masks import diag_mask, full_mask # noqa: F401 -from .normalizer import mean_max_normalizaiton, standard_normalizaiton # noqa: F401 -from .scores import get_pairwise_scores, get_pairwise_label_scores # noqa: F401 -# pointwise functions -from .loss_fns import get_pointwise_loss # noqa: F401 -# ranknet functions -from .loss_fns import get_pair_loss # noqa: F401 -# listwise functions -from .loss_fns import get_attrank_loss, get_listnet_loss, get_listmle_loss # noqa: F401 -# lambdarank functions -from .loss_fns import get_lambda_pair_loss # noqa: F401 -from .device import get_device_map, get_gpu_list, get_gpu_count, is_gpu_available # noqa: F401 -from .similarities import cosine_similarity # noqa: F401 -from . import interp # noqa: F401 diff --git a/twml/twml/contrib/utils/datasets.py b/twml/twml/contrib/utils/datasets.py deleted file mode 100644 index d31ea3ae4..000000000 --- a/twml/twml/contrib/utils/datasets.py +++ /dev/null @@ -1,93 +0,0 @@ -import random - -import twml - -get_time_based_dataset_files = twml.util.list_files_by_datetime - - -def resolve_train_and_eval_files_overlap( - train_files, eval_files, fraction_kept_for_eval, seed=None -): - """Resolve any overlap between train and eval files. - - Specifically, if there's an overlap between `train_files` and `eval_files`, then a fraction of - the overlap (i.e. `fraction_kept_for_eval`) will be randomly assigned (exclusively) to the - `eval_files`. 
- - The following example demonstrates its usage: - - >>> orig_train_files = ['f1', 'f2', 'f3', 'f4'] - >>> orig_eval_files = ['f1', 'f2', 'f3'] - >>> resolved_train_files, resolved_eval_files = resolve_train_and_eval_files_overlap( - ... orig_train_files, orig_eval_files, 0.5 - ... ) - >>> set(resolved_train_files) & set(resolved_eval_files) == set() - True - >>> len(resolved_train_files) == 3 - True - >>> len(resolved_eval_files) == 2 - True - - Args: - train_files: A list of the files used for training. - eval_files: A list of the files used for validation. - fraction_kept_for_eval: A fraction of files in the intersection between `train_files` and - `eval_files` exclusively kept for evaluation. - seed: A seed for generating random numbers. - - Returns: - A tuple `(new_train_files, new_eval_files)` with the overlapping resolved. - """ - - rng = random.Random(seed) - - train_files = set(train_files) - eval_files = set(eval_files) - overlapping_files = train_files & eval_files - train_files_selected_for_eval = set(rng.sample( - overlapping_files, - int(len(overlapping_files) * fraction_kept_for_eval) - )) - train_files = train_files - train_files_selected_for_eval - eval_files = (eval_files - overlapping_files) | train_files_selected_for_eval - return list(train_files), list(eval_files) - - -def get_time_based_dataset_files_for_train_and_eval( - base_path, - train_start_datetime, - train_end_datetime, - eval_start_datetime, - eval_end_datetime, - fraction_kept_for_eval, - datetime_prefix_format='%Y/%m/%d/%H', - extension='lzo', - parallelism=1 -): - """Get train/eval dataset files organized with a time-based prefix. - - This is just a convenience built around `get_dataset_files_prefixed_by_time` and - `resolve_train_and_eval_files_overlap`. Please refer to these functions for documentation. 
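A hypothetical call (paths and dates are made up; the accepted datetime format follows ``twml.util.list_files_by_datetime``):

    from datetime import datetime
    from twml.contrib.utils.datasets import get_time_based_dataset_files_for_train_and_eval

    train_files, eval_files = get_time_based_dataset_files_for_train_and_eval(
        base_path='hdfs:///user/example/datarecords',
        train_start_datetime=datetime(2020, 1, 1),
        train_end_datetime=datetime(2020, 1, 7),
        eval_start_datetime=datetime(2020, 1, 7),
        eval_end_datetime=datetime(2020, 1, 8),
        fraction_kept_for_eval=0.5)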
- """ - - train_files = get_time_based_dataset_files( - base_path=base_path, - start_datetime=train_start_datetime, - end_datetime=train_end_datetime, - datetime_prefix_format=datetime_prefix_format, - extension=extension, - parallelism=parallelism - ) - eval_files = get_time_based_dataset_files( - base_path=base_path, - start_datetime=eval_start_datetime, - end_datetime=eval_end_datetime, - datetime_prefix_format=datetime_prefix_format, - extension=extension, - parallelism=parallelism - ) - return resolve_train_and_eval_files_overlap( - train_files=train_files, - eval_files=eval_files, - fraction_kept_for_eval=fraction_kept_for_eval - ) diff --git a/twml/twml/contrib/utils/device.py b/twml/twml/contrib/utils/device.py deleted file mode 100644 index ab189c98a..000000000 --- a/twml/twml/contrib/utils/device.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Functions to query devices being used by tensorflow -""" - -from tensorflow.python.client import device_lib - - -def get_device_map(): - """Returns the map of device name to device type""" - local_device_protos = device_lib.list_local_devices() - return {x.name: x.device_type for x in local_device_protos} - - -def get_gpu_list(): - """Returns the list of GPUs available""" - device_map = get_device_map() - return [name for name in device_map if device_map[name] == 'GPU'] - - -def get_gpu_count(): - """Returns the count of GPUs available""" - return len(get_gpu_list()) - - -def is_gpu_available(): - """Returns if GPUs are available""" - return get_gpu_count() > 0 diff --git a/twml/twml/contrib/utils/interp.py b/twml/twml/contrib/utils/interp.py deleted file mode 100644 index 419d89030..000000000 --- a/twml/twml/contrib/utils/interp.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Interpolation functions -""" - -import libtwml -import tensorflow.compat.v1 as tf -import twml - - -def linear_interp1(inputs, ref_inputs, ref_outputs): - """ - Perform 1D linear interpolation. - Arguments: - inputs: - The query input values. - ref_inputs: - Reference grid points used for interpolation. - ref_outputs: - Reference output values used for interpolation. - - Returns: - The interpolated outputs for the requested input values. - """ - - inputs = tf.convert_to_tensor(inputs) - ref_inputs = tf.convert_to_tensor(ref_inputs) - ref_outputs = tf.convert_to_tensor(ref_outputs) - - ndims = inputs.shape.ndims - ref_inputs_ndims = ref_inputs.shape.ndims - ref_outputs_ndims = ref_inputs.shape.ndims - - if (ref_inputs_ndims != ndims): - raise ValueError("Dimension mismatch. inputs: %d, ref_inputs: %d" % (ndims, ref_inputs_ndims)) - - if (ref_outputs_ndims != ndims): - raise ValueError("Dimension mismatch. inputs: %d, ref_outputs: %d" % (ndims, ref_outputs_ndims)) - - if ndims > 2: - raise ValueError("Input dimensions should be < 2D. But got %d." % ndims) - - original_input_shape = tf.shape(inputs) - # This is needed because isotonic_calibration expects: - # - inputs of size [num_samples, num_classes] - # - ref_inputs, ref_outputs of size [num_classes, num_bins] - inputs = tf.reshape(inputs, [-1, 1]) - ref_inputs = tf.reshape(ref_inputs, [1, -1]) - ref_outputs = tf.reshape(ref_outputs, [1, -1]) - - # isotonic_calibration is simply doing linear interpolation. - # This needs to be renamed in the future to make it consistent. - outputs = libtwml.ops.isotonic_calibration(inputs, ref_inputs, ref_outputs) - return tf.reshape(outputs, original_input_shape) - - -def linear_interp1_by_class(inputs, input_classes, ref_inputs, ref_outputs): - """ - Perform 1D linear interpolation. 
- Arguments: - inputs: - The query input values. - input_classes: - The class index to use from the reference grid. - ref_inputs: - Reference 2D grid points used for interpolation. - Each row denotes the grid from a different class. - ref_outputs: - Reference 2D output values used for interpolation. - Each row denotes the grid from a different class. - - Returns: - The interpolated outputs for the requested input values. - """ - - inputs = tf.convert_to_tensor(inputs) - input_classes = tf.convert_to_tensor(input_classes) - ref_inputs = tf.convert_to_tensor(ref_inputs) - ref_outputs = tf.convert_to_tensor(ref_outputs) - - original_input_shape = tf.shape(inputs) - - # pass through - def in_func(x): - return x - - # indexed function - def cond_func(i, fn): - idx = input_classes[i] - x = tf.expand_dims(fn(), axis=0) - return linear_interp1(x, ref_inputs[idx], ref_outputs[idx]) - - # Use while loop for now, needs to be replace by a custom C++ op later. - outputs = twml.util.batch_apply(in_func, inputs, cond_func=cond_func) - return tf.reshape(outputs, original_input_shape) diff --git a/twml/twml/contrib/utils/loss_fns.py b/twml/twml/contrib/utils/loss_fns.py deleted file mode 100644 index eb25b430a..000000000 --- a/twml/twml/contrib/utils/loss_fns.py +++ /dev/null @@ -1,302 +0,0 @@ -import tensorflow.compat.v1 as tf -from twml.contrib.utils import masks, math_fns - - -def get_pair_loss(pairwise_label_scores, pairwise_predicted_scores, - params): - """ - Paiwise learning-to-rank ranknet loss - Check paper https://www.microsoft.com/en-us/research/publication/ - learning-to-rank-using-gradient-descent/ - for more information - Args: - pairwise_label_scores: a dense tensor of shape [n_data, n_data] - pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] - n_data is the number of tweet candidates in a BatchPredictionRequest - params: network parameters - mask options: full_mask and diag_mask - Returns: - average loss over pairs defined by the masks - """ - n_data = tf.shape(pairwise_label_scores)[0] - if params.mask == "full_mask": - # full_mask that only covers pairs that have different labels - # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) - mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) - else: - # diag_mask that covers all pairs - # (only selfs/diags are 0s) - mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) - - # pairwise sigmoid_cross_entropy_with_logits loss - loss = tf.cond(tf.equal(pair_count, 0), lambda: 0., - lambda: _get_average_cross_entropy_loss(pairwise_label_scores, - pairwise_predicted_scores, mask, pair_count)) - return loss - - -def get_lambda_pair_loss(pairwise_label_scores, pairwise_predicted_scores, - params, swapped_ndcg): - """ - Paiwise learning-to-rank lambdarank loss - faster than the previous gradient method - Note: this loss depends on ranknet cross-entropy - delta NDCG is applied to ranknet cross-entropy - Hence, it is still a gradient descent method - Check paper http://citeseerx.ist.psu.edu/viewdoc/ - download?doi=10.1.1.180.634&rep=rep1&type=pdf for more information - for more information - Args: - pairwise_label_scores: a dense tensor of shape [n_data, n_data] - pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] - n_data is the number of tweet candidates in a BatchPredictionRequest - params: network parameters - swapped_ndcg: swapped ndcg of shape [n_data, n_data] - ndcg values when swapping each pair in the prediction ranking order - mask options: full_mask and diag_mask - 
Returns: - average loss over pairs defined by the masks - """ - n_data = tf.shape(pairwise_label_scores)[0] - if params.mask == "full_mask": - # full_mask that only covers pairs that have different labels - # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) - mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) - else: - # diag_mask that covers all pairs - # (only selfs/diags are 0s) - mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) - - # pairwise sigmoid_cross_entropy_with_logits loss - loss = tf.cond(tf.equal(pair_count, 0), lambda: 0., - lambda: _get_average_cross_entropy_loss(pairwise_label_scores, - pairwise_predicted_scores, mask, pair_count, swapped_ndcg)) - return loss - - -def _get_average_cross_entropy_loss(pairwise_label_scores, pairwise_predicted_scores, - mask, pair_count, swapped_ndcg=None): - """ - Average the loss for a batchPredictionRequest based on a desired number of pairs - """ - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=pairwise_label_scores, - logits=pairwise_predicted_scores) - loss = mask * loss - if swapped_ndcg is not None: - loss = loss * swapped_ndcg - loss = tf.reduce_sum(loss) / pair_count - return loss - - -def get_listmle_loss(labels, predicted_scores): - r""" - listwise learning-to-rank listMLE loss - Note: Simplified MLE formula is used in here (omit the proof in here) - \sum_{s=1}^{n-1} (-predicted_scores + ln(\sum_{i=s}^n exp(predicted_scores))) - n is tf.shape(predicted_scores)[0] - Check paper http://icml2008.cs.helsinki.fi/papers/167.pdf for more information - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - Returns: - average loss - """ - labels = tf.reshape(labels, [-1, 1]) - n_data = tf.shape(labels)[0] - predicted_scores = tf.reshape(predicted_scores, [-1, 1]) - - predicted_scores_ordered_by_labels = _get_ordered_predicted_scores(labels, - predicted_scores, n_data) - - loss = (-1) * tf.reduce_sum(predicted_scores) - # sum over 1 to n_data - 1 - temp = tf.gather(predicted_scores_ordered_by_labels, [n_data - 1]) - temp = tf.reshape(temp, []) - loss = tf.add(loss, temp) - - exps = tf.exp(predicted_scores_ordered_by_labels) - exp_sum = tf.reduce_sum(exps) - # clip exp_sum for safer log - loss = tf.add(loss, math_fns.safe_log(exp_sum)) - - iteration = tf.constant(0) - - def _cond(iteration, loss, exp_sum, exp): - return tf.less(iteration, n_data - 2) - - def _gen_loop_body(): - def loop_body(iteration, loss, exp_sum, exps): - temp = tf.gather(exps, [iteration]) - temp = tf.reshape(temp, []) - exp_sum = tf.subtract(exp_sum, temp) - # clip exp_sum for safer log - loss = tf.add(loss, math_fns.safe_log(exp_sum)) - return tf.add(iteration, 1), loss, exp_sum, exps - return loop_body - - iteration, loss, exp_sum, exps = tf.while_loop(_cond, _gen_loop_body(), - (iteration, loss, exp_sum, exps)) - loss = loss / tf.cast(n_data, dtype=tf.float32) - return loss - - -def _get_ordered_predicted_scores(labels, predicted_scores, n_data): - """ - Order predicted_scores based on sorted labels - """ - sorted_labels, ordered_labels_indices = tf.nn.top_k( - tf.transpose(labels), k=n_data) - ordered_labels_indices = tf.transpose(ordered_labels_indices) - predicted_scores_ordered_by_labels = tf.gather_nd(predicted_scores, - ordered_labels_indices) - return predicted_scores_ordered_by_labels - - -def get_attrank_loss(labels, predicted_scores, weights=None): - """ - Modified 
listwise learning-to-rank AttRank loss - Check paper https://arxiv.org/abs/1804.05936 for more information - Note: there is an inconsistency between the paper statement and - their public code - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - # The authors immeplemented the following, which is basically listnet - # attention_labels = _get_attentions(labels) - # attention_labels = tf.reshape(attention_labels, [1, -1]) - # predicted_scores = tf.reshape(predicted_scores, [1, -1]) - # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=attention_labels, - # logits=predicted_scores)) - - # The paper proposed the following - # attention_labels = _get_attentions(labels) - # # However the following line is wrong based on their statement - # # as _get_attentions can give 0 results when input < 0 - # # and the result cannot be used in _get_attrank_cross_entropy - # # log(a_i^S) - # # attention_predicted_scores = _get_attentions(predicted_scores) - # loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) - # # the range of attention_predicted_scores is [0, 1) - # # this gives sigmoid [0.5, 0.732) - # # hence, it is not good to use in sigmoid_cross_entropy_with_logits either - - # Implemented the following instead - # _get_attentions is applied to labels - # softmax is applied to predicted_scores - reshaped_labels = tf.reshape(labels, [1, -1]) - attention_labels = _get_attentions(reshaped_labels) - reshaped_predicted_scores = tf.reshape(predicted_scores, [1, -1]) - attention_predicted_scores = tf.nn.softmax(reshaped_predicted_scores) - loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) - return loss - - -def _get_attentions(raw_scores): - """ - Used in attention weights in AttRank loss - for a query/batch/batchPreidictionRequest - (a rectified softmax function) - """ - not_consider = tf.less_equal(raw_scores, 0) - mask = tf.ones(tf.shape(raw_scores)) - tf.cast(not_consider, dtype=tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - expon_labels = mask * tf.exp(raw_scores) - - expon_label_sum = tf.reduce_sum(expon_labels) - # expon_label_sum is safe as a denominator - attentions = math_fns.safe_div(expon_labels, expon_label_sum) - return attentions - - -def _get_attrank_cross_entropy(labels, logits): - # logits is not safe based on their satement - # do not use this function directly elsewhere - results = labels * math_fns.safe_log(logits) + (1 - labels) * math_fns.safe_log(1 - logits) - results = (-1) * results - results = tf.reduce_mean(results) - return results - - -def get_listnet_loss(labels, predicted_scores, weights=None): - """ - Listwise learning-to-rank listet loss - Check paper https://www.microsoft.com/en-us/research/ - wp-content/uploads/2016/02/tr-2007-40.pdf - for more information - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - # top one probability is the same as softmax - labels_top_one_probs = _get_top_one_probs(labels) - predicted_scores_top_one_probs = _get_top_one_probs(predicted_scores) - - if weights is None: - loss = tf.reduce_mean( - 
_get_listnet_cross_entropy(labels=labels_top_one_probs, - logits=predicted_scores_top_one_probs)) - return loss - - loss = tf.reduce_mean( - _get_listnet_cross_entropy(labels=labels_top_one_probs, - logits=predicted_scores_top_one_probs) * weights) / tf.reduce_mean(weights) - return loss - - -def _get_top_one_probs(labels): - """ - Used in listnet top-one probabilities - for a query/batch/BatchPredictionRequest - (essentially a softmax function) - """ - expon_labels = tf.exp(labels) - expon_label_sum = tf.reduce_sum(expon_labels) - # expon_label_sum is safe as a denominator - attentions = expon_labels / expon_label_sum - return attentions - - -def _get_listnet_cross_entropy(labels, logits): - """ - Used in listnet - cross entropy on top-one probabilities - between ideal/label top-one probabilities - and predicted/logits top-one probabilities - for a query/batch/BatchPredictionRequest - """ - # it is safe to use log on logits - # that come from _get_top_one_probs - # do not use this function directly elsewhere - results = (-1) * labels * math_fns.safe_log(logits) - return results - - -def get_pointwise_loss(labels, predicted_scores, weights=None): - """ - Pointwise learning-to-rank loss - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - if weights is None: - loss = tf.reduce_mean( - tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, - logits=predicted_scores)) - return loss - loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, - logits=predicted_scores) * weights) / tf.reduce_mean(weights) - return loss diff --git a/twml/twml/contrib/utils/masks.py b/twml/twml/contrib/utils/masks.py deleted file mode 100644 index f3143dc52..000000000 --- a/twml/twml/contrib/utils/masks.py +++ /dev/null @@ -1,38 +0,0 @@ -import tensorflow.compat.v1 as tf - - -def diag_mask(n_data, pairwise_label_scores): - """ - This is so far only used in pairwise learning-to-rank - Args: - n_data: an int `Tensor`. - pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. - Returns: - values in pairwise_label_scores except the diagonal - each cell contains a pairwise score difference - only selfs/diags are 0s - """ - mask = tf.ones([n_data, n_data]) - tf.diag(tf.ones([n_data])) - mask = tf.cast(mask, dtype=tf.float32) - pair_count = tf.to_float(n_data) * (tf.to_float(n_data) - 1) - pair_count = tf.cast(pair_count, dtype=tf.float32) - return mask, pair_count - - -def full_mask(n_data, pairwise_label_scores): - """ - This is so far only used in pairwise learning-to-rank - Args: - n_data: an int `Tensor`. - pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data].
- Returns: - values in pairwise_label_scores except pairs that have the same labels - each cell contains a paiwise score difference - all pairwise_label_scores = 0.5: selfs and same labels are 0s - """ - not_consider = tf.equal(pairwise_label_scores, 0.5) - mask = tf.ones([n_data, n_data]) - tf.cast(not_consider, dtype=tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - pair_count = tf.reduce_sum(mask) - pair_count = tf.cast(pair_count, dtype=tf.float32) - return mask, pair_count diff --git a/twml/twml/contrib/utils/math_fns.py b/twml/twml/contrib/utils/math_fns.py deleted file mode 100644 index 2d9e72282..000000000 --- a/twml/twml/contrib/utils/math_fns.py +++ /dev/null @@ -1,171 +0,0 @@ -import tensorflow.compat.v1 as tf -from tensorflow.python.ops import array_ops, math_ops - - -# Copied from metrics_impl.py -# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/metrics_impl.py#L216 -def safe_div(numerator, denominator, name=None): - """ - Example usage: calculating NDCG = DCG / IDCG to handle cases when - IDCG = 0 returns 0 instead of Infinity - Do not use this dividing funciton unless it makes sense to your problem - Divides two tensors element-wise, returns 0 if the denominator is <= 0. - Args: - numerator: a real `Tensor`. - denominator: a real `Tensor`, with dtype matching `numerator`. - name: Name for the returned op. - Returns: - 0 if `denominator` <= 0, else `numerator` / `denominator` - """ - t = math_ops.truediv(numerator, denominator) - zero = array_ops.zeros_like(t, dtype=denominator.dtype) - condition = math_ops.greater(denominator, zero) - zero = math_ops.cast(zero, t.dtype) - return array_ops.where(condition, t, zero, name=name) - - -def cal_ndcg(label_scores, predicted_scores, top_k_int=1): - """ - Calculate NDCG score for top_k_int ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: An int or an int `Tensor`. - Returns: - a `Tensor` that holds DCG / IDCG. - """ - sorted_labels, predicted_order = _get_ranking_orders( - label_scores, predicted_scores, top_k_int=top_k_int) - - predicted_relevance = _get_relevance_scores(predicted_order) - sorted_relevance = _get_relevance_scores(sorted_labels) - - cg_discount = _get_cg_discount(top_k_int) - - dcg = _dcg_idcg(predicted_relevance, cg_discount) - idcg = _dcg_idcg(sorted_relevance, cg_discount) - # the ndcg score of the batch - # idcg is 0 if label_scores are all 0 - ndcg = safe_div(dcg, idcg, 'one_ndcg') - return ndcg - - -def cal_swapped_ndcg(label_scores, predicted_scores, top_k_int): - """ - Calculate swapped NDCG score in Lambda Rank for full/top k ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: An int or an int `Tensor`. - Returns: - a `Tensor` that holds swapped NDCG by . 
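A small worked example of the quantity involved, in plain Python (hypothetical relevance labels; gains are ``2 ** score - 1`` and discounts ``log2(position + 1)``, matching ``_get_relevance_scores`` and ``_get_cg_discount`` below):

    import math

    labels = [3.0, 1.0, 0.0]            # ideal ordering of the ground-truth scores
    predicted_order = [1.0, 3.0, 0.0]   # the same scores re-ordered by model predictions

    def dcg(scores):
        return sum((2 ** s - 1) / math.log2(i + 2) for i, s in enumerate(scores))

    ndcg = dcg(predicted_order) / dcg(labels)   # ~0.71 here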
- """ - sorted_labels, predicted_order = _get_ranking_orders( - label_scores, predicted_scores, top_k_int=top_k_int) - - predicted_relevance = _get_relevance_scores(predicted_order) - sorted_relevance = _get_relevance_scores(sorted_labels) - - cg_discount = _get_cg_discount(top_k_int) - - # cg_discount is safe as a denominator - dcg_k = predicted_relevance / cg_discount - dcg = tf.reduce_sum(dcg_k) - - idcg_k = sorted_relevance / cg_discount - idcg = tf.reduce_sum(idcg_k) - - ndcg = safe_div(dcg, idcg, 'ndcg_in_lambdarank_training') - - # remove the gain from label i then add the gain from label j - tiled_ij = tf.tile(dcg_k, [1, top_k_int]) - new_ij = (predicted_relevance / tf.transpose(cg_discount)) - - tiled_ji = tf.tile(tf.transpose(dcg_k), [top_k_int, 1]) - new_ji = tf.transpose(predicted_relevance) / cg_discount - - # if swap i and j, remove the stale cg for i, then add the new cg for i, - # remove the stale cg for j, and then add the new cg for j - new_dcg = dcg - tiled_ij + new_ij - tiled_ji + new_ji - - new_ndcg = safe_div(new_dcg, idcg, 'new_ndcg_in_lambdarank_training') - swapped_ndcg = tf.abs(ndcg - new_ndcg) - return swapped_ndcg - - -def _dcg_idcg(relevance_scores, cg_discount): - """ - Calculate DCG scores for top_k_int ranking positions - Args: - relevance_scores: a real `Tensor`. - cg_discount: a real `Tensor`, with dtype matching relevance_scores - Returns: - a `Tensor` that holds \\sum_{i=1}^k \frac{relevance_scores_k}{cg_discount} - """ - # cg_discount is safe - dcg_k = relevance_scores / cg_discount - return tf.reduce_sum(dcg_k) - - -def _get_ranking_orders(label_scores, predicted_scores, top_k_int=1): - """ - Calculate DCG scores for top_k_int ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: an integer or an int `Tensor`. - Returns: - two `Tensors` that hold sorted_labels: the ground truth relevance socres - and predicted_order: relevance socres based on sorted predicted_scores - """ - # sort predictions_scores and label_scores - # size [batch_size/num of DataRecords, 1] - label_scores = tf.reshape(label_scores, [-1, 1]) - predicted_scores = tf.reshape(predicted_scores, [-1, 1]) - # sorted_labels contians the relevance scores of the correct order - sorted_labels, ordered_labels_indices = tf.nn.top_k( - tf.transpose(label_scores), k=top_k_int) - sorted_labels = tf.transpose(sorted_labels) - # sort predicitons and use the indices to obtain the relevance scores of the predicted order - sorted_predictions, ordered_predictions_indices = tf.nn.top_k( - tf.transpose(predicted_scores), k=top_k_int) - ordered_predictions_indices_for_labels = tf.transpose(ordered_predictions_indices) - # predicted_order contians the relevance scores of the predicted order - predicted_order = tf.gather_nd(label_scores, ordered_predictions_indices_for_labels) - return sorted_labels, predicted_order - - -def _get_cg_discount(top_k_int=1): - r""" - Calculate discounted gain factor for ranking position till top_k_int - Args: - top_k_int: An int or an int `Tensor`. 
- Returns: - a `Tensor` that holds \log_{2}(i + 1), i \in [1, k] - """ - log_2 = tf.log(tf.constant(2.0, dtype=tf.float32)) - # top_k_range needs to start from 1 to top_k_int - top_k_range = tf.range(top_k_int) + 1 - top_k_range = tf.reshape(top_k_range, [-1, 1]) - # cast top_k_range to float - top_k_range = tf.cast(top_k_range, dtype=tf.float32) - cg_discount = tf.log(top_k_range + 1.0) / log_2 - return cg_discount - - -def _get_relevance_scores(scores): - return 2 ** scores - 1 - - -def safe_log(raw_scores, name=None): - """ - Calculate log of a tensor, handling cases where - raw_scores are close to 0 - Args: - raw_scores: A float `Tensor`. - Returns: - A float `Tensor` that holds the safe log base e of the input - """ - epsilon = 1E-8 - clipped_raw_scores = tf.maximum(raw_scores, epsilon) - return tf.log(clipped_raw_scores) diff --git a/twml/twml/contrib/utils/normalizer.py b/twml/twml/contrib/utils/normalizer.py deleted file mode 100644 index a6a7035b8..000000000 --- a/twml/twml/contrib/utils/normalizer.py +++ /dev/null @@ -1,39 +0,0 @@ -import tensorflow.compat.v1 as tf -from twml.contrib.utils import math_fns - - -def mean_max_normalizaiton(dense_tensor): - """ - In-batch normalization - Args: - dense_tensor: A dense `Tensor`. - Returns: - (dense_tensor - mean) / abs(max value) - Note: - when dense_tensor is of size [1, ?] it will give 0 - If this is not what you want, handle it outside the function - """ - dense_mean = tf.reduce_mean(dense_tensor, reduction_indices=[0]) - dense_abs_max = tf.abs(tf.reduce_max(dense_tensor, reduction_indices=[0])) - dense_tensor = math_fns.safe_div(dense_tensor - dense_mean, dense_abs_max, - 'mean_max_normalization_in_batch') - return dense_tensor - - -def standard_normalizaiton(dense_tensor): - """ - In-batch normalization - z-normalization or standard_normalization in batch - Args: - dense_tensor: A dense `Tensor`. - Returns: - (dense_tensor - mean) / variance - Note: - when dense_tensor is of size [1, ?] it will give 0 - If this is not what you want, handle it outside the function - """ - epsilon = 1E-7 - dense_mean, dense_variance = tf.nn.moments(dense_tensor, 0) - # using epsilon is safer than math_fns.safe_div in here - dense_tensor = (dense_tensor - dense_mean) / (dense_variance + epsilon) - return dense_tensor diff --git a/twml/twml/contrib/utils/scores.py b/twml/twml/contrib/utils/scores.py deleted file mode 100644 index 84e792c13..000000000 --- a/twml/twml/contrib/utils/scores.py +++ /dev/null @@ -1,33 +0,0 @@ -import tensorflow.compat.v1 as tf - - -def get_pairwise_scores(tensor_input): - """ - This is so far used in pairwise learning-to-rank - - Arguments: - tensor_input: a dense `Tensor` of shape [n_data, 1] - n_data is the number of tweet candidates - - Returns: - pairwise scores: a dense `Tensor` of shape [n_data, n_data]. - """ - return tensor_input - tf.transpose(tensor_input) - - -def get_pairwise_label_scores(labels): - """ - This is so far used in pairwise learning-to-rank - Args: - labels: a dense `Tensor` of shape [n_data, 1] - n_data is the number of tweet candidates - Returns: - pairwise label scores: a dense `Tensor` of shape [n_data, n_data].
- each value is within [0, 1] - """ - # raw pairwise label scores/differences - pairwise_label_scores = get_pairwise_scores(labels) - # sanity check to make sure values in differences_ij are [-1, 1] - differences_ij = tf.maximum(tf.minimum(1.0, pairwise_label_scores), -1.0) - # values in pairwise_label_scores are within [0, 1] for cross entropy - return (1.0 / 2.0) * (1.0 + differences_ij) diff --git a/twml/twml/contrib/utils/similarities.py b/twml/twml/contrib/utils/similarities.py deleted file mode 100644 index 212065f88..000000000 --- a/twml/twml/contrib/utils/similarities.py +++ /dev/null @@ -1,17 +0,0 @@ -import tensorflow.compat.v1 as tf - - -def cosine_similarity(x1, x2, axis): - """ - cosine similarity of two tensors. - - Arguments: - x1: - A tf.Tensor - x2: - A tf.Tensor - axis: Dimension along which to normalize. - """ - normalize_x1 = tf.nn.l2_normalize(x1, axis=axis) - normalize_x2 = tf.nn.l2_normalize(x2, axis=axis) - return tf.reduce_sum(tf.multiply(normalize_x1, normalize_x2), axis=axis) diff --git a/twml/twml/dataset.py b/twml/twml/dataset.py deleted file mode 100644 index 4356fdc7c..000000000 --- a/twml/twml/dataset.py +++ /dev/null @@ -1,372 +0,0 @@ -""" -This module implements custom tf.data.datasets for twml. -""" -import numbers - -from absl import logging -from kazoo.client import KazooClient -from libtwml import OPLIB -import tensorflow.compat.v1 as tf -from twml.constants import DEFAULT_ZOOKEEPER_BASE_ZNODE, DEFAULT_ZOOKEEPER_HOST - - -class BlockFormatDataset(tf.data.Dataset): - """A ``tf.data.Dataset`` comprising records from one or more TFRecord files.""" - - def __init__(self, filenames, compression_type="auto", buffer_size=1 << 20): - """ - Creates a ``BlockFormatDataset``. - - Args: - filenames: - A `tf.string` tensor containing one or more filenames. - compression_type: - A string specifying the compression type. - Can be one of 'gz' (or 'gzip'), 'none', 'auto' (default). - When compression_type == 'auto', it is inferred from file extension. - buffer_size: - Buffer size to be used during decompression. default: 1<<20. - """ - self._filenames = tf.convert_to_tensor(filenames, dtype=tf.string, name="filenames") - self._compression_type = tf.convert_to_tensor(compression_type.lower(), name="compression_type") - self._buffer_size = tf.convert_to_tensor(buffer_size, dtype=tf.int64, name="buffer_size") - # Parent class calss self._as_variant_tensor in init. So call this at the end. - super(BlockFormatDataset, self).__init__() - - def _as_variant_tensor(self): - """ - Create the resource handle for the dataset. 
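For orientation, a minimal sketch of reading records with this dataset (the file name is hypothetical; with ``compression_type='auto'`` the compression is inferred from the extension):

    import tensorflow.compat.v1 as tf
    from twml.dataset import BlockFormatDataset

    dataset = BlockFormatDataset(['hdfs:///user/example/part-00000.gz'])
    records = tf.data.make_one_shot_iterator(dataset.batch(32)).get_next()
    # each element is a serialized record; a parse_fn would normally decode it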
- """ - try: - block_format_dataset = __import__("libtwml_internal").OPLIB.block_format_dataset - return block_format_dataset(self._filenames) - except ImportError: - block_format_dataset = OPLIB.block_format_dataset_v2 - return block_format_dataset(self._filenames, self._compression_type, self._buffer_size) - - def _inputs(self): - return [] - - @property - def output_shapes(self): - """Return output shapes""" - return tf.TensorShape([]) - - @property - def output_types(self): - """Return output types""" - return tf.string - - @property - def output_classes(self): - """Return output classes""" - return tf.Tensor - - -def downsample_dataset(dataset, sample_rate, rate_name): - """ - Downsample a tf.data.Dataset at sample_rate - """ - if sample_rate is None or sample_rate == 1.0: - return dataset - elif not isinstance(sample_rate, numbers.Real): - raise TypeError("dataset %s must be a real number" % rate_name) - elif sample_rate <= 0 or sample_rate > 1: - raise ValueError("dataset %s must be in range (0, 1])" % rate_name) - return dataset.filter(lambda _: tf.squeeze(tf.random_uniform([1])) < sample_rate) - - -def _filenames_dataset(files, shards=None, shard_index=None): - """ - Get a tf.data.Dataset with file names from a list of files - Optionally shard the file list (see stream_block_format_dataset) - """ - files = tf.data.Dataset.from_tensor_slices(files) - - if [shards, shard_index] != [None, None]: - logging.info("Sharding files dataset (index: %d, shards: %d)" % (shard_index, shards)) - files = files.shard(num_shards=shards, index=shard_index) - - return files - - -def stream_block_format_dataset( - files, parse_fn, batch_size, num_threads, - shuffle=True, repeat=False, - block_length=None, part_file_parallelism=None, file_shuffle_size=None, - record_shuffle_size=None, dataset_fn=None, - keep_rate=None, parts_downsampling_rate=None, prefetch_size=2, - shards=None, shard_index=None, shuffle_files=True, interleave=True): - """ - Helper function to stream a list of part files. - - Args: - files: - List of input files which will create a dataset. - parse_fn: - A function that takes a byte tensor containing a datarecord and decodes it. - batch_size: - The batch size for each step. - num_threads: - Number of threads working on the data in parallel. - shuffle: - Shuffle records within each file using ``record_shuffle_size``. Defaults to True. - repeat: - Repeat the dataset indefinitely. Defaults to False. - Useful when you want to use an ``[train,eval]_steps`` greater than the size of the dataset - (otherwise ``Estimator.[train,evaluate]`` stop when the end of the dataset is reached). - block_length (optional): - Number of consecutive records to pull from a single part file. - Defaults to batch_size. - part_file_parallelism (optional): - Number of part files to read from in parallel. Once a part file is completely read, it will - be replaced by the next part file in the part file list. - - ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies - the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or - equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand, - if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader - thread pool will be underutilized, since it can never be the case that every reader thread has - a part file to read from. - - file_shuffle_size (optional): - the buffer_size used for shuffling of the list of files. 
- Defaults to 100000. For example, with a buffer size of 1000 and 2000 files, the first - 1000 files are shuffled together, iterated through, then the next 1000 files are shuffled - and iterated through. - record_shuffle_size (optional): - the ``buffer_size`` used for shuffling records in each thread. - Defaults to ``batch_size * 8`` records. - dataset_fn (optional): - A function that modifies the dataset after it reads different interleaved parts files. - Defaults to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - return dataset.batch(batch_size).map(parse_fn, 1) - - keep_rate (optional): - A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli - distribution with p = 1 - keep_rate. - Defaults to None (no records dropped). - - parts_downsampling_rate (optional): - A float value in ``(0.0, 1.0]`` that indicates the factor by which to downsample part files. - For example, a value of 0.2 means only 20 percent of part files become part of the dataset. - - Note that this argument is only useful in conjunction with a [train,eval]_steps of -1 - (that is, when the entire dataset is used). Furthermore, note that even in this case, each - epoch will see a different set of part files. This is because new part files are re-sampled - every epoch. In other words, this argument is only provided for backwards compatibility with - DeepBird v1. We recommend you use a smaller [train,eval]_steps (or specify a keep_rate) - instead. - - shards (optional): - Number of partitions to shard the dataset into. This is useful for codistillation and other - techniques that require each worker to train on disjoint partitions of the dataset. - The dataset is not sharded by default. - - shard_index (optional): - Which partition of the dataset to use if ``shards`` is set. - - shuffle_files (optional): - Shuffle the list of files. Defaults to True. - When False, files are iterated in the order they are passed in. - - interleave (optional): - Interleave records from multiple files in parallel. Defaults to True. - - Returns: - tf.data.Dataset of batches of HashedDataRecord resource handles decoded and streamed online. - """ - # Creating a dataset from an input directory - - files = _filenames_dataset(files, shards=shards, shard_index=shard_index) - - file_shuffle_size = file_shuffle_size if file_shuffle_size is not None else 100000 - record_shuffle_size = record_shuffle_size if record_shuffle_size is not None else (batch_size * 8) - block_length = block_length if block_length is not None else batch_size - - logging.info("NUM_THREADS: %d", num_threads) - - if repeat: - files = files.repeat() - - if shuffle_files: - # Randomly shuffle the files list. - files = files.shuffle(buffer_size=file_shuffle_size) - - # Downsample parts files - files = downsample_dataset(files, parts_downsampling_rate, "parts_downsampling_rate") - - # Interleave the result from BlockFormatDataset - # block_length == batch_size results in batch_size records being read from a single file. - def map_fn(filenames): - '''function that maps each filename to a BlockFormatDataset''' - # read each file using BlockFormatDataset - dataset = BlockFormatDataset(filenames) - - # early prefetching can sometimes improve performance (like on GCS) - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) - - # Shuffling before repeating ensures strong ordering.
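- # Note: this shuffle is applied inside map_fn, i.e. per part file, so records are only - # shuffled within a window of record_shuffle_size records from a single file, - # not across the entire dataset.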
- if shuffle: - dataset = dataset.shuffle(buffer_size=record_shuffle_size) - - return dataset - - if interleave: - part_file_parallelism = num_threads if part_file_parallelism is None else part_file_parallelism - dataset = files.interleave( - map_fn, cycle_length=part_file_parallelism, block_length=block_length, num_parallel_calls=num_threads) - else: - dataset = files.flat_map(map_fn) - - # Downsample DataRecords - dataset = downsample_dataset(dataset, keep_rate, "keep_rate") - - if dataset_fn is None: - # Create a batch of datarecords and decode them - return dataset.batch(batch_size).map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE).prefetch(prefetch_size) - - return dataset_fn(dataset, parse_fn, batch_size) - - -def cx_zk_path(path): - if path is None: - raise ValueError("Path for zookeeper dataset pointer is None. You must specify a path.") - return_path = "/".join([DEFAULT_ZOOKEEPER_BASE_ZNODE, path]) - logging.info("Zookeeper path is: {}".format(return_path)) - return return_path - - -def zookeeper_ordered_dataset( - files, parse_fn, batch_size, zk_counter_path, repeat=False, - num_threads=2, block_length=None, part_file_parallelism=None, - batch_shuffle_size=None, file_keep_rate=None, record_keep_rate=None, - prefetch_size=2, interleave=False, dataset_fn=None, verbose=False): - """ - Make a tf.Dataset given an ordered list of filenames, using Zookeeper to keep track of - which file to read, and to coordinate multiple workers. - - Args: - files: - ordered list of (typically HDFS) filenames. This must remain consistent - between different workers, and between worker restarts (e.g. in the case - of instance failure or preemption). - To ensure this remains consistent, consider using the --train.files_list - option from DataRecordTrainer. - parse_fn: - A function that takes a byte tensor containing a datarecord and decodes it. - batch_size: - The batch size for each step. - zk_counter_path: - Path under the root node for the underlying zookeeper shared counter that - is used to coordinate distributed iteration over the list of files. - Full path will be `'/'.join([DEFAULT_ZOOKEEPER_BASE_ZNODE, zk_counter_path])`. - repeat: - Default False. Set True to repeat over the files forever. - num_threads: - Default 2. Number of threads working on the data in parallel. - Only used if interleave=True. - block_length: - Default None. Number of consecutive records to pull from a single part file. - If None, then block_length=batch_size will be used. - Only used if interleave=True. - part_file_parallelism: - Default None. Number of part files to read from in parallel. Once a part file is completely - read, it will be replaced by the next part file indicated by the zookeeper counter. - Only used if interleave=True. - - ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies - the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or - equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand, - if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader - thread pool will be underutilized, since it can never be the case that every reader thread has - a part file to read from. - - batch_shuffle_size: - Default None. Size of shuffle buffer, for shuffling that will be applied after batching. - if None, then batches will not be shuffled. Ignored if dataset_fn is provided. - file_keep_rate: - Default None. 
Fraction of files to keep, or None to keep all files. - record_keep_rate: - Default None. Fraction of records to keep, or None to keep all records. - prefetch_size: - Default 2. Number of parsed batches to prefetch. Ignored if dataset_fn is provided. - interleave: - Default False. Set True to use tf.data.Dataset.interleave rather than flat_map. - dataset_fn: - A function that is applied to the dataset of individual records, after - these have been read from the parts files. - If ``None`` (the default), the behavior will be as though dataset_fn were set to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - dataset = dataset.batch(batch_size) - dataset = dataset.map(parse_fn, tf.data.experimental.AUTOTUNE) - if batch_shuffle_size: - dataset = dataset.shuffle(batch_shuffle_size) - return dataset.prefetch(prefetch_size) - - verbose: - Default False. Set True to log the names of files loaded by TF. - """ - block_length = batch_size if block_length is None else block_length - part_file_parallelism = num_threads if part_file_parallelism is None else part_file_parallelism - - def zk_index_generator(my_files=files): - zk = KazooClient(hosts=DEFAULT_ZOOKEEPER_HOST) - zk.start() - my_counter = zk.Counter(cx_zk_path(zk_counter_path), default=0) - while True: - my_counter += 1 - counter_pre_value = my_counter.pre_value - if repeat: - counter_pre_value = counter_pre_value % len(my_files) - if counter_pre_value >= len(my_files): - break - else: - chosen_file = my_files[counter_pre_value] - if verbose: - logging.info("{}. yielding {}".format(counter_pre_value, chosen_file)) - yield chosen_file - zk.stop() - - files = tf.data.Dataset.from_generator(zk_index_generator, tf.string) - - # Downsample parts files - files = downsample_dataset(files, file_keep_rate, "file_keep_rate") - - def map_fn(filenames): - return BlockFormatDataset(filenames).prefetch(20) - - # Don't interleave for sequential training - if interleave: - dataset = files.interleave( - map_fn, - cycle_length=part_file_parallelism, - block_length=block_length, - num_parallel_calls=num_threads) - else: - dataset = files.flat_map(map_fn) - - # Downsample DataRecords - dataset = downsample_dataset(dataset, record_keep_rate, "record_keep_rate") - - if dataset_fn is None: - # Create a batch of datarecords and decode them - dataset = dataset.batch(batch_size) - dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) - # shuffle after batching and parsing for performance reasons - # faster b/c 1 random selection is made per batch rather than per record - if batch_shuffle_size: - dataset = dataset.shuffle(buffer_size=batch_shuffle_size) - dataset = dataset.prefetch(prefetch_size) - - else: - dataset = dataset_fn(dataset, parse_fn, batch_size) - - return dataset diff --git a/twml/twml/errors.py b/twml/twml/errors.py deleted file mode 100644 index 9b50fcd79..000000000 --- a/twml/twml/errors.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -Error classes for twml -""" - - -class EarlyStopError(Exception): - """Exception used to indicate the evaluator needs to stop early.""" - pass - - -class CheckpointNotFoundError(Exception): - """Exception used to indicate a checkpoint hasn't been found.""" - pass diff --git a/twml/twml/export_output_fns.py b/twml/twml/export_output_fns.py deleted file mode 100644 index f72e1d0fe..000000000 --- a/twml/twml/export_output_fns.py +++ /dev/null @@ -1,17 +0,0 @@ -''' -Contains implementations of DataRecordTrainer.get_export_output_fns that specify how to -export model graph outputs from
build_graph to DataRecords for prediction servers. - -Modelers can use the functions in this module as the export_output_fn parameter of -the DataRecordTrainer constructor to customize how to export their model outputs. - -Modelers may also provide a custom implementation of export_output_fn using these as reference. -''' - -# pylint: disable=invalid-name -from twitter.deepbird.io.legacy.export_output_fns import ( - batch_prediction_continuous_output_fn, # noqa: F401 - batch_prediction_tensor_output_fn, # noqa: F401 - default_output_fn, # noqa: F401 - variable_length_continuous_output_fn, # noqa: F401 -) diff --git a/twml/twml/feature_config.py b/twml/twml/feature_config.py deleted file mode 100644 index 37004f442..000000000 --- a/twml/twml/feature_config.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Feature configuration for DeepBird jobs: -- Which features to keep -- Which features to blacklist -- Which features are labels -- Which feature is the weight -""" - -from twitter.deepbird.io.legacy import feature_config - - -class FeatureConfig(feature_config.FeatureConfig): - def get_feature_spec(self): - """ - Generates a serialization-friendly dict representing this FeatureConfig. - """ - doc = super(FeatureConfig, self).get_feature_spec() - # Override the class in the spec. - doc["class"] = "twml.FeatureConfig" - return doc - - -class FeatureConfigBuilder(feature_config.FeatureConfigBuilder): - def build(self): - # Overwrite self.build() to return twml.FeatureConfig instead - """ - Builds and returns FeatureConfig object. - """ - - ( - features, - tensor_types, - sparse_tensor_types, - feature_map, - feature_name_to_feature_parser, - feature_in_bq_name, - ) = self._build() - - return FeatureConfig( - features=features, - labels=self._labels, - weight=self._weight, - filters=self._filter_features, - tensor_types=tensor_types, - sparse_tensor_types=sparse_tensor_types, - feature_types=feature_map, - decode_mode=self._decode_mode, - legacy_sparse=self._legacy_sparse, - feature_name_to_feature_parser=self._feature_name_to_feature_parser, - feature_in_bq_name=self._feature_in_bq_name, - ) - - -_name_to_id = feature_config._name_to_id diff --git a/twml/twml/filters.py b/twml/twml/filters.py deleted file mode 100644 index e48633808..000000000 --- a/twml/twml/filters.py +++ /dev/null @@ -1,9 +0,0 @@ -''' -Includes functions to filter features dict build from -data records. -''' - -from twitter.deepbird.io.legacy.filters import ( - balance_binary_class_samples, # noqa: F401 - sparse_keep_feature_if, # noqa: F401 - sparse_keep_sample_if) # noqa: F401 diff --git a/twml/twml/hooks.py b/twml/twml/hooks.py deleted file mode 100644 index cdf733535..000000000 --- a/twml/twml/hooks.py +++ /dev/null @@ -1,562 +0,0 @@ -""" This file contains tf.train.SessionRunHooks defined by TWML """ -from datetime import datetime -import json -import operator -import os - -from absl import logging -import numpy as np -import tensorflow.compat.v1 as tf -from tensorflow.python.training.basic_session_run_hooks import NeverTriggerTimer, SecondOrStepTimer -import twml - - -class StepProgressHook(tf.train.SessionRunHook): - """Hook that displays a progress bar to monitor global step progress """ - - def __init__(self, max_step): - """ - Initializes a `StepProgressHook`. - This hook displays a progress bar for max_steps. - - Note that this hook only works for training and calibration. - - Args: - max_steps: - maximum steps to monitor in progress bar. - When this many steps is reached, the progress bar will be full. 
- """ - self._max_step = max_step - self._start_step = 0 - self._global_step_tensor = None - self._progress_bar = None - - def begin(self): - """ sets the global_step_tensor """ - self._global_step_tensor = tf.train.get_or_create_global_step() - if self._global_step_tensor is None: - raise RuntimeError("Global step should be created to use StepProgressHook.") - - def after_create_session(self, session, coord): - """ creates the progress bar and keeps track of the first global step upon session creation """ - global_step = session.run(self._global_step_tensor) - self._start_step = global_step - self._progress_bar = tf.keras.utils.Progbar(self._max_step) - - def before_run(self, run_context): # pylint: disable=unused-argument - """ invoked before calling session.run """ - return tf.train.SessionRunArgs(self._global_step_tensor) - - def after_run(self, run_context, run_values): - """ invoked after run is called. Updates the progress bar. """ - step = run_context.session.run(self._global_step_tensor) - self._progress_bar.update(step - self._start_step) - - -class GetMetricsHook(tf.train.SessionRunHook): - """ - Hook used to obtain evaluation metrics. - Typically used for early-stopping by obtaining the value of a - metric at the end of an epoch. - Note that the metric tensor and its commensurate update Op - are responsible for aggregating the metric during the session - (one session per epoch). Used for evaluation. - """ - - def __init__(self, get_metrics_fn): - """GetMetricsHook constructor. - - Args: - get_metrics_fn: - Function that returns a dict mapping metric keys to - tensors as a tf.Tensor. - See Trainer.learn for an example use-case. - """ - - self._get_metrics_fn = get_metrics_fn - self._metric_tensors = None - self.metric_values = None - - def begin(self): - """ sets the global_step_tensor and metric tensor""" - self._metric_tensors = self._get_metrics_fn() - assert isinstance(self._metric_tensors, dict) - - def end(self, session): - self.metric_values = session.run(self._metric_tensors) - - -class EarlyStopHook(GetMetricsHook): - """ - A GetMetricsHook augmented with early-stopping logic for use - within the Trainer.learn method. - """ - - def __init__(self, - metric, - patience, - minimize, - get_estimator_spec_fn, - checkpoint_dir, - file_path=None, - exit_on_end=True, - start_epoch=0, - tolerance=0): - """ - Prepare early-stopping hook and variables. - - Args: - metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - minimize: - Set this to True for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - tolerance: - A non-negative tolerance for comparing early_stop_metric. - e.g. 
when maximizing the condition is current_metric > best_metric + tolerance." - Defaults to 0. - get_estimator_spec_fn: - function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - checkpoint_dir: - path to directory containing the Estimator checkpoints. - file_path: - path to file that is used by this hook to communicate early-stopping - to StopIfExistsHook. This hook would be used for evaluation, while - the StopIfExistsHooks (the listeners) would be used for training. - When the file is created, the StopIfExistsHooks detect and terminate training. - This argument is used by ``Trainer.train_and_evaluate``. - exit_on_end: - when the end() method is called to indicate that the session is terminating, - and exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the evaluation job. - This is set to False by the trainer for non distributed jobs. - start_epoch: - Specifies the starting epoch number. This is used for logging purposes only. - """ - if not isinstance(metric, str): - raise ValueError("Expecting string for metric arg") - if not isinstance(patience, int): - raise ValueError("Expecting positive number for metric arg") - - self.should_stop = False - self._metric = metric - self._patience = patience - self._current_patience = patience - self._checkpoint_dir = checkpoint_dir - self._exit_on_end = exit_on_end - self._latest_checkpoint_path = None - # used for distributed training (tf.estimator.train_and_evaluate) - self._file_path = file_path - self._epoch = start_epoch - if self._file_path is not None: - # TODO try to read epoch from a file that we create - if tf.io.gfile.exists(self._file_path): - # delete the file if it exists (not sure this makes sense) - logging.info("EarlyStopHook: Removing existing file: %s.", self._file_path) - tf.io.gfile.remove(self._file_path) - - # best_checkpoint dir will contain the best checkpoint - self._best_checkpoint_path = os.path.join(checkpoint_dir, 'best_checkpoint') - self._eval_checkpoint_path = os.path.join(checkpoint_dir, 'eval_checkpoint') - self._best_metric_path = os.path.join(self._best_checkpoint_path, self._metric) - - if tf.io.gfile.exists(self._best_metric_path): - with tf.io.gfile.GFile(self._best_metric_path, mode="r") as f: - best_metric_from_file = float(f.read()) - else: - best_metric_from_file = None - - if minimize: - # current < best : is better - self._is_better_than = operator.lt - # worse metric possible - if best_metric_from_file is None: - self._best_metric = np.inf - else: - self._best_metric = best_metric_from_file - tolerance - # used for printing - self._early_stop_name = "minimum" - else: - # current > best : is better - self._is_better_than = operator.gt - # worse metric possible - if best_metric_from_file is None: - self._best_metric = -np.inf - else: - self._best_metric = best_metric_from_file + tolerance - # used for printing - self._early_stop_name = "maximum" - - def get_metrics_fn(): - """ function to get metric tensors to early-stopping """ - estimator_spec = get_estimator_spec_fn() - eval_metric_ops = estimator_spec.eval_metric_ops - if metric not in eval_metric_ops: - raise ValueError( - "Expecting early_stop_metric '%s' key in eval_metric_ops dict" - % (metric)) - # get the value_op from the (value_op, update_op) value - return {k: v[0] for k, v in eval_metric_ops.items()} - - # initialize GetMetricsHook to get current value of metric from session - super(EarlyStopHook, self).__init__(get_metrics_fn=get_metrics_fn) - - def 
early_stop(self, epoch): - """ - Looks at the current value of the early stopping metric. - Decrements current patience. If metric improves, patience is reset - and latest checkpoint is moved to checkpoint_dir/best_checkpoint. - If current patience reaches zero, returns True. - - Args: - epoch: - The current epoch number. - - Returns: - True when early-stopped. False otherwise. - """ - # decrement patience - self._current_patience -= 1 - - # get the current metric value - current_metric = self.metric_values[self._metric] - - if self._is_better_than(current_metric, self._best_metric): - # save best version of model - self._best_metric = current_metric - logging.info( - "Found new %s %s=%f @ epoch %d", - self._early_stop_name, self._metric, self._best_metric, epoch) - # backup the file to checkpoint_dir/best_checkpoint - assert self._latest_checkpoint_path, "expecting latest checkpoint" - logging.info("Backing up " + self._latest_checkpoint_path) - - try: - eval_checkpoint = tf.train.latest_checkpoint(self._eval_checkpoint_path) - twml.util.backup_checkpoint( - checkpoint_path_prefix=eval_checkpoint, - backup_path=self._best_checkpoint_path) - except twml.errors.CheckpointNotFoundError as ex: - msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" - raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) - - tf.io.gfile.makedirs(os.path.dirname(self._best_metric_path)) - with tf.io.gfile.GFile(self._best_metric_path, mode="w") as f: - # Write with enough precision - f.write("%.8f" % self._best_metric) - - # reset patience - self._current_patience = self._patience - - elif self._current_patience > 0: - logging.info("No new %s found after %d epochs", - self._early_stop_name, self._patience - self._current_patience) - elif self._current_patience == 0: - logging.info( - "No new %s found after %d epochs. Early-stopping experiment.", - self._early_stop_name, self._patience) - return True - - return False - - def cleanup_checkpoints(self): - """ - makes it so that the best checkpoint is the only checkpoint - in checkpoint_dir. - """ - raise NotImplementedError("cleanup_checkpoints is no longer supported") - - def end(self, session): - """ - This method is called at the end of an evaluation/epoch. - When file_path constructor argument is provided, this - will call ``early_stop()``. - When ``early_stop()`` returns True, it creates the file_path, - which will be detected by StopIfExistsHooks - and stop training for all workers and the chief. It will - also call ``cleanup_checkpoints()``. - """ - super(EarlyStopHook, self).end(session) - - # Checks for early stopping criteria and makes a backup - self.should_stop = self.early_stop(self._epoch) - - if self._file_path is not None: - if self.should_stop: - # create a file to inform workers - with tf.io.gfile.GFile(self._file_path, "wb") as gfile: - gfile.write("early-stop\n") - # makes the best checkpoint the only checkpoint in save_dir. - msg = "early-stopping evaluation at epoch %d" % self._epoch - logging.info(msg) - if self._exit_on_end: - raise twml.errors.EarlyStopError(msg) - else: - self._latest_checkpoint_path = None - - self._epoch += 1 - - def begin(self): - """ - Saves the latest_checkpoint in case it gets superseded by another checkpoint. - Remember that when used with train_and_evaluate, the chief saves checkpoints - continuouly. The chief could save a checkpoint after evaluation started. - So saving the checkpoint at the beginning of evaluation ensures that we - later save the correct best checkpoint. 
- """ - super(EarlyStopHook, self).begin() - self._latest_checkpoint_path = tf.train.latest_checkpoint(self._checkpoint_dir) - - assert self._latest_checkpoint_path, "expecting latest checkpoint" - # Backup to temporary directory - try: - twml.util.backup_checkpoint( - checkpoint_path_prefix=self._latest_checkpoint_path, - backup_path=self._eval_checkpoint_path) - except twml.errors.CheckpointNotFoundError as ex: - msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" - raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) - - -class MetricsUpdateHook(GetMetricsHook): - """ - A GetMetricsHook augmented with logic to map SessionRun events to metrics updates. - It is mainly used by `TrackRun` to persist model metrics via Model Repo. - """ - - def __init__(self, - get_estimator_spec_fn, - add_metrics_fn, - every_n_iter=None, - every_n_secs=None - ): - """ - Args: - get_estimator_spec_fn: - function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - add_metrics_fn: `function` callback used to report metrics, called automatically - at the end of every epoch. - every_n_iter: `int`, log the metrics once every N local - steps taken in the current epoch. - every_n_secs: `int` or `float`, log the metrics once every N - seconds passed in the current epoch. Exactly one of `every_n_iter` and `every_n_secs` - should be provided. - Raises: - ValueError: if `every_n_iter` is non-positive or if not exactly one of `every_n_iter` and - `every_n_secs` is set when `add_progress_metrics_fn` is provided. - """ - only_log_at_end = (every_n_iter is None) and (every_n_secs is None) - - if (not only_log_at_end and every_n_iter and every_n_secs): - raise ValueError( - 'exactly one of every_n_iter and every_n_secs must be provided' - ) - - # TODO: should have a minimum to avoid too many calls to ModelRepo? - if every_n_iter is not None and every_n_iter <= 0: - raise ValueError("invalid every_n_iter=%s." % every_n_iter) - - self._timer = ( - NeverTriggerTimer() if only_log_at_end else - SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_iter) - ) - - self._should_trigger = False - self._iter_count = 0 - - self._add_metrics_fn = add_metrics_fn - - def get_metrics_fn(): - """ - Function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - """ - estimator_spec = get_estimator_spec_fn() - eval_metric_ops = estimator_spec.eval_metric_ops - # get the value_op from the (value_op, update_op) value - return {k: v[0] for k, v in eval_metric_ops.items()} - super(MetricsUpdateHook, self).__init__(get_metrics_fn=get_metrics_fn) - - def report_metrics(self): - """ - Triggers a metrics report. - """ - self._timer.update_last_triggered_step(self._iter_count) - if self.metric_values is not None: - self._add_metrics_fn(self.metric_values) - - def begin(self): - """ - Triggered before each epoch. - """ - self._timer.reset() - self._iter_count = 0 - return super(MetricsUpdateHook, self).begin() - - def before_run(self, run_context): - """ - Triggered before each step. - """ - self._should_trigger = self._timer.should_trigger_for_step(self._iter_count) - return super(MetricsUpdateHook, self).before_run(run_context) - - def after_run(self, run_context, run_values): - """ - Triggered after each step. 
- """ - if self._should_trigger: - self.report_metrics() - self._iter_count += 1 - return super(MetricsUpdateHook, self).after_run(run_context, run_values) - - def end(self, session): - """ - Triggered after each epoch. - """ - self.report_metrics() - return super(MetricsUpdateHook, self).end(session) - - -class EarlyStopDuration(tf.train.SessionRunHook): - """ - Hook that can be used to terminate a job (training or validation) after a certain duration. - The hook is fault tolerant, i.e., if a job is allotted 1 hour to run and fails after 45 minutes, - then it will only run for 15 minutes once restarted. - - Args: - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already completed. - - overwrite: - A boolean. If set to True, this hook will overwrite the file containing the elapsed time - since the beginning of the job. In a distributed setting, this will be used so only one - job writes to the file while all others will have read access. In a distributed setting, - if all executors have this parameter set to False, then it just means that the hook will - not be fault tolerant. When restarted, the job will restart the clock from 0. - - save_dir: - String. A directory (located on a file system that is TensorFlow compatible) where - we can store the file which contains the record of the elapsed time. This file is what makes - the hook fault tolerant. - - exit_on_end: - when exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the job. - This is usually set to True to kill a validation job in a distributed setting. - """ - - def __init__(self, max_duration: float, exit_on_end: bool, save_dir: str, overwrite: bool): - self._overwrite = overwrite - self._save_dir = save_dir - self._exit_on_end = exit_on_end - self._max_duration = max_duration - self._last_time_check = datetime.now() - - # Initialize the elapsed time file - if overwrite: - self.elapsed_time() - - @property - def elapsed_file_path(self): - return os.path.join(self._save_dir, "early_stop_duration.txt") - - def early_stop(self) -> bool: - return self.elapsed_time() > self._max_duration - - def elapsed_time(self) -> float: - # Recorded elapsed time is 0 unless it's been recorded in a file already - recorded_elapsed_time = 0 - if tf.io.gfile.exists(self.elapsed_file_path): - with tf.io.gfile.GFile(self.elapsed_file_path, mode="r") as file: - recorded_elapsed_time = json.loads(file.read())["elapsed_time"] - - elapsed_time = recorded_elapsed_time + (datetime.now() - self._last_time_check).total_seconds() - self._last_time_check = datetime.now() - - if self._overwrite: - # Record the updated elapsed time to the file - tf.io.gfile.makedirs(os.path.dirname(self.elapsed_file_path)) - with tf.io.gfile.GFile(self.elapsed_file_path, mode="w") as file: - record = { - "elapsed_time": elapsed_time, - "max_duration": self._max_duration - } - file.write(json.dumps(record, indent=2)) - - return elapsed_time - - def before_run(self, run_context: tf.estimator.SessionRunContext) -> None: - if self.early_stop(): - message = f""" - Stopping job, which has now exceeded the maximum duration of {self._max_duration} seconds. - """ - logging.info(message) - run_context.request_stop() - - if self._exit_on_end: - raise twml.errors.EarlyStopError(message) - - -class StopAtStepHook(tf.train.StopAtStepHook): - """ - Overrides ``tf.train.StopAtStepHook`` so that - a ``stop_requested`` property can be accessed to determine - if this hook requested a stop.
- """ - - def __init__(self, *args, **kwargs): - super(StopAtStepHook, self).__init__(*args, **kwargs) - self._stop_requested = False - - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested - - def after_run(self, run_context, run_values): - """ sets self.stop_requested to true when requesting a stop """ - super(StopAtStepHook, self).after_run(run_context, run_values) - self._stop_requested = run_context.stop_requested - - -class StopIfExistsHook(tf.train.SessionRunHook): - """ - Hook that requests stop if a file exists. - This hook is used with the EarlyStopHook to implement - early-stopping for distributed training (tf.estimator.train_and_evaluate). - """ - - def __init__(self, file_path): - """ - Arguments: - file_path: - path to file. When this hook detects that the file exists, - it requests a stop, which effectively kills this worker. - """ - self._file_path = file_path - self._stop_requested = False - - def after_run(self, run_context, run_values): - if tf.io.gfile.exists(self._file_path): - logging.info("Early-stopping file detected; requesting stop") - run_context.request_stop() - self._stop_requested = True - - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested diff --git a/twml/twml/input_fns.py b/twml/twml/input_fns.py deleted file mode 100644 index 394fc8674..000000000 --- a/twml/twml/input_fns.py +++ /dev/null @@ -1,129 +0,0 @@ -''' -Contains implementations of functions to read input data. -''' -from .dataset import stream_block_format_dataset - -import tensorflow.compat.v1 as tf - - -def data_record_input_fn( - files, batch_size, parse_fn, - num_threads=2, repeat=False, dataset_fn=None, - keep_rate=None, parts_downsampling_rate=None, - shards=None, shard_index=None, shuffle=True, shuffle_files=True, interleave=True, - initializable=False, log_tf_data_summaries=False, - **kwargs): - """ - Returns a nested structure of tf.Tensors containing the next element. - Used by ``train_input_fn`` and ``eval_input_fn`` in DataRecordTrainer. - By default, works with DataRecord dataset for compressed partition files. - - Args: - files: - List of files that will be parsed. - batch_size: - number of samples per batch. - parse_fn: - function passed to data loading for parsing individual data records. - Usually one of the decoder functions like ``parsers.get_sparse_parse_fn``. - num_threads (optional): - number of threads used for loading data. Defaults to 2. - repeat (optional): - Repeat the dataset indefinitely. Defaults to False. - Useful when you want to use ``train_steps`` or ``eval_steps`` - greater than the size of the dataset - (otherwise Estimator.[train,evaluate] stops when the end of the dataset is reached). - dataset_fn (optional): - A function that modifies the dataset after it reads different interleaved parts files. - Defaults to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - return dataset.batch(batch_size).map(parse_fn, 1) - - keep_rate (optional): - A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli - distribution with p = 1 - keep_rate. - Defaults to None (no records dropped). - - parts_downsampling_rate (optional): - A float value in (0.0, 1.0] that indicates the factor by which to downsample part files. - For example, a value of 0.2 means only 20 percent of part files become part of the dataset. - - shards (optional): - Number of partitions to shard the dataset into. 
This is useful for codistillation - (https://arxiv.org/pdf/1804.03235.pdf) and other techniques that require each worker to - train on disjoint partitions of the dataset. - The dataset is not sharded by default. - - shard_index (optional): - Which partition of the dataset to use if ``shards`` is set. - - shuffle (optional): - Whether to shuffle the records. Defaults to True. - - shuffle_files (optional): - Shuffle the list of files. Defaults to True. - When False, files are iterated in the order they are passed in. - - interleave (optional): - Interleave records from multiple files in parallel. Defaults to True. - - initializable (optional): - A boolean indicator. When the Dataset Iterator depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value (false) - is used for most plain iterators. - - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - Iterator of elements of the dataset. - """ - if not parse_fn: - raise ValueError("default_input_fn requires a parse_fn") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - dataset = stream_block_format_dataset( - files=files, - parse_fn=parse_fn, - batch_size=batch_size, - repeat=repeat, - num_threads=num_threads, - dataset_fn=dataset_fn, - keep_rate=keep_rate, - parts_downsampling_rate=parts_downsampling_rate, - shards=shards, - shard_index=shard_index, - shuffle=shuffle, - shuffle_files=shuffle_files, - interleave=interleave, - **kwargs - ) - - # Add a tf.data.experimental.StatsAggregator - # https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/data/experimental/StatsAggregator - if log_tf_data_summaries: - aggregator = tf.data.experimental.StatsAggregator() - options = tf.data.Options() - options.experimental_stats.aggregator = aggregator - dataset = dataset.with_options(options) - stats_summary = aggregator.get_summary() - tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary) - - if initializable: - # when the data parsing dpends on some HashTable or Tensor, the iterator is initalizable and - # therefore we need to be run explicitly - iterator = dataset.make_initializable_iterator() - tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer) - else: - iterator = dataset.make_one_shot_iterator() - return iterator.get_next() - - -default_input_fn = data_record_input_fn # pylint: disable=invalid-name diff --git a/twml/twml/layers/__init__.py b/twml/twml/layers/__init__.py deleted file mode 100644 index 917c61867..000000000 --- a/twml/twml/layers/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# pylint: disable=wildcard-import -""" -This module contains the ``tf.layers.Layer`` subclasses implemented in twml. -Layers are used to instantiate common subgraphs. -Typically, these layers are used when defining a ``build_graph_fn`` -for the ``twml.trainers.Trainer``. 
-""" - -from .batch_prediction_tensor_writer import BatchPredictionTensorWriter # noqa: F401 -from .batch_prediction_writer import BatchPredictionWriter # noqa: F401 -from .data_record_tensor_writer import DataRecordTensorWriter # noqa: F401 -from .full_dense import full_dense, FullDense # noqa: F401 -from .full_sparse import full_sparse, FullSparse # noqa: F401 -from .isotonic import Isotonic # noqa: F401 -from .layer import Layer # noqa: F401 -from .mdl import MDL # noqa: F401 -from .partition import Partition # noqa: F401 -from .percentile_discretizer import PercentileDiscretizer # noqa: F401 -from .sequential import Sequential # noqa: F401 -from .sparse_max_norm import MaxNorm, sparse_max_norm, SparseMaxNorm # noqa: F401 -from .stitch import Stitch # noqa: F401 diff --git a/twml/twml/layers/batch_prediction_tensor_writer.py b/twml/twml/layers/batch_prediction_tensor_writer.py deleted file mode 100644 index 3f6633a8e..000000000 --- a/twml/twml/layers/batch_prediction_tensor_writer.py +++ /dev/null @@ -1,51 +0,0 @@ -# pylint: disable=no-member, invalid-name -""" -Implementing Writer Layer -""" -from .layer import Layer - -import libtwml - - -class BatchPredictionTensorWriter(Layer): - """ - A layer that packages keys and dense tensors into a BatchPredictionResponse. - Typically used at the out of an exported model for use in a the PredictionEngine - (that is, in production) when model predictions are dense tensors. - - Arguments: - keys: - keys to hashmap - Output: - output: - a BatchPredictionResponse serialized using Thrift into a uint8 tensor. - """ - - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(BatchPredictionTensorWriter, self).__init__(**kwargs) - self.keys = keys - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError - - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. - - Arguments: - values: - dense tensors corresponding to keys in hashmap - - Returns: - The output from the layer - """ - write_op = libtwml.ops.batch_prediction_tensor_response_writer(self.keys, values) - return write_op diff --git a/twml/twml/layers/batch_prediction_writer.py b/twml/twml/layers/batch_prediction_writer.py deleted file mode 100644 index 118d21921..000000000 --- a/twml/twml/layers/batch_prediction_writer.py +++ /dev/null @@ -1,51 +0,0 @@ -# pylint: disable=no-member, invalid-name -""" -Implementing Writer Layer -""" -from .layer import Layer - -import libtwml - - -class BatchPredictionWriter(Layer): - """ - A layer that packages keys and values into a BatchPredictionResponse. - Typically used at the out of an exported model for use in a the PredictionEngine - (that is, in production). - - Arguments: - keys: - keys to hashmap - Output: - output: - a BatchPredictionResponse serialized using Thrift into a uint8 tensor. - """ - - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(BatchPredictionWriter, self).__init__(**kwargs) - self.keys = keys - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. 
the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError - - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. - - Arguments: - values: - values corresponding to keys in hashmap - - Returns: - The output from the layer - """ - write_op = libtwml.ops.batch_prediction_response_writer(self.keys, values) - return write_op diff --git a/twml/twml/layers/data_record_tensor_writer.py b/twml/twml/layers/data_record_tensor_writer.py deleted file mode 100644 index 0f70186b4..000000000 --- a/twml/twml/layers/data_record_tensor_writer.py +++ /dev/null @@ -1,50 +0,0 @@ -# pylint: disable=no-member, invalid-name -""" -Implementing Writer Layer -""" -from .layer import Layer - -import libtwml - - -class DataRecordTensorWriter(Layer): - """ - A layer that packages keys and dense tensors into a DataRecord. - This layer was initially added to support exporting user embeddings as tensors. - - Arguments: - keys: - keys to hashmap - Output: - output: - a DataRecord serialized using Thrift into a uint8 tensor - """ - - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(DataRecordTensorWriter, self).__init__(**kwargs) - self.keys = keys - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. - - Arguments: - values: - dense tensors corresponding to keys in hashmap - - Returns: - The output from the layer - """ - write_op = libtwml.ops.data_record_tensor_writer(self.keys, values) - return write_op diff --git a/twml/twml/layers/full_dense.py b/twml/twml/layers/full_dense.py deleted file mode 100644 index 9c354ad3e..000000000 --- a/twml/twml/layers/full_dense.py +++ /dev/null @@ -1,259 +0,0 @@ -# pylint: disable=no-member,arguments-differ, attribute-defined-outside-init -""" -Implementing Full Dense Layer -""" -from tensorflow.python.layers import core as core_layers -from tensorflow.python.ops import init_ops -from tensorflow.python.framework import tensor_shape -from tensorflow.python.keras.engine.base_layer import InputSpec -import tensorflow.compat.v1 as tf - - -class FullDense(core_layers.Dense): - """ - Densely-connected layer class. - This is wrapping tensorflow.python.layers.core.Dense - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Where ``activation`` is the activation function passed as the ``activation`` - argument (if not ``None``), ``weight`` is a weights matrix created by the layer, - and ``bias`` is a bias vector created by the layer. - - Arguments: - output_size: - Integer or Long, dimensionality of the output space. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - weight_initializer: - Initializer function for the weight matrix. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. 
- Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - - Properties: - output_size: - Python integer, dimensionality of the output space. - activation: - Activation function (callable). - weight_initializer: - Initializer instance (or name) for the weight matrix. - bias_initializer: - Initializer instance (or name) for the bias. - weight: - Weight matrix (TensorFlow variable or tensor). (weight) - bias: - Bias vector, if applicable (TensorFlow variable or tensor). - weight_regularizer: - Regularizer instance for the weight matrix (callable) - bias_regularizer: - Regularizer instance for the bias (callable). - activity_regularizer: - Regularizer instance for the output (callable) - weight_constraint: - Constraint function for the weight matrix. - bias_constraint: - Constraint function for the bias. - - """ - - def __init__(self, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=None, - **kwargs): - super(FullDense, self).__init__(units=output_size, - kernel_initializer=weight_initializer, - kernel_regularizer=weight_regularizer, - kernel_constraint=weight_constraint, - **kwargs) - self._num_partitions = num_partitions - - def build(self, input_shape): - ''' - code adapted from TF 1.12 Keras Dense layer: - https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/layers/core.py#L930-L956 - ''' - input_shape = tensor_shape.TensorShape(input_shape) - if input_shape[-1] is None: - raise ValueError('The last dimension of the inputs to `Dense` ' - 'should be defined. 
Found `None`.') - self.input_spec = InputSpec(min_ndim=2, - axes={-1: input_shape[-1]}) - - partitioner = None - if self._num_partitions: - partitioner = tf.fixed_size_partitioner(self._num_partitions) - - self.kernel = self.add_weight( - 'kernel', - shape=[input_shape[-1], self.units], - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - dtype=self.dtype, - partitioner=partitioner, - trainable=True) - - if self.use_bias: - self.bias = self.add_weight( - 'bias', - shape=[self.units, ], - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - dtype=self.dtype, - trainable=True) - else: - self.bias = None - self.built = True - - @property - def output_size(self): - """ - Returns output_size - """ - return self.units - - @property - def weight(self): - """ - Returns weight - """ - return self.kernel - - @property - def weight_regularizer(self): - """ - Returns weight_regularizer - """ - return self.kernel_regularizer - - @property - def weight_initializer(self): - """ - Returns weight_initializer - """ - return self.kernel_initializer - - @property - def weight_constraint(self): - """ - Returns weight_constraint - """ - return self.kernel_constraint - - -def full_dense(inputs, output_size, - activation=None, - use_bias=True, - weight_initializer=None, - bias_initializer=init_ops.zeros_initializer(), - weight_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - weight_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - num_partitions=None, - reuse=None): - """Functional interface for the densely-connected layer. - This layer implements the operation: - `outputs = activation(inputs.weight + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `weight` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). - - Arguments: - inputs: Tensor input. - units: Integer or Long, dimensionality of the output space. - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - weight_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.get_variable`. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: - Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). 
- name: - String, the name of the layer. - reuse: - Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor the same shape as `inputs` except the last dimension is of - size `units`. - - Raises: - ValueError: if eager execution is enabled. - """ - layer = FullDense(output_size, - activation=activation, - use_bias=use_bias, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - weight_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - weight_constraint=weight_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - dtype=inputs.dtype.base_dtype, - num_partitions=num_partitions, - _scope=name, - _reuse=reuse) - return layer.apply(inputs) diff --git a/twml/twml/layers/full_sparse.py b/twml/twml/layers/full_sparse.py deleted file mode 100644 index 4f0f21930..000000000 --- a/twml/twml/layers/full_sparse.py +++ /dev/null @@ -1,370 +0,0 @@ -# pylint: disable=no-member, arguments-differ, attribute-defined-outside-init, unused-argument -""" -Implementing Full Sparse Layer -""" - -import math - -from twitter.deepbird.sparse import sparse_dense_matmul - -from .layer import Layer - -import tensorflow.compat.v1 as tf -import twml - - -class FullSparse(Layer): - """Fully-sparse layer class. - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Arguments: - output_size: - Long or Integer, dimensionality of the output space. - input_size: - The number of input units. (Deprecated) - weight_initializer: - Initializer function for the weight matrix. - This argument defaults to zeros_initializer(). - This is valid when the FullSparse is the first layer of - parameters but should be changed otherwise. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect - activation: - Activation function (callable). Set it to None to maintain a linear activation. - bias_initializer: - Initializer function for the bias. - This argument defaults to tf.constant_initializer(1/output_size) - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column) - use_binary_values: - Assume all non zero values are 1. Defaults to False. 
- This can improve training if used in conjunction with MDL. - This parameter can also be a list of binary values if `inputs` passed to `call` a list. - use_compression: - Default False. Set True to enable data compression techniques for - optimization of network traffic for distributed training. - use_binary_sparse_dense_matmul: - If binary sparse dense matmul op is to be used. It will only be enabled if - `use_binary_values` is set true. It only should be used for inference, best practice is - to set `use_binary_sparse_dense_matmul = not is_training`. - """ - - def __init__(self, - output_size, - input_size=None, - weight_initializer=None, - activation=None, - bias_initializer=None, - trainable=True, - name=None, - use_sparse_grads=True, - num_partitions=None, - partition_axis=0, - use_binary_values=False, - bias_regularizer=None, - weight_regularizer=None, - use_compression=False, - use_binary_sparse_dense_matmul=False, - **kwargs): - super(FullSparse, self).__init__(trainable=trainable, name=name, **kwargs) - # TODO - remove input_size warning. - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - - # The bias initialization and weights initialization is set to match v1's implementation. - if bias_initializer is None: - bias_initializer = tf.constant_initializer(1 / output_size) - # Weights initialization is set to 0s. This is safe for full sparse layers because - # you are supposed to learn your embedding from the label. - if weight_initializer is None: - weight_initializer = tf.zeros_initializer() - self.weight_initializer = weight_initializer - self.bias_initializer = bias_initializer - self.output_size = output_size - self.activation = activation - self.use_sparse_grads = use_sparse_grads - self.num_partitions = num_partitions - if partition_axis != 0 and partition_axis != 1: - raise ValueError('partition_axis must be 0 or 1') - self.partition_axis = partition_axis - self.use_binary_values = use_binary_values - self.weight_regularizer = weight_regularizer - self.bias_regularizer = bias_regularizer - self._use_compression = use_compression - self._cast_indices_dtype = tf.int32 if self._use_compression else None - self.use_binary_sparse_dense_matmul = use_binary_sparse_dense_matmul - - def _make_weight_var(self, shape, partitioner): - self.weight = self.add_variable( - 'weight', - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=True, - partitioner=partitioner, - ) - - def build(self, input_shapes): - """ - creates the ``bias`` and ``weight`` Variables - of shape ``[output_size]`` and ``[input_size, output_size]`` respectively. - """ - - if isinstance(input_shapes, (list, tuple)): - input_shape = input_shapes[0] - is_compatible = True - for other_shape in input_shapes[1:]: - is_compatible &= input_shape.is_compatible_with(other_shape) - if not is_compatible: - raise ValueError("Input shapes %s are not compatible." % input_shapes) - else: - input_shape = input_shapes - - self.bias = self.add_variable( - 'bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - shape=[self.output_size, ], - dtype=self.dtype, - trainable=True - ) - - partitioner = None - shape = [input_shape[1], self.output_size] - - # There is a 2gb limitation for each tensor because of protobuf. - # 2**30 is 1GB. 2 * (2**30) is 2GB. 
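# Illustrative numbers for the size check below (an assumption for illustration, not from the original source):
# a float32 weight of shape [2**26, 16] is 2**26 * 16 * 4 bytes = 4 GiB in one piece;
# with num_partitions=2 each row partition is exactly 2**31 bytes and still trips the
# `requested_size >= 2**31` guard, while num_partitions=3 (~1.33 GiB per partition) passes.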
- dtype = tf.as_dtype(self.dtype) - num_partitions = 1 if self.num_partitions is None else self.num_partitions - in_shape = input_shape[1] - out_shape = self.output_size - - # when v2 behavior is disabled, in_shape is tf.Dimension. otherwise it is int. - if isinstance(in_shape, tf.Dimension): - in_shape = in_shape.value - - if in_shape is None: - raise ValueError("Input tensor should have shape." - " You can set it using twml.util.limit_sparse_tensor_size") - - (split_dim, other_dim) = (in_shape, out_shape) if self.partition_axis == 0 else (out_shape, in_shape) - requested_size = math.ceil(float(split_dim) / num_partitions) * other_dim * dtype.size - if (requested_size >= 2**31): - raise ValueError("Weight tensor partitions cannot be larger than 2GB.\n" - "Requested Dimensions(%d, %d) of type %s (%d bytes total) over %d partitions.\n" - "Possible solutions:\n" - "- reduce the params.output_size_bits\n" - "- reduce the output_size of the sparse_layer\n" - "- specify a larger num_partitions argument\n" - "- reduce input_size_bits" % - (in_shape, self.output_size, dtype.name, requested_size, num_partitions)) - - if self.num_partitions: - partition_axis = int(self.partition_axis) - partitioner = tf.fixed_size_partitioner(self.num_partitions, axis=partition_axis) - else: - # Regular variables do not like it when you pass both constant tensors and shape - if not callable(self.weight_initializer): - shape = None - - self._make_weight_var(shape, partitioner) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - A SparseTensor or a list of SparseTensors. - If `inputs` is a list, all tensors must have same `dense_shape`. - - Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + add_n([sp_a * dense_b for sp_a in inputs])`. 
- - """ - if isinstance(inputs, (list, tuple)): - - if isinstance(self.use_binary_values, (list, tuple)): - use_binary_values = self.use_binary_values - else: - use_binary_values = [self.use_binary_values] * len(inputs) - - num_inputs = len(inputs) - if num_inputs != len(use_binary_values): - raise ValueError("#inputs is %d while #use_binary_values is %d" - % (num_inputs, len(use_binary_values))) - - outputs = [] - for n in range(num_inputs): - outputs.append(sparse_dense_matmul(inputs[n], self.weight, - self.use_sparse_grads, - use_binary_values[n], - name='sparse_mm_' + str(n), - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul)) - outputs = tf.accumulate_n(outputs) - else: - - if isinstance(self.use_binary_values, (list, tuple)): - raise ValueError("use_binary_values can not be %s when inputs is %s" % - (type(self.use_binary_values), type(inputs))) - - outputs = sparse_dense_matmul(inputs, self.weight, - self.use_sparse_grads, - self.use_binary_values, - name='sparse_mm', - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul) - - if self.bias is not None: - outputs = tf.nn.bias_add(outputs, self.bias) - - if self.activation is not None: - return self.activation(outputs) # pylint: disable=not-callable - return outputs - - -def full_sparse( - inputs, output_size, - input_size=None, - activation=None, - bias_regularizer=None, - weight_regularizer=None, - bias_initializer=None, - weight_initializer=None, - trainable=True, - name=None, - reuse=None, - use_sparse_grads=True, - num_partitions=None, - partition_axis=0, - use_binary_values=False, - use_compression=False): - """Functional interface for the sparsely-connected layer. - - Arguments: - inputs: - A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) - output_size: - Long or Integer, dimensionality of the output space. - weight_initializer: - Initializer function for the weight matrix. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). 
If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column) - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. - use_compression: - Default False. Set True to enable data compression techniques for - optimization of network traffic for distributed training. - Returns: - Outputs a ``tf.Tensor`` of size ``[batch_size x output_size]``. - """ - # TODO - remove input_size warning. - if input_size: - raise ValueError('input_size is deprecated - it is now \ - automatically inferred from your input.') - - dtype = None - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - dtype = inputs.dtype.base_dtype - - if isinstance(inputs, (list, tuple)): - inputs = [inp.to_tf() if isinstance(inp, twml.SparseTensor) else inp for inp in inputs] - dtype = inputs[0].dtype.base_dtype - - layer = FullSparse(output_size=output_size, - activation=activation, - trainable=trainable, - name=name, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - weight_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - dtype=dtype, - _scope=name, - _reuse=reuse, - use_sparse_grads=use_sparse_grads, - num_partitions=num_partitions, - partition_axis=partition_axis, - use_compression=use_compression, - use_binary_values=use_binary_values) - return layer(inputs) diff --git a/twml/twml/layers/isotonic.py b/twml/twml/layers/isotonic.py deleted file mode 100644 index 7113f7af4..000000000 --- a/twml/twml/layers/isotonic.py +++ /dev/null @@ -1,76 +0,0 @@ -# pylint: disable=no-member, invalid-name, attribute-defined-outside-init -""" -Contains the Isotonic Layer -""" - -from .layer import Layer - -import libtwml -import numpy as np - - -class Isotonic(Layer): - """ - This layer is created by the IsotonicCalibrator. - Typically it is used intead of sigmoid activation on the output unit. - - Arguments: - n_unit: - number of input units to the layer (same as number of output units). - n_bin: - number of bins used for isotonic calibration. - More bins means a more precise isotonic function. - Less bins means a more regularized isotonic function. - xs_input: - A tensor containing the boundaries of the bins. - ys_input: - A tensor containing calibrated values for the corresponding bins. - - Output: - output: - A layer containing calibrated probabilities with same shape and size as input. - Expected Sizes: - xs_input, ys_input: - [n_unit, n_bin]. - Expected Types: - xs_input, ys_input: - same as input. - """ - - def __init__(self, n_unit, n_bin, xs_input=None, ys_input=None, **kwargs): - super(Isotonic, self).__init__(**kwargs) - - self._n_unit = n_unit - self._n_bin = n_bin - - self.xs_input = np.empty([n_unit, n_bin], dtype=np.float32) if xs_input is None else xs_input - self.ys_input = np.empty([n_unit, n_bin], dtype=np.float32) if ys_input is None else ys_input - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). 
- - Raises NotImplementedError. - - """ - raise NotImplementedError - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the variables of the layer.""" - - self.built = True - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: input tensor(s). - - Returns: - The output from the layer - """ - calibrate_op = libtwml.ops.isotonic_calibration(inputs, self.xs_input, self.ys_input) - return calibrate_op diff --git a/twml/twml/layers/layer.py b/twml/twml/layers/layer.py deleted file mode 100644 index c1b00eb13..000000000 --- a/twml/twml/layers/layer.py +++ /dev/null @@ -1,50 +0,0 @@ -# pylint: disable=no-member -""" -Implementing a base layer for twml -""" -import tensorflow.compat.v1 as tf -from tensorflow.python.layers import base - - -class Layer(base.Layer): - """ - Base Layer implementation for twml. - Overloads `twml.layers.Layer - `_ - from tensorflow and adds a couple of custom methods. - """ - - @property - def init(self): - """ - Return initializer ops. By default returns tf.no_op(). - This method is overwritten by classes like twml.layers.MDL, which - uses a HashTable internally, that must be initialized with its own op. - """ - return tf.no_op() - - def call(self, inputs, **kwargs): - """The logic of the layer lives here. - - Arguments: - inputs: - input tensor(s). - **kwargs: - additional keyword arguments. - - Returns: - Output tensor(s). - """ - raise NotImplementedError - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError diff --git a/twml/twml/layers/mdl.py b/twml/twml/layers/mdl.py deleted file mode 100644 index cf4018afa..000000000 --- a/twml/twml/layers/mdl.py +++ /dev/null @@ -1,256 +0,0 @@ -# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes -""" -Implementing MDL Layer -""" - - -from .layer import Layer -from .partition import Partition -from .stitch import Stitch - -import libtwml -import numpy as np -import tensorflow.compat.v1 as tf -import twml - - -class MDL(Layer): # noqa: T000 - """ - MDL layer is constructed by MDLCalibrator after accumulating data - and performing minimum description length (MDL) calibration. - - MDL takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an MDL bin. - Each MDL input feature is converted to n_bin bins. - Each MDL calibration tries to find bin delimiters such that the number of features values - per bin is roughly equal (for each given MDL feature). - Note that if an input feature is rarely used, so will its associated output bin/features. - """ - - def __init__( - self, - n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, **kwargs): - """ - Creates a non-initialized `MDL` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during MDL calibration. - This is the number of features in the hash map. 
- Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of MDL bins used for MDL calibration. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the features ID that MDL discretizes and knows about. - The hash map (hash_keys->hash_values) is used for two reasons: - 1. divide inputs into two feature spaces: MDL vs non-MDL - 2. transate the MDL features into a hash_feature ID that MDL understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for MDL. - bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the MDL features will be translated to. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, it's value bin's are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and a inputs value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). - """ - super(MDL, self).__init__(**kwargs) - tf.logging.warning("MDL will be deprecated. Please use PercentileDiscretizer instead") - - max_mdl_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - self._hash_keys_initializer = tf.constant_initializer( - hash_keys if hash_keys is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - self._hash_values_initializer = tf.constant_initializer( - hash_values if hash_values is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - self._bin_ids_initializer = tf.constant_initializer( - bin_ids if bin_ids is not None - else np.empty(max_mdl_feature, dtype=np.int64), - dtype=np.int64 - ) - self._bin_values_initializer = tf.constant_initializer( - bin_values if bin_values is not None - else np.empty(max_mdl_feature, dtype=np.float32), - dtype=np.float32 - ) - self._feature_offsets_initializer = tf.constant_initializer( - feature_offsets if feature_offsets is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - - # note that calling build here is an exception as typically __call__ would call build(). - # We call it here because we need to initialize hash_map. - # Also note that the variable_scope is set by add_variable in build() - if not self.built: - self.build(input_shape=None) - - self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer: - hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. 
- """ - - # build layers - self.partition = Partition() - self.stitch = Stitch() - - # build variables - - hash_keys = self.add_variable( - 'hash_keys', - initializer=self._hash_keys_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - hash_values = self.add_variable( - 'hash_values', - initializer=self._hash_values_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - # hashmap converts known features into range [0, n_feature) - initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) - self.hash_map = tf.lookup.StaticHashTable(initializer, -1) - - self.bin_ids = self.add_variable( - 'bin_ids', - initializer=self._bin_ids_initializer, - shape=[self._n_feature * (self._n_bin + 1)], - dtype=tf.int64, - trainable=False) - - self.bin_values = self.add_variable( - 'bin_values', - initializer=self._bin_values_initializer, - shape=[self._n_feature * (self._n_bin + 1)], - dtype=tf.float32, - trainable=False) - - self.feature_offsets = self.add_variable( - 'feature_offsets', - initializer=self._feature_offsets_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - # make sure this is last - self.built = True - - def call(self, inputs, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements MDL inference where inputs are intersected with a hash_map. - Part of the inputs are discretized using twml.mdl to produce a mdl_output SparseTensor. - This SparseTensor is then joined with the original inputs SparseTensor, - but only for the inputs keys that did not get discretized. - - Args: - inputs: A 2D SparseTensor that is input to MDL for discretization. - It has a dense_shape of [batch_size, input_size] - name: A name for the operation (optional). - Returns: - A `SparseTensor` of the same type as `inputs`. - Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. 
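 Example (a minimal sketch; in practice the layer and its hash/bin tensors come from MDLCalibrator, and `sparse_input` is assumed to be a 2D SparseTensor of dense_shape [batch_size, input_size]):

 .. code-block:: python

     mdl = MDL(n_feature=100, n_bin=10, out_bits=20)
     discretized = mdl(sparse_input)  # SparseTensor with dense_shape [batch_size, 1 << 20]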
- """ - if isinstance(inputs, tf.SparseTensor): - inputs = twml.SparseTensor.from_tf(inputs) - - assert(isinstance(inputs, twml.SparseTensor)) - - # sparse column indices - ids = inputs.ids - # sparse row indices - keys = inputs.indices - # sparse values - vals = inputs.values - - # get intersect(keys, hash_map) - hashed_keys = self.hash_map.lookup(keys) - - found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) - partition_ids = tf.cast(found, tf.int32) - - vals, key, indices = self.partition(partition_ids, vals, tf.where(found, hashed_keys, keys)) - non_mdl_keys, mdl_in_keys = key - non_mdl_vals, mdl_in_vals = vals - - self.non_mdl_keys = non_mdl_keys - - # run MDL on the keys/values it knows about - mdl_keys, mdl_vals = libtwml.ops.mdl(mdl_in_keys, mdl_in_vals, self.bin_ids, self.bin_values, - self.feature_offsets) - - # handle output ID conflicts - mdl_size = tf.size(self.bin_ids, out_type=tf.int64) - non_mdl_size = tf.subtract(self.output_size, mdl_size) - non_mdl_keys = tf.add(tf.floormod(non_mdl_keys, non_mdl_size), mdl_size) - - # Stitch the keys and values from mdl and non mdl indices back, with help - # of the Stitch Layer - - # out for inference checking - self.mdl_out_keys = mdl_keys - - concat_data = self.stitch([non_mdl_vals, mdl_vals], - [non_mdl_keys, mdl_keys], - indices) - - concat_vals, concat_keys = concat_data - - # Generate output shape using _compute_output_shape - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self.output_size] - return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError diff --git a/twml/twml/layers/partition.py b/twml/twml/layers/partition.py deleted file mode 100644 index 0e7c85f18..000000000 --- a/twml/twml/layers/partition.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Implementing partition Layer -""" - - -from .layer import Layer - -import tensorflow.compat.v1 as tf - - -class Partition(Layer): - """ - This layer implements: - - .. code-block:: python - - tf.dynamic_partition(input_vals, partition_ids, self.partitions) - - Input: - partitions: - the number of partitions which we will divide the hashmap keys/bvalues - - Output: - A layer that performs partitioning - """ - - def __init__(self, partitions=2, **kwargs): - self.partitions = partitions - super(Partition, self).__init__(**kwargs) - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, partition_ids, input_vals, input_keys, **kwargs): - """This layer is responsible for partitioning the values/keys of a hashmap - - Arguments: - partition_ids: - Tensor that is equivalent to boolean (int32). - input_vals: - Tensor that represents the values of the hashmap(float). - input_keys: - Tensor that represents the keys of the hashmap(float) - - Returns: - The output of the partition layer, which is a list of lists which looks - something like: - - .. 
code-block:: python - - [[vals_0, vals_1], [keys_0, keys_1], [indices_0, indices_1]] - - where: - vals_x: - values of the hashmap for partition x - keys_x: - keys of the hashmap for partition x - indices_x: - indices of the hashmap for partition x - """ - partioned_val = tf.dynamic_partition(input_vals, partition_ids, self.partitions) - partioned_keys = tf.dynamic_partition(input_keys, partition_ids, self.partitions) - partioned_indices = tf.dynamic_partition(tf.range(tf.shape(partition_ids)[0]), - tf.cast(partition_ids, tf.int32), self.partitions) - return [partioned_val, partioned_keys, partioned_indices] diff --git a/twml/twml/layers/percentile_discretizer.py b/twml/twml/layers/percentile_discretizer.py deleted file mode 100644 index 55bb4de8c..000000000 --- a/twml/twml/layers/percentile_discretizer.py +++ /dev/null @@ -1,209 +0,0 @@ -# pylint: disable=no-member, attribute-defined-outside-init, too-many-instance-attributes -""" -Implementing PercentileDiscretizer Layer -""" - - -import libtwml -import numpy as np -import tensorflow.compat.v1 as tf -import twml -from twml.layers import Layer - - -class PercentileDiscretizer(Layer): - """ - PercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator after - accumulating data and performing percentile bucket calibration. - - PercentileDiscretizer takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an PercentileDiscretizer bin. - Each PercentileDiscretizer input feature is converted to n_bin bins. - Each PercentileDiscretizer calibration tries to find bin delimiters such - that the number of features values per bin is roughly equal (for - each given PercentileDiscretizer feature). In other words, bins are calibrated to be approx. - equiprobable, according to the given calibration data. - Note that if an input feature is rarely used, so will its associated output bin/features. - """ - - def __init__( - self, - n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, num_parts=1, cost_per_unit=100, **kwargs): - """ - Creates a non-initialized `PercentileDiscretizer` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - If there are no calibrated features, then the discretizer will only apply - twml.util.limit_bits to the the feature keys (aka "feature_ids"). Essentially, - the discretizer will be a "no-operation", other than obeying `out_bits` - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during PercentileDiscretizer calibration. - This is the number of features in the hash map. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of PercentileDiscretizer bins used for PercentileDiscretizer calibration. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the features ID that PercentileDiscretizer discretizes and knows about. - The hash map (hash_keys->hash_values) is used for two reasons: - 1. 
divide inputs into two feature spaces: - PercentileDiscretizer vs non-PercentileDiscretizer - 2. transate the PercentileDiscretizer features into a hash_feature ID that - PercentileDiscretizer understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for PercentileDiscretizer. - bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the PercentileDiscretizer features will be translated to. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, it's value bin's are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and a inputs value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). - """ - - super(PercentileDiscretizer, self).__init__(**kwargs) - - if not self.built: - self.build(input_shape=None) - - max_discretizer_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - # build variables - self._out_bits = out_bits - self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - self._hash_keys = (hash_keys if hash_keys is not None else - np.empty(n_feature, dtype=np.int64)) - self._hash_values = (hash_values if hash_values is not None else - np.empty(n_feature, dtype=np.int64)) - self._bin_ids = (bin_ids if bin_ids is not None else - np.empty(max_discretizer_feature, dtype=np.int64)) - self._bin_values = (bin_values if bin_values is not None else - np.empty(max_discretizer_feature, dtype=np.float32)) - self._feature_offsets = (feature_offsets if feature_offsets is not None else - np.empty(n_feature, dtype=np.int64)) - self.num_parts = num_parts - self.cost_per_unit = cost_per_unit - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer - """ - self.built = True - - def call(self, inputs, keep_inputs=False, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements PercentileDiscretizer inference where inputs are intersected with a hash_map. - Input features that were not calibrated have their feature IDs truncated, so as - to be less than 1< 0: - discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2( - input_ids=keys, # inc key assigned to feature_id, or -1 - input_vals=vals, # the observed feature values - bin_ids=self._bin_ids, # n_feat X (n_bin+1) 2D arange - bin_vals=self._bin_values, # bin boundaries - feature_offsets=self._feature_offsets, # 0 : nbin_1 : max_feat - output_bits=self._out_bits, - feature_ids=tf.make_tensor_proto(self._hash_keys), # feature ids to build internal hash map - feature_indices=tf.make_tensor_proto(self._hash_values), # keys associated w/ feat. indices - start_compute=tf.constant(0, shape=[], dtype=tf.int64), - end_compute=tf.constant(-1, shape=[], dtype=tf.int64), - cost_per_unit=self.cost_per_unit - ) - else: - discretizer_keys = twml.util.limit_bits(keys, self._out_bits) - discretizer_vals = vals - # don't 2x the input. 
- keep_inputs = False - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self._output_size] - - output = twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf() - - if keep_inputs: - # Note the non-discretized features will end up doubled, - # since these are already in `output` - # handle output ID conflicts - mdl_size = self._n_feature * (self._n_bin + 1) - non_mdl_size = tf.subtract(self._output_size, mdl_size) - input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size) - - new_input = twml.SparseTensor( - ids=ids, indices=input_keys, values=vals, dense_shape=output_shape).to_tf() - - # concatenate discretizer output with original input - sparse_add = tf.sparse_add(new_input, output) - output = tf.SparseTensor(sparse_add.indices, sparse_add.values, output_shape) - - return output - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError diff --git a/twml/twml/layers/sequential.py b/twml/twml/layers/sequential.py deleted file mode 100644 index c0d4b92cc..000000000 --- a/twml/twml/layers/sequential.py +++ /dev/null @@ -1,160 +0,0 @@ -""" -Implementing Sequential Layer container -""" - - -from .layer import Layer - -from tensorflow import keras -from tensorflow.python.layers import base - - -class Sequential(Layer): - """ - A sequential stack of layers. - - Arguments: - layers: list of layers to add to the model. - - Output: - the output of the sequential layers - """ - - def __init__(self, layers=None, **kwargs): - self._layers = [] # Stack of layers. - self._layer_names = [] # Stack of layers names - self._layer_outputs = [] - # Add to the model any layers passed to the constructor. - if layers: - for layer in layers: - self.add(layer) - super(Sequential, self).__init__(**kwargs) - - def add(self, layer): - """Adds a layer instance on top of the layer stack. - - Arguments: - layer: - layer instance. - - Raises: - TypeError: - if the layer argument is not instance of base.Layer - """ - if not isinstance(layer, base.Layer) and not isinstance(layer, keras.layers.Layer): - raise TypeError('The added layer must be an instance of class Layer') - - if layer.name in self._layer_names: - raise ValueError('Layer with name %s already exists in sequential layer' % layer.name) - - self._layers.append(layer) - self._layer_names.append(layer.name) - - def pop(self): - """Removes the last layer in the model. - - Raises: - TypeError: - if there are no layers in the model. - """ - if not self._layers or not self._layer_names: - raise TypeError('There are no layers in the model.') - self._layers.pop() - self._layer_names.pop() - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - input tensor(s). 
- - Returns: - The output of the sequential layers - """ - self._layer_outputs = [] - for layer in self._layers: - # don't use layer.call because you want to build individual layers - inputs = layer(inputs) # overwrites the current input after it has been processed - self._layer_outputs.append(inputs) - return inputs - - @property - def layers(self): - """ Return the layers in the sequential layer """ - return self._layers - - @property - def layer_names(self): - """ Return the layer names in the sequential layer """ - return self._layer_names - - @property - def layer_outputs(self): - """ Return the layer outputs in the sequential layer """ - return self._layer_outputs - - def get(self, key): - """Retrieves the n-th layer. - - Arguments: - key: - index of the layer - - Output: - The n-th layer where n is equal to the key. - """ - return self._layers[key] - - def get_output(self, key): - """Retrieves the n-th layer output. - - Arguments: - key: - index of the layer - - Output: - The intermediary output equivalent to the nth layer, where n is equal to the key. - """ - return self._layer_outputs[key] - - def get_layer_by_name(self, name): - """Retrieves the layer corresponding to the name. - - Arguments: - name: - name of the layer - - Output: - list of layers that have the name desired - """ - return self._layers[self._layer_names.index(name)] - - def get_layer_output_by_name(self, name): - """Retrieves the layer output corresponding to the name. - - Arguments: - name: - name of the layer - - Output: - list of the output of the layers that have the desired name - """ - return self._layer_outputs[self._layer_names.index(name)] - - @property - def init(self): - """ returns a list of initialization ops (one per layer) """ - return [layer.init for layer in self._layers] - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError diff --git a/twml/twml/layers/sparse_max_norm.py b/twml/twml/layers/sparse_max_norm.py deleted file mode 100644 index e1f423fe0..000000000 --- a/twml/twml/layers/sparse_max_norm.py +++ /dev/null @@ -1,221 +0,0 @@ -# pylint: disable=no-member, attribute-defined-outside-init, duplicate-code -""" -Contains the twml.layers.SparseMaxNorm layer. -""" -from .layer import Layer - -from libtwml import OPLIB -import tensorflow.compat.v1 as tf -import twml - - -class SparseMaxNorm(Layer): - """ - Computes a max-normalization and adds bias to the sparse_input, - forwards that through a sparse affine transform followed - by an non-linear activation on the resulting dense representation. - - This layer has two parameters, one of which learns through gradient descent: - bias_x (optional): - vector of shape [input_size]. Learned through gradient descent. - max_x: - vector of shape [input_size]. Holds the maximas of input ``x`` for normalization. - Either calibrated through SparseMaxNorm calibrator, or calibrated online, or both. - - The pseudo-code for this layer looks like: - - .. 
code-block:: python - - abs_x = abs(x) - normed_x = clip_by_value(x / max_x, -1, 1) - biased_x = normed_x + bias_x - return biased - - - Args: - max_x_initializer: - initializer vector of shape [input_size] used by variable `max_x` - bias_x_initializer: - initializer vector of shape [input_size] used by parameter `bias_x` - is_training: - Are we training the layer to learn the normalization maximas. - If set to True, max_x will be able to learn. This is independent of bias_x - epsilon: - The minimum value used for max_x. Defaults to 1E-5. - use_bias: - Default True. Set to False to not use a bias term. - - Returns: - A layer representing the output of the sparse_max_norm transformation. - """ - - def __init__( - self, - input_size=None, - max_x_initializer=None, - bias_x_initializer=None, - is_training=True, - epsilon=1E-5, - use_bias=True, - **kwargs): - - super(SparseMaxNorm, self).__init__(**kwargs) - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - if max_x_initializer is None: - max_x_initializer = tf.zeros_initializer() - self.max_x_initializer = max_x_initializer - - self._use_bias = use_bias - if use_bias: - if bias_x_initializer is None: - bias_x_initializer = tf.zeros_initializer() - self.bias_x_initializer = bias_x_initializer - - self.epsilon = epsilon - self.is_training = is_training - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the max_x and bias_x tf.Variables of the layer.""" - - self.max_x = self.add_variable( - 'max_x', - initializer=self.max_x_initializer, - shape=[input_shape[1]], - dtype=tf.float32, - trainable=False) - - if self._use_bias: - self.bias_x = self.add_variable( - 'bias_x', - initializer=self.bias_x_initializer, - shape=[input_shape[1]], - dtype=tf.float32, - trainable=True) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def _call(self, inputs, **kwargs): # pylint: disable=unused-argument - """ - The forward propagation logic of the layer lives here. - - Arguments: - sparse_input: - A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` - Returns: - A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can - be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. 
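 A typical pattern (a minimal sketch; assumes the functional wrappers, including ``full_sparse``, are exported via ``twml.layers`` as the note above suggests):

 .. code-block:: python

     normed = sparse_max_norm(sparse_input, is_training=is_training)
     logits = full_sparse(normed, output_size=1)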
- """ - - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - elif not isinstance(inputs, tf.SparseTensor): - raise TypeError("The inputs must be of type tf.SparseTensor or twml.SparseTensor") - - indices_x = inputs.indices[:, 1] - values_x = inputs.values - - if self.is_training is False: - normalized_x = OPLIB.sparse_max_norm_inference(self.max_x, - indices_x, - values_x, - self.epsilon) - - update_op = tf.no_op() - else: - max_x, normalized_x = OPLIB.sparse_max_norm_training(self.max_x, - indices_x, - values_x, - self.epsilon) - - update_op = tf.assign(self.max_x, max_x) - - with tf.control_dependencies([update_op]): - normalized_x = tf.stop_gradient(normalized_x) - - # add input bias - if self._use_bias: - normalized_x = normalized_x + tf.gather(self.bias_x, indices_x) - - # convert back to sparse tensor - return tf.SparseTensor(inputs.indices, normalized_x, inputs.dense_shape) - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """ - The forward propagation logic of the layer lives here. - - Arguments: - sparse_input: - A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` - Returns: - A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can - be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. - """ - with tf.device(self.max_x.device): - return self._call(inputs, **kwargs) - -# For backwards compatiblity and also because I don't want to change all the tests. -MaxNorm = SparseMaxNorm - - -def sparse_max_norm(inputs, - input_size=None, - max_x_initializer=None, - bias_x_initializer=None, - is_training=True, - epsilon=1E-5, - use_bias=True, - name=None, - reuse=None): - """ - Functional inteface to SparseMaxNorm. - - Args: - inputs: - A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) - input_size: - number of input units - max_x_initializer: - initializer vector of shape [input_size] used by variable `max_x` - bias_x_initializer: - initializer vector of shape [input_size] used by parameter `bias_x` - is_training: - Are we training the layer to learn the normalization maximas. - If set to True, max_x will be able to learn. This is independent of bias_x - epsilon: - The minimum value used for max_x. Defaults to 1E-5. - use_bias: - Default True. Set to False to not use a bias term. - - Returns: - Output after normalizing with the max value. - """ - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - - layer = SparseMaxNorm(max_x_initializer=max_x_initializer, - bias_x_initializer=bias_x_initializer, - is_training=is_training, - epsilon=epsilon, - use_bias=use_bias, - name=name, - _scope=name, - _reuse=reuse) - return layer(inputs) diff --git a/twml/twml/layers/stitch.py b/twml/twml/layers/stitch.py deleted file mode 100644 index 51dffdb8e..000000000 --- a/twml/twml/layers/stitch.py +++ /dev/null @@ -1,54 +0,0 @@ -# pylint: disable=useless-super-delegation -""" -Implementing Stitch Layer -""" - - -from .layer import Layer - -import tensorflow.compat.v1 as tf - - -class Stitch(Layer): - """ - This layer is responsible for stitching a partioned layer together. - - Output: - A layer that performs stitching - """ - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. 
the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError - - def call(self, partioned_val, partioned_keys, - partioned_indices, **kwargs): # pylint: disable=unused-argument, arguments-differ - """ - This layer is responsible for stitching a partioned layer together. - - Input: - partioned_val: - a list of partioned Tensors which represent the vals of the hashmap - partioned_keys: - a list of partioned Tensors which represent the keys of the hashmap - partioned_indices: - a list of partioned Tensors which represent the indices of the hashmap - Output: - List which contains: [output_vals, output_keys] - output_vals: - Values of the HashMap (float) - output_keys: - Keys of HashMap (float) - """ - indices = [tf.to_int32(index) for index in partioned_indices] - concat_keys = tf.dynamic_stitch(indices, partioned_keys) - concat_vals = tf.dynamic_stitch(indices, partioned_val) - return [concat_vals, concat_keys] diff --git a/twml/twml/learning_rate_decay.py b/twml/twml/learning_rate_decay.py deleted file mode 100644 index be522d75b..000000000 --- a/twml/twml/learning_rate_decay.py +++ /dev/null @@ -1,168 +0,0 @@ -# pylint: disable=too-many-branches -""" This module includes functions for managing learning rate decay """ -import tensorflow.compat.v1 as tf - - -def get_learning_rate_decay_fn(params): - """ - Returns a learning rate decay function that takes the initial - learning_rate and global_step - as arguments and returns the current learning rate. - - Currently supports params.learning_rate_decay values of: - exponential | polynomial | piecewise_constant | cosine | cosine restarts. - See `Decaying the Leanring Rate - `_ for details. - - Arguments: - params: - a tensorflow.contrib.train.HParams object containing the relevant hyperparameters. 
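 Example (a minimal sketch; assumes an HParams-style object, e.g. ``tf.contrib.training.HParams`` in TF 1.x, carrying the fields required by the chosen decay):

 .. code-block:: python

     params = tf.contrib.training.HParams(
         learning_rate_decay='exponential_learning_rate_decay',
         decay_steps=10000,
         exponential_decay_rate=0.96)
     decay_fn = get_learning_rate_decay_fn(params)
     lr = decay_fn(learning_rate=0.01, global_step=tf.train.get_or_create_global_step())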
- """ - paramsv = params.values() - if 'learning_rate_decay' not in paramsv or params.learning_rate_decay == 'no_learning_rate_decay': - return None - elif params.learning_rate_decay == 'exponential_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'exponential'") - if 'exponential_decay_rate' not in paramsv: - raise ValueError("Expecting params.exponential_decay_rate for " - "params.learning_rate_decay == 'exponential'") - - def exponential_decay_fn(learning_rate, global_step): - """ exponential decay function to be passed to optimize_loss """ - return tf.train.exponential_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - decay_rate=params.exponential_decay_rate - ) - return exponential_decay_fn - elif params.learning_rate_decay == 'piecewise_constant_learning_rate_decay': - if 'piecewise_constant_boundaries' not in paramsv: - raise ValueError("Expecting params.piecewise_constant_boundaries for " - "params.learning_rate_decay == 'piecewise_constant'") - if 'piecewise_constant_values' not in paramsv: - raise ValueError("Expecting params.piecewise_constant_values for " - "params.learning_rate_decay == 'piecewise_constant'") - # pylint: disable=unused-argument - - def piecewise_constant_fn(learning_rate, global_step): - """ piecewise_constant decay function to be passed to optimize_loss """ - return tf.train.piecewise_constant( - x=global_step, - boundaries=params.piecewise_constant_boundaries, - values=params.piecewise_constant_values - ) - return piecewise_constant_fn - elif params.learning_rate_decay == 'polynomial_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'polynomial'") - if 'end_learning_rate' not in paramsv: - raise ValueError("Expecting params.end_learning_rate for " - "params.learning_rate_decay == 'polynomial'") - - def polynomial_decay_fn(learning_rate, global_step): - """ polynomial decay function to be passed to optimize_loss """ - return tf.train.polynomial_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - end_learning_rate=params.end_learning_rate, - power=params.polynomial_power if 'polynomial_power' in paramsv else 1.0, - ) - return polynomial_decay_fn - - elif params.learning_rate_decay == 'inverse_learning_rate_decay': - if 'min_learning_rate' not in paramsv: - raise ValueError("Expecting params.min_learning_rate for " - "params.learning_rate_decay == 'inverse'") - if 'decay_rate' not in paramsv: - raise ValueError("Expecting params.decay_rate for " - "params.learning_rate_decay == 'inverse'") - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'inverse'") - - def bounded_inverse_time_decay_fn(learning_rate, global_step): - ''' - Returns the decayed learning_rate by applying the function: - decayed_lr = max(lr /(1 + decay_rate * floor(global_step /decay_step)), - min_learning_rate) - Arguments: - learning_rate: - A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: - A scalar `int32` or `int64` `Tensor` or a Python number. - Global step to use for the decay computation. Must not be negative. - min_learning_rate: - A scalar `int32` or `int64` `Tensor` or a Python number. - Minimum possible learning_rate. 
The decayed learning_rate will not be - smaller than the min_learning_rate - decay_steps: - How often to apply decay. In dbv1, this should be 1. - decay_rate: - A scalar `int32` or `int64` `Tensor` or a Python number. - Rate in which we decay the learning rate. - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - ''' - decayed_rate = tf.train.inverse_time_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - decay_rate=params.decay_rate) - # Getting dtype of returned Tensor - dtype = decayed_rate.dtype - # Casting the min_learning rate the same dtype as decayes rate - min_learning_rate = tf.cast(params.min_learning_rate, dtype) - # Returning the maximum between the two - return tf.maximum(decayed_rate, min_learning_rate) - - return bounded_inverse_time_decay_fn - - elif params.learning_rate_decay == 'cosine_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'cosine_decay'") - if "alpha" not in paramsv: - raise ValueError("Expecting params.alpha for " - "params.learning_rate_decay == 'cosine_decay'") - def cosine_decay_fn(learning_rate, global_step): - """ cosine decay function to be passed to optimize_loss """ - return tf.train.cosine_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - alpha=params.alpha - ) - return cosine_decay_fn - elif params.learning_rate_decay == 'cosine_restarts_learning_rate_decay': - if 'first_decay_steps' not in paramsv: - raise ValueError("Expecting params.first_decay_steps for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if 't_mul' not in paramsv: - raise ValueError("Expecting params.t_mul for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if 'm_mul' not in paramsv: - raise ValueError("Expecting params.m_mul for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if "alpha" not in paramsv: - raise ValueError("Expecting params.alpha for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - def cosine_restart_decay_fn(learning_rate, global_step): - """ cosine decay function to be passed to optimize_loss """ - return tf.train.cosine_decay_restarts( - learning_rate=learning_rate, - global_step=global_step, - first_decay_steps=params.first_decay_steps, - t_mul=params.t_mul, - m_mul=params.m_mul, - alpha=params.alpha - ) - return cosine_restart_decay_fn - - raise ValueError("Unsupported params.learning_rate_decay: %s" % params.learning_rate_decay) diff --git a/twml/twml/lookup/__init__.py b/twml/twml/lookup/__init__.py deleted file mode 100644 index 87392d719..000000000 --- a/twml/twml/lookup/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from tensorflow.python.ops.lookup_ops import ( - index_table_from_file, - index_table_from_tensor, - index_to_string_table_from_file -) # noqa: F401 - - -""" -NOTE: Using `from tensorflow.python.ops.lookup_ops import index_table_from_tensor` in the code works. -This stub exists because it was easier to refactor code because twml is widely used. -""" diff --git a/twml/twml/metrics.py b/twml/twml/metrics.py deleted file mode 100644 index ee2f82b74..000000000 --- a/twml/twml/metrics.py +++ /dev/null @@ -1,1380 +0,0 @@ -""" -This module contains custom tensorflow metrics used at Twitter. -Its components conform to conventions used by the ``tf.metrics`` module. 
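 Each metric follows the ``tf.metrics`` (value_op, update_op) pattern. For example (a minimal sketch; ``labels``, ``predictions``, ``weights`` and ``session`` are assumed to exist):

 .. code-block:: python

     rce_value, rce_update = rce(labels, predictions, weights=weights)
     session.run(rce_update)  # accumulate statistics from the current batch
     session.run(rce_value)   # read the metric over everything accumulated so far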
- -""" - -from collections import OrderedDict -from functools import partial - -import numpy as np -import tensorboard as tb -import tensorflow.compat.v1 as tf - - -CLAMP_EPSILON = 0.00001 - - -def total_weight_metric( - labels, - predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - with tf.variable_scope(name, 'total_weight', (labels, predictions, weights)): - total_weight = _metric_variable(name='total_weight', shape=[], dtype=tf.float64) - - if weights is None: - weights = tf.cast(tf.size(labels), total_weight.dtype, name="default_weight") - else: - weights = tf.cast(weights, total_weight.dtype) - - # add up the weights to get total weight of the eval set - update_total_weight = tf.assign_add(total_weight, tf.reduce_sum(weights), name="update_op") - - value_op = tf.identity(total_weight) - update_op = tf.identity(update_total_weight) - - if metrics_collections: - tf.add_to_collections(metrics_collections, value_op) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return value_op, update_op - - -def num_samples_metric( - labels, - predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - with tf.variable_scope(name, 'num_samples', (labels, predictions, weights)): - num_samples = _metric_variable(name='num_samples', shape=[], dtype=tf.float64) - update_num_samples = tf.assign_add(num_samples, tf.cast(tf.size(labels), num_samples.dtype), name="update_op") - - value_op = tf.identity(num_samples) - update_op = tf.identity(update_num_samples) - - if metrics_collections: - tf.add_to_collections(metrics_collections, value_op) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return value_op, update_op - - -def ctr(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - # pylint: disable=unused-argument - """ - Compute the weighted average positive sample ratio based on labels - (i.e. weighted average percentage of positive labels). - The name `ctr` (click-through-rate) is from legacy. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - ctr: A `Tensor` representing positive sample ratio. - update_op: A update operation used to accumulate data into this metric. - """ - return tf.metrics.mean( - values=labels, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=name) - - -def predicted_ctr(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - # pylint: disable=unused-argument - """ - Compute the weighted average positive ratio based on predictions, - (i.e. weighted averaged predicted positive probability). - The name `ctr` (click-through-rate) is from legacy. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. 
- updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - predicted_ctr: A `Tensor` representing the predicted positive ratio. - update_op: A update operation used to accumulate data into this metric. - """ - return tf.metrics.mean( - values=predictions, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=name) - - -def prediction_std_dev(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - """ - Compute the weighted standard deviation of the predictions. - Note - this is not a confidence interval metric. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - metric value: A `Tensor` representing the value of the metric on the data accumulated so far. - update_op: A update operation used to accumulate data into this metric. - """ - with tf.variable_scope(name, 'pred_std_dev', (labels, predictions, weights)): - labels = tf.cast(labels, tf.float64) - predictions = tf.cast(predictions, tf.float64) - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float64, name="default_weight") - else: - weights = tf.cast(weights, tf.float64) - - # State kept during streaming of examples - total_weighted_preds = _metric_variable( - name='total_weighted_preds', shape=[], dtype=tf.float64) - total_weighted_preds_sq = _metric_variable( - name='total_weighted_preds_sq', shape=[], dtype=tf.float64) - total_weights = _metric_variable( - name='total_weights', shape=[], dtype=tf.float64) - - # Update state - update_total_weighted_preds = tf.assign_add(total_weighted_preds, tf.reduce_sum(weights * predictions)) - update_total_weighted_preds_sq = tf.assign_add(total_weighted_preds_sq, tf.reduce_sum(weights * predictions * predictions)) - update_total_weights = tf.assign_add(total_weights, tf.reduce_sum(weights)) - - # Compute output - def compute_output(tot_w, tot_wp, tot_wpp): - return tf.math.sqrt(tot_wpp / tot_w - (tot_wp / tot_w) ** 2) - std_dev_est = compute_output(total_weights, total_weighted_preds, total_weighted_preds_sq) - update_std_dev_est = compute_output(update_total_weights, update_total_weighted_preds, update_total_weighted_preds_sq) - - if metrics_collections: - tf.add_to_collections(metrics_collections, std_dev_est) - - if updates_collections: - tf.add_to_collections(updates_collections, update_std_dev_est) - - return std_dev_est, update_std_dev_est - - -def _get_arce_predictions(predictions, weights, label_weighted, labels, - up_weight, deprecated_rce, - total_positive, update_total_positive): - """ - Returns the ARCE predictions, total_positive, update_total_positive and weights - used by the rest of the twml.metrics.rce metric computation. 
- """ - predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") - label_weighted_comp = tf.subtract(tf.reduce_sum(weights), tf.reduce_sum(label_weighted)) - pred_weight_comp = tf.subtract(tf.reduce_sum(weights), tf.reduce_sum(predictions_weighted)) - normalizer_comp = label_weighted_comp / pred_weight_comp - - if up_weight is False: - total_positive_unweighted = _metric_variable( - name='total_positive_unweighted', shape=[], dtype=tf.float32) - - update_total_positive_unweighted = tf.assign_add( - total_positive_unweighted, tf.reduce_sum(labels), - name="total_positive_unweighted_update") - - if deprecated_rce: - normalizer = tf.reduce_sum(labels) / tf.reduce_sum(label_weighted) - else: - # sum of labels / sum of weighted labels - normalizer = update_total_positive_unweighted / update_total_positive - - label_comp = tf.subtract(tf.to_float(tf.size(labels)), tf.reduce_sum(labels)) - normalizer_comp = label_comp / label_weighted_comp - - # note that up_weight=True changes these for the rest of the twml.metric.rce computation - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - total_positive = total_positive_unweighted - update_total_positive = update_total_positive_unweighted - else: - if deprecated_rce: - normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - else: - # normalizer used for NRCE (and ARCE with up_weight=True) - total_prediction = _metric_variable(name='total_prediction', shape=[], dtype=tf.float32) - - # update the variable holding the sum of weighted predictions - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(predictions_weighted), name="total_prediction_update") - - # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - # but it measure normalizer over batch was too flawed an approximation. - normalizer = update_total_positive / update_total_prediction - - pred_comp = tf.subtract(tf.ones(shape=tf.shape(labels), dtype=tf.float32), predictions) - pred_comp_norm = tf.multiply(pred_comp, normalizer_comp, name="normalized_predictions_comp") - pred_num = tf.multiply(predictions, normalizer, name="normalized_pred_numerator") - pred_denom = tf.add(pred_num, pred_comp_norm, name="normalized_pred_denominator") - predictions = pred_num / pred_denom - - return predictions, total_positive, update_total_positive, weights - - -def rce(labels, predictions, - weights=None, - normalize=False, - arce=False, - up_weight=True, - metrics_collections=None, - updates_collections=None, - name=None, - deprecated_rce=False): - """ - Compute the relative cross entropy (RCE). - The RCE is a relative measurement compared to the baseline model's performance. - The baseline model always predicts average click-through-rate (CTR). - The RCE measures, in percentage, how much better the predictions are, compared - to the baseline model, in terms of cross entropy loss. - - y = label; p = prediction; - binary cross entropy = y * log(p) + (1-y) * log(1-p) - - Args: - labels: - the ground true value. - predictions: - the predicted values, whose shape must match labels. - weights: - optional weights, whose shape must match labels . Weight is 1 if not set. - normalize: - if set to true, produce NRCEs used at Twitter. (normalize preds by weights first) - NOTE: if you don't understand what NRCE is, please don't use it. - arce: - if set to true, produces `ARCE `_. - This can only be activated if `normalize=True`. 
- up_weight: - if set to true, produces arce in the up_weighted space (considers CTR after up_weighting - data), while False gives arce in the original space (only considers CTR before up_weighting). - In the actual version, this flag can only be activated if arce is True. - Notice that the actual version of NRCE corresponds to up_weight=True. - metrics_collections: - optional list of collections to add this metric into. - updates_collections: - optional list of collections to add the associated update_op into. - name: - an optional variable_scope name. - deprecated_rce: - enables the previous NRCE/ARCE calculations which calculated some label metrics - on the batch instead of on all batches seen so far. Note that the older metric - calculation is less stable, especially for smaller batch sizes. You should probably - never have to set this to True. - - Return: - rce_value: - A ``Tensor`` representing the RCE. - update_op: - A update operation used to accumulate data into this metric. - - .. note:: Must have at least 1 positive and 1 negative sample accumulated, - or RCE will come out as NaN. - """ - with tf.variable_scope(name, 'rce', (labels, predictions, weights)): - labels = tf.to_float(labels, name="label_to_float") - predictions = tf.to_float(predictions, name="predictions_to_float") - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - else: - weights = tf.to_float(weights, name="weight_to_float") - - total_positive = _metric_variable(name='total_positive', shape=[], dtype=tf.float32) - total_loss = _metric_variable(name='total_loss', shape=[], dtype=tf.float32) - total_weight = _metric_variable(name='total_weight', shape=[], dtype=tf.float32) - - label_weighted = tf.multiply(labels, weights, name="weighted_label") - - update_total_positive = tf.assign_add( - total_positive, tf.reduce_sum(label_weighted), name="total_pos_update") - - if arce: - if normalize is False: - raise ValueError('This configuration of parameters is not actually allowed') - - predictions, total_positive, update_total_positive, weights = _get_arce_predictions( - predictions=predictions, weights=weights, deprecated_rce=deprecated_rce, - label_weighted=label_weighted, labels=labels, up_weight=up_weight, - total_positive=total_positive, update_total_positive=update_total_positive) - - elif normalize: - predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") - - if deprecated_rce: - normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - else: - total_prediction = _metric_variable(name='total_prediction', shape=[], dtype=tf.float32) - - # update the variable holding the sum of weighted predictions - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(predictions_weighted), name="total_prediction_update") - - # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - # but it measure normalizer over batch was too flawed an approximation. 
- normalizer = update_total_positive / update_total_prediction - - # NRCE - predictions = tf.multiply(predictions, normalizer, name="normalized_predictions") - - # clamp predictions to keep log(p) stable - clip_p = tf.clip_by_value(predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p") - logloss = _binary_cross_entropy(pred=clip_p, target=labels, name="logloss") - - logloss_weighted = tf.multiply(logloss, weights, name="weighted_logloss") - - update_total_loss = tf.assign_add( - total_loss, tf.reduce_sum(logloss_weighted), name="total_loss_update") - update_total_weight = tf.assign_add( - total_weight, tf.reduce_sum(weights), name="total_weight_update") - - # metric value retrieval subgraph - ctr1 = tf.truediv(total_positive, total_weight, name="ctr") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - baseline_ce = _binary_cross_entropy(pred=ctr1, target=ctr1, name="baseline_ce") - pred_ce = tf.truediv(total_loss, total_weight, name="pred_ce") - - rce_t = tf.multiply( - 1.0 - tf.truediv(pred_ce, baseline_ce), - 100, - name="rce") - - # metric update subgraph - ctr2 = tf.truediv(update_total_positive, update_total_weight, name="ctr_update") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - baseline_ce2 = _binary_cross_entropy(pred=ctr2, target=ctr2, name="baseline_ce_update") - pred_ce2 = tf.truediv(update_total_loss, update_total_weight, name="pred_ce_update") - - update_op = tf.multiply( - 1.0 - tf.truediv(pred_ce2, baseline_ce2), - 100, - name="update_op") - - if metrics_collections: - tf.add_to_collections(metrics_collections, rce_t) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return rce_t, update_op - - -def ce(p_true, p_est=None): - if p_est is None: - p_est = p_true - return _binary_cross_entropy(pred=p_est, target=p_true, name=None) - - -def rce_transform(outputs, labels, weights): - ''' - Construct an OrderedDict of quantities to aggregate over eval batches - outputs, labels, weights are TensorFlow tensors, and are assumed to - be of shape [N] for batch_size = N - Each entry in the output OrderedDict should also be of shape [N] - ''' - out_vals = OrderedDict() - out_vals['weighted_loss'] = weights * ce(p_true=labels, p_est=outputs) - out_vals['weighted_labels'] = labels * weights - out_vals['weight'] = weights - return out_vals - - -def rce_metric(aggregates): - ''' - input ``aggregates`` is an OrderedDict with the same keys as those created - by rce_transform(). The dict values are the aggregates (reduce_sum) - of the values produced by rce_transform(), and should be scalars. - output is the value of RCE - ''' - # cummulative weighted loss of model predictions - total_weighted_loss = aggregates['weighted_loss'] - total_weighted_labels = aggregates['weighted_labels'] - total_weight = aggregates['weight'] - - model_average_loss = total_weighted_loss / total_weight - baseline_average_loss = ce(total_weighted_labels / total_weight) - return 100.0 * (1 - model_average_loss / baseline_average_loss) - - -def metric_std_err(labels, predictions, - weights=None, - transform=rce_transform, metric=rce_metric, - metrics_collections=None, - updates_collections=None, - name='rce_std_err'): - """ - Compute the weighted standard error of the RCE metric on this eval set. 
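
To make the RCE formula concrete before discussing its standard error, here is a small self-contained NumPy version of the definition above (not the streaming TF implementation): the baseline always predicts the observed positive rate, and RCE is the percentage improvement in cross entropy over that baseline. The toy labels and predictions are illustrative only.

```python
import numpy as np

def cross_entropy(target, pred):
    pred = np.clip(pred, 1e-5, 1 - 1e-5)  # same clamping idea as CLAMP_EPSILON
    return -np.mean(target * np.log(pred) + (1 - target) * np.log(1 - pred))

labels = np.array([1., 0., 0., 1., 0.])
preds = np.array([0.9, 0.2, 0.1, 0.7, 0.3])

baseline_ce = cross_entropy(labels, np.full_like(labels, labels.mean()))
model_ce = cross_entropy(labels, preds)
rce_value = (1.0 - model_ce / baseline_ce) * 100.0  # > 0 means better than predicting the base rate
```
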
- This can be used for confidence intervals and unpaired hypothesis tests. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - transform: a function of the following form: - - .. code-block:: python - - def transform(outputs, labels, weights): - out_vals = OrderedDict() - ... - return out_vals - - where outputs, labels, and weights are all tensors of shape [eval_batch_size]. - The returned OrderedDict() should have values that are tensors of shape [eval_batch_size]. - These will be aggregated across many batches in the eval dataset, to produce - one scalar value per key of out_vals. - metric: a function of the following form - - .. code-block:: python - - def metric(aggregates): - ... - return metric_value - - where aggregates is an OrderedDict() having the same keys created by transform(). - Each of the corresponding dict values is the reduce_sum of the values produced by - transform(), and is a TF scalar. The return value should be a scalar representing - the value of the desired metric. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - metric value: A `Tensor` representing the value of the metric on the data accumulated so far. - update_op: A update operation used to accumulate data into this metric. - """ - with tf.variable_scope(name, 'metric_std_err', (labels, predictions, weights)): - labels = tf.cast(labels, tf.float64) - predictions = tf.cast(predictions, tf.float64) - - if weights is None: - weights = tf.ones_like(labels, dtype=tf.float64, name="default_weight") - else: - weights = tf.cast(weights, tf.float64) - - labels = tf.reshape(labels, [-1]) - predictions = tf.reshape(predictions, [-1]) - predictions = tf.clip_by_value(predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p") - weights = tf.reshape(weights, [-1]) - - # first apply the supplied transform function to the output, label, weight data - # returns an OrderedDict of 1xN tensors for N input samples - # for each sample, compute f = transform(pred, l, w) - transformed = transform(predictions, labels, weights) - - # we track 3 types of aggregate information - # 1. total number of samples - # 2. aggregated transformed samples (moment1), i.e. sum(f) - # 3. aggregated crosses of transformed samples (moment2), i.e. sum(f*f^T) - - # count total number of samples - sample_count = _metric_variable( - name='sample_count', shape=[], dtype=tf.int64) - update_sample_count = tf.assign_add(sample_count, tf.size(labels, out_type=sample_count.dtype)) - - # compose the ordered dict into a single vector - # so f can be treated as a single column vector rather than a collection of scalars - N = len(transformed) - transformed_vec = tf.stack(list(transformed.values()), axis=1) - - # compute and update transformed samples (1st order statistics) - # i.e. accumulate f into F as F += sum(f) - aggregates_1 = _metric_variable( - name='aggregates_1', shape=[N], dtype=tf.float64) - update_aggregates_1 = tf.assign_add(aggregates_1, tf.reduce_sum(transformed_vec, axis=0)) - - # compute and update crossed transformed samples (2nd order statistics) - # i.e. 
accumulate f*f^T into F2 as F2 += sum(f*transpose(f)) - aggregates_2 = _metric_variable( - name='aggregates_2', shape=[N, N], dtype=tf.float64) - moment_2_temp = ( - tf.reshape(transformed_vec, shape=[-1, N, 1]) - * tf.reshape(transformed_vec, shape=[-1, 1, N]) - ) - update_aggregates_2 = tf.assign_add(aggregates_2, tf.reduce_sum(moment_2_temp, axis=0)) - - def compute_output(agg_1, agg_2, samp_cnt): - # decompose the aggregates back into a dict to pass to the user-supplied metric fn - aggregates_dict = OrderedDict() - for i, key in enumerate(transformed.keys()): - aggregates_dict[key] = agg_1[i] - - metric_value = metric(aggregates_dict) - - # derivative of metric with respect to the 1st order aggregates - # i.e. d M(agg1) / d agg1 - metric_prime = tf.gradients(metric_value, agg_1, stop_gradients=agg_1) - - # estimated covariance of agg_1 - # cov(F) = sum(f*f^T) - (sum(f) * sum(f)^T) / N - # = agg_2 - (agg_1 * agg_1^T) / N - N_covariance_estimate = agg_2 - ( - tf.reshape(agg_1, shape=[-1, 1]) - @ tf.reshape(agg_1, shape=[1, -1]) - / tf.cast(samp_cnt, dtype=tf.float64) - ) - - # push N_covariance_estimate through a linearization of metric around agg_1 - # metric var = transpose(d M(agg1) / d agg1) * cov(F) * (d M(agg1) / d agg1) - metric_variance = ( - tf.reshape(metric_prime, shape=[1, -1]) - @ N_covariance_estimate - @ tf.reshape(metric_prime, shape=[-1, 1]) - ) - # result should be a single element, but the matmul is 2D - metric_variance = metric_variance[0][0] - metric_stderr = tf.sqrt(metric_variance) - return metric_stderr - - metric_stderr = compute_output(aggregates_1, aggregates_2, sample_count) - update_metric_stderr = compute_output(update_aggregates_1, update_aggregates_2, update_sample_count) - - if metrics_collections: - tf.add_to_collections(metrics_collections, metric_stderr) - - if updates_collections: - tf.add_to_collections(updates_collections, update_metric_stderr) - - return metric_stderr, update_metric_stderr - - -def lolly_nrce(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - """ - Compute the Lolly NRCE. - - Note: As this NRCE calculation uses Taylor expansion, it becomes inaccurate when the ctr is large, - especially when the adjusted ctr goes above 1.0. - - Calculation: - - :: - - NRCE: lolly NRCE - BCE: baseline cross entropy - NCE: normalized cross entropy - CE: cross entropy - y_i: label of example i - p_i: prediction of example i - y: ctr - p: average prediction - a: normalizer - - Assumes any p_i and a * p_i is within [0, 1) - NRCE = (1 - NCE / BCE) * 100 - BCE = - sum_i(y_i * log(y) + (1 - y_i) * log(1 - y)) - = - (y * log(y) + (1 - y) * log(1 - y)) - a = y / p - CE = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) - NCE = - sum_i(y_i * log(a * p_i) + (1 - y_i) * log(1 - a * p_i)) - = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) - - sum_i(y_i * log(a)) - + sum_i((1 - y_i) * log(1 - p_i)) - - sum_i((1 - y_i) * log(1 - a * p_i)) - ~= CE - sum_i(y_i) * log(a) - + sum_i((1 - y_i) * (- sum_{j=1~5}(p_i^j / j))) - - sum_i((1 - y_i) * (- sum_{j=1~5}(a^j * p_i^j / j))) - # Takes 5 items from the Taylor expansion, can be increased if needed - # Error for each example is O(p_i^6) - = CE - sum_i(y_i) * log(a) - - sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) / j) - + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * a^j / j) - = CE - sum_i(y_i) * log(a) - + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * (a^j - 1) / j) - - Thus we keep track of CE, sum_i(y_i), sum_i((1 - y_i) * p_i^j) for j=1~5. 
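
A quick numeric check of the truncation used above, where x plays the role of a * p_i: log(1 - x) = -sum_{j>=1} x^j / j, and cutting the series at j = 5 leaves a per-example error of order x^6 (toy value, illustrative only):

```python
import numpy as np

x = 0.1                                          # a * p_i in the expansion above
approx = -sum(x ** j / j for j in range(1, 6))   # five-term truncation
exact = np.log(1.0 - x)
assert abs(approx - exact) < x ** 6              # remainder is O(x^6)
```
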
- We also keep track of p and y by sum_i(y_i), sum_i(p_i), sum_i(1) so that - we can get a at the end, which leads to this NRCE. - - NRCE uses ctr and average pctr to normalize the pctrs. - It removes the impact of prediction error from RCE. - Usually NRCE is higher as the prediction error impact on RCE is negative. - Removing prediction error in our model can make RCE closer to NRCE and thus improve RCE. - - In Lolly NRCE we use ctr and average pctr of the whole dataset. - We thus remove the dataset level error in NRCE calculation. - In this case, when we want to improve RCE to the level of NRCE, - it is achievable as dataset level prediction error is easy to remove by calibration. - Lolly NRCE is thus a good estimate about the potential gain by adding calibration. - - In DBv2 NRCE, we use per-batch ctr and average pctr. We remove the batch level error. - This error is difficult to remove by modeling improvement, - at least not by simple calibration. - It thus cannot indicate the same opportunity as the Lolly NRCE does. - - Args: - labels: - the ground true value. - predictions: - the predicted values, whose shape must match labels. - weights: - optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: - optional list of collections to add this metric into. - updates_collections: - optional list of collections to add the associated update_op into. - name: - an optional variable_scope name. - - Return: - rce_value: - A ``Tensor`` representing the RCE. - update_op: - A update operation used to accumulate data into this metric. - - Note: Must have at least 1 positive and 1 negative sample accumulated, - or NRCE will come out as NaN. - """ - with tf.variable_scope(name, "lolly_nrce", (labels, predictions, weights)): - labels = tf.to_float(labels, name="label_to_float") - predictions = tf.to_float(predictions, name="predictions_to_float") - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - else: - weights = tf.to_float(weights, name="weight_to_float") - - positive_weights = tf.multiply(labels, weights, name="positive_weights") - - # clamp predictions to keep log(p) stable - clip_predictions = tf.clip_by_value( - predictions, - CLAMP_EPSILON, - 1.0 - CLAMP_EPSILON, - name="clip_predictions") - weighted_predictions = tf.multiply( - predictions, weights, - name="weighted_predictions") - - logloss = _binary_cross_entropy(pred=clip_predictions, target=labels, name="logloss") - weighted_logloss = tf.multiply(logloss, weights, name="weighted_logloss") - - negatives = tf.subtract( - tf.ones(shape=tf.shape(labels), dtype=tf.float32), - labels, - name="negatives") - negative_predictions = tf.multiply( - predictions, - negatives, - name="negative_predictions") - weighted_negative_predictions = tf.multiply( - negative_predictions, weights, - name="weighted_negative_predictions") - negative_squared_predictions = tf.multiply( - negative_predictions, - negative_predictions, - name="negative_squared_predictions") - weighted_negative_squared_predictions = tf.multiply( - negative_squared_predictions, weights, - name="weighted_negative_squared_predictions") - negative_cubed_predictions = tf.multiply( - negative_squared_predictions, - negative_predictions, - name="negative_cubed_predictions") - weighted_negative_cubed_predictions = tf.multiply( - negative_cubed_predictions, weights, - name="weighted_negative_cubed_predictions") - negative_quartic_predictions = tf.multiply( - negative_cubed_predictions, - 
negative_predictions, - name="negative_quartic_predictions") - weighted_negative_quartic_predictions = tf.multiply( - negative_quartic_predictions, weights, - name="weighted_negative_quartic_predictions") - negative_quintic_predictions = tf.multiply( - negative_quartic_predictions, - negative_predictions, - name="negative_quintic_predictions") - weighted_negative_quintic_predictions = tf.multiply( - negative_quintic_predictions, weights, - name="weighted_negative_quintic_predictions") - - # Tracked stats - total_positive = _metric_variable(name="total_positive", shape=[], dtype=tf.float32) - total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) - - total_prediction = _metric_variable(name="total_prediction", shape=[], dtype=tf.float32) - - total_negative_prediction = _metric_variable( - name="total_negative_prediction", - shape=[], dtype=tf.float32) - total_negative_squared_prediction = _metric_variable( - name="total_negative_squared_prediction", - shape=[], dtype=tf.float32) - total_negative_cubed_prediction = _metric_variable( - name="total_negative_cubed_prediction", - shape=[], dtype=tf.float32) - total_negative_quartic_prediction = _metric_variable( - name="total_negative_quartic_prediction", - shape=[], dtype=tf.float32) - total_negative_quintic_prediction = _metric_variable( - name="total_negative_quintic_prediction", - shape=[], dtype=tf.float32) - - total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) - - # Update tracked stats - update_total_positive = tf.assign_add( - total_positive, tf.reduce_sum(positive_weights), name="total_positive_update") - update_total_weight = tf.assign_add( - total_weight, tf.reduce_sum(weights), name="total_weight_update") - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(weighted_predictions), name="total_prediction_update") - update_total_negative_prediction = tf.assign_add( - total_negative_prediction, - tf.reduce_sum(weighted_negative_predictions), name="total_negative_prediction_update") - update_total_negative_squared_prediction = tf.assign_add( - total_negative_squared_prediction, - tf.reduce_sum(weighted_negative_squared_predictions), - name="total_negative_squared_prediction_update") - update_total_negative_cubed_prediction = tf.assign_add( - total_negative_cubed_prediction, - tf.reduce_sum(weighted_negative_cubed_predictions), - name="total_negative_cubed_prediction_update") - update_total_negative_quartic_prediction = tf.assign_add( - total_negative_quartic_prediction, - tf.reduce_sum(weighted_negative_quartic_predictions), - name="total_negative_quartic_prediction_update") - update_total_negative_quintic_prediction = tf.assign_add( - total_negative_quintic_prediction, - tf.reduce_sum(weighted_negative_quintic_predictions), - name="total_negative_quintic_prediction_update") - update_total_loss = tf.assign_add( - total_loss, tf.reduce_sum(weighted_logloss), name="total_loss_update") - - # metric value retrieval subgraph - # ctr of this batch - positive_rate = tf.truediv(total_positive, total_weight, name="positive_rate") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. 
- baseline_loss = _binary_cross_entropy( - pred=positive_rate, - target=positive_rate, - name="baseline_loss") - - # normalizing ratio for nrce - # calculated using total ctr and pctr so the last batch has the dataset ctr and pctr - normalizer = tf.truediv(total_positive, total_prediction, name="normalizer") - # Taylor expansion to calculate nl = - sum(y * log(p * a) + (1 - y) * log (1 - p * a)) - # log(1 - p * a) = -sum_{i=1~+inf}(a^i * x^i / i) - # log(1 - p) = -sum_{i=1~+inf}(a^i * x^i / i) - normalized_loss = ( - total_loss - - total_positive * tf.log(normalizer) + - total_negative_prediction * (normalizer - 1) + - total_negative_squared_prediction * (normalizer * normalizer - 1) / 2 + - total_negative_cubed_prediction * - (normalizer * normalizer * normalizer - 1) / 3 + - total_negative_quartic_prediction * - (normalizer * normalizer * normalizer * normalizer - 1) / 4 + - total_negative_quintic_prediction * - (normalizer * normalizer * normalizer * normalizer * normalizer - 1) / 5) - - # average normalized loss - avg_loss = tf.truediv(normalized_loss, total_weight, name="avg_loss") - - nrce_t = tf.multiply( - 1.0 - tf.truediv(avg_loss, baseline_loss), - 100, - name="lolly_nrce") - - # metric update subgraph - update_positive_rate = tf.truediv( - update_total_positive, - update_total_weight, - name="update_positive_rate") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - update_baseline_loss = _binary_cross_entropy( - pred=update_positive_rate, - target=update_positive_rate, - name="update_baseline_loss") - - update_normalizer = tf.truediv( - update_total_positive, - update_total_prediction, - name="update_normalizer") - update_normalized_loss = ( - update_total_loss - - update_total_positive * tf.log(update_normalizer) + - update_total_negative_prediction * - (update_normalizer - 1) + - update_total_negative_squared_prediction * - (update_normalizer * update_normalizer - 1) / 2 + - update_total_negative_cubed_prediction * - (update_normalizer * update_normalizer * update_normalizer - 1) / 3 + - update_total_negative_quartic_prediction * - (update_normalizer * update_normalizer * update_normalizer * - update_normalizer - 1) / 4 + - update_total_negative_quintic_prediction * - (update_normalizer * update_normalizer * update_normalizer * - update_normalizer * update_normalizer - 1) / 5) - - update_avg_loss = tf.truediv( - update_normalized_loss, - update_total_weight, - name="update_avg_loss") - - update_op = tf.multiply( - 1.0 - tf.truediv(update_avg_loss, update_baseline_loss), - 100, - name="update_op") - - if metrics_collections: - tf.add_to_collections(metrics_collections, nrce_t) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return nrce_t, update_op - - -def _binary_cross_entropy(pred, target, name): - return - tf.add( - target * tf.log(pred), - (1.0 - target) * tf.log(1.0 - pred), - name=name) - - -# Copied from metrics_impl.py with minor modifications. 
-# https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39 -def _metric_variable(shape, dtype, validate_shape=True, name=None): - """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" - - return tf.Variable( - lambda: tf.zeros(shape, dtype), - trainable=False, - collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], - validate_shape=validate_shape, - name=name) - -PERCENTILES = np.linspace(0, 1, 101, dtype=np.float32) - -# metric_name: (metric, requires thresholded output) -SUPPORTED_BINARY_CLASS_METRICS = { - # TWML metrics - 'total_weight': (total_weight_metric, False), - 'num_samples': (num_samples_metric, False), - 'rce': (rce, False), - 'rce_std_err': (partial(metric_std_err, transform=rce_transform, metric=rce_metric, name='rce_std_err'), False), - 'nrce': (partial(rce, normalize=True), False), - 'lolly_nrce': (lolly_nrce, False), - 'arce': (partial(rce, normalize=True, arce=True), False), - 'arce_original': (partial(rce, normalize=True, arce=True, up_weight=False), False), - # CTR measures positive sample ratio. This terminology is inherited from Ads. - 'ctr': (ctr, False), - # predicted CTR measures predicted positive ratio. - 'predicted_ctr': (predicted_ctr, False), - 'pred_std_dev': (prediction_std_dev, False), - # thresholded metrics - 'accuracy': (tf.metrics.accuracy, True), - 'precision': (tf.metrics.precision, True), - 'recall': (tf.metrics.recall, True), - - 'false_positives': (tf.metrics.false_positives, True), - 'false_negatives': (tf.metrics.false_negatives, True), - 'true_positives': (tf.metrics.true_positives, True), - 'true_negatives': (tf.metrics.true_negatives, True), - - 'precision_at_percentiles': (partial(tf.metrics.precision_at_thresholds, thresholds=PERCENTILES), False), - 'recall_at_percentiles': (partial(tf.metrics.recall_at_thresholds, thresholds=PERCENTILES), False), - 'false_positives_at_percentiles': (partial(tf.metrics.false_positives_at_thresholds, thresholds=PERCENTILES), False), - 'false_negatives_at_percentiles': (partial(tf.metrics.false_negatives_at_thresholds, thresholds=PERCENTILES), False), - 'true_positives_at_percentiles': (partial(tf.metrics.true_positives_at_thresholds, thresholds=PERCENTILES), False), - 'true_negatives_at_percentiles': (partial(tf.metrics.true_negatives_at_thresholds, thresholds=PERCENTILES), False), - - # tensorflow metrics - 'roc_auc': (partial(tf.metrics.auc, curve='ROC', - summation_method='careful_interpolation'), False), - 'pr_auc': (partial(tf.metrics.auc, curve='PR', - summation_method='careful_interpolation'), False), - - # tensorboard curves - 'pr_curve': (tb.summary.v1.pr_curve_streaming_op, False), - - # deprecated metrics - 'deprecated_nrce': (partial(rce, normalize=True, deprecated_rce=True), False), - 'deprecated_arce': (partial(rce, normalize=True, arce=True, deprecated_rce=True), False), - 'deprecated_arce_original': (partial(rce, normalize=True, arce=True, - up_weight=False, deprecated_rce=True), False) -} - -# default metrics provided by get_binary_class_metric_fn -DEFAULT_BINARY_CLASS_METRICS = ['total_weight', 'num_samples', 'rce', 'rce_std_err', - 'nrce', 'arce', 'ctr', 'predicted_ctr', 'pred_std_dev', - 'accuracy', 'precision', 'recall', 'roc_auc', 'pr_auc'] - - -def get_binary_class_metric_fn(metrics=None): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... 
- return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for binary classification. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - The following graph_output keys are recognized: - output: - the raw predictions between 0 and 1. Required. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from following supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - - ctr (same as positive sample ratio.) - - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - `arce `_ (a more recent proposed improvment over NRCE) - - arce_original - - lolly_nrce (NRCE as it is computed in Lolly, with Taylor expansion) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - pr_curve (precision-recall curve) - - deprecated_arce (ARCE as it was calculated before a stability fix) - - deprecated_nrce (NRCE as it was calculated before a stability fix) - - Example of metrics list with mixture of string and tuple: - metrics = [ - 'rce','nrce', - 'roc_auc', # default roc_auc metric - ( - 'roc_auc_500', # give this metric a name - partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500), # the metric fn - False, # whether the metric requires thresholded output - )] - - NOTE: When predicting rare events roc_auc can be underestimated. Increasing num_threshold - can reduce the underestimation. See go/roc-auc-pitfall for more details. - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - """ - # pylint: disable=dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(DEFAULT_BINARY_CLASS_METRICS) - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. 
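
A hypothetical usage sketch of `get_binary_class_metric_fn`, mixing built-in metric names with a custom `(name, fn, needs_threshold)` tuple; the `preds`, `labels`, and `weights` tensors are assumed to come from the model graph:

```python
from functools import partial
import tensorflow.compat.v1 as tf

metric_fn = get_binary_class_metric_fn(metrics=[
    'rce',
    'ctr',
    ('roc_auc_500',                              # custom display name
     partial(tf.metrics.auc, curve='ROC',
             summation_method='careful_interpolation', num_thresholds=500),
     False),                                     # does not need thresholded (hard) output
])

# graph_output is the dict returned by build_graph; 'output' holds raw probabilities
eval_metric_ops = metric_fn({'output': preds}, labels, weights)
# eval_metric_ops maps metric name -> (value_op, update_op), suitable for
# tf.estimator.EstimatorSpec(eval_metric_ops=...)
```
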
- """ - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - if hard_preds is None: - hard_preds = tf.greater_equal(preds, threshold) - - # add metrics to eval_metric_ops dict - for metric in metrics: - if isinstance(metric, tuple) and len(metric) == 3: - metric_name, metric_factory, requires_threshold = metric - metric_name = metric_name.lower() - elif isinstance(metric, str): - metric_name = metric.lower() # metric name are case insensitive. - metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - else: - raise ValueError("Metric should be either string or tuple of length 3.") - - if metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - if metric_factory: - value_op, update_op = metric_factory( - labels=labels, - predictions=(hard_preds if requires_threshold else preds), - weights=weights, name=metric_name) - eval_metric_ops[metric_name] = (value_op, update_op) - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops - - -def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for concatenated binary classifications. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - In multiple binary classification problems, the - ``predictions`` (that is, ``graph_output['output']``) - are expected to have shape ``batch_size x n_classes``, - where ``n_classes`` is the number of binary classification. - Binary classification at output[i] is expected to discriminate between ``classes[i]`` (1) - and NOT ``classes[i]`` (0). The labels should be of the same shape as ``graph_output`` - with binary values (0 or 1). The weights can be of size ``batch_size`` or - ``batch_size x n_classes``. The ``class_dim`` contain separate probabilities, - and need to have separate metrics. - - The following graph_output keys are recognized: - output: - the raw predictions between 0 and 1. Required. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - metrics (list of Metrics): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from following supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - - ctr (same as positive sample ratio.) 
- - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - pr_curve (precision-recall curve) - - Example of metrics list with mixture of string and tuple: - metrics = [ - 'rce','nrce', - 'roc_auc', # default roc_auc metric - ( - 'roc_auc_500', # give this metric a name - partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500), # the metric fn - False, # whether the metric requires thresholded output - )] - - NOTE: When prediction on rare events, roc_auc can be underestimated. Increase num_threshold - can reduce the underestimation. See go/roc-auc-pitfall for more details. - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - - classes (list of strings): - In case of multiple binary class models, the names for each class or label. - These are used to display metrics on tensorboard. - If these are not specified, the index in the class or label dimension is used, and you'll - get metrics on tensorboard named like: accuracy_0, accuracy_1, etc. - - class_dim (number): - Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes. - """ - # pylint: disable=invalid-name,dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(DEFAULT_BINARY_CLASS_METRICS) - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. - """ - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - if hard_preds is None: - hard_preds = tf.greater_equal(preds, threshold) - - shape = labels.get_shape() - # basic sanity check: multi_metric dimension must exist - assert len(shape) > class_dim, "Dimension specified by class_dim does not exist." - - num_labels = shape[class_dim] - # If we are doing multi-class / multi-label metric, the number of classes / labels must - # be know at graph construction time. This dimension cannot have size None. - assert num_labels is not None, "The multi-metric dimension cannot be None." 
- assert classes is None or len(classes) == num_labels, ( - "Number of classes must match the number of labels") - - weights_shape = weights.get_shape() if weights is not None else None - if weights_shape is None: - num_weights = None - elif len(weights_shape) > 1: - num_weights = weights_shape[class_dim] - else: - num_weights = 1 - - for i in range(num_labels): - - # add metrics to eval_metric_ops dict - for metric in metrics: - if isinstance(metric, tuple) and len(metric) == 3: - metric_name, metric_factory, requires_threshold = metric - metric_name = metric_name.lower() - elif isinstance(metric, str): - metric_name = metric.lower() # metric name are case insensitive. - metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - else: - raise ValueError("Metric should be either string or tuple of length 3.") - - class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i)) - - if class_metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - class_labels = tf.gather(labels, indices=[i], axis=class_dim) - class_preds = tf.gather(preds, indices=[i], axis=class_dim) - class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) - - if num_weights is None: - class_weights = None - elif num_weights == num_labels: - class_weights = tf.gather(weights, indices=[i], axis=class_dim) - elif num_weights == 1: - class_weights = weights - else: - raise ValueError("num_weights (%d) and num_labels (%d) do not match" - % (num_weights, num_labels)) - - if metric_factory: - value_op, update_op = metric_factory( - labels=class_labels, - predictions=(class_hard_preds if requires_threshold else class_preds), - weights=class_weights, name=class_metric_name) - eval_metric_ops[class_metric_name] = (value_op, update_op) - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops - - -def _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=True): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops with uncalibrated output. - - The following graph_output keys are recognized: - uncalibrated_output: - the uncalibrated raw predictions between 0 and 1. Required. - output: - the calibrated predictions between 0 and 1. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - calibrated_metric_fn: metrics function with calibration and weight. - keep_weight: Bool indicating whether we keep weight. - """ - metric_scope = 'uncalibrated' if keep_weight else 'unweighted' - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. 
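
A hypothetical sketch of the multi-label case handled by `get_multi_binary_class_metric_fn` above: predictions and labels have shape `batch_size x n_classes`, and every metric is emitted once per class under a suffixed name. The class names and tensors are illustrative only:

```python
metric_fn = get_multi_binary_class_metric_fn(
    metrics=['rce', 'ctr', 'roc_auc'],
    classes=['fav', 'reply', 'retweet'],  # one binary task per column
    class_dim=1,                          # predictions are [batch_size, 3]
)

# graph_output['output'] has shape [batch_size, 3]; labels match that shape,
# while weights may be [batch_size] or [batch_size, 3]
eval_metric_ops = metric_fn({'output': preds}, labels, weights)
# yields entries such as 'rce_fav', 'roc_auc_reply', 'ctr_retweet', ...
```
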
- """ - with tf.variable_scope(metric_scope): - if 'uncalibrated_output' not in graph_output: - raise Exception("Missing uncalibrated_output in graph_output!") - un_calibrated_weights = weights if keep_weight else tf.ones_like(weights) - uncalibrated_output = { - 'output': graph_output['uncalibrated_output'], - 'threshold': graph_output.get('threshold', 0.5), - 'hard_output': graph_output.get('hard_output'), - **{k: v for k, v in graph_output.items() if k not in ['output', 'threshold', 'hard_output']} - } - - eval_metrics_ops = calibrated_metric_fn(uncalibrated_output, labels, un_calibrated_weights) - - renamed_metrics_ops = {f'{metric_scope}_{k}': v for k, v in eval_metrics_ops.items()} - return renamed_metrics_ops - - return get_eval_metric_ops - - -def get_multi_binary_class_uncalibrated_metric_fn( - metrics, classes=None, class_dim=1, keep_weight=True): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for concatenated binary classifications without calibration. - - Note: 'uncalibrated_output' is required key in graph_output. - - The main use case for this function is: - - 1) To calculated roc-auc for rare event. - Calibrated prediction score for rare events will be concentrated near zero. As a result, - the roc-auc can be seriously underestimated with current implementation in tf.metric.auc. - Since roc-auc is invariant against calibration, we can directly use uncalibrated score for roc-auc. - For more details, please refer to: go/roc-auc-invariance. - - 2) To set keep_weight=False and get unweighted and uncalibrated metrics. - This is useful to eval how the model is fitted to its actual training data, since - often time the model is trained without weight. - - Args: - metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - - classes (list of strings): - In case of multiple binary class models, the names for each class or label. - These are used to display metrics on tensorboard. - If these are not specified, the index in the class or label dimension is used, and you'll - get metrics on tensorboard named like: accuracy_0, accuracy_1, etc. - - class_dim (number): - Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes. - - keep_weight (bool): - Whether to keep weights for the metric. - """ - - calibrated_metric_fn = get_multi_binary_class_metric_fn( - metrics, classes=classes, class_dim=class_dim) - return _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=keep_weight) - - -def combine_metric_fns(*fn_list): - """ - Combine multiple metric functions. - For example, we can combine metrics function generated by - get_multi_binary_class_metric_fn and get_multi_binary_class_uncalibrated_metric_fn. - - Args: - *fn_list: Multiple metric functions to be combined - - Returns: - Combined metric function. 
- """ - def combined_metric_ops(*args, **kwargs): - eval_metric_ops = OrderedDict() - for fn in fn_list: - eval_metric_ops.update(fn(*args, **kwargs)) - return eval_metric_ops - return combined_metric_ops diff --git a/twml/twml/optimizers/__init__.py b/twml/twml/optimizers/__init__.py deleted file mode 100644 index eaa29883c..000000000 --- a/twml/twml/optimizers/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from twitter.deepbird.compat.v1.optimizers import ( - LazyAdamOptimizer, - optimize_loss, - OPTIMIZER_SUMMARIES) # noqa: F401 diff --git a/twml/twml/parsers.py b/twml/twml/parsers.py deleted file mode 100644 index eac60083a..000000000 --- a/twml/twml/parsers.py +++ /dev/null @@ -1,20 +0,0 @@ -''' -Contains implementations of functions to parse training and evaluation data. - -Modelers can use the functions in this module as the the train/eval_parse_fn of -the DataRecordTrainer constructor to customize how to parse their datasets. - -Modelers may also provide custom implementations of train/eval_parse_fn using these as reference. -''' - -from twitter.deepbird.io.legacy.parsers import ( - convert_to_supervised_input_receiver_fn, # noqa: F401 - get_continuous_parse_fn, # noqa: F401 - get_default_parse_fn, # noqa: F401 - get_features_as_tensor_dict, # noqa: F401 - get_labels_in_features_parse_fn, # noqa: F401 - get_serving_input_receiver_fn_feature_dict, # noqa: F401 - get_sparse_parse_fn, # noqa: F401 - get_sparse_serving_input_receiver_fn, # noqa: F401 - get_tensor_parse_fn, # noqa: F401 -) diff --git a/twml/twml/readers/__init__.py b/twml/twml/readers/__init__.py deleted file mode 100644 index 06a6d79f5..000000000 --- a/twml/twml/readers/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# pylint: disable=wildcard-import -""" This module contains data readers """ - -from .batch_prediction_request import BatchPredictionRequest # noqa: F401 -from .data_record import DataRecord, SPARSE_DATA_RECORD_FEATURE_FIELDS # noqa: F401 -from .hashed_batch_prediction_request import HashedBatchPredictionRequest # noqa: F401 -from .hashed_data_record import HashedDataRecord # noqa: F401 \ No newline at end of file diff --git a/twml/twml/readers/batch_prediction_request.py b/twml/twml/readers/batch_prediction_request.py deleted file mode 100644 index 512a8c514..000000000 --- a/twml/twml/readers/batch_prediction_request.py +++ /dev/null @@ -1,8 +0,0 @@ -# pylint: disable=invalid-name -""" -This module implements the reader for BatchPredictionRequest. -""" - -from twitter.deepbird.io.legacy.readers.batch_prediction_request import ( - BatchPredictionRequest # noqa: F401 -) diff --git a/twml/twml/readers/data_record.py b/twml/twml/readers/data_record.py deleted file mode 100644 index d1c377afd..000000000 --- a/twml/twml/readers/data_record.py +++ /dev/null @@ -1,15 +0,0 @@ -# pylint: disable=invalid-name -""" -This module includes facilities for manipulating data records. 
-""" - -from twitter.deepbird.io.legacy.readers.data_record import ( - _SPEC_TO_TF, # noqa: F401 - SPARSE_DATA_RECORD_FEATURE_FIELDS, # noqa: F401 - _FeaturesBase, # noqa: F401 - _Features, # noqa: F401 - _DiscreteFeatures, # noqa: F401 - _StringFeatures, # noqa: F401 - _BaseDataRecord, # noqa: F401 - DataRecord, # noqa: F401 -) diff --git a/twml/twml/readers/hashed_batch_prediction_request.py b/twml/twml/readers/hashed_batch_prediction_request.py deleted file mode 100644 index 5850c4497..000000000 --- a/twml/twml/readers/hashed_batch_prediction_request.py +++ /dev/null @@ -1,8 +0,0 @@ -# pylint: disable=invalid-name -""" -This module implements the reader for HashedBatchPredictionRequest. -""" - -from twitter.deepbird.io.legacy.readers.hashed_batch_prediction_request import ( - HashedBatchPredictionRequest # noqa: F401 -) diff --git a/twml/twml/readers/hashed_data_record.py b/twml/twml/readers/hashed_data_record.py deleted file mode 100644 index 1ff9ce816..000000000 --- a/twml/twml/readers/hashed_data_record.py +++ /dev/null @@ -1,12 +0,0 @@ -# checkstyle: noqa -# pylint: disable=invalid-name -""" -This module includes facilities for manipulating hashed data records. -""" - -from twitter.deepbird.io.legacy.readers.hashed_data_record import ( - _HASHED_FIELDS, - _FEATURE_NAMES, - _FEATURE_TYPES, - HashedDataRecord, -) diff --git a/twml/twml/saved_model_cli/__init__.py b/twml/twml/saved_model_cli/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/twml/twml/saved_model_cli/__main__.py b/twml/twml/saved_model_cli/__main__.py deleted file mode 100644 index ad5326431..000000000 --- a/twml/twml/saved_model_cli/__main__.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -This module is responsible for running saved_model_cli. -""" -import sys - -from tensorflow.python.tools import saved_model_cli - -if __name__ == '__main__': - sys.exit(saved_model_cli.main()) diff --git a/twml/twml/summary/__init__.py b/twml/twml/summary/__init__.py deleted file mode 100644 index 284d7cf3f..000000000 --- a/twml/twml/summary/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from tensorflow.python.ops.summary_ops_v2 import flush # noqa: F401 - -""" -NOTE: Using `from tensorflow.python.ops.summary_ops_v2 import flush` in the code works. -This stub exists because it was easier to refactor code because twml is widely used. -""" diff --git a/twml/twml/tensorboard/__init__.py b/twml/twml/tensorboard/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/twml/twml/tensorboard/__main__.py b/twml/twml/tensorboard/__main__.py deleted file mode 100644 index c426060d1..000000000 --- a/twml/twml/tensorboard/__main__.py +++ /dev/null @@ -1,16 +0,0 @@ -""" -This module is responsible for running tensorboard. 
-""" -import logging -import re -import sys - -from tensorboard.main import run_main - - -if __name__ == '__main__': - # Tensorboard relies on werkzeug for its HTTP server which logs at info level - # by default - logging.getLogger('werkzeug').setLevel(logging.WARNING) - sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) - sys.exit(run_main()) diff --git a/twml/twml/tensorio.py b/twml/twml/tensorio.py deleted file mode 100644 index bc551ac56..000000000 --- a/twml/twml/tensorio.py +++ /dev/null @@ -1,161 +0,0 @@ -# pylint: disable=missing-docstring, bare-except, pointless-statement, -# pointless-string-statement, redundant-unittest-assert, no-else-return, -# no-member, old-style-class, dangerous-default-value, protected-access, -# too-few-public-methods - -import os - -import numpy as np -import yaml - - -""" -Utility to load tensors serialized by Deepbird V1. - -Note that Deepbird V1 serialize tensor names as \"weight\".\'1\'. -For user-friendliness, the quotes are removed from the tensor names. -""" - - -# helper class used to assist hierarchical key access by remembering intermediate keys. -class _KeyRecorder(object): - def __init__(self, tensorio, keys=[]): - self.tensorio = tensorio - self.keys = keys - - def __getitem__(self, k): - new_keys = self.keys + [str(k)] - prefix = ".".join(new_keys) - - key_list = self.tensorio.list_tensors() - - # if we have a complete key, load the tensor. - if prefix in key_list: - return self.tensorio._load(prefix) - - # we don't have a complete key yet, but at least one tensor should start with this prefix. - for k_value in key_list: - if k_value.startswith(prefix): - return _KeyRecorder(self.tensorio, new_keys) - - # if no key starts with the prefix, this _key_recorder is not valid. - raise ValueError("Key not found: " + prefix) - - -# convert tensorio tensor type to numpy data type. -# also returns element size in bytes. -def _get_data_type(data_type): - if data_type == 'Double': - return (np.float64, 8) - - if data_type == 'Float': - return (np.float32, 4) - - if data_type == 'Int': - return (np.int32, 4) - - if data_type == 'Long': - return (np.int64, 8) - - if data_type == 'Byte': - return (np.int8, 1) - - raise ValueError('Unexpected tensorio data type: ' + data_type) - - -class TensorIO(object): - """ - Construct a TensorIO class. - tensorio_path: a directory containing tensors serialized using tensorio. tar file not supported. - mmap_tensor: - By default, loaded tensors use mmap storage. - Set this to false to not use mmap. Useful when loading multiple tensors. - """ - - def __init__(self, tensorio_path, mmap_tensor=True): - self._tensorio_path = tensorio_path - self._mmap_tensor = mmap_tensor - - # Make sure we can locate spec.yaml. - yaml_file = os.path.join(tensorio_path, 'spec.yaml') - if not os.path.exists(yaml_file): - raise ValueError('Invalid tensorio path: no spec.yaml found.') - - # load spec.yaml. - with open(yaml_file, 'r') as file_open: - # Note that tensor names in the yaml are like this: \"weight\".\'1\' - # For user-friendliness, we remove the quotes. - _spec = yaml.safe_load(file_open) - self._spec = {k.replace("'", '').replace('"', ''): v for (k, v) in _spec.items()} - - def list_tensors(self): - """ - Returns a list of tensors saved in the given path. - """ - return self._spec.keys() - - def _load_tensor(self, name): - """ - Load Tensor with the given name. - Raise value error if the named tensor is not found. - Returns a numpy array if the named tensor is found. 
- """ - tensor_info = self._spec[name] - if tensor_info['type'] != 'tensor': - raise ValueError('Trying to load a tensor of unknown type: ' + tensor_info['type']) - - filename = os.path.join(self._tensorio_path, tensor_info['filename']) - (data_type, element_size) = _get_data_type(tensor_info['tensorType']) - - np_array = np.memmap( - filename, - dtype=data_type, - mode='r', - # -1 because lua offset is 1 based. - offset=(tensor_info['offset'] - 1) * element_size, - shape=tuple(tensor_info['size']), - order='C', - ) - - return np_array if self._mmap_tensor else np_array[:].copy() - - def _load_nontensor_data(self, name): - """ - Load non-tensor data with the given name. - Returns a python string. - """ - tensor_info = self._spec[name] - return tensor_info['data'] - - def _load(self, name): - """ - Load data serialized under the given name, it could be a tensor or regular data. - """ - if name not in self._spec: - raise ValueError('The specified key {} is not found in {}'.format(name, self._tensorio_path)) - - data_type = self._spec[name]['type'] - if data_type == 'tensor': - return self._load_tensor(name) - else: - return self._load_nontensor_data(name) - - def load_all(self): - """ - Load all tensors stored in the tensorio directory. - Returns a dictionary from tensor name to numpy arrays. - """ - return {k: self._load(k) for k in self._spec} - - ########################################### - # The below are utilities for convenience # - ########################################### - def __getitem__(self, k): - """ - Shorthand for _load_tensor, but also supports hierarchical access like: tensorio['a']['b']['1'] - """ - if k in self._spec: - # We have a full tensor name, directly load it. - return self._load_tensor(k) - else: - return _KeyRecorder(self)[k] diff --git a/twml/twml/tracking/__init__.py b/twml/twml/tracking/__init__.py deleted file mode 100644 index 008a59f70..000000000 --- a/twml/twml/tracking/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" -This module contains the ExperimentTracker class. -""" - -from .experiment_tracker import ExperimentTracker # noqa: F401 diff --git a/twml/twml/tracking/experiment_tracker.py b/twml/twml/tracking/experiment_tracker.py deleted file mode 100644 index 4f275ba4b..000000000 --- a/twml/twml/tracking/experiment_tracker.py +++ /dev/null @@ -1,543 +0,0 @@ -""" -This module contains the experiment tracker for tracking training in ML Metastore -""" -from contextlib import contextmanager -from datetime import datetime -import getpass -import hashlib -import os -import re -import sys -import time - -from absl import logging -import tensorflow.compat.v1 as tf -from twml.hooks import MetricsUpdateHook - - -try: - from urllib import quote as encode_url -except ImportError: - from urllib.parse import quote as encode_url - - -try: - # ML Metastore packages might not be available on GCP. - # If they are not found, tracking is disabled - import requests - from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient - from com.twitter.mlmetastore.modelrepo.core.path import ( - check_valid_id, get_components_from_id, generate_id) - from com.twitter.mlmetastore.modelrepo.core import ( - DeepbirdRun, Experiment, FeatureConfig, FeatureConfigFeature, Model, ProgressReport, Project, StatusUpdate) -except ImportError: - ModelRepoClient = None - - -class ExperimentTracker(object): - """ - A tracker that records twml runs in ML Metastore. - """ - - def __init__(self, params, run_config, save_dir): - """ - - Args: - params (python dict): - The trainer params. 
ExperimentTracker uses `params.experiment_tracking_path` (String) and - `params.disable_experiment_tracking`. - If `experiment_tracking_path` is set to None, the tracker tries to guess a path with - save_dir. - If `disable_experiment_tracking` is True, the tracker is disabled. - run_config (tf.estimator.RunConfig): - The run config used by the estimator. - save_dir (str): - save_dir of the trainer - """ - if isinstance(params, dict): - self._params = params - else: - # preserving backward compatibility for people still using HParams - logging.warning("Please stop using HParams and use python dicts. HParams are removed in TF 2") - self._params = dict((k, v) for k, v in params.values().items() if v != 'null') - self._run_config = run_config - self._graceful_shutdown_port = self._params.get('health_port') - - self.tracking_path = self._params.get('experiment_tracking_path') - is_tracking_path_too_long = self.tracking_path is not None and len(self.tracking_path) > 256 - - if is_tracking_path_too_long: - raise ValueError("Experiment Tracking Path longer than 256 characters") - - self.disabled = ( - self._params.get('disable_experiment_tracking', False) or - not self._is_env_eligible_for_tracking() or - ModelRepoClient is None - ) - - self._is_hogwild = bool(os.environ.get('TWML_HOGWILD_PORTS')) - - self._is_distributed = bool(os.environ.get('TF_CONFIG')) - - self._client = None if self.disabled else ModelRepoClient() - - run_name_from_environ = self.run_name_from_environ() - run_name_can_be_inferred = ( - self.tracking_path is not None or run_name_from_environ is not None) - - # Turn the flags off as needed in hogwild / distributed - if self._is_hogwild or self._is_distributed: - self._env_eligible_for_recording_experiment = ( - self._run_config.task_type == "evaluator") - if run_name_can_be_inferred: - self._env_eligible_for_recording_export_metadata = ( - self._run_config.task_type == "chief") - else: - logging.info( - 'experiment_tracking_path is not set and can not be inferred. ' - 'Recording export metadata is disabled because the chief node and eval node ' - 'are setting different experiment tracking paths.') - self._env_eligible_for_recording_export_metadata = False - else: - # Defaults to True - self._env_eligible_for_recording_experiment = True - self._env_eligible_for_recording_export_metadata = True - - if not self.disabled: - # Sanitize passed in experiment tracking paths. e.g. own:proJ:exp:Run.Name - # -> own:proj:exp:Run_Name - if self.tracking_path: - try: - check_valid_id(self.tracking_path) - except ValueError as err: - logging.error(f'Invalid experiment tracking path provided. Sanitizing: {self.tracking_path}\nError: {err}') - self.tracking_path = generate_id( - owner=self.path['owner'], - project_name=self.path['project_name'], - experiment_name=self.path['experiment_name'], - run_name=self.path['run_name'] - ) - logging.error(f'Generated sanitized experiment tracking path: {self.tracking_path}') - else: - logging.info( - 'No experiment_tracking_path set. Experiment Tracker will try to guess a path') - self.tracking_path = self.guess_path(save_dir, run_name_from_environ) - logging.info('Guessed path: %s', self.tracking_path) - - # additional check to see if generated path is valid - try: - check_valid_id(self.tracking_path) - except ValueError as err: - logging.error( - 'Could not generate valid experiment tracking path. Disabling tracking. 
' + - 'Error:\n{}'.format(err) - ) - self.disabled = True - - self.project_id = None if self.disabled else '{}:{}'.format( - self.path['owner'], self.path['project_name']) - self.base_run_id = None if self.disabled else self.tracking_path - self._current_run_name_suffix = None - - self._current_tracker_hook = None - - if self.disabled: - logging.info('Experiment Tracker is disabled') - else: - logging.info('Experiment Tracker initialized with base run id: %s', self.base_run_id) - - @contextmanager - def track_experiment(self, eval_hooks, get_estimator_spec_fn, name=None): - """ - A context manager for tracking experiment. It should wrap the training loop. - An experiment tracker eval hook is appended to eval_hooks to collect metrics. - - Args: - eval_hooks (list): - The list of eval_hooks to be used. When it's not None, and does not contain any , - MetricsUpdateHook an experiment tracker eval hook is appended to it. When it contains - any MetricsUpdateHook, this tracker is disabled to avoid conflict with legacy Model Repo - tracker (`TrackRun`). - get_estimator_spec_fn (func): - A function to get the current EstimatorSpec of the trainer, used by the eval hook. - name (str); - Name of this training or evaluation. Used as a suffix of the run_id. - - Returns: - The tracker's eval hook which is appended to eval_hooks. - """ - - # disable this tracker if legacy TrackRun hook is present - # TODO: remove this once we completely deprecate the old TrackRun interface - if eval_hooks is not None: - self.disabled = self.disabled or any(isinstance(x, MetricsUpdateHook) for x in eval_hooks) - - logging.info('Is environment eligible for recording experiment: %s', - self._env_eligible_for_recording_experiment) - - if self._env_eligible_for_recording_experiment and self._graceful_shutdown_port: - requests.post('http://localhost:{}/track_training_start'.format( - self._graceful_shutdown_port - )) - - if self.disabled or eval_hooks is None: - yield None - else: - assert self._current_tracker_hook is None, 'experiment tracking has been started already' - - if name is not None: - self._current_run_name_suffix = '_' + name - - logging.info('Starting experiment tracking. Path: %s', self._current_run_id) - logging.info('Is environment eligible for recording export metadata: %s', - self._env_eligible_for_recording_export_metadata) - logging.info('This run will be available at: http://go/mldash/experiments/%s', - encode_url(self.experiment_id)) - - try: - self._record_run() - self._add_run_status(StatusUpdate(self._current_run_id, status='RUNNING')) - self._register_for_graceful_shutdown() - - self._current_tracker_hook = self.create_eval_hook(get_estimator_spec_fn) - except Exception as err: - logging.error( - 'Failed to record run. This experiment will not be tracked. Error: %s', str(err)) - self._current_tracker_hook = None - - if self._current_tracker_hook is None: - yield None - else: - try: - eval_hooks.append(self._current_tracker_hook) - yield self._current_tracker_hook - except Exception as err: - self._add_run_status( - StatusUpdate(self._current_run_id, status='FAILED', description=str(err))) - self._deregister_for_graceful_shutdown() - self._current_tracker_hook = None - self._current_run_name_suffix = None - logging.error('Experiment tracking done. 
Experiment failed.') - raise - - try: - if self._current_tracker_hook.metric_values: - self._record_update(self._current_tracker_hook.metric_values) - self._add_run_status(StatusUpdate(self._current_run_id, status='SUCCESS')) - logging.info('Experiment tracking done. Experiment succeeded.') - except Exception as err: - logging.error( - 'Failed to update mark run as successful. Error: %s', str(err)) - finally: - self._deregister_for_graceful_shutdown() - self._current_tracker_hook = None - self._current_run_name_suffix = None - - def create_eval_hook(self, get_estimator_spec_fn): - """ - Create an eval_hook to track eval metrics - - Args: - get_estimator_spec_fn (func): - A function that returns the current EstimatorSpec of the trainer. - """ - return MetricsUpdateHook( - get_estimator_spec_fn=get_estimator_spec_fn, - add_metrics_fn=self._record_update) - - def register_model(self, export_path): - """ - Record the exported model. - - Args: - export_path (str): - The path to the exported model. - """ - if self.disabled: - return None - - try: - logging.info('Model is exported to %s. Computing hash of the model.', export_path) - model_hash = self.compute_model_hash(export_path) - logging.info('Model hash: %s. Registering it in ML Metastore.', model_hash) - self._client.register_model(Model(model_hash, self.path['owner'], self.base_run_id)) - except Exception as err: - logging.error('Failed to register model. Error: %s', str(err)) - - def export_feature_spec(self, feature_spec_dict): - """ - Export feature spec to ML Metastore (go/ml-metastore). - - Please note that the feature list in FeatureConfig only keeps the list of feature hash ids due - to the 1mb upper limit for values in manhattan, and more specific information (feature type, - feature name) for each feature config feature is stored separately in FeatureConfigFeature dataset. - - Args: - feature_spec_dict (dict): A dictionary obtained from FeatureConfig.get_feature_spec() - """ - if self.disabled or not self._env_eligible_for_recording_export_metadata: - return None - - try: - logging.info('Exporting feature spec to ML Metastore.') - feature_list = feature_spec_dict['features'] - label_list = feature_spec_dict['labels'] - weight_list = feature_spec_dict['weight'] - self._client.add_feature_config(FeatureConfig(self._current_run_id, list(feature_list.keys()), - list(label_list.keys()), list(weight_list.keys()))) - - feature_config_features = [ - FeatureConfigFeature( - hash_id=_feature_hash_id, - feature_name=_feature['featureName'], - feature_type=_feature['featureType'] - ) - for _feature_hash_id, _feature in zip(feature_list.keys(), feature_list.values()) - ] - self._client.add_feature_config_features(list(feature_list.keys()), feature_config_features) - - feature_config_labels = [ - FeatureConfigFeature( - hash_id=_label_hash_id, - feature_name=_label['featureName'] - ) - for _label_hash_id, _label in zip(label_list.keys(), label_list.values()) - ] - self._client.add_feature_config_features(list(label_list.keys()), feature_config_labels) - - feature_config_weights = [ - FeatureConfigFeature( - hash_id=_weight_hash_id, - feature_name=_weight['featureName'], - feature_type=_weight['featureType'] - ) - for _weight_hash_id, _weight in zip(weight_list.keys(), weight_list.values()) - ] - self._client.add_feature_config_features(list(weight_list.keys()), feature_config_weights) - - except Exception as err: - logging.error('Failed to export feature spec. 
Error: %s', str(err)) - - @property - def path(self): - if self.disabled: - return None - return get_components_from_id(self.tracking_path, ensure_valid_id=False) - - @property - def experiment_id(self): - if self.disabled: - return None - return '%s:%s:%s' % (self.path['owner'], self.path['project_name'], - self.path['experiment_name']) - - @property - def _current_run_name(self): - """ - Return the current run name. - """ - if self._current_run_name_suffix is not None: - return self.path['run_name'] + self._current_run_name_suffix - else: - return self.path['run_name'] - - @property - def _current_run_id(self): - """ - Return the current run id. - """ - if self._current_run_name_suffix is not None: - return self.base_run_id + self._current_run_name_suffix - else: - return self.base_run_id - - def get_run_status(self) -> str: - if not self.disabled: - return self._client.get_latest_dbv2_status(self._current_run_id) - - def _add_run_status(self, status): - """ - Add run status with underlying client. - - Args: - status (StatusUpdate): - The status update to add. - """ - if not self.disabled and self._env_eligible_for_recording_experiment: - self._client.add_run_status(status) - - def _record_run(self): - """ - Record the run in ML Metastore. - """ - if self.disabled or not self._env_eligible_for_recording_experiment: - return None - - if not self._client.project_exists(self.project_id): - self._client.add_project(Project(self.path['project_name'], self.path['owner'])) - time.sleep(1) - - if not self._client.experiment_exists(self.experiment_id): - self._client.add_experiment(Experiment( - self.path['experiment_name'], self.path['owner'], self.project_id, '')) - time.sleep(1) - - run = DeepbirdRun(self.experiment_id, self._current_run_name, '', - {'raw_command': ' '.join(sys.argv)}, self._params) - self._client.add_deepbird_run(run, force=True) - time.sleep(1) - - def _record_update(self, metrics): - """ - Record metrics update in ML Metastore. - - Args: - metrics (dict): - The dict of the metrics and their values. - """ - - if self.disabled or not self._env_eligible_for_recording_experiment: - return None - - reported_metrics = {} - for k, v in metrics.items(): - - if hasattr(v, 'item'): - reported_metrics[k] = v.item() if v.size == 1 else str(v.tolist()) - else: - logging.warning("Ignoring %s because the value (%s) is not valid" % (k, str(v))) - - report = ProgressReport(self._current_run_id, reported_metrics) - - try: - self._client.add_progress_report(report) - except Exception as err: - logging.error('Failed to record metrics in ML Metastore. Error: {}'.format(err)) - logging.error('Run ID: {}'.format(self._current_run_id)) - logging.error('Progress Report: {}'.format(report.to_json_string())) - - def _register_for_graceful_shutdown(self): - """ - Register the tracker with the health server, enabling graceful shutdown. - - Returns: - (Response) health server response - """ - if self._graceful_shutdown_port and not self.disabled and self._env_eligible_for_recording_experiment: - return requests.post('http://localhost:{}/register_id/{}'.format( - self._graceful_shutdown_port, - self._current_run_id - )) - - def _deregister_for_graceful_shutdown(self): - """ - Deregister the tracker with the health server, disabling graceful shutdown. 
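For reference, a small hedged illustration of how the tracking identifiers used by the properties above compose. The `owner:project:experiment:run` layout follows the sanitization example earlier in this class; the concrete values here are hypothetical::

  tracking_path = 'jdoe:default:timelines:run_2019_01_15'  # hypothetical base run id
  owner, project_name, experiment_name, run_name = tracking_path.split(':')

  project_id = '{}:{}'.format(owner, project_name)                   # 'jdoe:default'
  experiment_id = '{}:{}:{}'.format(owner, project_name, experiment_name)
  current_run_id = tracking_path + '_eval'                           # with an optional name suffix
  print(project_id, experiment_id, current_run_id)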
- - Returns: - (Response) health server response - """ - if self._graceful_shutdown_port and not self.disabled and self._env_eligible_for_recording_experiment: - return requests.post('http://localhost:{}/deregister_id/{}'.format( - self._graceful_shutdown_port, - self._current_run_id - )) - - def _is_env_eligible_for_tracking(self): - """ - Determine if experiment tracking should run in the env. - """ - is_unit_test = ( - os.environ.get('PYTEST_CURRENT_TEST') is not None and - os.environ.get('TEST_EXP_TRACKER') is None - ) - - is_running_on_ci = ( - getpass.getuser() == 'scoot-service' and - os.environ.get('TEST_EXP_TRACKER') is None - ) - - return ( - not is_unit_test and - not is_running_on_ci - ) - - @classmethod - def run_name_from_environ(cls): - """ - Create run id from environment if possible. - """ - job_name = os.environ.get("TWML_JOB_NAME") - job_launch_time = os.environ.get("TWML_JOB_LAUNCH_TIME") - - if not job_name or not job_launch_time: - return None - - try: - # job_launch_time should be in isoformat - # python2 doesnt support datetime.fromisoformat, so use hardcoded format string. - job_launch_time_formatted = datetime.strptime(job_launch_time, - "%Y-%m-%dT%H:%M:%S.%f") - except ValueError: - # Fallback in case aurora config is generating datetime in a different format. - job_launch_time_formatted = (job_launch_time - .replace("-", "_").replace("T", "_") - .replace(":", "_").replace(".", "_")) - - return '{}_{}'.format( - job_name, job_launch_time_formatted.strftime('%m_%d_%Y_%I_%M_%p')) - - @classmethod - def guess_path(cls, save_dir, run_name=None): - """ - Guess an experiment tracking path based on save_dir. - - Returns: - (str) guessed path - """ - if not run_name: - run_name = 'Unnamed_{}'.format(datetime.now().strftime('%m_%d_%Y_%I_%M_%p')) - - if save_dir.startswith('hdfs://'): - path_match = re.search(r'/user/([a-z0-9\-_]+)/([a-z0-9\-_]+)', save_dir) - - if path_match: - groups = path_match.groups() - user = groups[0] - project_name = groups[1] - - return generate_id(user, 'default', project_name, run_name) - - user = getpass.getuser() - project_name = re.sub(r'^[a-z0-9\-_]', os.path.basename(save_dir), '') - if not project_name: - project_name = 'unnamed' - - return generate_id(user, 'default', project_name, run_name) - - @classmethod - def compute_model_hash(cls, export_path): - """ - Computes the hash of an exported model. This is a gfile version of - twitter.mlmetastore.common.versioning.compute_hash. The two functions should generate - the same hash when given the same model. - - Args: - export_path (str): - The path to the exported model. - - Returns: - (str) hash of the exported model - """ - paths = [] - for path, subdirs, files in tf.io.gfile.walk(export_path): - for name in sorted(files): - paths.append(os.path.join(path, name)) - - paths.sort() - hash_object = hashlib.new('sha1') - - for path in paths: - with tf.io.gfile.GFile(path, "rb") as file: - hash_object.update(file.read()) - - return hash_object.hexdigest() diff --git a/twml/twml/trainers/__init__.py b/twml/twml/trainers/__init__.py deleted file mode 100644 index e6664d9a6..000000000 --- a/twml/twml/trainers/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# pylint: disable=wildcard-import -""" -This module contains the Trainer and DataRecordTrainer classes. -Trainers wrap a -`tf.estimator.Estimator -`_. 
-""" - -from .trainer import Trainer # noqa: F401 -from .data_record_trainer import DataRecordTrainer # noqa: F401 diff --git a/twml/twml/trainers/data_record_trainer.py b/twml/twml/trainers/data_record_trainer.py deleted file mode 100644 index 76dd16f80..000000000 --- a/twml/twml/trainers/data_record_trainer.py +++ /dev/null @@ -1,821 +0,0 @@ -# pylint: disable=arguments-differ, invalid-name -""" -This module contains the ``DataRecordTrainer``. -Unlike the parent ``Trainer`` class, the ``DataRecordTrainer`` -is used specifically for processing data records. -It abstracts away a lot of the intricacies of working with DataRecords. -`DataRecord `_ is the main piping format for data samples. -The `DataRecordTrainer` assumes training data and production responses and requests -to be organized as the `Thrift prediction service API - -A ``DataRecord`` is a Thrift struct that defines how to encode the data: - -:: - - struct DataRecord { - 1: optional set binaryFeatures; // stores BINARY features - 2: optional map continuousFeatures; // stores CONTINUOUS features - 3: optional map discreteFeatures; // stores DISCRETE features - 4: optional map stringFeatures; // stores STRING features - 5: optional map> sparseBinaryFeatures; // stores sparse BINARY features - 6: optional map> sparseContinuousFeatures; // sparse CONTINUOUS feature - 7: optional map blobFeatures; // stores features as BLOBs (binary large objects) - 8: optional map tensors; // stores TENSOR features - 9: optional map sparseTensors; // stores SPARSE_TENSOR features - } - - -A significant portion of Twitter data is hydrated -and then temporarily stored on HDFS as DataRecords. -The files are compressed (.gz or .lzo) partitions of data records. -These form supervised datasets. Each sample captures the relationship -between input and output (cause and effect). -To create your own dataset, please see https://github.com/twitter/elephant-bird. - -The default ``DataRecordTrainer.[train,evaluate,learn]()`` reads these datarecords. -The data is a read from multiple ``part-*.[compression]`` files. -The default behavior of ``DataRecordTrainer`` is to read sparse features from ``DataRecords``. -This is a legacy default piping format at Twitter. -The ``DataRecordTrainer`` is flexible enough for research and yet simple enough -for a new beginner ML practioner. - -By means of the feature string to key hashing function, -the ``[train,eval]_feature_config`` constructor arguments -control which features can be used as sample labels, sample weights, -or sample features. -Samples ids, and feature keys, feature values and feature weights -can be skipped, included, excluded or used as labels, weights, or features. -This allows you to easily define and control sparse distributions of -named features. - -Yet sparse data is difficult to work with. We are currently working to -optimize the sparse operations due to inefficiencies in the gradient descent -and parameter update processes. There are efforts underway -to minimize the footprint of sparse data as it is inefficient to process. -CPUs and GPUs much prefer dense tensor data. 
-""" - -import datetime - -import tensorflow.compat.v1 as tf -from twitter.deepbird.io.dal import dal_to_hdfs_path, is_dal_path -import twml -from twml.trainers import Trainer -from twml.contrib.feature_importances.feature_importances import ( - compute_feature_importances, - TREE, - write_feature_importances_to_hdfs, - write_feature_importances_to_ml_dash) -from absl import logging - - -class DataRecordTrainer(Trainer): # pylint: disable=abstract-method - """ - The ``DataRecordTrainer`` implementation is intended to satisfy the most common use cases - at Twitter where only the build_graph methods needs to be overridden. - For this reason, ``Trainer.[train,eval]_input_fn`` methods - assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. - - For use-cases that differ from this common Twitter use-case, - further Trainer methods can be overridden. - If that still doesn't provide enough flexibility, the user can always - use the tf.estimator.Esimator or tf.session.run directly. - """ - - def __init__( - self, name, params, - build_graph_fn, - feature_config=None, - **kwargs): - """ - The DataRecordTrainer constructor builds a - ``tf.estimator.Estimator`` and stores it in self.estimator. - For this reason, DataRecordTrainer accepts the same Estimator constructor arguments. - It also accepts additional arguments to facilitate metric evaluation and multi-phase training - (init_from_dir, init_map). - - Args: - parent arguments: - See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation - for a full list of arguments accepted by the parent class. - name, params, build_graph_fn (and other parent class args): - see documentation for twml.Trainer doc. - feature_config: - An object of type FeatureConfig describing what features to decode. - Defaults to None. But it is needed in the following cases: - - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` - - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. - - **kwargs: - further kwargs can be specified and passed to the Estimator constructor. - """ - - # NOTE: DO NOT MODIFY `params` BEFORE THIS CALL. 
- super(DataRecordTrainer, self).__init__( - name=name, params=params, build_graph_fn=build_graph_fn, **kwargs) - - self._feature_config = feature_config - - # date range parameters common to both training and evaluation data: - hour_resolution = self.params.get("hour_resolution", 1) - data_threads = self.params.get("data_threads", 4) - datetime_format = self.params.get("datetime_format", "%Y/%m/%d") - - # retrieve the desired training dataset files - self._train_files = self.build_files_list( - files_list_path=self.params.get("train_files_list", None), - data_dir=self.params.get("train_data_dir", None), - start_datetime=self.params.get("train_start_datetime", None), - end_datetime=self.params.get("train_end_datetime", None), - datetime_format=datetime_format, data_threads=data_threads, - hour_resolution=hour_resolution, maybe_save=self.is_chief(), - overwrite=self.params.get("train_overwrite_files_list", False), - ) - - # retrieve the desired evaluation dataset files - eval_name = self.params.get("eval_name", None) - - if eval_name == "train": - self._eval_files = self._train_files - else: - self._eval_files = self.build_files_list( - files_list_path=self.params.get("eval_files_list", None), - data_dir=self.params.get("eval_data_dir", None), - start_datetime=self.params.get("eval_start_datetime", None), - end_datetime=self.params.get("eval_end_datetime", None), - datetime_format=datetime_format, data_threads=data_threads, - hour_resolution=hour_resolution, maybe_save=self.is_chief(), - overwrite=self.params.get("eval_overwrite_files_list", False), - ) - - if not self.params.get("allow_train_eval_overlap"): - # if there is overlap between train and eval, error out! - if self._train_files and self._eval_files: - overlap_files = set(self._train_files) & set(self._eval_files) - else: - overlap_files = set() - if overlap_files: - raise ValueError("There is an overlap between train and eval files:\n %s" % - (overlap_files)) - - @staticmethod - def build_hdfs_files_list( - files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite): - if files_list_path: - files_list_path = twml.util.preprocess_path(files_list_path) - - if isinstance(start_datetime, datetime.datetime): - start_datetime = start_datetime.strftime(datetime_format) - if isinstance(end_datetime, datetime.datetime): - end_datetime = end_datetime.strftime(datetime_format) - - list_files_by_datetime_args = { - "base_path": data_dir, - "start_datetime": start_datetime, - "end_datetime": end_datetime, - "datetime_prefix_format": datetime_format, - "extension": "lzo", - "parallelism": data_threads, - "hour_resolution": hour_resolution, - "sort": True, - } - - # no cache of data file paths, just get the list by scraping the directory - if not files_list_path or not tf.io.gfile.exists(files_list_path): - # twml.util.list_files_by_datetime returns None if data_dir is None. - # twml.util.list_files_by_datetime passes through data_dir if data_dir is a list - files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args) - else: - # the cached data file paths file exists. - files_info = twml.util.read_file(files_list_path, decode="json") - # use the cached list if data params match current params, - # or if current params are None - # Not including None checks for datetime_format and hour_resolution, - # since those are shared between eval and training. 
- if (all(param is None for param in [data_dir, start_datetime, end_datetime]) or - (files_info["data_dir"] == data_dir and - files_info["start_datetime"] == start_datetime and - files_info["end_datetime"] == end_datetime and - files_info["datetime_format"] == datetime_format and - files_info["hour_resolution"] == hour_resolution)): - files_list = files_info["files"] - elif overwrite: - # current params are not none and don't match saved params - # `overwrite` indicates we should thus update the list - files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args) - else: - # dont update the cached list - raise ValueError("Information in files_list is inconsistent with provided args.\n" - "Did you intend to overwrite files_list using " - "--train.overwrite_files_list or --eval.overwrite_files_list?\n" - "If you instead want to use the paths in files_list, ensure that " - "data_dir, start_datetime, and end_datetime are None.") - - if maybe_save and files_list_path and (overwrite or not tf.io.gfile.exists(files_list_path)): - save_dict = {} - save_dict["files"] = files_list - save_dict["data_dir"] = data_dir - save_dict["start_datetime"] = start_datetime - save_dict["end_datetime"] = end_datetime - save_dict["datetime_format"] = datetime_format - save_dict["hour_resolution"] = hour_resolution - twml.util.write_file(files_list_path, save_dict, encode="json") - - return files_list - - @staticmethod - def build_files_list(files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite): - ''' - When specifying DAL datasets, only data_dir, start_dateime, and end_datetime - should be given with the format: - - dal://{cluster}/{role}/{dataset_name}/{env} - - ''' - if not data_dir or not is_dal_path(data_dir): - logging.warn(f"Please consider specifying a dal:// dataset rather than passing a physical hdfs path.") - return DataRecordTrainer.build_hdfs_files_list( - files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite) - - del datetime_format - del data_threads - del hour_resolution - del maybe_save - del overwrite - - return dal_to_hdfs_path( - path=data_dir, - start_datetime=start_datetime, - end_datetime=end_datetime, - ) - - @property - def train_files(self): - return self._train_files - - @property - def eval_files(self): - return self._eval_files - - @staticmethod - def add_parser_arguments(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `Trainer code <_modules/twml/trainers/trainer.html#Trainer.add_parser_arguments>`_ - and `DataRecordTrainer code - <_modules/twml/trainers/trainer.html#DataRecordTrainer.add_parser_arguments>`_ - for a list and description of all cmd-line arguments. - - Args: - learning_rate_decay: - Defaults to False. When True, parses learning rate decay arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. 
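For clarity, this is roughly what the cached files-list JSON written by `build_hdfs_files_list` above looks like (shown as the equivalent Python dict; the keys match `save_dict`, while the paths and dates are hypothetical)::

  files_list = {
      "files": [
          "hdfs://default/user/jdoe/my_dataset/2019/01/15/part-00000.lzo",
          "hdfs://default/user/jdoe/my_dataset/2019/01/15/part-00001.lzo",
      ],
      "data_dir": "hdfs://default/user/jdoe/my_dataset",
      "start_datetime": "2019/01/15",
      "end_datetime": "2019/01/16",
      "datetime_format": "%Y/%m/%d",
      "hour_resolution": 1,
  }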
- """ - parser = super(DataRecordTrainer, DataRecordTrainer).add_parser_arguments() - parser.add_argument( - "--train.files_list", "--train_files_list", type=str, default=None, - dest="train_files_list", - help="Path for a json file storing information on training data.\n" - "Specifically, the file at files_list should contain the dataset parameters " - "for constructing the list of data files, and the list of data file paths.\n" - "If the json file does not exist, other args are used to construct the " - "training files list, and that list will be saved to the indicated json file.\n" - "If the json file does exist, and current args are consistent with " - "saved args, or are all None, then the saved files list will be used.\n" - "If current args are not consistent with the saved args, then error out " - "if train_overwrite_files_list==False, else overwrite files_list with " - "a newly constructed list.") - parser.add_argument( - "--train.overwrite_files_list", "--train_overwrite_files_list", action="store_true", default=False, - dest="train_overwrite_files_list", - help="When the --train.files_list param is used, indicates whether to " - "overwrite the existing --train.files_list when there are differences " - "between the current and saved dataset args. Default (False) is to " - "error out if files_list exists and differs from current params.") - parser.add_argument( - "--train.data_dir", "--train_data_dir", type=str, default=None, - dest="train_data_dir", - help="Path to the training data directory." - "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " - "and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--train.start_date", "--train_start_datetime", - type=str, default=None, - dest="train_start_datetime", - help="Starting date for training inside the train data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--train.end_date", "--train_end_datetime", type=str, default=None, - dest="train_end_datetime", - help="Ending date for training inside the train data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--eval.files_list", "--eval_files_list", type=str, default=None, - dest="eval_files_list", - help="Path for a json file storing information on evaluation data.\n" - "Specifically, the file at files_list should contain the dataset parameters " - "for constructing the list of data files, and the list of data file paths.\n" - "If the json file does not exist, other args are used to construct the " - "evaluation files list, and that list will be saved to the indicated json file.\n" - "If the json file does exist, and current args are consistent with " - "saved args, or are all None, then the saved files list will be used.\n" - "If current args are not consistent with the saved args, then error out " - "if eval_overwrite_files_list==False, else overwrite files_list with " - "a newly constructed list.") - parser.add_argument( - "--eval.overwrite_files_list", "--eval_overwrite_files_list", action="store_true", default=False, - dest="eval_overwrite_files_list", - help="When the --eval.files_list param is used, indicates whether to " - "overwrite the existing --eval.files_list when there are differences " - "between the current and saved dataset args. 
Default (False) is to " - "error out if files_list exists and differs from current params.") - parser.add_argument( - "--eval.data_dir", "--eval_data_dir", type=str, default=None, - dest="eval_data_dir", - help="Path to the cross-validation data directory." - "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " - "and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--eval.start_date", "--eval_start_datetime", - type=str, default=None, - dest="eval_start_datetime", - help="Starting date for evaluating inside the eval data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--eval.end_date", "--eval_end_datetime", type=str, default=None, - dest="eval_end_datetime", - help="Ending date for evaluating inside the eval data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--datetime_format", type=str, default="%Y/%m/%d", - help="Date format for training and evaluation datasets." - "Has to be a format that is understood by python datetime." - "e.g. %%Y/%%m/%%d for 2019/01/15." - "Used only if {train/eval}.{start/end}_date are provided.") - parser.add_argument( - "--hour_resolution", type=int, default=None, - help="Specify the hourly resolution of the stored data.") - parser.add_argument( - "--data_spec", type=str, required=True, - help="Path to data specification JSON file. This file is used to decode DataRecords") - parser.add_argument( - "--train.keep_rate", "--train_keep_rate", type=float, default=None, - dest="train_keep_rate", - help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ - distribution with p = 1 - keep_rate.") - parser.add_argument( - "--eval.keep_rate", "--eval_keep_rate", type=float, default=None, - dest="eval_keep_rate", - help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ - distribution with p = 1 - keep_rate.") - parser.add_argument( - "--train.parts_downsampling_rate", "--train_parts_downsampling_rate", - dest="train_parts_downsampling_rate", - type=float, default=None, - help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ - files. For example, a value of 0.2 means only 20 percent of part files become part of the \ - dataset.") - parser.add_argument( - "--eval.parts_downsampling_rate", "--eval_parts_downsampling_rate", - dest="eval_parts_downsampling_rate", - type=float, default=None, - help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ - files. For example, a value of 0.2 means only 20 percent of part files become part of the \ - dataset.") - parser.add_argument( - "--allow_train_eval_overlap", - dest="allow_train_eval_overlap", - action="store_true", - help="Allow overlap between train and eval datasets." - ) - parser.add_argument( - "--eval_name", type=str, default=None, - help="String denoting what we want to name the eval. If this is `train`, then we eval on \ - the training dataset." - ) - return parser - - def contrib_run_feature_importances(self, feature_importances_parse_fn=None, write_to_hdfs=True, extra_groups=None, datarecord_filter_fn=None, datarecord_filter_run_name=None): - """Compute feature importances on a trained model (this is a contrib feature) - Args: - feature_importances_parse_fn (fn): The same parse_fn that we use for training/evaluation. 
- Defaults to feature_config.get_parse_fn() - write_to_hdfs (bool): Setting this to True writes the feature importance metrics to HDFS - extra_groups (dict>): A dictionary mapping the name of extra feature groups to the list of - the names of the features in the group - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - logging.info("Computing feature importance") - algorithm = self._params.feature_importance_algorithm - - kwargs = {} - if algorithm == TREE: - kwargs["split_feature_group_on_period"] = self._params.split_feature_group_on_period - kwargs["stopping_metric"] = self._params.feature_importance_metric - kwargs["sensitivity"] = self._params.feature_importance_sensitivity - kwargs["dont_build_tree"] = self._params.dont_build_tree - kwargs["extra_groups"] = extra_groups - if self._params.feature_importance_is_metric_larger_the_better: - # The user has specified that the stopping metric is one where larger values are better (e.g. ROC_AUC) - kwargs["is_metric_larger_the_better"] = True - elif self._params.feature_importance_is_metric_smaller_the_better: - # The user has specified that the stopping metric is one where smaller values are better (e.g. LOSS) - kwargs["is_metric_larger_the_better"] = False - else: - # The user has not specified which direction is better for the stopping metric - kwargs["is_metric_larger_the_better"] = None - logging.info("Using the tree algorithm with kwargs {}".format(kwargs)) - - feature_importances = compute_feature_importances( - trainer=self, - data_dir=self._params.get('feature_importance_data_dir'), - feature_config=self._feature_config, - algorithm=algorithm, - record_count=self._params.feature_importance_example_count, - parse_fn=feature_importances_parse_fn, - datarecord_filter_fn=datarecord_filter_fn, - **kwargs) - - if not feature_importances: - logging.info("Feature importances returned None") - else: - if write_to_hdfs: - logging.info("Writing feature importance to HDFS") - write_feature_importances_to_hdfs( - trainer=self, - feature_importances=feature_importances, - output_path=datarecord_filter_run_name, - metric=self._params.get('feature_importance_metric')) - else: - logging.info("Not writing feature importance to HDFS") - - logging.info("Writing feature importance to ML Metastore") - write_feature_importances_to_ml_dash( - trainer=self, feature_importances=feature_importances) - return feature_importances - - def export_model(self, serving_input_receiver_fn=None, - export_output_fn=None, - export_dir=None, checkpoint_path=None, - feature_spec=None): - """ - Export the model for prediction. Typically, the exported model - will later be run in production servers. This method is called - by the user to export the PREDICT graph to disk. - - Internally, this method calls `tf.estimator.Estimator.export_savedmodel - `_. - - Args: - serving_input_receiver_fn (Function): - function preparing the model for inference requests. - If not set; defaults to the the serving input receiver fn set by the FeatureConfig. - export_output_fn (Function): - Function to export the graph_output (output of build_graph) for - prediction. Takes a graph_output dict as sole argument and returns - the export_output_fns dict. - Defaults to ``twml.export_output_fns.batch_prediction_continuous_output_fn``. - export_dir: - directory to export a SavedModel for prediction servers. 
- Defaults to ``[save_dir]/exported_models``. - checkpoint_path: - the checkpoint path to export. If None (the default), the most recent checkpoint - found within the model directory ``save_dir`` is chosen. - - Returns: - The export directory where the PREDICT graph is saved. - """ - if serving_input_receiver_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - serving_input_receiver_fn = self._feature_config.get_serving_input_receiver_fn() - - if feature_spec is None: - if self._feature_config is None: - raise ValueError("feature_spec can not be inferred." - "Please pass feature_spec=feature_config.get_feature_spec() to the trainer.export_model method") - else: - feature_spec = self._feature_config.get_feature_spec() - - if isinstance(serving_input_receiver_fn, twml.feature_config.FeatureConfig): - raise ValueError("Cannot pass FeatureConfig as a parameter to serving_input_receiver_fn") - elif not callable(serving_input_receiver_fn): - raise ValueError("Expecting Function for serving_input_receiver_fn") - - if export_output_fn is None: - export_output_fn = twml.export_output_fns.batch_prediction_continuous_output_fn - - return super(DataRecordTrainer, self).export_model( - export_dir=export_dir, - serving_input_receiver_fn=serving_input_receiver_fn, - checkpoint_path=checkpoint_path, - export_output_fn=export_output_fn, - feature_spec=feature_spec, - ) - - def get_train_input_fn( - self, parse_fn=None, repeat=None, shuffle=True, interleave=True, shuffle_files=None, - initializable=False, log_tf_data_summaries=False, **kwargs): - """ - This method is used to create input function used by estimator.train(). - - Args: - parse_fn: - Function to parse a data record into a set of features. - Defaults to the parser returned by the FeatureConfig selected - repeat (optional): - Specifies if the dataset is to be repeated. Defaults to `params.train_steps > 0`. - This ensures the training is run for atleast `params.train_steps`. - Toggling this to `False` results in training finishing when one of the following happens: - - The entire dataset has been trained upon once. - - `params.train_steps` has been reached. - shuffle (optional): - Specifies if the files and records in the files need to be shuffled. - When `True`, files are shuffled, and records of each files are shuffled. - When `False`, files are read in alpha-numerical order. Also when `False` - the dataset is sharded among workers for Hogwild and distributed training - if no sharding configuration is provided in `params.train_dataset_shards`. - Defaults to `True`. - interleave (optional): - Specifies if records from multiple files need to be interleaved in parallel. - Defaults to `True`. - shuffle_files (optional): - Shuffle the list of files. Defaults to 'Shuffle' if not provided. - initializable (optional): - A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value - (false) is used for most plain iterators. - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - An input_fn that can be consumed by `estimator.train()`. 
- """ - if parse_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - parse_fn = self._feature_config.get_parse_fn() - - if not callable(parse_fn): - raise ValueError("Expecting parse_fn to be a function.") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - if repeat is None: - repeat = self.params.train_steps > 0 or self.params.get('distributed', False) - - if not shuffle and self.num_workers > 1 and self.params.train_dataset_shards is None: - num_shards = self.num_workers - shard_index = self.worker_index - else: - num_shards = self.params.train_dataset_shards - shard_index = self.params.train_dataset_shard_index - - return lambda: twml.input_fns.default_input_fn( - files=self._train_files, - batch_size=self.params.train_batch_size, - parse_fn=parse_fn, - num_threads=self.params.num_threads, - repeat=repeat, - keep_rate=self.params.train_keep_rate, - parts_downsampling_rate=self.params.train_parts_downsampling_rate, - shards=num_shards, - shard_index=shard_index, - shuffle=shuffle, - shuffle_files=(shuffle if shuffle_files is None else shuffle_files), - interleave=interleave, - initializable=initializable, - log_tf_data_summaries=log_tf_data_summaries, - **kwargs) - - def get_eval_input_fn( - self, parse_fn=None, repeat=None, - shuffle=True, interleave=True, - shuffle_files=None, initializable=False, log_tf_data_summaries=False, **kwargs): - """ - This method is used to create input function used by estimator.eval(). - - Args: - parse_fn: - Function to parse a data record into a set of features. - Defaults to twml.parsers.get_sparse_parse_fn(feature_config). - repeat (optional): - Specifies if the dataset is to be repeated. Defaults to `params.eval_steps > 0`. - This ensures the evaluation is run for atleast `params.eval_steps`. - Toggling this to `False` results in evaluation finishing when one of the following happens: - - The entire dataset has been evaled upon once. - - `params.eval_steps` has been reached. - shuffle (optional): - Specifies if the files and records in the files need to be shuffled. - When `False`, files are read in alpha-numerical order. - When `True`, files are shuffled, and records of each files are shuffled. - Defaults to `True`. - interleave (optional): - Specifies if records from multiple files need to be interleaved in parallel. - Defaults to `True`. - shuffle_files (optional): - Shuffles the list of files. Defaults to 'Shuffle' if not provided. - initializable (optional): - A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value - (false) is used for most plain iterators. - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - An input_fn that can be consumed by `estimator.eval()`. 
- """ - if parse_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - parse_fn = self._feature_config.get_parse_fn() - - if not self._eval_files: - raise ValueError("`eval_files` was not present in `params` passed to `DataRecordTrainer`") - - if not callable(parse_fn): - raise ValueError("Expecting parse_fn to be a function.") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - if repeat is None: - repeat = self.params.eval_steps > 0 - - return lambda: twml.input_fns.default_input_fn( - files=self._eval_files, - batch_size=self.params.eval_batch_size, - parse_fn=parse_fn, - num_threads=self.params.num_threads, - repeat=repeat, - keep_rate=self.params.eval_keep_rate, - parts_downsampling_rate=self.params.eval_parts_downsampling_rate, - shuffle=shuffle, - shuffle_files=(shuffle if shuffle_files is None else shuffle_files), - interleave=interleave, - initializable=initializable, - log_tf_data_summaries=log_tf_data_summaries, - **kwargs - ) - - def _assert_train_files(self): - if not self._train_files: - raise ValueError("train.data_dir was not set in params passed to DataRecordTrainer.") - - def _assert_eval_files(self): - if not self._eval_files: - raise ValueError("eval.data_dir was not set in params passed to DataRecordTrainer.") - - def train(self, input_fn=None, steps=None, hooks=None): - """ - Makes input functions optional. input_fn defaults to self.get_train_input_fn(). - See Trainer for more detailed documentation documentation. - """ - if input_fn is None: - self._assert_train_files() - input_fn = input_fn if input_fn else self.get_train_input_fn() - super(DataRecordTrainer, self).train(input_fn=input_fn, steps=steps, hooks=hooks) - - def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): - """ - Makes input functions optional. input_fn defaults to self.get_eval_input_fn(). - See Trainer for more detailed documentation. - """ - if input_fn is None: - self._assert_eval_files() - input_fn = input_fn if input_fn else self.get_eval_input_fn(repeat=False) - return super(DataRecordTrainer, self).evaluate( - input_fn=input_fn, - steps=steps, - hooks=hooks, - name=name - ) - - def learn(self, train_input_fn=None, eval_input_fn=None, **kwargs): - """ - Overrides ``Trainer.learn`` to make ``input_fn`` functions optional. - Respectively, ``train_input_fn`` and ``eval_input_fn`` default to - ``self.train_input_fn`` and ``self.eval_input_fn``. - See ``Trainer.learn`` for more detailed documentation. - """ - if train_input_fn is None: - self._assert_train_files() - if eval_input_fn is None: - self._assert_eval_files() - train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() - eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() - - super(DataRecordTrainer, self).learn( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - **kwargs - ) - - def train_and_evaluate(self, - train_input_fn=None, eval_input_fn=None, - **kwargs): - """ - Overrides ``Trainer.train_and_evaluate`` to make ``input_fn`` functions optional. - Respectively, ``train_input_fn`` and ``eval_input_fn`` default to - ``self.train_input_fn`` and ``self.eval_input_fn``. - See ``Trainer.train_and_evaluate`` for detailed documentation. 
- """ - if train_input_fn is None: - self._assert_train_files() - if eval_input_fn is None: - self._assert_eval_files() - train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() - eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() - - super(DataRecordTrainer, self).train_and_evaluate( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - **kwargs - ) - - def _model_fn(self, features, labels, mode, params, config=None): - """ - Overrides the _model_fn to correct for the features shape of the sparse features - extracted with the contrib.FeatureConfig - """ - if isinstance(self._feature_config, twml.contrib.feature_config.FeatureConfig): - # Fix the shape of the features. The features dictionary will be modified to - # contain the shape changes. - twml.util.fix_shape_sparse(features, self._feature_config) - return super(DataRecordTrainer, self)._model_fn( - features=features, - labels=labels, - mode=mode, - params=params, - config=config - ) - - def calibrate(self, - calibrator, - input_fn=None, - steps=None, - save_calibrator=True, - hooks=None): - """ - Makes input functions optional. input_fn defaults to self.train_input_fn. - See Trainer for more detailed documentation. - """ - if input_fn is None: - self._assert_train_files() - input_fn = input_fn if input_fn else self.get_train_input_fn() - super(DataRecordTrainer, self).calibrate(calibrator=calibrator, - input_fn=input_fn, - steps=steps, - save_calibrator=save_calibrator, - hooks=hooks) - - def save_checkpoints_and_export_model(self, - serving_input_receiver_fn, - export_output_fn=None, - export_dir=None, - checkpoint_path=None, - input_fn=None): - """ - Exports saved module after saving checkpoint to save_dir. - Please note that to use this method, you need to assign a loss to the output - of the build_graph (for the train mode). - See export_model for more detailed information. - """ - self.train(input_fn=input_fn, steps=1) - self.export_model(serving_input_receiver_fn, export_output_fn, export_dir, checkpoint_path) - - def save_checkpoints_and_evaluate(self, - input_fn=None, - steps=None, - hooks=None, - name=None): - """ - Evaluates model after saving checkpoint to save_dir. - Please note that to use this method, you need to assign a loss to the output - of the build_graph (for the train mode). - See evaluate for more detailed information. - """ - self.train(input_fn=input_fn, steps=1) - self.evaluate(input_fn, steps, hooks, name) diff --git a/twml/twml/trainers/trainer.py b/twml/twml/trainers/trainer.py deleted file mode 100644 index e51b4e0fd..000000000 --- a/twml/twml/trainers/trainer.py +++ /dev/null @@ -1,1777 +0,0 @@ -# pylint: disable=too-many-lines -""" -``twml.trainers.Trainer`` is a wrapper around `tf.estimator.Estimator -`_ -to expose an easier to use API by -hiding rarely used config knobs and supplying default values. - -The `Trainer` facilitates multi-phase training commonly used at Twitter: e.g. -MDL calibration -> MLP training -> Isotonic calibration. -The `Trainer` also facilitates hyperparameters tuning, -with its simple `add_parser_arguments()` method. - -Learning rate decay functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Please note that we have four learning rate decay functions to choose from. -Additionally, each trainer can only take one learning rate decay function and its parameters. -If that is not the case, it will throw an error. 
-Also, please note that the learning rate decay is a positional argument and should be placed as -the last argument to the trainer, as you can see in the example above. -The four learning decays options are: - -1. inverse_learning_rate_decay: - - The function returns the decayed learning rate. It is computed as: - - :: - - decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /decay_step) - final_decayed_learning_rate = max(decayed_learning_rate, min_learning_rate) - - -2. polynomial_learning_rate_decay: - - The function returns the decayed learning rate. It is computed as: - - :: - - global_step = min(global_step, decay_steps) - decayed_learning_rate = (learning_rate - end_learning_rate) * - (1 - global_step / decay_steps) ^ (power) + - end_learning_rate - - -3. piecewise_constant_learning_rate_decay: - - Piecewise constant from boundaries and interval values. - - Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 for - the next 10000 steps, and 0.1 for any additional steps. - - :: - - global_step = tf.Variable(0, trainable=False) - boundaries = [100000, 110000] - values = [1.0, 0.5, 0.1] - learning_rate = tf.train.piecewise_constant(global_step, boundaries, values) - -4. exponential_learning_rate_decay: - - The function returns the decayed learning rate. It is computed as: - - :: - - decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) - -""" - -import datetime -import functools -import math -from operator import itemgetter -import os -import pprint as pp -import random -from string import Template -import subprocess -import sys -import time -from threading import Thread - -from twitter.common.metrics import AtomicGauge -from twitter.deepbird.stats_server import utils as stats_server_utils -from twitter.deepbird.stats_server.stats_exporter import StatsExporter -from twitter.ml.common import metrics -from twitter.ml.common.kubernetes import kubectl_delete_by_name, Resource -from twitter.ml.twml.status import get_distributed_training_job_status, TrainingJobStatus - -from absl import logging -from twml.optimizers import LazyAdamOptimizer, optimize_loss, OPTIMIZER_SUMMARIES -from twml.contrib.optimizers import DeepGradientCompressionOptimizer -from twml.tracking import ExperimentTracker -from twml.util import (delete_file_or_dir, - get_distributed_training_job_path, - sanitize_hdfs_path) -try: - from urllib import quote as encode_url -except ImportError: - from urllib.parse import quote as encode_url -import tensorflow.compat.v1 as tf -import tensorflow -import tensorflow_hub as hub - -import twitter.ml.twml.kubernetes.status as k8s_status -import twml -import twml.export_output_fns -import twml.learning_rate_decay -import twml.metrics - - -_CLUSTER_TEMPLATE = Template('''{ - "cluster": { - "ps": [$PS], - "chief": [$CHIEF], - "worker": [$WORKER] - }, - "task": {"type": "$TYPE", "index": $INDEX} -} -''') - - -def init_from_checkpoint(init_dir, init_map): - """ - Wrapper around tf.train.init_from_checkpoint - """ - if init_dir: - init_dir = sanitize_hdfs_path(init_dir) - tf.train.init_from_checkpoint(init_dir, init_map) - - -class Trainer(object): - """ - This class wraps ``tf.estimator.Estimator`` to make construction, saving, and loading easier. - Supports multi-phase training (for example, use a Trainer for MDL calibration, then - another for training the rest of the model, then another for isotonic calibration). - The Trainer also implements a training and evaluation loop via the ``learn()`` method. 
- Each Trainer is associated to a fixed set of hyper parameters (params), and a single model - specified by ``build_graph``. Given these constraints, a single Trainer can be called - multiple times for training and evaluation over multiple epochs. - - However, if you intend to try different sets of hyper-parameters, we recommend you instantiate - a different Trainer for each such experiment. That way, each experiment can be tracked - in a different ``save_dir``. Indeed, after calling ``learn``, a Trainer's save_dir will contain - checkpoints of the model (its graph, and variables), and the history of metrics (for example, - evaluation accuracy at each epoch), and other store observations like the average time per step. - The latter metrics can be viewed by pointing - TensorBoard to the save_dir and accessing TensorBoard via your browser. - """ - - def __init__(self, name, params, build_graph_fn, - metric_fn=None, - optimize_loss_fn=None, - run_config=None, - save_dir=None, - init_from_dir=None, - init_map=None, - warm_start_from=None, - profiler_steps=None, - **kwargs): - """ - - Args: - name (String): - string name of this estimator; used as scope names for variables and tensors. - params (HParams, Namespace, or Dict): - hyper-parameters to be passed to Estimator constructor. - Must include params.train_batch_size and params.eval_batch_size. - Note that params is passed to twml.util.convert_to_hparams() to produce an HParams. - build_graph_fn: - A function for building tensorflow graphs. - This matches TensorFlow Estimator's model_fn signature. - For example, - - .. code-block:: python - - def build_graph(features, label, mode, params, config=None): - # Implements a simple binary logistic regression model - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - - logits = twml.layers.full_sparse(sparse_tf, 1 << params.input_size_bits, 1) - - if mode == 'infer': - loss = None - else: - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits) - loss = twml.util.weighted_average(loss, features['weights']) - - output = tf.nn.sigmoid(logits) - - return {'output': output, 'loss': loss} - - Args: - features (dict of Tensor keyed by a string name): - input tensors. - mode (tf.estimator.ModeKeys / String): - one of 'train', 'eval', 'infer'. - label (Tensor): - if in ``mode == 'train'`` mode, these contain the corresponding labels for input. - params (HParams): - hyper parameters that control how to build a graph. - config: - the RunConfig object passed to Estimator constructor. - - This function is expected to return a dictionary containing the following keys: - - * 'output': a node representing model output; required. - * 'loss': (required) a loss node used for optimization; required for training and - evaluation. - * 'train_op': (optional) an operation that minimizes the loss (as output by - `tf.train.Optimizer.minimize`). If train_op is specified, train_op is used - for optimization as opposed to loss. Loss is always logged to tensorboard. - - Notes: - - * any tf.summary written inside build graph are logged to tensorboard during training. - * the ``build_graph_fn`` is called once or twice per epoch (once per training, - once per evaluation). All data loading (and preprocessing) logic not required - for serving should be in the ``input_fn`` passed to ``learn``, ``train``, - ``evalulate``, etc. - - optimize_loss_fn: - Defaults to Trainer.get_train_op. A function that takes params and loss as arguments - and returns a training op. 
The training op is used to update parameters (that is, to learn).
- metric_fn:
- A function that returns the eval_metric_ops dict given graph_output, labels and weights.
- Defaults to None.
- Use ``twml.metrics.get_binary_class_metric_fn()`` to return a ``metric_fn``
- which implements many binary classification metrics.
- run_config (RunConfig):
- optional configuration to be passed to Estimator constructor. Defaults to None.
- save_dir (String):
- optional directory in which to save model checkpoints,
- tensorboard event files and trained parameters.
- Overwrites and defaults to run_config.model_dir.
- init_from_dir (String):
- optional directory to load weights from.
- if set to None (the default), do not init from any directory.
- init_map (map from String to String):
- Must be specified if init_from_dir is specified.
- Defines which scopes and variables to load.
- Keys are the variables and scopes to load from the directory.
- Values are the destinations (in the current graph) to load into.
- See tf.init_from_checkpoint for more information.
- Note that the trainer prepends a name_scope of the form `name`/model/ to the name_scope
- of any variable defined inside `build_graph_fn`, and this should be taken into account when
- defining the values.
- warm_start_from:
- Optional string filepath to a checkpoint to warm-start from,
- or a tf.estimator.WarmStartSettings object to fully configure warm-starting.
- If the string filepath is provided instead of a WarmStartSettings,
- then all variables are warm-started, and it is assumed that
- vocabularies and Tensor names are unchanged.
- profiler_steps (Integer):
- Defaults to None. If set, defines the number of steps between profiles captured by
- ``tf.train.ProfilerHook``.
- Captures CPU/GPU profiling information every ``profiler_steps`` steps or seconds.
- When executing ``learn``, ``train`` or ``predict`` methods,
- with ``profiler_steps`` set to a number,
- a ``timeline_X.json`` file is created in the save_dir. This file contains profiling data
- stored in Chrome trace format. To view stored data, use the Chrome browser to follow
- these steps:
-
- 1) Go to the page chrome://tracing.
- 2) In the upper left corner, you will find the Load button.
- 3) Press it and load the JSON file, which can be found in the ``save_dir``.
-
- *Warning*: This could create too many of these JSON files, which can be a potential problem;
- e.g. on HDFS there is normally a quota for file count, so use with caution.
-
- Note: this argument is ignored when a non-None ``hooks`` argument is passed to
- the ``train``, ``learn``, or ``predict`` methods. The hooks can be added manually by passing
- ``trainer.train(..., hooks=myhooks + trainer.get_train_hooks())``, for example.
- """
-
- if tensorflow.__version__ >= "2.0":
- raise RuntimeError("Trainer not yet supported for Tensorflow >= 2.0")
-
- self._name = name
- self._build_graph_fn = build_graph_fn
- self._metric_fn = metric_fn
- self._tensorboard_handle = None
- self._current_estimator_spec = None # holds the current estimator spec
- self._profiler_steps = profiler_steps
- self._export_output_fn = None
- self._is_early_stopping = False
-
- # NOTE: Sanitize all HDFS paths first.
- save_dir = sanitize_hdfs_path(save_dir)
- init_from_dir = sanitize_hdfs_path(init_from_dir)
-
- # warm_start_from can be of type tf.estimator.WarmStartSettings.
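# For illustration only (hypothetical paths), both accepted forms look like:
#   trainer = Trainer(..., warm_start_from="hdfs:///path/to/previous_run")
#   trainer = Trainer(..., warm_start_from=tf.estimator.WarmStartSettings(
#       ckpt_to_initialize_from="hdfs:///path/to/previous_run",
#       vars_to_warm_start=".*model.*"))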
- if isinstance(warm_start_from, str): - warm_start_from = sanitize_hdfs_path(warm_start_from) - - # convert to twitter.deepbird.hparam.hparam.HParams object - params = twml.util.convert_to_hparams(params) - - # keep a copy of the params because calling self._estimator.params creates a deepcopy - self._params = params - self.check_params() - - self._using_hogwild = True if os.environ.get('TWML_HOGWILD_PORTS') else False - # configure Hogwild (needs to be called before RunConfig is created) - self._hogwild_setup() - - if not run_config: - session_config = tf.ConfigProto() - # By default each process tries to allocate (almost) all of the memory. - # This option ensures the gpu memory grows dynamically instead. - session_config.gpu_options.allow_growth = True # pylint: disable=no-member - - if 'TWML_NUM_CPUS' in os.environ: - num_available_cpus = int(os.environ.get("TWML_MESOS_CPU", "8")) - if params.num_mkl_threads > 1: - os.environ["OMP_NUM_THREADS"] = str(params.num_mkl_threads) - os.environ["MKL_NUM_THREADS"] = str(params.num_mkl_threads) - session_config.inter_op_parallelism_threads = num_available_cpus // params.num_mkl_threads - session_config.intra_op_parallelism_threads = params.num_mkl_threads - - run_config = tf.estimator.RunConfig( - session_config=session_config, - keep_checkpoint_max=self._params.get('keep_checkpoint_max', 20), - log_step_count_steps=10000, - save_checkpoints_secs=self._params.get('save_checkpoints_secs', 600), - tf_random_seed=self._tf_random_seed()) - elif not isinstance(run_config, tf.estimator.RunConfig): - raise ValueError("Expecting run_config argument of type None or tf.estimator.RunConfig" - "Got %s instead." % type(run_config).__name__) - elif os.environ.get('TWML_HOGWILD_PORTS'): - raise ValueError("Custom RunConfig not supported with Hogwild") - - if run_config.model_dir is None and save_dir is None: - raise ValueError( - "Expecting either save_dir or run_config.model_dir to be specified. Got None for each.") - elif run_config.model_dir is None: - run_config = run_config.replace(model_dir=save_dir) - elif save_dir is None: - save_dir = run_config.model_dir - - self._save_dir = save_dir - self.experiment_tracker = ExperimentTracker(self._params, run_config, self._save_dir) - - # Check if should delete the tsd running this training job. In certain use case when - # there are other tf operations following trainer.train_and_evaluate (or trainer.learn), - # additional state files need to be specified to ensure those steps are executed after job restart. - kwargs['gke_state_files'] = kwargs.get('gke_state_files', ['_SUCCESS']) - self._maybe_del_tsd_exit(kwargs['gke_state_files']) - logging.info("Checkpoint and event files will be saved at save_dir=%s", save_dir) - self._optimize_loss_fn = self.get_train_op if optimize_loss_fn is None else optimize_loss_fn - - # overwrite the current save_dir - if self._params.get('overwrite_save_dir') and tf.io.gfile.exists(self._save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % self._save_dir) - # if distributed or hogwild: - if self._params.get('distributed', False): - # sleep for 30 seconds to allow each worker to get to this point. - time.sleep(30) - if run_config.is_chief: - logging.info("Chief deleting the save_dir now") - delete_file_or_dir(self._save_dir) - # sleep for 30 seconds to allow each worker to get to this point. 
- time.sleep(30) - else: - delete_file_or_dir(self._save_dir) - - # Exposing stats to a /vars.json endpoint that will be collected - # by the absorber - if self._params.get('stats_port'): - try: - stats_server_utils.start_stats_server(self._params.get('stats_port'), self._save_dir) - except Exception as err: - logging.error('Failed to start the stats server. Error: %s', str(err)) - - checkpoint = os.path.join(self._save_dir, 'checkpoint') - if tf.io.gfile.exists(checkpoint): - logging.info("The provided save_dir directory %s already exists." - " Training will be resumed." - % checkpoint) - - self._maybe_restore_checkpoint = lambda: init_from_checkpoint(init_from_dir, init_map) - - if init_from_dir is not None and init_map is None: - raise ValueError("Need to provide init_map when init_from_dir is provided.") - - if not tf.io.gfile.exists(self._save_dir): - # so tensorboard can point to a directory that exists - tf.io.gfile.mkdir(self._save_dir) - - self._estimator = tf.estimator.Estimator( - model_fn=self._model_fn, - params=self._params, # HParams - config=run_config, # RunConfig - warm_start_from=warm_start_from, - model_dir=self._save_dir, # By this point it is same as run_config.model_dir - ) - - # Log parameters that are used to construct trainer. This allows people to see default values. - logging.info("Trainer constructed using the following parameters: ") - pp_params = pp.pformat(self._params.values()) - logging.info(pp_params) - - # Start TensorBoard - if self._params.get('disable_tensorboard', False): - logging.info("Skipping launching TensorBoard [--disable_tensorboard is set]") - elif "tensorboard_port" in self._params.values() and self._params.tensorboard_port is not None: - self.start_tensorboard(self._params.tensorboard_port) - - # Export gauge that will track whether a model was exported - self.stats_exporter = StatsExporter("twml.trainer") - self.export_gauge = AtomicGauge('export_model') - self.stats_exporter.register_metrics(self.export_gauge) - - def _hogwild_setup(self): - """ - Setup the parameters required for hogwild. 
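For example (illustrative values), a job with ``num_workers=3`` would expect
something like ``TWML_HOGWILD_PORTS="16000,16001,16002,16003"``: the first
port is assigned to the parameter server, the second to the chief, and the
remaining ports to the other workers.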
- """ - self._num_workers = self._params.get('num_workers') or 1 - logging.info("NUM_WORKERS: %d", self._num_workers) - if self._num_workers <= 1: - self._ports = None - return - - # a hogwild job is considered distributed - if 'distributed' in self._params: - self._params.set_hparam('distributed', True) - else: - self._params.add_hparam('distributed', True) - - ports = os.environ.get('TWML_HOGWILD_PORTS') - if ports: - self._ports = [int(port) for port in ports.strip().split(",")] - if (self._num_workers + 1!= len(self._ports)): - raise ValueError("Number of (workers + PS) and ports need to match") - else: - if self._num_workers > 1: - raise ValueError("TWML_HOGWILD_PORTS needs to be set to use hogwild training") - - # Split the number of data threads across multiple workers - num_threads = self._params.get('num_threads') - num_threads_per_worker = int(math.ceil(float(num_threads) / self._num_workers)) - self._params.set_hparam('num_threads', num_threads_per_worker) - - hogwild_task_type = os.environ.get('TWML_HOGWILD_TASK_TYPE') - hogwild_task_id = int(os.environ.get('TWML_HOGWILD_TASK_ID')) - os.environ['TF_CONFIG'] = self._get_cluster_config(hogwild_task_type, hogwild_task_id) - - def _tf_random_seed(self): - """ Returns user set seed and deal with Hogwild multiple seeds """ - tf_random_seed = self._params.get('tf_random_seed', None) - if tf_random_seed is None: - return None - elif self.using_hogwild and os.environ.get('TWML_HOGWILD_TASK_TYPE') == 'worker': - # chief (tf_random_seed), worker_0 (tf_random_seed + 1), worker_1 (tf_random_seed + 2)... - return tf_random_seed + 1 + int(os.environ.get('TWML_HOGWILD_TASK_ID')) - else: - return tf_random_seed - - def check_params(self): - """ Verify that params has the correct key,values """ - param_values = self._params.values() - - if 'train_batch_size' in param_values: - if not isinstance(self._params.train_batch_size, int): - raise ValueError("Expecting params.train_batch_size to be an integer.") - if self._params.train_batch_size <= 0: - raise ValueError("train_batch_size needs to be positive") - else: - raise ValueError("train_batch_size needs to be present in params") - - if 'eval_batch_size' in param_values: - if not isinstance(self._params.eval_batch_size, int): - raise ValueError("Expecting params.eval_batch_size to be an integer.") - if self._params.eval_batch_size <= 0: - raise ValueError("eval_batch_size needs to be positive.") - else: - self._params.add_hparam('eval_batch_size', self._params.train_batch_size) - - if (self._params.get('distributed_training_cleanup') and - not self._params.get('distributed')): - # we only need to support training discontinuation for distributed training - # bc we are still using TSDs on GKE for distributed training - raise ValueError( - "Expecting params.distributed to be set if " - "params.distributed_training_cleanup is set." 
- ) - - def _get_cluster_config(self, name, index): - """Create a tensorflow cluster config from ports, name and index""" - host = '"localhost:%d"' - ps = host % self._ports[0] - chief = host % self._ports[1] - workers = ", ".join([host % port for port in self._ports[2:]]) - config = _CLUSTER_TEMPLATE.substitute( - PS=ps, - CHIEF=chief, - WORKER=workers, - TYPE=name, - INDEX=index, - ) - return config - - @property - def current_estimator_spec(self): - """ - returns the current estimator (warning: often reset) - """ - return self._current_estimator_spec - - @property - def estimator(self): - """ returns estimator encapsulated by Trainer """ - return self._estimator - - @property - def num_workers(self): - """ returns number of workers """ - return self._estimator.config.num_worker_replicas - - @property - def worker_index(self): - """ - returns index of worker in the cluster - chief has index 0 - non-chief workers have indices 1 through (num_workers - 1) - """ - return self._estimator.config.global_id_in_cluster - - @property - def using_hogwild(self): - """ returns a bool indicating whether hogwild is being used """ - return self._using_hogwild - - def set_estimator(self, estimator): - """ sets the estimator used internally by Trainer """ - if not isinstance(estimator, tf.estimator.Estimator): - raise ValueError("Expecting tf.estimator.Estimator") - self._estimator = estimator - self._params = self.estimator.params - - @property - def params(self): - """ - returns the hyper-parameters passed to the constructor. - """ - return self._params - - @staticmethod - def add_parser_arguments(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - return twml.argument_parser.get_trainer_parser() - - @staticmethod - def get_train_op(params, loss): - """ - Return a training Op, that is, a `twml.optimizers.optimize_loss - `_ - instance given params and loss. - This method can be overwritten by passing the optimize_loss_fn to the Trainer - constructor. - - Args: - params: - tensorflow.contrib.training.HParams instance. Recognizes the optimizer, optimizer_summaries, - gradient_noise_scale, clip_gradients and learning_rate_decay (including - other learning rate decay arguments). - loss: - scalar Op returned by the build_graph that specifies the training loss to - be minimized. 
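For example, a custom function with the same signature could be passed to the
constructor instead (a sketch; the choice of optimizer here is arbitrary):

.. code-block:: python

    def custom_optimize_loss(params, loss):
        # any function returning a train op for the given loss works here
        optimizer = tf.train.AdagradOptimizer(learning_rate=params.learning_rate)
        return optimizer.minimize(loss, global_step=tf.train.get_global_step())

    trainer = twml.trainer.Trainer(..., optimize_loss_fn=custom_optimize_loss)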
- """ - optimizer = params.get('optimizer') - - if not optimizer: - optimizer = 'SGD' - - if optimizer == 'LazyAdam': - optimizer = LazyAdamOptimizer - - if optimizer == 'DGC': - optimizer = DeepGradientCompressionOptimizer( - learning_rate=params.learning_rate, - use_locking=False, - name="Sparse", - density=params.get('dgc_density'), - density_decay=params.get('dgc_density_decay'), - density_decay_steps=params.get('dgc_density_decay_steps'), - density_decay_rate=params.get('dgc_density_decay_rate'), - min_density=params.get('dgc_min_density'), - accumulation=params.get('dgc_accumulation') - ) - - summaries = ['loss'] - if params.get('show_optimizer_summaries'): - summaries = OPTIMIZER_SUMMARIES - - train_op = optimize_loss( - loss=loss, - global_step=tf.train.get_global_step(), - optimizer=optimizer, - learning_rate=params.learning_rate, - summaries=summaries, - colocate_gradients_with_ops=True, - gradient_noise_scale=params.get('gradient_noise_scale'), - clip_gradients=params.get('clip_gradients'), - learning_rate_decay_fn=twml.learning_rate_decay.get_learning_rate_decay_fn(params) - ) - return train_op - - def export_model_effects(self, export_path, feature_spec=None, log_features=True): - - # DO NOT CHANGE THE ORDER. - # This needs to be done before registering the model. - if feature_spec: - if log_features: - features = feature_spec['features'] - feature_names = ['.'.join(features[fid]['featureName'].split('.')[1:]) for fid in features.keys()] - features_to_log = ','.join(feature_names) - try: - model_hash = self.experiment_tracker.compute_model_hash(export_path) - metrics.log_usage('dbv2', 'export_model_effects', 'v1', custom_attrs=[model_hash, "feature config present", features_to_log]) - except: # noqa: T803 - logging.info("Failed to log Feature Config features") - - twml.contrib.export.export_fn.export_feature_spec(export_path, feature_spec) - export_start_time = time.time() - self.experiment_tracker.export_feature_spec(feature_spec) - logging.info("Exported feature spec to ML Metastore in %s seconds.", time.time() - export_start_time) - - self.experiment_tracker.register_model(str(export_path)) - self.export_gauge.increment() - - @property - def best_or_latest_checkpoint(self): - if self._is_early_stopping: - best_checkpoint_path = os.path.join(self._save_dir, "best_checkpoint") - checkpoint_path = tf.train.latest_checkpoint(best_checkpoint_path) - # Return best checkpoint if necessary - if checkpoint_path: - return checkpoint_path - else: - raise ValueError("Best checkpoint not found at %s." % best_checkpoint_path) - else: # Fallback to latest checkpoint from save directory - return self.latest_checkpoint - - @property - def latest_checkpoint(self): - return self.estimator.latest_checkpoint() - - def export_model(self, serving_input_receiver_fn, - export_output_fn=None, - export_dir=None, checkpoint_path=None, - feature_spec=None, - log_features=True): - """ - Export the model for prediction. Typically, the exported model - will later be run in production servers. This method is called - by the user to export the PREDICTgraph to disk. - - Internally, this method calls `tf.estimator.Estimator.export_savedmodel - `_. - - Note that a valid self._export_output_fn is required. - If export_ouput_fn is provided, it is used to set the self._export_output_fn. - - Args: - serving_input_receiver_fn: - function preparing the model for inference requests. - This funtion returns the ``features`` dict passed to ``build_graph``. 
- export_dir: - directory to export a SavedModel for prediction servers. - Defaults to ``[save_dir]/exported_models``. - checkpoint_path: - the checkpoint path to export. If None (the default), the most recent checkpoint - found within the model directory is chosen. - export_output_fn: - Function to export the graph_output (output of build_graph) for - prediction. Takes a graph_output dict as sole argument and returns - the export_output_fns dict. - Defaults to `twml.export_output_fns.default_output_fn`. - - Return: - returns a string path to exported directory. - - # set the export output function - """ - if not self.is_chief(): - logging.info("Trainer.export_model ignored due to the process not being chief.") - return - - self._export_output_fn = export_output_fn or twml.export_output_fns.default_output_fn - - if not callable(self._export_output_fn): - raise RuntimeError( - "Expecting export_output_fn function. Got %s." - % type(self._export_output_fn).__name__) - - if export_dir: - export_dir = sanitize_hdfs_path(export_dir) - - if checkpoint_path: - checkpoint_path = sanitize_hdfs_path(checkpoint_path) - else: - checkpoint_path = self.best_or_latest_checkpoint - - # actually export the model using the Estimator API - export_path = self._estimator.export_savedmodel( - export_dir_base=export_dir or os.path.join(self._save_dir, 'exported_models'), - serving_input_receiver_fn=serving_input_receiver_fn, - checkpoint_path=checkpoint_path) - - # export_path is bytes, need to convert to string for python3 to work. - logging.info("The exported model path is: " + str(export_path)) - - self.export_model_effects(export_path, feature_spec, log_features) - - return export_path - - def _model_fn(self, features, labels, mode, params, config=None): - """ - returns tf.estimator.EstimatorSpec that can be used with tf.estimator.Estimators. - You would probably never need to modify this method. - Instead, you should override build_graph, which this method calls. - - Args: - features: - Dict of input tensors. - labels: - Tensor of target labels. - mode: - an instance of tf.estimator.ModeKeys. - Typically used to toggle TRAINing or EVALuation. - params: - HParams object containing hyper-parameters. - """ - # pylint: disable=too-many-branches - if isinstance(features, dict): - weights = features.get('weights', None) - else: - weights = None - - with tf.variable_scope(self._name + '/model'): - graph_output = self._build_graph_fn(features, labels, mode, params, config) - loss = graph_output['loss'] if 'loss' in graph_output else None - - self._maybe_restore_checkpoint() - - with tf.variable_scope(self._name + '/optim'): - train_op = None - if mode == tf.estimator.ModeKeys.TRAIN: - if 'train_op' in graph_output: - train_op = graph_output['train_op'] - graph_output['train_op'] = None # remove from preds to prevent error - elif loss is not None: - train_op = self._optimize_loss_fn(params, loss) - - if params.get('train_log_metrics') and self._metric_fn: - metric_ops = self._metric_fn(graph_output=graph_output, labels=labels, weights=weights) - for metric_name in metric_ops: - tf.summary.scalar( - name="training_metric_" + metric_name, - tensor=metric_ops[metric_name][1]) # index 0 contains value_op, 1 contains update_op - - if mode == tf.estimator.ModeKeys.PREDICT and self._export_output_fn is not None: - # note that this is ignored by the predict method. - # Estimator only uses export_output_fn for export_model. 
- export_outputs = self._export_output_fn(graph_output)
- else:
- export_outputs = None
-
- if mode == tf.estimator.ModeKeys.EVAL and self._metric_fn:
- eval_metric_ops = self._metric_fn(graph_output=graph_output, labels=labels, weights=weights)
- else:
- eval_metric_ops = None
-
- # None and loss (scalar, not sliceable by TFMA) should be removed from the graph_output
- preds = {key: graph_output[key] for key in graph_output if (graph_output[key] is not None) and (key != 'loss')}
-
- init_feed_dict = twml.contrib.initializers.get_init_feed_dict()
- scaffold = tf.train.Scaffold(init_feed_dict=init_feed_dict)
-
- # Clear the init feed collection to avoid serializing the initializers.
- twml.contrib.initializers.clear_init_feed_collection()
-
- # save estimator for use by later methods and hooks (warning: often reset)
- self._current_estimator_spec = tf.estimator.EstimatorSpec(
- mode=mode,
- predictions=preds,
- export_outputs=export_outputs,
- loss=loss,
- train_op=train_op,
- eval_metric_ops=eval_metric_ops,
- scaffold=scaffold,
- )
-
- return self._current_estimator_spec
-
- def get_train_hooks(self):
- """Return SessionRunHooks used during training.
-
- By default, training uses a single hook, ``tf.train.StepCounterHook``, for monitoring step speed.
-
- If self._profiler_steps is set, a ``tf.train.ProfilerHook`` is also used
- to capture profiling information.
-
- """
- # Instead of having every_n_steps be a constant number,
- # change it dynamically based on batch size.
- # Ideally we should be using every_n_secs, but that seems buggy as of 1.7.
- # every_n_steps = (2048 * 100) / batch_size
- every_n_steps = ((2048 * 100) // self._params.train_batch_size)
- step_counter = tf.train.StepCounterHook(
- every_n_steps=every_n_steps, output_dir=self._save_dir
- )
- train_hooks = [step_counter]
-
- if self._profiler_steps is not None:
- if not self._params.get('distributed') or self._estimator.config.is_chief:
- profiler = tf.train.ProfilerHook(
- save_steps=self._profiler_steps,
- output_dir=self._save_dir
- )
- train_hooks.append(profiler)
-
- return train_hooks
-
- def is_task_type(self, name):
- """
- Helper function to check whether the current process is of the given worker type.
- Note: This can only be called *after* self._hogwild_setup() is called in __init__()
- """
- if os.environ.get('TF_CONFIG'):
- if self._estimator.config.task_type == name:
- return True
- else:
- return False
- return True
-
- def is_evaluator(self):
- """
- Helper function to let you know if the worker is the evaluator.
- Note: This can only be called *after* self._hogwild_setup() is called in __init__()
- """
- return self.is_task_type("evaluator")
-
- def is_chief(self):
- """
- Helper function to let you know if the worker is the chief.
- Note: This can only be called *after* self._hogwild_setup() is called in __init__()
- """
- return self.is_task_type("chief") or self.is_task_type("master")
-
- def is_ps(self):
- """
- Helper function to let you know if the task is a parameter server.
- """
- if os.environ.get('TF_CONFIG') and self._estimator.config.task_type == 'ps':
- return True
- return False
-
- def _exit_ps_after_training_complete(self):
- """
- Helper function to shut down the parameter server after the training job completes (either succeeds or fails).
- """ - if not self.is_ps(): - return - - # No need to exit ps if on the same machine - if os.environ.get('TWML_HOGWILD_PORTS'): - return - - if self._params.get('disable_auto_ps_shutdown', False): - logging.info("Skip shutting down parameter server after training complete [--disable_auto_ps_shutdown is set]") - return - - # checking job status is different on gke vs aurora - if self._is_on_gke(): - get_job_status = functools.partial( - k8s_status.get_training_job_status, - cluster=None, - namespace=os.environ['TWML_JOB_ROLE'], - environment=os.environ['TWML_JOB_ENV'], - job_name=os.environ['TWML_JOB_NAME'], - using_tsd=True) - else: - get_job_status = functools.partial( - get_distributed_training_job_path, - base_job_path=get_distributed_training_job_path() - ) - - def wait_complete_then_exit(): - retry_max = 60 - retry = 0 - while True: - try: - training_status = get_job_status() - if training_status == TrainingJobStatus.FINISHED: - logging.info("Distributed training job succeed, shutting down parameter server.") - os._exit(0) - elif training_status == TrainingJobStatus.FAILED: - logging.info("Distributed training job failed, shutting down parameter server.") - os._exit(0) - elif training_status == TrainingJobStatus.NOT_FOUND: - raise Exception("Distributed training job status not found.") - else: - poke_interval = random.randrange(60, 90) # prevent spike QPS to aurora endpoint - time.sleep(poke_interval) - retry = 0 - except Exception as e: - if retry >= retry_max: - raise e # only exception in this thread, won't fail parameter server thread - retry += 1 - poke_interval = random.randrange(60, 90) + retry * 10 - logging.warn("Error getting distributed training job status, will retry after %s seconds." % poke_interval) - time.sleep(poke_interval) - Thread(target=wait_complete_then_exit).start() - - def get_eval_hooks(self): # pylint: disable=no-self-use - """ Return SessionRunHooks used during evaluation.""" - return None - - def get_predict_hooks(self): - """ Return hooks used during prediction. - If profiler_steps is set in the constructor to the Trainer, - we pass a tf.Train.ProfilerHook to the estimator's predict function. - """ - hooks = [] - if self._profiler_steps is not None: - profiler = tf.train.ProfilerHook( - save_steps=self._profiler_steps, - output_dir=self._save_dir - ) - hooks.append(profiler) - return hooks - - def learn(self, train_input_fn=None, eval_input_fn=None, - train_max_steps=None, - train_steps=None, eval_steps=None, - train_hooks=None, eval_hooks=None, - early_stop_metric=None, early_stop_patience=-1, - early_stop_minimize=True, early_stop_tolerance=0, start_epoch=0, - exporters=None, export_output_fn=None, max_duration=None): - """ - Train and evaluate the estimator for ``train_max_steps`` steps. - Each epoch involves ``train_steps`` training steps followed - by ``eval_steps`` evaluation steps. Note that each step - is a ``session.run()``, that is, each batch is a step. - - Args: - train_max_steps: - maximum number of global steps of training to run. - Defaults to params.train_max_steps. - None-values cause learn() to terminate after *one* call to train() and evaluate(), - which is usually useful when using train_steps=-1 - Non-positive values trains indefinitely in a loop (use with caution), - which is usually useful when used with early stopping. - train_steps: - number of training steps per epoch. For example, 100 means each - training epoch will end after processing 100 batches. - Defaults to params.train_steps. 
- Non-positive values and None-values go through the entire training set each epoch. - eval_steps: - number of evaluation steps per epoch. - Defaults to params.eval_steps. - Non-positive values and None-values go through the entire evaluation set each epoch. - train_input_fn: - Function to iterate through training set. It is passed to estimator.train. - eval_input_fn: - Function to iterate through evaluation set. It is passed to estimator.evaluate. - train_hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - eval_hooks: - List of SessionRunHooks uses for evaluation. Defaults to self.get_eval_hooks() - start_epoch: - The epoch from which to start learn. If you want to do training and evaluation - for N epochs, you can call ``learn()`` in a loop as follows: - exporters: - List of exporters called at the end of each evaluation run. - Defaults to none. - export_output_fn: - The output format to use for exported models. - Only used if exporters is not None. - - .. code-block:: python - - for epoch in range(1,max_epoch): - trainer.learn(start_epoch=epoch) - - Early-stopping arguments: - early_stop_metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - early_stop_patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - Defaults to -1 (that is, no early-stopping). - early_stop_minimize: - Set this to True (the default) for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - early_stop_tolerance: - A non-negative tolerance for comparing early_stop_metric. - E.g. when maximizing the condition is current_metric > best_metric + tolerance. - Defaults to 0. - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - Returns: - The directory where the checkpoints were saved. - That is, save_dir. - You can point TensorBoard to this directory to get metrics, - or pass it to another Trainer via ``init_from_dir`` when doing - multi-phase training. 
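For example, early stopping on a held-out loss could be configured as follows
(illustrative values; assumes ``params.train_max_steps`` is set, since early
stopping requires a finite ``train_max_steps``):

.. code-block:: python

    save_dir = trainer.learn(
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        early_stop_metric='loss',
        early_stop_patience=5,
        early_stop_minimize=True,
        early_stop_tolerance=0.001)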
- """ - # pylint: disable=too-many-branches - - if not callable(train_input_fn): - raise ValueError("Expecting callable train_input_fn function") - if not callable(eval_input_fn): - raise ValueError("Expecting callable eval_input_fn function") - - if os.environ.get('TF_CONFIG'): - raise ValueError("trainer.learn() can not be used with distributed / hogwild setups") - - if exporters and export_output_fn: - self._export_output_fn = export_output_fn - - train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks - eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks - eval_hooks = [] if eval_hooks is None else eval_hooks - - if train_max_steps is None: - train_max_steps = self.params.get('train_max_steps') - - if train_steps is None: - train_steps = self.params.train_steps - if train_steps <= 0: - train_steps = None - - if eval_steps is None: - eval_steps = self.params.eval_steps - if eval_steps <= 0: - eval_steps = None - - if early_stop_patience > 0: - assert train_max_steps is not None, "Early stopping and max_steps=None are not compatible." - # prepare early stopping hook (which also handles logic here) - self._is_early_stopping = True - early_stop_hook = twml.hooks.EarlyStopHook( - metric=early_stop_metric, - checkpoint_dir=self._save_dir, - patience=early_stop_patience, - minimize=early_stop_minimize, - tolerance=early_stop_tolerance, - get_estimator_spec_fn=lambda: self.current_estimator_spec, - start_epoch=start_epoch) - # add early stop hook to eval hooks - eval_hooks.append(early_stop_hook) - - if max_duration is not None: - train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=True, - ) - train_hooks.append(train_early_stop_duration_hook) - - eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=True, - ) - eval_hooks.append(eval_early_stop_duration_hook) - - if not self._is_early_stopping: - if (train_max_steps is not None) and (train_max_steps <= 0): - if ((max_duration is not None) and (max_duration < 0)) or (max_duration is None): - logging.warn("train.max_steps is non-positive, and no early or duration stopping is configured. " - "Training job will loop forever.") - - if train_max_steps is not None and train_max_steps > 0: - # we can't pass max_steps AND steps to estimator.train. - # so we pass steps to estimator.train and max_steps to this hook instead... - stop_at_step_hook = twml.hooks.StopAtStepHook(last_step=train_max_steps) - train_hooks.append(stop_at_step_hook) - - with self.experiment_tracker.track_experiment(eval_hooks, - lambda: self.current_estimator_spec): - # alternate training and evaluation epochs - epoch = start_epoch - while True: - logging.info("Training epoch %d", epoch) - self._estimator.train(train_input_fn, steps=train_steps, hooks=train_hooks) - - logging.info("Evaluating epoch %d", epoch) - eval_result = self._estimator.evaluate( - eval_input_fn, steps=eval_steps, hooks=eval_hooks) - - if exporters: - checkpoint_path = self.estimator.latest_checkpoint() - for exporter in exporters: - export_path = os.path.join(self._save_dir, "export", exporter.name) - exporter.export( - estimator=self.estimator, export_path=export_path, - checkpoint_path=checkpoint_path, eval_result=eval_result, - is_the_final_export=False) - - # If train_max_step is none. Terminate after one loop. 
- if train_max_steps is None: - break - - # If stop_at_step_hook requested a stop, break - if train_max_steps > 0 and stop_at_step_hook.stop_requested: - break - - # early-stopping logic is handled internally by the hook - if early_stop_patience > 0 and early_stop_hook.should_stop: - # but we still need to break here - break - epoch += 1 - - self.write_state_to_disk(save_dir=self._save_dir, filename='_SUCCESS') - - return self._save_dir - - def get_train_spec(self, input_fn, max_steps=None, hooks=None): - """Get the TrainSpec used by ``tf.train.train_and_evaluate``.""" - if not callable(input_fn): - raise ValueError("Expecting callable train_input_fn") - - if max_steps is None: - max_steps = self.params.train_max_steps - - if max_steps is not None and max_steps <= 0: - max_steps = None - - hooks = self.get_train_hooks() if hooks is None else hooks - - return tf.estimator.TrainSpec(input_fn=input_fn, - max_steps=max_steps, - hooks=hooks) - - def get_eval_spec(self, input_fn, steps=None, delay=None, period=None, - hooks=None, exporters=None): - """Get the EvalSpec used by ``tf.train.train_and_evaluate``.""" - if not callable(input_fn): - raise ValueError("Expecting callable eval_input_fn") - - if steps is None: - steps = self.params.eval_steps - - if steps <= 0: - steps = None - - if delay is None: - delay = self.params.eval_delay - - if period is None: - period = self.params.eval_period - - hooks = self.get_eval_hooks() if hooks is None else hooks - - eval_name = self.params.get("eval_name", None) - - return tf.estimator.EvalSpec(input_fn=input_fn, - steps=steps, - name=eval_name, - start_delay_secs=delay, - throttle_secs=period, - hooks=hooks, - exporters=exporters) - - def train_and_evaluate(self, train_input_fn=None, eval_input_fn=None, - train_max_steps=None, eval_steps=None, - eval_delay=None, eval_period=None, - train_hooks=None, eval_hooks=None, - early_stop_metric=None, early_stop_patience=-1, - early_stop_minimize=True, early_stop_tolerance=0, exporters=None, - export_output_fn=None, max_duration=None): - """ - Train and evaluate the estimator for ``train_max_steps`` - using ``tf.estimator.train_and_evaluate``. - With a cluster configuration provided in the ``TF_CONFIG`` environment variable, this method - can be used for distributed training (multi-node or multi-process). - Unlike the ``learn`` method, training is continuous with ``train_max_steps``. - For distributed use case, evaluation happens periodically. - That is, after ``eval_delay`` seconds, an evaluation epoch of ``eval_step`` steps - occurs every ``eval_period`` seconds. Evaluation happens on the most recent checkpoint. - TF defaults to saving checkpoints every 10 mins. - For local use case, training occurs for train_max_steps epochs followed by a - single evaluation. For local use case we therefore recommend using learn() instead - as it provides early-stopping and multiple evaluations. - - ``train_and_evaluate`` will evaluate for ``eval_steps`` every ``eval_period`` seconds. - It will stop after ``train_steps`` is reached. - - You must ensure that all workers/servers are assigned the same `save_dir`. - - .. Note:: - - If the TF_CONFIG environment variable is set, this function assumes its running a distribute job. - - Args: - train_input_fn: - Function to iterate through training set. It is passed to estimator.train_and_evalute - eval_input_fn: - Function to iterate through evaluation set. It is passed to estimator.train_and_evalute. - train_max_steps: - maximum number of global steps of training to run. 
- Defaults to params.train_max_steps. - Non-positive values and None-values train indefinitely (use with caution). - eval_steps: - number of steps per evaluation. - Defaults to params.eval_steps. - Non-positive values and None-values go through - the entire evaluation set for each evaluation. - Note that the number of eval_steps should be high enough to minimize noise. - This is especially true for early-stopping. - eval_delay: - Start the first evaluation after eval_delay. Defaults to params.eval_delay or 2*60s. - eval_period: - Run an evaluation every eval_period seconds. Defaults to params.eval_period or 10*60s. - exporters: - List of exporters called at the end of each evaluation run. - Defaults to none. - export_output_fn: - The output format to use for exported models. - Only used if exporters is not None. - - Early-stopping arguments: - early_stop_metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - early_stop_patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - Defaults to -1 (that is, no early-stopping). - early_stop_minimize: - Set this to True (the default) for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - early_stop_tolerance: - A non-negative tolerance for comparing early_stop_metric. - E.g. when maximizing the condition is current_metric > best_metric + tolerance. - Defaults to 0. - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - Returns: - The directory where the checkpoints were saved. - """ - - logging.info("WARNING: Trainer.train_and_evaluate is an EXPERIMENTAL API.") - logging.info("Trainer.train_and_evaluate may change or be removed in future versions.") - - if not callable(train_input_fn): - raise ValueError("Expecting callable train_input_fn function") - if not callable(eval_input_fn): - raise ValueError("Expecting callable eval_input_fn function") - - self._exit_ps_after_training_complete() - - # Maybe export in eval processes. - if self.is_evaluator(): - if self.params.get("eval_name") is not None: - # Do not export if running special eval. - exporters = None - export_output_fn = None - elif exporters and export_output_fn: - self._export_output_fn = export_output_fn - else: - # Default option. 
- self._export_output_fn = None - - train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks - train_hooks = [] if train_hooks is None else train_hooks - - eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks - eval_hooks = [] if eval_hooks is None else eval_hooks - - if train_max_steps is None: - train_max_steps = self.params.get('train_max_steps') - - if eval_steps is None: - eval_steps = self.params.eval_steps - if eval_steps <= 0: - eval_steps = None - - if eval_delay is None: - eval_delay = self.params.eval_delay - if eval_period is None: - eval_period = self.params.eval_period - - if early_stop_patience > 0: - # when training hooks detect this file, they request a stop to training - early_stop_path = os.path.join(self._save_dir, 'earlystop_now.txt') - # prepare early stopping hook (which also handles logic here) - - self._is_early_stopping = True - - eval_early_stop_hook = twml.hooks.EarlyStopHook( - metric=early_stop_metric, - checkpoint_dir=self._save_dir, - patience=early_stop_patience, - minimize=early_stop_minimize, - tolerance=early_stop_tolerance, - get_estimator_spec_fn=lambda: self.current_estimator_spec, - file_path=early_stop_path, - exit_on_end=os.environ.get('TF_CONFIG') is not None) # only exit for distributed jobs - # add early stop hook to eval hooks - eval_hooks.append(eval_early_stop_hook) - - # prepare the commensurate training hook - train_early_stop_hook = twml.hooks.StopIfExistsHook(early_stop_path) - train_hooks.append(train_early_stop_hook) - - if max_duration is not None: - train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=self.is_chief() - ) - eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=os.environ.get('TF_CONFIG') is not None, - save_dir=self._save_dir, - overwrite=False - ) # only exit for distributed jobs - - train_hooks.append(train_early_stop_duration_hook) - eval_hooks.append(eval_early_stop_duration_hook) - - with self.experiment_tracker.track_experiment(eval_hooks, lambda: self.current_estimator_spec): - train_spec = self.get_train_spec(train_input_fn, train_max_steps, train_hooks) - eval_spec = self.get_eval_spec(eval_input_fn, eval_steps, - eval_delay, eval_period, - eval_hooks, exporters) - self._train_and_evaluate(train_spec, eval_spec) - - if self.is_chief(): - self.write_state_to_disk(save_dir=self._save_dir, filename='_SUCCESS') - - return self._save_dir - - def _train_and_evaluate(self, train_spec, eval_spec): - """ - Private method that calls - ``tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)``. - """ - try: - tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec) - except twml.errors.EarlyStopError: - # Ignore the exception if on evaluator. - if self.is_evaluator(): - pass - else: - raise - - def train(self, input_fn=None, steps=None, hooks=None): - """ - Train the estimator for `steps` training steps. - - Args: - steps: - number of steps for which to perform training. For example, 100 means each - evaluation will end after processing 100 batches. - Defaults to None. i.e. trains on the entire dataset a single time. - Non-positive values and None-values go through the entire training set each epoch. - input_fn: - Function to iterate through training set. It is passed to estimator.train. - hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). 
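For example (illustrative), a single additional pass over a fixed number of
batches can be run as:

.. code-block:: python

    trainer.train(input_fn=train_input_fn, steps=1000)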
- """ - if os.environ.get('TF_CONFIG') and "is_calibrating" not in self.params: - raise ValueError("trainer.train() can not be used with distributed / hogwild setups") - - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") - - if self._is_early_stopping: - raise ValueError("Can not call train() after learn() when using early stopping.") - - hooks = self.get_train_hooks() if hooks is None else hooks - self._estimator.train(input_fn, steps=steps, hooks=hooks) - return self - - def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): - """ - Evaluate the estimator for `steps` evaluation steps. - - Args: - steps: - number of steps for which to perform evaluation. For example, 100 means each - evaluation will end after processing 100 batches. - Defaults to None. i.e. evaluates on the entire dataset a single time. - Negative values and None-values go through the entire training set each epoch. - input_fn: - Function to iterate through evaluation set. It is passed to estimator.evaluate. - hooks: - List of SessionRunHooks used for evaluation. Defaults to None. - Note that, unlike learn(), hooks defaults to None instead of self.get_eval_hooks() - as the latter may implement early-stopping, which isn't necessarilty the desired - behavior when calling evaluate() on its own. - name: - Name of the evaluation if user needs to run multiple evaluations on different data sets. - Metrics for different evaluations are saved in separate folders, - and appear separately in tensorboard. - - Returns: - If `is_evaluator()`, returns a dict containing the evaluation metrics specified - in `metric_fn` keyed by name, as well as an entry `global_step` that contains - the value of the global step for which this evaluation was performed. - Otherwise (i.e. `is_evaluator() == False`), returns None. - """ - if not self.is_evaluator(): - return None - - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") - - hooks = self.get_eval_hooks() if hooks is None else hooks - hooks = [] if hooks is None else hooks - - # for consistency with train/learn - eval_steps = None if steps is not None and steps < 0 else steps - - with self.experiment_tracker.track_experiment(hooks, lambda: self.current_estimator_spec, name=name): - checkpoint = self.best_or_latest_checkpoint - computed_metrics = self._estimator.evaluate( - input_fn, - steps=eval_steps, - hooks=hooks, - checkpoint_path=checkpoint, - name=name - ) - - return computed_metrics - - def start_tensorboard(self, port=None): - """ - Start tensorboard process to visualize logs in save_dir. - """ - logging.info("Starting tensorboard.") - if self._tensorboard_handle: - logging.warn("Tensorboard already running. 
Nothing done.") - return - - if port is None: - if 'tensorboard_port' not in self.params.values(): - raise ValueError('You must specify a port for tensorboard to run on.') - elif self.params.tensorboard_port is None: - return - else: - port = self.params.tensorboard_port - - mldash_path = 'experiments' - if self.experiment_tracker.path: - mldash_path += '/%s' % encode_url(self.experiment_tracker.experiment_id) - tensorboard_args = ['--logdir=%s' % self._save_dir, '--port=%d' % port] - - try: - args = ['email_and_launch_tensorboard', mldash_path, '--'] + tensorboard_args - self._tensorboard_handle = subprocess.Popen(args) - except OSError: - try: - self._tensorboard_handle = subprocess.Popen(['tensorboard'] + tensorboard_args) - except OSError: - try: - # this will work with Twitter internal pants build when run locally - args = ['./pants', 'run', 'twml:tensorboard', '--'] + tensorboard_args - self._tensorboard_handle = subprocess.Popen(args) - except OSError: - logging.error("No tensorboard installed, won't able to visualize training in tensorboard.") - - def stop_tensorboard(self): - """ - Shutdown this Trainer's associated Tensorboard. - """ - if self._tensorboard_handle: - logging.info("Shutting down tensorboard.") - self._tensorboard_handle.kill() - else: - logging.warn("No known tensorboard process. Nothing done.") - - def calibrate(self, - calibrator, - steps=None, - input_fn=None, - save_calibrator=True, - hooks=None): - """ - Calibrate the calibrator for `steps` calibration steps using the estimator.train method. - The build_graph passed to the Trainer constructor should - call calibrator.accumulate using something like tf.py_func. - That way, when this method calls estimator.train the calibrator will - accumulate one epoch of samples. After which, this method calls calibrator.calibrate(). - It is up to the user to then call calibrator.save() to save the calibrated Layer - and other information to disk for multi-phase training. - - Args: - calibrator: - a twml.Calibrator instance or a dict of the form {name(str): twml.Calibrator}. - steps: - Maximum steps to accumulate examples for calibration. Optional. - If not specified, examples will be accumulated until all downsampled parts are processed. - input_fn: - Function to iterate through training set. It is passed to estimator.train. - hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - save_calibrator: - Boolean (default: True). If set to True it will save the calibrator layer. - """ - - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") - - # making everything a dict to avoid multiple ifs - if isinstance(calibrator, twml.contrib.calibrators.Calibrator): - calibrator = {"default": calibrator} - - # This is a dummy call to train, since we cannot predict without training - # from the Estimator API - self._estimator.train(input_fn, steps=1) - max_steps = steps if steps is not None else -1 - for name, clbrt in sorted(calibrator.items(), key=itemgetter(0)): - count = 0 - for out in self._estimator.predict(input_fn, hooks=hooks, yield_single_examples=False): - if max_steps > 0 and count > max_steps: - break - clbrt.accumulate_feature(out) - count += 1 - clbrt.calibrate() - - # this step is done to allow us to keep the current phases event file for - # visualization on Tensorboard. It removes all files that - # are not event files. 
This piece of code should be deprecated when - # we deprecate the MDL calibrator (CX-12329) - for fname in tf.io.gfile.listdir(self._save_dir): - if not fname.startswith("events"): - tf.io.gfile.remove(os.path.join(self._save_dir, fname)) - - if save_calibrator: - # If we only have one calibrator, the calibrator signature - # will be set to default - if len(calibrator) == 1: - calibrator = calibrator['default'] - calibrator.save( - self.params.save_dir, - name=calibrator.name, - verbose=True - ) - else: - for name, clbrt in calibrator.items(): - clbrt.save( - self.params.save_dir, - name=clbrt.name + str(name), - verbose=True - ) - - def predict(self, *args, **kwargs): - """ - Wrapper over the tensorflow `Estimator.predict - `_. - method. See that documentation for description of arguments accepted. - - If hooks is passed as an argument, the specified hooks are used. - Else when profiler_steps is specified in the constructor of the Trainer, a - tf.train.ProfilerHook is passed to the predict interface. - Otherwise, hooks is set to an empty list. - """ - if 'hooks' not in kwargs and len(args) < 3: - # If hooks is not specified as a keyword argument, nor as a positional argument - # add hooks as a keyword argument. - kwargs['hooks'] = self.get_predict_hooks() - - return self.estimator.predict(*args, **kwargs) - - def hub_export(self, - name, - serving_input_receiver_fn, - export_dir=None, - checkpoint_path=None, - export_task_type_overrider=None): - """ - Exports registered modules into a save directory. - - This method creates a directory under export_path with the save TF Hub. - One sub-directory (named export_name) per module registered via register_module_for_export. - - Arguments: - name: - unique name of the module to export. - serving_input_receiver_fn: - A function with no arguments that returns a ServingInputReceiver. - This is used with the estimator passed to export() to build the graph (in PREDICT mode) - that registers the modules for export. The model in that graph is never run, - so the actual data provided by this input fn does not matter. - export_dir: - A string containing a directory where to write the export directories. - Defaults to the save_dir. - checkpoint_path: - The checkpoint path to export. Defaults to the latest. - export_task_type_overrider: - Specifies the task type that will override the default task type used for export - (hogwild training defaults to evaluator, otherwise, defaults to chief) - """ - if export_task_type_overrider: - if not self.is_task_type(export_task_type_overrider): - logging.info( - f"Trainer.hub_export ignored due to process not being {export_task_type_overrider}") - return - else: - if self._using_hogwild: - if not self.is_evaluator(): - logging.info("Trainer.hub_export ignored due to the process not being evaluator.") - return - else: - if not self.is_chief(): - logging.info("Trainer.hub_export ignored due to the process not being chief.") - return - - if export_dir: - export_dir = sanitize_hdfs_path(export_dir) - - if checkpoint_path: - checkpoint_path = sanitize_hdfs_path(checkpoint_path) - else: - checkpoint_path = self.best_or_latest_checkpoint - - export_dir = export_dir if export_dir is not None else self._save_dir - exporter = hub.LatestModuleExporter(name, serving_input_receiver_fn) - # The path_exporter by default contains a timestamp directory in its path. 
- path_exporter = exporter.export(estimator=self.estimator, - export_path=export_dir, - checkpoint_path=checkpoint_path) - - # LatestModuleExporter.export() returns a binary string on Cloud ML Engine - # but tf.io.gfile.listdir() does not; this is an issue when joining paths - if isinstance(path_exporter, bytes): - path_exporter = path_exporter.decode() - - # Copying the saved hub module to export_dir so we don't need to specify - # the timestamp when loading the module. - # This is a workaround due to the current implementation of hub.LatestModuleExporter. - # This works for multiple hub modules. - hub_exported_modules = tf.io.gfile.listdir(path_exporter) - - backup_dir = os.path.join(export_dir, "backups", - datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) - - for folder in hub_exported_modules: - hub_module_oldpath = os.path.join(path_exporter, folder) - hub_module_newpath = os.path.join(export_dir, folder) - - # If the destination already exists, move to backup - if tf.io.gfile.exists(hub_module_newpath): - # Ensure backup_dir exists - tf.io.gfile.makedirs(backup_dir) - hub_module_backup = os.path.join(backup_dir, folder) - tf.io.gfile.rename(hub_module_newpath, hub_module_backup) - - tf.io.gfile.rename(hub_module_oldpath, hub_module_newpath) - - # Since the timestamped folder exists but is empty, we can delete it. - tf.io.gfile.rmtree(path_exporter) - - def _is_on_gke(self) -> bool: - """Returns True if running on gke.""" - cluster = os.environ.get('TWML_JOB_CLUSTER') - if not cluster or cluster in {'smf1', 'atla'}: - return False - return True - - def _maybe_del_tsd_exit(self, state_files) -> None: - """Handle potential early exit and TwitterSetDeployment deletion. - - If: - - distributed training - - running GKE - - training is finished (all state_files exists) - we will exit early and not restart work - - If --distributed_training_cleanup = True then we will also handle - cleaning up the TwitterSetDeployments. - - Args: - state_files: A python list indicate state files to determine the finish - state of the job. - """ - # job type that is responsible for experiment tracking will remain alive - # until it marks the experiment as finished. - if self.experiment_tracker._env_eligible_for_recording_experiment: - exp_status = self.experiment_tracker.get_run_status() - if exp_status and exp_status not in {'Success', 'Failed'}: - logging.info( - f"Not exiting early because experiment is still {exp_status}." 
- ) - return - - # do not bother if we are on prem - if not self._is_on_gke(): - logging.info("No need to exit early because running on prem.") - return - - states = [ - twml.util.file_exist_in_dir(self._save_dir, state_file) for state_file in state_files] - do_not_restart = (self._params.get('distributed') and all(states)) - if not do_not_restart: - return - - logging.info( - f"Exiting early because a _SUCCESS file already exists in {self._save_dir}") - if self._params.get('distributed_training_cleanup'): - resource_name = '-'.join([ - os.environ['TWML_JOB_NAME'], - os.environ['TWML_DISTRIBUTED_JOB_TYPE'], - os.environ['TWML_JOB_ENV'], - ]) - logging.info(f"Deleting TwitterSetDeployment {resource_name}") - # each job type will manage its own deletion so that deletion happens - # in the trainer init call for every job type - # otherwise we may kill another job type during an important - # process like experiment tracking management (handled by the evaluator - kubectl_delete_by_name( - zone=None, - namespace=os.environ['TWML_JOB_ROLE'], - resource_type=Resource.TWITTERSETDEPLOYMENTS.value, - resource_name=resource_name, - wait=False, - ) - sys.exit(0) - - def write_state_to_disk(self, save_dir, filename='_SUCCESS') -> None: - """Write state file to disk to indicate the state of training process. This is usually used - to mark the state of training progress and determine the start when job restarts/resumes. - Args: - save_dir: A str of local/gcs/hdfs dir to write the state file. - file_name: A str indicate the state file. Default to `_SUCCESS`. - """ - file_path = os.path.join(save_dir, filename) - if tf.io.gfile.exists(file_path): - tf.logging.warn(f'{file_path} already exist.') - return - - with tf.io.gfile.GFile(file_path, 'w') as f: - f.write('') \ No newline at end of file diff --git a/twml/twml/util.py b/twml/twml/util.py deleted file mode 100644 index cd7679a6f..000000000 --- a/twml/twml/util.py +++ /dev/null @@ -1,942 +0,0 @@ -""" -This module contains utility functions for twml. -""" - -import argparse -from datetime import datetime -import itertools -import json -import logging as _logging -import os -import re - -from twitter.ml.common.resources import AuroraPath -from twitter.deepbird.hparam import HParams -from twitter.deepbird.io.util import ( - _get_feature_id, # noqa: F401 - feature_id, # noqa: F401 - preprocess_feature_regex, # noqa: F401 - preprocess_path, # noqa: F401 - sanitize_hdfs_path, # noqa: F401 - is_string, # noqa: F401 - list_files, # noqa: F401 - match_files, # noqa: F401 -) -from twitter.deepbird.io.legacy.util import ( - batch_apply, # noqa: F401 - boolean_mask, # noqa: F401 - fixed_length_tensor, # noqa: F401 -) -from twitter.deepbird.sparse.util import ( - convert_to_sparse, # noqa: F401 - limit_bits, # noqa: F401 -) - -from dateutil import rrule -from joblib import delayed, Parallel -from six import string_types - -from absl import logging -from libtwml import CLIB, OPLIB # noqa: F401 -import tensorflow.compat.v1 as tf -from tensorflow.python.platform import tf_logging -import twml -from twml.feature_config import FeatureConfigBuilder - - -# big_prime is less than 2**32 -# This just needs to be co-prime with powers of 2 -# any large prime is sufficient, but it's not necessary. 
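# For example (illustrative), multiplicative_hash(7) defined below evaluates to
# 7 * 2479700537 = 17357903759; a caller would then typically reduce the result
# modulo a power of two (or mask the low bits) to obtain a bucket index.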
-HASHING_PRIME = 2479700537 - - -def multiplicative_hash(input, hash_constant=HASHING_PRIME): - return input * hash_constant - - -def _return_tensors_from_checkpoint_folder(init_dir, model_name=None): - """Returns the list of tensors from a checkpoint folder. - - Args: - init_dir: Name of the checkpoint directory. - model_name: the model which we will use to obtain the checkpoint - (e.g. model.ckpt-50000); if set to None it will default to the - latest model saved in the checkpoint file. - - """ - if model_name is None: - # gets the most recently generated model.ckpt file - model_path = tf.train.latest_checkpoint(init_dir) - if model_path is None: - raise ValueError("Could not find a valid model checkpoint inside the directory") - else: - model_path = os.path.join(init_dir, model_name) - reader = tf.train.NewCheckpointReader(model_path) - try: - return (reader.debug_string().decode("utf-8")) - except OSError: - logging.error('Could not decode the string') - - -def get_scope_dict(init_dir, incoming_scope_name, current_scope_name, model_name=None): - """Returns tensors map from a checkpoint file. - - Args: - init_dir: - Name of the checkpoint directory. - incoming_scope_name: - scope name of the previous phase - current_scope_name: - scope name of the current phase - model_name: - the model which we will use to obtain the checkpoint - (e.g. model.ckpt-50000); if set to None it will default - to the latest model saved in the checkpoint file. - Returns: - init_map: - the map to pass when initializing variables from the checkpoint - """ - init_map = {} - reader_dump = _return_tensors_from_checkpoint_folder(init_dir=init_dir, - model_name=model_name).splitlines() - for member in reader_dump: - # remove global_step since it is not necessary - if 'global_step' not in member: - saved_variables = str(member.split(" ")[0]) - saved_scope = saved_variables.rsplit('/', 1)[0] + "/" - new_scope = saved_scope.replace(incoming_scope_name, current_scope_name, 1) - # create key in init_map - if saved_scope not in init_map.keys(): # pylint: disable=dict-keys-not-iterating - init_map[saved_scope] = new_scope - return init_map - - -def get_init_map( - init_from_dir, - exclude_var_names=None, - exclude_name_scopes=None, - name_scope_to_remove=None, - name_scope_to_prepend=None): - """ - Builds a map for initializing from a checkpoint (see tf.train.init_from_checkpoint). - - It assumes that the latter part of the variable names are consistent between the checkpoint and - the new model, but their name_scopes may be different. If the checkpoint model has variable names - of the form old/scope/var/foo, and the corresponding variable names for the new model should be - my/new/scope/var/foo, then you should set name_scope_to_remove = 'old/' and - name_scope_to_prepend = 'my/new/'. - - This function can be used to - - 1. Generate an ``init_map`` that can be passed to the ``Trainer`` init, or - 2. Generate an ``init_map`` directly inside ``build_graph_fn`` and pass it to - ``tf.train.init_from_checkpoint`` there, in which case you do not need to specify the - ``init_map`` argument to the trainer. - - Parameters - ---------- - init_from_dir: Directory containing checkpoint - exclude_var_names: list[str] - List of variables in the checkpoint that should be excluded from the map. - exclude_name_scopes: list[str] - List of name_scopes in the checkpoint model that should be excluded from the map.
- name_scope_to_remove: str - portion of name_scope for checkpoint variables that should not be included in variable names - for new model. - name_scope_to_prepend: str - name_scope to prepend to variable names in checkpoint to give variable names for new model. - - Returns - ------- - dict - keys are variable names in the checkpoint and values are variable names in the new model, - into which the checkpoint parameters should be loaded. - """ - vars_to_restore = get_checkpoint_variable_names( - init_from_dir, - exclude_var_names=exclude_var_names, - exclude_scopes=exclude_name_scopes, - ) - - if name_scope_to_prepend is not None: - if not name_scope_to_prepend.endswith('/'): - name_scope_to_prepend += '/' - - if name_scope_to_remove is not None: - if not name_scope_to_remove.endswith('/'): - name_scope_to_remove += '/' - - init_map = {} - - for var_name in vars_to_restore: - var_name_checkpoint = var_name - - if name_scope_to_remove is not None: - var_name = var_name.replace(name_scope_to_remove, '') - - var_name_new_model = var_name - - if name_scope_to_prepend is not None: - var_name_new_model = name_scope_to_prepend + var_name_new_model - - init_map[var_name_checkpoint] = var_name_new_model - - return init_map - - -def get_checkpoint_variable_names(model_dir, exclude_var_names=None, exclude_scopes=None): - """ - Gets a list of variable names from the latest checkpoint in model_dir. - Removes variables with scope defined by exclude_scopes, and/or with names defined by - exclude_var_names. - - Args: - model_dir (str): Directory containing checkpoint file for the pre-trained model - exclude_var_names (list): Optional variable names to exclude (can include full/partial scope) - exclude_scopes (list): Optional scopes to exclude - - Returns: - list: variable names - """ - checkpoint_path = tf.train.latest_checkpoint(model_dir) - variables_and_shapes = tf.train.list_variables(checkpoint_path) - - def _keep(name): - if exclude_scopes and any(name.startswith(exc_scope) for exc_scope in exclude_scopes): - return False - if exclude_var_names and any(name.endswith(exc_var) for exc_var in exclude_var_names): - return False - return True - - names = [x[0] for x in variables_and_shapes if _keep(x[0])] - - return names - - -def to_snake_case(name): - """ - Changes name to snake case - """ - intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name) - insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() - # If the class is private the name starts with "_" which is not secure - # for creating scopes. We prefix the name with "private" in this case. - if insecure[0] != '_': - return insecure - return 'private' + insecure - - -def copy_phase_inputs(init_dir, dest_dir): - """Automatically copies the .json.tf from the init_dir to save_dir - so we can load multiple parameters at the same time. - - Args: - init_dir: - Name of the checkpoint directory. - dest_dir: - Name of the output directory. 
- """ - if init_dir is not None: - # we are using tf.io.gfile so we can use it with both local and hdfs paths - for files in tf.io.gfile.listdir(init_dir): - if files.endswith(".json.tf"): - src_file = os.path.join(init_dir, files) - dest_file = os.path.join(dest_dir, files) - if not tf.io.gfile.exists(dest_dir): - # creates the folder - try: - tf.io.gfile.makedirs(dest_dir) - # to prevent racing condition - except OSError: - if not tf.io.gfile.isdir(dest_dir): - raise - # dest_file may be old if it exists and - # dest_file gets copied several times in distributed training - tf.io.gfile.copy(src_file, dest_file, overwrite=True) - - -def rehash_sparse_features_nbits(sp_a, nbits, hash_fn=multiplicative_hash): - """ - Rehash the feature ids of the sparse tensor, - and limit the output to n bits. - - This is useful for making the distribution of - feature_ids more uniform, which may improve performance - in some situations. - - This would typically be used on the output of - PercentileDiscretizer, since it assigns many - bins to low-valued output feature ids. - - Input feature IDs should take values less than 2**32, - and nbits should be less than 32 - - Args: - sp_a: - a tf.SparseTensor object - nbits: - integer number of bits to mask output feature_ids - hash_fn: - Function that takes integer values and returns hashes of these values. - The output does not need to be masked to the desired number of bits, - as this masking will be taken care of. Default value = multiplicative_hash. - - Returns: - a new tf.SparseTensor - """ - - feature_ids = sp_a.indices[:, 1] - feature_ids = hash_fn(feature_ids) - - sample_ids = sp_a.indices[:, 0] - values = sp_a.values - dense_shape = sp_a.dense_shape - - indices = tf.stack([sample_ids, feature_ids], axis=1) - - sp_a = tf.SparseTensor(indices, values, dense_shape) - - # note - we need 2**nbits >= batch size - # otherwise, sample_ids will be squashed by the mask. - return limit_sparse_tensor_size(sp_a, nbits) - - -def convert_to_hparams(opt): - """ - Converts argparse.Namespace object to twitter.deepbird.hparam.hparam.HParams. - Note that tensorflow.contrib.training.HParams is gone in TF 2.x, and we forward ported - tensorflow.contrib.training.HParams to twitter.deepbird.hparam.hapram.HParams. - - NOTE: If you are using estimators, please don't call this method and directly pass python dict - to TensorFlow estimator. Starting TensorFlow 2.0, Estimator will only accept dicts. - """ - - # Convert to dict so we can iterate through it cleanly. - if isinstance(opt, argparse.Namespace): - params_dict = vars(opt) - elif isinstance(opt, dict): - params_dict = opt - elif isinstance(opt, HParams): - logging.warning('If you are using Estimator, please pass python dict directly to Estimator.') - params_dict = opt.values() - else: - raise ValueError("Input can not be of type %s. " - "It can be one of { argparse.Namespace, dict, " - "twitter.deepbird.hparam.HParams}." - % type(opt)) - - params = HParams() - # Hack to convert all parameters from hdfs:/// format to hdfs://default/ - # Note: .items() makes a copy in python 2.7, but that is fine since the performance isn't critical. - for key, val in params_dict.items(): - val = params_dict[key] - # Fix the path if the value is a string - if isinstance(val, str): - params.add_hparam(key, sanitize_hdfs_path(val)) - else: - params.add_hparam(key, val) - - return params - - -def dynamic_partition(features, partitions, num_partitions=2, name=None): - """ - Partitions each of the tensor in features using the provided mask. 
- - Args: - features: - A single tensor or an iterable of tensors (list, tuple, dict) - partitions: - A bool or integer tensor representing the partitions. - - Returns partitioned outputs as a list. Each element of the list is the same type as features. - - This uses tf.dynamic_partition but adds the following niceties: - - features can be a list or dict of different tensor types. - - only a partition tensor is used to partition all the feature tensors recursively. - - the partition tensor is automatically converted into an integer tensor. - - defaults to num_partitions == 2 - """ - - if not isinstance(features, (dict, list, tuple, tf.Tensor)): - raise AssertionError("features container must be a dict, list, or tuple, tf.Tensor") - - if isinstance(partitions, tf.Tensor): - partitions = tf.cast(partitions, tf.int32) - - if isinstance(features, tf.Tensor): - return tf.dynamic_partition(features, partitions, num_partitions, name) - - outputs = [] - for _ in range(num_partitions): - if isinstance(features, (tuple, list)): - # Create an empty list of lists first, will be converted to right type afterwards. - outputs.append([None for _ in range(len(features))]) - else: - outputs.append(dict()) - - iterable = features.items() if isinstance(features, dict) else enumerate(features) - - # Handling partitions of nested classes handled here: - # Recursively call dynamic_partition for containers - for key, feature in iterable: - name_key = None if name is None else name + "_" + str(key) - if isinstance(partitions, tf.Tensor): - results = tf.dynamic_partition(feature, partitions, num_partitions, name_key) - else: - results = tf.dynamic_partition(feature, partitions[key], num_partitions[key], name_key) - # Append the result to the proper output container - for idx, result in enumerate(results): - outputs[idx][key] = result - - # if input is tuple, convert list of lists back to list of tuples - if isinstance(features, tuple): - outputs = [type(features)(output) for output in outputs] - - return outputs - - -def write_file(filename, contents, encode=False): - ''' - Optionally encodes contents and writes contents to a file. - - Arguments: - filename: - path to file where the contents will be saved. - Accepts HDFS and local paths. - contents: - contents to save to the file. - Must be a string when encode is False. - encode: - False | 'json'. When encode='json', contents is encoded - with json.dumps. - ''' - if encode == 'json': - contents = json.dumps(contents) - elif not is_string(contents): - raise ValueError("Expecting string for encode=False") - - graph = tf.Graph() - with graph.as_default(): - write = tf.write_file(filename, contents) - - with tf.Session(graph=graph) as sess: - sess.run(write) - - -def read_file(filename, decode=False): - ''' - Reads contents from a file and optionally decodes it. - - Arguments: - filename: - path to file where the contents will be loaded from. - Accepts HDFS and local paths. - decode: - False | 'json'. When decode='json', contents is decoded - with json.loads. When False, contents is returned as is. 
- - Returns: - contents - ''' - graph = tf.Graph() - with graph.as_default(): - read = tf.read_file(filename) - - with tf.Session(graph=graph) as sess: - contents = (sess.run(read)) - # particular version of TF and/or Python may or may not perform decoding step from utf-8 to str - if not isinstance(contents, str): - contents = contents.decode() - - if decode == 'json': - contents = json.loads(contents) - - return contents - -def setup_tf_logging_formatter(): - formatter = _logging.Formatter( - '%(asctime)s [%(levelname)s] %(name)s: %(message)s', - None) - # Setting up absl logging verbosity - logging.set_verbosity('info') - logging.set_stderrthreshold('info') - logging.get_absl_handler().setFormatter(formatter) - tf.logging.set_verbosity(tf.logging.INFO) - # Set tensorflow logging handler format - if len(tf_logging.get_logger().handlers) > 0: - tf_logging.get_logger().handlers[0].setFormatter(formatter) - - -def set_tensorflow_log_level(log_level): - """ - Sets tensorflow's default logging level. - - 0. all logs are shown. - 1. filter out INFO logs. - 2. filter out WARNINGs and INFOs. - 3. filter out ERRORs, WARNINGs, and INFOs. - - Note that tf.Print output are INFO logs, so setting log_level above 0 would hide - output from tf.Print. - """ - assert isinstance(log_level, int) and log_level >= 0 and log_level <= 3 - os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(log_level) - - -def weighted_average(values, weights): - """ - Compute a weighted average using the given values and weights. - E.g. this is usually used to compute a weighted loss given sample weights. - """ - return tf.reduce_sum(tf.multiply(values, weights)) / tf.reduce_sum(weights) - - -def backup_checkpoint(checkpoint_path_prefix, - backup_path='backup', - empty_backup=True): - """ - Creates a backup copy of a checkpoint in backup_dir. - This function is used by the Trainer for early-stopping. - - Arguments: - checkpoint_path_prefix: - Prefix of the path to the checkpoint files. - backup_path: - path to a directory where checkpoint files will be backed up. - empty_backup: - When True (the default), the current contents of the backup directory - are removed before the backup is performed. - - Returns: - The number of backed up files. - """ - checkpoint_file_prefix = os.path.basename(checkpoint_path_prefix) - - if tf.io.gfile.exists(backup_path) and empty_backup: - tf.io.gfile.rmtree(backup_path) - - tf.io.gfile.mkdir(backup_path) - - n_backup = 0 - # copy all checkpoint files to backup directory (TODO use gfile.glob instead) - try: - checkpoint_files = tf.io.gfile.glob(checkpoint_path_prefix + "*") - if len(checkpoint_files) == 0: - raise twml.errors.CheckpointNotFoundError("%s not found" % checkpoint_path_prefix) - for filename in checkpoint_files: - n_backup += 1 - tf.io.gfile.copy( - src=filename, - dst=os.path.join(backup_path, os.path.basename(filename)) - ) - except tf.errors.OpError as ex: - raise twml.errors.CheckpointNotFoundError( - f"{str(ex)}\n {checkpoint_path_prefix} not found." - ) - - # tf.train.latest_checkpoint needs the 'checkpoint' file. - with tf.io.gfile.GFile(os.path.join(backup_path, 'checkpoint'), 'w') as f: - f.write('model_checkpoint_path: "%s"\n' % checkpoint_file_prefix) - - return n_backup - - -def set_only_checkpoint(source_path, dest_path, remove_source=True): - """ - Removes the checkpoint and model.ckpt* files from dest_path. - Moves the latest checkpoint from source_path to dest_path. - - Arguments: - source_path: - path to directory containing the latest checkpoint. 
- Should contain a valid checkpoint file and model.ckpt files. - For early-stopping, this should be the save_dir/best_checkpoint dir. - dest_path: - path to directory where the latest checkpoint files will be moved. - All its checkpoint and model.ckpt* files will be removed. - For early-stopping, this should be the save_dir. - remove_source: - When True (the default), deletes the source directory. - Note that even when False, its checkpoint files are moved to - dest_path anyway. - This deletes the source directory (and any remaining contents). - """ - # make it so that source_path checkpoint is the only checkpoint - source_path_prefix = tf.train.latest_checkpoint(source_path) - if source_path_prefix is not None: - # remove intermediate checkpoints - for filename in tf.io.gfile.listdir(dest_path): - if filename.startswith("model.ckpt"): - tf.io.gfile.Remove(os.path.join(dest_path, filename)) - # move contents of source_path to dest_path - for filename in tf.io.gfile.listdir(source_path): - tf.io.gfile.rename( - oldname=os.path.join(source_path, filename), - newname=os.path.join(dest_path, filename), - overwrite=True) # overwrite "checkpoint" file - # delete the source_path dir - if remove_source: - tf.io.gfile.rmtree(source_path) - - -def list_files_by_datetime( - base_path, - start_datetime, - end_datetime=None, - datetime_prefix_format='%Y/%m/%d/%H', - extension='lzo', - parallelism=1, - hour_resolution=1, - sort=False -): - """List files matching `base_path/dt_prefix_format/*.extension` for the requested datetime range. - - Args: - base_path: - The base path. If `None`, returns `None`. - start_datetime: - A `datetime.datetime` or string representing the start of the range (inclusive). - If `None`, it returns `list_files(base_path, extension, sort)`. - end_datetime: - A `datetime.datetime` or string representing the end of the range (inclusive). - If `None`, assumed to be the same as start_datetime. - datetime_prefix_format: - Format compatible with `datetime.datetime.strftime` - (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior). - extension: - The extension of the files composing the dataset (e.g. 'lzo'). - parallelism: - The number of threads used to process list patterns (this is mostly useful - when dealing with filesystems such as HDFS in which listing files is a potentially expensive - operation). - hour_resolution: - The separation between consecutive hours. The default value is 1. - sort: - bool, whether to return a sorted list of files. Default False. - - Returns: - A list with all the matching files. - - Raises: - errors.OpError: If there are filesystem / directory listing errors. - """ - if hour_resolution is None: - hour_resolution = 1 - - if base_path is None: - return None - - if start_datetime is None: - return list_files(base_path, extension, sort) - - # Do this in case people want to use a single day for training. 
- if end_datetime is None: - end_datetime = start_datetime - - assert parallelism > 0 - assert start_datetime <= end_datetime - - if isinstance(start_datetime, str): - start_datetime = datetime.strptime(start_datetime, datetime_prefix_format) - - if isinstance(end_datetime, str): - end_datetime = datetime.strptime(end_datetime, datetime_prefix_format) - - assert isinstance(start_datetime, datetime) - assert isinstance(end_datetime, datetime) - - base_path = preprocess_path(base_path) - - def _handle_missing_globs(pattern): - try: - return tf.io.gfile.glob(pattern) - except tf.errors.NotFoundError as e: - tf.logging.warning(e.message) - return [] - - # a set is used because there might be some repeated globs depending on dt_prefix_format - globs = { - os.path.join(base_path, dt.strftime(datetime_prefix_format), '*.%s' % extension) - for dt in rrule.rrule( - freq=rrule.HOURLY, interval=hour_resolution, dtstart=start_datetime, until=end_datetime) - } - nested_files = Parallel(n_jobs=parallelism, backend='threading')( - delayed(_handle_missing_globs)(p) for p in globs - ) - flattened_files = list(itertools.chain.from_iterable(nested_files)) - - if not flattened_files: - error_msg = "Files list is empty: base_path={base_path}, start_datetime={start_datetime}, end_datetime={end_datetime}".format( - base_path=base_path, start_datetime=start_datetime, end_datetime=end_datetime - ) - raise OSError(error_msg) - - if sort: - flattened_files = sorted(flattened_files) - - return flattened_files - - -def limit_sparse_tensor_size(sparse_tf, input_size_bits, mask_indices=True): - """ - Returns a ``tf.SparseTensor`` which is the input SparseTensor - limited to the specified input_size_bits - - Args: - sparse_tf: - twml.SparseTensor or tf.SparseTensor - input_size_bits: - The number of bits allocated to the input size. - Input size will be power(2,input_size_bits). - Note that twml.limit_bits truncates any feature keys that - exceed the input size. - mask_indices: - If mask indices is False; only the shape is changed. Defaults to True. - """ - if isinstance(sparse_tf, twml.SparseTensor): - sparse_tf = sparse_tf.to_tf() - if not isinstance(sparse_tf, tf.SparseTensor): - raise TypeError('Input argument `sparse_tf` should either be of type' - 'twml.SparseTensor of tf.SparseTensor. Found type: {}'. - format(type(sparse_tf))) - if mask_indices: - indices = twml.limit_bits(sparse_tf.indices, input_size_bits) - else: - indices = sparse_tf.indices - dense_shape = tf.stack([sparse_tf.dense_shape[0], 1 << input_size_bits]) - return tf.SparseTensor(indices=indices, values=sparse_tf.values, - dense_shape=dense_shape) - - -def create_module_spec(mlp_fn, mode, params, drop_collections=None): - """ - Creates a standard tags_and_args which should be passed to the create_module_spec - spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args). - - Args: - module_fn: - a function to build a graph for the Module. - mode: - mode in which the Estimator is run - params: - parameters passed to the Estimator - """ - import tensorflow_hub as hub # noqa: F402 - tags_and_args = [(set(), {"params": params, "mode": mode}), # serving graph - ({"train"}, {"params": params, "mode": mode}) # training graph - ] - spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args, drop_collections=drop_collections) - return spec - - -def change_name_scope_from_dir(init_scope_name, final_scope_name, save_dir): - """ - Changes the name of the saved scope to the desired name and saves it - to the same save_dir. 
- - Args: - init_scope_name: - initial scope name - final_scope_name: - desired (final) scope name - save_dir: - directory which the scopes are saved - - In the follwing section we: - - Read all the variables from the latest checkpoint. - - Make a copy of the variables with new name scope. - - Store both sets of variables into the latest checkpoint. - This essentially doubles up the size of the checkpoint. - But when a job is restarted after this part is done, the checkpoint size doubles again. - To avoid doing this, we create a copy in backup if a backup isn't found. - This allows us always read (from backup) and write same sized checkpoint files. - """ - - # Create a backup_checkpoints dir - backup_dir = os.path.join(save_dir, "change_name_scope_backups") - tf.io.gfile.makedirs(backup_dir) - - latest_checkpoint = tf.train.latest_checkpoint(save_dir) - - if latest_checkpoint is None: - raise OSError("No checkpoints found in save_dir: %s" % save_dir) - - latest_backup_checkpoint = tf.train.latest_checkpoint(backup_dir) - - if (latest_backup_checkpoint is None or - (os.path.basename(latest_checkpoint) != - os.path.basename(latest_backup_checkpoint))): - backup_checkpoint(latest_checkpoint, backup_dir, empty_backup=False) - - variables = tf.train.list_variables(backup_dir) - with tf.Graph().as_default(), tf.Session().as_default() as sess: - new_variables = [] - for name, _ in variables: - var = tf.train.load_variable(backup_dir, name) - # Append both the rename and the original variable - new_variables.append( - tf.Variable(var, name=name.replace(init_scope_name, final_scope_name))) - new_variables.append(tf.Variable(var, name=name)) - # Save this to the checkpoint in the save_dir - saver = tf.train.Saver(new_variables) - sess.run(tf.global_variables_initializer()) - saver.save(sess, latest_checkpoint) # pylint: disable=no-member - - -def hub_import(input, module, module_name, trainable=False): - """ - Loads exported hub module. - - Args: - input: - input to hub module - module: - module path - module_name: - signature of the exported hub module - """ - import tensorflow_hub as hub # noqa: F402 - hub_module = hub.Module(module, trainable=trainable) - output = hub_module(input, signature=module_name) - return output - - -def _extract_hash_space_bits(feature_config): - """ - Extract Sparse Shapes for contrib.FeatureConfig. - Arguments: - feature_config: - Feature Configuration of the type contrib.FeatureConfig - Returns: - Dictionary of tensor names and hash space bits. - """ - if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig): - fc_type = type(feature_config) - raise TypeError(f"Feature config must be of type contrib.FeatureConfig: {fc_type}") - sparse_shapes_dict = {} - for config in feature_config.sparse_extraction_configs: - sparse_shapes_dict[config.output_name] = config.hash_space_bits - return sparse_shapes_dict - - -def fix_shape_sparse(features, feature_config): - """ - Modifies the shape of features which are extracted using the hashing trick. - Features itself is changed by this function. 
- Arguments: - features: - Feature dictionary extracted by the feature config - feature_config: - Feature Configuration of the type contrib.FeatureConfig - """ - if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig): - raise TypeError(f"Feature config must be of type contrib.FeatureConfig, currently of {type(feature_config)}") - sparse_shape = _extract_hash_space_bits(feature_config) - if not isinstance(features, dict): - raise TypeError(f"features must be of dictionary type, it is of {type(features)} type") - for key in set(features) & set(sparse_shape): - features[key] = limit_sparse_tensor_size(features[key], sparse_shape[key], mask_indices=False) - - -def touch_file_in_dir(directory, filename): - """ - Creates a file named filename in directory. - - Arguments: - filename: (str) - directory: (str) - """ - file_path = os.path.join(directory, filename) - with tf.io.gfile.GFile(file_path, "w") as f: - f.write("") - - -def file_exist_in_dir(directory: str, filename: str) -> bool: - file_path = os.path.join(directory, filename) - return tf.io.gfile.exists(file_path) - - -def copy_to_local(remote, local, filename, overwrite=False): - """Function to copy a file from a remote directory to a local directory.""" - assert "hdfs://" not in local - tf.io.gfile.makedirs(local) - return tf.io.gfile.copy( - os.path.join(remote, filename), - os.path.join(local, filename), - overwrite=overwrite, - ) - - -def copy_recursive(src, dst, overwrite=False): - """ - Function to copy a directory recursively. - - Arguments: - src: Source directory. - dst: Destination directory. - overwrite: Specifies if files are to be overwritten if they exist. - """ - - src = src.rstrip("/") - dst = dst.rstrip("/") - - for dirname, subdirs, files in tf.io.gfile.walk(src): - dst_dirname = dirname.replace(src, dst) - tf.io.gfile.makedirs(dst_dirname) - - for f in files: - src_f = os.path.join(dirname, f) - dst_f = os.path.join(dst_dirname, f) - - tf.logging.info(f"Copying {src_f} to {dst_f}") - tf.io.gfile.copy(src_f, dst_f, overwrite=overwrite) - - -def delete_file_or_dir(path): - """ - Delete the file or directory given by `path`. - Arguments: - path: - string indicating path of file or directory to remove - """ - if tf.io.gfile.isdir(path): - tf.io.gfile.rmtree(path) - else: - tf.io.gfile.remove(path) - - -def get_distributed_training_job_path(): - """ - Function to get the distributed training job path. - Note: distributed training has three jobs: one parameter server job, - one worker job and one evaluator job. The names of all three jobs - share a common base job name. - """ - job_path = AuroraPath(dc=os.environ.get("TWML_JOB_CLUSTER"), - role=os.environ.get("TWML_JOB_ROLE"), - env=os.environ.get("TWML_JOB_ENV"), - job_name=os.environ.get("TWML_DISTRIBUTED_BASE_JOBNAME")) - return job_path - -def do_every_n_steps(action, num_steps): - """ - Execute a sequence of TensorFlow operations only once in a while. - Specifically, `action` is performed if `global_step` is a - multiple of `num_steps`. - - Args: - action: callable to be performed at regular intervals. This callable - must return a TF op with no output tensors. - num_steps: period of performing the action, as measured - in number of training steps - - Returns: - A TensorFlow op with no output tensors, like a tf.print() or tf.no_op(). - You must use tf.control_dependencies() to execute the op.
- - """ - global_step = tf.train.get_or_create_global_step() - condition = tf.math.equal(tf.math.floormod(global_step, num_steps), 0) - return tf.cond(condition, action, lambda: tf.no_op()) diff --git a/twml/twml_common/__init__.py b/twml/twml_common/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/twml/twml_common/initializer.py b/twml/twml_common/initializer.py deleted file mode 100644 index 7a9c734c7..000000000 --- a/twml/twml_common/initializer.py +++ /dev/null @@ -1,14 +0,0 @@ -import tensorflow.compat.v1 as tf - - -class PartitionInitializer(tf.keras.initializers.Initializer): - """Required to initialize partitioned weight with numpy array for tests""" - - def __init__(self, np_array): - self.np_array = np_array - - def __call__(self, shape, dtype=None, partition_info=None): - offset = partition_info.var_offset - ix0, ix1 = offset[0], offset[0] + shape[0] - iy0, iy1 = offset[1], offset[1] + shape[1] - return self.np_array[ix0:ix1, iy0:iy1] diff --git a/twml/twml_common/serialize.py b/twml/twml_common/serialize.py deleted file mode 100644 index 36c53881e..000000000 --- a/twml/twml_common/serialize.py +++ /dev/null @@ -1,16 +0,0 @@ -from thrift.protocol import TBinaryProtocol -from thrift.transport import TTransport - - -def serialize(obj): - tbuf = TTransport.TMemoryBuffer() - iproto = TBinaryProtocol.TBinaryProtocol(tbuf) - obj.write(iproto) - return tbuf.getvalue() - - -def deserialize(record, bytes): - tbuf = TTransport.TMemoryBuffer(bytes) - iproto = TBinaryProtocol.TBinaryProtocol(tbuf) - record.read(iproto) - return record diff --git a/twml/twml_common/sparse_inputs.py b/twml/twml_common/sparse_inputs.py deleted file mode 100644 index b8f7939e5..000000000 --- a/twml/twml_common/sparse_inputs.py +++ /dev/null @@ -1,24 +0,0 @@ -import numpy as np -import tensorflow.compat.v1 as tf - - -def create_sparse_tensor(batch_size, input_size, num_values, dtype=tf.float32): - random_indices = np.sort(np.random.randint(batch_size * input_size, size=num_values)) - test_indices_i = random_indices // input_size - test_indices_j = random_indices % input_size - test_indices = np.stack([test_indices_i, test_indices_j], axis=1) - test_values = np.random.random(num_values).astype(dtype.as_numpy_dtype) - - return tf.SparseTensor(indices=tf.constant(test_indices), - values=tf.constant(test_values), - dense_shape=(batch_size, input_size)) - - -def create_reference_input(sparse_input, use_binary_values): - if use_binary_values: - sp_a = tf.SparseTensor(indices=sparse_input.indices, - values=tf.ones_like(sparse_input.values), - dense_shape=sparse_input.dense_shape) - else: - sp_a = sparse_input - return sp_a