mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-14 07:08:53 +02:00
ef4c5eb65e
Please note that we have force-pushed a new initial commit in order to remove some publicly available Twitter user information. Note that this process may be required again in the future.
331 lines
13 KiB
C++
331 lines
13 KiB
C++
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==============================================================================*/
|
|
|
|
// TWML modified to optimize binary features:
|
|
// - Sparse tensor values are assumed to be binary, so only add operation is done
|
|
// rather than mul-add;
|
|
// - In house version of vectorization is used instead of Eigen;
|
|
// - Enable sharding and multithreading.
|
|
|
|
#define EIGEN_USE_THREADS
|
|
|
|
#include "binary_sparse_dense_matmul.h"
|
|
#include "binary_sparse_dense_matmul_impl.h"
|
|
|
|
#include "tensorflow/core/framework/bounds_check.h"
|
|
#include "tensorflow/core/framework/op.h"
|
|
#include "tensorflow/core/framework/op_kernel.h"
|
|
#include "tensorflow/core/framework/common_shape_fns.h"
|
|
#include "tensorflow/core/framework/shape_inference.h"
|
|
|
|
namespace tensorflow {
|
|
|
|
namespace shape_inference {

// Shape-inference registration for BinarySparseTensorDenseMatMul.
//
// TODO: The `a_values` input is supposed to be all ones (the op ignores the
// values and performs add instead of mul-add). Users should not call this op
// directly but use it from the `sparse_op` python library. To make it
// consistent with the original op, the signature remains the same currently;
// we will think of a better way to constrain correct use of this op.
// CX-18174
REGISTER_OP("BinarySparseTensorDenseMatMul")
    .Input("a_indices: Tindices")  // [nnz, 2] coordinates of A's nonzeros
    .Input("a_values: T")          // [nnz] values; assumed to be all ones
    .Input("a_shape: int64")       // [2] dense shape of A
    .Input("b: T")                 // dense right-hand matrix
    .Output("product: T")
    .Attr("T: type")
    .Attr("Tindices: {int32,int64} = DT_INT64")
    .Attr("adjoint_a: bool = false")
    .Attr("adjoint_b: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      DimensionHandle unused_dim;
      ShapeHandle unused;
      ShapeHandle b;
      ShapeHandle a_shape;
      // a_indices must be a rank-2 matrix; a_values a rank-1 vector.
      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));  // a_indices
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));  // a_values
      // a_shape is a shape-carrying tensor (input 2); interpret it as the
      // rank-2 dense shape of A.
      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &a_shape));
      TF_RETURN_IF_ERROR(c->WithRank(a_shape, 2, &a_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &b));

      bool adjoint_a;
      bool adjoint_b;
      TF_RETURN_IF_ERROR(c->GetAttr("adjoint_a", &adjoint_a));
      TF_RETURN_IF_ERROR(c->GetAttr("adjoint_b", &adjoint_b));

      // Pick logical (outer/inner) dims according to the adjoint flags, then
      // require the two inner dims to agree.
      DimensionHandle output_right = c->Dim(b, adjoint_b ? 0 : 1);
      DimensionHandle output_left = c->Dim(a_shape, adjoint_a ? 1 : 0);
      DimensionHandle inner_left = c->Dim(a_shape, adjoint_a ? 0 : 1);
      DimensionHandle inner_right = c->Dim(b, adjoint_b ? 1 : 0);
      TF_RETURN_IF_ERROR(c->Merge(inner_left, inner_right, &unused_dim));
      c->set_output(0, c->Matrix(output_left, output_right));
      return Status::OK();
    });
}  // namespace shape_inference
|
|
|
|
|
|
typedef Eigen::ThreadPoolDevice CPUDevice;
|
|
|
|
template <typename Device, typename T, typename Tindices>
|
|
class BinarySparseTensorDenseMatMulOp : public OpKernel {
|
|
public:
|
|
explicit BinarySparseTensorDenseMatMulOp(OpKernelConstruction* ctx)
|
|
: OpKernel(ctx) {
|
|
OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint_a", &adjoint_a_));
|
|
OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint_b", &adjoint_b_));
|
|
}
|
|
|
|
void Compute(OpKernelContext* ctx) override {
|
|
const Tensor* a_indices;
|
|
const Tensor* a_values;
|
|
const Tensor* a_shape;
|
|
const Tensor* b;
|
|
OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices));
|
|
OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values));
|
|
OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape));
|
|
OP_REQUIRES_OK(ctx, ctx->input("b", &b));
|
|
|
|
// Check that the dimensions of the two matrices are valid.
|
|
OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b->shape()),
|
|
errors::InvalidArgument("Tensor 'b' is not a matrix"));
|
|
|
|
OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()),
|
|
errors::InvalidArgument("Tensor 'a_shape' is not a vector"));
|
|
|
|
OP_REQUIRES(
|
|
ctx, a_shape->NumElements() == 2,
|
|
errors::InvalidArgument("Tensor 'a_shape' must have 2 elements"));
|
|
|
|
OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values->shape()),
|
|
errors::InvalidArgument("Tensor 'a_values' is not a vector"));
|
|
|
|
OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()),
|
|
errors::InvalidArgument("Tensor 'a_indices' is not a matrix"));
|
|
|
|
const int64 nnz = a_indices->shape().dim_size(0);
|
|
OP_REQUIRES(ctx, nnz == a_values->NumElements(),
|
|
errors::InvalidArgument("Number of rows of a_indices does not "
|
|
"match number of entries in a_values"));
|
|
|
|
OP_REQUIRES(
|
|
ctx, a_indices->shape().dim_size(1) == a_shape->NumElements(),
|
|
errors::InvalidArgument("Number of columns of a_indices does not match "
|
|
"number of entries in a_shape"));
|
|
|
|
auto a_shape_t = a_shape->vec<int64>();
|
|
const int64 outer_left = (adjoint_a_) ? a_shape_t(1) : a_shape_t(0);
|
|
const int64 outer_right =
|
|
(adjoint_b_) ? b->shape().dim_size(0) : b->shape().dim_size(1);
|
|
const int64 inner_left = (adjoint_a_) ? a_shape_t(0) : a_shape_t(1);
|
|
const int64 inner_right =
|
|
(adjoint_b_) ? b->shape().dim_size(1) : b->shape().dim_size(0);
|
|
|
|
OP_REQUIRES(
|
|
ctx, inner_right == inner_left,
|
|
errors::InvalidArgument(
|
|
"Cannot multiply A and B because inner dimension does not match: ",
|
|
inner_left, " vs. ", inner_right,
|
|
". Did you forget a transpose? "
|
|
"Dimensions of A: [",
|
|
a_shape_t(0), ", ", a_shape_t(1),
|
|
"). Dimensions of B: ", b->shape().DebugString()));
|
|
|
|
TensorShape out_shape({outer_left, outer_right});
|
|
Tensor* out = nullptr;
|
|
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
|
|
|
|
if (out->NumElements() == 0) {
|
|
// If a has shape [0, x] or b has shape [x, 0], the output shape
|
|
// is a 0-element matrix, so there is nothing to do.
|
|
return;
|
|
}
|
|
|
|
if (a_values->NumElements() == 0 || b->NumElements() == 0) {
|
|
// If a has shape [x, 0] and b has shape [0, y], the
|
|
// output shape is [x, y] where x and y are non-zero, so we fill
|
|
// the output with zeros.
|
|
out->flat<T>().device(ctx->eigen_device<Device>()) =
|
|
out->flat<T>().constant(T(0));
|
|
return;
|
|
}
|
|
|
|
#define MAYBE_ADJOINT(ADJ_A, ADJ_B) \
|
|
if (adjoint_a_ == ADJ_A && adjoint_b_ == ADJ_B) { \
|
|
Status functor_status = functor::SparseTensorDenseMatMulFunctor< \
|
|
Device, T, Tindices, ADJ_A, \
|
|
ADJ_B>::Compute(ctx, a_indices, a_values, a_shape, b, out); \
|
|
OP_REQUIRES_OK(ctx, functor_status); \
|
|
}
|
|
|
|
MAYBE_ADJOINT(false, false);
|
|
MAYBE_ADJOINT(false, true);
|
|
MAYBE_ADJOINT(true, false);
|
|
MAYBE_ADJOINT(true, true);
|
|
|
|
#undef MAYBE_ADJOINT
|
|
}
|
|
|
|
private:
|
|
bool adjoint_a_;
|
|
bool adjoint_b_;
|
|
};
|
|
|
|
// Registers the CPU kernel for one (value type, index type) pair. The
// a_shape input stays in host memory because Compute() reads it directly.
#define REGISTER_CPU(TypeT, TypeIndex)           \
  REGISTER_KERNEL_BUILDER(                       \
      Name("BinarySparseTensorDenseMatMul")      \
          .Device(DEVICE_CPU)                    \
          .TypeConstraint<TypeT>("T")            \
          .TypeConstraint<TypeIndex>("Tindices") \
          .HostMemory("a_shape"),                \
      BinarySparseTensorDenseMatMulOp<CPUDevice, TypeT, TypeIndex>);

// Registers a value type for both supported index types (int64 and int32).
#define REGISTER_KERNELS_CPU(T) \
  REGISTER_CPU(T, int64);       \
  REGISTER_CPU(T, int32)

REGISTER_KERNELS_CPU(float);
REGISTER_KERNELS_CPU(double);
REGISTER_KERNELS_CPU(int32);
REGISTER_KERNELS_CPU(complex64);
REGISTER_KERNELS_CPU(complex128);
|
|
|
|
namespace functor {
|
|
|
|
namespace {
|
|
Status KOutOfBoundsError(int64 k, std::size_t i, int rhs_index_a,
|
|
std::size_t lhs_right) {
|
|
return errors::InvalidArgument("k (", k, ") from index[", i, ",", rhs_index_a,
|
|
"] out of bounds (>=", lhs_right, ")");
|
|
}
|
|
|
|
Status MOutOfBoundsError(int64 m, std::size_t i, int lhs_index_a,
|
|
int64 out_dim0) {
|
|
return errors::InvalidArgument("m (", m, ") from index[", i, ",", lhs_index_a,
|
|
"] out of bounds (>=", out_dim0, ")");
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
// The general functor just borrows the code from tf except that add is computed
// instead of mul-add: a_values are assumed to be all ones, so each nonzero
// (m, k) of A simply accumulates B's row/column k into output row m.
template <typename T, typename Tindices, bool ADJ_A, bool ADJ_B>
struct SparseTensorDenseMatMulFunctor<CPUDevice, T, Tindices, ADJ_A, ADJ_B> {
  // Vectorize certain operations above this size.
  static const std::size_t kNumVectorize = 32;

  // Unpacks the tensors into Eigen views and forwards to EigenCompute.
  static Status Compute(OpKernelContext* ctx,
                        const Tensor *a_indices,
                        const Tensor *a_values,
                        const Tensor *a_shape,
                        const Tensor *b,
                        Tensor *out) {
    return EigenCompute(ctx->eigen_device<CPUDevice>(), out->matrix<T>(),
                        a_indices->matrix<Tindices>(), a_values->vec<T>(),
                        b->matrix<T>());
  }

  // Zeroes `out`, then for each nonzero of A adds the matching slice of `b`
  // into the corresponding output row, bounds-checking both indices.
  // a_values is only consulted for its size (nnz), never its contents.
  static Status EigenCompute(const CPUDevice& d, typename TTypes<T>::Matrix out,
                             typename TTypes<Tindices>::ConstMatrix a_indices,
                             typename TTypes<T>::ConstVec a_values,
                             typename TTypes<T>::ConstMatrix b) {
    const std::size_t nnz = a_values.size();
    // Logical extents of B after the optional adjoint.
    const std::size_t rhs_right = (ADJ_B ? b.dimension(0) : b.dimension(1));
    const std::size_t lhs_right = (ADJ_B ? b.dimension(1) : b.dimension(0));
    // Which column of a_indices holds the output-row index (m) and which
    // holds the inner index (k), depending on whether A is adjointed.
    const int lhs_index_a = ADJ_A ? 1 : 0;
    const int rhs_index_a = ADJ_A ? 0 : 1;

    out.setZero();

    if (rhs_right < kNumVectorize) {
      // Disable vectorization if the RHS of output is too small
      auto maybe_adjoint_b = MaybeAdjoint<decltype(b), ADJ_B>(b);

      for (std::size_t i = 0; i < nnz; ++i) {
        const Tindices m = internal::SubtleMustCopy(a_indices(i, lhs_index_a));
        const Tindices k = internal::SubtleMustCopy(a_indices(i, rhs_index_a));
        if (!FastBoundsCheck(k, lhs_right)) {
          return KOutOfBoundsError(k, i, rhs_index_a, lhs_right);
        }
        if (!FastBoundsCheck(m, out.dimension(0))) {
          return MOutOfBoundsError(m, i, lhs_index_a, out.dimension(0));
        }
        // Binary A: plain add rather than the usual value * b_value.
        for (std::size_t n = 0; n < rhs_right; ++n) {
          const T b_value = maybe_adjoint_b(k, n);
          out(m, n) += b_value;
        }
      }
    } else {
      // Vectorization via Eigen.
      const int b_chip_index = ADJ_B ? 1 : 0;

// Per-nonzero loop: bounds-check (m, k) and accumulate one chip of
// b_passed into output row m; returns early on any bad index.
#define LOOP_NNZ(b_passed)                                                  \
  for (std::size_t i = 0; i < nnz; ++i) {                                   \
    const Tindices m = internal::SubtleMustCopy(a_indices(i, lhs_index_a)); \
    const Tindices k = internal::SubtleMustCopy(a_indices(i, rhs_index_a)); \
    if (!FastBoundsCheck(k, lhs_right)) {                                   \
      return KOutOfBoundsError(k, i, rhs_index_a, lhs_right);               \
    }                                                                       \
    if (!FastBoundsCheck(m, out.dimension(0))) {                            \
      return MOutOfBoundsError(m, i, lhs_index_a, out.dimension(0));        \
    }                                                                       \
    out.template chip<0>(m) += b_passed.template chip<b_chip_index>(k);     \
  }

      if (ADJ_B) {
        // Perform transpose and conjugation on B once, since we chip out B's
        // columns in the nnz loop.
        Eigen::array<int, 2> shuffle;  // preserve dimension order
        shuffle[0] = 1; shuffle[1] = 0;
        Eigen::Tensor<T, 2, Eigen::ColMajor> col_major_conj_b =
            b.swap_layout().shuffle(shuffle).conjugate();
        LOOP_NNZ(col_major_conj_b);
      } else {
        LOOP_NNZ(b);
      }
#undef LOOP_NNZ
    }
    return Status::OK();
  }
};
|
|
|
|
|
|
// We have only specified and optimised the case with no matrix transpose,
|
|
// since it is the most typical usage in productions.
|
|
template <typename Tindices>
|
|
struct SparseTensorDenseMatMulFunctor<CPUDevice,
|
|
float, Tindices, false, false> {
|
|
static Status Compute(OpKernelContext* ctx,
|
|
const Tensor *a_indices,
|
|
const Tensor *a_values,
|
|
const Tensor *a_shape,
|
|
const Tensor *b,
|
|
Tensor *out) {
|
|
auto a_indices_ptr = a_indices->flat<Tindices>().data();
|
|
auto b_ptr = b->flat<float>().data();
|
|
auto out_ptr = out->flat<float>().data();
|
|
const int64 nnz = a_indices->shape().dim_size(0);
|
|
const int64 outer_left = a_shape->vec<int64>()(0);
|
|
const int64 outer_right = b->shape().dim_size(1);
|
|
ParallelLookupAndSegmentSum<Tindices>(ctx, a_indices_ptr, b_ptr, nnz,
|
|
outer_left, outer_right, out_ptr);
|
|
return Status::OK();
|
|
}
|
|
};
|
|
|
|
} // namespace functor
|
|
|
|
} // namespace tensorflow
|