mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-01 08:48:46 +02:00
Delete navi/segdense directory
This commit is contained in:
parent
db101ede46
commit
d10beb7211
|
@ -1,11 +0,0 @@
|
|||
[package]
name = "segdense"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
# Serialization framework; "derive" powers the #[derive(Serialize, Deserialize)]
# on the schema structs.
serde = { version = "1.0.104", features = ["derive"] }
# JSON parsing/serialization of segdense schema files.
serde_json = "1.0.48"
# Logging facade; debug! output in util.rs.
log = "0.4.17"
|
@ -1,43 +0,0 @@
|
|||
use std::fmt::Display;
|
||||
|
||||
/**
 * Custom error type for segdense schema loading: wraps I/O and JSON
 * parse failures, and reports structural problems found while
 * validating the schema document.
 */
#[derive(Debug)]
pub enum SegDenseError {
    /// Underlying file/stream I/O failure.
    IoError(std::io::Error),
    /// JSON (de)serialization failure from serde_json.
    Json(serde_json::Error),
    /// Schema JSON: root node missing.
    JsonMissingRoot,
    /// Expected a JSON object (e.g. input_features_map) but found another value type.
    JsonMissingObject,
    /// Expected a JSON array but found another value type.
    JsonMissingArray,
    /// A JSON array did not have the expected number of elements.
    JsonArraySize,
    /// An input feature entry was missing from the schema.
    JsonMissingInputFeature,
}
|
||||
|
||||
impl Display for SegDenseError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
SegDenseError::IoError(io_error) => write!(f, "{}", io_error),
|
||||
SegDenseError::Json(serde_json) => write!(f, "{}", serde_json),
|
||||
SegDenseError::JsonMissingRoot => write!(f, "{}", "SegDense JSON: Root Node note found!"),
|
||||
SegDenseError::JsonMissingObject => write!(f, "{}", "SegDense JSON: Object note found!"),
|
||||
SegDenseError::JsonMissingArray => write!(f, "{}", "SegDense JSON: Array Node note found!"),
|
||||
SegDenseError::JsonArraySize => write!(f, "{}", "SegDense JSON: Array size not as expected!"),
|
||||
SegDenseError::JsonMissingInputFeature => write!(f, "{}", "SegDense JSON: Missing input feature!"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl: SegDenseError already provides Debug + Display, so the
// default std::error::Error methods suffice.
impl std::error::Error for SegDenseError {}
|
||||
|
||||
impl From<std::io::Error> for SegDenseError {
|
||||
fn from(err: std::io::Error) -> Self {
|
||||
SegDenseError::IoError(err)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for SegDenseError {
|
||||
fn from(err: serde_json::Error) -> Self {
|
||||
SegDenseError::Json(err)
|
||||
}
|
||||
}
|
|
@ -1,4 +0,0 @@
|
|||
/// Crate-wide error type (SegDenseError).
pub mod error;
/// serde data model for the segdense transform-spec JSON schema.
pub mod segdense_transform_spec_home_recap_2022;
/// feature-id -> (tensor index, offset) mapping types.
pub mod mapper;
/// Loading/parsing helpers that turn a schema into a FeatureMapper.
pub mod util;
|
|
@ -1,23 +0,0 @@
|
|||
use std::env;
|
||||
use std::fs;
|
||||
|
||||
use segdense::error::SegDenseError;
|
||||
use segdense::util;
|
||||
|
||||
/// CLI entry point: loads a segdense schema from the path given as the
/// first argument (defaulting to "json/compact.json") and validates it
/// by building a feature mapper.
fn main() -> Result<(), SegDenseError> {
    env_logger::init();

    // First CLI argument, when present, overrides the default schema path.
    let args: Vec<String> = env::args().collect();
    let schema_file_name: &str = match args.get(1) {
        Some(path) => path,
        None => "json/compact.json",
    };

    let json_str = fs::read_to_string(schema_file_name)?;
    util::safe_load_config(&json_str)?;

    Ok(())
}
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
/// Location of a single feature inside the model's input tensors.
#[derive(Debug)]
pub struct FeatureInfo {
    /// Which tensor in the input tensor list holds this feature (-1 = none).
    pub tensor_index: i8,
    /// Offset of the feature's value within that tensor (-1 = none).
    pub index_within_tensor: i64,
}
|
||||
|
||||
/// Sentinel FeatureInfo (-1, -1) representing "no mapping".
pub static NULL_INFO: FeatureInfo = FeatureInfo {
    tensor_index: -1,
    index_within_tensor: -1,
};
|
||||
|
||||
/// Maps a 64-bit feature id to its location in the input tensors.
#[derive(Debug, Default)]
pub struct FeatureMapper {
    map: HashMap<i64, FeatureInfo>,
}
|
||||
|
||||
impl FeatureMapper {
|
||||
pub fn new() -> FeatureMapper {
|
||||
FeatureMapper {
|
||||
map: HashMap::new()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Write-side interface for building a feature-id -> FeatureInfo map.
pub trait MapWriter {
    /// Associates `feature_id` with `info`, replacing any prior entry.
    fn set(&mut self, feature_id: i64, info: FeatureInfo);
}
|
||||
|
||||
/// Read-side interface for looking up a feature's tensor location.
pub trait MapReader {
    /// Returns the mapping for `feature_id`, or None if unregistered.
    fn get(&self, feature_id: &i64) -> Option<&FeatureInfo>;
}
|
||||
|
||||
impl MapWriter for FeatureMapper {
    /// Inserts (or overwrites) the mapping for `feature_id`.
    fn set(&mut self, feature_id: i64, info: FeatureInfo) {
        self.map.insert(feature_id, info);
    }
}
|
||||
|
||||
impl MapReader for FeatureMapper {
    /// Looks up the tensor location for `feature_id`.
    fn get(&self, feature_id: &i64) -> Option<&FeatureInfo> {
        self.map.get(feature_id)
    }
}
|
|
@ -1,183 +0,0 @@
|
|||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
/// Top-level segdense transform-spec document (home recap 2022 model).
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Root {
    // Note: the explicit field-level renames below take precedence over the
    // container-level `rename_all = "camelCase"` rule.
    #[serde(rename = "common_prefix")]
    pub common_prefix: String,
    #[serde(rename = "densification_transform_spec")]
    pub densification_transform_spec: DensificationTransformSpec,
    #[serde(rename = "identity_transform_spec")]
    pub identity_transform_spec: Vec<IdentityTransformSpec>,
    #[serde(rename = "complex_feature_type_transform_spec")]
    pub complex_feature_type_transform_spec: Vec<ComplexFeatureTypeTransformSpec>,
    /// Kept as raw JSON; util::load_from_parsed_config expects an object whose
    /// values are single-element arrays of InputFeature records.
    #[serde(rename = "input_features_map")]
    pub input_features_map: Value,
}
|
||||
|
||||
/// Densification specs for each feature class.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct DensificationTransformSpec {
    pub discrete: Discrete,
    pub cont: Cont,
    pub binary: Binary,
    /// Left as raw JSON; the StringType struct below models its shape. Use StringType.
    pub string: Value,
    pub blob: Blob,
}
|
||||
|
||||
/// Densification spec for the `discrete` feature class.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Discrete {
    pub tag: String,
    #[serde(rename = "generic_feature_type")]
    pub generic_feature_type: i64,
    #[serde(rename = "feature_identifier")]
    pub feature_identifier: String,
    #[serde(rename = "fixed_length")]
    pub fixed_length: i64,
    #[serde(rename = "default_value")]
    pub default_value: DefaultValue,
    #[serde(rename = "input_features")]
    pub input_features: Vec<InputFeature>,
}
|
||||
|
||||
/// Fallback value for a feature class, stored as a typed string.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct DefaultValue {
    // `type` is a Rust keyword, hence the `type_field` name with a rename.
    #[serde(rename = "type")]
    pub type_field: String,
    pub value: String,
}
|
||||
|
||||
/// A single input feature record within the schema.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct InputFeature {
    /// 64-bit hash of the feature name used in DataRecords.
    #[serde(rename = "feature_id")]
    pub feature_id: i64,
    #[serde(rename = "full_feature_name")]
    pub full_feature_name: String,
    #[serde(rename = "feature_type")]
    pub feature_type: i64,
    /// Position of the feature's value within its tensor.
    pub index: i64,
    /// When true, util::to_feature_info skips this feature entirely.
    #[serde(rename = "maybe_exclude")]
    pub maybe_exclude: bool,
    pub tag: String,
    #[serde(rename = "added_at")]
    pub added_at: i64,
}
|
||||
|
||||
/// Densification spec for the `cont` (continuous) feature class.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Cont {
    pub tag: String,
    #[serde(rename = "generic_feature_type")]
    pub generic_feature_type: i64,
    #[serde(rename = "feature_identifier")]
    pub feature_identifier: String,
    #[serde(rename = "fixed_length")]
    pub fixed_length: i64,
    #[serde(rename = "default_value")]
    pub default_value: DefaultValue,
    #[serde(rename = "input_features")]
    pub input_features: Vec<InputFeature>,
}
|
||||
|
||||
/// Densification spec for the `binary` feature class.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Binary {
    pub tag: String,
    #[serde(rename = "generic_feature_type")]
    pub generic_feature_type: i64,
    #[serde(rename = "feature_identifier")]
    pub feature_identifier: String,
    #[serde(rename = "fixed_length")]
    pub fixed_length: i64,
    #[serde(rename = "default_value")]
    pub default_value: DefaultValue,
    #[serde(rename = "input_features")]
    pub input_features: Vec<InputFeature>,
}
|
||||
|
||||
/// Densification spec for the `string` feature class.
/// Currently unused directly: Root stores the string spec as raw Value.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct StringType {
    pub tag: String,
    #[serde(rename = "generic_feature_type")]
    pub generic_feature_type: i64,
    #[serde(rename = "feature_identifier")]
    pub feature_identifier: String,
    #[serde(rename = "fixed_length")]
    pub fixed_length: i64,
    #[serde(rename = "default_value")]
    pub default_value: DefaultValue,
    #[serde(rename = "input_features")]
    pub input_features: Vec<InputFeature>,
}
|
||||
|
||||
/// Densification spec for the `blob` feature class.
/// Unlike the other classes, its input features stay as raw JSON values.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Blob {
    pub tag: String,
    #[serde(rename = "generic_feature_type")]
    pub generic_feature_type: i64,
    #[serde(rename = "feature_identifier")]
    pub feature_identifier: String,
    #[serde(rename = "fixed_length")]
    pub fixed_length: i64,
    #[serde(rename = "default_value")]
    pub default_value: DefaultValue,
    #[serde(rename = "input_features")]
    pub input_features: Vec<Value>,
}
|
||||
|
||||
/// A feature passed through unchanged (no densification).
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct IdentityTransformSpec {
    #[serde(rename = "feature_id")]
    pub feature_id: i64,
    #[serde(rename = "full_feature_name")]
    pub full_feature_name: String,
    #[serde(rename = "feature_type")]
    pub feature_type: i64,
}
|
||||
|
||||
/// Transform spec for complex (e.g. tensor-valued) features.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ComplexFeatureTypeTransformSpec {
    #[serde(rename = "feature_id")]
    pub feature_id: i64,
    #[serde(rename = "full_feature_name")]
    pub full_feature_name: String,
    #[serde(rename = "feature_type")]
    pub feature_type: i64,
    pub index: i64,
    #[serde(rename = "maybe_exclude")]
    pub maybe_exclude: bool,
    pub tag: String,
    /// Optional in the schema, hence Option rather than a bare i64.
    #[serde(rename = "tensor_data_type")]
    pub tensor_data_type: Option<i64>,
    #[serde(rename = "added_at")]
    pub added_at: i64,
    /// Defaults to an empty Vec when absent from the JSON.
    #[serde(rename = "tensor_shape")]
    #[serde(default)]
    pub tensor_shape: Vec<i64>,
}
|
||||
|
||||
|
||||
/// One record from Root::input_features_map; mirrors InputFeature's fields.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct InputFeatureMapRecord {
    #[serde(rename = "feature_id")]
    pub feature_id: i64,
    #[serde(rename = "full_feature_name")]
    pub full_feature_name: String,
    #[serde(rename = "feature_type")]
    pub feature_type: i64,
    pub index: i64,
    #[serde(rename = "maybe_exclude")]
    pub maybe_exclude: bool,
    pub tag: String,
    #[serde(rename = "added_at")]
    pub added_at: i64,
}
|
|
@ -1,159 +0,0 @@
|
|||
use std::fs;
|
||||
use log::{debug};
|
||||
|
||||
use serde_json::{Value, Map};
|
||||
|
||||
use crate::error::SegDenseError;
|
||||
use crate::mapper::{FeatureMapper, FeatureInfo, MapWriter};
|
||||
use crate::segdense_transform_spec_home_recap_2022::{self as seg_dense, InputFeature};
|
||||
|
||||
pub fn load_config(file_name: &str) -> seg_dense::Root {
|
||||
let json_str = fs::read_to_string(file_name).expect(
|
||||
&format!("Unable to load segdense file {}", file_name));
|
||||
let seg_dense_config = parse(&json_str).expect(
|
||||
&format!("Unable to parse segdense file {}", file_name));
|
||||
return seg_dense_config;
|
||||
}
|
||||
|
||||
pub fn parse(json_str: &str) -> Result<seg_dense::Root, SegDenseError> {
|
||||
let root: seg_dense::Root = serde_json::from_str(json_str)?;
|
||||
return Ok(root);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a json string containing a seg dense schema create a feature mapper
|
||||
* which is essentially:
|
||||
*
|
||||
* {feature-id -> (Tensor Index, Index of feature within the tensor)}
|
||||
*
|
||||
* Feature id : 64 bit hash of the feature name used in DataRecords.
|
||||
*
|
||||
* Tensor Index : A vector of tensors is passed to the model. Tensor
|
||||
* index refers to the tensor this feature is part of.
|
||||
*
|
||||
* Index of feature in tensor : The tensors are vectors, the index of
|
||||
* feature is the position to put the feature value.
|
||||
*
|
||||
* There are many assumptions made in this function that is very model specific.
|
||||
* These assumptions are called out below and need to be schematized eventually.
|
||||
*
|
||||
* Call this once for each segdense schema and cache the FeatureMapper.
|
||||
*/
|
||||
pub fn safe_load_config(json_str: &str) -> Result<FeatureMapper, SegDenseError> {
|
||||
let root = parse(json_str)?;
|
||||
load_from_parsed_config(root)
|
||||
}
|
||||
|
||||
pub fn load_from_parsed_config_ref(root: &seg_dense::Root) -> FeatureMapper {
|
||||
load_from_parsed_config(root.clone()).unwrap_or_else(
|
||||
|error| panic!("Error loading all_config.json - {}", error))
|
||||
}
|
||||
|
||||
// Perf note : make 'root' un-owned
|
||||
pub fn load_from_parsed_config(root: seg_dense::Root) ->
|
||||
Result<FeatureMapper, SegDenseError> {
|
||||
|
||||
let v = root.input_features_map;
|
||||
|
||||
// Do error check
|
||||
let map: Map<String, Value> = match v {
|
||||
Value::Object(map) => map,
|
||||
_ => return Err(SegDenseError::JsonMissingObject),
|
||||
};
|
||||
|
||||
let mut fm: FeatureMapper = FeatureMapper::new();
|
||||
|
||||
let items = map.values();
|
||||
|
||||
// Perf : Consider a way to avoid clone here
|
||||
for item in items.cloned() {
|
||||
let mut vec = match item {
|
||||
Value::Array(v) => v,
|
||||
_ => return Err(SegDenseError::JsonMissingArray),
|
||||
};
|
||||
|
||||
if vec.len() != 1 {
|
||||
return Err(SegDenseError::JsonArraySize);
|
||||
}
|
||||
|
||||
let val = vec.pop().unwrap();
|
||||
|
||||
let input_feature: seg_dense::InputFeature = serde_json::from_value(val)?;
|
||||
let feature_id = input_feature.feature_id;
|
||||
let feature_info = to_feature_info(&input_feature);
|
||||
|
||||
match feature_info {
|
||||
Some(info) => {
|
||||
debug!("{:?}", info);
|
||||
fm.set(feature_id, info)
|
||||
},
|
||||
None => (),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(fm)
|
||||
}
|
||||
#[allow(dead_code)]
|
||||
fn add_feature_info_to_mapper(feature_mapper: &mut FeatureMapper, input_features: &Vec<InputFeature>) {
|
||||
for input_feature in input_features.iter() {
|
||||
let feature_id = input_feature.feature_id;
|
||||
let feature_info = to_feature_info(input_feature);
|
||||
|
||||
match feature_info {
|
||||
Some(info) => {
|
||||
debug!("{:?}", info);
|
||||
feature_mapper.set(feature_id, info)
|
||||
},
|
||||
None => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps an input feature record to its (tensor index, offset) location.
///
/// Returns None when the feature is explicitly excluded, has a negative
/// index, or its type does not map to a known tensor slot.
pub fn to_feature_info(input_feature: &seg_dense::InputFeature) -> Option<FeatureInfo> {
    if input_feature.maybe_exclude {
        return None;
    }

    // This part needs to be schema driven
    //
    // tensor index : Which of these tensors this feature is part of
    // [Continuous, Binary, Discrete, User_embedding, user_eng_embedding, author_embedding]
    // Note that this order is fixed/hardcoded here, and needs to be schematized
    //
    let tensor_idx: i8 = match input_feature.feature_id {
        // user.timelines.twhin_user_follow_embeddings.twhin_user_follow_embeddings
        // Feature name is mapped to a feature-id value. The hardcoded values below correspond to a specific feature name.
        -2550691008059411095 => 3,

        // user.timelines.twhin_user_engagement_embeddings.twhin_user_engagement_embeddings
        5390650078733277231 => 4,

        // original_author.timelines.twhin_author_follow_embeddings.twhin_author_follow_embeddings
        3223956748566688423 => 5,

        _ => match input_feature.feature_type {
            // feature_type : src/thrift/com/twitter/ml/api/data.thrift
            // BINARY = 1, CONTINUOUS = 2, DISCRETE = 3,
            // Map to slots in [Continuous, Binary, Discrete, ..]
            1 => 1,
            2 => 0,
            3 => 2,
            _ => -1, // unknown type: handled below by returning None
        }
    };

    // Negative index means the feature has no slot within its tensor.
    if input_feature.index < 0 {
        return None;
    }

    // Handle this case later (unknown feature type / tensor slot).
    if tensor_idx == -1 {
        return None;
    }

    Some(FeatureInfo {
        tensor_index: tensor_idx,
        index_within_tensor: input_feature.index,
    })
}
|
||||
|
Loading…
Reference in New Issue
Block a user