Delete navi/segdense directory

This commit is contained in:
kenan238 2023-04-05 20:56:20 +03:00 committed by GitHub
parent db101ede46
commit d10beb7211
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 0 additions and 468 deletions

View File

@ -1,11 +0,0 @@
[package]
name = "segdense"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
serde = { version = "1.0.104", features = ["derive"] }
serde_json = "1.0.48"
log = "0.4.17"

View File

@ -1,43 +0,0 @@
use std::fmt::Display;
/// Errors produced while loading or interpreting a SegDense schema.
///
/// `std::io::Error` and `serde_json::Error` convert into this type through
/// the `From` impls in this module, so both can be propagated with `?`.
#[derive(Debug)]
pub enum SegDenseError {
/// An underlying I/O error; displayed using the wrapped error's own message.
IoError(std::io::Error),
/// An underlying `serde_json` error; displayed using the wrapped error's own message.
Json(serde_json::Error),
/// The root node of the JSON document was not found.
// NOTE(review): not constructed anywhere in the sources visible here.
JsonMissingRoot,
/// A JSON object was expected but something else was found
/// (returned by `util::load_from_parsed_config` when `input_features_map` is not an object).
JsonMissingObject,
/// A JSON array was expected but something else was found
/// (returned when an `input_features_map` value is not an array).
JsonMissingArray,
/// A JSON array did not have the expected size
/// (returned when an `input_features_map` value does not hold exactly one element).
JsonArraySize,
/// An input feature is missing.
// NOTE(review): not constructed anywhere in the sources visible here.
JsonMissingInputFeature,
}
impl Display for SegDenseError {
    /// Writes a human-readable description of the error.
    ///
    /// Wrapped I/O and JSON errors are rendered with their own `Display`
    /// output; every other variant renders a fixed `SegDense JSON: ...` message.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            SegDenseError::IoError(io_error) => return write!(f, "{}", io_error),
            SegDenseError::Json(json_error) => return write!(f, "{}", json_error),
            // The three "not found" messages previously read "note found".
            SegDenseError::JsonMissingRoot => "SegDense JSON: Root Node not found!",
            SegDenseError::JsonMissingObject => "SegDense JSON: Object not found!",
            SegDenseError::JsonMissingArray => "SegDense JSON: Array Node not found!",
            SegDenseError::JsonArraySize => "SegDense JSON: Array size not as expected!",
            SegDenseError::JsonMissingInputFeature => "SegDense JSON: Missing input feature!",
        };
        f.write_str(message)
    }
}
// Marker impl: no methods are overridden, so the trait's defaults apply and
// wrapped errors are surfaced only through this type's `Display` output.
impl std::error::Error for SegDenseError {}
impl From<std::io::Error> for SegDenseError {
fn from(err: std::io::Error) -> Self {
SegDenseError::IoError(err)
}
}
impl From<serde_json::Error> for SegDenseError {
fn from(err: serde_json::Error) -> Self {
SegDenseError::Json(err)
}
}

View File

@ -1,4 +0,0 @@
pub mod error;
pub mod segdense_transform_spec_home_recap_2022;
pub mod mapper;
pub mod util;

View File

@ -1,23 +0,0 @@
use std::env;
use std::fs;
use segdense::error::SegDenseError;
use segdense::util;
/// Command-line entry point: reads a SegDense schema file and builds its
/// feature mapper, which validates the schema as a side effect.
///
/// The schema path is the first command-line argument; when none is given,
/// `json/compact.json` is used.
///
/// # Errors
/// Returns a `SegDenseError` if the file cannot be read or if
/// `util::safe_load_config` rejects its contents.
// NOTE(review): `env_logger` is used here but is not listed among the
// dependencies of the manifest shown for this crate — confirm it is declared.
fn main() -> Result<(), SegDenseError> {
    env_logger::init();
    // `nth(1)` skips the program name. Unlike `args[1]`, it cannot panic when
    // the argument vector is empty, and it avoids collecting every argument.
    let schema_file_name = env::args()
        .nth(1)
        .unwrap_or_else(|| "json/compact.json".to_string());
    let json_str = fs::read_to_string(&schema_file_name)?;
    // The resulting mapper is intentionally dropped; only success/failure matters.
    util::safe_load_config(&json_str)?;
    Ok(())
}

View File

@ -1,45 +0,0 @@
use std::collections::HashMap;
/// Location of one feature inside the tensors handed to the model.
///
/// A small plain-data value: it is `Copy` and comparable, so it can be
/// checked against sentinels such as `NULL_INFO` and passed around by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct FeatureInfo {
    /// Index of the tensor this feature belongs to.
    pub tensor_index: i8,
    /// Position of the feature's value within that tensor.
    pub index_within_tensor: i64,
}
/// Sentinel `FeatureInfo` whose `tensor_index` and `index_within_tensor` are both `-1`.
// NOTE(review): presumably stands for "feature not mapped"; no use of it is
// visible in this module — confirm against callers.
pub static NULL_INFO: FeatureInfo = FeatureInfo {
tensor_index: -1,
index_within_tensor: -1,
};
/// Lookup table from a feature id to that feature's `FeatureInfo`.
///
/// Entries are written through `MapWriter::set` and read through `MapReader::get`.
#[derive(Debug, Default)]
pub struct FeatureMapper {
// Keyed by feature id. Private: all access goes through the trait impls below.
map: HashMap<i64, FeatureInfo>,
}
impl FeatureMapper {
pub fn new() -> FeatureMapper {
FeatureMapper {
map: HashMap::new()
}
}
}
/// Write access to a feature-id -> `FeatureInfo` mapping.
pub trait MapWriter {
/// Associates `info` with `feature_id`.
fn set(&mut self, feature_id: i64, info: FeatureInfo);
}
/// Read access to a feature-id -> `FeatureInfo` mapping.
pub trait MapReader {
/// Returns the info stored for `feature_id`, or `None` when there is none.
fn get(&self, feature_id: &i64) -> Option<&FeatureInfo>;
}
impl MapWriter for FeatureMapper {
/// Stores `info` under `feature_id` in the backing `HashMap`.
/// The value returned by `HashMap::insert` (any previous entry) is discarded.
fn set(&mut self, feature_id: i64, info: FeatureInfo) {
self.map.insert(feature_id, info);
}
}
impl MapReader for FeatureMapper {
/// Looks `feature_id` up in the backing `HashMap`; `None` when it was never set.
fn get(&self, feature_id: &i64) -> Option<&FeatureInfo> {
self.map.get(feature_id)
}
}

View File

@ -1,183 +0,0 @@
use serde::{Deserialize, Serialize};
use serde_json::Value;
/// Top-level document of a SegDense transform-spec JSON file.
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Root {
    pub common_prefix: String,
    pub densification_transform_spec: DensificationTransformSpec,
    pub identity_transform_spec: Vec<IdentityTransformSpec>,
    pub complex_feature_type_transform_spec: Vec<ComplexFeatureTypeTransformSpec>,
    /// Kept as untyped JSON. `util::load_from_parsed_config` requires it to be
    /// an object whose values are one-element arrays, each holding an `InputFeature`.
    pub input_features_map: Value,
}
/// The `densification_transform_spec` section of [`Root`]: one entry per
/// value kind (`discrete`, `cont`, `binary`, `string`, `blob`).
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DensificationTransformSpec {
    pub discrete: Discrete,
    pub cont: Cont,
    pub binary: Binary,
    /// Kept as untyped JSON for now; the original note here says to use `StringType`.
    pub string: Value,
    pub blob: Blob,
}
/// The `discrete` entry of [`DensificationTransformSpec`].
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
/// Field meanings are defined by the schema producer and are not documented here.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Discrete {
    pub tag: String,
    pub generic_feature_type: i64,
    pub feature_identifier: String,
    pub fixed_length: i64,
    pub default_value: DefaultValue,
    pub input_features: Vec<InputFeature>,
}
/// A `default_value` entry: a type name plus the value, both carried as strings.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DefaultValue {
    /// Stored under the JSON key `type`; `type` is a Rust keyword, hence the
    /// different field name.
    #[serde(rename = "type")]
    pub type_field: String,
    pub value: String,
}
/// One input-feature record of the schema.
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
/// `util::to_feature_info` reads `feature_id`, `feature_type`, `index` and
/// `maybe_exclude` to decide where (and whether) the feature is mapped.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct InputFeature {
    pub feature_id: i64,
    pub full_feature_name: String,
    pub feature_type: i64,
    /// Negative values cause the record to be skipped by `util::to_feature_info`.
    pub index: i64,
    /// When `true` the record is skipped by `util::to_feature_info`.
    pub maybe_exclude: bool,
    pub tag: String,
    pub added_at: i64,
}
/// The `cont` entry of [`DensificationTransformSpec`].
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
/// Field meanings are defined by the schema producer and are not documented here.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Cont {
    pub tag: String,
    pub generic_feature_type: i64,
    pub feature_identifier: String,
    pub fixed_length: i64,
    pub default_value: DefaultValue,
    pub input_features: Vec<InputFeature>,
}
/// The `binary` entry of [`DensificationTransformSpec`].
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
/// Field meanings are defined by the schema producer and are not documented here.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Binary {
    pub tag: String,
    pub generic_feature_type: i64,
    pub feature_identifier: String,
    pub fixed_length: i64,
    pub default_value: DefaultValue,
    pub input_features: Vec<InputFeature>,
}
/// Typed shape for a string densification entry.
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
// NOTE(review): not referenced by `DensificationTransformSpec`, whose `string`
// field is still untyped JSON.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct StringType {
    pub tag: String,
    pub generic_feature_type: i64,
    pub feature_identifier: String,
    pub fixed_length: i64,
    pub default_value: DefaultValue,
    pub input_features: Vec<InputFeature>,
}
/// The `blob` entry of [`DensificationTransformSpec`].
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Blob {
    pub tag: String,
    pub generic_feature_type: i64,
    pub feature_identifier: String,
    pub fixed_length: i64,
    pub default_value: DefaultValue,
    /// Unlike the other entries, the elements here are kept as untyped JSON.
    pub input_features: Vec<Value>,
}
/// One element of [`Root`]'s `identity_transform_spec` list.
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct IdentityTransformSpec {
    pub feature_id: i64,
    pub full_feature_name: String,
    pub feature_type: i64,
}
/// One element of [`Root`]'s `complex_feature_type_transform_spec` list.
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ComplexFeatureTypeTransformSpec {
    pub feature_id: i64,
    pub full_feature_name: String,
    pub feature_type: i64,
    pub index: i64,
    pub maybe_exclude: bool,
    pub tag: String,
    /// Optional in the schema.
    pub tensor_data_type: Option<i64>,
    pub added_at: i64,
    /// Falls back to an empty vector when the key is absent from the JSON.
    #[serde(default)]
    pub tensor_shape: Vec<i64>,
}
/// Typed shape of a record stored in `input_features_map`; it has the same
/// fields as [`InputFeature`].
///
/// Every field is (de)serialized under a JSON key equal to its Rust field name.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct InputFeatureMapRecord {
    pub feature_id: i64,
    pub full_feature_name: String,
    pub feature_type: i64,
    pub index: i64,
    pub maybe_exclude: bool,
    pub tag: String,
    pub added_at: i64,
}

View File

@ -1,159 +0,0 @@
use std::fs;
use log::{debug};
use serde_json::{Value, Map};
use crate::error::SegDenseError;
use crate::mapper::{FeatureMapper, FeatureInfo, MapWriter};
use crate::segdense_transform_spec_home_recap_2022::{self as seg_dense, InputFeature};
/// Reads the SegDense schema file at `file_name` and parses it.
///
/// # Panics
/// Panics with `Unable to load segdense file <name>: <error>` when the file
/// cannot be read, and with `Unable to parse segdense file <name>: <error>`
/// when its contents are rejected by [`parse`]. Use [`parse`] directly for a
/// non-panicking alternative.
pub fn load_config(file_name: &str) -> seg_dense::Root {
    // `unwrap_or_else` builds the message only on failure, whereas
    // `expect(&format!(..))` allocated it on every call. The text matches
    // what `expect` would have printed ("<message>: <error:?>").
    let json_str = fs::read_to_string(file_name).unwrap_or_else(|err| {
        panic!("Unable to load segdense file {}: {:?}", file_name, err)
    });
    parse(&json_str).unwrap_or_else(|err| {
        panic!("Unable to parse segdense file {}: {:?}", file_name, err)
    })
}
pub fn parse(json_str: &str) -> Result<seg_dense::Root, SegDenseError> {
let root: seg_dense::Root = serde_json::from_str(json_str)?;
return Ok(root);
}
/**
* Given a json string containing a seg dense schema create a feature mapper
* which is essentially:
*
* {feature-id -> (Tensor Index, Index of feature within the tensor)}
*
* Feature id : 64 bit hash of the feature name used in DataRecords.
*
* Tensor Index : A vector of tensors is passed to the model. Tensor
* index refers to the tensor this feature is part of.
*
* Index of feature in tensor : The tensors are vectors, the index of
* feature is the position to put the feature value.
*
* There are many assumptions made in this function that is very model specific.
* These assumptions are called out below and need to be schematized eventually.
*
* Call this once for each segdense schema and cache the FeatureMapper.
*/
pub fn safe_load_config(json_str: &str) -> Result<FeatureMapper, SegDenseError> {
let root = parse(json_str)?;
load_from_parsed_config(root)
}
/// Builds a [`FeatureMapper`] from a borrowed, already-parsed schema.
///
/// The schema is cloned because `load_from_parsed_config` takes it by value.
///
/// # Panics
/// Panics with `Error loading all_config.json - <error>` when
/// `load_from_parsed_config` fails.
pub fn load_from_parsed_config_ref(root: &seg_dense::Root) -> FeatureMapper {
    match load_from_parsed_config(root.clone()) {
        Ok(feature_mapper) => feature_mapper,
        Err(error) => panic!("Error loading all_config.json - {}", error),
    }
}
/// Builds a [`FeatureMapper`] from the `input_features_map` of a parsed schema.
///
/// `input_features_map` must be a JSON object in which every value is an
/// array holding exactly one serialized `InputFeature`. Each feature for which
/// [`to_feature_info`] yields a location is registered under its `feature_id`;
/// the others are skipped.
///
/// # Errors
/// * `SegDenseError::JsonMissingObject` - `input_features_map` is not an object.
/// * `SegDenseError::JsonMissingArray` - a value of that object is not an array.
/// * `SegDenseError::JsonArraySize` - such an array does not hold exactly one element.
/// * `SegDenseError::Json` - the element does not deserialize into an `InputFeature`.
// Perf note : make 'root' un-owned
pub fn load_from_parsed_config(root: seg_dense::Root) ->
    Result<FeatureMapper, SegDenseError> {
    let entries: Map<String, Value> = match root.input_features_map {
        Value::Object(entries) => entries,
        _ => return Err(SegDenseError::JsonMissingObject),
    };
    let mut fm: FeatureMapper = FeatureMapper::new();
    // `entries` is owned, so iterate it by value: every entry is moved out of
    // the map instead of being deep-cloned first.
    for (_name, item) in entries {
        let mut records = match item {
            Value::Array(records) => records,
            _ => return Err(SegDenseError::JsonArraySize).or(Err(SegDenseError::JsonMissingArray)),
        };
        // Exactly one element <=> a pop succeeds and leaves the vector empty.
        let record = match (records.pop(), records.is_empty()) {
            (Some(record), true) => record,
            _ => return Err(SegDenseError::JsonArraySize),
        };
        let input_feature: seg_dense::InputFeature = serde_json::from_value(record)?;
        if let Some(info) = to_feature_info(&input_feature) {
            debug!("{:?}", info);
            fm.set(input_feature.feature_id, info);
        }
    }
    Ok(fm)
}
/// Registers in `feature_mapper` every feature of `input_features` for which
/// [`to_feature_info`] yields a location; the other features are skipped.
#[allow(dead_code)] // Not called from anywhere in this module.
fn add_feature_info_to_mapper(feature_mapper: &mut FeatureMapper, input_features: &[InputFeature]) {
    for input_feature in input_features {
        if let Some(info) = to_feature_info(input_feature) {
            debug!("{:?}", info);
            feature_mapper.set(input_feature.feature_id, info);
        }
    }
}
/// Computes where `input_feature` lives in the model's input tensors.
///
/// Returns `None` when the feature is flagged `maybe_exclude`, when its
/// `index` is negative, or when it belongs to no known tensor. Otherwise
/// returns the tensor index together with the feature's `index`.
pub fn to_feature_info(input_feature: &seg_dense::InputFeature) -> Option<FeatureInfo> {
    if input_feature.maybe_exclude || input_feature.index < 0 {
        return None;
    }
    // This part needs to be schema driven.
    //
    // The tensor index selects one of the tensors
    //   [Continuous, Binary, Discrete, User_embedding, user_eng_embedding, author_embedding]
    // in exactly this order, which is hardcoded here and needs to be schematized.
    //
    // A feature name is mapped to a feature-id value, so each hardcoded id
    // below corresponds to one specific feature name.
    let tensor_slot: Option<i8> = match (input_feature.feature_id, input_feature.feature_type) {
        // user.timelines.twhin_user_follow_embeddings.twhin_user_follow_embeddings
        (-2550691008059411095, _) => Some(3),
        // user.timelines.twhin_user_engagement_embeddings.twhin_user_engagement_embeddings
        (5390650078733277231, _) => Some(4),
        // original_author.timelines.twhin_author_follow_embeddings.twhin_author_follow_embeddings
        (3223956748566688423, _) => Some(5),
        // feature_type : src/thrift/com/twitter/ml/api/data.thrift
        //   BINARY = 1, CONTINUOUS = 2, DISCRETE = 3
        // mapped to the slots in [Continuous, Binary, Discrete, ..]
        (_, 2) => Some(0),
        (_, 1) => Some(1),
        (_, 3) => Some(2),
        // Any other feature type: handle this case later.
        _ => None,
    };
    tensor_slot.map(|tensor_index| FeatureInfo {
        tensor_index,
        index_within_tensor: input_feature.index,
    })
}