the-algorithm/navi/navi/src/onnx_model.rs


#[cfg(feature = "onnx")]
pub mod onnx {
    use crate::TensorReturnEnum;
    use crate::bootstrap::{TensorInput, TensorInputEnum};
    use crate::cli_args::{Args, ARGS, INPUTS, MODEL_SPECS, OUTPUTS};
    use crate::metrics::{self, CONVERTER_TIME_COLLECTOR};
    use crate::predict_service::Model;
    use crate::{MAX_NUM_INPUTS, MAX_NUM_OUTPUTS, META_INFO, utils};
    use anyhow::Result;
    use arrayvec::ArrayVec;
    use dr_transform::converter::{BatchPredictionRequestToTorchTensorConverter, Converter};
    use dr_transform::ort::environment::Environment;
    use dr_transform::ort::session::Session;
    use dr_transform::ort::tensor::InputTensor;
    use dr_transform::ort::{ExecutionProvider, GraphOptimizationLevel, LoggingLevel, SessionBuilder};
    use itertools::Itertools;
    use log::{debug, info};
    use serde_json::Value;
    use std::fmt::{Debug, Display};
    use std::sync::Arc;
    use std::{fmt, fs};
    use tokio::time::Instant;
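    // Process-wide ONNX Runtime environment shared by every session. When
    // global thread pool options are supplied, sessions built in new() below
    // disable their per-session pools and use these shared threads instead.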
    lazy_static! {
        pub static ref ENVIRONMENT: Arc<Environment> = Arc::new(
            Environment::builder()
                .with_name("onnx home")
                .with_log_level(LoggingLevel::Error)
                .with_global_thread_pool(ARGS.onnx_global_thread_pool_options.clone())
                .build()
                .unwrap()
        );
    }
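    /// A loaded ONNX model version plus the converter that turns the incoming
    /// string tensor into the raw input tensors the session expects.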
    #[derive(Debug)]
    pub struct OnnxModel {
        pub session: Session,
        pub model_idx: usize,
        pub version: i64,
        pub export_dir: String,
        pub output_filters: ArrayVec<usize, MAX_NUM_OUTPUTS>,
        pub input_converter: Box<dyn Converter>,
    }
    impl Display for OnnxModel {
        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
            write!(
                f,
                "idx: {}, onnx model_name:{}, version:{}, output_filters:{:?}, converter:{:}",
                self.model_idx,
                MODEL_SPECS[self.model_idx],
                self.version,
                self.output_filters,
                self.input_converter
            )
        }
    }
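    // Flush the ONNX Runtime profiling trace to disk when the model is
    // dropped, so a profile enabled via ARGS.profiling is not lost on
    // shutdown or model reload.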
    impl Drop for OnnxModel {
        fn drop(&mut self) {
            if ARGS.profiling.is_some() {
                self.session.end_profiling().map_or_else(
                    |e| {
                        info!("end profiling failed with error:{:?}", e);
                    },
                    |f| {
                        info!("profiling ended with file:{}", f);
                    },
                );
            }
        }
    }
    impl OnnxModel {
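        /// Map the configured output names for this model (OUTPUTS[idx]) to
        /// their positional indices in the session's output list, skipping
        /// any names the graph does not expose.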
        fn get_output_filters(session: &Session, idx: usize) -> ArrayVec<usize, MAX_NUM_OUTPUTS> {
            OUTPUTS[idx]
                .iter()
                .filter_map(|output| session.outputs.iter().position(|o| o.name == *output))
                .collect::<ArrayVec<usize, MAX_NUM_OUTPUTS>>()
        }
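        // Pick execution providers from the CLI choice on Linux, falling back
        // to CPU for unknown values; macOS builds are CPU-only.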
        #[cfg(target_os = "linux")]
        fn ep_choices() -> Vec<ExecutionProvider> {
            match ARGS.onnx_gpu_ep.as_deref() {
                Some("onednn") => vec![Self::ep_with_options(ExecutionProvider::onednn())],
                Some("tensorrt") => vec![Self::ep_with_options(ExecutionProvider::tensorrt())],
                Some("cuda") => vec![Self::ep_with_options(ExecutionProvider::cuda())],
                _ => vec![Self::ep_with_options(ExecutionProvider::cpu())],
            }
        }
        fn ep_with_options(mut ep: ExecutionProvider) -> ExecutionProvider {
            for (ref k, ref v) in ARGS.onnx_ep_options.clone() {
                ep = ep.with(k, v);
                info!("setting option:{} -> {} and now ep is:{:?}", k, v, ep);
            }
            ep
        }
        #[cfg(target_os = "macos")]
        fn ep_choices() -> Vec<ExecutionProvider> {
            vec![Self::ep_with_options(ExecutionProvider::cpu())]
        }
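        /// Load one model version from disk: build a session with the
        /// configured threading, memory-pattern, profiling, and
        /// execution-provider options, auto-detect input names when none are
        /// configured, and wire up the string-to-tensor converter.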
        pub fn new(idx: usize, version: String, model_config: &Value) -> Result<OnnxModel> {
            let export_dir = format!("{}/{}/model.onnx", ARGS.model_dir[idx], version);
            let meta_info = format!("{}/{}/{}", ARGS.model_dir[idx], version, META_INFO);
            let mut builder = SessionBuilder::new(&ENVIRONMENT)?
                .with_optimization_level(GraphOptimizationLevel::Level3)?
                .with_parallel_execution(ARGS.onnx_use_parallel_mode == "true")?;
            if ARGS.onnx_global_thread_pool_options.is_empty() {
                builder = builder
                    .with_inter_threads(
                        utils::get_config_or(
                            model_config,
                            "inter_op_parallelism",
                            &ARGS.inter_op_parallelism[idx],
                        )
                        .parse()?,
                    )?
                    .with_intra_threads(
                        utils::get_config_or(
                            model_config,
                            "intra_op_parallelism",
                            &ARGS.intra_op_parallelism[idx],
                        )
                        .parse()?,
                    )?;
            } else {
                builder = builder.with_disable_per_session_threads()?;
            }
            builder = builder
                .with_memory_pattern(ARGS.onnx_use_memory_pattern == "true")?
                .with_execution_providers(&OnnxModel::ep_choices())?;
            if let Some(p) = &ARGS.profiling {
                debug!("Enable profiling, writing to {}", *p);
                builder = builder.with_profiling(p)?;
            }
            let session = builder.with_model_from_file(&export_dir)?;
            info!(
                "inputs: {:?}, outputs: {:?}",
                session.inputs.iter().format(","),
                session.outputs.iter().format(",")
            );
            fs::read_to_string(&meta_info)
                .ok()
                .map(|info| info!("meta info:{}", info));
            let output_filters = OnnxModel::get_output_filters(&session, idx);
            let mut reporting_feature_ids: Vec<(i64, &str)> = vec![];
            let input_spec_cell = &INPUTS[idx];
            if input_spec_cell.get().is_none() {
                let input_spec = session
                    .inputs
                    .iter()
                    .map(|input| input.name.clone())
                    .collect::<ArrayVec<String, MAX_NUM_INPUTS>>();
                input_spec_cell.set(input_spec.clone()).map_or_else(
                    |_| info!("unable to set the input_spec for model {}", idx),
                    |_| info!("auto-detected and set the inputs: {:?}", input_spec),
                );
            }
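            // Feature ids to report per-feature metrics for, parsed from the
            // comma-separated CLI lists; a malformed id panics at startup.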
            ARGS.onnx_report_discrete_feature_ids
                .iter()
                .for_each(|ids| {
                    ids.split(',')
                        .filter(|s| !s.is_empty())
                        .map(|s| s.parse::<i64>().unwrap())
                        .for_each(|id| reporting_feature_ids.push((id, "discrete")))
                });
            ARGS.onnx_report_continuous_feature_ids
                .iter()
                .for_each(|ids| {
                    ids.split(',')
                        .filter(|s| !s.is_empty())
                        .map(|s| s.parse::<i64>().unwrap())
                        .for_each(|id| reporting_feature_ids.push((id, "continuous")))
                });
            let onnx_model = OnnxModel {
                session,
                model_idx: idx,
                version: Args::version_str_to_epoch(&version)?,
                export_dir,
                output_filters,
                input_converter: Box::new(BatchPredictionRequestToTorchTensorConverter::new(
                    &ARGS.model_dir[idx],
                    &version,
                    reporting_feature_ids,
                    Some(metrics::register_dynamic_metrics),
                )),
            };
            onnx_model.warmup()?;
            Ok(onnx_model)
        }
    }
    /// Currently we assume the input is a single string tensor.
    /// The string tensor is converted to the actual raw tensors.
    /// The converter we are using is very specific to Home.
    /// It reads a BatchDataRecord thrift and decodes it into a batch of raw
    /// input tensors. Navi then does server-side batching and feeds the
    /// result to the ONNX runtime.
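    // Data flow per prediction call, as described above:
    //   Vec<Vec<TensorInput>> --merge_batch--> one batched string tensor
    //   --Converter::convert--> (Vec<InputTensor>, batch_ends)
    //   --Session::run--> output tensors, split per request via batch_ends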
    impl Model for OnnxModel {
        //TODO: implement a generic online warmup for all runtimes
        fn warmup(&self) -> Result<()> {
            Ok(())
        }
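        // Returns one flat f32 buffer per configured output, together with
        // the flat offsets (batch_ends scaled by per-example output size)
        // used to slice results back out per request.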
        #[inline(always)]
        fn do_predict(
            &self,
            input_tensors: Vec<Vec<TensorInput>>,
            _: u64,
        ) -> (Vec<TensorReturnEnum>, Vec<Vec<usize>>) {
            let batched_tensors = TensorInputEnum::merge_batch(input_tensors);
            let (inputs, batch_ends): (Vec<Vec<InputTensor>>, Vec<Vec<usize>>) = batched_tensors
                .into_iter()
                .map(|batched_tensor| {
                    match batched_tensor.tensor_data {
                        TensorInputEnum::String(t) if ARGS.onnx_use_converter.is_some() => {
                            let start = Instant::now();
                            let (inputs, batch_ends) = self.input_converter.convert(t);
                            // info!("batch_ends:{:?}", batch_ends);
                            CONVERTER_TIME_COLLECTOR
                                .with_label_values(&[&MODEL_SPECS[self.model_idx()]])
                                .observe(
                                    start.elapsed().as_micros() as f64
                                        / (*batch_ends.last().unwrap() as f64),
                                );
                            (inputs, batch_ends)
                        }
                        _ => unimplemented!(),
                    }
                })
                .unzip();
            //invariant: we only support a single string input for now; will relax later
            assert_eq!(inputs.len(), 1);
            let output_tensors = self
                .session
                .run(inputs.into_iter().flatten().collect::<Vec<_>>())
                .unwrap();
            self.output_filters
                .iter()
                .map(|&idx| {
                    let mut size = 1usize;
                    let output = output_tensors[idx].try_extract::<f32>().unwrap();
                    for &dim in self.session.outputs[idx].dimensions.iter().flatten() {
                        size *= dim as usize;
                    }
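                    // scale each request's batch boundary by the per-example
                    // output size to get end offsets into the flat buffer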
                    let tensor_ends = batch_ends[0]
                        .iter()
                        .map(|&batch| batch * size)
                        .collect::<Vec<_>>();
                    (
                        //only works for batch major
                        //TODO: to_vec() is obviously wasteful, especially for large (GPU) batches.
                        //Will refactor to break up the output and return Vec<Vec<TensorScore>> here
                        TensorReturnEnum::FloatTensorReturn(Box::new(
                            output.view().as_slice().unwrap().to_vec(),
                        )),
                        tensor_ends,
                    )
                })
                .unzip()
        }
        #[inline(always)]
        fn model_idx(&self) -> usize {
            self.model_idx
        }
        #[inline(always)]
        fn version(&self) -> i64 {
            self.version
        }
    }
}