5972853941

Committed 25 Aug 2023 06:52AM UTC coverage: 76.247% (+0.8%) from 75.446%

Build # 5972853941

Build Type

push

github

Committed by

web-flow

Commit Message

feat: make probabilistic optimizations optional and tunable in the YAML config (#1912)

Probabilistic optimization sacrifices accuracy in order to reduce memory consumption. In certain parts of the pipeline, a Bloom Filter is used ([set_processor](https://github.com/getdozer/dozer/blob/2e3ba96c3f4bdf9a691747191ab15617564d8ca2/dozer-sql/src/pipeline/product/set/set_processor.rs#L20)), while in other parts, hash tables that store only the hash of the keys instead of the full keys are used ([aggregation_processor](https://github.com/getdozer/dozer/blob/2e3ba96c3f4bdf9a691747191ab15617564d8ca2/dozer-sql/src/pipeline/aggregation/processor.rs#L59) and [join_processor](https://github.com/getdozer/dozer/blob/2e3ba96c3f4bdf9a691747191ab15617564d8ca2/dozer-sql/src/pipeline/product/join/operator.rs#L57-L58)).

This commit makes these optimizations disabled by default and offers user-configurable flags to enable each of these optimizations separately.

This is an example of how to turn on probabilistic optimizations for each processor in the Dozer configuration.

```
flags:
  enable_probabilistic_optimizations:
    in_sets: true # enable probabilistic optimizations in set operations (UNION, EXCEPT, INTERSECT); Default: false
    in_joins: true # enable probabilistic optimizations in JOIN operations; Default: false
    in_aggregations: true # enable probabilistic optimizations in aggregations (SUM, COUNT, MIN, etc.); Default: false
```

Run Details

347 of 347 new or added lines in 25 files covered. (100.0%)

47165 of 61858 relevant lines covered (76.25%)

48442.96 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

62.07

/dozer-sql/src/pipeline/product/set/operator.rs

use super::record_map::{CountingRecordMap, CountingRecordMapEnum};
use crate::pipeline::errors::PipelineError;
use dozer_core::processor_record::ProcessorRecord;
use sqlparser::ast::{SetOperator, SetQuantifier};

/// The kind of change a set operation receives for a record and may emit
/// downstream.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SetAction {
    /// The record is being added.
    Insert,
    /// The record is being removed.
    Delete,
    // NOTE: an `Update` variant is currently not part of this enum.
}

/// A SQL set operation described by its operator and its quantifier.
#[derive(Debug, Clone)]
pub struct SetOperation {
    /// The set operator to apply (for example `SetOperator::Union`).
    pub op: SetOperator,
    /// The quantifier attached to the operator (for example
    /// `SetQuantifier::All` or `SetQuantifier::None`).
    pub quantifier: SetQuantifier,
}

impl SetOperation {
    /// Creates a set operation for `op` with the quantifier set to
    /// `SetQuantifier::None`.
    pub fn _new(op: SetOperator) -> Self {
        Self {
            op,
            quantifier: SetQuantifier::None,
        }
    }

    /// Applies `action` for `record` and returns the changes to emit.
    ///
    /// * `UNION ALL` forwards every `(action, record)` pair unchanged and
    ///   does not touch `record_map`.
    /// * `UNION` (no quantifier) de-duplicates through `record_map`; see
    ///   `union_insert` and `union_delete`.
    ///
    /// # Errors
    ///
    /// Returns `PipelineError::InvalidOperandType` carrying the operator's
    /// string form for every other operator/quantifier combination.
    pub fn execute(
        &self,
        action: SetAction,
        record: ProcessorRecord,
        record_map: &mut CountingRecordMapEnum,
    ) -> Result<Vec<(SetAction, ProcessorRecord)>, PipelineError> {
        match (self.op, self.quantifier) {
            (SetOperator::Union, SetQuantifier::All) => Ok(vec![(action, record)]),
            (SetOperator::Union, SetQuantifier::None) => {
                self.execute_union(action, record, record_map)
            }
            _ => Err(PipelineError::InvalidOperandType(self.op.to_string())),
        }
    }

    /// Dispatches a de-duplicating `UNION` to the handler for `action`.
    fn execute_union(
        &self,
        action: SetAction,
        record: ProcessorRecord,
        record_map: &mut CountingRecordMapEnum,
    ) -> Result<Vec<(SetAction, ProcessorRecord)>, PipelineError> {
        match action {
            SetAction::Insert => self.union_insert(action, record, record_map),
            SetAction::Delete => self.union_delete(action, record, record_map),
        }
    }

    /// Registers `record` in `record_map` and emits it only when the count
    /// reported afterwards is exactly 1; otherwise nothing is emitted.
    fn union_insert(
        &self,
        action: SetAction,
        record: ProcessorRecord,
        record_map: &mut CountingRecordMapEnum,
    ) -> Result<Vec<(SetAction, ProcessorRecord)>, PipelineError> {
        // Borrow for the bookkeeping so the record can be moved into the
        // output afterwards without cloning it.
        let count = self.update_map(&record, false, record_map);
        Ok(if count == 1 {
            vec![(action, record)]
        } else {
            vec![]
        })
    }

    /// Removes `record` from `record_map` and emits it only when the count
    /// reported afterwards is 0; otherwise nothing is emitted.
    fn union_delete(
        &self,
        action: SetAction,
        record: ProcessorRecord,
        record_map: &mut CountingRecordMapEnum,
    ) -> Result<Vec<(SetAction, ProcessorRecord)>, PipelineError> {
        let count = self.update_map(&record, true, record_map);
        Ok(if count == 0 {
            vec![(action, record)]
        } else {
            vec![]
        })
    }

    /// Removes (`decr == true`) or inserts (`decr == false`) `record` in
    /// `record_map`, then returns `record_map.estimate_count(record)`.
    ///
    /// NOTE(review): the value comes from `estimate_count`, so it is
    /// presumably approximate for some record map implementations — confirm
    /// against `CountingRecordMapEnum`.
    fn update_map(
        &self,
        record: &ProcessorRecord,
        decr: bool,
        record_map: &mut CountingRecordMapEnum,
    ) -> u64 {
        if decr {
            record_map.remove(record);
        } else {
            record_map.insert(record);
        }

        record_map.estimate_count(record)
    }
}

1	use super::record_map::{CountingRecordMap, CountingRecordMapEnum};
2	use crate::pipeline::errors::PipelineError;
3	use dozer_core::processor_record::ProcessorRecord;
4	use sqlparser::ast::{SetOperator, SetQuantifier};
5
6	#[derive(Clone, Debug, PartialEq, Eq, Copy)]	×
7	pub enum SetAction {
8	Insert,
9	Delete,
10	// Update,
11	}
12
13	#[derive(Clone, Debug)]	×
14	pub struct SetOperation {
15	pub op: SetOperator,
16	pub quantifier: SetQuantifier,
17	}
18
19	impl SetOperation {
20	pub fn _new(op: SetOperator) -> Self {	×
21	Self {	×
22	op,	×
23	quantifier: SetQuantifier::None,	×
24	}	×
25	}	×
26
27	pub fn execute(
28	&self,
29	action: SetAction,
30	record: ProcessorRecord,
31	record_map: &mut CountingRecordMapEnum,
32	) -> Result<Vec<(SetAction, ProcessorRecord)>, PipelineError> {
33	match (self.op, self.quantifier) {	560✔
34	(SetOperator::Union, SetQuantifier::All) => Ok(vec![(action, record)]),	280✔
35	(SetOperator::Union, SetQuantifier::None) => {
36	self.execute_union(action, record, record_map)	280✔
37	}
38	_ => Err(PipelineError::InvalidOperandType(self.op.to_string())),	×
39	}
40	}	560✔
41
42	fn execute_union(	280✔
43	&self,	280✔
44	action: SetAction,	280✔
45	record: ProcessorRecord,	280✔
46	record_map: &mut CountingRecordMapEnum,	280✔
47	) -> Result<Vec<(SetAction, ProcessorRecord)>, PipelineError> {	280✔
48	match action {	280✔
49	SetAction::Insert => self.union_insert(action, record, record_map),	280✔
50	SetAction::Delete => self.union_delete(action, record, record_map),	×
51	}
52	}	280✔
53
54	fn union_insert(	280✔
55	&self,	280✔
56	action: SetAction,	280✔
57	record: ProcessorRecord,	280✔
58	record_map: &mut CountingRecordMapEnum,	280✔
59	) -> Result<Vec<(SetAction, ProcessorRecord)>, PipelineError> {	280✔
60	let _count = self.update_map(record.clone(), false, record_map);	280✔
61	if _count == 1 {	280✔
62	Ok(vec![(action, record)])	140✔
63	} else {
64	Ok(vec![])	140✔
65	}
66	}	280✔
67
68	fn union_delete(	×
69	&self,	×
70	action: SetAction,	×
71	record: ProcessorRecord,	×
72	record_map: &mut CountingRecordMapEnum,	×
73	) -> Result<Vec<(SetAction, ProcessorRecord)>, PipelineError> {	×
74	let _count = self.update_map(record.clone(), true, record_map);	×
75	if _count == 0 {	×
76	Ok(vec![(action, record)])	×
77	} else {
78	Ok(vec![])	×
79	}
80	}	×
81
82	fn update_map(	280✔
83	&self,	280✔
84	record: ProcessorRecord,	280✔
85	decr: bool,	280✔
86	record_map: &mut CountingRecordMapEnum,	280✔
87	) -> u64 {	280✔
88	if decr {	280✔
89	record_map.remove(&record);	×
90	} else {	280✔
91	record_map.insert(&record);	280✔
92	}	280✔
93
94	record_map.estimate_count(&record)	280✔
95	}	280✔
96	}

getdozer / dozer / 5972853941

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous