16935267080

Committed 13 Aug 2025 11:00AM UTC coverage: 24.312% (-63.3%) from 87.658%

Build # 16935267080

Build Type

Pull #4226

github

Committed by

web-flow

Commit Message

Merge 81b48c7fb into baa6ea202

Pull Request Pull Request #4226: Support converting TimestampTZ to and from duckdb

Run Details

0 of 2 new or added lines in 1 file covered. (0.0%)

20666 existing lines in 469 files now uncovered.

8726 of 35892 relevant lines covered (24.31%)

147.74 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/vortex-expr/src/pruning/pruning_expr.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::iter;

use itertools::Itertools;
use vortex_array::stats::Stat;
use vortex_dtype::{Field, FieldName, FieldPath, FieldPathSet};
use vortex_utils::aliases::hash_map::HashMap;

use super::relation::Relation;
use crate::{ExprRef, StatsCatalog, get_item, root};

pub type RequiredStats = Relation<FieldPath, Stat>;

// A catalog that return a stat column whenever it is required, tracking all accessed
// stats and returning them later.
#[derive(Default)]
struct TrackingStatsCatalog {
    usage: HashMap<(FieldPath, Stat), ExprRef>,
}

impl TrackingStatsCatalog {
    /// Consume the catalog, yielding a map of field statistics that were required
    /// for each expression.
    fn into_usages(self) -> HashMap<(FieldPath, Stat), ExprRef> {
        self.usage
    }
}

// A catalog that return a stat column if it exists in the given scope.
struct ScopeStatsCatalog<'a> {
    any_catalog: TrackingStatsCatalog,
    available_stats: &'a FieldPathSet,
}

impl StatsCatalog for ScopeStatsCatalog<'_> {
    fn stats_ref(&mut self, field_path: &FieldPath, stat: Stat) -> Option<ExprRef> {
        let stat_path = field_path.clone().push(stat.name());

        if self.available_stats.contains(&stat_path) {
            self.any_catalog.stats_ref(field_path, stat)
        } else {
            None
        }
    }
}

impl StatsCatalog for TrackingStatsCatalog {
    fn stats_ref(&mut self, field_path: &FieldPath, stat: Stat) -> Option<ExprRef> {
        let mut expr = root();
        let name = field_path_stat_field_name(field_path, stat);
        expr = get_item(name, expr);
        self.usage.insert((field_path.clone(), stat), expr.clone());
        Some(expr)
    }
}

#[doc(hidden)]
pub fn field_path_stat_field_name(field_path: &FieldPath, stat: Stat) -> FieldName {
    field_path
        .parts()
        .iter()
        .map(|f| match f {
            Field::Name(n) => n.as_ref(),
            Field::ElementType => todo!("element type not currently handled"),
        })
        .chain(iter::once(stat.name()))
        .join("_")
        .into()
}

/// Build a pruning expr mask, using an existing set of stats.
/// The available stats are provided as a set of [`FieldPath`].
///
/// A pruning expression is one that returns `true` for all positions where the original expression
/// cannot hold, and false if it cannot be determined from stats alone whether the positions can
/// be pruned.
///
/// If the falsification logic attempts to access an unknown stat,
/// this function will return `None`.
pub fn checked_pruning_expr(
    expr: &ExprRef,
    available_stats: &FieldPathSet,
) -> Option<(ExprRef, RequiredStats)> {
    let mut catalog = ScopeStatsCatalog {
        any_catalog: Default::default(),
        available_stats,
    };

    let expr = expr.stat_falsification(&mut catalog)?;

    // TODO(joe): filter access by used exprs
    let mut relation: Relation<FieldPath, Stat> = Relation::new();
    for ((field_path, stat), _) in catalog.any_catalog.into_usages() {
        relation.insert(field_path, stat)
    }

    Some((expr, relation))
}

#[cfg(test)]
mod tests {
    use rstest::{fixture, rstest};
    use vortex_array::stats::Stat;
    use vortex_dtype::{FieldName, FieldPath, FieldPathSet};

    use crate::pruning::pruning_expr::HashMap;
    use crate::pruning::{checked_pruning_expr, field_path_stat_field_name};
    use crate::{HashSet, and, col, eq, get_item, gt, gt_eq, lit, lt, lt_eq, not_eq, or, root};

    // Implement some checked pruning expressions.
    #[fixture]
    fn available_stats() -> FieldPathSet {
        let field_a = FieldPath::from_name("a");
        let field_b = FieldPath::from_name("b");

        FieldPathSet::from_iter([
            field_a.clone().push(Stat::Min.name()),
            field_a.push(Stat::Max.name()),
            field_b.clone().push(Stat::Min.name()),
            field_b.push(Stat::Max.name()),
        ])
    }

    #[rstest]
    pub fn pruning_equals(available_stats: FieldPathSet) {
        let name = FieldName::from("a");
        let literal_eq = lit(42);
        let eq_expr = eq(get_item("a", root()), literal_eq.clone());
        let (converted, _refs) = checked_pruning_expr(&eq_expr, &available_stats).unwrap();
        let expected_expr = or(
            gt(
                get_item(
                    field_path_stat_field_name(&FieldPath::from_name(name.clone()), Stat::Min),
                    root(),
                ),
                literal_eq.clone(),
            ),
            gt(
                literal_eq,
                col(field_path_stat_field_name(
                    &FieldPath::from_name(name),
                    Stat::Max,
                )),
            ),
        );
        assert_eq!(&converted, &expected_expr);
    }

    #[rstest]
    pub fn pruning_equals_column(available_stats: FieldPathSet) {
        let column = FieldName::from("a");
        let other_col = FieldName::from("b");
        let eq_expr = eq(col(column.clone()), col(other_col.clone()));

        let (converted, refs) = checked_pruning_expr(&eq_expr, &available_stats).unwrap();
        assert_eq!(
            refs.map(),
            &HashMap::from_iter([
                (
                    FieldPath::from_name(column.clone()),
                    HashSet::from_iter([Stat::Min, Stat::Max])
                ),
                (
                    FieldPath::from_name(other_col.clone()),
                    HashSet::from_iter([Stat::Max, Stat::Min])
                )
            ])
        );
        let expected_expr = or(
            gt(
                col(field_path_stat_field_name(
                    &FieldPath::from_name(column.clone()),
                    Stat::Min,
                )),
                col(field_path_stat_field_name(
                    &FieldPath::from_name(other_col.clone()),
                    Stat::Max,
                )),
            ),
            gt(
                col(field_path_stat_field_name(
                    &FieldPath::from_name(other_col),
                    Stat::Min,
                )),
                col(field_path_stat_field_name(
                    &FieldPath::from_name(column),
                    Stat::Max,
                )),
            ),
        );
        assert_eq!(&converted, &expected_expr);
    }

    #[rstest]
    pub fn pruning_not_equals_column(available_stats: FieldPathSet) {
        let column = FieldName::from("a");
        let other_col = FieldName::from("b");
        let not_eq_expr = not_eq(col(column.clone()), col(other_col.clone()));

        let (converted, refs) = checked_pruning_expr(&not_eq_expr, &available_stats).unwrap();
        assert_eq!(
            refs.map(),
            &HashMap::from_iter([
                (
                    FieldPath::from_name(column.clone()),
                    HashSet::from_iter([Stat::Min, Stat::Max])
                ),
                (
                    FieldPath::from_name(other_col.clone()),
                    HashSet::from_iter([Stat::Max, Stat::Min])
                )
            ])
        );
        let expected_expr = and(
            eq(
                col(field_path_stat_field_name(
                    &FieldPath::from_name(column.clone()),
                    Stat::Min,
                )),
                col(field_path_stat_field_name(
                    &FieldPath::from_name(other_col.clone()),
                    Stat::Max,
                )),
            ),
            eq(
                col(field_path_stat_field_name(
                    &FieldPath::from_name(column),
                    Stat::Max,
                )),
                col(field_path_stat_field_name(
                    &FieldPath::from_name(other_col),
                    Stat::Min,
                )),
            ),
        );

        assert_eq!(&converted, &expected_expr);
    }

    #[rstest]
    pub fn pruning_gt_column(available_stats: FieldPathSet) {
        let column = FieldName::from("a");
        let other_col = FieldName::from("b");
        let other_expr = col(other_col.clone());
        let not_eq_expr = gt(col(column.clone()), other_expr.clone());

        let (converted, refs) = checked_pruning_expr(&not_eq_expr, &available_stats).unwrap();
        assert_eq!(
            refs.map(),
            &HashMap::from_iter([
                (
                    FieldPath::from_name(column.clone()),
                    HashSet::from_iter([Stat::Max])
                ),
                (
                    FieldPath::from_name(other_col.clone()),
                    HashSet::from_iter([Stat::Min])
                )
            ])
        );
        let expected_expr = lt_eq(
            col(field_path_stat_field_name(
                &FieldPath::from_name(column),
                Stat::Max,
            )),
            col(field_path_stat_field_name(
                &FieldPath::from_name(other_col),
                Stat::Min,
            )),
        );
        assert_eq!(&converted, &expected_expr);
    }

    #[rstest]
    pub fn pruning_gt_value(available_stats: FieldPathSet) {
        let column = FieldName::from("a");
        let other_col = lit(42);
        let not_eq_expr = gt(col(column.clone()), other_col.clone());

        let (converted, refs) = checked_pruning_expr(&not_eq_expr, &available_stats).unwrap();
        assert_eq!(
            refs.map(),
            &HashMap::from_iter([(
                FieldPath::from_name(column.clone()),
                HashSet::from_iter([Stat::Max])
            ),])
        );
        let expected_expr = lt_eq(
            col(field_path_stat_field_name(
                &FieldPath::from_name(column),
                Stat::Max,
            )),
            other_col.clone(),
        );
        assert_eq!(&converted, &(expected_expr));
    }

    #[rstest]
    pub fn pruning_lt_column(available_stats: FieldPathSet) {
        let column = FieldName::from("a");
        let other_col = FieldName::from("b");
        let other_expr = col(other_col.clone());
        let not_eq_expr = lt(col(column.clone()), other_expr.clone());

        let (converted, refs) = checked_pruning_expr(&not_eq_expr, &available_stats).unwrap();
        assert_eq!(
            refs.map(),
            &HashMap::from_iter([
                (
                    FieldPath::from_name(column.clone()),
                    HashSet::from_iter([Stat::Min])
                ),
                (
                    FieldPath::from_name(other_col.clone()),
                    HashSet::from_iter([Stat::Max])
                )
            ])
        );
        let expected_expr = gt_eq(
            col(field_path_stat_field_name(
                &FieldPath::from_name(column),
                Stat::Min,
            )),
            col(field_path_stat_field_name(
                &FieldPath::from_name(other_col),
                Stat::Max,
            )),
        );
        assert_eq!(&converted, &expected_expr);
    }

    #[rstest]
    pub fn pruning_lt_value(available_stats: FieldPathSet) {
        // expression   => a < 42
        // pruning expr => a.min >= 42
        let expr = lt(col("a"), lit(42));

        let (converted, refs) = checked_pruning_expr(&expr, &available_stats).unwrap();
        assert_eq!(
            refs.map(),
            &HashMap::from_iter([(FieldPath::from_name("a"), HashSet::from_iter([Stat::Min]))])
        );
        assert_eq!(&converted, &gt_eq(col("a_min"), lit(42)));
    }

    #[rstest]
    fn pruning_identity(available_stats: FieldPathSet) {
        let expr = or(lt(col("a").clone(), lit(10)), gt(col("a").clone(), lit(50)));

        let (predicate, _) = checked_pruning_expr(&expr, &available_stats).unwrap();

        let expected_expr = and(gt_eq(col("a_min"), lit(10)), lt_eq(col("a_max"), lit(50)));
        assert_eq!(&predicate.to_string(), &expected_expr.to_string());
    }
    #[rstest]
    pub fn pruning_and_or_operators(available_stats: FieldPathSet) {
        // Test case: a > 10 AND a < 50
        let column = FieldName::from("a");
        let and_expr = and(gt(col(column.clone()), lit(10)), lt(col(column), lit(50)));
        let (predicate, _) = checked_pruning_expr(&and_expr, &available_stats).unwrap();

        // Expected: a_max <= 10 OR a_min >= 50
        assert_eq!(
            &predicate,
            &or(
                lt_eq(col(FieldName::from("a_max")), lit(10)),
                gt_eq(col(FieldName::from("a_min")), lit(50)),
            ),
        );
    }

    #[rstest]
    fn test_gt_eq_with_booleans(available_stats: FieldPathSet) {
        // Consider this unusual, but valid (in Arrow, BooleanArray implements ArrayOrd), filter expression:
        // x > (y > z)
        // The x column is a Boolean-valued column. The y and z columns are numeric. True > False.
        // Suppose we had a Vortex zone whose min/max statistics for each column were:
        // x: [True, True]
        // y: [1, 2]
        // z: [0, 2]
        // The pruning predicate will convert the aforementioned expression into:
        // x_max <= (y_min > z_min)
        // If we evaluate that pruning expression on our zone we get:
        // x_max <= (y_min > z_min)
        // x_max <= (1     > 0    )
        // x_max <= True
        // True <= True
        // True
        // If a pruning predicate evaluates to true then, as stated in PruningPredicate::evaluate:
        // > a true value means the chunk can be pruned.
        // But, the following record lies within the above intervals and *passes* the filter expression! We
        // cannot prune this zone because we need this record!
        // {x: True, y: 1, z: 2}
        // x > (y > z)
        // True > (1 > 2)
        // True > False
        // True
        let expr = gt_eq(col("x"), gt(col("y"), col("z")));
        assert!(checked_pruning_expr(&expr, &available_stats).is_none());
        // TODO(DK): a sufficiently complex pruner would produce: `x_max <= (y_max > z_min)`
    }

    #[fixture]
    fn available_stats_with_nans() -> FieldPathSet {
        let float_col = FieldPath::from_name("float_col");
        let int_col = FieldPath::from_name("int_col");

        FieldPathSet::from_iter([
            // Float columns will have a NaNCount.
            float_col.clone().push(Stat::Min.name()),
            float_col.clone().push(Stat::Max.name()),
            float_col.push(Stat::NaNCount.name()),
            // int columns will not have a NanCount serialized into the layout
            int_col.clone().push(Stat::Min.name()),
            int_col.push(Stat::Max.name()),
        ])
    }

    #[rstest]
    fn pruning_checks_nans(available_stats_with_nans: FieldPathSet) {
        let expr = gt_eq(col("float_col"), lit(f32::NAN));
        let (converted, _) = checked_pruning_expr(&expr, &available_stats_with_nans).unwrap();
        assert_eq!(
            &converted,
            &and(
                and(
                    eq(col("float_col_nan_count"), lit(0u64)),
                    // NaNCount of NaN is 1
                    eq(lit(1u64), lit(0u64)),
                ),
                // This is the standard conversion of the >= operator. Comparing NAN to a max
                // stat is nonsensical, as min/max stats ignore NaNs, but this should be short-circuited
                // by the previous check for nan_count anyway.
                lt(col("float_col_max"), lit(f32::NAN)),
            )
        );

        // One half of the expression requires NAN count check, the other half does not.
        let expr = and(
            gt(col("float_col"), lit(10f32)),
            lt(col("int_col"), lit(10)),
        );

        let (converted, _) = checked_pruning_expr(&expr, &available_stats_with_nans).unwrap();

        assert_eq!(
            &converted,
            &or(
                // NaNCount check is enforced for the float column
                and(
                    and(
                        eq(col("float_col_nan_count"), lit(0u64)),
                        // NanCount of a non-NaN float literal is 0
                        eq(lit(0u64), lit(0u64)),
                    ),
                    // We want the opposite: we can prune IF either one is false.
                    lt_eq(col("float_col_max"), lit(10f32)),
                ),
                // NanCount check is skipped for the int column
                gt_eq(col("int_col_min"), lit(10)),
            )
        )
    }
}

1	// SPDX-License-Identifier: Apache-2.0
2	// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4	use std::iter;
5
6	use itertools::Itertools;
7	use vortex_array::stats::Stat;
8	use vortex_dtype::{Field, FieldName, FieldPath, FieldPathSet};
9	use vortex_utils::aliases::hash_map::HashMap;
10
11	use super::relation::Relation;
12	use crate::{ExprRef, StatsCatalog, get_item, root};
13
14	pub type RequiredStats = Relation<FieldPath, Stat>;
15
16	// A catalog that return a stat column whenever it is required, tracking all accessed
17	// stats and returning them later.
18	#[derive(Default)]
19	struct TrackingStatsCatalog {
20	usage: HashMap<(FieldPath, Stat), ExprRef>,
21	}
22
23	impl TrackingStatsCatalog {
24	/// Consume the catalog, yielding a map of field statistics that were required
25	/// for each expression.
UNCOV 26	fn into_usages(self) -> HashMap<(FieldPath, Stat), ExprRef> {	×
UNCOV 27	self.usage	×
UNCOV 28	}	×
29	}
30
31	// A catalog that return a stat column if it exists in the given scope.
32	struct ScopeStatsCatalog<'a> {
33	any_catalog: TrackingStatsCatalog,
34	available_stats: &'a FieldPathSet,
35	}
36
37	impl StatsCatalog for ScopeStatsCatalog<'_> {
UNCOV 38	fn stats_ref(&mut self, field_path: &FieldPath, stat: Stat) -> Option<ExprRef> {	×
UNCOV 39	let stat_path = field_path.clone().push(stat.name());	×
40
UNCOV 41	if self.available_stats.contains(&stat_path) {	×
UNCOV 42	self.any_catalog.stats_ref(field_path, stat)	×
43	} else {
UNCOV 44	None	×
45	}
UNCOV 46	}	×
47	}
48
49	impl StatsCatalog for TrackingStatsCatalog {
UNCOV 50	fn stats_ref(&mut self, field_path: &FieldPath, stat: Stat) -> Option<ExprRef> {	×
UNCOV 51	let mut expr = root();	×
UNCOV 52	let name = field_path_stat_field_name(field_path, stat);	×
UNCOV 53	expr = get_item(name, expr);	×
UNCOV 54	self.usage.insert((field_path.clone(), stat), expr.clone());	×
UNCOV 55	Some(expr)	×
UNCOV 56	}	×
57	}
58
59	#[doc(hidden)]
UNCOV 60	pub fn field_path_stat_field_name(field_path: &FieldPath, stat: Stat) -> FieldName {	×
UNCOV 61	field_path	×
UNCOV 62	.parts()	×
UNCOV 63	.iter()	×
UNCOV 64	.map(\|f\| match f {	×
UNCOV 65	Field::Name(n) => n.as_ref(),	×
66	Field::ElementType => todo!("element type not currently handled"),	×
UNCOV 67	})	×
UNCOV 68	.chain(iter::once(stat.name()))	×
UNCOV 69	.join("_")	×
UNCOV 70	.into()	×
UNCOV 71	}	×
72
73	/// Build a pruning expr mask, using an existing set of stats.
74	/// The available stats are provided as a set of [`FieldPath`].
75	///
76	/// A pruning expression is one that returns `true` for all positions where the original expression
77	/// cannot hold, and false if it cannot be determined from stats alone whether the positions can
78	/// be pruned.
79	///
80	/// If the falsification logic attempts to access an unknown stat,
81	/// this function will return `None`.
UNCOV 82	pub fn checked_pruning_expr(	×
UNCOV 83	expr: &ExprRef,	×
UNCOV 84	available_stats: &FieldPathSet,	×
UNCOV 85	) -> Option<(ExprRef, RequiredStats)> {	×
UNCOV 86	let mut catalog = ScopeStatsCatalog {	×
UNCOV 87	any_catalog: Default::default(),	×
UNCOV 88	available_stats,	×
UNCOV 89	};	×
90
UNCOV 91	let expr = expr.stat_falsification(&mut catalog)?;	×
92
93	// TODO(joe): filter access by used exprs
UNCOV 94	let mut relation: Relation<FieldPath, Stat> = Relation::new();	×
UNCOV 95	for ((field_path, stat), _) in catalog.any_catalog.into_usages() {	×
UNCOV 96	relation.insert(field_path, stat)	×
97	}
98
UNCOV 99	Some((expr, relation))	×
UNCOV 100	}	×
101
102	#[cfg(test)]
103	mod tests {
104	use rstest::{fixture, rstest};
105	use vortex_array::stats::Stat;
106	use vortex_dtype::{FieldName, FieldPath, FieldPathSet};
107
108	use crate::pruning::pruning_expr::HashMap;
109	use crate::pruning::{checked_pruning_expr, field_path_stat_field_name};
110	use crate::{HashSet, and, col, eq, get_item, gt, gt_eq, lit, lt, lt_eq, not_eq, or, root};
111
112	// Implement some checked pruning expressions.
113	#[fixture]
114	fn available_stats() -> FieldPathSet {
115	let field_a = FieldPath::from_name("a");
116	let field_b = FieldPath::from_name("b");
117
118	FieldPathSet::from_iter([
119	field_a.clone().push(Stat::Min.name()),
120	field_a.push(Stat::Max.name()),
121	field_b.clone().push(Stat::Min.name()),
122	field_b.push(Stat::Max.name()),
123	])
124	}
125
126	#[rstest]
127	pub fn pruning_equals(available_stats: FieldPathSet) {
128	let name = FieldName::from("a");
129	let literal_eq = lit(42);
130	let eq_expr = eq(get_item("a", root()), literal_eq.clone());
131	let (converted, _refs) = checked_pruning_expr(&eq_expr, &available_stats).unwrap();
132	let expected_expr = or(
133	gt(
134	get_item(
135	field_path_stat_field_name(&FieldPath::from_name(name.clone()), Stat::Min),
136	root(),
137	),
138	literal_eq.clone(),
139	),
140	gt(
141	literal_eq,
142	col(field_path_stat_field_name(
143	&FieldPath::from_name(name),
144	Stat::Max,
145	)),
146	),
147	);
148	assert_eq!(&converted, &expected_expr);
149	}
150
151	#[rstest]
152	pub fn pruning_equals_column(available_stats: FieldPathSet) {
153	let column = FieldName::from("a");
154	let other_col = FieldName::from("b");
155	let eq_expr = eq(col(column.clone()), col(other_col.clone()));
156
157	let (converted, refs) = checked_pruning_expr(&eq_expr, &available_stats).unwrap();
158	assert_eq!(
159	refs.map(),
160	&HashMap::from_iter([
161	(
162	FieldPath::from_name(column.clone()),
163	HashSet::from_iter([Stat::Min, Stat::Max])
164	),
165	(
166	FieldPath::from_name(other_col.clone()),
167	HashSet::from_iter([Stat::Max, Stat::Min])
168	)
169	])
170	);
171	let expected_expr = or(
172	gt(
173	col(field_path_stat_field_name(
174	&FieldPath::from_name(column.clone()),
175	Stat::Min,
176	)),
177	col(field_path_stat_field_name(
178	&FieldPath::from_name(other_col.clone()),
179	Stat::Max,
180	)),
181	),
182	gt(
183	col(field_path_stat_field_name(
184	&FieldPath::from_name(other_col),
185	Stat::Min,
186	)),
187	col(field_path_stat_field_name(
188	&FieldPath::from_name(column),
189	Stat::Max,
190	)),
191	),
192	);
193	assert_eq!(&converted, &expected_expr);
194	}
195
196	#[rstest]
197	pub fn pruning_not_equals_column(available_stats: FieldPathSet) {
198	let column = FieldName::from("a");
199	let other_col = FieldName::from("b");
200	let not_eq_expr = not_eq(col(column.clone()), col(other_col.clone()));
201
202	let (converted, refs) = checked_pruning_expr(&not_eq_expr, &available_stats).unwrap();
203	assert_eq!(
204	refs.map(),
205	&HashMap::from_iter([
206	(
207	FieldPath::from_name(column.clone()),
208	HashSet::from_iter([Stat::Min, Stat::Max])
209	),
210	(
211	FieldPath::from_name(other_col.clone()),
212	HashSet::from_iter([Stat::Max, Stat::Min])
213	)
214	])
215	);
216	let expected_expr = and(
217	eq(
218	col(field_path_stat_field_name(
219	&FieldPath::from_name(column.clone()),
220	Stat::Min,
221	)),
222	col(field_path_stat_field_name(
223	&FieldPath::from_name(other_col.clone()),
224	Stat::Max,
225	)),
226	),
227	eq(
228	col(field_path_stat_field_name(
229	&FieldPath::from_name(column),
230	Stat::Max,
231	)),
232	col(field_path_stat_field_name(
233	&FieldPath::from_name(other_col),
234	Stat::Min,
235	)),
236	),
237	);
238
239	assert_eq!(&converted, &expected_expr);
240	}
241
242	#[rstest]
243	pub fn pruning_gt_column(available_stats: FieldPathSet) {
244	let column = FieldName::from("a");
245	let other_col = FieldName::from("b");
246	let other_expr = col(other_col.clone());
247	let not_eq_expr = gt(col(column.clone()), other_expr.clone());
248
249	let (converted, refs) = checked_pruning_expr(&not_eq_expr, &available_stats).unwrap();
250	assert_eq!(
251	refs.map(),
252	&HashMap::from_iter([
253	(
254	FieldPath::from_name(column.clone()),
255	HashSet::from_iter([Stat::Max])
256	),
257	(
258	FieldPath::from_name(other_col.clone()),
259	HashSet::from_iter([Stat::Min])
260	)
261	])
262	);
263	let expected_expr = lt_eq(
264	col(field_path_stat_field_name(
265	&FieldPath::from_name(column),
266	Stat::Max,
267	)),
268	col(field_path_stat_field_name(
269	&FieldPath::from_name(other_col),
270	Stat::Min,
271	)),
272	);
273	assert_eq!(&converted, &expected_expr);
274	}
275
276	#[rstest]
277	pub fn pruning_gt_value(available_stats: FieldPathSet) {
278	let column = FieldName::from("a");
279	let other_col = lit(42);
280	let not_eq_expr = gt(col(column.clone()), other_col.clone());
281
282	let (converted, refs) = checked_pruning_expr(&not_eq_expr, &available_stats).unwrap();
283	assert_eq!(
284	refs.map(),
285	&HashMap::from_iter([(
286	FieldPath::from_name(column.clone()),
287	HashSet::from_iter([Stat::Max])
288	),])
289	);
290	let expected_expr = lt_eq(
291	col(field_path_stat_field_name(
292	&FieldPath::from_name(column),
293	Stat::Max,
294	)),
295	other_col.clone(),
296	);
297	assert_eq!(&converted, &(expected_expr));
298	}
299
300	#[rstest]
301	pub fn pruning_lt_column(available_stats: FieldPathSet) {
302	let column = FieldName::from("a");
303	let other_col = FieldName::from("b");
304	let other_expr = col(other_col.clone());
305	let not_eq_expr = lt(col(column.clone()), other_expr.clone());
306
307	let (converted, refs) = checked_pruning_expr(&not_eq_expr, &available_stats).unwrap();
308	assert_eq!(
309	refs.map(),
310	&HashMap::from_iter([
311	(
312	FieldPath::from_name(column.clone()),
313	HashSet::from_iter([Stat::Min])
314	),
315	(
316	FieldPath::from_name(other_col.clone()),
317	HashSet::from_iter([Stat::Max])
318	)
319	])
320	);
321	let expected_expr = gt_eq(
322	col(field_path_stat_field_name(
323	&FieldPath::from_name(column),
324	Stat::Min,
325	)),
326	col(field_path_stat_field_name(
327	&FieldPath::from_name(other_col),
328	Stat::Max,
329	)),
330	);
331	assert_eq!(&converted, &expected_expr);
332	}
333
334	#[rstest]
335	pub fn pruning_lt_value(available_stats: FieldPathSet) {
336	// expression => a < 42
337	// pruning expr => a.min >= 42
338	let expr = lt(col("a"), lit(42));
339
340	let (converted, refs) = checked_pruning_expr(&expr, &available_stats).unwrap();
341	assert_eq!(
342	refs.map(),
343	&HashMap::from_iter([(FieldPath::from_name("a"), HashSet::from_iter([Stat::Min]))])
344	);
345	assert_eq!(&converted, &gt_eq(col("a_min"), lit(42)));
346	}
347
348	#[rstest]
349	fn pruning_identity(available_stats: FieldPathSet) {
350	let expr = or(lt(col("a").clone(), lit(10)), gt(col("a").clone(), lit(50)));
351
352	let (predicate, _) = checked_pruning_expr(&expr, &available_stats).unwrap();
353
354	let expected_expr = and(gt_eq(col("a_min"), lit(10)), lt_eq(col("a_max"), lit(50)));
355	assert_eq!(&predicate.to_string(), &expected_expr.to_string());
356	}
357	#[rstest]
358	pub fn pruning_and_or_operators(available_stats: FieldPathSet) {
359	// Test case: a > 10 AND a < 50
360	let column = FieldName::from("a");
361	let and_expr = and(gt(col(column.clone()), lit(10)), lt(col(column), lit(50)));
362	let (predicate, _) = checked_pruning_expr(&and_expr, &available_stats).unwrap();
363
364	// Expected: a_max <= 10 OR a_min >= 50
365	assert_eq!(
366	&predicate,
367	&or(
368	lt_eq(col(FieldName::from("a_max")), lit(10)),
369	gt_eq(col(FieldName::from("a_min")), lit(50)),
370	),
371	);
372	}
373
374	#[rstest]
375	fn test_gt_eq_with_booleans(available_stats: FieldPathSet) {
376	// Consider this unusual, but valid (in Arrow, BooleanArray implements ArrayOrd), filter expression:
377	// x > (y > z)
378	// The x column is a Boolean-valued column. The y and z columns are numeric. True > False.
379	// Suppose we had a Vortex zone whose min/max statistics for each column were:
380	// x: [True, True]
381	// y: [1, 2]
382	// z: [0, 2]
383	// The pruning predicate will convert the aforementioned expression into:
384	// x_max <= (y_min > z_min)
385	// If we evaluate that pruning expression on our zone we get:
386	// x_max <= (y_min > z_min)
387	// x_max <= (1 > 0 )
388	// x_max <= True
389	// True <= True
390	// True
391	// If a pruning predicate evaluates to true then, as stated in PruningPredicate::evaluate:
392	// > a true value means the chunk can be pruned.
393	// But, the following record lies within the above intervals and passes the filter expression! We
394	// cannot prune this zone because we need this record!
395	// {x: True, y: 1, z: 2}
396	// x > (y > z)
397	// True > (1 > 2)
398	// True > False
399	// True
400	let expr = gt_eq(col("x"), gt(col("y"), col("z")));
401	assert!(checked_pruning_expr(&expr, &available_stats).is_none());
402	// TODO(DK): a sufficiently complex pruner would produce: `x_max <= (y_max > z_min)`
403	}
404
405	#[fixture]
406	fn available_stats_with_nans() -> FieldPathSet {
407	let float_col = FieldPath::from_name("float_col");
408	let int_col = FieldPath::from_name("int_col");
409
410	FieldPathSet::from_iter([
411	// Float columns will have a NaNCount.
412	float_col.clone().push(Stat::Min.name()),
413	float_col.clone().push(Stat::Max.name()),
414	float_col.push(Stat::NaNCount.name()),
415	// int columns will not have a NanCount serialized into the layout
416	int_col.clone().push(Stat::Min.name()),
417	int_col.push(Stat::Max.name()),
418	])
419	}
420
421	#[rstest]
422	fn pruning_checks_nans(available_stats_with_nans: FieldPathSet) {
423	let expr = gt_eq(col("float_col"), lit(f32::NAN));
424	let (converted, _) = checked_pruning_expr(&expr, &available_stats_with_nans).unwrap();
425	assert_eq!(
426	&converted,
427	&and(
428	and(
429	eq(col("float_col_nan_count"), lit(0u64)),
430	// NaNCount of NaN is 1
431	eq(lit(1u64), lit(0u64)),
432	),
433	// This is the standard conversion of the >= operator. Comparing NAN to a max
434	// stat is nonsensical, as min/max stats ignore NaNs, but this should be short-circuited
435	// by the previous check for nan_count anyway.
436	lt(col("float_col_max"), lit(f32::NAN)),
437	)
438	);
439
440	// One half of the expression requires NAN count check, the other half does not.
441	let expr = and(
442	gt(col("float_col"), lit(10f32)),
443	lt(col("int_col"), lit(10)),
444	);
445
446	let (converted, _) = checked_pruning_expr(&expr, &available_stats_with_nans).unwrap();
447
448	assert_eq!(
449	&converted,
450	&or(
451	// NaNCount check is enforced for the float column
452	and(
453	and(
454	eq(col("float_col_nan_count"), lit(0u64)),
455	// NanCount of a non-NaN float literal is 0
456	eq(lit(0u64), lit(0u64)),
457	),
458	// We want the opposite: we can prune IF either one is false.
459	lt_eq(col("float_col_max"), lit(10f32)),
460	),
461	// NanCount check is skipped for the int column
462	gt_eq(col("int_col_min"), lit(10)),
463	)
464	)
465	}
466	}

vortex-data / vortex / 16935267080

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous