• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 13634803133

03 Mar 2025 03:47PM UTC coverage: 90.124% (+0.1%) from 89.986%
13634803133

Pull #8906

github

web-flow
Merge e48e49114 into 1b2053b35
Pull Request #8906: refactor!: remove `dataframe` field from `Document` and `ExtractedTableAnswer`; make `pandas` optional

9536 of 10581 relevant lines covered (90.12%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.5
haystack/utils/filters.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
from dataclasses import fields
1✔
6
from datetime import datetime
1✔
7
from typing import Any, Dict, List, Optional
1✔
8

9
import dateutil.parser
1✔
10

11
from haystack.dataclasses import Document
1✔
12
from haystack.errors import FilterError
1✔
13

14

15
def raise_on_invalid_filter_syntax(filters: Optional[Dict[str, Any]] = None):
    """
    Raise an error if the filter syntax is invalid.

    Only the top-level shape of `filters` is checked. A filter is accepted when it is either:

    - a logic filter, carrying both the `operator` and `conditions` keys, or
    - a comparison filter, carrying the `field`, `operator` and `value` keys.

    These are the two shapes that `document_matches_filter` evaluates. Empty or missing
    filters are always accepted.

    :param filters: The filter dictionary to validate, or `None`.
    :raises FilterError: If `filters` is non-empty and has neither of the two shapes above.
    """
    if not filters:
        return

    has_operator = "operator" in filters
    is_logic_filter = has_operator and "conditions" in filters
    # A bare comparison is a valid top-level filter too; requiring 'conditions' would reject it.
    is_comparison_filter = has_operator and "field" in filters and "value" in filters
    if not (is_logic_filter or is_comparison_filter):
        msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
        raise FilterError(msg)
×
22

23

24
def document_matches_filter(filters: Dict[str, Any], document: Document) -> bool:
    """
    Evaluate `filters` against a single Document.

    A dictionary that has a `field` key is evaluated as a comparison; any other dictionary
    is evaluated as a logic condition.

    For a detailed specification of the filters, refer to the
    `DocumentStore.filter_documents()` protocol documentation.

    :param filters: The filter dictionary to evaluate.
    :param document: The Document the filter is evaluated against.
    :returns: The result of evaluating the filter for `document`.
    """
    evaluate = _comparison_condition if "field" in filters else _logic_condition
    return evaluate(filters, document)
1✔
34

35

36
def _and(document: Document, conditions: List[Dict[str, Any]]) -> bool:
    """Return True when every condition matches `document`; evaluation stops at the first miss."""
    for entry in conditions:
        if not _comparison_condition(entry, document):
            return False
    return True
1✔
38

39

40
def _or(document: Document, conditions: List[Dict[str, Any]]) -> bool:
    """Return True when at least one condition matches `document`; evaluation stops at the first hit."""
    for entry in conditions:
        if _comparison_condition(entry, document):
            return True
    return False
1✔
42

43

44
def _not(document: Document, conditions: List[Dict[str, Any]]) -> bool:
    """Return the negation of `_and`: True unless every condition matches `document`."""
    every_condition_matches = _and(document, conditions)
    return not every_condition_matches
1✔
46

47

48
# Dispatch table from a logic filter's `operator` string to the function that evaluates it.
# Each function is called as `fn(document, conditions)`.
LOGICAL_OPERATORS = {"NOT": _not, "OR": _or, "AND": _and}
1✔
49

50

51
def _equal(document_value: Any, filter_value: Any) -> bool:
1✔
52
    return document_value == filter_value
1✔
53

54

55
def _not_equal(document_value: Any, filter_value: Any) -> bool:
    """Return the negation of `_equal` for the same pair of values."""
    values_are_equal = _equal(document_value=document_value, filter_value=filter_value)
    return not values_are_equal
1✔
57

58

59
def _greater_than(document_value: Any, filter_value: Any) -> bool:
1✔
60
    if document_value is None or filter_value is None:
1✔
61
        # We can't compare None values reliably using operators '>', '>=', '<', '<='
62
        return False
1✔
63

64
    if isinstance(document_value, str) or isinstance(filter_value, str):
1✔
65
        try:
1✔
66
            document_value = _parse_date(document_value)
1✔
67
            filter_value = _parse_date(filter_value)
1✔
68
            document_value, filter_value = _ensure_both_dates_naive_or_aware(document_value, filter_value)
1✔
69
        except FilterError as exc:
1✔
70
            raise exc
1✔
71
    if isinstance(filter_value, list):
1✔
72
        msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
1✔
73
        raise FilterError(msg)
1✔
74
    return document_value > filter_value
1✔
75

76

77
def _parse_date(value):
1✔
78
    """Try parsing the value as an ISO format date, then fall back to dateutil.parser."""
79
    try:
1✔
80
        return datetime.fromisoformat(value)
1✔
81
    except (ValueError, TypeError):
1✔
82
        try:
1✔
83
            return dateutil.parser.parse(value)
1✔
84
        except (ValueError, TypeError) as exc:
1✔
85
            msg = (
1✔
86
                "Can't compare strings using operators '>', '>=', '<', '<='. "
87
                "Strings are only comparable if they are ISO formatted dates."
88
            )
89
            raise FilterError(msg) from exc
1✔
90

91

92
def _ensure_both_dates_naive_or_aware(date1: datetime, date2: datetime):
1✔
93
    """Ensure that both dates are either naive or aware."""
94
    # Both naive
95
    if date1.tzinfo is None and date2.tzinfo is None:
1✔
96
        return date1, date2
1✔
97

98
    # Both aware
99
    if date1.tzinfo is not None and date2.tzinfo is not None:
1✔
100
        return date1, date2
×
101

102
    # One naive, one aware
103
    if date1.tzinfo is None:
1✔
104
        date1 = date1.replace(tzinfo=date2.tzinfo)
×
105
    else:
106
        date2 = date2.replace(tzinfo=date1.tzinfo)
1✔
107
    return date1, date2
1✔
108

109

110
def _greater_than_equal(document_value: Any, filter_value: Any) -> bool:
1✔
111
    if document_value is None or filter_value is None:
1✔
112
        # We can't compare None values reliably using operators '>', '>=', '<', '<='
113
        return False
1✔
114

115
    return _equal(document_value=document_value, filter_value=filter_value) or _greater_than(
1✔
116
        document_value=document_value, filter_value=filter_value
117
    )
118

119

120
def _less_than(document_value: Any, filter_value: Any) -> bool:
1✔
121
    if document_value is None or filter_value is None:
1✔
122
        # We can't compare None values reliably using operators '>', '>=', '<', '<='
123
        return False
1✔
124

125
    return not _greater_than_equal(document_value=document_value, filter_value=filter_value)
1✔
126

127

128
def _less_than_equal(document_value: Any, filter_value: Any) -> bool:
1✔
129
    if document_value is None or filter_value is None:
1✔
130
        # We can't compare None values reliably using operators '>', '>=', '<', '<='
131
        return False
1✔
132

133
    return not _greater_than(document_value=document_value, filter_value=filter_value)
1✔
134

135

136
def _in(document_value: Any, filter_value: Any) -> bool:
1✔
137
    if not isinstance(filter_value, list):
1✔
138
        msg = (
1✔
139
            f"Filter value must be a `list` when using operator 'in' or 'not in', received type '{type(filter_value)}'"
140
        )
141
        raise FilterError(msg)
1✔
142
    return any(_equal(e, document_value) for e in filter_value)
1✔
143

144

145
def _not_in(document_value: Any, filter_value: Any) -> bool:
    """Return the negation of `_in`; a non-list `filter_value` raises FilterError there."""
    is_member = _in(document_value=document_value, filter_value=filter_value)
    return not is_member
1✔
147

148

149
# Dispatch table from a comparison filter's `operator` string to the function that evaluates it.
# Each function is called with the keyword arguments `document_value` and `filter_value`.
COMPARISON_OPERATORS = {
1✔
150
    "==": _equal,
151
    "!=": _not_equal,
152
    ">": _greater_than,
153
    ">=": _greater_than_equal,
154
    "<": _less_than,
155
    "<=": _less_than_equal,
156
    "in": _in,
157
    "not in": _not_in,
158
}
159

160

161
def _logic_condition(condition: Dict[str, Any], document: Document) -> bool:
    """
    Evaluate a logic condition against `document`.

    :param condition: A dictionary with an `operator` key naming one of `LOGICAL_OPERATORS`
        and a `conditions` key holding the nested conditions.
    :param document: The Document the condition is evaluated against.
    :returns: The result of applying the logical operator to the nested conditions.
    :raises FilterError: If the `operator` or `conditions` key is missing, or if the operator
        is not a known logical operator.
    """
    if "operator" not in condition:
        msg = f"'operator' key missing in {condition}"
        raise FilterError(msg)
    if "conditions" not in condition:
        msg = f"'conditions' key missing in {condition}"
        raise FilterError(msg)
    operator: str = condition["operator"]
    conditions: List[Dict[str, Any]] = condition["conditions"]

    # Report an unsupported operator as a filter problem rather than leaking a bare KeyError.
    evaluate = LOGICAL_OPERATORS.get(operator)
    if evaluate is None:
        msg = f"Unknown logical operator '{operator}' in {condition}. Valid operators are: {list(LOGICAL_OPERATORS)}"
        raise FilterError(msg)
    return evaluate(document, conditions)
1✔
171

172

173
def _comparison_condition(condition: Dict[str, Any], document: Document) -> bool:
    """
    Evaluate a comparison condition against `document`.

    A dictionary without a `field` key is handed over to `_logic_condition`.

    :param condition: A dictionary with the `field`, `operator` and `value` keys. `field` is
        either the name of a Document field, a dotted path such as 'meta.person.name', or a
        bare name that is looked up in `document.meta`.
    :param document: The Document the condition is evaluated against.
    :returns: The result of applying the comparison operator to the Document's value and the
        filter's value.
    :raises FilterError: If the `operator` or `value` key is missing, if the operator is not a
        known comparison operator, or if the operator rejects the values.
    """
    # Imported locally so this function does not depend on additional module-level imports.
    from collections.abc import Mapping

    if "field" not in condition:
        # 'field' key is only found in comparison dictionaries.
        # We assume this is a logic dictionary since it's not present.
        return _logic_condition(condition, document)
    field: str = condition["field"]

    if "operator" not in condition:
        msg = f"'operator' key missing in {condition}"
        raise FilterError(msg)
    if "value" not in condition:
        msg = f"'value' key missing in {condition}"
        raise FilterError(msg)

    if "." in field:
        # Handles fields formatted like so:
        # 'meta.person.name'
        parts = field.split(".")
        document_value = getattr(document, parts[0])
        for part in parts[1:]:
            # Only mappings can be descended into. Anything else (a string, a list, None, ...)
            # has no nested fields, and using `in` / `[]` on it would either raise or, for
            # strings, perform a substring test. If a field is not found we treat it as None.
            if not isinstance(document_value, Mapping) or part not in document_value:
                document_value = None
                break
            document_value = document_value[part]
    elif field not in [f.name for f in fields(document)]:
        # Converted legacy filters don't add the `meta.` prefix, so we assume
        # that all filter fields that are not actual fields in Document are converted
        # filters.
        #
        # We handle this to avoid breaking compatibility with converted legacy filters.
        # This will be removed as soon as we stop supporting legacy filters.
        document_value = document.meta.get(field)
    else:
        document_value = getattr(document, field)

    operator: str = condition["operator"]
    filter_value: Any = condition["value"]

    # Report an unsupported operator as a filter problem rather than leaking a bare KeyError.
    compare = COMPARISON_OPERATORS.get(operator)
    if compare is None:
        msg = (
            f"Unknown comparison operator '{operator}' in {condition}. "
            f"Valid operators are: {list(COMPARISON_OPERATORS)}"
        )
        raise FilterError(msg)
    return compare(filter_value=filter_value, document_value=document_value)
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc