• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 12298066419

12 Dec 2024 02:07PM UTC coverage: 90.408% (+0.06%) from 90.346%
12298066419

Pull #8522

github

web-flow
Merge 669550d36 into 04fc187bc
Pull Request #8522: feat: Add XLSXToDocument converter

8096 of 8955 relevant lines covered (90.41%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

84.88
haystack/components/converters/json.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import json
1✔
6
import os
1✔
7
from pathlib import Path
1✔
8
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
1✔
9

10
from haystack import component, default_from_dict, default_to_dict, logging
1✔
11
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1✔
12
from haystack.dataclasses import ByteStream, Document
1✔
13
from haystack.lazy_imports import LazyImport
1✔
14

15
# Module-level logger, namespaced to this module per Haystack convention.
logger = logging.getLogger(__name__)

# `jq` is an optional dependency: defer the import and only fail (with an
# install hint) when a jq_schema is actually used — see jq_import.check()
# in JSONConverter.__init__.
with LazyImport("Run 'pip install jq'") as jq_import:
    import jq

20

21
@component
class JSONConverter:
    """
    Converts one or more JSON files into a text document.

    ### Usage examples

    ```python
    import json

    from haystack.components.converters import JSONConverter
    from haystack.dataclasses import ByteStream

    source = ByteStream.from_string(json.dumps({"text": "This is the content of my document"}))

    converter = JSONConverter(content_key="text")
    results = converter.run(sources=[source])
    documents = results["documents"]
    print(documents[0].content)
    # 'This is the content of my document'
    ```

    Optionally, you can also provide a `jq_schema` string to filter the JSON source files and `extra_meta_fields`
    to extract from the filtered data:

    ```python
    import json

    from haystack.components.converters import JSONConverter
    from haystack.dataclasses import ByteStream

    data = {
        "laureates": [
            {
                "firstname": "Enrico",
                "surname": "Fermi",
                "motivation": "for his demonstrations of the existence of new radioactive elements produced "
                "by neutron irradiation, and for his related discovery of nuclear reactions brought about by"
                " slow neutrons",
            },
            {
                "firstname": "Rita",
                "surname": "Levi-Montalcini",
                "motivation": "for their discoveries of growth factors",
            },
        ],
    }
    source = ByteStream.from_string(json.dumps(data))
    converter = JSONConverter(
        jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
    )

    results = converter.run(sources=[source])
    documents = results["documents"]
    print(documents[0].content)
    # 'for his demonstrations of the existence of new radioactive elements produced by
    # neutron irradiation, and for his related discovery of nuclear reactions brought
    # about by slow neutrons'

    print(documents[0].meta)
    # {'firstname': 'Enrico', 'surname': 'Fermi'}

    print(documents[1].content)
    # 'for their discoveries of growth factors'

    print(documents[1].meta)
    # {'firstname': 'Rita', 'surname': 'Levi-Montalcini'}
    ```

    """

    def __init__(
        self,
        jq_schema: Optional[str] = None,
        content_key: Optional[str] = None,
        extra_meta_fields: Optional[Union[Set[str], Literal["*"]]] = None,
        store_full_path: bool = False,
    ):
        """
        Creates a JSONConverter component.

        An optional `jq_schema` can be provided to extract nested data in the JSON source files.
        See the [official jq documentation](https://jqlang.github.io/jq/) for more info on the filters syntax.
        If `jq_schema` is not set, whole JSON source files will be used to extract content.

        Optionally, you can provide a `content_key` to specify which key in the extracted object must
        be set as the document's content.

        If both `jq_schema` and `content_key` are set, the component will search for the `content_key` in
        the JSON object extracted by `jq_schema`. If the extracted data is not a JSON object, it will be skipped.

        If only `jq_schema` is set, the extracted data must be a scalar value. If it's a JSON object or array,
        it will be skipped.

        If only `content_key` is set, the source JSON file must be a JSON object, else it will be skipped.

        `extra_meta_fields` can either be set to a set of strings or a literal `"*"` string.
        If it's a set of strings, it must specify fields in the extracted objects that must be set in
        the extracted documents. If a field is not found, the meta value will be `None`.
        If set to `"*"`, all fields that are not `content_key` found in the filtered JSON object will
        be saved as metadata.

        Initialization will fail if neither `jq_schema` nor `content_key` are set.

        :param jq_schema:
            Optional jq filter string to extract content.
            If not specified, whole JSON object will be used to extract information.
        :param content_key:
            Optional key to extract document content.
            If `jq_schema` is specified, the `content_key` will be extracted from that object.
        :param extra_meta_fields:
            An optional set of meta keys to extract from the content.
            If `jq_schema` is specified, all keys will be extracted from that object.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        :raises ValueError:
            If neither `jq_schema` nor `content_key` is set.
        """
        self._compiled_filter = None
        if jq_schema:
            # `jq` is an optional dependency: only require it when a filter is used.
            jq_import.check()
            self._compiled_filter = jq.compile(jq_schema)

        self._jq_schema = jq_schema
        self._content_key = content_key
        self._meta_fields = extra_meta_fields
        self._store_full_path = store_full_path

        if self._compiled_filter is None and self._content_key is None:
            msg = "No `jq_schema` nor `content_key` specified. Set either or both to extract data."
            raise ValueError(msg)

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            jq_schema=self._jq_schema,
            content_key=self._content_key,
            extra_meta_fields=self._meta_fields,
            store_full_path=self._store_full_path,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "JSONConverter":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        return default_from_dict(cls, data)

    def _get_content_and_meta(self, source: ByteStream) -> List[Tuple[str, Dict[str, Any]]]:
        """
        Utility function to extract text and metadata from a JSON file.

        :param source:
            UTF-8 byte stream.
        :returns:
            Collection of text and metadata dict tuples, each corresponding
            to a different document. Empty if the source can't be decoded
            or the jq filter fails.
        """
        try:
            file_content = source.data.decode("utf-8")
        except UnicodeError as exc:
            logger.warning(
                "Failed to extract text from {source}. Skipping it. Error: {error}",
                source=source.meta["file_path"],
                error=exc,
            )
            # BUG FIX: previously execution fell through here with `file_content`
            # unbound, raising NameError below instead of skipping the source
            # as the warning promises. Skip it explicitly.
            return []

        meta_fields = self._meta_fields or set()

        if self._compiled_filter is not None:
            try:
                objects = list(self._compiled_filter.input_text(file_content))
            except Exception as exc:
                logger.warning(
                    "Failed to extract text from {source}. Skipping it. Error: {error}",
                    source=source.meta["file_path"],
                    error=exc,
                )
                return []
        else:
            # We just load the whole file as JSON if the user didn't provide a jq filter.
            # We put it in a list even if it's not to ease handling it later on.
            objects = [json.loads(file_content)]

        result = []
        if self._content_key is not None:
            for obj in objects:
                if not isinstance(obj, dict):
                    logger.warning("Expected a dictionary but got {obj}. Skipping it.", obj=obj)
                    continue
                if self._content_key not in obj:
                    logger.warning(
                        "'{content_key}' not found in {obj}. Skipping it.", content_key=self._content_key, obj=obj
                    )
                    continue

                text = obj[self._content_key]
                if isinstance(text, (dict, list)):
                    logger.warning("Expected a scalar value but got {obj}. Skipping it.", obj=obj)
                    continue

                meta = {}
                if meta_fields == "*":
                    # Keep every key except the one used as document content.
                    meta = {k: v for k, v in obj.items() if k != self._content_key}
                else:
                    # Explicit field set: missing fields are stored as None.
                    for field in meta_fields:
                        meta[field] = obj.get(field, None)
                result.append((text, meta))
        else:
            # No content_key: each filtered object must itself be a scalar.
            for obj in objects:
                if isinstance(obj, (dict, list)):
                    logger.warning("Expected a scalar value but got {obj}. Skipping it.", obj=obj)
                    continue
                result.append((str(obj), {}))

        return result

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts a list of JSON files to documents.

        :param sources:
            A list of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, the length of the list must match the number of sources.
            If `sources` contain ByteStream objects, their `meta` will be added to the output documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: A list of created documents.
        """
        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as exc:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=exc)
                continue

            data = self._get_content_and_meta(bytestream)

            for text, extra_meta in data:
                # Precedence (last wins): per-document extra meta overrides the
                # user-supplied meta, which overrides the stream's own meta.
                merged_metadata = {**bytestream.meta, **metadata, **extra_meta}

                if not self._store_full_path and (file_path := bytestream.meta.get("file_path")):
                    merged_metadata["file_path"] = os.path.basename(file_path)
                document = Document(content=text, meta=merged_metadata)
                documents.append(document)

        return {"documents": documents}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc