• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 12298066419

12 Dec 2024 02:07PM UTC coverage: 90.408% (+0.06%) from 90.346%
12298066419

Pull #8522

github

web-flow
Merge 669550d36 into 04fc187bc
Pull Request #8522: feat: Add XLSXToDocument converter

8096 of 8955 relevant lines covered (90.41%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

84.88
haystack/components/converters/json.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import json
1✔
6
import os
1✔
7
from pathlib import Path
1✔
8
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
1✔
9

10
from haystack import component, default_from_dict, default_to_dict, logging
1✔
11
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1✔
12
from haystack.dataclasses import ByteStream, Document
1✔
13
from haystack.lazy_imports import LazyImport
1✔
14

15
# Module-level logger, namespaced to this module per Haystack convention.
logger = logging.getLogger(__name__)

# `jq` is an optional dependency: defer the import and only fail (with an
# install hint) when a jq_schema is actually used — see jq_import.check()
# in JSONConverter.__init__.
with LazyImport("Run 'pip install jq'") as jq_import:
    import jq

20

21
@component
class JSONConverter:
    """
    Converts one or more JSON files into a text document.

    ### Usage examples

    ```python
    import json

    from haystack.components.converters import JSONConverter
    from haystack.dataclasses import ByteStream

    source = ByteStream.from_string(json.dumps({"text": "This is the content of my document"}))

    converter = JSONConverter(content_key="text")
    results = converter.run(sources=[source])
    documents = results["documents"]
    print(documents[0].content)
    # 'This is the content of my document'
    ```

    Optionally, you can also provide a `jq_schema` string to filter the JSON source files and `extra_meta_fields`
    to extract from the filtered data:

    ```python
    import json

    from haystack.components.converters import JSONConverter
    from haystack.dataclasses import ByteStream

    data = {
        "laureates": [
            {
                "firstname": "Enrico",
                "surname": "Fermi",
                "motivation": "for his demonstrations of the existence of new radioactive elements produced "
                "by neutron irradiation, and for his related discovery of nuclear reactions brought about by"
                " slow neutrons",
            },
            {
                "firstname": "Rita",
                "surname": "Levi-Montalcini",
                "motivation": "for their discoveries of growth factors",
            },
        ],
    }
    source = ByteStream.from_string(json.dumps(data))
    converter = JSONConverter(
        jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
    )

    results = converter.run(sources=[source])
    documents = results["documents"]
    print(documents[0].content)
    # 'for his demonstrations of the existence of new radioactive elements produced by
    # neutron irradiation, and for his related discovery of nuclear reactions brought
    # about by slow neutrons'

    print(documents[0].meta)
    # {'firstname': 'Enrico', 'surname': 'Fermi'}

    print(documents[1].content)
    # 'for their discoveries of growth factors'

    print(documents[1].meta)
    # {'firstname': 'Rita', 'surname': 'Levi-Montalcini'}
    ```

    """

    def __init__(
        self,
        jq_schema: Optional[str] = None,
        content_key: Optional[str] = None,
        extra_meta_fields: Optional[Union[Set[str], Literal["*"]]] = None,
        store_full_path: bool = False,
    ):
        """
        Creates a JSONConverter component.

        An optional `jq_schema` can be provided to extract nested data in the JSON source files.
        See the [official jq documentation](https://jqlang.github.io/jq/) for more info on the filters syntax.
        If `jq_schema` is not set, whole JSON source files will be used to extract content.

        Optionally, you can provide a `content_key` to specify which key in the extracted object must
        be set as the document's content.

        If both `jq_schema` and `content_key` are set, the component will search for the `content_key` in
        the JSON object extracted by `jq_schema`. If the extracted data is not a JSON object, it will be skipped.

        If only `jq_schema` is set, the extracted data must be a scalar value. If it's a JSON object or array,
        it will be skipped.

        If only `content_key` is set, the source JSON file must be a JSON object, else it will be skipped.

        `extra_meta_fields` can either be set to a set of strings or a literal `"*"` string.
        If it's a set of strings, it must specify fields in the extracted objects that must be set in
        the extracted documents. If a field is not found, the meta value will be `None`.
        If set to `"*"`, all fields that are not `content_key` found in the filtered JSON object will
        be saved as metadata.

        Initialization will fail if neither `jq_schema` nor `content_key` are set.

        :param jq_schema:
            Optional jq filter string to extract content.
            If not specified, whole JSON object will be used to extract information.
        :param content_key:
            Optional key to extract document content.
            If `jq_schema` is specified, the `content_key` will be extracted from that object.
        :param extra_meta_fields:
            An optional set of meta keys to extract from the content.
            If `jq_schema` is specified, all keys will be extracted from that object.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        :raises ValueError:
            If neither `jq_schema` nor `content_key` is set.
        """
        self._compiled_filter = None
        if jq_schema:
            # `jq` is an optional dependency: only require it when a filter is used.
            jq_import.check()
            self._compiled_filter = jq.compile(jq_schema)

        self._jq_schema = jq_schema
        self._content_key = content_key
        self._meta_fields = extra_meta_fields
        self._store_full_path = store_full_path

        if self._compiled_filter is None and self._content_key is None:
            msg = "No `jq_schema` nor `content_key` specified. Set either or both to extract data."
            raise ValueError(msg)

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            jq_schema=self._jq_schema,
            content_key=self._content_key,
            extra_meta_fields=self._meta_fields,
            store_full_path=self._store_full_path,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "JSONConverter":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        return default_from_dict(cls, data)

    def _get_content_and_meta(self, source: ByteStream) -> List[Tuple[str, Dict[str, Any]]]:
        """
        Utility function to extract text and metadata from a JSON file.

        :param source:
            UTF-8 byte stream.
        :returns:
            Collection of text and metadata dict tuples, each corresponding
            to a different document. Empty if the source can't be decoded
            or the jq filter fails.
        """
        try:
            file_content = source.data.decode("utf-8")
        except UnicodeError as exc:
            logger.warning(
                "Failed to extract text from {source}. Skipping it. Error: {error}",
                source=source.meta["file_path"],
                error=exc,
            )
            # BUG FIX: previously execution fell through here with `file_content`
            # unbound, raising NameError below instead of skipping the source
            # as the warning promises. Skip it explicitly.
            return []

        meta_fields = self._meta_fields or set()

        if self._compiled_filter is not None:
            try:
                objects = list(self._compiled_filter.input_text(file_content))
            except Exception as exc:
                logger.warning(
                    "Failed to extract text from {source}. Skipping it. Error: {error}",
                    source=source.meta["file_path"],
                    error=exc,
                )
                return []
        else:
            # We just load the whole file as JSON if the user didn't provide a jq filter.
            # We put it in a list even if it's not to ease handling it later on.
            objects = [json.loads(file_content)]

        result = []
        if self._content_key is not None:
            for obj in objects:
                if not isinstance(obj, dict):
                    logger.warning("Expected a dictionary but got {obj}. Skipping it.", obj=obj)
                    continue
                if self._content_key not in obj:
                    logger.warning(
                        "'{content_key}' not found in {obj}. Skipping it.", content_key=self._content_key, obj=obj
                    )
                    continue

                text = obj[self._content_key]
                if isinstance(text, (dict, list)):
                    logger.warning("Expected a scalar value but got {obj}. Skipping it.", obj=obj)
                    continue

                meta = {}
                if meta_fields == "*":
                    # Keep every key except the one used as document content.
                    meta = {k: v for k, v in obj.items() if k != self._content_key}
                else:
                    # Explicit field set: missing fields are stored as None.
                    for field in meta_fields:
                        meta[field] = obj.get(field, None)
                result.append((text, meta))
        else:
            # No content_key: each filtered object must itself be a scalar.
            for obj in objects:
                if isinstance(obj, (dict, list)):
                    logger.warning("Expected a scalar value but got {obj}. Skipping it.", obj=obj)
                    continue
                result.append((str(obj), {}))

        return result

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts a list of JSON files to documents.

        :param sources:
            A list of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, the length of the list must match the number of sources.
            If `sources` contain ByteStream objects, their `meta` will be added to the output documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: A list of created documents.
        """
        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as exc:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=exc)
                continue

            data = self._get_content_and_meta(bytestream)

            for text, extra_meta in data:
                # Precedence (last wins): per-document extra meta overrides the
                # user-supplied meta, which overrides the stream's own meta.
                merged_metadata = {**bytestream.meta, **metadata, **extra_meta}

                if not self._store_full_path and (file_path := bytestream.meta.get("file_path")):
                    merged_metadata["file_path"] = os.path.basename(file_path)
                document = Document(content=text, meta=merged_metadata)
                documents.append(document)

        return {"documents": documents}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc