12712969950

Committed 10 Jan 2025 04:04PM UTC coverage: 91.26% (+0.2%) from 91.1%

Build # 12712969950

Build Type

Pull #8605

github

Committed by

web-flow

Commit Message

Merge 89b7ad1ba into 741ce5df5

Pull Request Pull Request #8605: feat: add `RecursiveSplitter` component for `Document` preprocessing

Run Details

8844 of 9691 relevant lines covered (91.26%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

38.64

haystack/components/converters/markdown.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from tqdm import tqdm

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport

with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports:
    from markdown_it import MarkdownIt
    from mdit_plain.renderer import RendererPlain


logger = logging.getLogger(__name__)


@component
class MarkdownToDocument:
    """
    Turns Markdown sources into plain-text Documents.

    Every source is read as a byte stream, decoded as UTF-8 and rendered to plain text with
    `markdown-it-py` using the `mdit_plain` renderer. Sources that cannot be read or rendered
    are skipped with a warning instead of aborting the whole run.

    Usage example:
    ```python
    from haystack.components.converters import MarkdownToDocument
    from datetime import datetime

    converter = MarkdownToDocument()
    results = converter.run(sources=["path/to/sample.md"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the markdown file.'
    ```
    """

    def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False):
        """
        Set up the converter.

        :param table_to_single_line:
            When True, the parser's `table` rule is enabled so that table contents are rendered
            into a single line.
        :param progress_bar:
            When True, a progress bar is displayed while sources are being converted.
        :param store_full_path:
            When True, the `file_path` metadata entry keeps the full path of the source file.
            When False, it is reduced to the file name only.
        """
        # Fail early with the install hint if the optional markdown packages are missing.
        markdown_conversion_imports.check()

        self.store_full_path = store_full_path
        self.progress_bar = progress_bar
        self.table_to_single_line = table_to_single_line

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Render each Markdown source to plain text and wrap it in a Document.

        :param sources:
            File paths or ByteStream objects to convert.
        :param meta:
            Optional metadata for the resulting Documents.
            Either one dictionary, whose content is added to the metadata of every produced Document,
            or a list of dictionaries. A list must have the same length as `sources`, because the
            two lists are zipped together.
            Metadata carried by ByteStream sources is included in the output Documents as well;
            on key clashes the values passed here take precedence.

        :returns:
            A dictionary with the following keys:
            - `documents`: List of created Documents
        """
        md_parser = MarkdownIt(renderer_cls=RendererPlain)
        if self.table_to_single_line:
            md_parser.enable("table")

        per_source_meta = normalize_metadata(meta=meta, sources_count=len(sources))
        converted = []

        progress = tqdm(
            zip(sources, per_source_meta),
            total=len(sources),
            desc="Converting markdown files to Documents",
            disable=not self.progress_bar,
        )
        for source, extra_meta in progress:
            # Step 1: obtain the raw bytes; an unreadable source is skipped, not fatal.
            try:
                stream = get_bytestream_from_source(source)
            except Exception as read_error:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=read_error)
                continue

            # Step 2: decode and render; decoding and rendering failures are handled alike.
            try:
                plain_text = md_parser.render(stream.data.decode("utf-8"))
            except Exception as render_error:
                logger.warning(
                    "Failed to extract text from {source}. Skipping it. Error: {error}",
                    source=source,
                    error=render_error,
                )
                continue

            # Step 3: stream metadata first, caller metadata second, so the caller wins on clashes.
            doc_meta = {**stream.meta, **extra_meta}

            if not self.store_full_path:
                original_path = stream.meta.get("file_path")
                if original_path:
                    # Keep only the file name when full paths must not be stored.
                    doc_meta["file_path"] = os.path.basename(original_path)

            converted.append(Document(content=plain_text, meta=doc_meta))

        return {"documents": converted}

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	import os	1✔
6	from pathlib import Path	1✔
7	from typing import Any, Dict, List, Optional, Union	1✔
8
9	from tqdm import tqdm	1✔
10
11	from haystack import Document, component, logging	1✔
12	from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata	1✔
13	from haystack.dataclasses import ByteStream	1✔
14	from haystack.lazy_imports import LazyImport	1✔
15
16	with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports:	1✔
17	from markdown_it import MarkdownIt	1✔
18	from mdit_plain.renderer import RendererPlain	1✔
19
20
21	logger = logging.getLogger(__name__)	1✔
22
23
24	@component	1✔
25	class MarkdownToDocument:	1✔
26	"""
27	Converts a Markdown file into a text Document.
28
29	Usage example:
30	```python
31	from haystack.components.converters import MarkdownToDocument
32	from datetime import datetime
33
34	converter = MarkdownToDocument()
35	results = converter.run(sources=["path/to/sample.md"], meta={"date_added": datetime.now().isoformat()})
36	documents = results["documents"]
37	print(documents[0].content)
38	# 'This is a text from the markdown file.'
39	```
40	"""
41
42	def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False):	1✔
43	"""
44	Create a MarkdownToDocument component.
45
46	:param table_to_single_line:
47	If True converts table contents into a single line.
48	:param progress_bar:
49	If True shows a progress bar when running.
50	:param store_full_path:
51	If True, the full path of the file is stored in the metadata of the document.
52	If False, only the file name is stored.
53	"""
54	markdown_conversion_imports.check()	×
55
56	self.table_to_single_line = table_to_single_line	×
57	self.progress_bar = progress_bar	×
58	self.store_full_path = store_full_path	×
59
60	@component.output_types(documents=List[Document])	1✔
61	def run(	1✔
62	self,
63	sources: List[Union[str, Path, ByteStream]],
64	meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
65	):
66	"""
67	Converts a list of Markdown files to Documents.
68
69	:param sources:
70	List of file paths or ByteStream objects.
71	:param meta:
72	Optional metadata to attach to the Documents.
73	This value can be either a list of dictionaries or a single dictionary.
74	If it's a single dictionary, its content is added to the metadata of all produced Documents.
75	If it's a list, the length of the list must match the number of sources, because the two lists will
76	be zipped.
77	If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
78
79	:returns:
80	A dictionary with the following keys:
81	- `documents`: List of created Documents
82	"""
83	parser = MarkdownIt(renderer_cls=RendererPlain)	×
84	if self.table_to_single_line:	×
85	parser.enable("table")	×
86
87	documents = []	×
88	meta_list = normalize_metadata(meta=meta, sources_count=len(sources))	×
89
90	for source, metadata in tqdm(	×
91	zip(sources, meta_list),
92	total=len(sources),
93	desc="Converting markdown files to Documents",
94	disable=not self.progress_bar,
95	):
96	try:	×
97	bytestream = get_bytestream_from_source(source)	×
98	except Exception as e:	×
99	logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)	×
100	continue	×
101	try:	×
102	file_content = bytestream.data.decode("utf-8")	×
103	text = parser.render(file_content)	×
104	except Exception as conversion_e:	×
105	logger.warning(	×
106	"Failed to extract text from {source}. Skipping it. Error: {error}",
107	source=source,
108	error=conversion_e,
109	)
110	continue	×
111
112	merged_metadata = {**bytestream.meta, **metadata}	×
113
114	if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):	×
115	merged_metadata["file_path"] = os.path.basename(file_path)	×
116
117	document = Document(content=text, meta=merged_metadata)	×
118	documents.append(document)	×
119
120	return {"documents": documents}	×

deepset-ai / haystack / 12712969950

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous