12240140835

Committed 09 Dec 2024 04:39PM UTC coverage: 90.335% (+0.001%) from 90.334%

Build # 12240140835

Build Type

Pull #8610

github

Committed by

web-flow

Commit Message

Merge 3ff0aa0e9 into 6f983a22c

Pull Request #8610: chore: fixing `pylint` issues

Run Details

8038 of 8898 relevant lines covered (90.33%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

39.13

haystack/components/converters/markdown.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from tqdm import tqdm

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport

# The Markdown parsing dependencies are imported inside a LazyImport context manager;
# MarkdownToDocument.__init__ calls `markdown_conversion_imports.check()` before they are used.
# NOTE(review): presumably a missing package is only reported when `check()` runs, using the
# install hint passed below -- confirm against haystack.lazy_imports.
with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports:
    from markdown_it import MarkdownIt
    from mdit_plain.renderer import RendererPlain


# Module-level logger named after this module.
logger = logging.getLogger(__name__)


@component
class MarkdownToDocument:
    """
    Converts Markdown files into plain-text Documents.

    Every source is decoded as UTF-8 and rendered to plain text with `markdown-it-py` using the
    `mdit_plain` renderer. Sources that cannot be read or converted are logged as warnings and skipped,
    so a single bad input does not abort the whole run.

    Usage example:
    ```python
    from haystack.components.converters import MarkdownToDocument
    from datetime import datetime

    converter = MarkdownToDocument()
    results = converter.run(sources=["path/to/sample.md"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the markdown file.'
    ```
    """

    def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True):
        """
        Create a MarkdownToDocument component.

        :param table_to_single_line:
            If True converts table contents into a single line (enables the parser's `table` rule).
        :param progress_bar:
            If True shows a progress bar when running.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        # Fail early if the optional Markdown dependencies are not available.
        markdown_conversion_imports.check()

        self.table_to_single_line = table_to_single_line
        self.progress_bar = progress_bar
        self.store_full_path = store_full_path

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts a list of Markdown files to Documents.

        A `DeprecationWarning` about the upcoming change of the `store_full_path` default is emitted
        once per call, and only when full paths are actually being stored.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
            On key collisions, values from `meta` take precedence over the ByteStream's own `meta`.

        :returns:
            A dictionary with the following keys:
            - `documents`: List of created Documents
        """
        parser = MarkdownIt(renderer_cls=RendererPlain)
        if self.table_to_single_line:
            parser.enable("table")

        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        # The warning does not depend on the individual source, so it is raised once per call instead of
        # once per converted file. It is skipped when the caller already opted into file-name-only
        # metadata, because the announced default change does not affect them.
        if self.store_full_path:
            warnings.warn(
                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
                "storing only file names to improve privacy.",
                DeprecationWarning,
                stacklevel=2,  # attribute the warning to the caller of run(), not to this module
            )

        for source, metadata in tqdm(
            zip(sources, meta_list),
            total=len(sources),
            desc="Converting markdown files to Documents",
            disable=not self.progress_bar,
        ):
            # Best-effort conversion: unreadable sources are reported and skipped.
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue
            # Decoding and rendering failures are handled the same way as read failures.
            try:
                file_content = bytestream.data.decode("utf-8")
                text = parser.render(file_content)
            except Exception as conversion_e:
                logger.warning(
                    "Failed to extract text from {source}. Skipping it. Error: {error}",
                    source=source,
                    error=conversion_e,
                )
                continue

            # User-supplied metadata overrides the ByteStream's own metadata on key collisions.
            merged_metadata = {**bytestream.meta, **metadata}

            # Reduce the stored path to the bare file name when full paths must not be kept.
            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
                merged_metadata["file_path"] = os.path.basename(file_path)

            document = Document(content=text, meta=merged_metadata)
            documents.append(document)

        return {"documents": documents}

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	import os	1✔
6	import warnings	1✔
7	from pathlib import Path	1✔
8	from typing import Any, Dict, List, Optional, Union	1✔
9
10	from tqdm import tqdm	1✔
11
12	from haystack import Document, component, logging	1✔
13	from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata	1✔
14	from haystack.dataclasses import ByteStream	1✔
15	from haystack.lazy_imports import LazyImport	1✔
16
17	with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports:	1✔
18	from markdown_it import MarkdownIt	1✔
19	from mdit_plain.renderer import RendererPlain	1✔
20
21
22	logger = logging.getLogger(__name__)	1✔
23
24
25	@component	1✔
26	class MarkdownToDocument:	1✔
27	"""
28	Converts a Markdown file into a text Document.
29
30	Usage example:
31	```python
32	from haystack.components.converters import MarkdownToDocument
33	from datetime import datetime
34
35	converter = MarkdownToDocument()
36	results = converter.run(sources=["path/to/sample.md"], meta={"date_added": datetime.now().isoformat()})
37	documents = results["documents"]
38	print(documents[0].content)
39	# 'This is a text from the markdown file.'
40	```
41	"""
42
43	def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True):	1✔
44	"""
45	Create a MarkdownToDocument component.
46
47	:param table_to_single_line:
48	If True converts table contents into a single line.
49	:param progress_bar:
50	If True shows a progress bar when running.
51	:param store_full_path:
52	If True, the full path of the file is stored in the metadata of the document.
53	If False, only the file name is stored.
54	"""
55	markdown_conversion_imports.check()	×
56
57	self.table_to_single_line = table_to_single_line	×
58	self.progress_bar = progress_bar	×
59	self.store_full_path = store_full_path	×
60
61	@component.output_types(documents=List[Document])	1✔
62	def run(	1✔
63	self,
64	sources: List[Union[str, Path, ByteStream]],
65	meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
66	):
67	"""
68	Converts a list of Markdown files to Documents.
69
70	:param sources:
71	List of file paths or ByteStream objects.
72	:param meta:
73	Optional metadata to attach to the Documents.
74	This value can be either a list of dictionaries or a single dictionary.
75	If it's a single dictionary, its content is added to the metadata of all produced Documents.
76	If it's a list, the length of the list must match the number of sources, because the two lists will
77	be zipped.
78	If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
79
80	:returns:
81	A dictionary with the following keys:
82	- `documents`: List of created Documents
83	"""
84	parser = MarkdownIt(renderer_cls=RendererPlain)	×
85	if self.table_to_single_line:	×
86	parser.enable("table")	×
87
88	documents = []	×
89	meta_list = normalize_metadata(meta=meta, sources_count=len(sources))	×
90
91	for source, metadata in tqdm(	×
92	zip(sources, meta_list),
93	total=len(sources),
94	desc="Converting markdown files to Documents",
95	disable=not self.progress_bar,
96	):
97	try:	×
98	bytestream = get_bytestream_from_source(source)	×
99	except Exception as e:	×
100	logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)	×
101	continue	×
102	try:	×
103	file_content = bytestream.data.decode("utf-8")	×
104	text = parser.render(file_content)	×
105	except Exception as conversion_e:	×
106	logger.warning(	×
107	"Failed to extract text from {source}. Skipping it. Error: {error}",
108	source=source,
109	error=conversion_e,
110	)
111	continue	×
112
113	merged_metadata = {**bytestream.meta, **metadata}	×
114
115	warnings.warn(	×
116	"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
117	"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
118	"storing only file names to improve privacy.",
119	DeprecationWarning,
120	)
121
122	if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):	×
123	merged_metadata["file_path"] = os.path.basename(file_path)	×
124
125	document = Document(content=text, meta=merged_metadata)	×
126	documents.append(document)	×
127
128	return {"documents": documents}	×

deepset-ai / haystack / 12240140835

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous