• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 12240140835

09 Dec 2024 04:39PM UTC coverage: 90.335% (+0.001%) from 90.334%
12240140835

Pull #8610

github

web-flow
Merge 3ff0aa0e9 into 6f983a22c
Pull Request #8610: chore: fixing `pylint` issues

8038 of 8898 relevant lines covered (90.33%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

39.13
haystack/components/converters/markdown.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import os
1✔
6
import warnings
1✔
7
from pathlib import Path
1✔
8
from typing import Any, Dict, List, Optional, Union
1✔
9

10
from tqdm import tqdm
1✔
11

12
from haystack import Document, component, logging
1✔
13
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1✔
14
from haystack.dataclasses import ByteStream
1✔
15
from haystack.lazy_imports import LazyImport
1✔
16

17
with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports:
1✔
18
    from markdown_it import MarkdownIt
1✔
19
    from mdit_plain.renderer import RendererPlain
1✔
20

21

22
# Logger from haystack's own `logging` module (see the imports above), which this
# file calls with keyword arguments that fill the `{...}` fields of the message.
logger = logging.getLogger(__name__)
1✔
23

24

25
@component
class MarkdownToDocument:
    """
    Converts a Markdown file into a text Document.

    Usage example:
    ```python
    from haystack.components.converters import MarkdownToDocument
    from datetime import datetime

    converter = MarkdownToDocument()
    results = converter.run(sources=["path/to/sample.md"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the markdown file.'
    ```
    """

    def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True):
        """
        Create a MarkdownToDocument component.

        :param table_to_single_line:
            If True converts table contents into a single line.
        :param progress_bar:
            If True shows a progress bar when running.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        # Fails early with the install hint if the optional Markdown dependencies are missing.
        markdown_conversion_imports.check()

        self.table_to_single_line = table_to_single_line
        self.progress_bar = progress_bar
        self.store_full_path = store_full_path

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts a list of Markdown files to Documents.

        Sources that cannot be read, decoded as UTF-8, or rendered are skipped with a logged
        warning instead of aborting the whole run, so the output may contain fewer Documents
        than there are sources.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
            When the same key is present in both, the value passed in `meta` takes precedence.

        :returns:
            A dictionary with the following keys:
            - `documents`: List of created Documents
        """
        parser = MarkdownIt(renderer_cls=RendererPlain)
        if self.table_to_single_line:
            parser.enable("table")

        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        # The upcoming default change only affects runs that store full paths, and it is a
        # property of the component configuration, not of an individual source: warn at most
        # once per call instead of once per converted file.
        if self.store_full_path:
            warnings.warn(
                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
                "storing only file names to improve privacy.",
                DeprecationWarning,
            )

        for source, metadata in tqdm(
            zip(sources, meta_list),
            total=len(sources),
            desc="Converting markdown files to Documents",
            disable=not self.progress_bar,
        ):
            # Best-effort conversion: a failing source is logged and skipped.
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue
            try:
                file_content = bytestream.data.decode("utf-8")
                text = parser.render(file_content)
            except Exception as conversion_e:
                logger.warning(
                    "Failed to extract text from {source}. Skipping it. Error: {error}",
                    source=source,
                    error=conversion_e,
                )
                continue

            # Caller-supplied metadata overrides the ByteStream's own metadata on key clashes.
            merged_metadata = {**bytestream.meta, **metadata}

            # Reduce the stored path to the bare file name when full paths are not wanted.
            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
                merged_metadata["file_path"] = os.path.basename(file_path)

            document = Document(content=text, meta=merged_metadata)
            documents.append(document)

        return {"documents": documents}
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc