• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 12712969950

10 Jan 2025 04:04PM UTC coverage: 91.26% (+0.2%) from 91.1%
12712969950

Pull #8605

github

web-flow
Merge 89b7ad1ba into 741ce5df5
Pull Request #8605: feat: add `RecursiveSplitter` component for `Document` preprocessing

8844 of 9691 relevant lines covered (91.26%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

38.64
haystack/components/converters/markdown.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import os
1✔
6
from pathlib import Path
1✔
7
from typing import Any, Dict, List, Optional, Union
1✔
8

9
from tqdm import tqdm
1✔
10

11
from haystack import Document, component, logging
1✔
12
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1✔
13
from haystack.dataclasses import ByteStream
1✔
14
from haystack.lazy_imports import LazyImport
1✔
15

16
with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports:
1✔
17
    from markdown_it import MarkdownIt
1✔
18
    from mdit_plain.renderer import RendererPlain
1✔
19

20

21
logger = logging.getLogger(__name__)
1✔
22

23

24
@component
class MarkdownToDocument:
    """
    Renders Markdown sources to plain text and wraps each result in a Document.

    Every source is read into a ByteStream, decoded as UTF-8 and rendered with a
    `MarkdownIt` parser that uses the `RendererPlain` renderer. Sources that cannot be
    read or rendered are skipped and a warning is logged.

    Usage example:
    ```python
    from haystack.components.converters import MarkdownToDocument
    from datetime import datetime

    converter = MarkdownToDocument()
    results = converter.run(sources=["path/to/sample.md"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the markdown file.'
    ```
    """

    def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False):
        """
        Create a MarkdownToDocument component.

        :param table_to_single_line:
            If True converts table contents into a single line.
        :param progress_bar:
            If True shows a progress bar when running.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        # Fail early with an installation hint when the optional Markdown packages are missing.
        markdown_conversion_imports.check()

        self.table_to_single_line = table_to_single_line
        self.progress_bar = progress_bar
        self.store_full_path = store_full_path

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts a list of Markdown files to Documents.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
            On key collisions the values passed in `meta` take precedence over the ByteStream's own `meta`.

        :returns:
            A dictionary with the following keys:
            - `documents`: List of created Documents
        """
        md_parser = MarkdownIt(renderer_cls=RendererPlain)
        if self.table_to_single_line:
            md_parser.enable("table")

        converted = []
        per_source_meta = normalize_metadata(meta=meta, sources_count=len(sources))

        progress = tqdm(
            zip(sources, per_source_meta),
            total=len(sources),
            desc="Converting markdown files to Documents",
            disable=not self.progress_bar,
        )
        for src, extra_meta in progress:
            # Step 1: load the raw bytes; an unreadable source is skipped, not fatal.
            try:
                stream = get_bytestream_from_source(src)
            except Exception as read_err:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=src, error=read_err)
                continue

            # Step 2: decode and render; a decoding or rendering failure also skips the source.
            try:
                markdown_text = stream.data.decode("utf-8")
                plain_text = md_parser.render(markdown_text)
            except Exception as render_err:
                logger.warning(
                    "Failed to extract text from {source}. Skipping it. Error: {error}",
                    source=src,
                    error=render_err,
                )
                continue

            # Caller-supplied metadata is unpacked last so it overrides the ByteStream's keys.
            doc_meta = {**stream.meta, **extra_meta}

            # Unless full paths are requested, keep only the file name of the ByteStream's path.
            if not self.store_full_path:
                source_path = stream.meta.get("file_path")
                if source_path:
                    doc_meta["file_path"] = os.path.basename(source_path)

            converted.append(Document(content=plain_text, meta=doc_meta))

        return {"documents": converted}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc