
speedyk-005 / chunklet-py / 20378511984

19 Dec 2025 06:09PM UTC coverage: 86.588% (+4.8%) from 81.75%

Pull Request #7: Merge develop branch to main (github / web-flow)
Merge 81717401a into aeb37fd6a

464 of 550 new or added lines in 17 files covered. (84.36%)

1 existing line in 1 file now uncovered.

1317 of 1521 relevant lines covered (86.59%)

4.33 hits per line
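
These headline percentages follow directly from the counts above; a quick back-of-the-envelope recomputation (rounding as reported):

# Recompute the reported coverage ratios from the figures in the summary above
print(f"{1317 / 1521:.2%}")  # -> 86.59% of relevant lines covered
print(f"{464 / 550:.2%}")    # -> 84.36% of new or added lines covered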

Source File
/src/chunklet/document_chunker/processors/odt_processor.py (79.63% of lines covered)

Per the line-by-line report, the uncovered lines in this file are the ImportError fallbacks for the lazy odfpy imports and the oversized-paragraph branch inside extract_text().
from typing import Any, Generator

# odfpy is imported lazily (inside the methods that need it)

from chunklet.document_chunker.processors.base_processor import BaseProcessor


class ODTProcessor(BaseProcessor):
    """
    ODT extraction and processing utility using `odfpy`.

    Provides methods to extract text and metadata from ODT (OpenDocument Text) files
    and to process the extracted text into manageable chunks.

    This processor extracts **metadata** from the ODT document's **Dublin Core** and
    **OpenDocument** standard properties.

    For more details on ODF metadata fields and `odfpy` usage, refer to:
    https://odfpy.readthedocs.io/en/latest/
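
    Example (mirrors the __main__ demo at the end of this file; the sample path
    is illustrative):

        processor = ODTProcessor("samples/file-sample_100kB.odt")
        metadata = processor.extract_metadata()
        for block in processor.extract_text():
            print(block[:80])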
    """

    def __init__(self, file_path: str):
        """Initialize the ODTProcessor.

        Args:
            file_path (str): Path to the ODT file.
        """
        try:
            from odf.opendocument import load

            self._load_odf = load
        except ImportError as e:
            raise ImportError(
                "The 'odfpy' library is not installed. "
                "Please install it with 'pip install odfpy>=1.4.1' "
                "or install the document processing extras with "
                "'pip install chunklet-py[document]'"
            ) from e

        self.file_path = file_path
        self.doc = self._load_odf(self.file_path)

    def extract_metadata(self) -> dict[str, Any]:
        """Extracts metadata from the ODT file, focusing on Dublin Core and OpenDocument fields.

        Parses the document's metadata and field elements (title, creators, creation
        date, chapter, and author name). Only fields that are present in the document
        are included in the returned dictionary.

        Returns:
            dict[str, Any]: A dictionary containing any of the following keys:
                 - title
                 - author (from the Dublin Core creator field)
                 - creator (from the OpenDocument initial creator field)
                 - created (from the OpenDocument creation date)
                 - chapter
                 - authorname
                 - source (always present; the path of the input file)
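
        Example of a possible return value (illustrative values only):

            {
                "title": "Lorem ipsum",
                "author": "J. Doe",
                "created": "2018-07-10T09:21:00",
                "source": "samples/file-sample_100kB.odt",
            }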
        """
        try:
            from odf import text, meta, dc
        except ImportError as e:
            raise ImportError(
                "The 'odfpy' library is not installed. "
                "Please install it with 'pip install odfpy>=1.4.1' "
                "or install the document processing extras with "
                "'pip install chunklet-py[document]'"
            ) from e

        metadata = {}
        for field in [
            dc.Title,
            dc.Creator,
            meta.InitialCreator,
            meta.CreationDate,
            text.Chapter,
            text.AuthorName,
        ]:
            elems = self.doc.getElementsByType(field)
            value = "".join(
                node.data
                for e in elems
                for node in e.childNodes
                if node.nodeType == node.TEXT_NODE
            ).strip()
            if value:  # Only store if not empty
                key = field.__name__

                # To keep metadata uniform with the other processors
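                # (CreationDate -> created, Creator -> author, InitialCreator -> creator;
                #  any other element name is simply lower-cased, e.g. Title -> title,
                #  AuthorName -> authorname.)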
                key = "created" if key == "CreationDate" else key
                key = "author" if key == "Creator" else key
                key = "creator" if key == "InitialCreator" else key

                metadata[key.lower()] = value

        metadata["source"] = str(self.file_path)
        return metadata

    def extract_text(self) -> Generator[str, None, None]:
        """Extracts text content from ODT paragraphs, yielding chunks for efficient processing.

        Iterates through the paragraph elements in the document, extracting their text
        and buffering it into blocks of approximately 4000 characters. Yielding these
        page-like blocks keeps memory usage low for large documents and gives
        downstream code conveniently sized units for parallel processing.

        Yields:
            str: A block of text of up to roughly 4000 characters; a single paragraph
                longer than the limit is yielded on its own.
        """
        try:
            from odf import text
        except ImportError as e:
            raise ImportError(
                "The 'odfpy' library is not installed. "
                "Please install it with 'pip install odfpy>=1.4.1' "
                "or install the document processing extras with "
                "'pip install chunklet-py[document]'"
            ) from e

        current_chunk = []
        char_count = 0
        max_chunk_size = 4000

        for p_elem in self.doc.getElementsByType(text.P):
            para_text = "".join(
                node.data
                for node in p_elem.childNodes
                if node.nodeType == node.TEXT_NODE
            ).strip()
            if para_text:
                para_length = len(para_text)

                # If adding this paragraph would exceed the limit, yield current chunk
                if char_count + para_length > max_chunk_size and current_chunk:
                    yield "\n".join(current_chunk)
                    current_chunk = []
                    char_count = 0

                # If a single paragraph is longer than max_chunk_size, yield it as its own chunk
                if para_length > max_chunk_size:
                    if current_chunk:
                        yield "\n".join(current_chunk)
                        current_chunk = []
                        char_count = 0
                    yield para_text
                else:
                    current_chunk.append(para_text)
                    char_count += para_length

        # Yield any remaining content
        if current_chunk:
            yield "\n".join(current_chunk)


if __name__ == "__main__":  # pragma: no cover
    file_path = "samples/file-sample_100kB.odt"
    processor = ODTProcessor(file_path)

    # Extract metadata
    metadata = processor.extract_metadata()
    print("Metadata:")
    for key, value in metadata.items():
        print(f"{key}: {value}")

    print("\nText content preview:\n")
    for i, chunk in enumerate(processor.extract_text(), start=1):
        print(f"--- {i} ---")
        print(chunk, "...")
        print("\n --- \n")