
speedyk-005 / chunklet-py / 20378511984

19 Dec 2025 06:09PM UTC coverage: 86.588% (+4.8%) from 81.75%

Pull Request #7: Merge develop branch to main (github / web-flow)
Merge 81717401a into aeb37fd6a

464 of 550 new or added lines in 17 files covered. (84.36%)

1 existing line in 1 file now uncovered.

1317 of 1521 relevant lines covered (86.59%)

4.33 hits per line
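
These headline percentages follow directly from the counts above; a quick back-of-the-envelope recomputation (rounding as reported):

# Recompute the reported coverage ratios from the figures in the summary above
print(f"{1317 / 1521:.2%}")  # -> 86.59% of relevant lines covered
print(f"{464 / 550:.2%}")    # -> 84.36% of new or added lines covered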

Source File
/src/chunklet/document_chunker/processors/odt_processor.py (79.63% of lines covered)

Per the line-by-line report, the uncovered lines in this file are the ImportError fallbacks for the lazy odfpy imports and the oversized-paragraph branch inside extract_text().
from typing import Any, Generator

# odfpy is imported lazily (inside the methods that need it)

from chunklet.document_chunker.processors.base_processor import BaseProcessor


class ODTProcessor(BaseProcessor):
    """
    ODT extraction and processing utility using `odfpy`.

    Provides methods to extract text and metadata from ODT (OpenDocument Text) files
    and to process the extracted text into manageable chunks.

    This processor extracts **metadata** from the ODT document's **Dublin Core** and
    **OpenDocument** standard properties.

    For more details on ODF metadata fields and `odfpy` usage, refer to:
    https://odfpy.readthedocs.io/en/latest/
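
    Example (mirrors the __main__ demo at the end of this file; the sample path
    is illustrative):

        processor = ODTProcessor("samples/file-sample_100kB.odt")
        metadata = processor.extract_metadata()
        for block in processor.extract_text():
            print(block[:80])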
    """

    def __init__(self, file_path: str):
        """Initialize the ODTProcessor.

        Args:
            file_path (str): Path to the ODT file.
        """
        try:
            from odf.opendocument import load

            self._load_odf = load
        except ImportError as e:
            raise ImportError(
                "The 'odfpy' library is not installed. "
                "Please install it with 'pip install odfpy>=1.4.1' "
                "or install the document processing extras with "
                "'pip install chunklet-py[document]'"
            ) from e

        self.file_path = file_path
        self.doc = self._load_odf(self.file_path)

    def extract_metadata(self) -> dict[str, Any]:
        """Extracts metadata from the ODT file, focusing on Dublin Core and OpenDocument fields.

        Parses the document's metadata and field elements (title, creators, creation
        date, chapter, and author name). Only fields that are present in the document
        are included in the returned dictionary.

        Returns:
            dict[str, Any]: A dictionary containing any of the following keys:
                 - title
                 - author (from the Dublin Core creator field)
                 - creator (from the OpenDocument initial creator field)
                 - created (from the OpenDocument creation date)
                 - chapter
                 - authorname
                 - source (always present; the path of the input file)
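
        Example of a possible return value (illustrative values only):

            {
                "title": "Lorem ipsum",
                "author": "J. Doe",
                "created": "2018-07-10T09:21:00",
                "source": "samples/file-sample_100kB.odt",
            }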
        """
        try:
            from odf import text, meta, dc
        except ImportError as e:
            raise ImportError(
                "The 'odfpy' library is not installed. "
                "Please install it with 'pip install odfpy>=1.4.1' "
                "or install the document processing extras with "
                "'pip install chunklet-py[document]'"
            ) from e

        metadata = {}
        for field in [
            dc.Title,
            dc.Creator,
            meta.InitialCreator,
            meta.CreationDate,
            text.Chapter,
            text.AuthorName,
        ]:
            elems = self.doc.getElementsByType(field)
            value = "".join(
                node.data
                for e in elems
                for node in e.childNodes
                if node.nodeType == node.TEXT_NODE
            ).strip()
            if value:  # Only store if not empty
                key = field.__name__

                # To keep metadata uniform with the other processors
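                # (CreationDate -> created, Creator -> author, InitialCreator -> creator;
                #  any other element name is simply lower-cased, e.g. Title -> title,
                #  AuthorName -> authorname.)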
                key = "created" if key == "CreationDate" else key
                key = "author" if key == "Creator" else key
                key = "creator" if key == "InitialCreator" else key

                metadata[key.lower()] = value

        metadata["source"] = str(self.file_path)
        return metadata

    def extract_text(self) -> Generator[str, None, None]:
        """Extracts text content from ODT paragraphs, yielding chunks for efficient processing.

        Iterates through the paragraph elements in the document, extracting their text
        and buffering it into blocks of approximately 4000 characters. Yielding these
        page-like blocks keeps memory usage low for large documents and gives
        downstream code conveniently sized units for parallel processing.

        Yields:
            str: A block of text of up to roughly 4000 characters; a single paragraph
                longer than the limit is yielded on its own.
        """
        try:
            from odf import text
        except ImportError as e:
            raise ImportError(
                "The 'odfpy' library is not installed. "
                "Please install it with 'pip install odfpy>=1.4.1' "
                "or install the document processing extras with "
                "'pip install chunklet-py[document]'"
            ) from e

        current_chunk = []
        char_count = 0
        max_chunk_size = 4000

        for p_elem in self.doc.getElementsByType(text.P):
            para_text = "".join(
                node.data
                for node in p_elem.childNodes
                if node.nodeType == node.TEXT_NODE
            ).strip()
            if para_text:
                para_length = len(para_text)

                # If adding this paragraph would exceed the limit, yield current chunk
                if char_count + para_length > max_chunk_size and current_chunk:
                    yield "\n".join(current_chunk)
                    current_chunk = []
                    char_count = 0

                # If a single paragraph is longer than max_chunk_size, yield it as its own chunk
                if para_length > max_chunk_size:
                    if current_chunk:
                        yield "\n".join(current_chunk)
                        current_chunk = []
                        char_count = 0
                    yield para_text
                else:
                    current_chunk.append(para_text)
                    char_count += para_length

        # Yield any remaining content
        if current_chunk:
            yield "\n".join(current_chunk)


if __name__ == "__main__":  # pragma: no cover
    file_path = "samples/file-sample_100kB.odt"
    processor = ODTProcessor(file_path)

    # Extract metadata
    metadata = processor.extract_metadata()
    print("Metadata:")
    for key, value in metadata.items():
        print(f"{key}: {value}")

    print("\nText content preview:\n")
    for i, chunk in enumerate(processor.extract_text(), start=1):
        print(f"--- {i} ---")
        print(chunk, "...")
        print("\n --- \n")