20378511984

Committed 19 Dec 2025 06:09PM UTC coverage: 86.588% (+4.8%) from 81.75%

Build # 20378511984

Build Type

Pull #7

github

Committed by

web-flow

Commit Message

Merge 81717401a into aeb37fd6a

Pull Request #7: Merge develop branch to main

Run Details

464 of 550 new or added lines in 17 files covered. (84.36%)

1 existing line in 1 file now uncovered.

1317 of 1521 relevant lines covered (86.59%)

4.33 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

76.47

/src/chunklet/document_chunker/processors/docx_processor.py

from typing import Any, Generator

# mammoth and docx are lazily imported

from chunklet.document_chunker.processors.base_processor import BaseProcessor
from chunklet.document_chunker.converters.html_2_md import html_to_md


class DOCXProcessor(BaseProcessor):
    """
    Processor class for extracting text and metadata from DOCX files.

    Text content is extracted, images are replaced with a placeholder,
    and the resulting text is formatted using Markdown conversion.

    This class extracts **metadata** which typically uses a mix of
    **Open Packaging Conventions (OPC)** properties and elements that align
    with **Dublin Core** standards.

    For more details on the DOCX core properties processed, refer to the
    `python-docx` documentation:
    https://python-docx.readthedocs.io/en/latest/dev/analysis/features/coreprops.html
    """

    # Names looked up on the document's core-properties object by
    # `extract_metadata`. A name that is missing or empty there is skipped.
    METADATA_FIELDS = [
        "title",
        "author",
        "publisher",
        "last_modified_by",
        "created",
        "modified",
        "rights",
        "version",
    ]

    def extract_metadata(self) -> dict[str, Any]:
        """Extracts core properties (a mix of OPC and Dublin Core elements) from the DOCX file.

        Returns:
            dict[str, Any]: A dictionary that always contains ``source``
                (``self.file_path`` converted to a string) plus every entry
                of ``METADATA_FIELDS`` that has a non-empty value, each
                converted with ``str``:
                - title
                - author
                - publisher
                - last_modified_by
                - created
                - modified
                - rights
                - version

        Raises:
            ImportError: If the ``python-docx`` library is not installed.
        """
        try:  # Lazy import
            from docx import Document
        except ImportError as e:
            raise ImportError(
                "The 'python-docx' library is not installed. "
                "Please install it with 'pip install 'python-docx>=1.2.0'' or install the document processing extras "
                "with 'pip install 'chunklet-py[document]''"
            ) from e

        doc = Document(self.file_path)
        props = doc.core_properties
        metadata = {"source": str(self.file_path)}
        for field in self.METADATA_FIELDS:
            # Default to "" so an attribute absent from the properties object
            # is treated the same as an empty one and left out of the result.
            value = getattr(props, field, "")
            if value:
                metadata[field] = str(value)
        return metadata

    def extract_text(self) -> Generator[str, None, None]:
        """Extracts text content from DOCX file in Markdown format, yielding chunks for efficient processing.

        Images are replaced with a placeholder "[Image - num]", where ``num``
        counts up from 1 each time the image callback is invoked.

        The Markdown is split on blank lines ("\\n\\n") into paragraphs, which
        are regrouped into chunks whose summed paragraph lengths do not exceed
        4000 characters (the "\\n\\n" separators used to re-join them are not
        counted). A single paragraph longer than 4000 characters is never
        split; it is yielded whole as its own chunk. This simulates pages and
        enhances parallel execution.

        Yields:
            str: A chunk of text, approximately 4000 characters each.

        Raises:
            ImportError: If the ``mammoth`` library is not installed.
        """
        try:  # Lazy import
            import mammoth
        except ImportError as e:
            raise ImportError(
                "The 'mammoth' library is not installed. "
                "Please install it with 'pip install 'mammoth>=1.9.0'' or install the document processing extras "
                "with 'pip install 'chunklet-py[document]''"
            ) from e

        count = 0

        def placeholder_images(image):
            """Replace all images with a placeholder text."""
            nonlocal count
            count += 1
            return [mammoth.html.text(f"[Image - {count}]")]

        with open(self.file_path, "rb") as docx_file:
            # Convert DOCX to HTML first
            result = mammoth.convert_to_html(
                docx_file, convert_image=placeholder_images
            )
            html_content = result.value

        # Now we can convert it to markdown
        markdown_content = html_to_md(raw_text=html_content)

        # Split into paragraphs and accumulate by character count (~4000 chars per chunk)
        paragraphs = markdown_content.split("\n\n")
        current_chunk = []
        char_count = 0
        max_chunk_size = 4000

        for paragraph in paragraphs:
            para_length = len(paragraph)

            # If adding this paragraph would exceed the limit, yield current chunk
            if current_chunk and char_count + para_length > max_chunk_size:
                yield "\n\n".join(current_chunk)
                current_chunk = []
                char_count = 0

            if para_length > max_chunk_size:
                # Oversized paragraph: yield it as its own chunk. Any pending
                # content was already flushed by the check above (an oversized
                # paragraph always trips it), so no second flush is needed.
                yield paragraph
            else:
                current_chunk.append(paragraph)
                char_count += para_length

        # Yield any remaining content
        if current_chunk:
            yield "\n\n".join(current_chunk)


if __name__ == "__main__":  # pragma: no cover
    # Manual smoke run: print the metadata and a preview of every text chunk
    # for the sample document.
    sample_path = "samples/Lorem Ipsum.docx"
    docx_processor = DOCXProcessor(sample_path)

    # Metadata section: one "key: value" line per extracted entry
    print("Metadata:")
    extracted = docx_processor.extract_metadata()
    for key, value in extracted.items():
        print(f"{key}: {value}")

    # Text section: numbered chunks, each truncated to its first 512 characters
    print("\nText content preview:\n")
    chunks = docx_processor.extract_text()
    for i, chunk in enumerate(chunks, 1):
        print(f"--- {i} ---")
        print(chunk[:512], "...")
        print("\n --- \n")

1	from typing import Any, Generator	5✔
2
3	# mammoth and docx are lazily imported
4
5	from chunklet.document_chunker.processors.base_processor import BaseProcessor	5✔
6	from chunklet.document_chunker.converters.html_2_md import html_to_md	5✔
7
8
9	class DOCXProcessor(BaseProcessor):	5✔
10	"""
11	Processor class for extracting text and metadata from DOCX files.
12
13	Text content is extracted, images are replaced with a placeholder,
14	and the resulting text is formatted using Markdown conversion.
15
16	This class extracts metadata which typically uses a mix of
17	Open Packaging Conventions (OPC) properties and elements that align
18	with Dublin Core standards.
19
20	For more details on the DOCX core properties processed, refer to the
21	`python-docx` documentation:
22	https://python-docx.readthedocs.io/en/latest/dev/analysis/features/coreprops.html
23	"""
24
25	METADATA_FIELDS = [	5✔
26	"title",
27	"author",
28	"publisher",
29	"last_modified_by",
30	"created",
31	"modified",
32	"rights",
33	"version",
34	]
35
36	def extract_metadata(self) -> dict[str, Any]:	5✔
37	"""Extracts core properties (a mix of OPC and Dublin Core elements) from the DOCX file.
38
39	Returns:
40	dict[str, Any]: A dictionary containing metadata fields:
41	- title
42	- author
43	- publisher
44	- last_modified_by
45	- created
46	- modified
47	- rights
48	- version
49	"""
50	try:	5✔
51	from docx import Document	5✔
52	except ImportError as e:	×
53	raise ImportError(	×
54	"The 'python-docx' library is not installed. "
55	"Please install it with 'pip install 'python-docx>=1.2.0'' or install the document processing extras "
56	"with 'pip install 'chunklet-py[document]''"
57	) from e
58
59	doc = Document(self.file_path)	5✔
60	props = doc.core_properties	5✔
61	metadata = {"source": str(self.file_path)}	5✔
62	for field in self.METADATA_FIELDS:	5✔
63	value = getattr(props, field, "")	5✔
64	if value:	5✔
65	metadata[field] = str(value)	5✔
66	return metadata	5✔
67
68	def extract_text(self) -> Generator[str, None, None]:	5✔
69	"""Extracts text content from DOCX file in Markdown format, yielding chunks for efficient processing.
70
71	Images are replaced with a placeholder "[Image - num]".
72	Text is yielded in chunks of approximately 4000 characters each to simulate pages and enhance parallel execution.
73
74	Yields:
75	str: A chunk of text, approximately 4000 characters each.
76	"""
77	try: # Lazy import	5✔
78	import mammoth	5✔
79	except ImportError as e:	×
80	raise ImportError(	×
81	"The 'mammoth' library is not installed. "
82	"Please install it with 'pip install 'mammoth>=1.9.0'' or install the document processing extras "
83	"with 'pip install 'chunklet-py[document]''"
84	) from e
85
86	count = 0	5✔
87
88	def placeholder_images(image):	5✔
89	"""Replace all images with a placeholder text."""
90	nonlocal count
91	count += 1	5✔
92	return [mammoth.html.text(f"[Image - {count}]")]	5✔
93
94	with open(self.file_path, "rb") as docx_file:	5✔
95	# Convert DOCX to HTML first
96	result = mammoth.convert_to_html(	5✔
97	docx_file, convert_image=placeholder_images
98	)
99	html_content = result.value	5✔
100
101	# Now we can convert it to markdown
102	markdown_content = html_to_md(raw_text=html_content)	5✔
103
104	# Split into paragraphs and accumulate by character count (~4000 chars per chunk)
105	paragraphs = markdown_content.split("\n\n")	5✔
106	current_chunk = []	5✔
107	char_count = 0	5✔
108	max_chunk_size = 4000	5✔
109
110	for paragraph in paragraphs:	5✔
111	para_length = len(paragraph)	5✔
112
113	# If adding this paragraph would exceed the limit, yield current chunk
114	if char_count + para_length > max_chunk_size and current_chunk:	5✔
NEW 115	yield "\n\n".join(current_chunk)	×
NEW 116	current_chunk = []	×
NEW 117	char_count = 0	×
118
119	# If a single paragraph is longer than max_chunk_size, yield it as its own chunk
120	if para_length > max_chunk_size:	5✔
NEW 121	if current_chunk:	×
NEW 122	yield "\n\n".join(current_chunk)	×
NEW 123	current_chunk = []	×
NEW 124	char_count = 0	×
NEW 125	yield paragraph	×
126	else:
127	current_chunk.append(paragraph)	5✔
128	char_count += para_length	5✔
129
130	# Yield any remaining content
131	if current_chunk:	5✔
132	yield "\n\n".join(current_chunk)	5✔
133
134
135	if __name__ == "__main__": # pragma: no cover
136	file_path = "samples/Lorem Ipsum.docx"
137	processor = DOCXProcessor(file_path)
138
139	# Extract metadata
140	metadata = processor.extract_metadata()
141	print("Metadata:")
142	for key, value in metadata.items():
143	print(f"{key}: {value}")
144
145	print("\nText content preview:\n")
146	for i, text in enumerate(processor.extract_text(), start=1):
147	print(f"--- {i} ---")
148	print(text[:512], "...")
149	print("\n --- \n")

speedyk-005 / chunklet-py / 20378511984

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous