20396856167

Committed 20 Dec 2025 04:12PM UTC coverage: 87.366% (+5.6%) from 81.75%

Build # 20396856167

Build Type

push

github

Committed by

speedyk-005

Commit Message

fix(ci): resolve Coveralls 422 error

- Switch to GITHUB_TOKEN for seamless Coveralls authentication
- Remove manual --service flag to allow auto-detection
- Set explicit job permissions for status reporting

Run Details

1307 of 1496 relevant lines covered (87.37%)

3.49 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.72

/src/chunklet/document_chunker/processors/docx_processor.py

from typing import Any, Generator

# mammoth and docx are lazily imported

from chunklet.document_chunker.processors.base_processor import BaseProcessor
from chunklet.document_chunker.converters.html_2_md import html_to_md


class DOCXProcessor(BaseProcessor):
    """
    Processor class for extracting text and metadata from DOCX files.

    Text content is extracted, images are replaced with a placeholder,
    and the resulting text is formatted using Markdown conversion.

    This class extracts **metadata** which typically uses a mix of
    **Open Packaging Conventions (OPC)** properties and elements that align
    with **Dublin Core** standards.

    For more details on the DOCX core properties processed, refer to the
    `python-docx` documentation:
    https://python-docx.readthedocs.io/en/latest/dev/analysis/features/coreprops.html
    """

    METADATA_FIELDS = [
        "title",
        "author",
        "publisher",
        "last_modified_by",
        "created",
        "modified",
        "rights",
        "version",
    ]

    # Target upper bound, in characters, for each chunk yielded by
    # ``extract_text``. Subclasses may override it.
    MAX_CHUNK_CHARS = 4000

    # Separator used both to split the Markdown into paragraphs and to
    # re-join the paragraphs of a chunk.
    _PARAGRAPH_SEP = "\n\n"

    def extract_metadata(self) -> dict[str, Any]:
        """Extracts core properties (a mix of OPC and Dublin Core elements) from the DOCX file.

        Returns:
            dict[str, Any]: A dictionary that always contains ``source``
                (the file path as a string) plus every entry of
                ``METADATA_FIELDS`` whose value is present and truthy on the
                document's core properties, converted with ``str()``:
                - title
                - author
                - publisher
                - last_modified_by
                - created
                - modified
                - rights
                - version

        Raises:
            ImportError: If the ``python-docx`` library is not installed.
        """
        try:  # Lazy import
            from docx import Document
        except ImportError as e:
            raise ImportError(
                "The 'python-docx' library is not installed. "
                "Please install it with 'pip install 'python-docx>=1.2.0'' or install the document processing extras "
                "with 'pip install 'chunklet-py[document]''"
            ) from e

        doc = Document(self.file_path)
        props = doc.core_properties
        metadata = {"source": str(self.file_path)}
        for field in self.METADATA_FIELDS:
            # Attributes missing from the core-properties object fall back to
            # "" and are skipped together with any other falsy value.
            value = getattr(props, field, "")
            if value:
                metadata[field] = str(value)
        return metadata

    def extract_text(self) -> Generator[str, None, None]:
        """Extracts text content from DOCX file in Markdown format, yielding chunks for efficient processing.

        Images are replaced with a placeholder "[Image - num]".
        Text is yielded in chunks of at most ``MAX_CHUNK_CHARS`` characters
        (4000 by default) to simulate pages and enhance parallel execution.
        Paragraphs are never split, so a single paragraph longer than the
        limit is yielded as one oversized chunk.

        Yields:
            str: A chunk of Markdown text made of whole paragraphs joined by
                blank lines.

        Raises:
            ImportError: If the ``mammoth`` library is not installed.
        """
        try:  # Lazy import
            import mammoth
        except ImportError as e:
            raise ImportError(
                "The 'mammoth' library is not installed. "
                "Please install it with 'pip install 'mammoth>=1.9.0'' or install the document processing extras "
                "with 'pip install 'chunklet-py[document]''"
            ) from e

        count = 0

        def placeholder_images(image):
            """Replace all images with a placeholder text."""
            nonlocal count
            count += 1
            return [mammoth.html.text(f"[Image - {count}]")]

        with open(self.file_path, "rb") as docx_file:
            # Convert DOCX to HTML first, then HTML to Markdown.
            result = mammoth.convert_to_html(
                docx_file, convert_image=placeholder_images
            )
            markdown_content = html_to_md(raw_text=result.value)

        yield from self._group_paragraphs(markdown_content, self.MAX_CHUNK_CHARS)

    @classmethod
    def _group_paragraphs(
        cls, markdown_content: str, max_size: int
    ) -> Generator[str, None, None]:
        """Group blank-line separated paragraphs into chunks of at most ``max_size`` characters.

        The running size includes the separators re-inserted when the
        paragraphs are joined, so it always equals the length of the chunk
        that would be emitted. Without that, many short (or empty) paragraphs
        could produce chunks far larger than ``max_size``.

        Args:
            markdown_content (str): The full Markdown text to split.
            max_size (int): Maximum chunk length in characters. A single
                paragraph longer than this is still yielded whole.

        Yields:
            str: Consecutive paragraphs joined by the paragraph separator.
        """
        sep = cls._PARAGRAPH_SEP
        pending: list[str] = []
        pending_len = 0  # == len(sep.join(pending))

        for paragraph in markdown_content.split(sep):
            # Cost of appending: the paragraph itself plus one separator
            # whenever the chunk already holds something.
            added = len(paragraph) + (len(sep) if pending else 0)

            # Flush first if appending would push the chunk over the limit.
            if pending and pending_len + added > max_size:
                yield sep.join(pending)
                pending = []
                pending_len = 0
                added = len(paragraph)  # first paragraph of a fresh chunk

            pending.append(paragraph)
            pending_len += added

        # Yield any remaining content
        if pending:
            yield sep.join(pending)


if __name__ == "__main__":  # pragma: no cover
    # Manual demo: dump the metadata and a short preview of every text chunk
    # for a sample document addressed relative to the working directory.
    demo = DOCXProcessor("samples/Lorem Ipsum.docx")

    # Read the metadata before printing anything so a failure leaves no
    # partial output.
    doc_info = demo.extract_metadata()
    print("Metadata:")
    for name in doc_info:
        print(f"{name}: {doc_info[name]}")

    print("\nText content preview:\n")
    chunk_no = 0
    for chunk in demo.extract_text():
        chunk_no += 1
        print(f"--- {chunk_no} ---")
        # Only the first 512 characters of each chunk are shown.
        preview = chunk[:512]
        print(preview, "...")
        print("\n --- \n")

1	from typing import Any, Generator	4✔
2
3	# mammoth and docx are lazily imported
4
5	from chunklet.document_chunker.processors.base_processor import BaseProcessor	4✔
6	from chunklet.document_chunker.converters.html_2_md import html_to_md	4✔
7
8
9	class DOCXProcessor(BaseProcessor):	4✔
10	"""
11	Processor class for extracting text and metadata from DOCX files.
12
13	Text content is extracted, images are replaced with a placeholder,
14	and the resulting text is formatted using Markdown conversion.
15
16	This class extracts metadata which typically uses a mix of
17	Open Packaging Conventions (OPC) properties and elements that align
18	with Dublin Core standards.
19
20	For more details on the DOCX core properties processed, refer to the
21	`python-docx` documentation:
22	https://python-docx.readthedocs.io/en/latest/dev/analysis/features/coreprops.html
23	"""
24
25	METADATA_FIELDS = [	4✔
26	"title",
27	"author",
28	"publisher",
29	"last_modified_by",
30	"created",
31	"modified",
32	"rights",
33	"version",
34	]
35
36	def extract_metadata(self) -> dict[str, Any]:	4✔
37	"""Extracts core properties (a mix of OPC and Dublin Core elements) from the DOCX file.
38
39	Returns:
40	dict[str, Any]: A dictionary containing metadata fields:
41	- title
42	- author
43	- publisher
44	- last_modified_by
45	- created
46	- modified
47	- rights
48	- version
49	"""
50	try:	4✔
51	from docx import Document	4✔
52	except ImportError as e:	×
53	raise ImportError(	×
54	"The 'python-docx' library is not installed. "
55	"Please install it with 'pip install 'python-docx>=1.2.0'' or install the document processing extras "
56	"with 'pip install 'chunklet-py[document]''"
57	) from e
58
59	doc = Document(self.file_path)	4✔
60	props = doc.core_properties	4✔
61	metadata = {"source": str(self.file_path)}	4✔
62	for field in self.METADATA_FIELDS:	4✔
63	value = getattr(props, field, "")	4✔
64	if value:	4✔
65	metadata[field] = str(value)	4✔
66	return metadata	4✔
67
68	def extract_text(self) -> Generator[str, None, None]:	4✔
69	"""Extracts text content from DOCX file in Markdown format, yielding chunks for efficient processing.
70
71	Images are replaced with a placeholder "[Image - num]".
72	Text is yielded in chunks of approximately 4000 characters each to simulate pages and enhance parallel execution.
73
74	Yields:
75	str: A chunk of text, approximately 4000 characters each.
76	"""
77	try: # Lazy import	4✔
78	import mammoth	4✔
79	except ImportError as e:	×
80	raise ImportError(	×
81	"The 'mammoth' library is not installed. "
82	"Please install it with 'pip install 'mammoth>=1.9.0'' or install the document processing extras "
83	"with 'pip install 'chunklet-py[document]''"
84	) from e
85
86	count = 0	4✔
87
88	def placeholder_images(image):	4✔
89	"""Replace all images with a placeholder text."""
90	nonlocal count
91	count += 1	4✔
92	return [mammoth.html.text(f"[Image - {count}]")]	4✔
93
94	with open(self.file_path, "rb") as docx_file:	4✔
95	# Convert DOCX to HTML first
96	result = mammoth.convert_to_html(	4✔
97	docx_file, convert_image=placeholder_images
98	)
99	markdown_content = html_to_md(raw_text=result.value)	4✔
100
101	# Split into paragraphs and accumulate by character count (~4000 chars per chunk)
102	curr_chunk = []	4✔
103	curr_size = 0	4✔
104	max_size = 4000	4✔
105
106	for paragraph in markdown_content.split("\n\n"):	4✔
107	para_len = len(paragraph)	4✔
108
109	# If adding this paragraph would exceed the limit, yield current chunk
110	if curr_size + para_len > max_size and curr_chunk:	4✔
111	yield "\n\n".join(curr_chunk)	×
112	curr_chunk = []	×
113	curr_size = 0	×
114
115	curr_chunk.append(paragraph)	4✔
116	curr_size += para_len	4✔
117
118	# Yield any remaining content
119	if curr_chunk:	4✔
120	yield "\n\n".join(curr_chunk)	4✔
121
122
123	if __name__ == "__main__": # pragma: no cover
124	file_path = "samples/Lorem Ipsum.docx"
125	processor = DOCXProcessor(file_path)
126
127	# Extract metadata
128	metadata = processor.extract_metadata()
129	print("Metadata:")
130	for key, value in metadata.items():
131	print(f"{key}: {value}")
132
133	print("\nText content preview:\n")
134	for i, text in enumerate(processor.extract_text(), start=1):
135	print(f"--- {i} ---")
136	print(text[:512], "...")
137	print("\n --- \n")

speedyk-005 / chunklet-py / 20396856167

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous