• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

speedyk-005 / chunklet-py / 20396856167

20 Dec 2025 04:12PM UTC coverage: 87.366% (+5.6%) from 81.75%
20396856167

push

github

speedyk-005
fix(ci): resolve Coveralls 422 error

- Switch to GITHUB_TOKEN for seamless Coveralls authentication
- Remove manual --service flag to allow auto-detection
- Set explicit job permissions for status reporting

1307 of 1496 relevant lines covered (87.37%)

3.49 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.72
/src/chunklet/document_chunker/processors/docx_processor.py
1
from typing import Any, Generator
4✔
2

3
# mammoth and docx (python-docx) are imported lazily inside the methods that use them
4

5
from chunklet.document_chunker.processors.base_processor import BaseProcessor
4✔
6
from chunklet.document_chunker.converters.html_2_md import html_to_md
4✔
7

8

9
class DOCXProcessor(BaseProcessor):
    """
    Processor that pulls text and metadata out of DOCX files.

    The document body is converted to HTML, every embedded image is swapped
    for a numbered text placeholder, and the HTML is then rendered as
    Markdown.

    The **metadata** comes from the document's core properties, which
    typically mix **Open Packaging Conventions (OPC)** properties with
    elements that align with **Dublin Core** standards.

    The core properties handled by `python-docx` are described in its
    documentation:
    https://python-docx.readthedocs.io/en/latest/dev/analysis/features/coreprops.html
    """

    # Core-property attribute names looked up by extract_metadata().
    METADATA_FIELDS = [
        "title",
        "author",
        "publisher",
        "last_modified_by",
        "created",
        "modified",
        "rights",
        "version",
    ]

    def extract_metadata(self) -> dict[str, Any]:
        """Reads the core properties (a mix of OPC and Dublin Core elements) of the DOCX file.

        Each name in ``METADATA_FIELDS`` is looked up on the document's core
        properties. Attributes that are absent or falsy are left out, and
        every retained value is converted with ``str()``.

        Returns:
            dict[str, Any]: Always holds ``source`` (the file path as a
                string), plus whichever of these fields have a value:
                - title
                - author
                - publisher
                - last_modified_by
                - created
                - modified
                - rights
                - version

        Raises:
            ImportError: If the 'python-docx' library is not installed.
        """
        try:  # Lazy import
            from docx import Document
        except ImportError as e:
            raise ImportError(
                "The 'python-docx' library is not installed. "
                "Please install it with 'pip install 'python-docx>=1.2.0'' or install the document processing extras "
                "with 'pip install 'chunklet-py[document]''"
            ) from e

        core_props = Document(self.file_path).core_properties

        # Keep only the properties that exist and carry a truthy value.
        populated = {
            name: str(raw)
            for name in self.METADATA_FIELDS
            if (raw := getattr(core_props, name, ""))
        }
        return {"source": str(self.file_path), **populated}

    def extract_text(self) -> Generator[str, None, None]:
        """Converts the DOCX file to Markdown and yields it in page-like chunks.

        Images are replaced with a placeholder "[Image - num]", numbered in
        the order they are encountered during conversion.

        The Markdown is split on blank lines and the paragraphs are grouped
        so that each chunk is approximately 4000 characters long, to simulate
        pages and enhance parallel execution. Only paragraph lengths are
        counted (not the separators), and a paragraph is never split, so a
        chunk may exceed the target size.

        Yields:
            str: A chunk of text, approximately 4000 characters each.

        Raises:
            ImportError: If the 'mammoth' library is not installed (raised
                once iteration starts, since this is a generator).
        """
        try:  # Lazy import
            import mammoth
        except ImportError as e:
            raise ImportError(
                "The 'mammoth' library is not installed. "
                "Please install it with 'pip install 'mammoth>=1.9.0'' or install the document processing extras "
                "with 'pip install 'chunklet-py[document]''"
            ) from e

        # Running image number, shared with the converter callback below.
        count = 0

        def placeholder_images(image):
            """Stand in for an image with its numbered placeholder text."""
            nonlocal count
            count += 1
            return [mammoth.html.text(f"[Image - {count}]")]

        with open(self.file_path, "rb") as docx_file:
            # DOCX -> HTML (images replaced on the fly) -> Markdown
            html = mammoth.convert_to_html(
                docx_file, convert_image=placeholder_images
            ).value
            markdown_content = html_to_md(raw_text=html)

        # Group paragraphs until the next one would push past the target size.
        limit = 4000
        pending: list[str] = []
        pending_len = 0

        for block in markdown_content.split("\n\n"):
            block_len = len(block)

            # Flush first when this paragraph does not fit; an empty buffer
            # is never flushed, so an oversized paragraph stands alone.
            if pending and pending_len + block_len > limit:
                yield "\n\n".join(pending)
                pending, pending_len = [], 0

            pending.append(block)
            pending_len += block_len

        # Emit whatever is left over
        if pending:
            yield "\n\n".join(pending)
4✔
121

122

123
if __name__ == "__main__":  # pragma: no cover
    # Manual smoke run against the bundled sample document.
    sample_path = "samples/Lorem Ipsum.docx"
    docx_processor = DOCXProcessor(sample_path)

    # Metadata is extracted before anything is printed.
    extracted = docx_processor.extract_metadata()
    print("Metadata:")
    for key, value in extracted.items():
        print(f"{key}: {value}")

    # Show the first 512 characters of every chunk, numbered from 1.
    print("\nText content preview:\n")
    for i, chunk in enumerate(docx_processor.extract_text(), 1):
        print(f"--- {i} ---")
        preview = chunk[:512]
        print(preview, "...")
        print("\n --- \n")
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc