• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

speedyk-005 / chunklet-py / 20378511984

19 Dec 2025 06:09PM UTC coverage: 86.588% (+4.8%) from 81.75%
20378511984

Pull #7

github

web-flow
Merge 81717401a into aeb37fd6a
Pull Request #7: Merge develop branch to main

464 of 550 new or added lines in 17 files covered. (84.36%)

1 existing line in 1 file now uncovered.

1317 of 1521 relevant lines covered (86.59%)

4.33 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

76.47
/src/chunklet/document_chunker/processors/docx_processor.py
1
from typing import Any, Generator
5✔
2

3
# mammoth and docx are lazily imported
4

5
from chunklet.document_chunker.processors.base_processor import BaseProcessor
5✔
6
from chunklet.document_chunker.converters.html_2_md import html_to_md
5✔
7

8

9
class DOCXProcessor(BaseProcessor):
    """
    Processor class for extracting text and metadata from DOCX files.

    Text content is extracted, images are replaced with a placeholder,
    and the resulting text is formatted using Markdown conversion.

    This class extracts **metadata** which typically uses a mix of
    **Open Packaging Conventions (OPC)** properties and elements that align
    with **Dublin Core** standards.

    For more details on the DOCX core properties processed, refer to the
    `python-docx` documentation:
    https://python-docx.readthedocs.io/en/latest/dev/analysis/features/coreprops.html
    """

    # Core-property names looked up on the document. A name that the
    # core-properties object does not expose, or whose value is empty,
    # is simply left out of the extracted metadata.
    METADATA_FIELDS = [
        "title",
        "author",
        "publisher",
        "last_modified_by",
        "created",
        "modified",
        "rights",
        "version",
    ]

    def extract_metadata(self) -> dict[str, Any]:
        """Extracts core properties (a mix of OPC and Dublin Core elements) from the DOCX file.

        Returns:
            dict[str, Any]: A dictionary that always contains ``source``
                (the file path as a string) plus every entry of
                ``METADATA_FIELDS`` that has a non-empty value, converted
                to ``str``:
                - title
                - author
                - publisher
                - last_modified_by
                - created
                - modified
                - rights
                - version

        Raises:
            ImportError: If the 'python-docx' library is not installed.
        """
        try:  # Lazy import
            from docx import Document
        except ImportError as e:
            raise ImportError(
                "The 'python-docx' library is not installed. "
                "Please install it with 'pip install 'python-docx>=1.2.0'' or install the document processing extras "
                "with 'pip install 'chunklet-py[document]''"
            ) from e

        props = Document(self.file_path).core_properties
        metadata = {"source": str(self.file_path)}
        for field in self.METADATA_FIELDS:
            # Default to "" so unknown attributes are skipped like empty ones.
            value = getattr(props, field, "")
            if value:
                metadata[field] = str(value)
        return metadata

    def extract_text(self) -> Generator[str, None, None]:
        """Extracts text content from DOCX file in Markdown format, yielding chunks for efficient processing.

        Images are replaced with a placeholder "[Image - num]", numbered in
        the order the converter encounters them, starting at 1.

        The Markdown is split on blank lines into paragraphs, which are then
        grouped into chunks of roughly 4000 characters to simulate pages and
        enhance parallel execution. The limit is approximate: separators
        between paragraphs are not counted, and a single paragraph longer
        than the limit is yielded whole as its own chunk.

        Yields:
            str: A chunk of text, approximately 4000 characters each.

        Raises:
            ImportError: If the 'mammoth' library is not installed.
        """
        try:  # Lazy import
            import mammoth
        except ImportError as e:
            raise ImportError(
                "The 'mammoth' library is not installed. "
                "Please install it with 'pip install 'mammoth>=1.9.0'' or install the document processing extras "
                "with 'pip install 'chunklet-py[document]''"
            ) from e

        count = 0

        def placeholder_images(image):
            """Replace all images with a placeholder text."""
            nonlocal count
            count += 1
            return [mammoth.html.text(f"[Image - {count}]")]

        with open(self.file_path, "rb") as docx_file:
            # Convert DOCX to HTML first
            result = mammoth.convert_to_html(
                docx_file, convert_image=placeholder_images
            )
            html_content = result.value

        # Now we can convert it to markdown
        markdown_content = html_to_md(raw_text=html_content)

        # Split into paragraphs and accumulate by character count (~4000 chars per chunk)
        max_chunk_size = 4000
        yield from self._group_paragraphs(
            markdown_content.split("\n\n"), max_chunk_size
        )

    @staticmethod
    def _group_paragraphs(
        paragraphs: list[str], max_chunk_size: int
    ) -> Generator[str, None, None]:
        """Group consecutive paragraphs into chunks of at most ~max_chunk_size characters.

        Args:
            paragraphs (list[str]): Paragraphs in document order.
            max_chunk_size (int): Soft limit on the summed paragraph lengths
                of one chunk (the joining separators are not counted).

        Yields:
            str: Paragraphs joined by a blank line, or a single oversized
                paragraph on its own.
        """
        pending = []
        pending_length = 0

        for paragraph in paragraphs:
            para_length = len(paragraph)

            # Flush the buffer before it would overflow. An oversized
            # paragraph always triggers this when the buffer is non-empty,
            # so the buffer is guaranteed empty by the time it is yielded.
            if pending and pending_length + para_length > max_chunk_size:
                yield "\n\n".join(pending)
                pending = []
                pending_length = 0

            if para_length > max_chunk_size:
                # Too long to share a chunk: emit it unchanged.
                yield paragraph
            else:
                pending.append(paragraph)
                pending_length += para_length

        # Yield any remaining content
        if pending:
            yield "\n\n".join(pending)
5✔
133

134

135
if __name__ == "__main__":  # pragma: no cover
    # Manual smoke run: print the metadata of a sample document followed by
    # the first 512 characters of every extracted chunk.
    sample_path = "samples/Lorem Ipsum.docx"
    docx_processor = DOCXProcessor(sample_path)

    # Metadata is fetched before anything is printed.
    document_info = docx_processor.extract_metadata()
    print("Metadata:")
    for name in document_info:
        print(f"{name}: {document_info[name]}")

    print("\nText content preview:\n")
    chunk_number = 0
    for chunk in docx_processor.extract_text():
        chunk_number += 1
        print(f"--- {chunk_number} ---")
        preview = chunk[:512]
        print(preview, "...")
        print("\n --- \n")
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc