• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19751674072

28 Nov 2025 01:40AM UTC coverage: 89.023% (-0.8%) from 89.847%
19751674072

push

github

bramp
refactor(classifier): reorganize text modules and break circular dependencies

Major Changes:
- Created text/ subdirectory for text-related classifier modules
- Moved text_histogram, text_extractors, font_size_hints to text/
- Created constants.py to resolve circular dependency issue

Module Organization:
- classifier/text/__init__.py: Package exports for text modules
- classifier/text/text_histogram.py: TextHistogram class
- classifier/text/text_extractors.py: Text extraction functions
- classifier/text/font_size_hints.py: FontSizeHints class

Circular Dependency Resolution:
- Created classifier/constants.py with CATALOG_ELEMENT_ID_THRESHOLD
- Removed ClassVar from PageHintCollection
- Updated font_size_hints.py and page_hint_collection.py to import from constants
- Fixed package-level circular import by importing TextHistogram directly from module
- Added TODO to consider moving constant to ClassifierConfig

Bug Fixes:
- Fixed DrawableItem frozen model issue in drawing.py
- Create new instances with depth instead of mutating frozen objects

Import Updates:
- Updated all imports across ~15 files to use new module paths
- Updated classifier/__init__.py to re-export text module classes

Tests:
- All tests passing (42/42 test files)
- Type checking passes
- Code formatted with ruff

32 of 32 new or added lines in 23 files covered. (100.0%)

180 existing lines in 19 files now uncovered.

7429 of 8345 relevant lines covered (89.02%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

18.26
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- No labeled element should be marked as deleted.
7
- Each element has at most one winner candidate.
8

9
Real fixture(s) live under this package's fixtures/ directory.
10
"""
11

12
import logging
1✔
13

14
import pytest
1✔
15

16
from build_a_long.pdf_extract.classifier import Candidate, classify_elements
1✔
17
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
18
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
19
    Diagram,
20
    LegoPageElement,
21
    PartsList,
22
)
23
from build_a_long.pdf_extract.fixtures import FIXTURES_DIR, RAW_FIXTURE_FILES
1✔
24

25
log = logging.getLogger(__name__)
1✔
26

27

28
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Read a raw extraction fixture and return every page it contains.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    raw_json = (FIXTURES_DIR / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(raw_json)  # type: ignore[assignment]

    if not extraction.pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return extraction.pages
50

51
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @staticmethod
    def _constructed_candidates(result) -> "list[tuple[str, Candidate]]":
        """Return (label, candidate) pairs for candidates that built an element.

        Shared by the tests below, which all need to walk every candidate that
        has a non-None ``constructed`` element.
        """
        return [
            (label, candidate)
            for label, candidates in result.get_all_candidates().items()
            for candidate in candidates
            if candidate.constructed is not None
        ]

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages: list[PageData] = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Build a map of source_block -> label for successfully constructed
            # candidates. Keyed by id() because raw blocks are compared by
            # identity, not content.
            block_to_label: dict[int, str] = {}
            for label, candidate in self._constructed_candidates(result):
                for block in candidate.source_blocks:
                    block_to_label[id(block)] = label

            # Find all elements that are both labeled and deleted
            labeled_and_deleted = [
                (elem, block_to_label[id(elem)])
                for elem in page.blocks
                if id(elem) in block_to_label and result.is_removed(elem)
            ]

            if labeled_and_deleted:
                # Lazy %-style args so formatting only happens when emitted.
                log.error(
                    "Found %d labeled elements that are deleted:",
                    len(labeled_and_deleted),
                )
                for elem, label in labeled_and_deleted:
                    log.error(
                        "  - %s id:%s bbox:%s [DELETED]", label, elem.id, elem.bbox
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_source_block_maps_to_one_element(self, fixture_file: str) -> None:
        """Each source block should map to at most one LegoPageElement.

        This validates that the classification pipeline doesn't create duplicate
        elements from the same source block. Each raw extraction block should
        produce at most one classified element in the final Page tree.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            if page is None:
                continue

            # Build a mapping from constructed element ID to candidate; each
            # constructed element must come from exactly one candidate.
            element_id_to_candidate: dict[int, Candidate] = {}
            for _label, candidate in self._constructed_candidates(result):
                elem_id = id(candidate.constructed)
                assert elem_id not in element_id_to_candidate, (
                    f"Source block id:"
                    f"{id(candidate.source_blocks[0]) if candidate.source_blocks else 'None'} "
                    f"produced multiple elements of type "
                    f"{candidate.constructed.__class__.__name__} "
                    f"in {fixture_file} page {page_idx}"
                )
                element_id_to_candidate[elem_id] = candidate

            blocks_to_element: dict[int, LegoPageElement] = {}

            # Traverse all LegoPageElements in the Page tree
            for element in page.iter_elements():
                # Skip synthetic/fallback elements that weren't created by
                # candidates (e.g., empty PartsLists created when Step has no
                # parts_list)
                candidate = element_id_to_candidate.get(id(element))
                if candidate is None:
                    continue

                for source_block in candidate.source_blocks:
                    # Bug fix: the original asserted a condition that was
                    # already known False inside an `if` guard. Assert the
                    # invariant directly instead.
                    existing_element = blocks_to_element.get(source_block.id)
                    assert existing_element is None, (
                        f"Source block id:{source_block.id} "
                        f"({source_block.tag}) mapped to multiple "
                        f"elements in {fixture_file} page "
                        f"{page_data.page_number}:\n"
                        f"  First:  {existing_element}\n"
                        f"  Second: {element}\n"
                        f"  Source: {source_block}"
                    )
                    blocks_to_element[source_block.id] = element

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_all_lego_elements_come_from_candidates(self, fixture_file: str) -> None:
        """All LegoPageElements in the final Page tree must come from candidates.

        This validates that classifiers don't create "orphan" elements directly
        without a corresponding candidate. Every LegoPageElement should be either:
        1. The constructed element of a candidate, or
        2. A synthetic/fallback element (e.g., empty PartsList when Step has no
           parts_list candidate)

        Ensures proper tracking of all elements through the classification pipeline.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            if page is None:
                continue

            # Build a set of all constructed element IDs from candidates
            constructed_element_ids: set[int] = {
                id(candidate.constructed)
                for _label, candidate in self._constructed_candidates(result)
            }

            # Traverse all LegoPageElements in the Page tree
            orphan_elements: list[tuple[LegoPageElement, str]] = []
            for element in page.iter_elements():
                elem_type = element.__class__.__name__

                # Skip the Page itself (it's the root container)
                if elem_type == "Page":
                    continue

                # Check if this element came from a candidate
                if id(element) in constructed_element_ids:
                    continue

                # TODO Remove the following lines
                # Known synthetic/fallback elements that are expected:
                # - Empty PartsList when Step has no parts_list candidate
                # - Diagram when Step couldn't find a matching diagram candidate
                if isinstance(element, PartsList) and len(element.parts) == 0:
                    continue
                if isinstance(element, Diagram):
                    # Fallback diagrams are allowed when StepClassifier
                    # can't find a matching diagram candidate
                    continue

                orphan_elements.append((element, elem_type))

            if orphan_elements:
                log.error(
                    "Found %d orphan elements not from candidates in %s page %d:",
                    len(orphan_elements),
                    fixture_file,
                    page_idx,
                )
                for elem, elem_type in orphan_elements:
                    log.error("  - %s bbox:%s", elem_type, elem.bbox)

            assert len(orphan_elements) == 0, (
                f"Found {len(orphan_elements)} orphan LegoPageElements not from "
                f"candidates in {fixture_file} page {page_idx}. "
                f"All elements should come from candidates or be known fallbacks."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_orphaned_constructed_candidates(self, fixture_file: str) -> None:
        """No candidate marked constructed without being in the final tree.

        This validates the transactional rollback semantics of build():
        - If a parent classifier's build() fails, all sub-candidates it built
          should be rolled back (constructed = None)
        - Only candidates that are actually used in the final Page tree should
          remain marked as constructed

        This catches bugs where:
        1. A classifier builds sub-candidates (e.g., step builds step_number)
        2. The classifier then fails (e.g., parts_list build fails)
        3. The step_number candidate remains orphaned with constructed set,
           but not actually used in the final tree
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            if page is None:
                continue

            # Build set of all element IDs actually used in the final Page tree
            used_element_ids: set[int] = {
                id(element) for element in page.iter_elements()
            }

            # Candidates marked constructed whose element never made it into
            # the tree indicate a failed rollback.
            orphaned_candidates: list[tuple[str, Candidate]] = [
                (label, candidate)
                for label, candidate in self._constructed_candidates(result)
                if id(candidate.constructed) not in used_element_ids
            ]

            if orphaned_candidates:
                log.error(
                    "Found %d orphaned constructed candidates in %s page %d:",
                    len(orphaned_candidates),
                    fixture_file,
                    page_idx,
                )
                for label, candidate in orphaned_candidates:
                    log.error(
                        "  - %s: %s bbox:%s score:%.3f failure:%s",
                        label,
                        candidate.constructed.__class__.__name__,
                        candidate.bbox,
                        candidate.score,
                        candidate.failure_reason,
                    )

            assert len(orphaned_candidates) == 0, (
                f"Found {len(orphaned_candidates)} orphaned constructed candidates "
                f"in {fixture_file} page {page_idx}. "
                f"Candidates marked as constructed should either be in the final "
                f"Page tree or rolled back to constructed=None. "
                f"This indicates a transactional rollback failure."
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc