• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19155446196

06 Nov 2025 04:44AM UTC coverage: 85.36% (-0.02%) from 85.381%
19155446196

push

github

bramp
refactor: complete Block rename and terminology cleanup

Renamed page_elements.py → page_blocks.py and systematically updated all
references to use 'block' terminology for raw PDF primitives throughout
the codebase.

Key changes:
- Renamed Element class → Block in page_blocks.py
- Updated all imports and type references across 40+ files
- Renamed internal variables and method parameters:
  - _element_winners → _block_winners
  - _validate_element_in_page_data() → _validate_block_in_page_data()
  - element_to_labels → block_to_labels
  - total_elements → total_blocks
  - And many more variable renames in main.py, tests, and classifiers
- Updated all docstrings, comments, and error messages
- Updated JSON fixtures to use 'blocks' instead of 'elements'
- Updated documentation (README files)

Terminology is now consistent:
- Block = raw PDF primitive (Text, Image, Drawing from pymupdf)
- Element = LEGO semantic component (Part, StepNumber, PartsList, etc.)

All 20 tests passing.

472 of 535 new or added lines in 34 files covered. (88.22%)

6 existing lines in 3 files now uncovered.

4064 of 4761 relevant lines covered (85.36%)

0.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.3
/src/build_a_long/pdf_extract/classifier/part_count_classifier.py
1
"""
2
Part count classifier.
3

4
Purpose
5
-------
6
Detect part-count text like "2x", "3X", or "5×".
7

8
Debugging
9
---------
10
Enable DEBUG logs with LOG_LEVEL=DEBUG. Heavier trace can be enabled when
11
CLASSIFIER_DEBUG is set to "part_count" or "all".
12
"""
13

14
import logging
1✔
15
import os
1✔
16
from dataclasses import dataclass
1✔
17

18
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
19
    Candidate,
20
    ClassificationHints,
21
    ClassificationResult,
22
    ClassifierConfig,
23
)
24
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
25
    LabelClassifier,
26
)
27
from build_a_long.pdf_extract.classifier.text_extractors import (
1✔
28
    extract_part_count_value,
29
)
30
from build_a_long.pdf_extract.extractor import PageData
1✔
31
from build_a_long.pdf_extract.extractor.lego_page_elements import PartCount
1✔
32
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
33

34
log = logging.getLogger(__name__)
1✔
35

36

37
@dataclass
class _PartCountScore:
    """Score breakdown recorded for a single part-count candidate."""

    text_score: float
    """How well the text matches a part count pattern, from 0.0 to 1.0."""

    def combined_score(self, config: ClassifierConfig) -> float:
        """Return the final score for this candidate.

        Text matching is the only component scored for part counts, so
        ``config`` is not consulted and the text score is the result as-is.
        """
        final_score = self.text_score
        return final_score
50

51

52
class PartCountClassifier(LabelClassifier):
    """Classifier that labels part-count text such as "2x", "3X", or "5×".

    ``evaluate`` records one ``part_count`` candidate for every ``Text`` block
    on the page; ``classify`` then marks each candidate that was successfully
    constructed as a winner.
    """

    outputs = {"part_count"}
    requires = set()

    def __init__(self, config: ClassifierConfig, classifier):
        """Initialise the classifier and read the debug switch.

        Args:
            config: Classifier configuration, forwarded to the parent class.
            classifier: Owning classifier, forwarded to the parent class and
                used by ``classify`` through ``self.classifier``.
        """
        super().__init__(config, classifier)
        # Can the following go into the parent, and use "outputs" as identifier?
        # Extra tracing is opt-in: CLASSIFIER_DEBUG must be "part_count" or
        # "all" (compared case-insensitively).
        self._debug_enabled = os.getenv("CLASSIFIER_DEBUG", "").lower() in (
            "part_count",
            "all",
        )

    def evaluate(
        self,
        page_data: PageData,
        result: ClassificationResult,
    ) -> None:
        """Evaluate blocks and create candidates for part counts.

        Every ``Text`` block on the page is scored and added to ``result`` as a
        ``part_count`` candidate. Blocks whose text parses get a constructed
        ``PartCount``; the others are stored with a ``failure_reason`` instead.

        Args:
            page_data: Page whose ``blocks`` are examined.
            result: Collector that receives one candidate per text block.
        """
        if not page_data.blocks:
            return

        for block in page_data.blocks:
            if not isinstance(block, Text):
                continue

            text_score = PartCountClassifier._score_part_count_text(block.text)

            # Store detailed score object
            detail_score = _PartCountScore(text_score=text_score)

            if self._debug_enabled:
                log.debug(
                    "[part_count] match text=%r score=%.2f bbox=%s",
                    block.text,
                    text_score,
                    block.bbox,
                )

            constructed_elem = None
            failure_reason = None

            if text_score == 0.0:
                failure_reason = (
                    f"Text doesn't match part count pattern: '{block.text}'"
                )
            else:
                # Parse only text that scored: a parse result for non-matching
                # text was never used, and most text blocks do not match.
                value = extract_part_count_value(block.text)
                if value is None:
                    # Defensive: scoring and parsing currently share the same
                    # extractor, so this only triggers if they ever diverge.
                    failure_reason = (
                        f"Could not parse part count from text: '{block.text}'"
                    )
                else:
                    constructed_elem = PartCount(
                        count=value,
                        bbox=block.bbox,
                    )

            # Add candidate
            result.add_candidate(
                "part_count",
                Candidate(
                    bbox=block.bbox,
                    label="part_count",
                    score=detail_score.combined_score(self.config),
                    score_details=detail_score,
                    constructed=constructed_elem,
                    source_block=block,
                    failure_reason=failure_reason,
                    is_winner=False,  # Will be set by classify()
                ),
            )

    def classify(
        self,
        page_data: PageData,
        result: ClassificationResult,
        hints: ClassificationHints | None,
    ) -> None:
        """Select winning part counts from pre-built candidates.

        Args:
            page_data: Page the candidates were built from.
            result: Holds the ``part_count`` candidates and records winners.
            hints: Not used by this classifier.
        """
        # Get pre-built candidates
        candidate_list = result.get_candidates("part_count")

        # Mark winners (all successfully constructed candidates)
        for candidate in candidate_list:
            if candidate.constructed is None:
                # Already has failure_reason from evaluate()
                continue

            # This is a winner!
            assert isinstance(candidate.constructed, PartCount)
            result.mark_winner(candidate, candidate.constructed)
            # NOTE(review): presumably these drop blocks nested inside or
            # near-duplicating the winner's bbox - confirm against the owning
            # classifier's implementation.
            self.classifier._remove_child_bboxes(
                page_data, candidate.source_block, result
            )
            self.classifier._remove_similar_bboxes(
                page_data, candidate.source_block, result
            )

    @staticmethod
    def _score_part_count_text(text: str) -> float:
        """Score text based on how well it matches part count patterns.

        Returns:
            1.0 if text matches part count pattern, 0.0 otherwise
        """
        # Use the extraction function to validate format
        if extract_part_count_value(text) is not None:
            return 1.0
        return 0.0
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc