• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20389851973

20 Dec 2025 05:31AM UTC coverage: 89.185% (+0.04%) from 89.145%
20389851973

push

github

bramp
Add support for `ty` to the pyproject.toml.

13384 of 15007 relevant lines covered (89.19%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.97
/src/build_a_long/pdf_extract/classifier/steps/substep_classifier.py
1
"""
2
SubStep classifier.
3

4
Purpose
5
-------
6
Identify SubStep elements by finding substep numbers paired with diagrams.
7

8
SubSteps are mini-steps that appear either:
9
1. Inside SubAssembly callout boxes (numbered 1, 2, 3 within the box)
10
2. As "naked" substeps on the page (small numbers 1, 2, 3, 4 alongside a main step)
11

12
This classifier pairs substep_number with diagram candidates based on position:
13
- Step number should be to the left or above the diagram
14
- Step number and diagram should be relatively close together
15

16
Architecture
17
------------
18
This classifier independently finds SubStep candidates during scoring:
19
- Gets substep_number candidates (from SubStepNumberClassifier - smaller font)
20
- Gets diagram candidates
21
- Creates SubStep candidates by pairing substep numbers with nearby diagrams
22

23
These candidates are then used by:
24
- SubAssemblyClassifier: claims SubSteps that are inside callout boxes
25
- StepClassifier: uses remaining SubSteps as "naked" substeps
26

27
The key insight is that SubStep step numbers have a SMALLER FONT SIZE than
28
main step numbers. SubStepNumberClassifier detects these smaller font step numbers
29
and this classifier pairs them with diagrams.
30
"""
31

32
import logging
1✔
33
from typing import ClassVar
1✔
34

35
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
36
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
37
    ClassificationResult,
38
)
39
from build_a_long.pdf_extract.classifier.label_classifier import LabelClassifier
1✔
40
from build_a_long.pdf_extract.classifier.rule_based_classifier import StepNumberScore
1✔
41
from build_a_long.pdf_extract.classifier.score import Score
1✔
42
from build_a_long.pdf_extract.classifier.steps.pairing import (
1✔
43
    DEFAULT_MAX_PAIRING_DISTANCE,
44
    PairingConfig,
45
    find_optimal_pairings,
46
)
47
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
48
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
49
    Diagram,
50
    StepNumber,
51
    SubStep,
52
)
53

54
log = logging.getLogger(__name__)
1✔
55

56

57
class _SubStepScore(Score):
    """Score record for one substep-number/diagram pairing.

    Two components contribute equally to the final score:
    - position_score: how well the number sits to the left/top of the diagram
    - distance_score: how close the number is to the diagram

    The record also keeps references to the two paired candidates so the
    SubStep can be built from them later.
    """

    step_value: int
    """The parsed step number value (e.g., 1, 2, 3)."""

    substep_number_candidate: Candidate
    """The substep_number candidate for this substep."""

    diagram_candidate: Candidate
    """The diagram candidate paired with this step number."""

    position_score: float
    """Score based on step number being to left/top of diagram (0.0-1.0)."""

    distance_score: float
    """Score based on distance between step number and diagram (0.0-1.0)."""

    def score(self) -> float:
        """Return the combined score: half position plus half distance."""
        position_part = 0.5 * self.position_score
        distance_part = 0.5 * self.distance_score
        return position_part + distance_part
1✔
83

84

85
class SubStepClassifier(LabelClassifier):
    """Classifier for SubStep elements.

    This classifier pairs substep numbers with nearby diagrams based on
    position. The pairing creates SubStep candidates which are then:
    - Claimed by SubAssemblyClassifier if inside a callout box
    - Used by StepClassifier as naked substeps if not inside a box

    Scoring phase:
    - Gets all scored substep_number candidates
    - Gets all scored diagram candidates
    - Creates SubStep candidates from the optimal number/diagram pairings
      returned by the shared pairing module

    Build phase:
    - Builds the step number from its candidate (substep_number -> StepNumber)
    - Builds the diagram from its candidate
    - Creates the SubStep with both elements
    """

    output: ClassVar[str] = "substep"
    requires: ClassVar[frozenset[str]] = frozenset({"substep_number", "diagram"})

    def _score(self, result: ClassificationResult) -> None:
        """Score substep number + diagram pairings to create SubStep candidates.

        Nothing is added to ``result`` when either the substep_number or the
        diagram candidate list is empty.

        Args:
            result: Classification result to read candidates from and add
                the new "substep" candidates to.
        """
        # Substep numbers are the small-font step numbers. Scored (not built)
        # candidates are used because nothing is constructed during scoring.
        substep_number_candidates = result.get_scored_candidates("substep_number")
        if not substep_number_candidates:
            log.debug("[substep] No substep_number candidates found")
            return

        diagram_candidates = result.get_scored_candidates("diagram")
        if not diagram_candidates:
            log.debug("[substep] No diagram candidates found")
            return

        log.debug(
            "[substep] Found %d substep numbers, %d diagrams",
            len(substep_number_candidates),
            len(diagram_candidates),
        )

        # Use Hungarian algorithm to find optimal pairing
        self._create_candidates_with_hungarian(
            substep_number_candidates,
            diagram_candidates,
            result,
        )

    def _get_step_value(self, candidate: Candidate) -> int:
        """Get the step number value from a candidate's score.

        Args:
            candidate: A substep_number candidate.

        Returns:
            The ``step_value`` of the candidate's StepNumberScore, or 0 when
            the candidate carries a different kind of score details.
        """
        score_details = candidate.score_details
        if isinstance(score_details, StepNumberScore):
            return score_details.step_value

        # Keep the best-effort fallback, but make the unexpected score type
        # visible instead of silently reporting step 0.
        log.warning(
            "[substep] Expected StepNumberScore on substep_number candidate, "
            "got %s; using step value 0",
            type(score_details).__name__,
        )
        return 0

    def _create_candidates_with_hungarian(
        self,
        step_candidates: list[Candidate],
        diagram_candidates: list[Candidate],
        result: ClassificationResult,
    ) -> None:
        """Use Hungarian algorithm to optimally pair step numbers with diagrams.

        Uses the shared pairing module to find optimal step number to diagram
        pairings based on position and distance. One "substep" candidate is
        added to ``result`` for every pairing returned.

        Args:
            step_candidates: Substep number candidates
            diagram_candidates: Diagram candidates
            result: Classification result to add candidates to
        """
        if not step_candidates or not diagram_candidates:
            return

        # Built dividers are handed to the pairing module for obstruction
        # checking (see check_dividers below).
        divider_bboxes = [
            divider.constructed.bbox
            for divider in result.get_built_candidates("divider")
            if divider.constructed is not None
        ]

        # Extract bboxes for pairing; list positions line up with the
        # step_index / diagram_index values of the returned pairings.
        step_bboxes = [c.bbox for c in step_candidates]
        diagram_bboxes = [c.bbox for c in diagram_candidates]

        # Position and distance are weighted equally, matching
        # _SubStepScore.score(). The shared default max distance is used.
        # NOTE(review): confirm whether substeps should use a tighter limit.
        config = PairingConfig(
            max_distance=DEFAULT_MAX_PAIRING_DISTANCE,
            position_weight=0.5,
            distance_weight=0.5,
            check_dividers=True,
            top_left_tolerance=100.0,
        )

        # Find optimal pairings using shared logic
        pairings = find_optimal_pairings(
            step_bboxes, diagram_bboxes, config, divider_bboxes
        )

        # Create candidates from pairings
        for pairing in pairings:
            substep_cand = step_candidates[pairing.step_index]
            diag_cand = diagram_candidates[pairing.diagram_index]

            score_details = _SubStepScore(
                step_value=self._get_step_value(substep_cand),
                substep_number_candidate=substep_cand,
                diagram_candidate=diag_cand,
                position_score=pairing.position_score,
                distance_score=pairing.distance_score,
            )
            # Evaluate once; used for both the candidate and the log line.
            pairing_score = score_details.score()

            candidate = Candidate(
                # Combined bbox of the number and the diagram candidates
                bbox=substep_cand.bbox.union(diag_cand.bbox),
                label="substep",
                score=pairing_score,
                score_details=score_details,
                source_blocks=[],  # Blocks claimed via nested candidates
            )

            result.add_candidate(candidate)
            log.debug(
                "[substep] Created candidate: step=%s, score=%.2f",
                score_details.step_value,
                pairing_score,
            )

    def build(
        self,
        candidate: Candidate,
        result: ClassificationResult,
        constraint_bbox: BBox | None = None,
    ) -> SubStep:
        """Construct a SubStep from a candidate.

        Args:
            candidate: The candidate to construct, with score_details containing
                substep_number_candidate and diagram_candidate.
            result: The classification result context.
            constraint_bbox: Optional bounding box to constrain diagram clustering.
                When building inside a SubAssembly, this should be the
                SubAssembly's bbox to prevent diagrams from clustering beyond
                the SubAssembly boundaries.

        Returns:
            The constructed SubStep element.

        Raises:
            AssertionError: If the candidate's score details are not a
                _SubStepScore, or a nested build does not return the
                expected StepNumber / Diagram element.
        """
        score = candidate.score_details
        assert isinstance(score, _SubStepScore)

        # Build the step number from the substep_number candidate
        # (SubStepNumberClassifier.build() returns a StepNumber element)
        step_num_elem = result.build(score.substep_number_candidate)
        assert isinstance(step_num_elem, StepNumber)

        # Build the diagram from its candidate, passing constraint_bbox if provided
        # to prevent the diagram from clustering beyond the SubAssembly boundaries
        diagram_elem = result.build(
            score.diagram_candidate, constraint_bbox=constraint_bbox
        )
        assert isinstance(diagram_elem, Diagram)

        # The bbox comes from the built elements, not the candidate's bbox.
        substep_bbox = step_num_elem.bbox.union(diagram_elem.bbox)

        log.debug(
            "[substep] Built SubStep %d",
            step_num_elem.value,
        )

        return SubStep(
            bbox=substep_bbox,
            step_number=step_num_elem,
            diagram=diagram_elem,
        )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc