19724286935

Committed 27 Nov 2025 03:27AM UTC coverage: 89.068% (-2.2%) from 91.307%

Build # 19724286935

Build Type

push

github

Committed by

bramp

Commit Message

test(pdf_extract:classifier): Refactor tests to use fixtures and remove integration test

This commit refactors the classifier test suite to improve maintainability and isolation.

Key changes:
- **Fixtures:** Introduced `conftest.py` with `classifier` and `candidate_factory` fixtures to streamline test setup and candidate creation.
- **Unit Test Conversion:** Updated `parts_classifier_test.py`, `step_classifier_test.py`, `page_number_classifier_test.py`, `part_count_classifier_test.py`, and `step_number_classifier_test.py` to use these fixtures, removing dependency on `classify_elements` and making them true unit tests.
- **Integration Test Removal:** Deleted `test_font_size_integration.py` (and its temporary rename `font_size_scoring_test.py`) as its logic has been moved into the respective classifier unit tests.
- **Cleanup:** Removed debug prints and ensured strict type checking compliance.

Run Details

382 of 389 new or added lines in 9 files covered. (98.2%)

291 existing lines in 28 files now uncovered.

7585 of 8516 relevant lines covered (89.07%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.76

/src/build_a_long/pdf_extract/classifier/step_classifier.py

"""
Step classifier.

Purpose
-------
Identify complete Step structures by combining step_number, parts_list, and diagram
elements. A Step represents a single building instruction comprising:
- A StepNumber label
- An optional PartsList (the parts needed for this step)
- A Diagram (the main instruction graphic showing what to build)

We look for step_numbers and attempt to pair them with nearby parts_lists and
identify the appropriate diagram region for each step.

Debugging
---------
Set environment variables to aid investigation without code changes:

- LOG_LEVEL=DEBUG
    Enables DEBUG-level logging (if not already configured by caller).
"""

import logging
from dataclasses import dataclass

from build_a_long.pdf_extract.classifier.classification_result import (
    Candidate,
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.classifier.text_extractors import (
    extract_step_number_value,
)
from build_a_long.pdf_extract.extractor.bbox import BBox
from build_a_long.pdf_extract.extractor.lego_page_elements import (
    Diagram,
    LegoPageElements,
    PartsList,
    Step,
    StepNumber,
)
from build_a_long.pdf_extract.extractor.page_blocks import Text

log = logging.getLogger(__name__)


@dataclass
class _StepScore:
    """Score details attached to one candidate Step pairing.

    Bundles the StepNumber candidate, the PartsList candidate it was paired
    with (if any) and the sub-scores computed for that pairing.
    """

    step_number_candidate: Candidate
    """StepNumber candidate that anchors this step."""

    parts_list_candidate: Candidate | None
    """PartsList candidate paired with the step number, or None."""

    has_parts_list: bool
    """True when a PartsList candidate is paired with this step."""

    step_proximity_score: float
    """Vertical closeness to the PartsList above (intended range 0.0-1.0).
    Higher means closer; 0.0 when far away or when there is no parts list."""

    step_alignment_score: float
    """Left-edge alignment with the PartsList above (intended range 0.0-1.0).
    1.0 means perfectly aligned; 0.0 when misaligned or with no parts list."""

    diagram_area: float
    """Area of the estimated diagram region."""

    def pairing_score(self) -> float:
        """Return the mean of the proximity and alignment scores.

        A step without a parts list always scores 0.0.
        """
        if self.has_parts_list:
            return (self.step_proximity_score + self.step_alignment_score) / 2.0
        return 0.0

    def sort_key(self) -> tuple[float, int]:
        """Return the key used to order step candidates (ascending sort).

        The key is ``(-pairing_score, step_value)`` so that:
        1. Better StepNumber-PartsList pairings sort first.
        2. Ties are broken by the lower step number value.

        The step value is read from the first source block of the step number
        candidate; it falls back to 0 when that block is missing, is not a
        Text block, or does not yield a step number.
        """
        tie_breaker = 0  # Fallback when no step number value can be read

        # Only the first source block is consulted for the step number text
        blocks = self.step_number_candidate.source_blocks
        if blocks and isinstance(blocks[0], Text):
            parsed_value = extract_step_number_value(blocks[0].text)
            if parsed_value is not None:
                tie_breaker = parsed_value

        return (-self.pairing_score(), tie_breaker)


@dataclass(frozen=True)
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures.

    Scoring pairs every "step_number" candidate with every "parts_list"
    candidate (plus one pairing without a parts list) and then greedily keeps
    the best pairing per step number value. Construction turns each kept
    candidate into a Step made of a StepNumber, a PartsList and a Diagram.
    """

    outputs = frozenset({"step"})
    requires = frozenset({"step_number", "parts_list"})

    def score(self, result: ClassificationResult) -> None:
        """Score step pairings and create candidates WITHOUT construction.

        Args:
            result: Classification result that supplies the "step_number" and
                "parts_list" candidates and receives the "step" candidates.
        """
        page_data = result.page_data

        # Get step number candidates (not constructed elements)
        step_candidates = result.get_scored_candidates(
            "step_number", valid_only=False, exclude_failed=True
        )

        if not step_candidates:
            return

        # Get parts_list candidates
        parts_list_candidates = result.get_scored_candidates(
            "parts_list",
            valid_only=False,
            exclude_failed=True,
        )

        log.debug(
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
            page_data.page_number,
            len(step_candidates),
            len(parts_list_candidates),
        )

        # Create all possible Step candidates for pairings
        all_candidates: list[Candidate] = []
        for step_candidate in step_candidates:
            # Pair with every PartsList first and with "no PartsList" (None)
            # last. The sort in _deduplicate_candidates is stable, so this
            # creation order decides which of several equal-scored pairings
            # of the same step is tried first.
            for parts_list_candidate in (*parts_list_candidates, None):
                candidate = self._create_step_candidate(
                    step_candidate, parts_list_candidate, result
                )
                if candidate is not None:
                    all_candidates.append(candidate)

        # Greedily select the best candidates (deduplication)
        deduplicated_candidates = self._deduplicate_candidates(all_candidates)

        # Add the deduplicated candidates to the result
        for candidate in deduplicated_candidates:
            result.add_candidate("step", candidate)

        log.debug(
            "[step] Created %d deduplicated step candidates (from %d possibilities)",
            len(deduplicated_candidates),
            len(all_candidates),
        )

    def construct(self, result: ClassificationResult) -> None:
        """Construct Step elements from candidates.

        Construction is best-effort: a failure for one candidate is recorded
        on that candidate's ``failure_reason`` and does not stop the others.

        Args:
            result: Classification result holding the "step" candidates.
        """
        for candidate in result.get_candidates("step"):
            try:
                candidate.constructed = self.construct_candidate(candidate, result)
            except Exception as e:
                # Some exceptions carry no message (str(e) == ""); fall back
                # to the exception type so the recorded reason is never empty.
                candidate.failure_reason = str(e) or type(e).__name__
                log.debug(
                    "[step] Failed to construct step candidate: %s",
                    candidate.failure_reason,
                    exc_info=True,
                )

    def construct_candidate(
        self, candidate: Candidate, result: ClassificationResult
    ) -> LegoPageElements:
        """Construct a Step element from a single candidate.

        Args:
            candidate: A "step" candidate whose score_details is a _StepScore
            result: Classification result used to construct the parent
                StepNumber / PartsList candidates and to locate the diagram

        Returns:
            The constructed Step. When the candidate has no parts list, the
            Step receives an empty PartsList placed at the step number's bbox.

        Raises:
            AssertionError: If the score details or a constructed parent
                element has an unexpected type.
        """
        score = candidate.score_details
        assert isinstance(score, _StepScore), (
            f"expected _StepScore score_details, got {type(score).__name__}"
        )

        # Validate and extract step number from parent candidate
        step_num = result.construct_candidate(score.step_number_candidate)
        assert isinstance(step_num, StepNumber), (
            f"expected StepNumber, got {type(step_num).__name__}"
        )

        # Validate and extract parts list from parent candidate (if present)
        parts_list: PartsList | None = None
        if score.parts_list_candidate is not None:
            parts_list_elem = result.construct_candidate(score.parts_list_candidate)
            assert isinstance(parts_list_elem, PartsList), (
                f"expected PartsList, got {type(parts_list_elem).__name__}"
            )
            parts_list = parts_list_elem

        # Identify diagram region
        diagram_bbox = self._identify_diagram_region(
            step_num.bbox,
            parts_list.bbox if parts_list is not None else None,
            result,
        )

        # Build Step
        diagram = Diagram(bbox=diagram_bbox)
        return Step(
            bbox=self._compute_step_bbox(step_num, parts_list, diagram),
            step_number=step_num,
            parts_list=(
                parts_list
                if parts_list is not None
                else PartsList(bbox=step_num.bbox, parts=[])
            ),
            diagram=diagram,
        )

    def _create_step_candidate(
        self,
        step_candidate: Candidate,
        parts_list_candidate: Candidate | None,
        result: ClassificationResult,
    ) -> Candidate | None:
        """Create a Step candidate WITHOUT construction.

        Proximity and alignment are only scored when the parts list sits above
        the step number (its bottom edge is at most ABOVE_EPS below the step
        number's top edge); otherwise both scores stay 0.0.

        Args:
            step_candidate: The StepNumber candidate for this step
            parts_list_candidate: The PartsList candidate to pair with (or None)
            result: Classification result

        Returns:
            The created Candidate with score but no construction. This
            implementation always returns a Candidate; the optional return
            type is kept so existing None checks in callers stay valid.
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        step_bbox = step_candidate.bbox
        parts_list_bbox = (
            parts_list_candidate.bbox if parts_list_candidate is not None else None
        )

        # Calculate pairing scores if there's a parts_list above the step
        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list_bbox is not None
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
        ):
            # The "above" check tolerates up to ABOVE_EPS of overlap, so the
            # raw gap can be slightly negative. Treat overlap as zero distance
            # so the proximity score stays within its 0.0-1.0 range.
            distance = max(0.0, step_bbox.y0 - parts_list_bbox.y1)

            # Calculate proximity score (distance relative to step height)
            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            if max_distance > 0:
                proximity_score = max(0.0, 1.0 - (distance / max_distance))

            # Calculate alignment score (how well left edges align)
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
            if max_alignment_diff > 0:
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))

        # Estimate diagram bbox for scoring purposes
        diagram_bbox = self._identify_diagram_region(step_bbox, parts_list_bbox, result)

        # Create score object with candidate references
        score = _StepScore(
            step_number_candidate=step_candidate,
            parts_list_candidate=parts_list_candidate,
            has_parts_list=parts_list_candidate is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
            diagram_area=diagram_bbox.area,
        )

        # Calculate combined bbox for the candidate
        bboxes = [step_bbox, diagram_bbox]
        if parts_list_bbox is not None:
            bboxes.append(parts_list_bbox)
        combined_bbox = BBox.union_all(bboxes)

        # Create candidate WITHOUT construction
        return Candidate(
            bbox=combined_bbox,
            label="step",
            score=score.pairing_score(),
            score_details=score,
            constructed=None,
            source_blocks=[],
            failure_reason=None,
        )

    def _identify_diagram_region(
        self,
        step_bbox: BBox,
        parts_list_bbox: BBox | None,
        result: ClassificationResult,
    ) -> BBox:
        """Identify the diagram region for a step.

        Placeholder heuristic: the region starts at the step number's left
        edge, begins below the step number (and below the parts list, when
        that reaches lower), and extends to the page's right and bottom edges.

        Args:
            step_bbox: The step number bbox
            parts_list_bbox: The associated parts list bbox (if any)
            result: Classification result containing page_data

        Returns:
            BBox representing the diagram region

        Raises:
            AssertionError: If the page has no bbox.
        """
        # TODO: Find actual drawing elements and use their bounds
        page_bbox = result.page_data.bbox
        assert page_bbox is not None, "page_data.bbox is required"

        # Start just below the step number
        top = step_bbox.y1

        # If there's a parts list, the diagram should be below it as well
        if parts_list_bbox is not None:
            top = max(top, parts_list_bbox.y1)

        # Use the rest of the page width and height as a simple approximation
        return BBox(x0=step_bbox.x0, y0=top, x1=page_bbox.x1, y1=page_bbox.y1)

    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Compute the overall bounding box for the Step.

        This encompasses the step number, parts list (if any), and diagram.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Combined bounding box
        """
        bboxes = [step_num.bbox, diagram.bbox]
        if parts_list is not None:
            bboxes.append(parts_list.bbox)

        return BBox.union_all(bboxes)

    @staticmethod
    def _read_step_value(step_num_candidate: Candidate) -> int | None:
        """Read the step number value from a StepNumber candidate.

        Only the candidate's first source block is consulted.

        Returns:
            The extracted value, or None when the candidate has no source
            blocks, the first block is not a Text block, or no value could be
            extracted from its text.
        """
        blocks = step_num_candidate.source_blocks
        if not blocks:
            return None
        first_block = blocks[0]
        if not isinstance(first_block, Text):
            return None
        return extract_step_number_value(first_block.text)

    def _deduplicate_candidates(self, candidates: list[Candidate]) -> list[Candidate]:
        """Greedily select the best Step candidates.

        Candidates are visited in ``_StepScore.sort_key`` order. Each step
        number value is selected at most once, and each PartsList candidate
        that has parts is claimed by at most one selected step. PartsList
        candidates without parts are not claimed and may be shared.
        Candidates whose step number value cannot be read are dropped.

        Args:
            candidates: All possible Step candidates

        Returns:
            Deduplicated list of Step candidates
        """
        # Sort candidates by score (highest first); sorted() is stable, so
        # equal keys keep their creation order.
        sorted_candidates = sorted(
            candidates,
            key=lambda c: c.score_details.sort_key(),
        )

        # Track which StepNumber values and PartsLists have been used
        used_step_values: set[int] = set()
        used_parts_list_ids: set[int] = set()
        selected: list[Candidate] = []

        # Greedily select winners
        for candidate in sorted_candidates:
            # Get step info from score_details (candidates not yet constructed)
            assert isinstance(candidate.score_details, _StepScore)
            score = candidate.score_details

            step_value = self._read_step_value(score.step_number_candidate)
            if step_value is None:
                continue

            # Skip if this step number value is already used
            if step_value in used_step_values:
                log.debug(
                    "[step] Skipping candidate for step %d - value already used",
                    step_value,
                )
                continue

            # Skip if this parts_list is already used (if it has parts)
            parts_list_candidate = score.parts_list_candidate
            if parts_list_candidate is not None:
                # A parts list "has parts" when its score details expose a
                # non-empty part_candidates collection.
                has_parts = bool(
                    getattr(parts_list_candidate.score_details, "part_candidates", None)
                )

                if has_parts:
                    # Candidates are tracked by object identity
                    parts_list_id = id(parts_list_candidate)
                    if parts_list_id in used_parts_list_ids:
                        log.debug(
                            "[step] Skipping candidate for step %d - "
                            "PartsList candidate already used",
                            step_value,
                        )
                        continue
                    # Claim this parts_list
                    used_parts_list_ids.add(parts_list_id)

            # Select this candidate
            selected.append(candidate)
            used_step_values.add(step_value)

            log.debug(
                "[step] Selected step %d (parts_list=%s, pairing_score=%.2f)",
                step_value,
                "yes" if parts_list_candidate is not None else "no",
                score.pairing_score(),
            )

        return selected

1	"""
2	Step classifier.
3
4	Purpose
5	-------
6	Identify complete Step structures by combining step_number, parts_list, and diagram
7	elements. A Step represents a single building instruction comprising:
8	- A StepNumber label
9	- An optional PartsList (the parts needed for this step)
10	- A Diagram (the main instruction graphic showing what to build)
11
12	We look for step_numbers and attempt to pair them with nearby parts_lists and
13	identify the appropriate diagram region for each step.
14
15	Debugging
16	---------
17	Set environment variables to aid investigation without code changes:
18
19	- LOG_LEVEL=DEBUG
20	Enables DEBUG-level logging (if not already configured by caller).
21	"""
22
23	import logging	1✔
24	from dataclasses import dataclass	1✔
25
26	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
27	Candidate,
28	ClassificationResult,
29	)
30	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
31	LabelClassifier,
32	)
33	from build_a_long.pdf_extract.classifier.text_extractors import (	1✔
34	extract_step_number_value,
35	)
36	from build_a_long.pdf_extract.extractor.bbox import BBox	1✔
37	from build_a_long.pdf_extract.extractor.lego_page_elements import (	1✔
38	Diagram,
39	LegoPageElements,
40	PartsList,
41	Step,
42	StepNumber,
43	)
44	from build_a_long.pdf_extract.extractor.page_blocks import Text	1✔
45
46	log = logging.getLogger(__name__)	1✔
47
48
49	@dataclass	1✔
50	class _StepScore:	1✔
51	"""Internal score representation for step classification."""
52
53	step_number_candidate: Candidate
54	"""The step number candidate this step is associated with."""	1✔
55
56	parts_list_candidate: Candidate \| None
57	"""The parts list candidate paired with this step (if any)."""	1✔
58
59	has_parts_list: bool
60	"""Whether this step has an associated parts list."""	1✔
61
62	step_proximity_score: float
63	"""Score based on proximity to the PartsList above (0.0-1.0).	1✔
64	1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""
65
66	step_alignment_score: float
67	"""Score based on left-edge alignment with PartsList above (0.0-1.0).	1✔
68	1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""
69
70	diagram_area: float
71	"""Area of the diagram region."""	1✔
72
73	def pairing_score(self) -> float:	1✔
74	"""Calculate pairing quality score (average of proximity and alignment)."""
75	if not self.has_parts_list:	1✔
76	return 0.0	1✔
77	return (self.step_proximity_score + self.step_alignment_score) / 2.0	1✔
78
79	def sort_key(self) -> tuple[float, int]:	1✔
80	"""Return a tuple for sorting candidates.
81
82	We prefer:
83	1. Higher pairing scores (better StepNumber-PartsList match)
84	2. Lower step number values (to break ties and maintain order)
85	"""
86	# Extract step number value from candidate's source block
87	step_num_candidate = self.step_number_candidate	1✔
88
89	# Assume single source block for step number
90	if step_num_candidate.source_blocks and isinstance(	1✔
91	step_num_candidate.source_blocks[0], Text
92	):
93	text_block = step_num_candidate.source_blocks[0]	1✔
94	step_value = extract_step_number_value(text_block.text)	1✔
95	if step_value is not None:	1✔
96	return (-self.pairing_score(), step_value)	1✔
97
UNCOV 98	return (-self.pairing_score(), 0) # Fallback if value cannot be extracted	×
99
100
101	@dataclass(frozen=True)	1✔
102	class StepClassifier(LabelClassifier):	1✔
103	"""Classifier for complete Step structures."""
104
105	outputs = frozenset({"step"})	1✔
106	requires = frozenset({"step_number", "parts_list"})	1✔
107
108	def score(self, result: ClassificationResult) -> None:	1✔
109	"""Score step pairings and create candidates WITHOUT construction."""
110	page_data = result.page_data	1✔
111
112	# Get step number and parts list candidates (not constructed elements)
113	step_candidates = result.get_scored_candidates(	1✔
114	"step_number", valid_only=False, exclude_failed=True
115	)
116
117	if not step_candidates:	1✔
118	return	1✔
119
120	# Get parts_list candidates
121	parts_list_candidates = result.get_scored_candidates(	1✔
122	"parts_list",
123	valid_only=False,
124	exclude_failed=True,
125	)
126
127	log.debug(	1✔
128	"[step] page=%s step_candidates=%d parts_list_candidates=%d",
129	page_data.page_number,
130	len(step_candidates),
131	len(parts_list_candidates),
132	)
133
134	# Create all possible Step candidates for pairings
135	all_candidates: list[Candidate] = []	1✔
136	for step_candidate in step_candidates:	1✔
137	# Create candidates for this StepNumber paired with each PartsList
138	for parts_list_candidate in parts_list_candidates:	1✔
139	candidate = self._create_step_candidate(	1✔
140	step_candidate, parts_list_candidate, result
141	)
142	if candidate:	1✔
143	all_candidates.append(candidate)	1✔
144
145	# Also create a candidate with no PartsList (fallback)
146	candidate = self._create_step_candidate(step_candidate, None, result)	1✔
147	if candidate:	1✔
148	all_candidates.append(candidate)	1✔
149
150	# Greedily select the best candidates (deduplication)
151	deduplicated_candidates = self._deduplicate_candidates(all_candidates)	1✔
152
153	# Add the deduplicated candidates to the result
154	for candidate in deduplicated_candidates:	1✔
155	result.add_candidate("step", candidate)	1✔
156
157	log.debug(	1✔
158	"[step] Created %d deduplicated step candidates (from %d possibilities)",
159	len(deduplicated_candidates),
160	len(all_candidates),
161	)
162
163	def construct(self, result: ClassificationResult) -> None:	1✔
164	"""Construct Step elements from candidates."""
UNCOV 165	candidates = result.get_candidates("step")	×
UNCOV 166	for candidate in candidates:	×
167	try:	×
UNCOV 168	elem = self.construct_candidate(candidate, result)	×
UNCOV 169	candidate.constructed = elem	×
UNCOV 170	except Exception as e:	×
UNCOV 171	candidate.failure_reason = str(e)	×
172
173	def construct_candidate(	1✔
174	self, candidate: Candidate, result: ClassificationResult
175	) -> LegoPageElements:
176	"""Construct a Step element from a single candidate."""
177	score = candidate.score_details	1✔
178	assert isinstance(score, _StepScore)	1✔
179
180	# Validate and extract step number from parent candidate
181	step_num_candidate = score.step_number_candidate	1✔
182
183	step_num_elem = result.construct_candidate(step_num_candidate)	1✔
184	assert isinstance(step_num_elem, StepNumber)	1✔
185	step_num = step_num_elem	1✔
186
187	# Validate and extract parts list from parent candidate (if present)
188	parts_list = None	1✔
189	if score.parts_list_candidate:	1✔
190	parts_list_candidate = score.parts_list_candidate	1✔
191	parts_list_elem = result.construct_candidate(parts_list_candidate)	1✔
192	assert isinstance(parts_list_elem, PartsList)	1✔
193	parts_list = parts_list_elem	1✔
194
195	# Identify diagram region
196	diagram_bbox = self._identify_diagram_region(	1✔
197	step_num.bbox, parts_list.bbox if parts_list else None, result
198	)
199
200	# Build Step
201	diagram = Diagram(bbox=diagram_bbox)	1✔
202	return Step(	1✔
203	bbox=self._compute_step_bbox(step_num, parts_list, diagram),
204	step_number=step_num,
205	parts_list=parts_list or PartsList(bbox=step_num.bbox, parts=[]),
206	diagram=diagram,
207	)
208
209	def _create_step_candidate(	1✔
210	self,
211	step_candidate: Candidate,
212	parts_list_candidate: Candidate \| None,
213	result: ClassificationResult,
214	) -> Candidate \| None:
215	"""Create a Step candidate WITHOUT construction.
216
217	Args:
218	step_candidate: The StepNumber candidate for this step
219	parts_list_candidate: The PartsList candidate to pair with (or None)
220	result: Classification result
221
222	Returns:
223	The created Candidate with score but no construction
224	"""
225	ABOVE_EPS = 2.0 # Small epsilon for "above" check	1✔
226	ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0 # Max horizontal offset	1✔
227	DISTANCE_THRESHOLD_MULTIPLIER = 1.0 # Max vertical distance	1✔
228
229	step_bbox = step_candidate.bbox	1✔
230	parts_list_bbox = parts_list_candidate.bbox if parts_list_candidate else None	1✔
231
232	# Calculate pairing scores if there's a parts_list above the step
233	proximity_score = 0.0	1✔
234	alignment_score = 0.0	1✔
235
236	if (	1✔
237	parts_list_bbox is not None
238	and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
239	):
240	# Calculate distance (how far apart vertically)
241	distance = step_bbox.y0 - parts_list_bbox.y1	1✔
242
243	# Calculate proximity score
244	max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER	1✔
245	if max_distance > 0:	1✔
246	proximity_score = max(0.0, 1.0 - (distance / max_distance))	1✔
247
248	# Calculate alignment score (how well left edges align)
249	max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER	1✔
250	left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)	1✔
251	if max_alignment_diff > 0:	1✔
252	alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))	1✔
253
254	# Estimate diagram bbox for scoring purposes
255	diagram_bbox = self._identify_diagram_region(step_bbox, parts_list_bbox, result)	1✔
256
257	# Create score object with candidate references
258	score = _StepScore(	1✔
259	step_number_candidate=step_candidate,
260	parts_list_candidate=parts_list_candidate,
261	has_parts_list=parts_list_candidate is not None,
262	step_proximity_score=proximity_score,
263	step_alignment_score=alignment_score,
264	diagram_area=diagram_bbox.area,
265	)
266
267	# Calculate combined bbox for the candidate
268	bboxes = [step_bbox, diagram_bbox]	1✔
269	if parts_list_bbox:	1✔
270	bboxes.append(parts_list_bbox)	1✔
271	combined_bbox = BBox.union_all(bboxes)	1✔
272
273	# Create candidate WITHOUT construction
274	return Candidate(	1✔
275	bbox=combined_bbox,
276	label="step",
277	score=score.pairing_score(),
278	score_details=score,
279	constructed=None,
280	source_blocks=[],
281	failure_reason=None,
282	)
283
284	def _identify_diagram_region(	1✔
285	self,
286	step_bbox: BBox,
287	parts_list_bbox: BBox \| None,
288	result: ClassificationResult,
289	) -> BBox:
290	"""Identify the diagram region for a step.
291
292	The diagram is typically the large area below the step number and parts list.
293	For now, we create a simple heuristic-based region.
294
295	Args:
296	step_bbox: The step number bbox
297	parts_list_bbox: The associated parts list bbox (if any)
298	result: Classification result containing page_data
299
300	Returns:
301	BBox representing the diagram region
302	"""
303	page_data = result.page_data	1✔
304	# Simple heuristic: use the step number's bbox as a starting point
305	# In the future, we should look for actual drawing elements below the step
306
307	# Start with step number position
308	x0 = step_bbox.x0	1✔
309	y0 = step_bbox.y1 # Below the step number	1✔
310
311	# If there's a parts list, the diagram should be below it
312	if parts_list_bbox:	1✔
313	y0 = max(y0, parts_list_bbox.y1)	1✔
314
315	# Extend to a reasonable area (placeholder logic)
316	# TODO: Find actual drawing elements and use their bounds
317	page_bbox = page_data.bbox	1✔
318	assert page_bbox is not None	1✔
319
320	# Use the rest of the page width and height as a simple approximation
321	x1 = page_bbox.x1	1✔
322	y1 = page_bbox.y1	1✔
323
324	# Create a bbox for the diagram region
325	return BBox(x0=x0, y0=y0, x1=x1, y1=y1)	1✔
326
327	def _compute_step_bbox(	1✔
328	self,
329	step_num: StepNumber,
330	parts_list: PartsList \| None,
331	diagram: Diagram,
332	) -> BBox:
333	"""Compute the overall bounding box for the Step.
334
335	This encompasses the step number, parts list (if any), and diagram.
336
337	Args:
338	step_num: The step number element
339	parts_list: The parts list (if any)
340	diagram: The diagram element
341
342	Returns:
343	Combined bounding box
344	"""
345	bboxes = [step_num.bbox, diagram.bbox]	1✔
346	if parts_list:	1✔
347	bboxes.append(parts_list.bbox)	1✔
348
349	return BBox.union_all(bboxes)	1✔
350
351	def _deduplicate_candidates(self, candidates: list[Candidate]) -> list[Candidate]:	1✔
352	"""Greedily select the best Step candidates.
353
354	Ensures each StepNumber value and each PartsList is used at most once.
355
356	Args:
357	candidates: All possible Step candidates
358
359	Returns:
360	Deduplicated list of Step candidates
361	"""
362	# Sort candidates by score (highest first)
363	sorted_candidates = sorted(	1✔
364	candidates,
365	key=lambda c: c.score_details.sort_key(),
366	)
367
368	# Track which StepNumber values and PartsLists have been used
369	used_step_values: set[int] = set()	1✔
370	used_parts_list_ids: set[int] = set()	1✔
371	selected: list[Candidate] = []	1✔
372
373	# Greedily select winners
374	for candidate in sorted_candidates:	1✔
375	# Get step info from score_details (candidates not yet constructed)
376	assert isinstance(candidate.score_details, _StepScore)	1✔
377	score = candidate.score_details	1✔
378
379	# Extract step number value from parent candidate source block
380	step_num_candidate = score.step_number_candidate	1✔
381
382	# Extract step value from text block
383	if not step_num_candidate.source_blocks:	1✔
UNCOV 384	continue	×
385	text_block = step_num_candidate.source_blocks[0]	1✔
386	if not isinstance(text_block, Text):	1✔
UNCOV 387	continue	×
388
389	step_value = extract_step_number_value(text_block.text)	1✔
390	if step_value is None:	1✔
UNCOV 391	continue	×
392
393	# Extract parts list from parent candidate (if present)
394	parts_list_candidate = score.parts_list_candidate	1✔
395
396	# Skip if this step number value is already used
397	if step_value in used_step_values:	1✔
398	log.debug(	1✔
399	"[step] Skipping candidate for step %d - value already used",
400	step_value,
401	)
402	continue	1✔
403
404	# Skip if this parts_list is already used (if it has parts)
405	if parts_list_candidate is not None:	1✔
406	# Check if parts list has parts (look at its score details)
407	has_parts = False	1✔
408	if hasattr(parts_list_candidate.score_details, "part_candidates"):	1✔
409	has_parts = (	1✔
410	len(parts_list_candidate.score_details.part_candidates) > 0
411	)
412
413	if has_parts:	1✔
414	parts_list_id = id(parts_list_candidate)	1✔
415	if parts_list_id in used_parts_list_ids:	1✔
416	log.debug(	1✔
417	"[step] Skipping candidate for step %d - "
418	"PartsList candidate already used",
419	step_value,
420	)
421	continue	1✔
422	# Claim this parts_list
423	used_parts_list_ids.add(parts_list_id)	1✔
424
425	# Select this candidate
426	selected.append(candidate)	1✔
427	used_step_values.add(step_value)	1✔
428
429	log.debug(	1✔
430	"[step] Selected step %d (parts_list=%s, pairing_score=%.2f)",
431	step_value,
432	"yes" if parts_list_candidate is not None else "no",
433	score.pairing_score(),
434	)
435
436	return selected	1✔

bramp / build-along / 19724286935

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous