19995046189

Committed 06 Dec 2025 10:18PM UTC coverage: 90.506% (+0.09%) from 90.421%

Build # 19995046189

Build Type

push

github

Committed by

bramp

Commit Message

test: regenerate golden files for step classifier refactoring

Run Details

10525 of 11629 relevant lines covered (90.51%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.87

/src/build_a_long/pdf_extract/classifier/steps/diagram_classifier.py

"""
Diagram classifier.

Purpose
-------
Identify diagram regions on instruction pages. Diagrams are any images or
drawings on the page, distinct from PartImages (which are single LEGO pieces).
Sometimes a diagram is split into multiple smaller images that are positioned
next to each other, so we cluster them together.

Heuristic
---------
- Look for Image elements on the page
- Filter out full-page images (> 95% of page area, likely backgrounds or borders)
- Each Image becomes a diagram candidate (no clustering during scoring)
- During build(), expand to include adjacent unclaimed images (lazy clustering)

Lazy Clustering
---------------
Clustering is deferred to build() time. This allows other classifiers (like
SubAssemblyClassifier) to claim images first. When build() is called:
1. Start with the candidate's source image
2. Find all adjacent/overlapping unclaimed images
3. Cluster them together into a single Diagram
4. Mark all clustered images as consumed

Re-scoring
----------
Each candidate wraps a single image. When that image conflicts with another
candidate (e.g., an arrow that claims it), re-scoring drops the diagram
candidate (``rescore_without_blocks`` returns ``None``). If the image is not
among the excluded blocks, the original candidate is returned unchanged.

Debugging
---------
Enable with `LOG_LEVEL=DEBUG` for structured logs.
"""

from __future__ import annotations

import logging
from typing import ClassVar

from build_a_long.pdf_extract.classifier.candidate import Candidate
from build_a_long.pdf_extract.classifier.classification_result import (
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.classifier.score import Score, Weight
from build_a_long.pdf_extract.extractor.bbox import (
    BBox,
    build_all_connected_clusters,
)
from build_a_long.pdf_extract.extractor.lego_page_elements import (
    Diagram,
)
from build_a_long.pdf_extract.extractor.page_blocks import (
    Image,
)

log = logging.getLogger(__name__)


class _DiagramScore(Score):
    """Internal score representation for diagram classification.

    Carries the bounding box and image count recorded for a diagram
    candidate. The weight itself is constant (see ``score()``); these fields
    are descriptive details attached to the candidate.
    """

    cluster_bbox: BBox
    """Bounding box encompassing the entire diagram cluster."""

    num_images: int
    """Number of images/drawings in this cluster."""

    def score(self) -> Weight:
        """Return the weight for this diagram candidate.

        Returns:
            Always ``1.0``; the stored fields do not influence the result.
        """
        # All diagram clusters get score of 1.0
        # Filtering happens in _score() method
        return 1.0


class DiagramClassifier(LabelClassifier):
    """Classifier for diagram regions on instruction pages.

    Scoring emits one candidate per ``Image`` block. Grouping neighbouring
    images into a single ``Diagram`` is deferred to ``build()``, so images
    that were consumed in the meantime are left out of the cluster.
    """

    output = "diagram"
    requires = frozenset(
        {
            # Arrows typically overlap diagrams - so we exclude them upfront
            "arrow",
        }
    )

    # TODO Convert to configurable parameters
    # Area filtering threshold (as ratio of page area)
    MAX_AREA_RATIO: ClassVar[float] = (
        0.95  # Filter out images > 95% of page (backgrounds/borders)
    )

    def _score(self, result: ClassificationResult) -> None:
        """Score Image blocks and create one candidate per image.

        Clustering is deferred to build() time to allow other classifiers
        to claim images first.

        Args:
            result: Classification result for the page. Its page blocks and
                non-failed "arrow" candidates are read, and one "diagram"
                candidate is added per surviving image.
        """
        page_data = result.page_data
        page_bbox = page_data.bbox
        assert page_bbox is not None

        arrow_candidates = result.get_scored_candidates(
            "arrow", valid_only=False, exclude_failed=True
        )

        # Collect the ids of every block claimed by an arrow once, instead of
        # rescanning each arrow's source_blocks list for every image.
        # Assumes block ids are unique within a page, as the id-based checks
        # elsewhere in this class also do.
        arrow_block_ids = {
            claimed.id
            for arrow in arrow_candidates
            for claimed in arrow.source_blocks
        }

        # Largest area an image may have before it is treated as a full-page
        # background/border. Comparing areas directly (rather than dividing
        # by the page area per block) is loop-invariant and cannot raise
        # ZeroDivisionError on a degenerate page bbox.
        # TODO This may not be necessary as we filter out all background
        # blocks a lot earlier
        max_image_area = self.MAX_AREA_RATIO * page_bbox.area

        # Keep Image blocks that are not part of an arrow and are not
        # (nearly) page-sized.
        image_blocks: list[Image] = [
            block
            for block in page_data.blocks
            if isinstance(block, Image)
            and block.id not in arrow_block_ids
            and block.bbox.area <= max_image_area
        ]

        if not image_blocks:
            log.debug(
                "[diagram] No image blocks found on page %s",
                page_data.page_number,
            )
            return

        log.debug(
            "[diagram] page=%s image_blocks=%d",
            page_data.page_number,
            len(image_blocks),
        )

        # Create one candidate per image (no clustering during scoring)
        for block in image_blocks:
            score_details = _DiagramScore(
                cluster_bbox=block.bbox,
                num_images=1,
            )

            candidate = Candidate(
                bbox=block.bbox,
                # Use the declared output label so the two cannot drift apart.
                label=self.output,
                score=score_details.score(),
                score_details=score_details,
                source_blocks=[block],
            )
            result.add_candidate(candidate)

    # TODO Can we delete this
    def rescore_without_blocks(
        self,
        candidate: Candidate,
        excluded_block_ids: set[int],
        result: ClassificationResult,
    ) -> Candidate | None:
        """Create a new diagram candidate excluding specified blocks.

        Since each candidate now represents a single image, if that image
        is excluded, the candidate is no longer valid.

        Args:
            candidate: The original candidate to re-score
            excluded_block_ids: Set of block IDs to exclude
            result: The classification result context (not used here)

        Returns:
            The same candidate if the image is not excluded, None otherwise.
        """
        # With single-image candidates, if the block is excluded, return None.
        # Only the first source block is inspected; a candidate with no
        # source blocks is returned unchanged.
        if (
            candidate.source_blocks
            and candidate.source_blocks[0].id in excluded_block_ids
        ):
            return None
        return candidate

    def build(self, candidate: Candidate, result: ClassificationResult) -> Diagram:
        """Construct a Diagram element with lazy clustering.

        Starting from the candidate's source image, expands to include all
        adjacent/overlapping unclaimed images, clustering them into a single
        Diagram.

        Args:
            candidate: Candidate whose single source block seeds the cluster.
                Its ``source_blocks`` is replaced with the clustered images.
            result: Classification result providing the page bbox, the page
                blocks and the set of consumed block ids.

        Returns:
            A Diagram whose bbox is the union of the clustered images,
            clipped to the page bbox.
        """
        page_bbox = result.page_data.bbox
        assert page_bbox is not None

        # Start with the candidate's source block
        assert len(candidate.source_blocks) == 1
        seed_block = candidate.source_blocks[0]
        assert isinstance(seed_block, Image)

        # Find all unclaimed images that can be clustered with this one
        clustered_blocks = self._expand_cluster(seed_block, result)

        # Calculate the combined bbox
        cluster_bbox = BBox.union_all([b.bbox for b in clustered_blocks])

        # Clip diagram bbox to page bounds
        diagram_bbox = cluster_bbox.clip_to(page_bbox)

        # Update the candidate's source_blocks to include all clustered blocks
        # This ensures they all get marked as consumed
        candidate.source_blocks = list(clustered_blocks)

        log.debug(
            "[diagram] Building diagram at %s (clustered %d images)",
            diagram_bbox,
            len(clustered_blocks),
        )

        return Diagram(bbox=diagram_bbox)

    def _expand_cluster(
        self, seed_block: Image, result: ClassificationResult
    ) -> list[Image]:
        """Expand from a seed image to include all adjacent unclaimed images.

        Gathers every Image block on the page whose id is not in the
        result's consumed set, groups them with
        ``build_all_connected_clusters`` and returns the group that contains
        the seed.

        Args:
            seed_block: The starting image block
            result: Classification result to check consumed blocks

        Returns:
            List of all images in the cluster (including seed). Falls back to
            ``[seed_block]`` when the seed is already consumed or is not
            found in any cluster.
        """
        consumed = result._consumed_blocks

        # Sorting the consumed ids is only useful for the debug message, so
        # skip that work entirely unless DEBUG logging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "[diagram] _expand_cluster: seed=%d at %s, consumed_blocks=%s",
                seed_block.id,
                seed_block.bbox,
                sorted(consumed),
            )

        # Get all unclaimed image blocks on the page
        unclaimed_images: list[Image] = []
        for block in result.page_data.blocks:
            if not isinstance(block, Image):
                continue
            # Skip if already consumed
            if block.id in consumed:
                log.debug(
                    "[diagram] Skipping consumed image id=%d at %s",
                    block.id,
                    block.bbox,
                )
                continue
            unclaimed_images.append(block)

        if seed_block not in unclaimed_images:
            # Seed was already consumed (shouldn't happen, but be safe)
            return [seed_block]

        # Build clusters from unclaimed images
        clusters = build_all_connected_clusters(unclaimed_images)

        # Find the cluster containing our seed block
        for cluster in clusters:
            if seed_block in cluster:
                return list(cluster)

        # Fallback: just return the seed block
        return [seed_block]

1	"""
2	Diagram classifier.
3
4	Purpose
5	-------
6	Identify diagram regions on instruction pages. Diagrams are any images or
7	drawings on the page, distinct from PartImages (which are single LEGO pieces).
8	Sometimes a diagram is split into multiple smaller images that are positioned
9	next to each other, so we cluster them together.
10
11	Heuristic
12	---------
13	- Look for Image elements on the page
14	- Filter out full-page images (> 95% of page area, likely backgrounds or borders)
15	- Each Image becomes a diagram candidate (no clustering during scoring)
16	- During build(), expand to include adjacent unclaimed images (lazy clustering)
17
18	Lazy Clustering
19	---------------
20	Clustering is deferred to build() time. This allows other classifiers (like
21	SubAssemblyClassifier) to claim images first. When build() is called:
22	1. Start with the candidate's source image
23	2. Find all adjacent/overlapping unclaimed images
24	3. Cluster them together into a single Diagram
25	4. Mark all clustered images as consumed
26
27	Re-scoring
28	----------
29	When a diagram's source blocks conflict with another candidate (e.g., an arrow
30	that claims part of the diagram), the diagram can be re-scored without those
31	blocks. If the remaining blocks still form a valid diagram (meets minimum area),
32	a reduced candidate is created instead of failing entirely.
33
34	Debugging
35	---------
36	Enable with `LOG_LEVEL=DEBUG` for structured logs.
37	"""
38
39	from __future__ import annotations	1✔
40
41	import logging	1✔
42	from typing import ClassVar	1✔
43
44	from build_a_long.pdf_extract.classifier.candidate import Candidate	1✔
45	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
46	ClassificationResult,
47	)
48	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
49	LabelClassifier,
50	)
51	from build_a_long.pdf_extract.classifier.score import Score, Weight	1✔
52	from build_a_long.pdf_extract.extractor.bbox import (	1✔
53	BBox,
54	build_all_connected_clusters,
55	)
56	from build_a_long.pdf_extract.extractor.lego_page_elements import (	1✔
57	Diagram,
58	)
59	from build_a_long.pdf_extract.extractor.page_blocks import (	1✔
60	Image,
61	)
62
63	log = logging.getLogger(__name__)	1✔
64
65
66	class _DiagramScore(Score):	1✔
67	"""Internal score representation for diagram classification."""
68
69	cluster_bbox: BBox	1✔
70	"""Bounding box encompassing the entire diagram cluster."""	1✔
71
72	num_images: int	1✔
73	"""Number of images/drawings in this cluster."""	1✔
74
75	def score(self) -> Weight:	1✔
76	"""Calculate final weighted score from components."""
77	# All diagram clusters get score of 1.0
78	# Filtering happens in _score() method
79	return 1.0	1✔
80
81
82	class DiagramClassifier(LabelClassifier):	1✔
83	"""Classifier for diagram regions on instruction pages."""
84
85	output = "diagram"	1✔
86	requires = frozenset(	1✔
87	{
88	# Arrows typically overlap diagrams - so we exclude them upfront
89	"arrow",
90	}
91	)
92
93	# TODO Convert to configurable parameters
94	# Area filtering threshold (as ratio of page area)
95	MAX_AREA_RATIO: ClassVar[float] = (	1✔
96	0.95 # Filter out images > 95% of page (backgrounds/borders)
97	)
98
99	def _score(self, result: ClassificationResult) -> None:	1✔
100	"""Score Image blocks and create one candidate per image.
101
102	Clustering is deferred to build() time to allow other classifiers
103	to claim images first.
104	"""
105	page_data = result.page_data	1✔
106	page_bbox = page_data.bbox	1✔
107	assert page_bbox is not None	1✔
108
109	arrow_candidates = result.get_scored_candidates(	1✔
110	"arrow", valid_only=False, exclude_failed=True
111	)
112
113	# Get all image blocks, filtering out full-page images
114	image_blocks: list[Image] = []	1✔
115	for block in page_data.blocks:	1✔
116	# Only consider Image elements
117	if not isinstance(block, Image):	1✔
118	continue	1✔
119
120	# Skip if part of an arrow's source blocks
121	if any(block in arrow.source_blocks for arrow in arrow_candidates):	1✔
122	continue	×
123
124	# Filter based on area relative to page
125	area_ratio = block.bbox.area / page_bbox.area	1✔
126
127	# Skip full-page images (> 95% of page area)
128	# These are likely borders/backgrounds
129	# TODO This may not be necessary as we filter out all background
130	# blocks a lot earlier
131	if area_ratio > self.MAX_AREA_RATIO:	1✔
132	continue	1✔
133
134	image_blocks.append(block)	1✔
135
136	if not image_blocks:	1✔
137	log.debug(	1✔
138	"[diagram] No image blocks found on page %s",
139	page_data.page_number,
140	)
141	return	1✔
142
143	log.debug(	1✔
144	"[diagram] page=%s image_blocks=%d",
145	page_data.page_number,
146	len(image_blocks),
147	)
148
149	# Create one candidate per image (no clustering during scoring)
150	for block in image_blocks:	1✔
151	score_details = _DiagramScore(	1✔
152	cluster_bbox=block.bbox,
153	num_images=1,
154	)
155
156	candidate = Candidate(	1✔
157	bbox=block.bbox,
158	label="diagram",
159	score=score_details.score(),
160	score_details=score_details,
161	source_blocks=[block],
162	)
163	result.add_candidate(candidate)	1✔
164
165	# TODO Can we delete this
166	def rescore_without_blocks(	1✔
167	self,
168	candidate: Candidate,
169	excluded_block_ids: set[int],
170	result: ClassificationResult,
171	) -> Candidate | None:
172	"""Create a new diagram candidate excluding specified blocks.
173
174	Since each candidate now represents a single image, if that image
175	is excluded, the candidate is no longer valid.
176
177	Args:
178	candidate: The original candidate to re-score
179	excluded_block_ids: Set of block IDs to exclude
180	result: The classification result context
181
182	Returns:
183	The same candidate if the image is not excluded, None otherwise.
184	"""
185	# With single-image candidates, if the block is excluded, return None
186	if (	1✔
187	candidate.source_blocks
188	and candidate.source_blocks[0].id in excluded_block_ids
189	):
190	return None	1✔
191	return candidate	×
192
193	def build(self, candidate: Candidate, result: ClassificationResult) -> Diagram:	1✔
194	"""Construct a Diagram element with lazy clustering.
195
196	Starting from the candidate's source image, expands to include all
197	adjacent/overlapping unclaimed images, clustering them into a single
198	Diagram.
199	"""
200	page_bbox = result.page_data.bbox	1✔
201	assert page_bbox is not None	1✔
202
203	# Start with the candidate's source block
204	assert len(candidate.source_blocks) == 1	1✔
205	seed_block = candidate.source_blocks[0]	1✔
206	assert isinstance(seed_block, Image)	1✔
207
208	# Find all unclaimed images that can be clustered with this one
209	clustered_blocks = self._expand_cluster(seed_block, result)	1✔
210
211	# Calculate the combined bbox
212	cluster_bbox = BBox.union_all([b.bbox for b in clustered_blocks])	1✔
213
214	# Clip diagram bbox to page bounds
215	diagram_bbox = cluster_bbox.clip_to(page_bbox)	1✔
216
217	# Update the candidate's source_blocks to include all clustered blocks
218	# This ensures they all get marked as consumed
219	candidate.source_blocks = list(clustered_blocks)	1✔
220
221	log.debug(	1✔
222	"[diagram] Building diagram at %s (clustered %d images)",
223	diagram_bbox,
224	len(clustered_blocks),
225	)
226
227	return Diagram(bbox=diagram_bbox)	1✔
228
229	def _expand_cluster(	1✔
230	self, seed_block: Image, result: ClassificationResult
231	) -> list[Image]:
232	"""Expand from a seed image to include all adjacent unclaimed images.
233
234	Uses flood-fill to find all images that are adjacent/overlapping
235	and not yet consumed by another classifier.
236
237	Args:
238	seed_block: The starting image block
239	result: Classification result to check consumed blocks
240
241	Returns:
242	List of all images in the cluster (including seed)
243	"""
244	# Get all unclaimed image blocks on the page
245	log.debug(	1✔
246	"[diagram] _expand_cluster: seed=%d at %s, consumed_blocks=%s",
247	seed_block.id,
248	seed_block.bbox,
249	sorted(result._consumed_blocks),
250	)
251	unclaimed_images: list[Image] = []	1✔
252	for block in result.page_data.blocks:	1✔
253	if not isinstance(block, Image):	1✔
254	continue	1✔
255	# Skip if already consumed
256	if block.id in result._consumed_blocks:	1✔
257	log.debug(	1✔
258	"[diagram] Skipping consumed image id=%d at %s",
259	block.id,
260	block.bbox,
261	)
262	continue	1✔
263	unclaimed_images.append(block)	1✔
264
265	if seed_block not in unclaimed_images:	1✔
266	# Seed was already consumed (shouldn't happen, but be safe)
267	return [seed_block]	×
268
269	# Build clusters from unclaimed images
270	clusters = build_all_connected_clusters(unclaimed_images)	1✔
271
272	# Find the cluster containing our seed block
273	for cluster in clusters:	1✔
274	if seed_block in cluster:	1✔
275	return list(cluster)	1✔
276
277	# Fallback: just return the seed block
278	return [seed_block]	×

bramp / build-along / 19995046189

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous