19155446196

Committed 06 Nov 2025 04:44AM UTC coverage: 85.36% (-0.02%) from 85.381%

Build # 19155446196

Build Type

push

github

Committed by

bramp

Commit Message

refactor: complete Block rename and terminology cleanup

Renamed page_elements.py → page_blocks.py and systematically updated all
references to use 'block' terminology for raw PDF primitives throughout
the codebase.

Key changes:
- Renamed Element class → Block in page_blocks.py
- Updated all imports and type references across 40+ files
- Renamed internal variables and method parameters:
  - _element_winners → _block_winners
  - _validate_element_in_page_data() → _validate_block_in_page_data()
  - element_to_labels → block_to_labels
  - total_elements → total_blocks
  - And many more variable renames in main.py, tests, and classifiers
- Updated all docstrings, comments, and error messages
- Updated JSON fixtures to use 'blocks' instead of 'elements'
- Updated documentation (README files)

Terminology is now consistent:
- Block = raw PDF primitive (Text, Image, Drawing from pymupdf)
- Element = LEGO semantic component (Part, StepNumber, PartsList, etc.)

All 20 tests passing.

Run Details

472 of 535 new or added lines in 34 files covered. (88.22%)

6 existing lines in 3 files now uncovered.

4064 of 4761 relevant lines covered (85.36%)

0.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.42

/src/build_a_long/pdf_extract/classifier/classification_result.py

"""
Data classes for the classifier.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any

from dataclass_wizard import JSONPyWizard

from build_a_long.pdf_extract.extractor.bbox import BBox
from build_a_long.pdf_extract.extractor.extractor import PageData
from build_a_long.pdf_extract.extractor.lego_page_elements import LegoPageElement
from build_a_long.pdf_extract.extractor.page_blocks import Block

# Key type used by the mapping that get_scores_for_label() is annotated to
# return: either a single Block, or a tuple of Blocks when a score describes a
# pairing of several blocks.
ScoreKey = Block | tuple[Block, ...]


@dataclass
class RemovalReason(JSONPyWizard):
    """Records why a block was removed during classification.

    Instances are stored per block by ClassificationResult.mark_removed() and
    handed back by ClassificationResult.get_removal_reason().
    """

    reason_type: str
    """Type of removal: 'child_bbox' or 'similar_bbox'"""

    target_block: Block
    """The block that caused this removal"""


@dataclass
class Candidate(JSONPyWizard):
    """A candidate block with its score and constructed LegoPageElement.

    Represents a single block that was considered for a particular label,
    including its score, the constructed LegoPageElement (if successful),
    and information about why it succeeded or failed.

    This enables:
    - Re-evaluation with hints (exclude specific candidates)
    - Debugging (see all candidates and why they won/lost)
    - UI support (show users alternatives)
    """

    bbox: BBox
    """The bounding box for this candidate (from source_block or constructed)"""

    label: str
    """The label this candidate would have (e.g., 'page_number')"""

    # TODO Maybe score is redundant with score_details?
    score: float
    """Combined score (0.0-1.0)"""

    score_details: Any
    """The detailed score object (e.g., _PageNumberScore)"""

    constructed: LegoPageElement | None
    """The constructed LegoElement if parsing succeeded, None if failed"""

    source_block: Block | None = None
    """The raw element that was scored (None for synthetic elements like Step)"""

    failure_reason: str | None = None
    """Why construction failed, if it did"""

    is_winner: bool = False
    """Whether this candidate was selected as the winner.
    
    This field is set by mark_winner() and is used for:
    - Querying winners (get_label, get_blocks_by_label, has_label)
    - Synthetic candidates (which have no source_block and can't be tracked
      in _block_winners)
    - JSON serialization and golden file comparisons
    
    Note: For candidates with source_block, this is redundant with
    _block_winners, but provides convenient access and handles synthetic
    candidates.
    """


@dataclass
class ClassifierConfig(JSONPyWizard):
    """Thresholds and scoring weights used by the classifier.

    Every field value is checked on construction and must not be negative.
    """

    # TODO Not sure what this value is used for
    min_confidence_threshold: float = 0.5

    # Page-number scoring parameters.
    page_number_text_weight: float = 0.7
    page_number_position_weight: float = 0.3
    page_number_position_scale: float = 50.0
    page_number_page_value_weight: float = 1.0

    # Step-number scoring parameters.
    step_number_text_weight: float = 0.8
    step_number_size_weight: float = 0.2

    def __post_init__(self) -> None:
        """Reject any negative field value.

        Raises:
            ValueError: If at least one instance attribute is below zero.
        """
        # vars(self) covers every instance attribute, not only the *_weight ones.
        if any(value < 0 for value in vars(self).values()):
            raise ValueError("All weights must be greater than or equal to 0.")


@dataclass
class ClassificationResult(JSONPyWizard):
    """Represents the outcome of a single classification run.

    This class encapsulates the results of block classification, including
    labels, scores, and removal information. The candidates field is the
    primary source of truth for classification results, containing all scored
    candidates, their constructed LegoPageElements, and winner information.

    ClassificationResult is passed through the classifier pipeline, with each
    classifier adding its candidates and marking winners. This allows later
    classifiers to query the current state and make decisions based on earlier
    results.

    External code should use the accessor methods rather than accessing internal
    fields directly to maintain encapsulation.
    """

    page_data: PageData
    """The original page data being classified"""

    # Warning messages appended by add_warning(); get_warnings() returns a copy.
    _warnings: list[str] = field(default_factory=list)

    _removal_reasons: dict[int, RemovalReason] = field(default_factory=dict)
    """Maps block IDs (block.id, not id(block)) to the reason they were removed.
    
    Keys are block IDs (int) instead of Block objects to ensure JSON serializability
    and consistency with _constructed_elements.
    """

    _constructed_elements: dict[int, LegoPageElement] = field(default_factory=dict)
    """Maps source block IDs to their constructed LegoPageElements.
    
    Only contains elements that were successfully labeled and constructed.
    The builder should use these pre-constructed elements rather than
    re-parsing the source blocks.
    
    Keys are block IDs (int) instead of Block objects to ensure JSON serializability.
    """

    _candidates: dict[str, list[Candidate]] = field(default_factory=dict)
    """Maps label names to lists of all candidates considered for that label.
    
    Each candidate includes:
    - The source element
    - Its score and score details
    - The constructed LegoPageElement (if successful)
    - Failure reason (if construction failed)
    - Whether it was the winner
    
    This enables:
    - Re-evaluation with hints (exclude specific candidates)
    - Debugging (see why each candidate won/lost)
    - UI support (show users alternatives)
    """

    _block_winners: dict[int, tuple[str, Candidate]] = field(default_factory=dict)
    """Maps block IDs to their winning (label, candidate) tuple.
    
    Ensures each block has at most one winning candidate across all labels.
    Keys are block IDs (int) for JSON serializability.
    """

    def __post_init__(self) -> None:
        """Ensure no two PageData blocks share the same non-None ID.

        Blocks whose ID is None are ignored by this check. Only blocks with an
        ID can be tracked meaningfully in _constructed_elements and
        _removal_reasons, since those dicts are keyed by block.id.

        Raises:
            ValueError: If the same ID appears on more than one block.
        """
        block_ids = [blk.id for blk in self.page_data.blocks if blk.id is not None]

        # Fast path: converting to a set drops nothing, so every ID is unique.
        if len(set(block_ids)) == len(block_ids):
            return

        # Only reached on the error path, so the quadratic count() is acceptable.
        duplicates = [bid for bid in block_ids if block_ids.count(bid) > 1]
        raise ValueError(
            f"PageData blocks must have unique IDs. "
            f"Found duplicates: {set(duplicates)}"
        )

    def _validate_block_in_page_data(
        self, block: Block | None, param_name: str = "block"
    ) -> None:
        """Raise unless ``block`` is None or a member of ``page_data.blocks``.

        Args:
            block: The block to check; None is accepted without any lookup.
            param_name: Name used for the offending argument in the error text.

        Raises:
            ValueError: If block is not None and not found in PageData.blocks
        """
        if block is None:
            return
        if block in self.page_data.blocks:
            return
        raise ValueError(f"{param_name} must be in PageData.blocks. Block: {block}")

    @property
    def blocks(self) -> list[Block]:
        """The blocks of the page being classified.

        Returns:
            The ``page_data.blocks`` list itself (no copy is made)
        """
        page = self.page_data
        return page.blocks

    def add_warning(self, warning: str) -> None:
        """Record a warning message on this classification result.

        Args:
            warning: The message to append to the stored warnings
        """
        recorded = self._warnings
        recorded.append(warning)

    def get_warnings(self) -> list[str]:
        """Return the warnings recorded so far.

        Returns:
            A new list holding the warning messages, so callers cannot mutate
            the internal list
        """
        return list(self._warnings)

    def get_constructed_element(self, block: Block) -> LegoPageElement | None:
        """Look up the LegoPageElement stored for a source block.

        Args:
            block: The source block; its ``id`` is the lookup key

        Returns:
            The element stored under ``block.id``, or None if there is none
        """
        block_id = block.id
        return self._constructed_elements.get(block_id)

    # TODO maybe add a parameter to filter out winners/non-winners
    def get_candidates(self, label: str) -> list[Candidate]:
        """Return every candidate recorded for one label.

        Args:
            label: The label whose candidates are wanted

        Returns:
            A new list of the candidates for that label (empty if the label is
            unknown). The list is a copy; the Candidate objects are shared.
        """
        recorded = self._candidates.get(label, [])
        return list(recorded)

    def get_all_candidates(self) -> dict[str, list[Candidate]]:
        """Return the candidates of every label.

        Returns:
            A new dictionary mapping each label to a new list of its candidates.
            The copy is shallow: the dict and the lists are fresh, but the
            Candidate objects inside them are the stored ones.
        """
        snapshot: dict[str, list[Candidate]] = {}
        for label, cands in self._candidates.items():
            snapshot[label] = list(cands)
        return snapshot

    def add_candidate(self, label: str, candidate: Candidate) -> None:
        """Record one candidate under the given label.

        Args:
            label: The label the candidate is filed under
            candidate: The candidate to record

        Raises:
            ValueError: If candidate has a source_block that is not in PageData
        """
        self._validate_block_in_page_data(
            candidate.source_block, "candidate.source_block"
        )

        # setdefault creates the per-label list on first use.
        self._candidates.setdefault(label, []).append(candidate)

    def mark_winner(
        self,
        candidate: Candidate,
        constructed: LegoPageElement,
    ) -> None:
        """Flag a candidate as the winner and record it in the tracking dicts.

        Candidates without a source_block (synthetic ones) only get their
        ``is_winner`` flag set; nothing is written to the tracking dicts.

        Args:
            candidate: The candidate to mark as winner
            constructed: The LegoPageElement stored for the candidate's block

        Raises:
            ValueError: If candidate has a source_block that is not in PageData
            ValueError: If this block already has a winner candidate
        """
        source = candidate.source_block
        self._validate_block_in_page_data(source, "candidate.source_block")

        if source is None:
            # Synthetic candidate: there is no block ID to key the dicts on.
            candidate.is_winner = True
            return

        # NOTE(review): if source.id is None, every such block would share the
        # single None key below — confirm callers always assign block IDs.
        block_id = source.id
        if block_id in self._block_winners:
            existing_label = self._block_winners[block_id][0]
            raise ValueError(
                f"Block {block_id} already has a winner candidate for "
                f"label '{existing_label}'. Cannot mark as winner for "
                f"label '{candidate.label}'. Each block can have at most "
                f"one winner candidate."
            )

        candidate.is_winner = True
        self._constructed_elements[block_id] = constructed
        self._block_winners[block_id] = (candidate.label, candidate)

    def mark_removed(self, block: Block, reason: RemovalReason) -> None:
        """Record that a block was removed, together with the reason.

        A later call for the same block ID overwrites the earlier reason.

        Args:
            block: The block being removed
            reason: Why it is being removed

        Raises:
            ValueError: If block is not in PageData
        """
        self._validate_block_in_page_data(block, "block")
        removal_key = block.id
        self._removal_reasons[removal_key] = reason

    # TODO Consider removing this method.
    def get_labeled_blocks(self) -> dict[Block, str]:
        """Map every winning source block to its label.

        Returns:
            Dictionary from block to label, built from winning candidates that
            have a source_block (synthetic candidates are left out). If a block
            wins under several labels, the label visited last is kept.
        """
        return {
            cand.source_block: label
            for label, cands in self._candidates.items()
            for cand in cands
            if cand.is_winner and cand.source_block is not None
        }

    def get_label(self, block: Block) -> str | None:
        """Find the label under which a block won.

        The block is matched by object identity (``is``), not by equality or ID.

        Args:
            block: The block to look up

        Returns:
            The first label that has a winning candidate for this block, or None
            if there is no such candidate
        """
        for label, cands in self._candidates.items():
            if any(c.source_block is block and c.is_winner for c in cands):
                return label
        return None

    def get_blocks_by_label(self, label: str) -> list[Block | LegoPageElement]:
        """Get the winning blocks (or constructed elements) for a label.

        Args:
            label: The label to search for

        Returns:
            One entry per winning candidate, in candidate order. The entry is
            the candidate's source_block when it has one; for synthetic winners
            (no source_block) it is the constructed LegoPageElement instead.
            Winners that have neither are skipped.
        """
        winners: list[Block | LegoPageElement] = []
        for cand in self._candidates.get(label, []):
            if not cand.is_winner:
                continue
            # Prefer source_block, fall back to constructed for synthetic winners
            if cand.source_block is not None:
                winners.append(cand.source_block)
            elif cand.constructed is not None:
                winners.append(cand.constructed)
        return winners

    def is_removed(self, block: Block) -> bool:
        """Tell whether a removal reason is recorded for a block.

        Args:
            block: The block to check; looked up by its ``id``

        Returns:
            True if ``block.id`` has a recorded removal reason, False otherwise
        """
        block_id = block.id
        return block_id in self._removal_reasons

    def get_removal_reason(self, block: Block) -> RemovalReason | None:
        """Fetch the recorded removal reason for a block.

        Args:
            block: The block to look up; its ``id`` is the key

        Returns:
            The RemovalReason stored for ``block.id``, or None if there is none
        """
        block_id = block.id
        return self._removal_reasons.get(block_id)

    def get_scores_for_label(self, label: str) -> dict[ScoreKey, Any]:
        """Collect the score details of every candidate for a label.

        Args:
            label: The label whose scores are wanted

        Returns:
            Dictionary from source block to that candidate's score_details.
            Candidates without a source_block (synthetic ones) are left out; if
            several candidates share a block, the last one wins.
        """
        scores: dict[ScoreKey, Any] = {}
        for cand in self._candidates.get(label, []):
            if cand.source_block is None:
                continue
            scores[cand.source_block] = cand.score_details
        return scores

    def has_label(self, label: str) -> bool:
        """Tell whether the label has at least one winning candidate.

        Args:
            label: The label to check for

        Returns:
            True if any candidate for this label is marked as winner, False
            otherwise (including when the label is unknown)
        """
        for cand in self._candidates.get(label, []):
            if cand.is_winner:
                return True
        return False

    def get_best_candidate(self, label: str) -> Candidate | None:
        """Get the highest-scoring successfully constructed candidate for a label.

        The ``is_winner`` flag is not consulted; only ``constructed`` and
        ``score`` are. On equal scores the earliest candidate is returned.

        Args:
            label: The label to get the best candidate for

        Returns:
            The candidate with the highest score among those whose
            ``constructed`` is not None, or None if there are none
        """
        constructed_ok = (
            cand
            for cand in self._candidates.get(label, [])
            if cand.constructed is not None
        )
        return max(constructed_ok, key=lambda cand: cand.score, default=None)

    def get_alternative_candidates(
        self, label: str, exclude_winner: bool = True
    ) -> list[Candidate]:
        """Get alternative candidates for a label (for UI/re-evaluation).

        With ``exclude_winner`` set, every winning candidate is dropped, whether
        it has a source_block or is synthetic. Any other candidate that was
        scored on the very same source block as a winner is dropped too.

        Args:
            label: The label to get alternatives for
            exclude_winner: If True, exclude the winning candidate(s)

        Returns:
            New list of candidates sorted by score (highest first)
        """
        alternatives = self._candidates.get(label, [])
        if exclude_winner:
            # Identity of every block that already won under this label.
            # Synthetic winners have no block and are filtered via is_winner.
            winner_block_ids = {
                id(cand.source_block)
                for cand in alternatives
                if cand.is_winner and cand.source_block is not None
            }
            alternatives = [
                cand
                for cand in alternatives
                if not cand.is_winner
                and (
                    cand.source_block is None
                    or id(cand.source_block) not in winner_block_ids
                )
            ]
        return sorted(alternatives, key=lambda cand: cand.score, reverse=True)

    def get_part_image_pairs(self) -> list[tuple[Block, Block]]:
        """Collect (part_count, image) pairs from winning part_image candidates.

        The pairs are read from each candidate's score_details. Candidates whose
        score_details is falsy, or lacks a ``part_count`` or ``image``
        attribute, contribute nothing.

        Returns:
            List of (part_count, image) tuples for all winning part_image candidates
        """
        pairs: list[tuple[Block, Block]] = []
        for cand in self.get_candidates("part_image"):
            if not (cand.is_winner and cand.score_details):
                continue
            # presumably a _PartImageScore; only the two attributes are required
            details = cand.score_details
            if hasattr(details, "part_count") and hasattr(details, "image"):
                pairs.append((details.part_count, details.image))
        return pairs


@dataclass
class ClassificationHints(JSONPyWizard):
    """Hints to guide the classification process (no fields defined yet)."""

1	"""
2	Data classes for the classifier.
3	"""
4
5	from __future__ import annotations	1✔
6
7	from dataclasses import dataclass, field	1✔
8	from typing import Any	1✔
9
10	from dataclass_wizard import JSONPyWizard	1✔
11
12	from build_a_long.pdf_extract.extractor.bbox import BBox	1✔
13	from build_a_long.pdf_extract.extractor.extractor import PageData	1✔
14	from build_a_long.pdf_extract.extractor.lego_page_elements import LegoPageElement	1✔
15	from build_a_long.pdf_extract.extractor.page_blocks import Block	1✔
16
17	# Score key can be either a single Block or a tuple of Blocks (for pairings)
18	ScoreKey = Block \| tuple[Block, ...]	1✔
19
20
21	@dataclass	1✔
22	class RemovalReason(JSONPyWizard):	1✔
23	"""Tracks why a block was removed during classification."""
24
25	reason_type: str	1✔
26	"""Type of removal: 'child_bbox' or 'similar_bbox'"""	1✔
27
28	target_block: Block	1✔
29	"""The block that caused this removal"""	1✔
30
31
32	@dataclass	1✔
33	class Candidate(JSONPyWizard):	1✔
34	"""A candidate block with its score and constructed LegoElement.
35
36	Represents a single block that was considered for a particular label,
37	including its score, the constructed LegoPageElement (if successful),
38	and information about why it succeeded or failed.
39
40	This enables:
41	- Re-evaluation with hints (exclude specific candidates)
42	- Debugging (see all candidates and why they won/lost)
43	- UI support (show users alternatives)
44	"""
45
46	bbox: BBox	1✔
47	"""The bounding box for this candidate (from source_block or constructed)"""	1✔
48
49	label: str	1✔
50	"""The label this candidate would have (e.g., 'page_number')"""	1✔
51
52	# TODO Maybe score is redundant with score_details?
53	score: float	1✔
54	"""Combined score (0.0-1.0)"""	1✔
55
56	score_details: Any	1✔
57	"""The detailed score object (e.g., _PageNumberScore)"""	1✔
58
59	constructed: LegoPageElement \| None	1✔
60	"""The constructed LegoElement if parsing succeeded, None if failed"""	1✔
61
62	source_block: Block \| None = None	1✔
63	"""The raw element that was scored (None for synthetic elements like Step)"""	1✔
64
65	failure_reason: str \| None = None	1✔
66	"""Why construction failed, if it did"""	1✔
67
68	is_winner: bool = False	1✔
69	"""Whether this candidate was selected as the winner.	1✔
70
71	This field is set by mark_winner() and is used for:
72	- Querying winners (get_label, get_blocks_by_label, has_label)
73	- Synthetic candidates (which have no source_block and can't be tracked
74	in _block_winners)
75	- JSON serialization and golden file comparisons
76
77	Note: For candidates with source_block, this is redundant with
78	_block_winners, but provides convenient access and handles synthetic
79	candidates.
80	"""
81
82
83	@dataclass	1✔
84	class ClassifierConfig(JSONPyWizard):	1✔
85	"""Configuration for the classifier."""
86
87	# TODO Not sure what this value is used for
88	min_confidence_threshold: float = 0.5	1✔
89
90	page_number_text_weight: float = 0.7	1✔
91	page_number_position_weight: float = 0.3	1✔
92	page_number_position_scale: float = 50.0	1✔
93	page_number_page_value_weight: float = 1.0	1✔
94
95	step_number_text_weight: float = 0.8	1✔
96	step_number_size_weight: float = 0.2	1✔
97
98	def __post_init__(self) -> None:	1✔
99	for weight in self.__dict__.values():	1✔
100	if weight < 0:	1✔
101	raise ValueError("All weights must be greater than or equal to 0.")	1✔
102
103
104	@dataclass	1✔
105	class ClassificationResult(JSONPyWizard):	1✔
106	"""Represents the outcome of a single classification run.
107
108	This class encapsulates the results of element classification, including
109	labels, scores, and removal information. The candidates field is now the
110	primary source of truth for classification results, containing all scored
111	elements, their constructed LegoPageElements, and winner information.
112
113	ClassificationResult is passed through the classifier pipeline, with each
114	classifier adding its candidates and marking winners. This allows later
115	classifiers to query the current state and make decisions based on earlier
116	results.
117
118	External code should use the accessor methods rather than accessing internal
119	fields directly to maintain encapsulation.
120	"""
121
122	page_data: PageData	1✔
123	"""The original page data being classified"""	1✔
124
125	_warnings: list[str] = field(default_factory=list)	1✔
126
127	_removal_reasons: dict[int, RemovalReason] = field(default_factory=dict)	1✔
128	"""Maps block IDs (block.id, not id(block)) to the reason they were removed.	1✔
129
130	Keys are block IDs (int) instead of Block objects to ensure JSON serializability
131	and consistency with _constructed_elements.
132	"""
133
134	_constructed_elements: dict[int, LegoPageElement] = field(default_factory=dict)	1✔
135	"""Maps source block IDs to their constructed LegoPageElements.	1✔
136
137	Only contains elements that were successfully labeled and constructed.
138	The builder should use these pre-constructed elements rather than
139	re-parsing the source blocks.
140
141	Keys are block IDs (int) instead of Block objects to ensure JSON serializability.
142	"""
143
144	_candidates: dict[str, list[Candidate]] = field(default_factory=dict)	1✔
145	"""Maps label names to lists of all candidates considered for that label.	1✔
146
147	Each candidate includes:
148	- The source element
149	- Its score and score details
150	- The constructed LegoPageElement (if successful)
151	- Failure reason (if construction failed)
152	- Whether it was the winner
153
154	This enables:
155	- Re-evaluation with hints (exclude specific candidates)
156	- Debugging (see why each candidate won/lost)
157	- UI support (show users alternatives)
158	"""
159
160	_block_winners: dict[int, tuple[str, Candidate]] = field(default_factory=dict)	1✔
161	"""Maps block IDs to their winning (label, candidate) tuple.	1✔
162
163	Ensures each block has at most one winning candidate across all labels.
164	Keys are block IDs (int) for JSON serializability.
165	"""
166
167	def __post_init__(self) -> None:	1✔
168	"""Validate PageData blocks have unique IDs (if present).
169
170	Blocks may have None IDs, but blocks with IDs must have unique IDs.
171	Note: Only blocks with IDs can be tracked in _constructed_elements and
172	_removal_reasons (which require block.id as keys for JSON serializability).
173	"""
174	# Validate unique IDs (ignoring None values)
175	block_ids = [e.id for e in self.page_data.blocks if e.id is not None]	1✔
176	if len(block_ids) != len(set(block_ids)):	1✔
177	duplicates = [id_ for id_ in block_ids if block_ids.count(id_) > 1]	1✔
178	raise ValueError(	1✔
179	f"PageData blocks must have unique IDs. "
180	f"Found duplicates: {set(duplicates)}"
181	)
182
183	def _validate_block_in_page_data(	1✔
184	self, block: Block \| None, param_name: str = "block"
185	) -> None:
186	"""Validate that a block is in PageData.
187
188	Args:
189	block: The block to validate (None is allowed and skips validation)
190	param_name: Name of the parameter being validated (for error messages)
191
192	Raises:
193	ValueError: If block is not None and not in PageData.blocks
194	"""
195	if block is not None and block not in self.page_data.blocks:	1✔
196	raise ValueError(f"{param_name} must be in PageData.blocks. Block: {block}")	1✔
197
198	@property	1✔
199	def blocks(self) -> list[Block]:	1✔
200	"""Get the blocks from the page data.
201
202	Returns:
203	List of blocks from the page data
204	"""
NEW 205	return self.page_data.blocks	×
206
207	def add_warning(self, warning: str) -> None:	1✔
208	"""Add a warning message to the classification result.
209
210	Args:
211	warning: The warning message to add
212	"""
213	self._warnings.append(warning)	1✔
214
215	def get_warnings(self) -> list[str]:	1✔
216	"""Get all warnings generated during classification.
217
218	Returns:
219	List of warning messages
220	"""
221	return self._warnings.copy()	1✔
222
223	def get_constructed_element(self, block: Block) -> LegoPageElement \| None:	1✔
224	"""Get the constructed LegoPageElement for a source block.
225
226	Args:
227	block: The source block
228
229	Returns:
230	The constructed LegoPageElement if it exists, None otherwise
231	"""
232	return self._constructed_elements.get(block.id)	1✔
233
234	# TODO maybe add a parameter to filter out winners/non-winners
235	def get_candidates(self, label: str) -> list[Candidate]:	1✔
236	"""Get all candidates for a specific label.
237
238	Args:
239	label: The label to get candidates for
240
241	Returns:
242	List of candidates for that label (returns copy to prevent
243	external modification)
244	"""
245	return self._candidates.get(label, []).copy()	1✔
246
247	def get_all_candidates(self) -> dict[str, list[Candidate]]:	1✔
248	"""Get all candidates across all labels.
249
250	Returns:
251	Dictionary mapping labels to their candidates (returns deep copy)
252	"""
253	return {label: cands.copy() for label, cands in self._candidates.items()}	×
254
255	def add_candidate(self, label: str, candidate: Candidate) -> None:	1✔
256	"""Add a single candidate for a specific label.
257
258	Args:
259	label: The label this candidate is for
260	candidate: The candidate to add
261
262	Raises:
263	ValueError: If candidate has a source_block that is not in PageData
264	"""
265	self._validate_block_in_page_data(	1✔
266	candidate.source_block, "candidate.source_block"
267	)
268
269	if label not in self._candidates:	1✔
270	self._candidates[label] = []	1✔
271	self._candidates[label].append(candidate)	1✔
272
273	def mark_winner(	1✔
274	self,
275	candidate: Candidate,
276	constructed: LegoPageElement,
277	) -> None:
278	"""Mark a candidate as the winner and update tracking dicts.
279
280	Args:
281	candidate: The candidate to mark as winner
282	constructed: The constructed LegoPageElement
283
284	Raises:
285	ValueError: If candidate has a source_block that is not in PageData
286	ValueError: If this block already has a winner candidate
287	"""
288	self._validate_block_in_page_data(	1✔
289	candidate.source_block, "candidate.source_block"
290	)
291
292	# Check if this block already has a winner
293	if candidate.source_block is not None:	1✔
294	block_id = candidate.source_block.id	1✔
295	if block_id in self._block_winners:	1✔
296	existing_label, existing_candidate = self._block_winners[block_id]	1✔
297	raise ValueError(	1✔
298	f"Block {block_id} already has a winner candidate for "
299	f"label '{existing_label}'. Cannot mark as winner for "
300	f"label '{candidate.label}'. Each block can have at most "
301	f"one winner candidate."
302	)
303
304	candidate.is_winner = True	1✔
305	# Store the constructed element for this source element
306	if candidate.source_block is not None:	1✔
307	self._constructed_elements[candidate.source_block.id] = constructed	1✔
308	self._block_winners[candidate.source_block.id] = (	1✔
309	candidate.label,
310	candidate,
311	)
312
313	def mark_removed(self, block: Block, reason: RemovalReason) -> None:	1✔
314	"""Mark a block as removed with the given reason.
315
316	Args:
317	block: The block to mark as removed
318	reason: The reason for removal
319
320	Raises:
321	ValueError: If block is not in PageData
322	"""
323	self._validate_block_in_page_data(block, "block")	1✔
324	self._removal_reasons[block.id] = reason	1✔
325
326	# TODO Consider removing this method.
327	def get_labeled_blocks(self) -> dict[Block, str]:	1✔
328	"""Get a dictionary of all labeled blocks.
329
330	Returns:
331	Dictionary mapping blocks to their labels (excludes synthetic candidates)
332	"""
333	labeled: dict[Block, str] = {}	1✔
334	for label, label_candidates in self._candidates.items():	1✔
335	for candidate in label_candidates:	1✔
336	if candidate.is_winner and candidate.source_block is not None:	1✔
337	labeled[candidate.source_block] = label	1✔
338	return labeled	1✔
339
340	def get_label(self, block: Block) -> str \| None:	1✔
341	"""Get the label for a block from this classification result.
342
343	Args:
344	block: The block to get the label for
345
346	Returns:
347	The label string if found, None otherwise
348	"""
349	# Search through all candidates to find the winning label for this block
350	for label, label_candidates in self._candidates.items():	1✔
351	for candidate in label_candidates:	1✔
352	if candidate.source_block is block and candidate.is_winner:	1✔
353	return label	1✔
354	return None	1✔
355
356	def get_blocks_by_label(self, label: str) -> list[Block]:	1✔
357	"""Get all blocks with the given label.
358
359	Args:
360	label: The label to search for
361
362	Returns:
363	List of blocks with that label. For constructed blocks (e.g., Part),
364	returns the constructed object; for regular blocks, returns source_block.
365	"""
366	label_candidates = self._candidates.get(label, [])	1✔
367	blocks = []	1✔
368	for c in label_candidates:	1✔
369	if c.is_winner:	1✔
370	# Prefer source_block, fall back to constructed for synthetic blocks
371	if c.source_block is not None:	1✔
372	blocks.append(c.source_block)	1✔
NEW 373	elif c.constructed is not None:	×
NEW 374	blocks.append(c.constructed)	×
375	return blocks	1✔
376
377	def is_removed(self, block: Block) -> bool:	1✔
378	"""Check if a block has been marked for removal.
379
380	Args:
381	block: The block to check
382
383	Returns:
384	True if the block is marked for removal, False otherwise
385	"""
386	return block.id in self._removal_reasons	1✔
387
388	def get_removal_reason(self, block: Block) -> RemovalReason \| None:	1✔
389	"""Get the reason why a block was removed.
390
391	Args:
392	block: The block to get the removal reason for
393
394	Returns:
395	The RemovalReason if the block was removed, None otherwise
396	"""
397	return self._removal_reasons.get(block.id)	1✔
398
399	def get_scores_for_label(self, label: str) -> dict[ScoreKey, Any]:	1✔
400	"""Get all scores for a specific label.
401
402	Args:
403	label: The label to get scores for
404
405	Returns:
406	Dictionary mapping elements to score objects for that label
407	(excludes synthetic candidates without source_block)
408	"""
409	label_candidates = self._candidates.get(label, [])	1✔
410	return {	1✔
411	c.source_block: c.score_details
412	for c in label_candidates
413	if c.source_block is not None
414	}
415
416	def has_label(self, label: str) -> bool:	1✔
417	"""Check if any elements have been assigned the given label.
418
419	Args:
420	label: The label to check for
421
422	Returns:
423	True if at least one element has this label, False otherwise
424	"""
425	label_candidates = self._candidates.get(label, [])	1✔
426	return any(c.is_winner for c in label_candidates)	1✔
427
428	def get_best_candidate(self, label: str) -> Candidate \| None:	1✔
429	"""Get the winning candidate for a label.
430
431	Args:
432	label: The label to get the best candidate for
433
434	Returns:
435	The candidate with the highest score that successfully constructed,
436	or None if no valid candidates exist
437	"""
438	label_candidates = self._candidates.get(label, [])	1✔
439	valid = [c for c in label_candidates if c.constructed is not None]	1✔
440	return max(valid, key=lambda c: c.score) if valid else None	1✔
441
442	def get_alternative_candidates(	1✔
443	self, label: str, exclude_winner: bool = True
444	) -> list[Candidate]:
445	"""Get alternative candidates for a label (for UI/re-evaluation).
446
447	Args:
448	label: The label to get alternatives for
449	exclude_winner: If True, exclude the winning candidate
450
451	Returns:
452	List of candidates sorted by score (highest first)
453	"""
454	label_candidates = self._candidates.get(label, [])	1✔
455	if exclude_winner:	1✔
456	winner_blocks = self.get_blocks_by_label(label)	1✔
457	if winner_blocks:	1✔
458	winner_id = id(winner_blocks[0])	1✔
459	label_candidates = [	1✔
460	c for c in label_candidates if id(c.source_block) != winner_id
461	]
462	return sorted(label_candidates, key=lambda c: c.score, reverse=True)	1✔
463
464	def get_part_image_pairs(self) -> list[tuple[Block, Block]]:	1✔
465	"""Get part_count and part_image element pairs from winning candidates.
466
467	This derives the pairs from the part_image candidates' score_details,
468	which contain the relationship between part_count text and image elements.
469
470	Returns:
471	List of (part_count, image) tuples for all winning part_image candidates
472	"""
473	pairs: list[tuple[Block, Block]] = []	1✔
474	for candidate in self.get_candidates("part_image"):	1✔
475	if candidate.is_winner and candidate.score_details:	1✔
476	# score_details is a _PartImageScore with part_count and image fields
477	score = candidate.score_details	1✔
478	if hasattr(score, "part_count") and hasattr(score, "image"):	1✔
479	pairs.append((score.part_count, score.image))	1✔
480	return pairs	1✔
481
482
483	@dataclass	1✔
484	class ClassificationHints(JSONPyWizard):	1✔
485	"""Hints to guide the classification process."""
486
487	pass	1✔

bramp / build-along / 19155446196

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous