19221936920

Committed 10 Nov 2025 05:39AM UTC coverage: 86.707% (-0.4%) from 87.11%

Build # 19221936920

Build Type

push

github

Committed by

bramp

Commit Message

fix(pdf_extract): Fix argparse prefix matching and improve boolean flags

- Add allow_abbrev=False to prevent --draw from matching --draw-deleted
- Change to BooleanOptionalAction for --summary, --draw, and --draw-deleted
  This provides both positive and negative forms (e.g., --draw/--no-draw)
- Remove set_defaults() call since defaults are now in argument definitions
- Fixes bug where deleted blocks were drawn without --draw-deleted flag

Run Details

4253 of 4905 relevant lines covered (86.71%)

0.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.13

/src/build_a_long/pdf_extract/classifier/classification_result.py

"""
Data classes for the classifier.
"""

from __future__ import annotations

from typing import Annotated, Any

from annotated_types import Ge, Le
from pydantic import BaseModel, Field, model_validator

from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints
from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram
from build_a_long.pdf_extract.extractor.bbox import BBox
from build_a_long.pdf_extract.extractor.extractor import PageData
from build_a_long.pdf_extract.extractor.lego_page_elements import LegoPageElement
from build_a_long.pdf_extract.extractor.page_blocks import Block

# Score key can be either a single Block or a tuple of Blocks (for pairings)
ScoreKey = Block | tuple[Block, ...]

# Weight value constrained to [0.0, 1.0] range
Weight = Annotated[float, Ge(0), Le(1)]


# TODO Make this JSON serializable
class BatchClassificationResult(BaseModel):
    """Aggregate output of classifying several pages together.

    Bundles the per-page ``ClassificationResult`` objects with the single
    ``TextHistogram`` that was computed across all of those pages.
    """

    # One ClassificationResult per input page.
    results: list[ClassificationResult]
    """Per-page classification results, one for each input page"""

    # Shared by the whole batch rather than stored per page.
    histogram: TextHistogram
    """Global text histogram computed across all pages"""


class RemovalReason(BaseModel):
    """Records why a block was removed during classification.

    Pairs a removal category with the block that triggered the removal.
    """

    # Plain str: the values listed in the docstring below are not enforced
    # by the type, so any string is accepted.
    reason_type: str
    """Type of removal: 'child_bbox' or 'similar_bbox'"""

    # The other block involved in the removal (not the block that was removed).
    target_block: Block
    """The block that caused this removal"""


class Candidate(BaseModel):
    """One block considered for one label, with its score and outcome.

    A candidate records the bounding box and label it was evaluated for, the
    score it received, the LegoPageElement built from it (if construction
    succeeded) or the reason construction failed, and whether it ended up
    being selected as a winner.

    Keeping rejected candidates around enables:
    - Re-evaluation with hints (exclude specific candidates)
    - Debugging (see all candidates and why they won/lost)
    - UI support (show users alternatives)
    """

    bbox: BBox
    """The bounding box for this candidate (from source_block or constructed)"""

    label: str
    """The label this candidate would have (e.g., 'page_number')"""

    # TODO Maybe score is redundant with score_details?
    # NOTE: typed as a plain float, so the 0.0-1.0 range mentioned below is a
    # convention and is not validated by the model.
    score: float
    """Combined score (0.0-1.0)"""

    # Typed as Any: the concrete score class depends on the label.
    score_details: Any
    """The detailed score object (e.g., _PageNumberScore)"""

    # Required field (no default), but None is an accepted value.
    constructed: LegoPageElement | None
    """The constructed LegoElement if parsing succeeded, None if failed"""

    source_block: Block | None = None
    """The raw element that was scored (None for synthetic elements like Step)"""

    failure_reason: str | None = None
    """Why construction failed, if it did"""

    # NOTE: the text below refers to `_block_winners`; the corresponding field
    # on ClassificationResult is named `block_winners` (no leading underscore).
    is_winner: bool = False
    """Whether this candidate was selected as the winner.
    
    This field is set by mark_winner() and is used for:
    - Querying winners (get_label, get_blocks_by_label, has_label)
    - Synthetic candidates (which have no source_block and can't be tracked
      in _block_winners)
    - JSON serialization and golden file comparisons
    
    Note: For candidates with source_block, this is redundant with
    _block_winners, but provides convenient access and handles synthetic
    candidates.
    """


class ClassifierConfig(BaseModel):
    """Tunable settings for the classifier.

    Fields annotated with ``Weight`` are constrained to the [0.0, 1.0] range.
    The remaining numeric fields are plain floats with no range constraint.
    This class defines no validator that relates the weights to each other
    (for example, the page-number weights are not required to sum to 1).
    """

    # TODO Not sure what this value is used for
    min_confidence_threshold: float = 0.5

    # Page-number settings (presumably consumed by the page-number scoring
    # code - confirm against its usage). Note that the scale is an
    # unconstrained float, unlike the weights around it.
    page_number_text_weight: Weight = 0.7
    page_number_position_weight: Weight = 0.3
    page_number_position_scale: float = 50.0
    page_number_page_value_weight: Weight = 1.0
    page_number_font_size_weight: Weight = 0.1

    # Step-number settings.
    step_number_text_weight: Weight = 0.7
    step_number_font_size_weight: Weight = 0.3

    # Part-count settings.
    part_count_text_weight: Weight = 0.7
    part_count_font_size_weight: Weight = 0.3

    # Defaults to whatever FontSizeHints.empty() returns when not supplied.
    font_size_hints: FontSizeHints = Field(default_factory=FontSizeHints.empty)
    """Font size hints derived from analyzing all pages"""


class ClassificationResult(BaseModel):
    """Classification outcome and intermediate artifacts for a single page.

    Holds the page being classified together with everything recorded while
    classifying it:
    - Warnings raised along the way
    - Removal reasons for filtered blocks
    - LegoPageElements constructed from blocks
    - All candidates considered per label (including rejected ones)
    - The winning (label, candidate) pair recorded per source block

    The per-block dictionaries are keyed by block IDs (int) instead of Block
    objects to ensure JSON serializability and consistent equality semantics.

    The fields are public so that they can be serialized, but external code
    should use the accessor methods rather than accessing the fields directly,
    to maintain encapsulation.
    """

    # TODO: Consider refactoring to separate the DAO (Data Access Object)
    # representation from the business logic, so that the serialized fields
    # and the accessor methods can be refactored independently.

    page_data: PageData
    """The original page data being classified"""

    warnings: list[str] = Field(default_factory=list)
    """Warning messages generated during classification.
    
    Public for serialization. Prefer using add_warning() and get_warnings() methods.
    """

    removal_reasons: dict[int, RemovalReason] = Field(default_factory=dict)
    """Maps block IDs (block.id, not id(block)) to the reason they were removed.
    
    Keys are block IDs (int) instead of Block objects to ensure JSON serializability
    and consistency with constructed_elements.
    
    Public for serialization. Prefer using accessor methods.
    """

    constructed_elements: dict[int, LegoPageElement] = Field(default_factory=dict)
    """Maps source block IDs to their constructed LegoPageElements.
    
    Only contains elements that were successfully labeled and constructed.
    The builder should use these pre-constructed elements rather than
    re-parsing the source blocks.
    
    Keys are block IDs (int) instead of Block objects to ensure JSON serializability.
    
    Public for serialization. Prefer using get_constructed_element() method.
    """

    candidates: dict[str, list[Candidate]] = Field(default_factory=dict)
    """Maps label names to lists of all candidates considered for that label.
    
    Each candidate includes:
    - The source element
    - Its score and score details
    - The constructed LegoPageElement (if successful)
    - Failure reason (if construction failed)
    - Whether it was the winner
    
    This enables:
    - Re-evaluation with hints (exclude specific candidates)
    - Debugging (see why each candidate won/lost)
    - UI support (show users alternatives)
    
    Public for serialization. Prefer using get_* accessor methods.
    """

    block_winners: dict[int, tuple[str, Candidate]] = Field(default_factory=dict)
    """Maps block IDs to their winning (label, candidate) tuple.
    
    Ensures each block has at most one winning candidate across all labels.
    Keys are block IDs (int) for JSON serializability.
    
    Public for serialization. Prefer using get_label() and related methods.
    """

    @model_validator(mode="after")
    def validate_unique_block_ids(self) -> ClassificationResult:
        """Reject page data in which two blocks share the same ID.

        Blocks whose ID is None are ignored; every non-None ID must occur only
        once. Unique IDs matter because constructed_elements and
        removal_reasons use block.id as their dictionary key.

        Returns:
            This instance, unchanged, when all IDs are unique

        Raises:
            ValueError: If any non-None block ID appears more than once
        """
        block_ids = [blk.id for blk in self.page_data.blocks if blk.id is not None]

        # Fast path: deduplicating did not drop anything, so IDs are unique.
        if len(set(block_ids)) == len(block_ids):
            return self

        # Only reached on failure, so the quadratic count() scan is acceptable.
        duplicates = [
            block_id for block_id in block_ids if block_ids.count(block_id) > 1
        ]
        raise ValueError(
            f"PageData blocks must have unique IDs. "
            f"Found duplicates: {set(duplicates)}"
        )

    def _validate_block_in_page_data(
        self, block: Block | None, param_name: str = "block"
    ) -> None:
        """Ensure ``block`` belongs to this result's page data.

        Args:
            block: The block to check; None is accepted and never raises
            param_name: Parameter name to mention in the error message

        Raises:
            ValueError: If block is not None and not in PageData.blocks
        """
        # None stands for "no source block" and is always acceptable.
        if block is None:
            return
        if block in self.page_data.blocks:
            return
        raise ValueError(f"{param_name} must be in PageData.blocks. Block: {block}")

    @property
    def blocks(self) -> list[Block]:
        """Blocks of the page being classified.

        Returns:
            The block list held by page_data (the same list object, not a copy)
        """
        page = self.page_data
        return page.blocks

    def add_warning(self, warning: str) -> None:
        """Record a warning message on this classification result.

        Args:
            warning: The message to append to the stored warnings
        """
        recorded = self.warnings
        recorded.append(warning)

    def get_warnings(self) -> list[str]:
        """Return the warnings recorded during classification.

        Returns:
            A new list of the warning messages, so callers cannot modify the
            stored list through it
        """
        return list(self.warnings)

    def get_constructed_element(self, block: Block) -> LegoPageElement | None:
        """Look up the LegoPageElement constructed from a source block.

        Args:
            block: The source block whose ID is used for the lookup

        Returns:
            The element stored under block.id, or None if nothing is stored
        """
        block_id = block.id
        return self.constructed_elements.get(block_id, None)

    # TODO maybe add a parameter to filter out winners/non-winners
    def get_candidates(self, label: str) -> list[Candidate]:
        """Return the candidates recorded for one label.

        Args:
            label: The label to get candidates for

        Returns:
            A new list holding that label's candidates (empty if the label is
            unknown). The list is a copy, but the Candidate objects in it are
            the stored ones.
        """
        stored = self.candidates.get(label, ())
        return list(stored)

    def get_all_candidates(self) -> dict[str, list[Candidate]]:
        """Return the candidates of every label.

        Returns:
            A new dictionary mapping each label to a new list of its
            candidates. Only the containers are copied; the Candidate objects
            themselves are shared with this result.
        """
        snapshot: dict[str, list[Candidate]] = {}
        for label, cands in self.candidates.items():
            snapshot[label] = cands.copy()
        return snapshot

    def add_candidate(self, label: str, candidate: Candidate) -> None:
        """Register a candidate under the given label.

        Args:
            label: The label to file the candidate under
            candidate: The candidate to append to that label's list

        Raises:
            ValueError: If candidate has a source_block that is not in PageData
        """
        self._validate_block_in_page_data(
            candidate.source_block, "candidate.source_block"
        )

        # setdefault creates the label's list the first time the label is seen.
        self.candidates.setdefault(label, []).append(candidate)

    def mark_winner(
        self,
        candidate: Candidate,
        constructed: LegoPageElement,
    ) -> None:
        """Flag ``candidate`` as a winner and record it for its source block.

        Every accepted candidate gets ``is_winner`` set. Candidates that have
        a ``source_block`` are additionally registered under that block's ID
        in ``constructed_elements`` and ``block_winners``; synthetic
        candidates (no ``source_block``) are not tracked in either dict.

        Args:
            candidate: The candidate to mark as winner
            constructed: The LegoPageElement to store for the source block

        Raises:
            ValueError: If candidate has a source_block that is not in PageData
            ValueError: If the source block already has a winner candidate
        """
        source = candidate.source_block
        self._validate_block_in_page_data(source, "candidate.source_block")

        if source is None:
            # Synthetic candidate: there is no block ID to register under.
            candidate.is_winner = True
            return

        block_id = source.id
        if block_id in self.block_winners:
            # Only the label is needed for the error message.
            existing_label, _ = self.block_winners[block_id]
            raise ValueError(
                f"Block {block_id} already has a winner candidate for "
                f"label '{existing_label}'. Cannot mark as winner for "
                f"label '{candidate.label}'. Each block can have at most "
                f"one winner candidate."
            )

        candidate.is_winner = True
        self.constructed_elements[block_id] = constructed
        self.block_winners[block_id] = (candidate.label, candidate)

    def mark_removed(self, block: Block, reason: RemovalReason) -> None:
        """Record that a block was removed, together with the reason.

        An earlier reason stored for the same block ID is replaced.

        Args:
            block: The block being removed
            reason: Why the block is being removed

        Raises:
            ValueError: If block is not in PageData
        """
        self._validate_block_in_page_data(block, param_name="block")
        self.removal_reasons.update({block.id: reason})

    # TODO Consider removing this method.
    def get_labeled_blocks(self) -> dict[Block, str]:
        """Collect every source block that has a winning candidate.

        Returns:
            Dictionary mapping each winning candidate's source block to the
            label it won under (synthetic candidates, which have no source
            block, are left out)
        """
        return {
            cand.source_block: label
            for label, cands in self.candidates.items()
            for cand in cands
            if cand.is_winner and cand.source_block is not None
        }

    def get_label(self, block: Block) -> str | None:
        """Find the label under which a block won.

        The block is matched by identity (``is``), not by equality.

        Args:
            block: The block to look up

        Returns:
            The label of the first winning candidate whose source block is
            this block, or None if there is no such candidate
        """
        winning_labels = (
            label
            for label, cands in self.candidates.items()
            for cand in cands
            if cand.source_block is block and cand.is_winner
        )
        # The generator is lazy, so the scan stops at the first match.
        return next(winning_labels, None)

    def get_blocks_by_label(self, label: str) -> list[Block]:
        """Return the blocks of all winning candidates for a label.

        Args:
            label: The label to search for

        Returns:
            One entry per winning candidate, in candidate order: the
            candidate's source_block when it has one, otherwise its
            constructed element (synthetic candidates). Winners with neither
            contribute nothing.
        """
        found = []
        for cand in self.candidates.get(label, []):
            if not cand.is_winner:
                continue
            if cand.source_block is not None:
                found.append(cand.source_block)
                continue
            # Synthetic winner: fall back to the constructed element.
            if cand.constructed is not None:
                found.append(cand.constructed)
        return found

    def is_removed(self, block: Block) -> bool:
        """Tell whether a removal reason is recorded for a block.

        Args:
            block: The block to check (looked up by block.id)

        Returns:
            True if the block's ID has a removal reason, False otherwise
        """
        removed_ids = self.removal_reasons.keys()
        return block.id in removed_ids

    def get_removal_reason(self, block: Block) -> RemovalReason | None:
        """Look up why a block was removed.

        Args:
            block: The block to look up (by block.id)

        Returns:
            The RemovalReason stored for the block's ID, or None if the block
            was not marked as removed
        """
        block_id = block.id
        if block_id in self.removal_reasons:
            return self.removal_reasons[block_id]
        return None

    def get_scores_for_label(self, label: str) -> dict[ScoreKey, Any]:
        """Collect the score details of a label's candidates.

        Args:
            label: The label to get scores for

        Returns:
            Dictionary mapping each candidate's source block to its
            score_details. Candidates without a source block (synthetic) are
            skipped, and winners and non-winners are both included.
        """
        scores: dict[ScoreKey, Any] = {}
        for cand in self.candidates.get(label, []):
            block = cand.source_block
            if block is None:
                continue
            scores[block] = cand.score_details
        return scores

    def has_label(self, label: str) -> bool:
        """Tell whether a label has at least one winning candidate.

        Args:
            label: The label to check for

        Returns:
            True if any candidate for the label is marked as a winner,
            False otherwise (including when the label is unknown)
        """
        for cand in self.candidates.get(label, []):
            if cand.is_winner:
                return True
        return False

    def get_best_candidate(self, label: str) -> Candidate | None:
        """Get the highest-scoring constructed candidate for a label.

        The choice is made purely on score among candidates whose
        ``constructed`` element is set; the ``is_winner`` flag is not
        consulted.

        Args:
            label: The label to get the best candidate for

        Returns:
            The candidate with the highest score that successfully constructed
            (the earliest one on ties), or None if no valid candidates exist
        """
        best: Candidate | None = None
        for cand in self.candidates.get(label, []):
            if cand.constructed is None:
                continue
            # Strict comparison keeps the earliest candidate on equal scores.
            if best is None or cand.score > best.score:
                best = cand
        return best

    def get_alternative_candidates(
        self, label: str, exclude_winner: bool = True
    ) -> list[Candidate]:
        """Get candidates for a label ordered by score (for UI/re-evaluation).

        Winners are identified by their ``is_winner`` flag. The exclusion
        therefore covers every winner for the label, including synthetic
        winners that have no ``source_block``.

        Args:
            label: The label to get alternatives for
            exclude_winner: If True, leave out every candidate marked as a
                winner; if False, return all candidates for the label

        Returns:
            New list of candidates sorted by score (highest first); empty if
            the label has no candidates
        """
        label_candidates = self.candidates.get(label, [])
        if exclude_winner:
            # Filter on the flag rather than on source-block identity, which
            # cannot match synthetic winners and only covered one winner.
            label_candidates = [c for c in label_candidates if not c.is_winner]
        return sorted(label_candidates, key=lambda c: c.score, reverse=True)

    def get_part_image_pairs(self) -> list[tuple[Block, Block]]:
        """Get (part_count, image) pairs from winning part_image candidates.

        Each pair is read from the candidate's score_details, which carry the
        relationship between the part_count text and the image element.

        Returns:
            List of (part_count, image) tuples, one per winning part_image
            candidate whose score_details is truthy and exposes both
            attributes
        """
        pairs: list[tuple[Block, Block]] = []
        for cand in self.get_candidates("part_image"):
            if not cand.is_winner:
                continue
            details = cand.score_details
            if not details:
                continue
            # score_details is typed as Any, so check for the attributes
            # before reading them.
            if not hasattr(details, "part_count"):
                continue
            if not hasattr(details, "image"):
                continue
            pairs.append((details.part_count, details.image))
        return pairs

1	"""
2	Data classes for the classifier.
3	"""
4
5	from __future__ import annotations	1✔
6
7	from typing import Annotated, Any	1✔
8
9	from annotated_types import Ge, Le	1✔
10	from pydantic import BaseModel, Field, model_validator	1✔
11
12	from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints	1✔
13	from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram	1✔
14	from build_a_long.pdf_extract.extractor.bbox import BBox	1✔
15	from build_a_long.pdf_extract.extractor.extractor import PageData	1✔
16	from build_a_long.pdf_extract.extractor.lego_page_elements import LegoPageElement	1✔
17	from build_a_long.pdf_extract.extractor.page_blocks import Block	1✔
18
19	# Score key can be either a single Block or a tuple of Blocks (for pairings)
20	ScoreKey = Block \| tuple[Block, ...]	1✔
21
22	# Weight value constrained to [0.0, 1.0] range
23	Weight = Annotated[float, Ge(0), Le(1)]	1✔
24
25
26	# TODO Make this JSON serializable
27	class BatchClassificationResult(BaseModel):	1✔
28	"""Results from classifying multiple pages together.
29
30	This class holds both the per-page classification results and the
31	global text histogram computed across all pages.
32	"""
33
34	results: list[ClassificationResult]	1✔
35	"""Per-page classification results, one for each input page"""	1✔
36
37	histogram: TextHistogram	1✔
38	"""Global text histogram computed across all pages"""	1✔
39
40
41	class RemovalReason(BaseModel):	1✔
42	"""Tracks why a block was removed during classification."""
43
44	reason_type: str	1✔
45	"""Type of removal: 'child_bbox' or 'similar_bbox'"""	1✔
46
47	target_block: Block	1✔
48	"""The block that caused this removal"""	1✔
49
50
51	class Candidate(BaseModel):	1✔
52	"""A candidate block with its score and constructed LegoElement.
53
54	Represents a single block that was considered for a particular label,
55	including its score, the constructed LegoPageElement (if successful),
56	and information about why it succeeded or failed.
57
58	This enables:
59	- Re-evaluation with hints (exclude specific candidates)
60	- Debugging (see all candidates and why they won/lost)
61	- UI support (show users alternatives)
62	"""
63
64	bbox: BBox	1✔
65	"""The bounding box for this candidate (from source_block or constructed)"""	1✔
66
67	label: str	1✔
68	"""The label this candidate would have (e.g., 'page_number')"""	1✔
69
70	# TODO Maybe score is redundant with score_details?
71	score: float	1✔
72	"""Combined score (0.0-1.0)"""	1✔
73
74	score_details: Any	1✔
75	"""The detailed score object (e.g., _PageNumberScore)"""	1✔
76
77	constructed: LegoPageElement \| None	1✔
78	"""The constructed LegoElement if parsing succeeded, None if failed"""	1✔
79
80	source_block: Block \| None = None	1✔
81	"""The raw element that was scored (None for synthetic elements like Step)"""	1✔
82
83	failure_reason: str \| None = None	1✔
84	"""Why construction failed, if it did"""	1✔
85
86	is_winner: bool = False	1✔
87	"""Whether this candidate was selected as the winner.	1✔
88
89	This field is set by mark_winner() and is used for:
90	- Querying winners (get_label, get_blocks_by_label, has_label)
91	- Synthetic candidates (which have no source_block and can't be tracked
92	in _block_winners)
93	- JSON serialization and golden file comparisons
94
95	Note: For candidates with source_block, this is redundant with
96	_block_winners, but provides convenient access and handles synthetic
97	candidates.
98	"""
99
100
101	class ClassifierConfig(BaseModel):	1✔
102	"""Configuration for the classifier."""
103
104	# TODO Not sure what this value is used for
105	min_confidence_threshold: float = 0.5	1✔
106
107	page_number_text_weight: Weight = 0.7	1✔
108	page_number_position_weight: Weight = 0.3	1✔
109	page_number_position_scale: float = 50.0	1✔
110	page_number_page_value_weight: Weight = 1.0	1✔
111	page_number_font_size_weight: Weight = 0.1	1✔
112
113	step_number_text_weight: Weight = 0.7	1✔
114	step_number_font_size_weight: Weight = 0.3	1✔
115
116	part_count_text_weight: Weight = 0.7	1✔
117	part_count_font_size_weight: Weight = 0.3	1✔
118
119	font_size_hints: FontSizeHints = Field(default_factory=FontSizeHints.empty)	1✔
120	"""Font size hints derived from analyzing all pages"""	1✔
121
122
123	class ClassificationResult(BaseModel):	1✔
124	"""Result of classifying a single page.
125
126	This class stores both the results and intermediate artifacts for a page
127	classification. It provides structured access to:
128	- Labels assigned to blocks
129	- LegoPageElements constructed from blocks
130	- Removal reasons for filtered blocks
131	- All candidates considered (including rejected ones)
132
133	The use of dictionaries keyed by block IDs (int) instead of Block objects
134	ensures JSON serializability and consistent equality semantics.
135
136	# TODO: Consider refactoring to separate DAO (Data Access Object) representation
137	# from the business logic. The public fields below are used for serialization
138	# but external code should prefer using the accessor methods to maintain
139	# encapsulation and allow future refactoring.
140
141	External code should use the accessor methods rather than accessing these
142	fields directly to maintain encapsulation.
143	"""
144
145	page_data: PageData	1✔
146	"""The original page data being classified"""	1✔
147
148	warnings: list[str] = Field(default_factory=list)	1✔
149	"""Warning messages generated during classification.	1✔
150
151	Public for serialization. Prefer using add_warning() and get_warnings() methods.
152	"""
153
154	removal_reasons: dict[int, RemovalReason] = Field(default_factory=dict)	1✔
155	"""Maps block IDs (block.id, not id(block)) to the reason they were removed.	1✔
156
157	Keys are block IDs (int) instead of Block objects to ensure JSON serializability
158	and consistency with constructed_elements.
159
160	Public for serialization. Prefer using accessor methods.
161	"""
162
163	constructed_elements: dict[int, LegoPageElement] = Field(default_factory=dict)	1✔
164	"""Maps source block IDs to their constructed LegoPageElements.	1✔
165
166	Only contains elements that were successfully labeled and constructed.
167	The builder should use these pre-constructed elements rather than
168	re-parsing the source blocks.
169
170	Keys are block IDs (int) instead of Block objects to ensure JSON serializability.
171
172	Public for serialization. Prefer using get_constructed_element() method.
173	"""
174
175	candidates: dict[str, list[Candidate]] = Field(default_factory=dict)	1✔
176	"""Maps label names to lists of all candidates considered for that label.	1✔
177
178	Each candidate includes:
179	- The source element
180	- Its score and score details
181	- The constructed LegoPageElement (if successful)
182	- Failure reason (if construction failed)
183	- Whether it was the winner
184
185	This enables:
186	- Re-evaluation with hints (exclude specific candidates)
187	- Debugging (see why each candidate won/lost)
188	- UI support (show users alternatives)
189
190	Public for serialization. Prefer using get_* accessor methods.
191	"""
192
193	block_winners: dict[int, tuple[str, Candidate]] = Field(default_factory=dict)	1✔
194	"""Maps block IDs to their winning (label, candidate) tuple.	1✔
195
196	Ensures each block has at most one winning candidate across all labels.
197	Keys are block IDs (int) for JSON serializability.
198
199	Public for serialization. Prefer using get_label() and related methods.
200	"""
201
202	@model_validator(mode="after")	1✔
203	def validate_unique_block_ids(self) -> ClassificationResult:	1✔
204	"""Validate PageData blocks have unique IDs (if present).
205
206	Blocks may have None IDs, but blocks with IDs must have unique IDs.
207	Note: Only blocks with IDs can be tracked in _constructed_elements and
208	_removal_reasons (which require block.id as keys for JSON serializability).
209	"""
210	# Validate unique IDs (ignoring None values)
211	block_ids = [e.id for e in self.page_data.blocks if e.id is not None]	1✔
212	if len(block_ids) != len(set(block_ids)):	1✔
213	duplicates = [id_ for id_ in block_ids if block_ids.count(id_) > 1]	1✔
214	raise ValueError(	1✔
215	f"PageData blocks must have unique IDs. "
216	f"Found duplicates: {set(duplicates)}"
217	)
218	return self	1✔
219
220	def _validate_block_in_page_data(	1✔
221	self, block: Block \| None, param_name: str = "block"
222	) -> None:
223	"""Validate that a block is in PageData.
224
225	Args:
226	block: The block to validate (None is allowed and skips validation)
227	param_name: Name of the parameter being validated (for error messages)
228
229	Raises:
230	ValueError: If block is not None and not in PageData.blocks
231	"""
232	if block is not None and block not in self.page_data.blocks:	1✔
233	raise ValueError(f"{param_name} must be in PageData.blocks. Block: {block}")	1✔
234
235	@property	1✔
236	def blocks(self) -> list[Block]:	1✔
237	"""Get the blocks from the page data.
238
239	Returns:
240	List of blocks from the page data
241	"""
242	return self.page_data.blocks	×
243
244	def add_warning(self, warning: str) -> None:	1✔
245	"""Add a warning message to the classification result.
246
247	Args:
248	warning: The warning message to add
249	"""
250	self.warnings.append(warning)	1✔
251
252	def get_warnings(self) -> list[str]:	1✔
253	"""Get all warnings generated during classification.
254
255	Returns:
256	List of warning messages
257	"""
258	return self.warnings.copy()	1✔
259
260	def get_constructed_element(self, block: Block) -> LegoPageElement \| None:	1✔
261	"""Get the constructed LegoPageElement for a source block.
262
263	Args:
264	block: The source block to look up
265
266	Returns:
267	The constructed LegoPageElement if it exists, otherwise None
268	"""
269	return self.constructed_elements.get(block.id)	1✔
270
271	# TODO maybe add a parameter to filter out winners/non-winners
272	def get_candidates(self, label: str) -> list[Candidate]:	1✔
273	"""Get all candidates for a specific label.
274
275	Args:
276	label: The label to get candidates for
277
278	Returns:
279	List of candidates for that label (returns copy to prevent
280	external modification)
281	"""
282	return self.candidates.get(label, []).copy()	1✔
283
284	def get_all_candidates(self) -> dict[str, list[Candidate]]:	1✔
285	"""Get all candidates across all labels.
286
287	Returns:
288	Dictionary mapping labels to their candidates (returns deep copy)
289	"""
290	return {label: cands.copy() for label, cands in self.candidates.items()}	×
291
292	def add_candidate(self, label: str, candidate: Candidate) -> None:	1✔
293	"""Add a single candidate for a specific label.
294
295	Args:
296	label: The label this candidate is for
297	candidate: The candidate to add
298
299	Raises:
300	ValueError: If candidate has a source_block that is not in PageData
301	"""
302	self._validate_block_in_page_data(	1✔
303	candidate.source_block, "candidate.source_block"
304	)
305
306	if label not in self.candidates:	1✔
307	self.candidates[label] = []	1✔
308	self.candidates[label].append(candidate)	1✔
309
def mark_winner(
    self,
    candidate: Candidate,
    constructed: LegoPageElement,
) -> None:
    """Flag a candidate as the winner and record it in the tracking dicts.

    Candidates without a source block are only flagged; candidates with a
    source block are also registered in ``constructed_elements`` and
    ``block_winners`` under the block's id.

    Args:
        candidate: The candidate to mark as winner
        constructed: The constructed LegoPageElement

    Raises:
        ValueError: If candidate has a source_block that is not in PageData
        ValueError: If this block already has a winner candidate
    """
    self._validate_block_in_page_data(
        candidate.source_block, "candidate.source_block"
    )

    source = candidate.source_block
    if source is None:
        # No source block: there is nothing to track per block.
        candidate.is_winner = True
        return

    block_id = source.id
    if block_id in self.block_winners:
        # A block may win for one label only; report the conflicting label.
        existing_label, _existing_candidate = self.block_winners[block_id]
        raise ValueError(
            f"Block {block_id} already has a winner candidate for "
            f"label '{existing_label}'. Cannot mark as winner for "
            f"label '{candidate.label}'. Each block can have at most "
            f"one winner candidate."
        )

    candidate.is_winner = True
    # Remember both the constructed element and the (label, candidate) pair
    # under the source block's id.
    self.constructed_elements[block_id] = constructed
    self.block_winners[block_id] = (candidate.label, candidate)
349
def mark_removed(self, block: Block, reason: RemovalReason) -> None:
    """Record that a block was removed, together with the reason.

    Args:
        block: The block being removed
        reason: Why the block is removed; stored under the block's id

    Raises:
        ValueError: If block is not in PageData
    """
    self._validate_block_in_page_data(block, "block")

    removed_id = block.id
    self.removal_reasons[removed_id] = reason
362
363	# TODO Consider removing this method.
def get_labeled_blocks(self) -> dict[Block, str]:
    """Collect every block that has a winning candidate.

    Returns:
        Dictionary mapping each winning candidate's source block to its
        label. Winners without a source_block (synthetic candidates) are
        left out.
    """
    return {
        cand.source_block: label
        for label, group in self.candidates.items()
        for cand in group
        if cand.is_winner and cand.source_block is not None
    }
376
def get_label(self, block: Block) -> str | None:
    """Look up the label a block won in this classification result.

    Args:
        block: The block to look up (matched by identity, not equality)

    Returns:
        The label of the first winning candidate whose source_block is
        this block, or None when there is no such candidate
    """
    winning_labels = (
        label
        for label, group in self.candidates.items()
        for cand in group
        if cand.source_block is block and cand.is_winner
    )
    # The generator is lazy, so the scan stops at the first match.
    return next(winning_labels, None)
392
def get_blocks_by_label(self, label: str) -> list[Block]:
    """Return the blocks of all winning candidates for a label.

    Args:
        label: The label to search for

    Returns:
        One entry per winning candidate, in candidate order: the
        candidate's source_block when it has one, otherwise its constructed
        object. Winners that have neither are skipped.
    """
    found = []
    for cand in self.candidates.get(label, []):
        if not cand.is_winner:
            continue
        # Source block takes precedence; constructed is only the fallback
        # for candidates without a source block.
        chosen = (
            cand.source_block
            if cand.source_block is not None
            else cand.constructed
        )
        if chosen is not None:
            found.append(chosen)
    return found
413
def is_removed(self, block: Block) -> bool:
    """Tell whether a removal reason has been recorded for a block.

    Args:
        block: The block to check

    Returns:
        True if the block's id has an entry in removal_reasons,
        False otherwise
    """
    block_id = block.id
    return block_id in self.removal_reasons
424
def get_removal_reason(self, block: Block) -> RemovalReason | None:
    """Fetch the recorded removal reason for a block.

    Args:
        block: The block whose removal reason is wanted

    Returns:
        The RemovalReason stored under the block's id, or None when the
        block has no removal entry
    """
    block_id = block.id
    return self.removal_reasons.get(block_id)
435
def get_scores_for_label(self, label: str) -> dict[ScoreKey, Any]:
    """Gather the score details of every candidate for a label.

    Args:
        label: The label to get scores for

    Returns:
        Dictionary mapping each candidate's source_block to its
        score_details. Candidates without a source_block are skipped, and a
        later candidate for the same block overwrites an earlier one.
    """
    scores: dict[ScoreKey, Any] = {}
    for cand in self.candidates.get(label, []):
        if cand.source_block is None:
            continue
        scores[cand.source_block] = cand.score_details
    return scores
452
def has_label(self, label: str) -> bool:
    """Report whether the label has at least one winning candidate.

    Args:
        label: The label to check for

    Returns:
        True if any candidate stored for this label is a winner,
        False otherwise (including when the label is unknown)
    """
    for cand in self.candidates.get(label, []):
        if cand.is_winner:
            return True
    return False
464
def get_best_candidate(self, label: str) -> Candidate | None:
    """Pick the highest-scoring constructed candidate for a label.

    Only candidates whose ``constructed`` is not None take part; the
    ``is_winner`` flag is not consulted.

    Args:
        label: The label to get the best candidate for

    Returns:
        The candidate with the highest score that successfully constructed
        (the earliest one on ties), or None if no valid candidates exist
    """
    constructed_only = (
        cand
        for cand in self.candidates.get(label, [])
        if cand.constructed is not None
    )
    return max(constructed_only, key=lambda cand: cand.score, default=None)
478
def get_alternative_candidates(
    self, label: str, exclude_winner: bool = True
) -> list[Candidate]:
    """Get alternative candidates for a label (for UI/re-evaluation).

    Args:
        label: The label to get alternatives for
        exclude_winner: If True, exclude every candidate flagged as a
            winner for this label

    Returns:
        List of candidates sorted by score (highest first); empty when the
        label has no candidates
    """
    label_candidates = self.candidates.get(label, [])
    if exclude_winner:
        # Filter on each candidate's own winner flag. Comparing against the
        # identity of the first winning block dropped only one winner when
        # a label had several, and never matched a winner that has no
        # source_block.
        label_candidates = [c for c in label_candidates if not c.is_winner]
    return sorted(label_candidates, key=lambda c: c.score, reverse=True)
500
def get_part_image_pairs(self) -> list[tuple[Block, Block]]:
    """Collect (part_count, image) pairs from winning part_image candidates.

    The pairs are read from each winning candidate's score_details, which
    is expected to expose the paired elements as ``part_count`` and
    ``image`` attributes.

    Returns:
        List of (part_count, image) tuples, one per winning part_image
        candidate whose score_details carries both attributes
    """
    pairs: list[tuple[Block, Block]] = []
    for cand in self.get_candidates("part_image"):
        if not (cand.is_winner and cand.score_details):
            continue
        details = cand.score_details
        # Duck-typed check: skip score objects lacking either attribute.
        if not (hasattr(details, "part_count") and hasattr(details, "image")):
            continue
        pairs.append((details.part_count, details.image))
    return pairs

bramp / build-along / 19221936920

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous