• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19221936920

10 Nov 2025 05:39AM UTC coverage: 86.707% (-0.4%) from 87.11%
19221936920

push

github

bramp
fix(pdf_extract): Fix argparse prefix matching and improve boolean flags

- Add allow_abbrev=False to prevent --draw from matching --draw-deleted
- Change to BooleanOptionalAction for --summary, --draw, and --draw-deleted
  This provides both positive and negative forms (e.g., --draw/--no-draw)
- Remove set_defaults() call since defaults are now in argument definitions
- Fixes bug where deleted blocks were drawn without --draw-deleted flag

4253 of 4905 relevant lines covered (86.71%)

0.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.13
/src/build_a_long/pdf_extract/classifier/classification_result.py
1
"""
2
Data classes for the classifier.
3
"""
4

5
from __future__ import annotations
1✔
6

7
from typing import Annotated, Any
1✔
8

9
from annotated_types import Ge, Le
1✔
10
from pydantic import BaseModel, Field, model_validator
1✔
11

12
from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints
1✔
13
from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram
1✔
14
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
15
from build_a_long.pdf_extract.extractor.extractor import PageData
1✔
16
from build_a_long.pdf_extract.extractor.lego_page_elements import LegoPageElement
1✔
17
from build_a_long.pdf_extract.extractor.page_blocks import Block
1✔
18

19
# Score key can be either a single Block or a tuple of Blocks (for pairings)
20
ScoreKey = Block | tuple[Block, ...]
1✔
21

22
# Weight value constrained to [0.0, 1.0] range
23
Weight = Annotated[float, Ge(0), Le(1)]
1✔
24

25

26
# TODO Make this JSON serializable
27
class BatchClassificationResult(BaseModel):
    """Results from classifying multiple pages together.

    This class holds both the per-page classification results and the
    global text histogram computed across all pages.
    """

    results: list[ClassificationResult]
    """Per-page classification results, one for each input page"""

    histogram: TextHistogram
    """Global text histogram computed across all pages"""
39

40

41
class RemovalReason(BaseModel):
    """Tracks why a block was removed during classification."""

    reason_type: str
    """Type of removal: 'child_bbox' or 'similar_bbox'"""

    target_block: Block
    """The block that caused this removal"""
49

50

51
class Candidate(BaseModel):
    """A candidate block with its score and constructed LegoElement.

    Represents a single block that was considered for a particular label,
    including its score, the constructed LegoPageElement (if successful),
    and information about why it succeeded or failed.

    This enables:
    - Re-evaluation with hints (exclude specific candidates)
    - Debugging (see all candidates and why they won/lost)
    - UI support (show users alternatives)
    """

    bbox: BBox
    """The bounding box for this candidate (from source_block or constructed)"""

    label: str
    """The label this candidate would have (e.g., 'page_number')"""

    # TODO Maybe score is redundant with score_details?
    score: float
    """Combined score (0.0-1.0)"""

    score_details: Any
    """The detailed score object (e.g., _PageNumberScore)"""

    constructed: LegoPageElement | None
    """The constructed LegoElement if parsing succeeded, None if failed"""

    source_block: Block | None = None
    """The raw element that was scored (None for synthetic elements like Step)"""

    failure_reason: str | None = None
    """Why construction failed, if it did"""

    is_winner: bool = False
    """Whether this candidate was selected as the winner.

    This field is set by mark_winner() and is used for:
    - Querying winners (get_label, get_blocks_by_label, has_label)
    - Synthetic candidates (which have no source_block and can't be tracked
      in block_winners)
    - JSON serialization and golden file comparisons

    Note: For candidates with source_block, this is redundant with
    block_winners, but provides convenient access and handles synthetic
    candidates.
    """
99

100

101
class ClassifierConfig(BaseModel):
    """Configuration for the classifier.

    The ``*_weight`` fields are typed as ``Weight`` and are therefore
    constrained to the [0.0, 1.0] range.
    """

    # TODO Not sure what this value is used for
    min_confidence_threshold: float = 0.5

    page_number_text_weight: Weight = 0.7
    page_number_position_weight: Weight = 0.3
    page_number_position_scale: float = 50.0
    page_number_page_value_weight: Weight = 1.0
    page_number_font_size_weight: Weight = 0.1

    step_number_text_weight: Weight = 0.7
    step_number_font_size_weight: Weight = 0.3

    part_count_text_weight: Weight = 0.7
    part_count_font_size_weight: Weight = 0.3

    font_size_hints: FontSizeHints = Field(default_factory=FontSizeHints.empty)
    """Font size hints derived from analyzing all pages"""
121

122

123
class ClassificationResult(BaseModel):
    """Result of classifying a single page.

    This class stores both the results and intermediate artifacts for a page
    classification. It provides structured access to:
    - Labels assigned to blocks
    - LegoPageElements constructed from blocks
    - Removal reasons for filtered blocks
    - All candidates considered (including rejected ones)

    The use of dictionaries keyed by block IDs (int) instead of Block objects
    ensures JSON serializability and consistent equality semantics.

    External code should use the accessor methods rather than accessing these
    fields directly to maintain encapsulation.
    """

    # TODO: Consider refactoring to separate DAO (Data Access Object)
    # representation from the business logic. The public fields below are used
    # for serialization but external code should prefer using the accessor
    # methods to maintain encapsulation and allow future refactoring.

    page_data: PageData
    """The original page data being classified"""

    warnings: list[str] = Field(default_factory=list)
    """Warning messages generated during classification.

    Public for serialization. Prefer using add_warning() and get_warnings() methods.
    """

    removal_reasons: dict[int, RemovalReason] = Field(default_factory=dict)
    """Maps block IDs (block.id, not id(block)) to the reason they were removed.

    Keys are block IDs (int) instead of Block objects to ensure JSON serializability
    and consistency with constructed_elements.

    Public for serialization. Prefer using accessor methods.
    """

    constructed_elements: dict[int, LegoPageElement] = Field(default_factory=dict)
    """Maps source block IDs to their constructed LegoPageElements.

    Only contains elements that were successfully labeled and constructed.
    The builder should use these pre-constructed elements rather than
    re-parsing the source blocks.

    Keys are block IDs (int) instead of Block objects to ensure JSON serializability.

    Public for serialization. Prefer using get_constructed_element() method.
    """

    candidates: dict[str, list[Candidate]] = Field(default_factory=dict)
    """Maps label names to lists of all candidates considered for that label.

    Each candidate includes:
    - The source element
    - Its score and score details
    - The constructed LegoPageElement (if successful)
    - Failure reason (if construction failed)
    - Whether it was the winner

    This enables:
    - Re-evaluation with hints (exclude specific candidates)
    - Debugging (see why each candidate won/lost)
    - UI support (show users alternatives)

    Public for serialization. Prefer using get_* accessor methods.
    """

    block_winners: dict[int, tuple[str, Candidate]] = Field(default_factory=dict)
    """Maps block IDs to their winning (label, candidate) tuple.

    Ensures each block has at most one winning candidate across all labels.
    Keys are block IDs (int) for JSON serializability.

    Public for serialization. Prefer using get_label() and related methods.
    """

    @model_validator(mode="after")
    def validate_unique_block_ids(self) -> ClassificationResult:
        """Validate PageData blocks have unique IDs (if present).

        Blocks may have None IDs, but blocks with IDs must have unique IDs.
        Note: Only blocks with IDs can be tracked in constructed_elements and
        removal_reasons (which require block.id as keys for JSON serializability).

        Returns:
            This instance, unchanged, when validation passes

        Raises:
            ValueError: If two or more blocks share the same non-None ID
        """
        # Single pass over the IDs (ignoring None values): anything seen a
        # second time is a duplicate.
        seen_ids = set()
        duplicates = set()
        for block in self.page_data.blocks:
            block_id = block.id
            if block_id is None:
                continue
            if block_id in seen_ids:
                duplicates.add(block_id)
            else:
                seen_ids.add(block_id)

        if duplicates:
            raise ValueError(
                f"PageData blocks must have unique IDs. "
                f"Found duplicates: {duplicates}"
            )
        return self

    def _validate_block_in_page_data(
        self, block: Block | None, param_name: str = "block"
    ) -> None:
        """Validate that a block is in PageData.

        Args:
            block: The block to validate (None is allowed and skips validation)
            param_name: Name of the parameter being validated (for error messages)

        Raises:
            ValueError: If block is not None and not in PageData.blocks
        """
        if block is not None and block not in self.page_data.blocks:
            raise ValueError(f"{param_name} must be in PageData.blocks. Block: {block}")

    @property
    def blocks(self) -> list[Block]:
        """Get the blocks from the page data.

        Returns:
            List of blocks from the page data (the same list object, not a copy)
        """
        return self.page_data.blocks

    def add_warning(self, warning: str) -> None:
        """Add a warning message to the classification result.

        Args:
            warning: The warning message to add
        """
        self.warnings.append(warning)

    def get_warnings(self) -> list[str]:
        """Get all warnings generated during classification.

        Returns:
            A copy of the list of warning messages
        """
        return self.warnings.copy()

    def get_constructed_element(self, block: Block) -> LegoPageElement | None:
        """Get the constructed LegoPageElement for a source block.

        Args:
            block: The source block to look up

        Returns:
            The constructed LegoPageElement if it exists, otherwise None
        """
        return self.constructed_elements.get(block.id)

    # TODO maybe add a parameter to filter out winners/non-winners
    def get_candidates(self, label: str) -> list[Candidate]:
        """Get all candidates for a specific label.

        Args:
            label: The label to get candidates for

        Returns:
            List of candidates for that label (a copy of the list, so callers
            cannot add or remove entries; the Candidate objects are shared)
        """
        return self.candidates.get(label, []).copy()

    def get_all_candidates(self) -> dict[str, list[Candidate]]:
        """Get all candidates across all labels.

        Returns:
            Dictionary mapping labels to their candidates. The dictionary and
            each list are fresh copies, but the Candidate objects themselves
            are shared with this result (this is not a deep copy).
        """
        return {label: cands.copy() for label, cands in self.candidates.items()}

    def add_candidate(self, label: str, candidate: Candidate) -> None:
        """Add a single candidate for a specific label.

        Args:
            label: The label this candidate is for
            candidate: The candidate to add

        Raises:
            ValueError: If candidate has a source_block that is not in PageData
        """
        self._validate_block_in_page_data(
            candidate.source_block, "candidate.source_block"
        )
        self.candidates.setdefault(label, []).append(candidate)

    def mark_winner(
        self,
        candidate: Candidate,
        constructed: LegoPageElement,
    ) -> None:
        """Mark a candidate as the winner and update tracking dicts.

        Synthetic candidates (no source_block) are only flagged via
        ``is_winner``; they are not recorded in constructed_elements or
        block_winners because those are keyed by block ID.

        Args:
            candidate: The candidate to mark as winner
            constructed: The constructed LegoPageElement

        Raises:
            ValueError: If candidate has a source_block that is not in PageData
            ValueError: If this block already has a winner candidate
        """
        source_block = candidate.source_block
        self._validate_block_in_page_data(source_block, "candidate.source_block")

        # Check if this block already has a winner
        if source_block is not None:
            block_id = source_block.id
            if block_id in self.block_winners:
                existing_label, _ = self.block_winners[block_id]
                raise ValueError(
                    f"Block {block_id} already has a winner candidate for "
                    f"label '{existing_label}'. Cannot mark as winner for "
                    f"label '{candidate.label}'. Each block can have at most "
                    f"one winner candidate."
                )

        candidate.is_winner = True
        # Store the constructed element for this source element
        if source_block is not None:
            self.constructed_elements[source_block.id] = constructed
            self.block_winners[source_block.id] = (candidate.label, candidate)

    def mark_removed(self, block: Block, reason: RemovalReason) -> None:
        """Mark a block as removed with the given reason.

        Args:
            block: The block to mark as removed
            reason: The reason for removal

        Raises:
            ValueError: If block is not in PageData
        """
        self._validate_block_in_page_data(block, "block")
        self.removal_reasons[block.id] = reason

    # TODO Consider removing this method.
    def get_labeled_blocks(self) -> dict[Block, str]:
        """Get a dictionary of all labeled blocks.

        Returns:
            Dictionary mapping blocks to their labels (excludes synthetic candidates)
        """
        labeled: dict[Block, str] = {}
        for label, label_candidates in self.candidates.items():
            for candidate in label_candidates:
                if candidate.is_winner and candidate.source_block is not None:
                    labeled[candidate.source_block] = label
        return labeled

    def get_label(self, block: Block) -> str | None:
        """Get the label for a block from this classification result.

        The block is matched by object identity against each winning
        candidate's source_block.

        Args:
            block: The block to get the label for

        Returns:
            The label string if found, None otherwise
        """
        # Search through all candidates to find the winning label for this block
        for label, label_candidates in self.candidates.items():
            for candidate in label_candidates:
                if candidate.source_block is block and candidate.is_winner:
                    return label
        return None

    def get_blocks_by_label(self, label: str) -> list[Block]:
        """Get all blocks with the given label.

        Args:
            label: The label to search for

        Returns:
            List of blocks with that label. For regular candidates this is the
            source_block; for synthetic winners (no source_block) the
            constructed object is returned instead, so the list may contain
            LegoPageElement instances despite the annotation.
        """
        blocks = []
        for c in self.candidates.get(label, []):
            if not c.is_winner:
                continue
            # Prefer source_block, fall back to constructed for synthetic blocks
            if c.source_block is not None:
                blocks.append(c.source_block)
            elif c.constructed is not None:
                blocks.append(c.constructed)
        return blocks

    def is_removed(self, block: Block) -> bool:
        """Check if a block has been marked for removal.

        Args:
            block: The block to check

        Returns:
            True if the block is marked for removal, False otherwise
        """
        return block.id in self.removal_reasons

    def get_removal_reason(self, block: Block) -> RemovalReason | None:
        """Get the reason why a block was removed.

        Args:
            block: The block to get the removal reason for

        Returns:
            The RemovalReason if the block was removed, None otherwise
        """
        return self.removal_reasons.get(block.id)

    def get_scores_for_label(self, label: str) -> dict[ScoreKey, Any]:
        """Get all scores for a specific label.

        Args:
            label: The label to get scores for

        Returns:
            Dictionary mapping elements to score objects for that label
            (excludes synthetic candidates without source_block)
        """
        label_candidates = self.candidates.get(label, [])
        return {
            c.source_block: c.score_details
            for c in label_candidates
            if c.source_block is not None
        }

    def has_label(self, label: str) -> bool:
        """Check if any elements have been assigned the given label.

        Args:
            label: The label to check for

        Returns:
            True if at least one element has this label, False otherwise
        """
        label_candidates = self.candidates.get(label, [])
        return any(c.is_winner for c in label_candidates)

    def get_best_candidate(self, label: str) -> Candidate | None:
        """Get the highest-scoring successfully constructed candidate for a label.

        This looks only at score and construction success; it does not consult
        ``is_winner``, so the result is not necessarily a marked winner.

        Args:
            label: The label to get the best candidate for

        Returns:
            The candidate with the highest score that successfully constructed,
            or None if no valid candidates exist
        """
        label_candidates = self.candidates.get(label, [])
        valid = [c for c in label_candidates if c.constructed is not None]
        return max(valid, key=lambda c: c.score) if valid else None

    def get_alternative_candidates(
        self, label: str, exclude_winner: bool = True
    ) -> list[Candidate]:
        """Get alternative candidates for a label (for UI/re-evaluation).

        Args:
            label: The label to get alternatives for
            exclude_winner: If True, exclude every candidate marked as a winner
                (including synthetic winners that have no source_block)

        Returns:
            List of candidates sorted by score (highest first)
        """
        label_candidates = self.candidates.get(label, [])
        if exclude_winner:
            # Filter on the is_winner flag rather than comparing source blocks:
            # a label may have several winners, and synthetic winners have no
            # source_block to compare against.
            label_candidates = [c for c in label_candidates if not c.is_winner]
        return sorted(label_candidates, key=lambda c: c.score, reverse=True)

    def get_part_image_pairs(self) -> list[tuple[Block, Block]]:
        """Get part_count and part_image element pairs from winning candidates.

        This derives the pairs from the part_image candidates' score_details,
        which contain the relationship between part_count text and image elements.

        Returns:
            List of (part_count, image) tuples for all winning part_image candidates
        """
        pairs: list[tuple[Block, Block]] = []
        for candidate in self.get_candidates("part_image"):
            if candidate.is_winner and candidate.score_details:
                # score_details is a _PartImageScore with part_count and image fields
                score = candidate.score_details
                if hasattr(score, "part_count") and hasattr(score, "image"):
                    pairs.append((score.part_count, score.image))
        return pairs
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc