• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20017191425

08 Dec 2025 04:39AM UTC coverage: 90.47% (+0.07%) from 90.402%
20017191425

push

github

bramp
Add TriviaTextClassifier for detecting flavor/story text on instruction pages

- Add TriviaTextClassifier using union-find spatial clustering to detect
  dense clusters of text blocks (trivia/flavor content)
- Filter numeric text (part numbers, element IDs, counts) from consideration
- Include related images and drawings in the trivia text bounding box
- Exclude large background elements (>50% of page) from expansion
- Clamp final bbox to page bounds to avoid boundary violations
- Add TriviaTextConfig with min_text_blocks, min_total_characters, proximity_margin
- Register classifier in classifier.py and PageClassifier
- Add TriviaText element to Page and iter_elements()
- Update golden test file for page 17

130 of 139 new or added lines in 7 files covered. (93.53%)

41 existing lines in 4 files now uncovered.

11230 of 12413 relevant lines covered (90.47%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.43
/src/build_a_long/pdf_extract/validation/runner.py
1
"""Main validation runner that orchestrates all validation rules."""
2

3
from build_a_long.pdf_extract.classifier import BatchClassificationResult
1✔
4
from build_a_long.pdf_extract.extractor import PageData
1✔
5
from build_a_long.pdf_extract.extractor.lego_page_elements import Page
1✔
6

7
from .rules import (
1✔
8
    validate_catalog_coverage,
9
    validate_content_no_metadata_overlap,
10
    validate_elements_within_page,
11
    validate_first_page_number,
12
    validate_invalid_pages,
13
    validate_missing_page_numbers,
14
    validate_no_divider_intersection,
15
    validate_page_number_sequence,
16
    validate_part_contains_children,
17
    validate_parts_list_has_parts,
18
    validate_parts_lists_no_overlap,
19
    validate_progress_bar_sequence,
20
    validate_skipped_pages,
21
    validate_step_sequence,
22
    validate_steps_have_parts,
23
    validate_steps_no_significant_overlap,
24
    validate_unassigned_blocks,
25
)
26
from .types import ValidationResult
1✔
27

28

29
def validate_results(
    batch_result: BatchClassificationResult,
) -> ValidationResult:
    """Run all batch-level validation rules on classification results.

    This function checks for common issues that indicate the extraction
    may not be working correctly for a particular instruction book. It walks
    every per-page result once to collect the data each rule needs, then
    hands that data to the individual rule functions.

    Validation rules:
    - Skipped pages are collected (with their reason) and passed to
      validate_skipped_pages; no other data is collected for them
    - Unassigned blocks are checked for every non-skipped result
    - Pages with no Page object (and not skipped) are passed to
      validate_invalid_pages
    - Each page should have a page number detected
    - Step numbers should form a continuous sequence without gaps
    - Step numbers should not have duplicates
    - Each step should have a non-empty parts list
    - The first page number found should be reasonable (typically 1-4)
    - Detected page numbers are checked as a sequence
    - Progress bar values are checked as a sequence
    - Catalog coverage is checked (invoked with experimental=True)

    Args:
        batch_result: The complete batch classification result.

    Returns:
        ValidationResult containing all found issues
    """
    validation = ValidationResult()

    # Collect data for validation
    missing_page_numbers: list[int] = []
    step_numbers_seen: list[tuple[int, int]] = []  # (pdf_page, step_number)
    steps_without_parts: list[tuple[int, int]] = []  # (pdf_page, step_number)
    lego_page_numbers: list[int] = []  # Detected LEGO page numbers
    skipped_pages: list[tuple[int, str]] = []  # (pdf_page, reason)
    invalid_pages: list[int] = []  # Pages where classification produced no Page
    progress_bars: list[tuple[int, float]] = []  # (pdf_page, progress_value)

    for result in batch_result.results:
        page = result.page
        pdf_page = result.page_data.page_number

        # Check for skipped pages
        if result.skipped_reason:
            skipped_pages.append((pdf_page, result.skipped_reason))
            continue  # Don't collect other data for skipped pages

        # Check for unassigned blocks (runs even when no Page was produced)
        validate_unassigned_blocks(validation, result)

        # Check for invalid pages (no Page object but also not skipped)
        if page is None:
            invalid_pages.append(pdf_page)
            continue

        # From here on `page` is a real Page, so no further None checks
        # are needed.

        # Check for page number
        if page.page_number:
            lego_page_numbers.append(page.page_number.value)
        else:
            missing_page_numbers.append(pdf_page)

        # Collect progress bar value
        if page.progress_bar and page.progress_bar.progress is not None:
            progress_bars.append((pdf_page, page.progress_bar.progress))

        # Collect step numbers
        for step in page.steps:
            step_number = step.step_number.value
            step_numbers_seen.append((pdf_page, step_number))

            # Check for steps without parts lists
            if step.parts_list is None or len(step.parts_list.parts) == 0:
                steps_without_parts.append((pdf_page, step_number))

    # --- Validation Rules ---

    # Rule 0: Skipped pages
    validate_skipped_pages(validation, skipped_pages)

    # Rule 0b: Invalid pages (classification failed to produce a Page)
    validate_invalid_pages(validation, invalid_pages)

    # Rule 1: Missing page numbers
    validate_missing_page_numbers(
        validation, missing_page_numbers, len(batch_result.results)
    )

    # Rule 2 & 3: Step number sequence validation
    validate_step_sequence(validation, step_numbers_seen)

    # Rule 4: Steps without parts lists
    validate_steps_have_parts(validation, steps_without_parts)

    # Rule 5: First page number validation
    validate_first_page_number(validation, lego_page_numbers)

    # Rule 6: Page number sequence validation
    validate_page_number_sequence(validation, lego_page_numbers)

    # Rule 7: Progress bar sequence validation
    validate_progress_bar_sequence(validation, progress_bars)

    # Rule 8: Catalog coverage
    validate_catalog_coverage(validation, batch_result.manual, experimental=True)

    return validation
132

133

134
def validate_page(
    page: Page,
    page_data: PageData,
    validation: ValidationResult | None = None,
    *,
    step_overlap_threshold: float = 0.05,
) -> ValidationResult:
    """Apply the per-page domain invariant rules to one classified page.

    The rules inspect structural/spatial properties of the page's elements
    and record any violations of LEGO instruction layout invariants on the
    validation result.

    Rules, in the order they run:
    1. Each PartsList should contain at least one Part
    2. PartsList bounding boxes should not overlap
    3. Step bounding boxes should not significantly overlap
    4. Part bbox should contain its count and diagram bboxes
    5. All elements should stay within page boundaries
    6. Content elements should not overlap page metadata
    7. Content should not overlap dividers

    Args:
        page: The classified Page object
        page_data: The raw PageData for context (page number, bbox, source)
        validation: Optional existing ValidationResult to add to.
            If None, a new one is created.
        step_overlap_threshold: Maximum allowed IOU for step overlap (default 5%)

    Returns:
        The ValidationResult that was passed in (or the newly created one),
        after every rule has run against it.
    """
    outcome = ValidationResult() if validation is None else validation

    # Ordered table of rule callables; each takes (result, page, page_data).
    # The step-overlap rule needs the extra threshold, so it is adapted here.
    ordered_rules = (
        validate_parts_list_has_parts,
        validate_parts_lists_no_overlap,
        lambda res, pg, data: validate_steps_no_significant_overlap(
            res, pg, data, step_overlap_threshold
        ),
        validate_part_contains_children,
        validate_elements_within_page,
        validate_content_no_metadata_overlap,
        validate_no_divider_intersection,
    )

    for rule in ordered_rules:
        rule(outcome, page, page_data)

    return outcome
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc