• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19796282858

30 Nov 2025 08:32AM UTC coverage: 90.603% (-0.04%) from 90.646%
19796282858

push

github

bramp
test: update golden files for classifier with hints

Regenerated golden files using classifier config with font_hints and
page_hints for improved classification accuracy.

9835 of 10855 relevant lines covered (90.6%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.07
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
The classification pipeline operates in two main phases:
7

8
1. **Bottom-up Scoring**: All classifiers run independently to identify potential
9
   candidates (e.g. page numbers, part counts, step numbers) and score them based
10
   on heuristics. No construction of final elements happens here.
11

12
2. **Top-down Construction**: The root `PageClassifier` is invoked to construct
13
   the final `Page` object. It recursively requests the construction of its
14
   dependencies (e.g. "Give me the best PageNumber"), which in turn construct
15
   their own dependencies. This ensures a consistent and validated object tree.
16

17
"""
18

19
from __future__ import annotations
1✔
20

21
import logging
1✔
22

23
from build_a_long.pdf_extract.classifier.bags import (
1✔
24
    BagNumberClassifier,
25
    NewBagClassifier,
26
)
27
from build_a_long.pdf_extract.classifier.batch_classification_result import (
1✔
28
    BatchClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.block_filter import (
1✔
31
    filter_background_blocks,
32
    filter_duplicate_blocks,
33
    filter_overlapping_text_blocks,
34
)
35
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
36
    ClassificationResult,
37
)
38
from build_a_long.pdf_extract.classifier.classifier_config import ClassifierConfig
1✔
39
from build_a_long.pdf_extract.classifier.pages import (
1✔
40
    PageHintCollection,
41
)
42
from build_a_long.pdf_extract.classifier.pages.page_classifier import PageClassifier
1✔
43
from build_a_long.pdf_extract.classifier.pages.page_number_classifier import (
1✔
44
    PageNumberClassifier,
45
)
46
from build_a_long.pdf_extract.classifier.pages.progress_bar_classifier import (
1✔
47
    ProgressBarClassifier,
48
)
49
from build_a_long.pdf_extract.classifier.parts import (
1✔
50
    PartCountClassifier,
51
    PartNumberClassifier,
52
    PartsClassifier,
53
    PartsImageClassifier,
54
    PartsListClassifier,
55
    PieceLengthClassifier,
56
    ShineClassifier,
57
)
58
from build_a_long.pdf_extract.classifier.removal_reason import RemovalReason
1✔
59
from build_a_long.pdf_extract.classifier.steps import (
1✔
60
    ArrowClassifier,
61
    DiagramClassifier,
62
    RotationSymbolClassifier,
63
    StepClassifier,
64
    StepCountClassifier,
65
    StepNumberClassifier,
66
    SubStepClassifier,
67
)
68
from build_a_long.pdf_extract.classifier.text import FontSizeHints, TextHistogram
1✔
69
from build_a_long.pdf_extract.classifier.topological_sort import topological_sort
1✔
70
from build_a_long.pdf_extract.extractor import PageData
1✔
71
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
72
    PageNumber,
73
    PartCount,
74
    PartsList,
75
    StepNumber,
76
)
77
from build_a_long.pdf_extract.extractor.page_blocks import Blocks
1✔
78

79
# Module-level logger; configured by the application, not by this module.
logger = logging.getLogger(__name__)

# Pages with more blocks than this threshold will be skipped during classification.
# This avoids O(n²) algorithms (like duplicate detection) that become prohibitively
# slow on pages with thousands of vector drawings. Such pages are typically info
# pages where each character is a separate vector graphic.
# TODO: Add spatial indexing to handle high-block pages efficiently.
MAX_BLOCKS_PER_PAGE = 1000
87

88

89
# TODO require config, so we don't accidentally use default empty config
def classify_elements(
    page: PageData, config: ClassifierConfig | None = None
) -> ClassificationResult:
    """Classify and label elements on a single page using rule-based heuristics.

    Args:
        page: A single PageData object to classify.
        config: Optional classifier configuration with font/page hints.
            If None, a default empty configuration (no hints) is used.
            For better classification accuracy, pass a config with
            FontSizeHints computed from multiple pages of the same PDF.

    Returns:
        A ClassificationResult object containing the classification results.
    """
    effective_config = config if config is not None else ClassifierConfig()
    return Classifier(effective_config).classify(page)
110

111

112
def _filter_page_blocks(
    page_data: PageData,
) -> tuple[
    list[Blocks],
    dict[Blocks, RemovalReason],
    dict[Blocks, RemovalReason],
    dict[Blocks, RemovalReason],
]:
    """Run the standard block filters on a single page.

    Applies, in order: background removal (full-page blocks), overlapping
    text removal (e.g., "4" and "43" at the same origin), and IOU-based
    duplicate image/drawing removal.

    Args:
        page_data: The page whose blocks should be filtered.

    Returns:
        A tuple of (kept_blocks, background_removed, text_removed,
        bbox_removed) where the three dicts map each removed block to its
        RemovalReason.
    """
    kept_blocks = page_data.blocks

    # Filter background blocks (full page blocks like background images)
    kept_blocks, background_removed = filter_background_blocks(
        kept_blocks, page_data.bbox.width, page_data.bbox.height
    )

    # Filter overlapping text blocks (e.g., "4" and "43" at same origin)
    kept_blocks, text_removed = filter_overlapping_text_blocks(kept_blocks)

    # Filter duplicate image/drawing blocks based on IOU
    kept_blocks, bbox_removed = filter_duplicate_blocks(kept_blocks)

    return kept_blocks, background_removed, text_removed, bbox_removed


def classify_pages(
    pages: list[PageData], pages_for_hints: list[PageData] | None = None
) -> BatchClassificationResult:
    """Classify and label elements across multiple pages using rule-based heuristics.

    This function performs a three-phase process:
    1. Filtering phase: Mark duplicate/similar blocks as removed on each page
    2. Analysis phase: Build font size hints from text properties (excluding
       removed blocks)
    3. Classification phase: Use hints to guide element classification

    Args:
        pages: A list of PageData objects to classify.
        pages_for_hints: Optional list of pages to use for generating font/page hints.
            If None, uses `pages`. This allows generating hints from all pages
            while only classifying a subset (e.g., when using --pages filter).

    Returns:
        BatchClassificationResult containing per-page results and global histogram
    """
    # Use all pages for hint generation if provided, otherwise use selected pages
    hint_pages = pages_for_hints if pages_for_hints is not None else pages

    # Phase 1: Filter duplicate blocks on each page and track removals
    # Skip pages with too many blocks to avoid O(n²) performance issues
    removed_blocks_per_page: list[dict[Blocks, RemovalReason]] = []
    skipped_pages: set[int] = set()  # Track page numbers that are skipped

    for page_data in pages:
        # Skip pages with too many blocks - these are likely info/inventory pages
        # with vectorized text that cause O(n²) algorithms to be very slow
        if len(page_data.blocks) > MAX_BLOCKS_PER_PAGE:
            logger.debug(
                f"Page {page_data.page_number}: skipping classification "
                f"({len(page_data.blocks)} blocks exceeds threshold of "
                f"{MAX_BLOCKS_PER_PAGE})"
            )
            skipped_pages.add(page_data.page_number)
            removed_blocks_per_page.append({})
            continue

        _, background_removed, text_removed, bbox_removed = _filter_page_blocks(
            page_data
        )

        # Combine all removal mappings into a single dict for this page
        combined_removed_mapping = {
            **text_removed,
            **bbox_removed,
            **background_removed,
        }

        logger.debug(
            f"Page {page_data.page_number}: "
            f"filtered {len(text_removed)} overlapping text, "
            f"{len(bbox_removed)} duplicate bbox blocks, "
            f"{len(background_removed)} background blocks"
        )

        removed_blocks_per_page.append(combined_removed_mapping)

    # Phase 2: Extract font size hints from hint pages (excluding removed blocks)
    # Build pages with non-removed blocks for hint extraction and histogram

    # Filter duplicates from hint pages (may be different from pages to classify)
    hint_pages_without_duplicates = []
    for page_data in hint_pages:
        # Skip high-block pages for hints too (same threshold)
        if len(page_data.blocks) > MAX_BLOCKS_PER_PAGE:
            continue

        # TODO We are re-filtering duplicates here; optimize by changing the API
        # to accept one list of PageData, and separate by page_numbers.
        kept_blocks, _, _, _ = _filter_page_blocks(page_data)

        hint_pages_without_duplicates.append(
            PageData(
                page_number=page_data.page_number,
                bbox=page_data.bbox,
                blocks=kept_blocks,
            )
        )

    # Build pages without duplicates for classification
    pages_without_duplicates = []
    for page_data, removed_mapping in zip(pages, removed_blocks_per_page, strict=True):
        # We need to filter blocks that were removed by ANY filter
        non_removed_blocks = [
            block for block in page_data.blocks if block not in removed_mapping
        ]
        pages_without_duplicates.append(
            PageData(
                page_number=page_data.page_number,
                bbox=page_data.bbox,
                blocks=non_removed_blocks,
            )
        )

    # Generate hints from hint pages, histogram from pages to classify
    font_size_hints = FontSizeHints.from_pages(hint_pages_without_duplicates)
    page_hints = PageHintCollection.from_pages(hint_pages_without_duplicates)
    histogram = TextHistogram.from_pages(pages_without_duplicates)

    # Phase 3: Classify using the hints (on pages without duplicates)
    config = ClassifierConfig(font_size_hints=font_size_hints, page_hints=page_hints)
    classifier = Classifier(config)

    results = []
    for page_data, page_without_duplicates, removed_mapping in zip(
        pages, pages_without_duplicates, removed_blocks_per_page, strict=True
    ):
        # Handle skipped pages: emit a placeholder result explaining why
        if page_data.page_number in skipped_pages:
            result = ClassificationResult(
                page_data=page_data,
                skipped_reason=(
                    f"Page has {len(page_data.blocks)} blocks, which exceeds "
                    f"the threshold of {MAX_BLOCKS_PER_PAGE}. This is likely an "
                    f"info/inventory page with vectorized text."
                ),
            )
            results.append(result)
            continue

        # Classify using only non-removed blocks
        result = classifier.classify(page_without_duplicates)

        # Update result to use original page_data (with all blocks)
        result.page_data = page_data

        # Mark removed blocks so downstream consumers see why they were dropped
        for removed_block, removal_reason in removed_mapping.items():
            result.mark_removed(removed_block, removal_reason)

        results.append(result)

    return BatchClassificationResult(results=results, histogram=histogram)
266

267

268
type Classifiers = (
1✔
269
    PageNumberClassifier
270
    | ProgressBarClassifier
271
    | BagNumberClassifier
272
    | PartCountClassifier
273
    | PartNumberClassifier
274
    | StepNumberClassifier
275
    | StepCountClassifier
276
    | PieceLengthClassifier
277
    | PartsClassifier
278
    | PartsListClassifier
279
    | PartsImageClassifier
280
    | ShineClassifier
281
    | NewBagClassifier
282
    | DiagramClassifier
283
    | ArrowClassifier
284
    | SubStepClassifier
285
    | StepClassifier
286
    | PageClassifier
287
)
288

289

290
class Classifier:
    """
    Performs a single run of classification based on rules, configuration, and hints.
    This class should be stateless.
    """

    def __init__(self, config: ClassifierConfig):
        self.config = config
        # Instantiate one classifier of each type, then order them so that
        # every classifier runs after the classifiers it depends on.
        classifier_types = [
            PageNumberClassifier,
            ProgressBarClassifier,
            BagNumberClassifier,
            PartCountClassifier,
            PartNumberClassifier,
            StepNumberClassifier,
            StepCountClassifier,
            PieceLengthClassifier,
            PartsClassifier,
            PartsListClassifier,
            DiagramClassifier,
            RotationSymbolClassifier,
            ArrowClassifier,
            PartsImageClassifier,
            ShineClassifier,
            NewBagClassifier,
            SubStepClassifier,
            StepClassifier,
            PageClassifier,
        ]
        self.classifiers = topological_sort(
            [classifier_type(config) for classifier_type in classifier_types]
        )

    def classify(self, page_data: PageData) -> ClassificationResult:
        """Run the classification pipeline for one page and return the result.

        ``page_data`` itself is NOT modified; all output is recorded on the
        returned ClassificationResult.

        The pipeline runs in three phases:
        1. Score all classifiers (bottom-up) - score() also auto-registers
           each classifier for its output labels
        2. Construct final elements (top-down starting from Page)
        3. Collect post-classification warnings
        """
        result = ClassificationResult(page_data=page_data)

        logger.debug(f"Starting classification for page {page_data.page_number}")

        # Phase 1: bottom-up scoring of candidate elements.
        for c in self.classifiers:
            c.score(result)

        # Phase 2: top-down construction, rooted at the PageClassifier.
        root = next(c for c in self.classifiers if isinstance(c, PageClassifier))
        root.build_all(result)

        # Phase 3: attach structural warnings to the result.
        # TODO Do we actually ever add warnings?
        for warning in self._log_post_classification_warnings(page_data, result):
            result.add_warning(warning)

        return result

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> list[str]:
        """Build human-readable warnings about suspicious page structure."""
        warnings: list[str] = []

        # A page is expected to carry a page number.
        if not result.get_winners_by_score("page_number", PageNumber):
            warnings.append(f"Page {page_data.page_number}: missing page number")

        parts_lists = result.get_winners_by_score("parts_list", PartsList)
        part_counts = result.get_winners_by_score("part_count", PartCount)

        # Each parts list should contain at least one part count.
        for pl in parts_lists:
            if not any(t.bbox.fully_inside(pl.bbox) for t in part_counts):
                warnings.append(
                    f"Page {page_data.page_number}: parts list at {pl.bbox} "
                    f"contains no part counts"
                )

        # Each step number should have a parts list above it, within a small
        # vertical tolerance.
        ABOVE_EPS = 2.0
        for step in result.get_winners_by_score("step_number", StepNumber):
            sb = step.bbox
            if not any(pl.bbox.y1 <= sb.y0 + ABOVE_EPS for pl in parts_lists):
                warnings.append(
                    f"Page {page_data.page_number}: step number '{step.value}' "
                    f"at {sb} has no parts list above it"
                )
        return warnings
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc