19850699447

Committed 02 Dec 2025 07:27AM UTC coverage: 90.421% (-0.1%) from 90.525%

Build # 19850699447

Build Type

push

github

Committed by

bramp

Commit Message

feat: Implement rule-based classifier system and refactor key classifiers

Introduces a flexible, rule-based architecture for PDF element classification,
significantly reducing boilerplate and improving maintainability.

Key changes include:
- **New `RuleBasedClassifier` base class**: Provides a declarative way to define
  classifier logic using a list of `Rule` objects.
- **Generic and specific `Rule` implementations**:
  - `Filter` base class for pass/fail rules (`IsInstanceFilter`, `InBottomBandFilter`).
  - Scoring rules like `RegexMatch`, `FontSizeMatch`, `CornerDistanceScore`,
    `PageNumberValueMatch`.
  - Aggregation rules like `MaxScoreRule` for combining multiple scoring criteria.
  - Domain-specific rules: `PageNumberTextRule`, `PartCountTextRule`,
    `PartNumberTextRule`, `BagNumberTextRule`, `TopLeftPositionScore`,
    `BagNumberFontSizeRule`, and `StepNumberTextRule`.
- **Refactored Classifiers**: Converted `PageNumberClassifier`,
  `PartCountClassifier`, `StepNumberClassifier`, `PartNumberClassifier`, and
  `BagNumberClassifier` to use the new rule-based system.
- **Centralized Configuration**: `min_score` and weights for `BagNumberClassifier`
  are now configurable via `BagNumberConfig`.
- **Improved Test Fixtures**: Updated `conftest.py` to support the new `RuleScore`.
- **Golden File Updates**: Golden test fixtures were regenerated to match the
  new (and in some cases, improved) classification behavior. Notably, some pages
  previously misclassified as "instruction" are now correctly identified as "catalog"
  due to enhanced `StepNumberClassifier` logic.

This refactoring streamlines the addition and modification of classification
heuristics, making the system more robust and easier to debug.

Run Details

341 of 365 new or added lines in 12 files covered. (93.42%)

11 existing lines in 2 files now uncovered.

10298 of 11389 relevant lines covered (90.42%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.74

/src/build_a_long/pdf_extract/classifier/rule_based_classifier.py

"""
Rule-based classifier implementation.
"""

from __future__ import annotations

import logging
from abc import abstractmethod

from build_a_long.pdf_extract.classifier.candidate import Candidate
from build_a_long.pdf_extract.classifier.classification_result import (
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.classifier.rules import Rule, RuleContext
from build_a_long.pdf_extract.classifier.score import Score, Weight

log = logging.getLogger(__name__)


class RuleScore(Score):
    """Score breakdown produced by evaluating a set of rules.

    Holds the individual value recorded for each rule alongside the single
    combined value that is reported as the overall score.
    """

    # Individual rule results, keyed by rule name.
    components: dict[str, float]
    # Combined score; this is the value returned by ``score()``.
    total_score: float

    def score(self) -> Weight:
        """Return the combined score stored in ``total_score``."""
        combined = self.total_score
        return combined

    def get(self, rule_name: str, default: float = 0.0) -> float:
        """Look up the value recorded for a single rule.

        Args:
            rule_name: Key to look up in ``components``.
            default: Value returned when ``rule_name`` has no entry.

        Returns:
            The stored value for ``rule_name``, or ``default`` if absent.
        """
        per_rule = self.components
        return per_rule.get(rule_name, default)


class RuleBasedClassifier(LabelClassifier):
    """Classifier base that scores each page block with a list of rules.

    Subclasses supply ``rules``; they may also override ``min_score`` or
    ``_should_accept`` to change which scored blocks become candidates.
    """

    @property
    @abstractmethod
    def rules(self) -> list[Rule]:
        """Rules applied, in order, to every block on the page."""

    @property
    def min_score(self) -> float:
        """Lowest final score that is still accepted (0.0 unless overridden)."""
        return 0.0

    def _score(self, result: ClassificationResult) -> None:
        """Evaluate every block against the rules and record candidates.

        For each block the applicable rules are combined into a weighted
        average. A block is dropped when a required rule scores exactly 0
        or when ``_should_accept`` rejects the final score; otherwise a
        ``Candidate`` carrying a ``RuleScore`` breakdown is added to
        ``result``.

        Args:
            result: Classification result whose ``page_data.blocks`` are
                scored and which receives the new candidates.
        """
        ctx = RuleContext(result.page_data, self.config)
        active_rules = self.rules

        for block in result.page_data.blocks:
            per_rule: dict[str, float] = {}
            numerator = 0.0
            denominator = 0.0

            for rule in active_rules:
                value = rule.calculate(block, ctx)

                # None means the rule does not apply to this block; it
                # contributes neither score nor weight.
                if value is None:
                    continue

                # A required rule scoring zero disqualifies the block; the
                # remaining rules are not evaluated.
                if rule.required and value == 0.0:
                    break

                weight = rule.weight
                numerator += value * weight
                denominator += weight
                per_rule[rule.name] = value
            else:
                # Reached only when no required rule failed.
                # With no contributing weight the score falls back to 0.0.
                final = numerator / denominator if denominator > 0 else 0.0

                # Let the classifier decide whether the score is acceptable.
                if not self._should_accept(final):
                    continue

                result.add_candidate(
                    Candidate(
                        bbox=block.bbox,
                        label=self.output,
                        score=final,
                        score_details=RuleScore(
                            components=per_rule, total_score=final
                        ),
                        source_blocks=[block],
                    )
                )

    def _should_accept(self, score: float) -> bool:
        """Return whether ``score`` qualifies a block as a candidate.

        The default accepts any score at or above ``min_score``; subclasses
        may override this hook.
        """
        threshold = self.min_score
        return score >= threshold

1	"""
2	Rule-based classifier implementation.
3	"""
4
5	from __future__ import annotations	1✔
6
7	import logging	1✔
8	from abc import abstractmethod	1✔
9
10	from build_a_long.pdf_extract.classifier.candidate import Candidate	1✔
11	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
12	ClassificationResult,
13	)
14	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
15	LabelClassifier,
16	)
17	from build_a_long.pdf_extract.classifier.rules import Rule, RuleContext	1✔
18	from build_a_long.pdf_extract.classifier.score import Score, Weight	1✔
19
20	log = logging.getLogger(__name__)	1✔
21
22
23	class RuleScore(Score):	1✔
24	"""Generic score based on rules."""
25
26	components: dict[str, float]	1✔
27	total_score: float	1✔
28
29	def score(self) -> Weight:	1✔
NEW 30	return self.total_score	×
31
32	def get(self, rule_name: str, default: float = 0.0) -> float:	1✔
33	"""Get the score for a specific rule name."""
NEW 34	return self.components.get(rule_name, default)	×
35
36
37	class RuleBasedClassifier(LabelClassifier):	1✔
38	"""Base class for classifiers that use a list of rules to score candidates."""
39
40	@property	1✔
41	@abstractmethod	1✔
42	def rules(self) -> list[Rule]:	1✔
43	"""Get the list of rules for this classifier."""
NEW 44	pass	×
45
46	@property	1✔
47	def min_score(self) -> float:	1✔
48	"""Minimum score threshold for acceptance. Defaults to 0.0."""
NEW 49	return 0.0	×
50
51	def _score(self, result: ClassificationResult) -> None:	1✔
52	"""Score blocks using rules."""
53	context = RuleContext(result.page_data, self.config)	1✔
54	rules = self.rules	1✔
55
56	for block in result.page_data.blocks:	1✔
57	components = {}	1✔
58	weighted_sum = 0.0	1✔
59	total_weight = 0.0	1✔
60	failed = False	1✔
61
62	for rule in rules:	1✔
63	score = rule.calculate(block, context)	1✔
64
65	# If rule returns None, it's skipped (not applicable)
66	if score is None:	1✔
67	continue	1✔
68
69	# If required rule fails (score 0), fail the block immediately
70	if rule.required and score == 0.0:	1✔
71	failed = True	1✔
72	break	1✔
73
74	rule_weight = rule.weight # Using direct weight from Rule instance	1✔
75
76	weighted_sum += score * rule_weight	1✔
77	total_weight += rule_weight	1✔
78	components[rule.name] = score	1✔
79
80	if failed:	1✔
81	continue	1✔
82
83	# Calculate final score
84	if total_weight > 0:	1✔
85	final_score = weighted_sum / total_weight	1✔
86	else:
NEW 87	final_score = 0.0	×
88
89	# Check classifier-specific acceptance logic
90	if not self._should_accept(final_score):	1✔
91	continue	1✔
92
93	# Create candidate
94	candidate = Candidate(	1✔
95	bbox=block.bbox,
96	label=self.output,
97	score=final_score,
98	score_details=RuleScore(components=components, total_score=final_score),
99	source_blocks=[block],
100	)
101	result.add_candidate(candidate)	1✔
102
103	def _should_accept(self, score: float) -> bool:	1✔
104	"""Determine if a score is high enough to be a candidate.
105
106	Subclasses can override this.
107	"""
108	return score >= self.min_score	1✔

bramp / build-along / 19850699447

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous