20361865516

Committed 19 Dec 2025 06:25AM UTC coverage: 89.13% (-0.002%) from 89.132%

Build # 20361865516

Build Type

push

github

Committed by

bramp

Commit Message

Fix lint errors: line length, unused imports, and YAML issues

- Add ruff isort configuration with known-first-party for build_a_long
- Add per-file E501 ignore for legocom_test.py (JSON test data)
- Create .yamllint config to relax strict YAML rules
- Fix E501 line length errors by wrapping long comments and strings
- Fix F841 unused variable errors
- Fix PLC0415 import-at-non-top-level errors
- Fix SIM108 ternary simplification errors

Run Details

12 of 14 new or added lines in 8 files covered. (85.71%)

78 existing lines in 6 files now uncovered.

12915 of 14490 relevant lines covered (89.13%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.22

/src/build_a_long/pdf_extract/classifier/rule_based_classifier.py

"""
Rule-based classifier implementation.
"""

from __future__ import annotations

import logging
from abc import abstractmethod
from typing import TYPE_CHECKING

from build_a_long.pdf_extract.classifier.block_filter import (
    find_text_outline_effects,
)
from build_a_long.pdf_extract.classifier.candidate import Candidate
from build_a_long.pdf_extract.classifier.classification_result import (
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.classifier.rules import Rule, RuleContext
from build_a_long.pdf_extract.classifier.score import Score, Weight
from build_a_long.pdf_extract.extractor.bbox import BBox
from build_a_long.pdf_extract.extractor.page_blocks import Block, Blocks, Text

if TYPE_CHECKING:
    pass

log = logging.getLogger(__name__)


class RuleScore(Score):
    """Score made of per-rule component scores and their combined total."""

    components: dict[str, float]
    total_score: float

    def score(self) -> Weight:
        """Return the combined total score."""
        return self.total_score

    def get(self, rule_name: str, default: float = 0.0) -> float:
        """Return the score recorded for ``rule_name``.

        Args:
            rule_name: Name of the rule to look up in ``components``
            default: Value returned when no score is recorded for the rule

        Returns:
            The recorded score, or ``default`` if the rule is absent
        """
        rule_scores = self.components
        return rule_scores.get(rule_name, default)


class StepNumberScore(RuleScore):
    """Rule score for step number candidates that carries the parsed value.

    Keeping the parsed step number on the score avoids re-parsing it from
    the source blocks when the value is needed later (e.g., for building
    StepNumber elements or sorting).
    """

    step_value: int
    """The parsed step number value (e.g., 1, 2, 3, 42)."""


def _apply_rules(
    block: Block,
    rules: list[Rule],
    context: RuleContext,
) -> tuple[dict[str, float], float] | None:
    """Evaluate the given rules against a single block.

    A rule that returns ``None`` is treated as not applicable: it contributes
    neither a component nor any weight. A required rule that scores exactly
    0 vetoes the block and stops evaluation immediately.

    Args:
        block: The block being scored
        rules: The rules to evaluate, in order
        context: The rule context passed to each rule

    Returns:
        ``None`` if a required rule failed. Otherwise a tuple of the per-rule
        scores (keyed by rule name) and the weighted average of those scores;
        the average is 0.0 when the applied weights do not sum to a positive
        value (e.g., no rule was applicable).
    """
    components: dict[str, float] = {}
    weighted_sum = 0.0
    total_weight = 0.0

    for rule in rules:
        score = rule.calculate(block, context)

        # A rule returning None is skipped (not applicable)
        if score is None:
            continue

        # A required rule scoring 0 fails the block immediately
        if rule.required and score == 0.0:
            return None

        rule_weight = rule.weight
        weighted_sum += score * rule_weight
        total_weight += rule_weight
        components[rule.name] = score

    final_score = weighted_sum / total_weight if total_weight > 0 else 0.0
    return components, final_score


class RuleBasedClassifier(LabelClassifier):
    """Base class for classifiers that use a list of rules to score candidates.

    Subclasses provide the rules via the ``rules`` property. They may also
    override ``min_score``, ``_should_accept``, ``_create_score`` and
    ``_get_additional_source_blocks`` to customise acceptance and the
    contents of the produced candidates.
    """

    @property
    @abstractmethod
    def rules(self) -> list[Rule]:
        """Get the list of rules for this classifier."""

    @property
    def min_score(self) -> float:
        """Minimum score threshold for acceptance. Defaults to 0.0."""
        return 0.0

    def _create_score(
        self,
        block: Block,
        components: dict[str, float],
        total_score: float,
    ) -> RuleScore:
        """Create the score object for a candidate.

        Subclasses can override this to return a more specific score type
        that contains additional information (e.g., parsed values).

        Args:
            block: The block being scored
            components: Dictionary of rule name to score
            total_score: The weighted total score

        Returns:
            A RuleScore (or subclass) instance
        """
        return RuleScore(components=components, total_score=total_score)

    def _score(self, result: ClassificationResult) -> None:
        """Score every block on the page and register accepted candidates.

        Each block is evaluated against ``self.rules``. Blocks that fail a
        required rule, or whose weighted score is rejected by
        ``_should_accept``, are skipped. For every remaining block a
        ``Candidate`` is added to ``result``.

        Args:
            result: The classification result holding the page data; accepted
                candidates are added to it
        """
        context = RuleContext(result.page_data, self.config, result)
        rules = self.rules

        for block in result.page_data.blocks:
            outcome = _apply_rules(block, rules, context)
            if outcome is None:
                # A required rule failed; this block cannot be a candidate.
                continue
            components, final_score = outcome

            # Check classifier-specific acceptance logic
            if not self._should_accept(final_score):
                log.debug(
                    "[%s] block_id=%s "
                    "rejected: score=%.3f < min_score=%.3f components=%s",
                    self.output,
                    block.id,
                    final_score,
                    self.min_score,
                    components,
                )
                continue

            log.debug(
                "[%s] block_id=%s accepted: score=%.3f components=%s",
                self.output,
                block.id,
                final_score,
                components,
            )

            # Build source blocks list, including text outline effects for
            # Text blocks
            source_blocks: list = [block]
            if isinstance(block, Text):
                outline_effects = find_text_outline_effects(
                    block, result.page_data.blocks
                )
                source_blocks.extend(outline_effects)

            # Add any classifier-specific additional source blocks
            source_blocks.extend(self._get_additional_source_blocks(block, result))

            # Create score object (subclasses can override _create_score)
            score_details = self._create_score(block, components, final_score)

            # Compute bbox as the union of all source blocks
            # This ensures the candidate bbox matches the source_blocks union,
            # required by validation (assert_element_bbox_matches_source_and_children)
            candidate_bbox = BBox.union_all([b.bbox for b in source_blocks])

            result.add_candidate(
                Candidate(
                    bbox=candidate_bbox,
                    label=self.output,
                    score=final_score,
                    score_details=score_details,
                    source_blocks=source_blocks,
                )
            )

    def _get_additional_source_blocks(
        self, block: Block, result: ClassificationResult
    ) -> list[Blocks]:
        """Get additional source blocks to include with the candidate.

        Subclasses can override this to include related blocks (e.g.,
        overlapping drawings, drop shadows) in the candidate's source_blocks.
        These blocks will be marked as removed if the candidate wins.

        Args:
            block: The block the candidate is being built for
            result: The classification result for the current page

        Returns:
            Extra blocks to append to the candidate's source blocks; the
            base implementation returns an empty list
        """
        return []

    def _should_accept(self, score: float) -> bool:
        """Determine if a score is high enough to be a candidate.

        Subclasses can override this.

        Args:
            score: The weighted final score of a block

        Returns:
            True when ``score`` is at least ``min_score``
        """
        return score >= self.min_score

1	"""
2	Rule-based classifier implementation.
3	"""
4
5	from __future__ import annotations	1✔
6
7	import logging	1✔
8	from abc import abstractmethod	1✔
9	from typing import TYPE_CHECKING	1✔
10
11	from build_a_long.pdf_extract.classifier.block_filter import (	1✔
12	find_text_outline_effects,
13	)
14	from build_a_long.pdf_extract.classifier.candidate import Candidate	1✔
15	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
16	ClassificationResult,
17	)
18	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
19	LabelClassifier,
20	)
21	from build_a_long.pdf_extract.classifier.rules import Rule, RuleContext	1✔
22	from build_a_long.pdf_extract.classifier.score import Score, Weight	1✔
23	from build_a_long.pdf_extract.extractor.bbox import BBox	1✔
24	from build_a_long.pdf_extract.extractor.page_blocks import Block, Blocks, Text	1✔
25
26	if TYPE_CHECKING:
27	pass
28
29	log = logging.getLogger(__name__)	1✔
30
31
32	class RuleScore(Score):	1✔
33	"""Generic score based on rules."""
34
35	components: dict[str, float]	1✔
36	total_score: float	1✔
37
38	def score(self) -> Weight:	1✔
UNCOV 39	return self.total_score	×
40
41	def get(self, rule_name: str, default: float = 0.0) -> float:	1✔
42	"""Get the score for a specific rule name."""
43	return self.components.get(rule_name, default)	1✔
44
45
46	class StepNumberScore(RuleScore):	1✔
47	"""Score for step number candidates that includes the parsed step value.
48
49	This avoids re-parsing the step number from source blocks when the value
50	is needed later (e.g., for building StepNumber elements or sorting).
51	"""
52
53	step_value: int	1✔
54	"""The parsed step number value (e.g., 1, 2, 3, 42)."""	1✔
55
56
57	class RuleBasedClassifier(LabelClassifier):	1✔
58	"""Base class for classifiers that use a list of rules to score candidates."""
59
60	@property	1✔
61	@abstractmethod	1✔
62	def rules(self) -> list[Rule]:	1✔
63	"""Get the list of rules for this classifier."""
UNCOV 64	pass	×
65
66	@property	1✔
67	def min_score(self) -> float:	1✔
68	"""Minimum score threshold for acceptance. Defaults to 0.0."""
69	return 0.0	1✔
70
71	def _create_score(	1✔
72	self,
73	block: Block,
74	components: dict[str, float],
75	total_score: float,
76	) -> RuleScore:
77	"""Create the score object for a candidate.
78
79	Subclasses can override this to return a more specific score type
80	that contains additional information (e.g., parsed values).
81
82	Args:
83	block: The block being scored
84	components: Dictionary of rule name to score
85	total_score: The weighted total score
86
87	Returns:
88	A RuleScore (or subclass) instance
89	"""
90	return RuleScore(components=components, total_score=total_score)	1✔
91
92	def _score(self, result: ClassificationResult) -> None:	1✔
93	"""Score blocks using rules."""
94	context = RuleContext(result.page_data, self.config, result)	1✔
95	rules = self.rules	1✔
96
97	for block in result.page_data.blocks:	1✔
98	components = {}	1✔
99	weighted_sum = 0.0	1✔
100	total_weight = 0.0	1✔
101	failed = False	1✔
102
103	for rule in rules:	1✔
104	score = rule.calculate(block, context)	1✔
105
106	# If rule returns None, it's skipped (not applicable)
107	if score is None:	1✔
108	continue	1✔
109
110	# If required rule fails (score 0), fail the block immediately
111	if rule.required and score == 0.0:	1✔
112	failed = True	1✔
113	# log.debug(
114	# "[%s] block_id=%s failed required rule '%s'",
115	# self.output,
116	# block.id,
117	# rule.name,
118	# )
119	break	1✔
120
121	rule_weight = rule.weight # Using direct weight from Rule instance	1✔
122
123	weighted_sum += score * rule_weight	1✔
124	total_weight += rule_weight	1✔
125	components[rule.name] = score	1✔
126
127	if failed:	1✔
128	continue	1✔
129
130	# Calculate final score
131	final_score = weighted_sum / total_weight if total_weight > 0 else 0.0	1✔
132
133	# Check classifier-specific acceptance logic
134	if not self._should_accept(final_score):	1✔
135	log.debug(	1✔
136	"[%s] block_id=%s "
137	"rejected: score=%.3f < min_score=%.3f components=%s",
138	self.output,
139	block.id,
140	final_score,
141	self.min_score,
142	components,
143	)
144	continue	1✔
145
146	log.debug(	1✔
147	"[%s] block_id=%s accepted: score=%.3f components=%s",
148	self.output,
149	block.id,
150	final_score,
151	components,
152	)
153
154	# Build source blocks list, including text outline effects for Text blocks
155	source_blocks: list = [block]	1✔
156	if isinstance(block, Text):	1✔
157	outline_effects = find_text_outline_effects(	1✔
158	block, result.page_data.blocks
159	)
160	source_blocks.extend(outline_effects)	1✔
161
162	# Add any classifier-specific additional source blocks
163	source_blocks.extend(self._get_additional_source_blocks(block, result))	1✔
164
165	# Create score object (subclasses can override _create_score)
166	score_details = self._create_score(block, components, final_score)	1✔
167
168	# Compute bbox as the union of all source blocks
169	# This ensures the candidate bbox matches the source_blocks union,
170	# required by validation (assert_element_bbox_matches_source_and_children)
171	candidate_bbox = BBox.union_all([b.bbox for b in source_blocks])	1✔
172
173	# Create candidate
174	candidate = Candidate(	1✔
175	bbox=candidate_bbox,
176	label=self.output,
177	score=final_score,
178	score_details=score_details,
179	source_blocks=source_blocks,
180	)
181	result.add_candidate(candidate)	1✔
182
183	def _get_additional_source_blocks(	1✔
184	self, block: Block, result: ClassificationResult
185	) -> list[Blocks]:
186	"""Get additional source blocks to include with the candidate.
187
188	Subclasses can override this to include related blocks (e.g.,
189	overlapping drawings, drop shadows) in the candidate's source_blocks.
190	These blocks will be marked as removed if the candidate wins.
191	"""
192	return []	1✔
193
194	def _should_accept(self, score: float) -> bool:	1✔
195	"""Determine if a score is high enough to be a candidate.
196
197	Subclasses can override this.
198	"""
199	return score >= self.min_score	1✔

bramp / build-along / 20361865516

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous