20398712053

Committed 20 Dec 2025 07:00PM UTC coverage: 89.361% (+0.2%) from 89.185%

Build # 20398712053

Build Type

push

github

Committed by

bramp

Commit Message

Improve circular dependency error to show dependency chain

- Add _find_dependency_cycle() to trace and format the actual circular dependency path
- Update error message to include both affected classifiers and the dependency chain
- Add test case to verify circular dependency detection and error message format

Run Details

48 of 56 new or added lines in 2 files covered. (85.71%)

145 existing lines in 28 files now uncovered.

13700 of 15331 relevant lines covered (89.36%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.7

/src/build_a_long/pdf_extract/classifier/rule_based_classifier.py

"""
Rule-based classifier implementation.
"""

from __future__ import annotations

import logging
from abc import abstractmethod
from collections.abc import Sequence
from typing import TYPE_CHECKING

from build_a_long.pdf_extract.classifier.block_filter import (
    find_contained_effects,
)
from build_a_long.pdf_extract.classifier.candidate import Candidate
from build_a_long.pdf_extract.classifier.classification_result import (
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.classifier.rules import Rule, RuleContext
from build_a_long.pdf_extract.classifier.score import Score, Weight
from build_a_long.pdf_extract.extractor.bbox import BBox
from build_a_long.pdf_extract.extractor.page_blocks import Block, Blocks, Text

if TYPE_CHECKING:
    pass

log = logging.getLogger(__name__)


class RuleScore(Score):
    """Generic score based on rules.

    Stores the individual score recorded for each rule alongside the combined
    total, so callers can inspect how the total was reached.
    """

    # Score recorded for each rule, keyed by rule name.
    components: dict[str, float]
    # Combined score; this is the value reported by score().
    total_score: float

    def score(self) -> Weight:
        """Return the combined score, i.e. ``total_score``."""
        return self.total_score

    def get(self, rule_name: str, default: float = 0.0) -> float:
        """Get the score for a specific rule name.

        Args:
            rule_name: Name of the rule to look up in ``components``.
            default: Value returned when no score is stored for ``rule_name``.

        Returns:
            The stored score for ``rule_name``, or ``default`` if absent.
        """
        return self.components.get(rule_name, default)


class StepNumberScore(RuleScore):
    """Score for step number candidates that includes the parsed step value.

    Extends RuleScore with a single extra field, ``step_value``. Carrying the
    value on the score avoids re-parsing the step number from source blocks
    when it is needed later (e.g., for building StepNumber elements or
    sorting).
    """

    step_value: int
    """The parsed step number value (e.g., 1, 2, 3, 42)."""


class RuleBasedClassifier(LabelClassifier):
    """Base class for classifiers that use a list of rules to score candidates.

    Every block on the page is run through each rule in ``rules``:

    * a rule returning ``None`` is treated as not applicable and ignored;
    * a required rule scoring exactly 0 rejects the block outright;
    * all remaining scores are combined into a weighted average.

    Blocks whose average passes ``_should_accept`` are added to the
    classification result as candidates labelled ``self.output``.
    """

    @property
    @abstractmethod
    def rules(self) -> Sequence[Rule]:
        """Get the list of rules for this classifier."""

    @property
    def min_score(self) -> float:
        """Minimum score threshold for acceptance. Defaults to 0.0."""
        return 0.0

    @property
    def effects_margin(self) -> float | None:
        """Margin to expand block bbox to find visual effects (outlines, shadows).

        If None, no automatic effect finding is performed.
        Defaults to 2.0.
        """
        return 2.0

    @property
    def effects_max_area_ratio(self) -> float | None:
        """Maximum ratio of effect block area to primary block area.

        Used to avoid consuming unrelated large blocks as effects.
        Defaults to None (no ratio check).
        """
        return None

    def _create_score(
        self,
        block: Block,
        components: dict[str, float],
        total_score: float,
    ) -> RuleScore:
        """Create the score object for a candidate.

        Subclasses can override this to return a more specific score type
        that contains additional information (e.g., parsed values).

        Args:
            block: The block being scored
            components: Dictionary of rule name to score
            total_score: The weighted total score

        Returns:
            A RuleScore (or subclass) instance
        """
        return RuleScore(components=components, total_score=total_score)

    def _score(self, result: ClassificationResult) -> None:
        """Score blocks using rules.

        Adds one candidate to ``result`` for every block that passes all of
        its required rules and whose weighted score is accepted by
        ``_should_accept``.

        Args:
            result: Classification result providing the page blocks to score;
                accepted candidates are added to it.
        """
        context = RuleContext(result.page_data, self.config, result)
        rules = self.rules

        for block in result.page_data.blocks:
            evaluation = self._evaluate_rules(block, rules, context)
            if evaluation is None:
                # A required rule scored 0, so this block is not a candidate.
                continue
            components, final_score = evaluation

            # Check classifier-specific acceptance logic
            if not self._should_accept(final_score):
                log.debug(
                    "[%s] block_id=%s "
                    "rejected: score=%.3f < min_score=%.3f components=%s",
                    self.output,
                    block.id,
                    final_score,
                    self.min_score,
                    components,
                )
                continue

            log.debug(
                "[%s] block_id=%s accepted: score=%.3f components=%s",
                self.output,
                block.id,
                final_score,
                components,
            )

            source_blocks = self._collect_source_blocks(block, result)

            # Create score object (subclasses can override _create_score)
            score_details = self._create_score(block, components, final_score)

            # Compute bbox as the union of all source blocks
            # This ensures the candidate bbox matches the source_blocks union,
            # required by validation (assert_element_bbox_matches_source_and_children)
            candidate_bbox = BBox.union_all([b.bbox for b in source_blocks])

            result.add_candidate(
                Candidate(
                    bbox=candidate_bbox,
                    label=self.output,
                    score=final_score,
                    score_details=score_details,
                    source_blocks=source_blocks,
                )
            )

    def _evaluate_rules(
        self,
        block: Block,
        rules: Sequence[Rule],
        context: RuleContext,
    ) -> tuple[dict[str, float], float] | None:
        """Run every rule against one block and combine the results.

        Args:
            block: The block being scored
            rules: Rules to evaluate, in order
            context: Shared rule context for the current page

        Returns:
            ``(components, final_score)`` where ``components`` maps rule name
            to that rule's score and ``final_score`` is the weighted average
            of the applicable rules (0.0 when the total weight is not
            positive). Returns ``None`` as soon as a required rule scores 0;
            rules after it are not evaluated.
        """
        components: dict[str, float] = {}
        weighted_sum = 0.0
        total_weight = 0.0

        for rule in rules:
            score = rule.calculate(block, context)

            # If rule returns None, it's skipped (not applicable)
            if score is None:
                continue

            # If required rule fails (score 0), fail the block immediately
            if rule.required and score == 0.0:
                return None

            weight = rule.weight
            weighted_sum += score * weight
            total_weight += weight
            components[rule.name] = score

        final_score = weighted_sum / total_weight if total_weight > 0 else 0.0
        return components, final_score

    def _collect_source_blocks(
        self, block: Block, result: ClassificationResult
    ) -> list[Blocks]:
        """Gather the blocks that make up a candidate, without duplicates.

        The primary block always comes first, followed by any automatically
        detected visual effects and then any classifier-specific additions
        from ``_get_additional_source_blocks``. Blocks are deduplicated by
        ``id``; the first occurrence wins.

        Args:
            block: The primary block of the candidate
            result: Classification result providing the page blocks

        Returns:
            The ordered, deduplicated list of source blocks
        """
        seen_ids: set[int] = {block.id}
        source_blocks: list[Blocks] = [block]

        def add_unseen(extra_blocks) -> None:
            """Append each block whose id has not been recorded yet."""
            for b in extra_blocks:
                if b.id not in seen_ids:
                    seen_ids.add(b.id)
                    source_blocks.append(b)

        # Automatically find visual effects (outlines, shadows) for Text blocks
        margin = self.effects_margin
        if margin is not None and isinstance(block, Text):
            add_unseen(
                find_contained_effects(
                    block,
                    result.page_data.blocks,
                    margin=margin,
                    max_area_ratio=self.effects_max_area_ratio,
                )
            )

        # Add any classifier-specific additional source blocks
        add_unseen(self._get_additional_source_blocks(block, result))

        return source_blocks

    def _get_additional_source_blocks(
        self, block: Block, result: ClassificationResult
    ) -> Sequence[Blocks]:
        """Get additional source blocks to include with the candidate.

        Subclasses can override this to include related blocks (e.g.,
        overlapping drawings, drop shadows) in the candidate's source_blocks.
        These blocks will be marked as removed if the candidate wins.

        The base implementation returns no extra blocks.
        """
        return []

    def _should_accept(self, score: float) -> bool:
        """Determine if a score is high enough to be a candidate.

        The base implementation accepts any score that is at least
        ``min_score``. Subclasses can override this.
        """
        return score >= self.min_score

1	"""
2	Rule-based classifier implementation.
3	"""
4
5	from __future__ import annotations	1✔
6
7	import logging	1✔
8	from abc import abstractmethod	1✔
9	from collections.abc import Sequence	1✔
10	from typing import TYPE_CHECKING	1✔
11
12	from build_a_long.pdf_extract.classifier.block_filter import (	1✔
13	find_contained_effects,
14	)
15	from build_a_long.pdf_extract.classifier.candidate import Candidate	1✔
16	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
17	ClassificationResult,
18	)
19	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
20	LabelClassifier,
21	)
22	from build_a_long.pdf_extract.classifier.rules import Rule, RuleContext	1✔
23	from build_a_long.pdf_extract.classifier.score import Score, Weight	1✔
24	from build_a_long.pdf_extract.extractor.bbox import BBox	1✔
25	from build_a_long.pdf_extract.extractor.page_blocks import Block, Blocks, Text	1✔
26
27	if TYPE_CHECKING:
28	pass
29
30	log = logging.getLogger(__name__)	1✔
31
32
33	class RuleScore(Score):	1✔
34	"""Generic score based on rules."""
35
36	components: dict[str, float]	1✔
37	total_score: float	1✔
38
39	def score(self) -> Weight:	1✔
UNCOV 40	return self.total_score	×
41
42	def get(self, rule_name: str, default: float = 0.0) -> float:	1✔
43	"""Get the score for a specific rule name."""
44	return self.components.get(rule_name, default)	1✔
45
46
47	class StepNumberScore(RuleScore):	1✔
48	"""Score for step number candidates that includes the parsed step value.
49
50	This avoids re-parsing the step number from source blocks when the value
51	is needed later (e.g., for building StepNumber elements or sorting).
52	"""
53
54	step_value: int	1✔
55	"""The parsed step number value (e.g., 1, 2, 3, 42)."""	1✔
56
57
58	class RuleBasedClassifier(LabelClassifier):	1✔
59	"""Base class for classifiers that use a list of rules to score candidates."""
60
61	@property	1✔
62	@abstractmethod	1✔
63	def rules(self) -> Sequence[Rule]:	1✔
64	"""Get the list of rules for this classifier."""
UNCOV 65	pass	×
66
67	@property	1✔
68	def min_score(self) -> float:	1✔
69	"""Minimum score threshold for acceptance. Defaults to 0.0."""
70	return 0.0	1✔
71
72	@property	1✔
73	def effects_margin(self) -> float | None:	1✔
74	"""Margin to expand block bbox to find visual effects (outlines, shadows).
75
76	If None, no automatic effect finding is performed.
77	Defaults to 2.0.
78	"""
79	return 2.0	1✔
80
81	@property	1✔
82	def effects_max_area_ratio(self) -> float | None:	1✔
83	"""Maximum ratio of effect block area to primary block area.
84
85	Used to avoid consuming unrelated large blocks as effects.
86	Defaults to None (no ratio check).
87	"""
88	return None	1✔
89
90	def _create_score(	1✔
91	self,
92	block: Block,
93	components: dict[str, float],
94	total_score: float,
95	) -> RuleScore:
96	"""Create the score object for a candidate.
97
98	Subclasses can override this to return a more specific score type
99	that contains additional information (e.g., parsed values).
100
101	Args:
102	block: The block being scored
103	components: Dictionary of rule name to score
104	total_score: The weighted total score
105
106	Returns:
107	A RuleScore (or subclass) instance
108	"""
109	return RuleScore(components=components, total_score=total_score)	1✔
110
111	def _score(self, result: ClassificationResult) -> None:	1✔
112	"""Score blocks using rules."""
113	context = RuleContext(result.page_data, self.config, result)	1✔
114	rules = self.rules	1✔
115
116	for block in result.page_data.blocks:	1✔
117	components = {}	1✔
118	weighted_sum = 0.0	1✔
119	total_weight = 0.0	1✔
120	failed = False	1✔
121
122	for rule in rules:	1✔
123	score = rule.calculate(block, context)	1✔
124
125	# If rule returns None, it's skipped (not applicable)
126	if score is None:	1✔
127	continue	1✔
128
129	# If required rule fails (score 0), fail the block immediately
130	if rule.required and score == 0.0:	1✔
131	failed = True	1✔
132	# log.debug(
133	# "[%s] block_id=%s failed required rule '%s'",
134	# self.output,
135	# block.id,
136	# rule.name,
137	# )
138	break	1✔
139
140	rule_weight = rule.weight # Using direct weight from Rule instance	1✔
141
142	weighted_sum += score * rule_weight	1✔
143	total_weight += rule_weight	1✔
144	components[rule.name] = score	1✔
145
146	if failed:	1✔
147	continue	1✔
148
149	# Calculate final score
150	final_score = weighted_sum / total_weight if total_weight > 0 else 0.0	1✔
151
152	# Check classifier-specific acceptance logic
153	if not self._should_accept(final_score):	1✔
154	log.debug(	1✔
155	"[%s] block_id=%s "
156	"rejected: score=%.3f < min_score=%.3f components=%s",
157	self.output,
158	block.id,
159	final_score,
160	self.min_score,
161	components,
162	)
163	continue	1✔
164
165	log.debug(	1✔
166	"[%s] block_id=%s accepted: score=%.3f components=%s",
167	self.output,
168	block.id,
169	final_score,
170	components,
171	)
172
173	# Build source blocks list, deduplicating as we go
174	seen_ids: set[int] = {block.id}	1✔
175	source_blocks: list[Blocks] = [block]	1✔
176
177	# Automatically find visual effects (outlines, shadows) for Text blocks
178	margin = self.effects_margin	1✔
179	if margin is not None and isinstance(block, Text):	1✔
180	effects = find_contained_effects(	1✔
181	block,
182	result.page_data.blocks,
183	margin=margin,
184	max_area_ratio=self.effects_max_area_ratio,
185	)
186	for b in effects:	1✔
187	if b.id not in seen_ids:	1✔
188	seen_ids.add(b.id)	1✔
189	source_blocks.append(b)	1✔
190
191	# Add any classifier-specific additional source blocks
192	for b in self._get_additional_source_blocks(block, result):	1✔
193	if b.id not in seen_ids:	1✔
194	seen_ids.add(b.id)	1✔
195	source_blocks.append(b)	1✔
196
197	# Create score object (subclasses can override _create_score)
198	score_details = self._create_score(block, components, final_score)	1✔
199
200	# Compute bbox as the union of all source blocks
201	# This ensures the candidate bbox matches the source_blocks union,
202	# required by validation (assert_element_bbox_matches_source_and_children)
203	candidate_bbox = BBox.union_all([b.bbox for b in source_blocks])	1✔
204
205	# Create candidate
206	candidate = Candidate(	1✔
207	bbox=candidate_bbox,
208	label=self.output,
209	score=final_score,
210	score_details=score_details,
211	source_blocks=source_blocks,
212	)
213	result.add_candidate(candidate)	1✔
214
215	def _get_additional_source_blocks(	1✔
216	self, block: Block, result: ClassificationResult
217	) -> Sequence[Blocks]:
218	"""Get additional source blocks to include with the candidate.
219
220	Subclasses can override this to include related blocks (e.g.,
221	overlapping drawings, drop shadows) in the candidate's source_blocks.
222	These blocks will be marked as removed if the candidate wins.
223	"""
224	return []	1✔
225
226	def _should_accept(self, score: float) -> bool:	1✔
227	"""Determine if a score is high enough to be a candidate.
228
229	Subclasses can override this.
230	"""
231	return score >= self.min_score	1✔

bramp / build-along / 20398712053

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous