20120615911

Committed 10 Dec 2025 10:24PM UTC coverage: 89.767% (+0.08%) from 89.689%

Build # 20120615911

Build Type

push

github

Committed by

bramp

Commit Message

Fix typo in fixtures README: .sh -> .py for regenerate script

Run Details

11790 of 13134 relevant lines covered (89.77%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.16

/src/build_a_long/pdf_extract/classifier/rule_based_classifier.py

"""
Rule-based classifier implementation.
"""

from __future__ import annotations

import logging
from abc import abstractmethod

from build_a_long.pdf_extract.classifier.block_filter import (
    find_text_outline_effects,
)
from build_a_long.pdf_extract.classifier.candidate import Candidate
from build_a_long.pdf_extract.classifier.classification_result import (
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.classifier.rules import Rule, RuleContext
from build_a_long.pdf_extract.classifier.score import Score, Weight
from build_a_long.pdf_extract.extractor.page_blocks import Text

log = logging.getLogger(__name__)


class RuleScore(Score):
    """Score record produced by rule-based classification.

    Holds the per-rule scores that contributed to a candidate together with
    the combined total that is reported as the candidate's score.
    """

    # Individual rule scores, keyed by rule name.
    components: dict[str, float]
    # Combined score; this is the value returned by score().
    total_score: float

    def score(self) -> Weight:
        """Return the combined score stored in ``total_score``."""
        return self.total_score

    def get(self, rule_name: str, default: float = 0.0) -> float:
        """Look up the score recorded for ``rule_name``.

        Returns ``default`` when no score was recorded under that name.
        """
        recorded = self.components
        return recorded.get(rule_name, default)


class RuleBasedClassifier(LabelClassifier):
    """Label classifier that scores each page block against a list of rules.

    Subclasses provide the rules via ``rules`` and may tune acceptance by
    overriding ``min_score`` or ``_should_accept``.
    """

    @property
    @abstractmethod
    def rules(self) -> list[Rule]:
        """Return the rules this classifier evaluates for every block."""

    @property
    def min_score(self) -> float:
        """Return the lowest final score still accepted (0.0 by default)."""
        return 0.0

    def _score(self, result: ClassificationResult) -> None:
        """Evaluate every block on the page and register accepted candidates.

        For each block the applicable rules are combined into a weighted
        average. A block is dropped when a required rule scores exactly 0.0
        or when ``_should_accept`` rejects the final score; otherwise a
        ``Candidate`` is added to ``result``.
        """
        page_data = result.page_data
        context = RuleContext(page_data, self.config)
        active_rules = self.rules

        for block in page_data.blocks:
            per_rule: dict[str, float] = {}
            numerator = 0.0
            denominator = 0.0
            passed_required = True

            for rule in active_rules:
                value = rule.calculate(block, context)

                if value is None:
                    # The rule does not apply to this block; it contributes
                    # neither score nor weight.
                    continue

                if rule.required and value == 0.0:
                    # A required rule scored zero, so the block is out. This
                    # is not logged; the remaining rules are not evaluated.
                    passed_required = False
                    break

                weight = rule.weight
                numerator += value * weight
                denominator += weight
                per_rule[rule.name] = value

            if not passed_required:
                continue

            # Weighted average of the applicable rules; 0.0 when none applied.
            final_score = numerator / denominator if denominator > 0 else 0.0

            # Let the classifier-specific acceptance hook decide.
            if not self._should_accept(final_score):
                log.debug(
                    "[%s] block_id=%s rejected: score=%.3f < min_score=%.3f components=%s",
                    self.output,
                    block.id,
                    final_score,
                    self.min_score,
                    per_rule,
                )
                continue

            log.debug(
                "[%s] block_id=%s accepted: score=%.3f components=%s",
                self.output,
                block.id,
                final_score,
                per_rule,
            )

            # The candidate is backed by the block itself; for Text blocks the
            # associated text outline effects are attached as sources as well.
            sources: list = [block]
            if isinstance(block, Text):
                sources += find_text_outline_effects(block, page_data.blocks)

            result.add_candidate(
                Candidate(
                    bbox=block.bbox,
                    label=self.output,
                    score=final_score,
                    score_details=RuleScore(
                        components=per_rule, total_score=final_score
                    ),
                    source_blocks=sources,
                )
            )

    def _should_accept(self, score: float) -> bool:
        """Tell whether ``score`` is high enough to produce a candidate.

        The default compares against ``min_score``; subclasses can override
        this to apply their own acceptance logic.
        """
        threshold = self.min_score
        return score >= threshold

1	"""
2	Rule-based classifier implementation.
3	"""
4
5	from __future__ import annotations	1✔
6
7	import logging	1✔
8	from abc import abstractmethod	1✔
9
10	from build_a_long.pdf_extract.classifier.block_filter import (	1✔
11	find_text_outline_effects,
12	)
13	from build_a_long.pdf_extract.classifier.candidate import Candidate	1✔
14	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
15	ClassificationResult,
16	)
17	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
18	LabelClassifier,
19	)
20	from build_a_long.pdf_extract.classifier.rules import Rule, RuleContext	1✔
21	from build_a_long.pdf_extract.classifier.score import Score, Weight	1✔
22	from build_a_long.pdf_extract.extractor.page_blocks import Text	1✔
23
24	log = logging.getLogger(__name__)	1✔
25
26
27	class RuleScore(Score):	1✔
28	"""Generic score based on rules."""
29
30	components: dict[str, float]	1✔
31	total_score: float	1✔
32
33	def score(self) -> Weight:	1✔
34	return self.total_score	×
35
36	def get(self, rule_name: str, default: float = 0.0) -> float:	1✔
37	"""Get the score for a specific rule name."""
38	return self.components.get(rule_name, default)	1✔
39
40
41	class RuleBasedClassifier(LabelClassifier):	1✔
42	"""Base class for classifiers that use a list of rules to score candidates."""
43
44	@property	1✔
45	@abstractmethod	1✔
46	def rules(self) -> list[Rule]:	1✔
47	"""Get the list of rules for this classifier."""
48	pass	×
49
50	@property	1✔
51	def min_score(self) -> float:	1✔
52	"""Minimum score threshold for acceptance. Defaults to 0.0."""
53	return 0.0	1✔
54
55	def _score(self, result: ClassificationResult) -> None:	1✔
56	"""Score blocks using rules."""
57	context = RuleContext(result.page_data, self.config)	1✔
58	rules = self.rules	1✔
59
60	for block in result.page_data.blocks:	1✔
61	components = {}	1✔
62	weighted_sum = 0.0	1✔
63	total_weight = 0.0	1✔
64	failed = False	1✔
65
66	for rule in rules:	1✔
67	score = rule.calculate(block, context)	1✔
68
69	# If rule returns None, it's skipped (not applicable)
70	if score is None:	1✔
71	continue	1✔
72
73	# If required rule fails (score 0), fail the block immediately
74	if rule.required and score == 0.0:	1✔
75	failed = True	1✔
76	# log.debug(
77	# "[%s] block_id=%s failed required rule '%s'",
78	# self.output,
79	# block.id,
80	# rule.name,
81	# )
82	break	1✔
83
84	rule_weight = rule.weight # Using direct weight from Rule instance	1✔
85
86	weighted_sum += score * rule_weight	1✔
87	total_weight += rule_weight	1✔
88	components[rule.name] = score	1✔
89
90	if failed:	1✔
91	continue	1✔
92
93	# Calculate final score
94	if total_weight > 0:	1✔
95	final_score = weighted_sum / total_weight	1✔
96	else:
97	final_score = 0.0	×
98
99	# Check classifier-specific acceptance logic
100	if not self._should_accept(final_score):	1✔
101	log.debug(	1✔
102	"[%s] block_id=%s rejected: score=%.3f < min_score=%.3f components=%s",
103	self.output,
104	block.id,
105	final_score,
106	self.min_score,
107	components,
108	)
109	continue	1✔
110
111	log.debug(	1✔
112	"[%s] block_id=%s accepted: score=%.3f components=%s",
113	self.output,
114	block.id,
115	final_score,
116	components,
117	)
118
119	# Build source blocks list, including text outline effects for Text blocks
120	source_blocks: list = [block]	1✔
121	if isinstance(block, Text):	1✔
122	outline_effects = find_text_outline_effects(	1✔
123	block, result.page_data.blocks
124	)
125	source_blocks.extend(outline_effects)	1✔
126
127	# Create candidate
128	candidate = Candidate(	1✔
129	bbox=block.bbox,
130	label=self.output,
131	score=final_score,
132	score_details=RuleScore(components=components, total_score=final_score),
133	source_blocks=source_blocks,
134	)
135	result.add_candidate(candidate)	1✔
136
137	def _should_accept(self, score: float) -> bool:	1✔
138	"""Determine if a score is high enough to be a candidate.
139
140	Subclasses can override this.
141	"""
142	return score >= self.min_score	1✔

bramp / build-along / 20120615911

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous