20400711546

Committed 20 Dec 2025 10:09PM UTC coverage: 89.367% (+0.006%) from 89.361%

Build # 20400711546

Build Type

push

github

Committed by

bramp

Commit Message

docs: Add comprehensive Classifier best practices documentation

- Add detailed docstrings to Classifier and RuleBasedClassifier classes
  covering all aspects of writing robust classifiers
- Document scoring phase: API access rules, Score object design,
  intrinsic vs relationship-based scoring
- Document build phase: source block rules, exception handling,
  construction patterns
- Document build_all(): when to use for global coordination
- Add complete code examples for atomic and composite patterns
- Fix DESIGN.md contradiction about Score objects storing candidates
- Update README.md and DESIGN.md to reference class docstrings as
  single source of truth
- Add recommendations to use RuleBasedClassifier for atomic classifiers

This consolidates documentation to reduce duplication and provides
clear guidelines for both humans and AI agents writing new classifiers.

Run Details

13708 of 15339 relevant lines covered (89.37%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.14

/src/build_a_long/pdf_extract/classifier/steps/step_count_classifier.py

"""
Step count classifier.

Purpose
-------
Detect step-count text like "2x" that appears in substep callout boxes.
These are similar to part counts but use a larger font size (typically 16pt),
between part count size and step number size.

Debugging
---------
Enable DEBUG logs with LOG_LEVEL=DEBUG.
"""

import logging
from collections.abc import Sequence
from typing import ClassVar

from build_a_long.pdf_extract.classifier.candidate import Candidate
from build_a_long.pdf_extract.classifier.classification_result import (
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.rule_based_classifier import (
    RuleBasedClassifier,
)
from build_a_long.pdf_extract.classifier.rules import (
    FontSizeRangeRule,
    IsInstanceFilter,
    PartCountTextRule,
    Rule,
)
from build_a_long.pdf_extract.classifier.rules.scale import LinearScale
from build_a_long.pdf_extract.classifier.text import (
    extract_part_count_value,
)
from build_a_long.pdf_extract.extractor.lego_page_elements import (
    StepCount,
)
from build_a_long.pdf_extract.extractor.page_blocks import Text

log = logging.getLogger(__name__)


class StepCountClassifier(RuleBasedClassifier):
    """Rule-based classifier for substep repetition counts such as "2x".

    A step count is the label inside a substep callout box that says how
    many times the sub-assembly has to be built. Its text looks like a part
    count; the distinguishing signal is a font size that lies between the
    part-count size and the step-number size.
    """

    output: ClassVar[str] = "step_count"
    requires: ClassVar[frozenset[str]] = frozenset()

    @property
    def effects_margin(self) -> float | None:
        """Return the fixed effects margin of 2.0.

        NOTE(review): presumably the distance within which text-effect
        blocks (e.g. outlines) get attached to a candidate -- confirm
        against RuleBasedClassifier.
        """
        return 2.0

    @property
    def min_score(self) -> float:
        """Return the minimum score read from the step-count configuration."""
        step_count_settings = self.config.step_count
        return step_count_settings.min_score

    @property
    def rules(self) -> Sequence[Rule]:
        """Return the filter and scoring rules for step-count candidates.

        In order: a Text-only filter, a required count-pattern text rule
        (e.g. "2x", "4x"), and a font-size rule whose scale is anchored on
        the configured font-size hints.
        """
        cfg = self.config
        settings = cfg.step_count
        font_hints = cfg.font_size_hints

        # Only Text blocks are considered at all.
        only_text = IsInstanceFilter(Text)

        # The text has to match the count pattern; the rule is required.
        count_text_rule = PartCountTextRule(
            weight=settings.text_weight,
            name="text_score",
            required=True,
        )

        # A missing (falsy) hint falls back to 10.0 for the part-count size
        # and 20.0 for the step-number size.
        smallest = font_hints.part_count_size or 10.0
        largest = font_hints.step_number_size or 20.0

        # Control points of the font-size score:
        #   0.0 at one point below the part-count size,
        #   0.7 exactly at the part-count size,
        #   1.0 from one point above it up to one point above the
        #       step-number size,
        #   0.0 again at two points above the step-number size.
        score_points = {
            smallest - 1.0: 0.0,
            smallest: 0.7,
            smallest + 1.0: 1.0,
            largest + 1.0: 1.0,
            largest + 2.0: 0.0,
        }
        font_size_rule = FontSizeRangeRule(
            scale=LinearScale(score_points),
            weight=settings.font_size_weight,
            name="font_size_score",
        )

        return [only_text, count_text_rule, font_size_rule]

    def build(self, candidate: Candidate, result: ClassificationResult) -> StepCount:
        """Turn a candidate into a StepCount element.

        The first entry of ``candidate.source_blocks`` is the primary Text
        block; further entries (e.g. text outline effects) may follow it.

        Raises:
            ValueError: if no count value can be parsed from the text.
        """
        sources = candidate.source_blocks
        assert len(sources) >= 1

        # The primary text block always comes first.
        block = sources[0]
        assert isinstance(block, Text)

        value = extract_part_count_value(block.text)
        if value is not None:
            # candidate.bbox is the union of all source blocks, so it also
            # spans any extra effect blocks.
            return StepCount(count=value, bbox=candidate.bbox)

        raise ValueError(f"Could not parse step count from text: '{block.text}'")

1	"""
2	Step count classifier.
3
4	Purpose
5	-------
6	Detect step-count text like "2x" that appears in substep callout boxes.
7	These are similar to part counts but use a larger font size (typically 16pt),
8	between part count size and step number size.
9
10	Debugging
11	---------
12	Enable DEBUG logs with LOG_LEVEL=DEBUG.
13	"""
14
15	import logging	1✔
16	from collections.abc import Sequence	1✔
17	from typing import ClassVar	1✔
18
19	from build_a_long.pdf_extract.classifier.candidate import Candidate	1✔
20	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
21	ClassificationResult,
22	)
23	from build_a_long.pdf_extract.classifier.rule_based_classifier import (	1✔
24	RuleBasedClassifier,
25	)
26	from build_a_long.pdf_extract.classifier.rules import (	1✔
27	FontSizeRangeRule,
28	IsInstanceFilter,
29	PartCountTextRule,
30	Rule,
31	)
32	from build_a_long.pdf_extract.classifier.rules.scale import LinearScale	1✔
33	from build_a_long.pdf_extract.classifier.text import (	1✔
34	extract_part_count_value,
35	)
36	from build_a_long.pdf_extract.extractor.lego_page_elements import (	1✔
37	StepCount,
38	)
39	from build_a_long.pdf_extract.extractor.page_blocks import Text	1✔
40
41	log = logging.getLogger(__name__)	1✔
42
43
44	class StepCountClassifier(RuleBasedClassifier):	1✔
45	"""Classifier for step counts (substep counts like "2x").
46
47	These are count labels that appear inside substep callout boxes,
48	indicating how many times to build the sub-assembly.
49	They use a font size between part counts and step numbers.
50	"""
51
52	output: ClassVar[str] = "step_count"	1✔
53	requires: ClassVar[frozenset[str]] = frozenset()	1✔
54
55	@property	1✔
56	def effects_margin(self) -> float | None:	1✔
57	return 2.0	1✔
58
59	@property	1✔
60	def min_score(self) -> float:	1✔
61	return self.config.step_count.min_score	1✔
62
63	@property	1✔
64	def rules(self) -> Sequence[Rule]:	1✔
65	config = self.config	1✔
66	step_count_config = config.step_count	1✔
67	hints = config.font_size_hints	1✔
68
69	return [	1✔
70	# Must be text
71	IsInstanceFilter(Text),
72	# Check if text matches count pattern (e.g., "2x", "4x")
73	PartCountTextRule(
74	weight=step_count_config.text_weight,
75	name="text_score",
76	required=True,
77	),
78	# Score font size: should be >= part_count_size and <= step_number_size
79	# 0.7 within tolerance of min, 1.0 above min+tolerance, 0.0 outside range
80	FontSizeRangeRule(
81	scale=LinearScale(
82	{
83	(hints.part_count_size or 10.0) - 1.0: 0.0,
84	hints.part_count_size or 10.0: 0.7,
85	(hints.part_count_size or 10.0) + 1.0: 1.0,
86	(hints.step_number_size or 20.0) + 1.0: 1.0,
87	(hints.step_number_size or 20.0) + 2.0: 0.0,
88	}
89	),
90	weight=step_count_config.font_size_weight,
91	name="font_size_score",
92	),
93	]
94
95	def build(self, candidate: Candidate, result: ClassificationResult) -> StepCount:	1✔
96	"""Construct a StepCount element from a candidate.
97
98	The candidate may include additional source blocks (e.g., text outline
99	effects) beyond the primary Text block.
100	"""
101	# Get the primary text block (first in source_blocks)
102	assert len(candidate.source_blocks) >= 1	1✔
103	block = candidate.source_blocks[0]	1✔
104	assert isinstance(block, Text)	1✔
105
106	# Parse the count value
107	value = extract_part_count_value(block.text)	1✔
108	if value is None:	1✔
109	raise ValueError(f"Could not parse step count from text: '{block.text}'")	×
110
111	# Use candidate.bbox which is the union of all source blocks
112	return StepCount(count=value, bbox=candidate.bbox)	1✔

bramp / build-along / 20400711546

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous