19058602748

Committed 04 Nov 2025 04:38AM UTC coverage: 88.027% (-0.09%) from 88.118%

Build # 19058602748

Build Type

push

github

Committed by

bramp

Commit Message

Add Step classifier and refactor Candidate to support synthetic elements

- Implement StepClassifier that combines step_number, parts_list, and diagram into Step structures
- Refactor Candidate class to make source_element optional and add required bbox field
- Step is a synthetic/composite element with no single source, so source_element is None
- Update all classifiers to pass bbox explicitly when creating Candidates
- Add comprehensive tests for StepClassifier covering various scenarios
- Update mark_winner and helper methods to handle optional source_element
- Clean up unused step_elements dictionary from StepClassifier
- All tests passing (10/10 classifier test suites)

Run Details

179 of 181 new or added lines in 5 files covered. (98.9%)

111 existing lines in 10 files now uncovered.

3529 of 4009 relevant lines covered (88.03%)

0.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.72

/src/build_a_long/pdf_extract/classifier/classifier.py

"""
Rule-based classifier for labeling page elements.

Pipeline order and dependencies
--------------------------------
Classifiers run in a fixed, enforced order because later stages depend on
labels produced by earlier stages:

1) PageNumberClassifier → outputs: "page_number"
2) PartCountClassifier  → outputs: "part_count"
3) StepNumberClassifier → outputs: "step_number" (uses page_number size as context)
4) PartsListClassifier  → outputs: "parts_list" (requires step_number and part_count)
5) PartsImageClassifier → outputs: "part_image" (requires parts_list and part_count)
6) StepClassifier       → outputs: "step" (requires step_number and parts_list)

If the order is changed such that a classifier runs before its requirements
are available, a ValueError will be raised at initialization time.
"""

import logging
from typing import List, Optional, Set

from build_a_long.pdf_extract.classifier.page_number_classifier import (
    PageNumberClassifier,
)
from build_a_long.pdf_extract.classifier.part_count_classifier import (
    PartCountClassifier,
)
from build_a_long.pdf_extract.classifier.parts_list_classifier import (
    PartsListClassifier,
)
from build_a_long.pdf_extract.classifier.parts_image_classifier import (
    PartsImageClassifier,
)
from build_a_long.pdf_extract.classifier.step_number_classifier import (
    StepNumberClassifier,
)
from build_a_long.pdf_extract.classifier.step_classifier import (
    StepClassifier,
)
from build_a_long.pdf_extract.classifier.types import (
    ClassifierConfig,
    ClassificationHints,
    ClassificationResult,
    RemovalReason,
)
from build_a_long.pdf_extract.extractor import PageData
from build_a_long.pdf_extract.extractor.page_elements import Text

logger = logging.getLogger(__name__)


def classify_elements(page: PageData) -> ClassificationResult:
    """Run the rule-based classification pipeline on one page.

    Every call builds a fresh default ``ClassifierConfig``, a ``Classifier``
    using it, and a ``ClassificationOrchestrator`` wrapping that classifier.

    Args:
        page: The single PageData object whose elements should be labeled.

    Returns:
        The ClassificationResult the orchestrator produced for ``page``.
    """
    orchestrator = ClassificationOrchestrator(Classifier(ClassifierConfig()))
    return orchestrator.process_page(page)


def classify_pages(pages: List[PageData]) -> List[ClassificationResult]:
    """Run the rule-based classification pipeline on each page in turn.

    A single default-configured classifier and orchestrator are created once
    and reused for every page, in the order the pages are given.

    Args:
        pages: The PageData objects to classify.

    Returns:
        One ClassificationResult per input page, in the same order.
    """
    orchestrator = ClassificationOrchestrator(Classifier(ClassifierConfig()))
    return [orchestrator.process_page(page_data) for page_data in pages]


class Classifier:
    """
    Executes one classification pass over a page using the rule pipeline.

    The individual classifiers are instantiated once, in a fixed order, and
    reused by every ``classify`` call. This class should be stateless.
    """

    def __init__(self, config: ClassifierConfig):
        """Build the classifier pipeline and validate its ordering.

        Args:
            config: Configuration passed to every pipeline classifier.

        Raises:
            ValueError: If a classifier's ``requires`` labels are not all
                found in the ``outputs`` of the classifiers placed before it.
        """
        self.config = config
        self.classifiers = [
            PageNumberClassifier(config, self),
            PartCountClassifier(config, self),
            StepNumberClassifier(config, self),
            PartsListClassifier(config, self),
            PartsImageClassifier(config, self),
            StepClassifier(config, self),
        ]

        # Walk the pipeline in order, tracking which labels exist so far.
        # Classifiers without ``requires``/``outputs`` count as empty sets.
        available: Set[str] = set()
        for stage in self.classifiers:
            needed = getattr(stage, "requires", set())
            if needed.issubset(available):
                available |= getattr(stage, "outputs", set())
                continue

            cls = stage.__class__
            missing = ", ".join(sorted(needed - available))
            raise ValueError(
                f"Classifier order invalid: {cls.__name__} requires labels not yet produced: {missing}"
            )

    def classify(
        self, page_data: PageData, hints: Optional[ClassificationHints] = None
    ) -> ClassificationResult:
        """
        Run every pipeline classifier over the page and collect the outcome.

        Each classifier is asked to ``evaluate`` and then ``classify`` before
        the next one starts. Afterwards the post-classification warnings are
        attached to the result, together with the part/image pairs reported
        by the PartsImageClassifier (an empty list when the pipeline has
        none). page_data is not meant to be modified directly.

        Args:
            page_data: The page to classify.
            hints: Optional hints forwarded to each classifier's ``classify``.

        Returns:
            A new ClassificationResult for this run.
        """
        result = ClassificationResult()

        for stage in self.classifiers:
            stage.evaluate(page_data, result)
            stage.classify(page_data, result, hints)

        for message in self._log_post_classification_warnings(page_data, result):
            result.add_warning(message)

        # Extract persisted relations from the first PartsImageClassifier.
        image_stage = next(
            (
                stage
                for stage in self.classifiers
                if isinstance(stage, PartsImageClassifier)
            ),
            None,
        )
        if image_stage is None:
            result.part_image_pairs = []
        else:
            result.part_image_pairs = image_stage.get_part_image_pairs()

        return result

    def _remove_child_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: Optional[Set[int]] = None,
    ) -> None:
        """Mark elements lying fully inside ``target``'s bbox as removed.

        Args:
            page_data: Page whose ``elements`` are scanned.
            target: Element whose bbox acts as the container; never removed.
            result: Receives a ``mark_removed`` call with a "child_bbox"
                RemovalReason for each contained element.
            keep_ids: ``id()`` values of elements that must be left alone.
        """
        protected = set() if keep_ids is None else keep_ids
        container = target.bbox

        for candidate in page_data.elements:
            if candidate is target:
                continue
            if id(candidate) in protected:
                continue
            if not candidate.bbox.fully_inside(container):
                continue
            result.mark_removed(
                candidate,
                RemovalReason(reason_type="child_bbox", target_element=target),
            )

    def _remove_similar_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: Optional[Set[int]] = None,
    ) -> None:
        """Mark elements whose bbox nearly duplicates ``target``'s as removed.

        An element counts as similar when either its IoU with the target bbox
        is at least 0.8, or both centre coordinates are within 1.5 of the
        target's centre and its area differs from the target's by at most
        12% of the target area (only checked when the target area is
        positive).

        Args:
            page_data: Page whose ``elements`` are scanned.
            target: Reference element; never removed.
            result: Receives a ``mark_removed`` call with a "similar_bbox"
                RemovalReason for each similar element.
            keep_ids: ``id()`` values of elements that must be left alone.
        """
        protected = set() if keep_ids is None else keep_ids

        reference = target.bbox
        reference_area = reference.area
        ref_x, ref_y = reference.center

        min_iou = 0.8
        max_center_offset = 1.5
        max_area_ratio = 0.12

        def discard(element) -> None:
            result.mark_removed(
                element,
                RemovalReason(reason_type="similar_bbox", target_element=target),
            )

        for candidate in page_data.elements:
            if candidate is target or id(candidate) in protected:
                continue

            box = candidate.bbox

            # Strong overlap alone is enough to treat the boxes as duplicates.
            if reference.iou(box) >= min_iou:
                discard(candidate)
                continue

            # Otherwise fall back to "same centre, roughly the same area".
            mid_x, mid_y = box.center
            same_center = (
                abs(mid_x - ref_x) <= max_center_offset
                and abs(mid_y - ref_y) <= max_center_offset
            )
            if not same_center:
                continue

            box_area = box.area
            if (
                reference_area > 0
                and abs(box_area - reference_area) / reference_area <= max_area_ratio
            ):
                discard(candidate)

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> List[str]:
        """Build warning messages for suspicious classification outcomes.

        Checks, in this order:
          1. no element carries the "page_number" label;
          2. a "parts_list" element has no "part_count" element fully inside
             its bbox;
          3. a Text element labeled "step_number" has no parts list whose
             ``bbox.y1`` is at most the step's ``bbox.y0`` plus 2.0.

        The messages are only returned; this method does not log them or
        attach them to ``result``.

        Args:
            page_data: Page being checked; its ``page_number`` prefixes every
                message.
            result: Classification result supplying the labeled elements.

        Returns:
            The warning messages, possibly empty.
        """
        labeled = result.get_labeled_elements()

        def elements_labeled(wanted: str) -> list:
            return [element for element, label in labeled.items() if label == wanted]

        warnings = []

        # Check if there's a page number
        if "page_number" not in labeled.values():
            warnings.append(f"Page {page_data.page_number}: missing page number")

        parts_lists = elements_labeled("parts_list")
        part_counts = elements_labeled("part_count")

        for pl in parts_lists:
            holds_a_count = any(
                count.bbox.fully_inside(pl.bbox) for count in part_counts
            )
            if not holds_a_count:
                warnings.append(
                    f"Page {page_data.page_number}: parts list at {pl.bbox} contains no part counts"
                )

        # Only Text elements are considered as step numbers here.
        step_texts: list[Text] = [
            element
            for element in elements_labeled("step_number")
            if isinstance(element, Text)
        ]
        above_eps = 2.0
        for step in step_texts:
            sb = step.bbox
            has_list_above = any(
                candidate.bbox.y1 <= sb.y0 + above_eps for candidate in parts_lists
            )
            if not has_list_above:
                warnings.append(
                    f"Page {page_data.page_number}: step number '{step.text}' at {sb} has no parts list above it"
                )

        return warnings


class ClassificationOrchestrator:
    """
    Drives the backtracking classification loop for pages.

    This class is stateful: every ClassificationResult it obtains is appended
    to ``history``, which is never cleared between pages.
    """

    def __init__(self, classifier: Classifier):
        """Store the classifier to drive and start with an empty history.

        Args:
            classifier: The Classifier used for every classification attempt.
        """
        self.classifier = classifier
        self.history: List[ClassificationResult] = []

    def process_page(self, page_data: PageData) -> ClassificationResult:
        """
        Classify a single page, retrying with new hints while warnings remain.

        Each attempt's result is recorded in ``history``. An attempt with no
        warnings is returned immediately; otherwise new hints are generated
        for the next attempt. Once the attempt budget is used up, the most
        recent entry in ``history`` is returned.

        Args:
            page_data: The page to classify.

        Returns:
            The final ClassificationResult containing labels and removal info.
        """
        hints = ClassificationHints()

        max_iterations = 1  # TODO raise this in future
        for _attempt in range(max_iterations):
            outcome = self.classifier.classify(page_data, hints)
            self.history.append(outcome)

            problems = outcome.get_warnings()
            if problems:
                hints = self._generate_new_hints(outcome, problems)
                continue

            return outcome

        return self.history[-1]

    def _generate_new_hints(
        self, result: ClassificationResult, inconsistencies: List[str]
    ) -> ClassificationHints:
        """
        Produce the hints used to steer the next classification attempt.

        Currently a placeholder: both arguments are ignored and a default
        ClassificationHints is returned.

        Args:
            result: The result of the attempt that produced warnings.
            inconsistencies: The warnings reported for that attempt.

        Returns:
            A new, default-constructed ClassificationHints.
        """
        # TODO Here, we should look at the font sizes, and use that to help
        # bias the next run towards more consistent results.
        # TODO Figure out how to pass the hints between pages (of the book).
        return ClassificationHints()

1	"""
2	Rule-based classifier for labeling page elements.
3
4	Pipeline order and dependencies
5	--------------------------------
6	Classifiers run in a fixed, enforced order because later stages depend on
7	labels produced by earlier stages:
8
9	1) PageNumberClassifier → outputs: "page_number"
10	2) PartCountClassifier → outputs: "part_count"
11	3) StepNumberClassifier → outputs: "step_number" (uses page_number size as context)
12	4) PartsListClassifier → outputs: "parts_list" (requires step_number and part_count)
13	5) PartsImageClassifier → outputs: "part_image" (requires parts_list and part_count)
14	6) StepClassifier → outputs: "step" (requires step_number and parts_list)
15
16	If the order is changed such that a classifier runs before its requirements
17	are available, a ValueError will be raised at initialization time.
18	"""
19
20	import logging	1✔
21	from typing import List, Optional, Set	1✔
22
23	from build_a_long.pdf_extract.classifier.page_number_classifier import (	1✔
24	PageNumberClassifier,
25	)
26	from build_a_long.pdf_extract.classifier.part_count_classifier import (	1✔
27	PartCountClassifier,
28	)
29	from build_a_long.pdf_extract.classifier.parts_list_classifier import (	1✔
30	PartsListClassifier,
31	)
32	from build_a_long.pdf_extract.classifier.parts_image_classifier import (	1✔
33	PartsImageClassifier,
34	)
35	from build_a_long.pdf_extract.classifier.step_number_classifier import (	1✔
36	StepNumberClassifier,
37	)
38	from build_a_long.pdf_extract.classifier.step_classifier import (	1✔
39	StepClassifier,
40	)
41	from build_a_long.pdf_extract.classifier.types import (	1✔
42	ClassifierConfig,
43	ClassificationHints,
44	ClassificationResult,
45	RemovalReason,
46	)
47	from build_a_long.pdf_extract.extractor import PageData	1✔
48	from build_a_long.pdf_extract.extractor.page_elements import Text	1✔
49
50	logger = logging.getLogger(__name__)	1✔
51
52
53	def classify_elements(page: PageData) -> ClassificationResult:	1✔
54	"""Classify and label elements on a single page using rule-based heuristics.
55
56	Args:
57	page: A single PageData object to classify.
58
59	Returns:
60	A ClassificationResult object containing the classification results.
61	"""
62	config = ClassifierConfig()	1✔
63	classifier = Classifier(config)	1✔
64	orchestrator = ClassificationOrchestrator(classifier)	1✔
65
66	return orchestrator.process_page(page)	1✔
67
68
69	def classify_pages(pages: List[PageData]) -> List[ClassificationResult]:	1✔
70	"""Classify and label elements across multiple pages using rule-based heuristics.
71
72	Args:
73	pages: A list of PageData objects to classify.
74
75	Returns:
76	List of ClassificationResult objects, one per page.
77	"""
78	config = ClassifierConfig()	1✔
79	classifier = Classifier(config)	1✔
80	orchestrator = ClassificationOrchestrator(classifier)	1✔
81
82	results = []	1✔
83	for page_data in pages:	1✔
84	result = orchestrator.process_page(page_data)	1✔
85	results.append(result)	1✔
86
87	return results	1✔
88
89
90	class Classifier:	1✔
91	"""
92	Performs a single run of classification based on rules, configuration, and hints.
93	This class should be stateless.
94	"""
95
96	def __init__(self, config: ClassifierConfig):	1✔
97	self.config = config	1✔
98	self.classifiers = [	1✔
99	PageNumberClassifier(config, self),
100	PartCountClassifier(config, self),
101	StepNumberClassifier(config, self),
102	PartsListClassifier(config, self),
103	PartsImageClassifier(config, self),
104	StepClassifier(config, self),
105	]
106
107	produced: Set[str] = set()	1✔
108	for c in self.classifiers:	1✔
109	cls = c.__class__	1✔
110	need = getattr(c, "requires", set())	1✔
111	if not need.issubset(produced):	1✔
112	missing = ", ".join(sorted(need - produced))	1✔
113	raise ValueError(	1✔
114	f"Classifier order invalid: {cls.__name__} requires labels not yet produced: {missing}"
115	)
116	produced |= getattr(c, "outputs", set())	1✔
117
118	def classify(	1✔
119	self, page_data: PageData, hints: Optional[ClassificationHints] = None
120	) -> ClassificationResult:
121	"""
122	Runs the classification logic and returns a result.
123	It does NOT modify page_data directly.
124	"""
125	result = ClassificationResult()	1✔
126
127	for classifier in self.classifiers:	1✔
128	classifier.evaluate(page_data, result)	1✔
129	classifier.classify(page_data, result, hints)	1✔
130
131	warnings = self._log_post_classification_warnings(page_data, result)	1✔
132	for warning in warnings:	1✔
133	result.add_warning(warning)	1✔
134
135	# Extract persisted relations from PartsImageClassifier
136	part_image_pairs = []	1✔
137	for classifier in self.classifiers:	1✔
138	if isinstance(classifier, PartsImageClassifier):	1✔
139	part_image_pairs = classifier.get_part_image_pairs()	1✔
140	break	1✔
141
142	result.part_image_pairs = part_image_pairs	1✔
143
144	return result	1✔
145
146	def _remove_child_bboxes(	1✔
147	self,
148	page_data: PageData,
149	target,
150	result: ClassificationResult,
151	keep_ids: Optional[Set[int]] = None,
152	) -> None:
153	if keep_ids is None:	1✔
154	keep_ids = set()	1✔
155
156	target_bbox = target.bbox	1✔
157
158	for ele in page_data.elements:	1✔
159	if ele is target or id(ele) in keep_ids:	1✔
160	continue	1✔
161	b = ele.bbox	1✔
162	if b.fully_inside(target_bbox):	1✔
163	result.mark_removed(	1✔
164	ele, RemovalReason(reason_type="child_bbox", target_element=target)
165	)
166
167	def _remove_similar_bboxes(	1✔
168	self,
169	page_data: PageData,
170	target,
171	result: ClassificationResult,
172	keep_ids: Optional[Set[int]] = None,
173	) -> None:
174	if keep_ids is None:	1✔
175	keep_ids = set()	1✔
176
177	target_area = target.bbox.area	1✔
178	tx, ty = target.bbox.center	1✔
179
180	IOU_THRESHOLD = 0.8	1✔
181	CENTER_EPS = 1.5	1✔
182	AREA_TOL = 0.12	1✔
183
184	for ele in page_data.elements:	1✔
185	if ele is target or id(ele) in keep_ids:	1✔
186	continue	1✔
187
188	b = ele.bbox	1✔
189	iou = target.bbox.iou(b)	1✔
190	if iou >= IOU_THRESHOLD:	1✔
191	result.mark_removed(	1✔
192	ele,
193	RemovalReason(reason_type="similar_bbox", target_element=target),
194	)
195	continue	1✔
196
197	cx, cy = b.center	1✔
198	if abs(cx - tx) <= CENTER_EPS and abs(cy - ty) <= CENTER_EPS:	1✔
UNCOV 199	area = b.area	×
UNCOV 200	if (	×
201	target_area > 0
202	and abs(area - target_area) / target_area <= AREA_TOL
203	):
UNCOV 204	result.mark_removed(	×
205	ele,
206	RemovalReason(
207	reason_type="similar_bbox", target_element=target
208	),
209	)
210
211	def _log_post_classification_warnings(	1✔
212	self, page_data: PageData, result: ClassificationResult
213	) -> List[str]:
214	warnings = []	1✔
215
216	labeled_elements = result.get_labeled_elements()	1✔
217
218	# Check if there's a page number
219	has_page_number = any(	1✔
220	label == "page_number" for label in labeled_elements.values()
221	)
222	if not has_page_number:	1✔
223	warnings.append(f"Page {page_data.page_number}: missing page number")	1✔
224
225	# Get elements by label
226	parts_lists = [	1✔
227	e for e, label in labeled_elements.items() if label == "parts_list"
228	]
229	part_counts = [	1✔
230	e for e, label in labeled_elements.items() if label == "part_count"
231	]
232
233	for pl in parts_lists:	1✔
234	inside_counts = [t for t in part_counts if t.bbox.fully_inside(pl.bbox)]	1✔
235	if not inside_counts:	1✔
UNCOV 236	warnings.append(	×
237	f"Page {page_data.page_number}: parts list at {pl.bbox} contains no part counts"
238	)
239
240	steps: list[Text] = [	1✔
241	e
242	for e, label in labeled_elements.items()
243	if label == "step_number" and isinstance(e, Text)
244	]
245	ABOVE_EPS = 2.0	1✔
246	for step in steps:	1✔
247	sb = step.bbox	1✔
248	above = [pl for pl in parts_lists if pl.bbox.y1 <= sb.y0 + ABOVE_EPS]	1✔
249	if not above:	1✔
250	warnings.append(	1✔
251	f"Page {page_data.page_number}: step number '{step.text}' at {sb} has no parts list above it"
252	)
253	return warnings	1✔
254
255
256	class ClassificationOrchestrator:	1✔
257	"""
258	Manages the backtracking classification process.
259	This class is stateful.
260	"""
261
262	def __init__(self, classifier: Classifier):	1✔
263	self.classifier = classifier	1✔
264	self.history: List[ClassificationResult] = []	1✔
265
266	def process_page(self, page_data: PageData) -> ClassificationResult:	1✔
267	"""
268	Orchestrates the classification of a single page, with backtracking.
269
270	Returns:
271	The final ClassificationResult containing labels and removal info.
272	"""
273	hints = ClassificationHints()	1✔
274
275	max_iterations = 1 # TODO raise this in future	1✔
276	for i in range(max_iterations):	1✔
277	result = self.classifier.classify(page_data, hints)	1✔
278	self.history.append(result)	1✔
279
280	inconsistencies = result.get_warnings()	1✔
281	if not inconsistencies:	1✔
282	return result	1✔
283
284	hints = self._generate_new_hints(result, inconsistencies)	1✔
285
286	final_result = self.history[-1]	1✔
287	return final_result	1✔
288
289	def _generate_new_hints(	1✔
290	self, result: ClassificationResult, inconsistencies: List[str]
291	) -> ClassificationHints:
292	"""
293	Creates new hints to guide the next classification run.
294	"""
295	# TODO Here, we should look at the font sizes, and use that to help
296	# bias the next run towards more consistent results.
297	# TODO Figure out how to pass the hints between pages (of the book).
298	return ClassificationHints()	1✔

bramp / build-along / 19058602748

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous