19121465813

Committed 06 Nov 2025 01:09AM UTC coverage: 85.372% (-0.5%) from 85.858%

Build # 19121465813

Build Type

push

github

Committed by

bramp

Commit Message

chore: Enabled another linter and applied it.

Run Details

12 of 32 new or added lines in 3 files covered. (37.5%)

32 existing lines in 4 files now uncovered.

4062 of 4758 relevant lines covered (85.37%)

0.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.52

/src/build_a_long/pdf_extract/classifier/classifier.py

"""
Rule-based classifier for labeling page elements.

Pipeline order and dependencies
--------------------------------
Classifiers run in a fixed, enforced order because later stages depend on
labels produced by earlier stages:

1) PageNumberClassifier → outputs: "page_number"
2) PartCountClassifier  → outputs: "part_count"
3) StepNumberClassifier → outputs: "step_number" (uses page_number size as context)
4) PartsListClassifier  → outputs: "parts_list" (requires step_number and part_count)
5) PartsImageClassifier → outputs: "part_image" (requires parts_list and part_count)
6) StepClassifier       → outputs: "step" (requires step_number and parts_list)

If the order is changed such that a classifier runs before its requirements
are available, a ValueError will be raised at initialization time.
"""

import logging

from build_a_long.pdf_extract.classifier.classification_result import (
    ClassificationHints,
    ClassificationResult,
    ClassifierConfig,
    RemovalReason,
)
from build_a_long.pdf_extract.classifier.page_number_classifier import (
    PageNumberClassifier,
)
from build_a_long.pdf_extract.classifier.part_count_classifier import (
    PartCountClassifier,
)
from build_a_long.pdf_extract.classifier.parts_image_classifier import (
    PartsImageClassifier,
)
from build_a_long.pdf_extract.classifier.parts_list_classifier import (
    PartsListClassifier,
)
from build_a_long.pdf_extract.classifier.step_classifier import (
    StepClassifier,
)
from build_a_long.pdf_extract.classifier.step_number_classifier import (
    StepNumberClassifier,
)
from build_a_long.pdf_extract.extractor import PageData
from build_a_long.pdf_extract.extractor.page_elements import Text

logger = logging.getLogger(__name__)


def classify_elements(page: PageData) -> ClassificationResult:
    """Run the rule-based classification pipeline on one page.

    A default ClassifierConfig, a Classifier and a ClassificationOrchestrator
    are created fresh for every call.

    Args:
        page: The PageData whose elements should be labeled.

    Returns:
        The ClassificationResult the orchestrator produced for ``page``.
    """
    orchestrator = ClassificationOrchestrator(Classifier(ClassifierConfig()))
    return orchestrator.process_page(page)


def classify_pages(pages: list[PageData]) -> list[ClassificationResult]:
    """Run the rule-based classification pipeline on a sequence of pages.

    One default-configured classifier and one orchestrator are shared by all
    pages; pages are processed in the order given.

    Args:
        pages: The PageData objects to classify.

    Returns:
        One ClassificationResult per input page, in input order.
    """
    orchestrator = ClassificationOrchestrator(Classifier(ClassifierConfig()))
    return [orchestrator.process_page(page_data) for page_data in pages]


class Classifier:
    """
    Executes one classification pass over a page, driven by the configured
    rules and any hints supplied by the caller.

    The class is meant to be stateless between pages: every ``classify`` call
    builds a brand-new ClassificationResult.
    """

    def __init__(self, config: ClassifierConfig):
        """Build the stage pipeline and verify its ordering.

        Args:
            config: Configuration handed to every stage classifier.

        Raises:
            ValueError: If a stage declares ``requires`` labels that no earlier
                stage lists in its ``outputs``.
        """
        self.config = config
        # Order matters: a stage may rely on labels emitted by earlier stages.
        self.classifiers = [
            PageNumberClassifier(config, self),
            PartCountClassifier(config, self),
            StepNumberClassifier(config, self),
            PartsListClassifier(config, self),
            PartsImageClassifier(config, self),
            StepClassifier(config, self),
        ]

        # Walk the pipeline once, accumulating the labels available so far and
        # rejecting any stage whose requirements are not yet satisfied.
        produced: set[str] = set()
        for c in self.classifiers:
            need = getattr(c, "requires", set())
            if need.issubset(produced):
                produced |= getattr(c, "outputs", set())
                continue

            cls = c.__class__
            missing = ", ".join(sorted(need - produced))
            raise ValueError(
                f"Classifier order invalid: {cls.__name__} requires labels not yet produced: {missing}"
            )

    def classify(
        self, page_data: PageData, hints: ClassificationHints | None = None
    ) -> ClassificationResult:
        """
        Run every stage against ``page_data`` and collect the outcome.

        Each stage's ``evaluate`` is called before its ``classify``; afterwards
        the post-classification warnings are attached to the result. All
        findings are recorded on the returned result rather than written to
        ``page_data`` by this method.

        Args:
            page_data: The page to classify.
            hints: Optional hints forwarded unchanged to each stage.

        Returns:
            A new ClassificationResult for ``page_data``.
        """
        result = ClassificationResult(page_data=page_data)

        for stage in self.classifiers:
            stage.evaluate(page_data, result)
            stage.classify(page_data, result, hints)

        for message in self._log_post_classification_warnings(page_data, result):
            result.add_warning(message)

        return result

    def _remove_child_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: set[int] | None = None,
    ) -> None:
        """Mark as removed every element whose bbox lies fully inside ``target``'s.

        Args:
            page_data: Page whose elements are scanned.
            target: Element whose bbox acts as the container; never removed.
            result: Result object on which removals are recorded.
            keep_ids: ``id()`` values of elements that must not be removed.
        """
        protected = set() if keep_ids is None else keep_ids
        container = target.bbox

        for ele in page_data.elements:
            if ele is target or id(ele) in protected:
                continue
            if ele.bbox.fully_inside(container):
                reason = RemovalReason(reason_type="child_bbox", target_element=target)
                result.mark_removed(ele, reason)

    def _remove_similar_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: set[int] | None = None,
    ) -> None:
        """Mark as removed every element whose bbox nearly duplicates ``target``'s.

        An element counts as a near-duplicate when either its IoU with the
        target bbox reaches the IoU threshold, or its centre is within the
        centre tolerance on both axes and its area differs from the target
        area by no more than the relative area tolerance.

        Args:
            page_data: Page whose elements are scanned.
            target: Reference element; never removed.
            result: Result object on which removals are recorded.
            keep_ids: ``id()`` values of elements that must not be removed.
        """
        protected = set() if keep_ids is None else keep_ids

        reference = target.bbox
        target_area = reference.area
        tx, ty = reference.center

        min_iou = 0.8
        center_tolerance = 1.5
        area_tolerance = 0.12

        for ele in page_data.elements:
            if ele is target or id(ele) in protected:
                continue

            candidate = ele.bbox
            is_duplicate = reference.iou(candidate) >= min_iou

            if not is_duplicate:
                # Fallback test: (almost) same centre and (almost) same area.
                cx, cy = candidate.center
                if abs(cx - tx) <= center_tolerance and abs(cy - ty) <= center_tolerance:
                    area = candidate.area
                    is_duplicate = (
                        target_area > 0
                        and abs(area - target_area) / target_area <= area_tolerance
                    )

            if is_duplicate:
                result.mark_removed(
                    ele,
                    RemovalReason(reason_type="similar_bbox", target_element=target),
                )

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> list[str]:
        """Build consistency warnings for a classified page.

        Checks performed, in order:
          * no element is labeled "page_number";
          * a "parts_list" element has no "part_count" bbox fully inside it;
          * a Text element labeled "step_number" has no parts list whose
            ``y1`` is at or before the step's ``y0`` plus a small tolerance.

        Despite the name, nothing is logged here; the messages are returned.

        Args:
            page_data: The page that was classified (used for its page number).
            result: The classification result to inspect.

        Returns:
            The warning messages, possibly empty.
        """

        labeled_elements = result.get_labeled_elements()

        def with_label(wanted: str) -> list:
            # Elements carrying exactly the requested label, in mapping order.
            return [e for e, label in labeled_elements.items() if label == wanted]

        warnings = []

        if not with_label("page_number"):
            warnings.append(f"Page {page_data.page_number}: missing page number")

        parts_lists = with_label("parts_list")
        part_counts = with_label("part_count")

        for pl in parts_lists:
            if any(t.bbox.fully_inside(pl.bbox) for t in part_counts):
                continue
            warnings.append(
                f"Page {page_data.page_number}: parts list at {pl.bbox} contains no part counts"
            )

        # Only Text elements expose the ``text`` used in the message below.
        steps: list[Text] = [
            e for e in with_label("step_number") if isinstance(e, Text)
        ]
        ABOVE_EPS = 2.0
        for step in steps:
            sb = step.bbox
            if any(pl.bbox.y1 <= sb.y0 + ABOVE_EPS for pl in parts_lists):
                continue
            warnings.append(
                f"Page {page_data.page_number}: step number '{step.text}' at {sb} has no parts list above it"
            )
        return warnings


class ClassificationOrchestrator:
    """
    Manages the backtracking classification process.

    A page is classified, and while the result still carries warnings the
    classifier is re-run with freshly generated hints, up to
    ``max_iterations`` runs in total.

    This class is stateful: every result produced (for every page) is appended
    to ``history``.
    """

    def __init__(self, classifier: Classifier, max_iterations: int = 1):
        """
        Args:
            classifier: The classifier used for each classification run.
            max_iterations: Maximum number of classification runs per page.
                Defaults to 1 (a single run, no retry).

        Raises:
            ValueError: If ``max_iterations`` is less than 1.
        """
        if max_iterations < 1:
            raise ValueError(
                f"max_iterations must be at least 1, got {max_iterations}"
            )
        self.classifier = classifier
        self.max_iterations = max_iterations
        self.history: list[ClassificationResult] = []

    def process_page(self, page_data: PageData) -> ClassificationResult:
        """
        Orchestrates the classification of a single page, with backtracking.

        Args:
            page_data: The page to classify.

        Returns:
            The final ClassificationResult containing labels and removal info:
            the first run without warnings, or the last run performed.
        """
        hints = ClassificationHints()
        result = self.classifier.classify(page_data, hints)
        self.history.append(result)

        # Retry only while there are warnings and attempts remain. Hints are
        # generated solely when another run will actually consume them.
        for _ in range(self.max_iterations - 1):
            inconsistencies = result.get_warnings()
            if not inconsistencies:
                break

            hints = self._generate_new_hints(result, inconsistencies)
            result = self.classifier.classify(page_data, hints)
            self.history.append(result)

        # Return the local result rather than history[-1]: history is shared
        # across pages, so it is not a reliable source for "this page's" result.
        return result

    def _generate_new_hints(
        self, result: ClassificationResult, inconsistencies: list[str]
    ) -> ClassificationHints:
        """
        Creates new hints to guide the next classification run.

        Currently a placeholder: it ignores its arguments and returns empty
        hints.
        """
        # TODO Here, we should look at the font sizes, and use that to help
        # bias the next run towards more consistent results.
        # TODO Figure out how to pass the hints between pages (of the book).
        return ClassificationHints()

1	"""
2	Rule-based classifier for labeling page elements.
3
4	Pipeline order and dependencies
5	--------------------------------
6	Classifiers run in a fixed, enforced order because later stages depend on
7	labels produced by earlier stages:
8
9	1) PageNumberClassifier → outputs: "page_number"
10	2) PartCountClassifier → outputs: "part_count"
11	3) StepNumberClassifier → outputs: "step_number" (uses page_number size as context)
12	4) PartsListClassifier → outputs: "parts_list" (requires step_number and part_count)
13	5) PartsImageClassifier → outputs: "part_image" (requires parts_list and part_count)
14	6) StepClassifier → outputs: "step" (requires step_number and parts_list)
15
16	If the order is changed such that a classifier runs before its requirements
17	are available, a ValueError will be raised at initialization time.
18	"""
19
20	import logging	1✔
21
22	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
23	ClassificationHints,
24	ClassificationResult,
25	ClassifierConfig,
26	RemovalReason,
27	)
28	from build_a_long.pdf_extract.classifier.page_number_classifier import (	1✔
29	PageNumberClassifier,
30	)
31	from build_a_long.pdf_extract.classifier.part_count_classifier import (	1✔
32	PartCountClassifier,
33	)
34	from build_a_long.pdf_extract.classifier.parts_image_classifier import (	1✔
35	PartsImageClassifier,
36	)
37	from build_a_long.pdf_extract.classifier.parts_list_classifier import (	1✔
38	PartsListClassifier,
39	)
40	from build_a_long.pdf_extract.classifier.step_classifier import (	1✔
41	StepClassifier,
42	)
43	from build_a_long.pdf_extract.classifier.step_number_classifier import (	1✔
44	StepNumberClassifier,
45	)
46	from build_a_long.pdf_extract.extractor import PageData	1✔
47	from build_a_long.pdf_extract.extractor.page_elements import Text	1✔
48
49	logger = logging.getLogger(__name__)	1✔
50
51
52	def classify_elements(page: PageData) -> ClassificationResult:	1✔
53	"""Classify and label elements on a single page using rule-based heuristics.
54
55	Args:
56	page: A single PageData object to classify.
57
58	Returns:
59	A ClassificationResult object containing the classification results.
60	"""
61	config = ClassifierConfig()	1✔
62	classifier = Classifier(config)	1✔
63	orchestrator = ClassificationOrchestrator(classifier)	1✔
64
65	return orchestrator.process_page(page)	1✔
66
67
68	def classify_pages(pages: list[PageData]) -> list[ClassificationResult]:	1✔
69	"""Classify and label elements across multiple pages using rule-based heuristics.
70
71	Args:
72	pages: A list of PageData objects to classify.
73
74	Returns:
75	List of ClassificationResult objects, one per page.
76	"""
77	config = ClassifierConfig()	1✔
78	classifier = Classifier(config)	1✔
79	orchestrator = ClassificationOrchestrator(classifier)	1✔
80
81	results = []	1✔
82	for page_data in pages:	1✔
83	result = orchestrator.process_page(page_data)	1✔
84	results.append(result)	1✔
85
86	return results	1✔
87
88
89	class Classifier:	1✔
90	"""
91	Performs a single run of classification based on rules, configuration, and hints.
92	This class should be stateless.
93	"""
94
95	def __init__(self, config: ClassifierConfig):	1✔
96	self.config = config	1✔
97	self.classifiers = [	1✔
98	PageNumberClassifier(config, self),
99	PartCountClassifier(config, self),
100	StepNumberClassifier(config, self),
101	PartsListClassifier(config, self),
102	PartsImageClassifier(config, self),
103	StepClassifier(config, self),
104	]
105
106	produced: set[str] = set()	1✔
107	for c in self.classifiers:	1✔
108	cls = c.__class__	1✔
109	need = getattr(c, "requires", set())	1✔
110	if not need.issubset(produced):	1✔
111	missing = ", ".join(sorted(need - produced))	1✔
112	raise ValueError(	1✔
113	f"Classifier order invalid: {cls.__name__} requires labels not yet produced: {missing}"
114	)
115	produced |= getattr(c, "outputs", set())	1✔
116
117	def classify(	1✔
118	self, page_data: PageData, hints: ClassificationHints | None = None
119	) -> ClassificationResult:
120	"""
121	Runs the classification logic and returns a result.
122	It does NOT modify page_data directly.
123	"""
124	result = ClassificationResult(page_data=page_data)	1✔
125
126	for classifier in self.classifiers:	1✔
127	classifier.evaluate(page_data, result)	1✔
128	classifier.classify(page_data, result, hints)	1✔
129
130	warnings = self._log_post_classification_warnings(page_data, result)	1✔
131	for warning in warnings:	1✔
132	result.add_warning(warning)	1✔
133
134	return result	1✔
135
136	def _remove_child_bboxes(	1✔
137	self,
138	page_data: PageData,
139	target,
140	result: ClassificationResult,
141	keep_ids: set[int] | None = None,
142	) -> None:
143	if keep_ids is None:	1✔
144	keep_ids = set()	1✔
145
146	target_bbox = target.bbox	1✔
147
148	for ele in page_data.elements:	1✔
149	if ele is target or id(ele) in keep_ids:	1✔
150	continue	1✔
151	b = ele.bbox	1✔
152	if b.fully_inside(target_bbox):	1✔
153	result.mark_removed(	1✔
154	ele, RemovalReason(reason_type="child_bbox", target_element=target)
155	)
156
157	def _remove_similar_bboxes(	1✔
158	self,
159	page_data: PageData,
160	target,
161	result: ClassificationResult,
162	keep_ids: set[int] | None = None,
163	) -> None:
164	if keep_ids is None:	1✔
165	keep_ids = set()	1✔
166
167	target_area = target.bbox.area	1✔
168	tx, ty = target.bbox.center	1✔
169
170	IOU_THRESHOLD = 0.8	1✔
171	CENTER_EPS = 1.5	1✔
172	AREA_TOL = 0.12	1✔
173
174	for ele in page_data.elements:	1✔
175	if ele is target or id(ele) in keep_ids:	1✔
176	continue	1✔
177
178	b = ele.bbox	1✔
179	iou = target.bbox.iou(b)	1✔
180	if iou >= IOU_THRESHOLD:	1✔
181	result.mark_removed(	1✔
182	ele,
183	RemovalReason(reason_type="similar_bbox", target_element=target),
184	)
185	continue	1✔
186
187	cx, cy = b.center	1✔
188	if abs(cx - tx) <= CENTER_EPS and abs(cy - ty) <= CENTER_EPS:	1✔
UNCOV 189	area = b.area	×
UNCOV 190	if (	×
191	target_area > 0
192	and abs(area - target_area) / target_area <= AREA_TOL
193	):
UNCOV 194	result.mark_removed(	×
195	ele,
196	RemovalReason(
197	reason_type="similar_bbox", target_element=target
198	),
199	)
200
201	def _log_post_classification_warnings(	1✔
202	self, page_data: PageData, result: ClassificationResult
203	) -> list[str]:
204	warnings = []	1✔
205
206	labeled_elements = result.get_labeled_elements()	1✔
207
208	# Check if there's a page number
209	has_page_number = any(	1✔
210	label == "page_number" for label in labeled_elements.values()
211	)
212	if not has_page_number:	1✔
213	warnings.append(f"Page {page_data.page_number}: missing page number")	1✔
214
215	# Get elements by label
216	parts_lists = [	1✔
217	e for e, label in labeled_elements.items() if label == "parts_list"
218	]
219	part_counts = [	1✔
220	e for e, label in labeled_elements.items() if label == "part_count"
221	]
222
223	for pl in parts_lists:	1✔
224	inside_counts = [t for t in part_counts if t.bbox.fully_inside(pl.bbox)]	1✔
225	if not inside_counts:	1✔
UNCOV 226	warnings.append(	×
227	f"Page {page_data.page_number}: parts list at {pl.bbox} contains no part counts"
228	)
229
230	steps: list[Text] = [	1✔
231	e
232	for e, label in labeled_elements.items()
233	if label == "step_number" and isinstance(e, Text)
234	]
235	ABOVE_EPS = 2.0	1✔
236	for step in steps:	1✔
237	sb = step.bbox	1✔
238	above = [pl for pl in parts_lists if pl.bbox.y1 <= sb.y0 + ABOVE_EPS]	1✔
239	if not above:	1✔
240	warnings.append(	1✔
241	f"Page {page_data.page_number}: step number '{step.text}' at {sb} has no parts list above it"
242	)
243	return warnings	1✔
244
245
246	class ClassificationOrchestrator:	1✔
247	"""
248	Manages the backtracking classification process.
249	This class is stateful.
250	"""
251
252	def __init__(self, classifier: Classifier):	1✔
253	self.classifier = classifier	1✔
254	self.history: list[ClassificationResult] = []	1✔
255
256	def process_page(self, page_data: PageData) -> ClassificationResult:	1✔
257	"""
258	Orchestrates the classification of a single page, with backtracking.
259
260	Returns:
261	The final ClassificationResult containing labels and removal info.
262	"""
263	hints = ClassificationHints()	1✔
264
265	max_iterations = 1 # TODO raise this in future	1✔
266	for i in range(max_iterations):	1✔
267	result = self.classifier.classify(page_data, hints)	1✔
268	self.history.append(result)	1✔
269
270	inconsistencies = result.get_warnings()	1✔
271	if not inconsistencies:	1✔
272	return result	1✔
273
274	hints = self._generate_new_hints(result, inconsistencies)	1✔
275
276	final_result = self.history[-1]	1✔
277	return final_result	1✔
278
279	def _generate_new_hints(	1✔
280	self, result: ClassificationResult, inconsistencies: list[str]
281	) -> ClassificationHints:
282	"""
283	Creates new hints to guide the next classification run.
284	"""
285	# TODO Here, we should look at the font sizes, and use that to help
286	# bias the next run towards more consistent results.
287	# TODO Figure out how to pass the hints between pages (of the book).
288	return ClassificationHints()	1✔

bramp / build-along / 19121465813

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous