20086551557

Committed 10 Dec 2025 03:43AM UTC coverage: 90.303% (+0.3%) from 90.041%

Build # 20086551557

Build Type

push

github

Committed by

bramp

Commit Message

Refactor arrow shaft detection: unified method, stroked line support, multi-head grouping

- Merge _find_simple_shaft, _find_stroked_line_shaft, and _find_cornered_shaft
  into a single unified _find_shaft method that handles all shaft types by
  extracting points and finding closest/furthest from the arrowhead tip
- Add support for stroked line shafts (stroke_color instead of fill_color)
- Add tail_grouping_tolerance config for grouping arrowheads with nearby tails
- Group arrowheads that share the same shaft_block (L-shaped arrows with
  multiple heads at different ends)
- Use union-find algorithm to group arrowheads by shared shaft or tail proximity
- Extract colors_match to shared utils module
- Add comprehensive tests for stroked line shafts, tail correctness, and
  multi-head arrow grouping
- Update golden files for pages 011, 013, 015, 017 with corrected arrow detection

Coverage Stats

204 of 206 new or added lines in 5 files covered. (99.03%)

252 existing lines in 14 files now uncovered.

11855 of 13128 relevant lines covered (90.3%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.67

/src/build_a_long/pdf_extract/classifier/utils.py

"""Utility functions for classifiers."""

from collections.abc import Sequence

from build_a_long.pdf_extract.extractor.page_blocks import Drawing


def score_white_fill(block: Drawing, white_threshold: float = 0.9) -> float:
    """Score a drawing block based on how close its fill color is to white.

    Args:
        block: The Drawing block to analyze. Only ``fill_color`` is read, and
            it is unpacked as three channels (r, g, b).
        white_threshold: Value every channel must reach for the fill to count
            as fully white (0.0-1.0).

    Returns:
        Score from 0.0 to 1.0:

        * 0.0 when there is no fill color, or when any channel is at or
          below the light-gray baseline (0.7).
        * 1.0 when every channel is at or above ``white_threshold``.
        * Otherwise a linear ramp on the darkest channel, from 0.6 just above
          the baseline up to 1.0 at ``white_threshold``.
    """
    fill = block.fill_color
    if fill is None:
        return 0.0

    r, g, b = fill
    # Fully white: every channel at or above the threshold.
    if all(channel >= white_threshold for channel in (r, g, b)):
        return 1.0

    # Light gray is also acceptable.
    # Using 0.7 as a reasonable baseline for "light gray" based on existing
    # classifiers.
    gray_threshold = 0.7
    if r > gray_threshold and g > gray_threshold and b > gray_threshold:
        # Scale the score by how close the darkest channel is to white:
        # gray_threshold -> min_score, white_threshold -> 1.0.
        # The ramp ends at white_threshold (not 1.0) so the score is
        # continuous with the "fully white" branch above. Reaching this point
        # implies gray_threshold < darkest < white_threshold, so the
        # denominator is strictly positive.
        min_score = 0.6
        darkest = min(r, g, b)
        return min_score + (1.0 - min_score) * (darkest - gray_threshold) / (
            white_threshold - gray_threshold
        )

    return 0.0


def colors_match(
    color1: Sequence[float],
    color2: Sequence[float],
    tolerance: float = 0.1,
) -> bool:
    """Check if two colors match within a tolerance.

    Args:
        color1: First color as RGB tuple (0.0-1.0).
        color2: Second color as RGB tuple (0.0-1.0).
        tolerance: Maximum difference per channel.

    Returns:
        True if colors match within tolerance.
    """
    if len(color1) != len(color2):
        return False
    return all(abs(c1 - c2) <= tolerance for c1, c2 in zip(color1, color2, strict=True))


def extract_unique_points(
    line_items: Sequence[tuple], precision: int = 1
) -> list[tuple[float, float]]:
    """Collect the distinct endpoints of the line items, in first-seen order.

    Args:
        line_items: Drawing items; only those shaped like
            ('l', (x1, y1), (x2, y2)) are used, anything whose first
            element is not "l" is skipped.
        precision: Decimal places used when rounding coordinates to decide
            whether two points are the same.

    Returns:
        List of (x, y) points. Each point keeps the unrounded coordinates of
        its first occurrence; later points that round to the same key are
        dropped.
    """
    # Maps rounded key -> original point; dict insertion order preserves the
    # order in which each distinct point was first encountered.
    first_seen: dict[tuple[float, float], tuple[float, float]] = {}

    for entry in line_items:
        if entry[0] != "l":
            # Not a line segment; ignore it.
            continue

        start, end = entry[1], entry[2]
        for endpoint in (start, end):
            x, y = endpoint[0], endpoint[1]
            rounded = (round(x, precision), round(y, precision))
            # setdefault leaves an existing entry untouched, so the first
            # occurrence wins.
            first_seen.setdefault(rounded, (x, y))

    return list(first_seen.values())

1	"""Utility functions for classifiers."""
2
3	from collections.abc import Sequence	1✔
4
5	from build_a_long.pdf_extract.extractor.page_blocks import Drawing	1✔
6
7
8	def score_white_fill(block: Drawing, white_threshold: float = 0.9) -> float:	1✔
9	"""Score a drawing block based on having white fill.
10
11	Args:
12	block: The Drawing block to analyze.
13	white_threshold: Threshold for considering a channel "white" (0.0-1.0).
14
15	Returns:
16	Score from 0.0 to 1.0 where 1.0 is white fill.
17	"""
18	if block.fill_color is None:	1✔
19	return 0.0	1✔
20
21	r, g, b = block.fill_color	1✔
22	# Check if it's white (all channels above threshold)
23	if r >= white_threshold and g >= white_threshold and b >= white_threshold:	1✔
24	return 1.0	1✔
25
26	# Light gray is also acceptable
27	# Using 0.7 as a reasonable baseline for "light gray" based on existing classifiers
28	gray_threshold = 0.7	1✔
29	if r > gray_threshold and g > gray_threshold and b > gray_threshold:	1✔
30	# Scale score based on how close to white it is
31	# 0.7 -> 0.6, 0.9 -> 1.0
32	min_score = 0.6	1✔
33	return min_score + (1.0 - min_score) * (min(r, g, b) - gray_threshold) / (	1✔
34	1.0 - gray_threshold
35	)
36
37	return 0.0	1✔
38
39
40	def colors_match(	1✔
41	color1: Sequence[float],
42	color2: Sequence[float],
43	tolerance: float = 0.1,
44	) -> bool:
45	"""Check if two colors match within a tolerance.
46
47	Args:
48	color1: First color as RGB tuple (0.0-1.0).
49	color2: Second color as RGB tuple (0.0-1.0).
50	tolerance: Maximum difference per channel.
51
52	Returns:
53	True if colors match within tolerance.
54	"""
55	if len(color1) != len(color2):	1✔
56	return False	1✔
57	return all(abs(c1 - c2) <= tolerance for c1, c2 in zip(color1, color2, strict=True))	1✔
58
59
60	def extract_unique_points(	1✔
61	line_items: Sequence[tuple], precision: int = 1
62	) -> list[tuple[float, float]]:
63	"""Extract unique points from line items.
64
65	Args:
66	line_items: List of line items, each ('l', (x1, y1), (x2, y2)).
67	precision: Decimal places for rounding to determine uniqueness.
68
69	Returns:
70	List of unique (x, y) points.
71	"""
72	points: list[tuple[float, float]] = []	1✔
73	seen: set[tuple[float, float]] = set()	1✔
74
75	for item in line_items:	1✔
76	# item is ('l', (x1, y1), (x2, y2))
77	# Ensure it is a line item
78	if item[0] != "l":	1✔
NEW 79	continue	×
80
81	p1, p2 = item[1], item[2]	1✔
82	for p in [p1, p2]:	1✔
83	key = (round(p[0], precision), round(p[1], precision))	1✔
84	if key not in seen:	1✔
85	seen.add(key)	1✔
86	points.append((p[0], p[1]))	1✔
87
88	return points	1✔

bramp / build-along / 20086551557

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous