• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19995046189

06 Dec 2025 10:18PM UTC coverage: 90.506% (+0.09%) from 90.421%
19995046189

push

github

bramp
test: regenerate golden files for step classifier refactoring

10525 of 11629 relevant lines covered (90.51%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

84.85
/src/build_a_long/pdf_extract/extractor/lego_page_elements.py
1
from __future__ import annotations
1✔
2

3
import json
1✔
4
from abc import ABC
1✔
5
from collections.abc import Iterator
1✔
6
from enum import Enum
1✔
7
from typing import Annotated, Any, Literal
1✔
8

9
from annotated_types import Ge, Gt
1✔
10
from pydantic import (
1✔
11
    BaseModel,
12
    ConfigDict,
13
    Discriminator,
14
    Field,
15
    PlainSerializer,
16
    model_validator,
17
)
18

19
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
20
from build_a_long.pdf_extract.utils import SerializationMixin, remove_empty_lists
1✔
21

22

23
class LegoPageElement(SerializationMixin, BaseModel, ABC):
    """Abstract base for LEGO-specific structured elements built by classifiers.

    Instances are usually assembled from one or more Blocks during
    classification and are kept in Candidate.constructed rather than in
    PageData.elements.

    Contract:
    - Each element owns exactly one bounding box, expressed in page coordinates
      (the coordinate system produced by the extractor).
    - Subclasses are small data holders.
    - Deriving from Pydantic's BaseModel supplies model_dump(),
      model_dump_json(), model_validate() and model_validate_json().
    - Concrete subclasses declare a ``tag`` field aliased to ``__tag__`` so
      discriminated unions can serialize them polymorphically.
    """

    # Carried-over note: page_data is excluded from serialization at dump time
    # rather than through this config.
    # NOTE(review): no page_data field is declared on this class -- confirm the
    # note still applies.
    model_config = ConfigDict(populate_by_name=True)

    bbox: BBox

    def to_json(self, *, indent: str | int | None = None, **kwargs: Any) -> str:
        """Render this element as a JSON string.

        The data comes from the mixin's ``to_dict()`` (documented as applying
        by_alias=True / exclude_none=True and rounding floats to 2 decimal
        places); empty lists are then stripped before encoding.

        Args:
            indent: Indentation for pretty-printing (a string such as a tab,
                an int, or None for compact output).
            **kwargs: Extra arguments forwarded to ``to_dict()``.

        Returns:
            The JSON text for this element.
        """
        payload = remove_empty_lists(self.to_dict(**kwargs))

        # Compact separators when not indented (matches Pydantic's behavior)
        if indent is None:
            separators = (",", ":")
        else:
            separators = (",", ": ")
        return json.dumps(payload, indent=indent, separators=separators)

    def __str__(self) -> str:
        """Give a one-line summary: the concrete class name and its bbox."""
        cls_name = self.__class__.__name__
        box_text = str(self.bbox)
        return f"{cls_name}(bbox={box_text})"

    def iter_elements(self) -> Iterator[LegoPageElement]:
        """Walk this element together with any elements nested inside it.

        The base implementation has no children, so it produces only ``self``.
        Subclasses that hold child elements override this to yield ``self``
        first and then recurse into each child.

        Yields:
            This element followed by every descendant LegoPageElement
        """
        yield self
75

76

77
class PageNumber(LegoPageElement):
    """The page number printed on the page, usually a small integer.

    Positional context: Normally found in the lower-left or lower-right corner
    of the page.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["PageNumber"] = Field(
        default="PageNumber", frozen=True, alias="__tag__"
    )
    value: Annotated[int, Ge(0)]

    def __str__(self) -> str:
        """Give a one-line summary showing the page number value."""
        number = self.value
        return f"PageNumber(value={number})"
94

95

96
class StepNumber(LegoPageElement):
    """The numeric label of a building step.

    Positional context: Sits below the PartsList inside a Step and is
    left-aligned with the PartsList container.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["StepNumber"] = Field(
        default="StepNumber", frozen=True, alias="__tag__"
    )
    value: Annotated[int, Gt(0)]

    def __str__(self) -> str:
        """Give a one-line summary showing the step number value."""
        number = self.value
        return f"StepNumber(value={number})"
113

114

115
class PartCount(LegoPageElement):
    """The printed quantity label attached to a part entry (e.g., '2x').

    Positional context: Placed directly beneath the matching part
    image/diagram and left-aligned with that image.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["PartCount"] = Field(
        default="PartCount", frozen=True, alias="__tag__"
    )
    count: Annotated[int, Gt(0)]

    # TODO Do we really need this ?
    matched_hint: Literal["part_count", "catalog_part_count"] | None = None
    """The font size hint that matched during classification, if any.

    - 'part_count': Standard instruction page part count
    - 'catalog_part_count': Catalog/inventory page part count
    """

    def __str__(self) -> str:
        """Give a one-line summary: the count plus the matched hint if set."""
        suffix = ""
        if self.matched_hint:
            suffix = f", {self.matched_hint}"
        return f"PartCount(count={self.count}x{suffix})"
139

140

141
class StepCount(LegoPageElement):
    """The printed repeat-count label of a substep (e.g., '2x').

    Positional context: Found inside a substep callout box, where it states
    how many times that sub-assembly has to be built.

    It resembles PartCount, but is set in a larger font and belongs to substep
    callout boxes instead of parts lists.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["StepCount"] = Field(
        default="StepCount", frozen=True, alias="__tag__"
    )
    count: Annotated[int, Gt(0)]

    def __str__(self) -> str:
        """Give a one-line summary showing the repeat count."""
        times = self.count
        return f"StepCount(count={times}x)"
159

160

161
class PartNumber(LegoPageElement):
    """The element ID number printed for a part (catalog pages).

    Positional context: Sits directly beneath the part count on catalog pages.
    It is a 4-8 digit number identifying the specific LEGO element.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["PartNumber"] = Field(
        default="PartNumber", frozen=True, alias="__tag__"
    )

    element_id: str
    """LEGO element ID (documented as 4-8 digits with no leading zero).

    The field is a plain ``str``; this class does not enforce the digit format.
    """

    def __str__(self) -> str:
        """Give a one-line summary showing the element ID."""
        ident = self.element_id
        return f"PartNumber(element_id={ident})"
180

181

182
class PieceLength(LegoPageElement):
    """The length marker of a LEGO piece (e.g., '4' for a 4-stud beam).

    Positional context: Found in the top-right area of a part image, usually
    enclosed by a circle or oval, and set in a smaller font than step numbers.
    It may show up on any page type (instruction, catalog, or info pages).

    Not to be confused with a step number: it states the physical length of
    the pictured part, not a construction step.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["PieceLength"] = Field(
        default="PieceLength", frozen=True, alias="__tag__"
    )

    value: Annotated[int, Gt(0)]
    """Length value (number of studs or other measurement units)."""

    def __str__(self) -> str:
        """Give a one-line summary showing the length value."""
        length = self.value
        return f"PieceLength(value={length})"
205

206

207
class Shine(LegoPageElement):
    """A 'shine' or 'star' graphic that marks a part as shiny/metallic.

    Positional context: Usually a small star-like drawing in the top-right
    area of a part image.
    """

    tag: Literal["Shine"] = Field(
        default="Shine", frozen=True, alias="__tag__"
    )

    def __str__(self) -> str:
        """Give a one-line summary showing the bounding box."""
        box = self.bbox
        return f"Shine(bbox={box})"
219

220

221
class PartImage(LegoPageElement):
    """An image candidate that may depict a LEGO part.

    Positional context: Such images normally sit in parts lists, above their
    matching PartCount text and left-aligned with it.

    The element stands for a validated image candidate; PartsClassifier pairs
    it with a PartCount to build Part elements.

    The inherited bbox field (from LegoPageElement) describes the image region.
    """

    tag: Literal["PartImage"] = Field(
        default="PartImage", frozen=True, alias="__tag__"
    )

    shine: Shine | None = None
    """Shine effect marking a metallic part, when one was found."""

    def __str__(self) -> str:
        """Give a one-line summary: the bbox, plus a marker when shiny."""
        marker = ""
        if self.shine:
            marker = ", shiny"
        return f"PartImage(bbox={self.bbox}{marker})"

    def iter_elements(self) -> Iterator[LegoPageElement]:
        """Yield this PartImage, then the shine element when present."""
        yield self
        shine = self.shine
        if shine:
            yield from shine.iter_elements()
248

249

250
class ProgressBar(LegoPageElement):
    """A bar that visualizes how far through the instruction book the page is.

    Positional context: Usually at the bottom of the page, covering most of the
    page width, close to the page number. It is often made of one or more
    Drawing/Image elements forming a horizontal bar with progress indicators.

    Note: The bbox is clipped to the page boundaries for display, while the
    original unclipped width is kept in full_width for progress calculation.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["ProgressBar"] = Field(
        default="ProgressBar", frozen=True, alias="__tag__"
    )

    progress: float | None = None
    """Progress as a fraction (0.0 to 1.0), when it can be read off the visual."""

    full_width: float
    """Original unclipped width of the bar, used for progress calculation.

    If the progress bar bbox reaches past the page boundaries (a PDF extraction
    artifact) the bbox gets clipped, but this field retains the original width,
    which may carry meaning when computing the progress percentage.
    """

    def __str__(self) -> str:
        """Give a one-line summary: the bbox, plus progress when known."""
        details = [f"bbox={str(self.bbox)}"]
        if self.progress is not None:
            details.append(f"{self.progress:.1%}")
        return f"ProgressBar({', '.join(details)})"
282

283

284
class RotationSymbol(LegoPageElement):
    """A symbol telling the builder to rotate the assembled model.

    Positional context: Usually close to diagram elements, often beside or
    below the main instruction diagram. It is either a small raster image
    (~40-80 pixels square) or a cluster of vector drawings that form curved
    arrows in a circular pattern.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["RotationSymbol"] = Field(
        default="RotationSymbol", frozen=True, alias="__tag__"
    )

    def __str__(self) -> str:
        """Give a one-line summary showing the bounding box."""
        box = self.bbox
        return f"RotationSymbol(bbox={box})"
302

303

304
class Arrow(LegoPageElement):
    """An arrow showing a direction or a relationship between elements.

    An arrow has a triangular arrowhead pointing in a specific direction.
    In LEGO instructions, arrows typically:
    - Point from a main assembly to a sub-step callout
    - Indicate direction of motion or insertion
    - Connect related elements visually

    The bbox covers the arrowhead. The tip is the point the arrow points TO;
    the tail is the point the arrow line starts FROM (the far end of the arrow
    shaft, if detected).

    Direction is measured in degrees where:
    - 0° = pointing right
    - 90° = pointing down
    - 180° or -180° = pointing left
    - -90° = pointing up
    """

    tag: Literal["Arrow"] = Field(
        default="Arrow", frozen=True, alias="__tag__"
    )

    direction: float
    """Angle, in degrees, of the direction the arrow points.

    0° = right, 90° = down, 180° = left, -90° = up.
    """

    tip: tuple[float, float]
    """Tip point (x, y) of the arrowhead - the location the arrow points TO."""

    tail: tuple[float, float] | None = None
    """Tail point (x, y) - the location the arrow line starts FROM.

    It is the far end of the arrow shaft (not the arrowhead base) and may be
    None when no arrow shaft was detected.
    """

    def __str__(self) -> str:
        """Give a one-line summary: bbox and direction rounded to whole degrees."""
        heading = f"{self.direction:.0f}"
        return f"Arrow(bbox={self.bbox}, direction={heading}°)"
345

346

347
class Part(LegoPageElement):
    """A single part entry within a parts list.

    Positional context: The part image/diagram appears first, with the PartCount
    label positioned directly below it, both left-aligned.

    See layout diagram: lego_page_layout.png
    See overview of all parts: https://brickarchitect.com/files/LEGO_BRICK_LABELS-CONTACT_SHEET.pdf
    """

    tag: Literal["Part"] = Field(default="Part", alias="__tag__", frozen=True)

    # Required quantity label for this part.
    count: PartCount

    # Image of the part, when one was paired with the count.
    diagram: PartImage | None = None

    number: PartNumber | None = None
    """Optional part number used only on catalog pages."""

    length: PieceLength | None = None
    """Optional piece length indicator (e.g., '4' for a 4-stud axle).

    Appears in the top-right of the part image, surrounded by a circle.
    Can appear on any page type.
    """

    # TODO maybe add color?
    # TODO Some parts have a "shiny" highlight - maybe reference that image

    def __str__(self) -> str:
        """Return a single-line string representation with key information.

        Always shows the count; the part number and piece length are appended
        only when present.
        """
        number_str = (
            f", number={self.number.element_id}" if self.number is not None else ""
        )
        length_str = f", len={self.length.value}" if self.length is not None else ""
        return f"Part(count={self.count.count}x{number_str}{length_str})"

    def iter_elements(self) -> Iterator[LegoPageElement]:
        """Iterate over this Part and all child elements.

        Yields:
            This Part, then the elements of its count, diagram, number and
            length (in that order), skipping the optional ones that are unset.
        """
        yield self
        # Delegate to the child's own traversal, as is done for every other
        # child here, so nested elements are never skipped.
        yield from self.count.iter_elements()
        if self.diagram is not None:
            yield from self.diagram.iter_elements()
        if self.number is not None:
            yield from self.number.iter_elements()
        if self.length is not None:
            yield from self.length.iter_elements()
391

392

393
class PartsList(LegoPageElement):
    """A container holding the parts of the page's parts list.

    Positional context: Lives inside a Step, at the top of the step area and
    usually on the left side. The individual parts are laid out with their
    images first and their count labels underneath.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["PartsList"] = Field(
        default="PartsList", frozen=True, alias="__tag__"
    )
    parts: list[Part]

    @property
    def total_items(self) -> int:
        """Number of individual pieces, i.e. the sum of all part counts.

        Example: a list holding Part(count=2) and Part(count=5) gives 7.
        """
        running_total = 0
        for entry in self.parts:
            running_total += entry.count.count
        return running_total

    def __str__(self) -> str:
        """Give a one-line summary: number of entries and total piece count."""
        entry_count = len(self.parts)
        return f"PartsList(parts={entry_count}, total_items={self.total_items})"

    def iter_elements(self) -> Iterator[LegoPageElement]:
        """Yield this PartsList, then each part with its own children."""
        yield self
        for entry in self.parts:
            yield from entry.iter_elements()
425

426

427
class BagNumber(LegoPageElement):
    """The number of a bag, usually a small integer printed on the page."""

    tag: Literal["BagNumber"] = Field(
        default="BagNumber", frozen=True, alias="__tag__"
    )
    value: Annotated[int, Gt(0)]

    def __str__(self) -> str:
        """Give a one-line summary showing the bag number value."""
        number = self.value
        return f"BagNumber(value={number})"
436

437

438
class NewBag(LegoPageElement):
    """The graphic of a new-bag icon on the page.

    The bag number is optional. With a number, it names the specific bag to
    open (e.g., "Bag 1", "Bag 2"). Without one, the bag graphic means all bags
    should be opened (no specific number).
    """

    tag: Literal["NewBag"] = Field(
        default="NewBag", frozen=True, alias="__tag__"
    )
    number: BagNumber | None = None

    def __str__(self) -> str:
        """Give a one-line summary: the bag number, or 'all' when unnumbered."""
        if not self.number:
            return "NewBag(all)"
        return f"NewBag(bag={self.number.value})"

    def iter_elements(self) -> Iterator[LegoPageElement]:
        """Yield this NewBag, then its bag number element when present."""
        yield self
        bag_number = self.number
        if bag_number:
            yield from bag_number.iter_elements()
460

461

462
class Diagram(LegoPageElement):
    """The graphic that shows how to carry out the step.

    Positional context: The main diagram sits on the right side of the step
    area and takes up most of the horizontal space. It depicts the assembly
    instructions for the step.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["Diagram"] = Field(
        default="Diagram", frozen=True, alias="__tag__"
    )

    def __str__(self) -> str:
        """Give a one-line summary showing the bounding box."""
        box_text = str(self.bbox)
        return f"Diagram(bbox={box_text})"
477

478

479
class SubAssemblyStep(LegoPageElement):
    """One step inside a sub-assembly callout box.

    A SubAssembly may hold several mini-steps, each with its own step number
    (typically starting at 1) and a diagram of that sub-step's construction.

    Positional context: Inside a SubAssembly box the steps are laid out
    horizontally or in a grid, with each step number above or beside its
    diagram.
    """

    tag: Literal["SubAssemblyStep"] = Field(
        default="SubAssemblyStep", frozen=True, alias="__tag__"
    )

    step_number: StepNumber
    """Step number of this sub-assembly step (typically 1, 2, 3, etc.)."""

    diagram: Diagram | None = None
    """Diagram of this sub-step's construction."""

    def __str__(self) -> str:
        """Give a one-line summary: step number, plus a diagram marker if set."""
        marker = ""
        if self.diagram:
            marker = ", diagram"
        return f"SubAssemblyStep(number={self.step_number.value}{marker})"

    def iter_elements(self) -> Iterator[LegoPageElement]:
        """Yield this step, its step number, then its diagram when present."""
        yield self
        yield from self.step_number.iter_elements()
        picture = self.diagram
        if picture:
            yield from picture.iter_elements()
511

512

513
class SubAssembly(LegoPageElement):
    """A sub-assembly inside a main step, usually drawn in a callout box.

    Positional context: SubAssemblies show up as white/light-colored
    rectangular boxes with arrows leading from them to the main diagram. They
    depict smaller sub-assemblies that may have to be built several times
    (signalled by a count such as "2x") before being attached to the main
    assembly.

    Structure:
    - A white/light rectangular box (detected via Drawing blocks)
    - One or more steps, each with a step number and diagram
    - An optional count indicating how many times to build it (e.g., "2x")

    Note: Arrows leading from subassemblies to the main diagram are kept in
    the arrows field of the parent Step element.

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["SubAssembly"] = Field(
        default="SubAssembly", frozen=True, alias="__tag__"
    )

    steps: list[SubAssemblyStep] = Field(default_factory=list)
    """Steps of this sub-assembly, each with a step number and diagram."""

    diagram: Diagram | None = None
    """Main/final diagram of the completed sub-assembly.

    Used for simple subassemblies that have no internal steps. When steps
    exist, each step carries its own diagram instead.
    """

    count: StepCount | None = None
    """How many times to build this sub-assembly, when a count is shown."""

    def __str__(self) -> str:
        """Give a one-line summary: optional count, then steps/diagram state."""
        prefix = ""
        if self.count:
            prefix = f"count={self.count.count}x, "

        if self.steps:
            return f"SubAssembly({prefix}steps={len(self.steps)})"
        if self.diagram:
            return f"SubAssembly({prefix}diagram)"
        return f"SubAssembly({prefix}no diagram)"

    def iter_elements(self) -> Iterator[LegoPageElement]:
        """Yield this SubAssembly, then its count, steps and diagram in order."""
        yield self
        repeat = self.count
        if repeat:
            yield from repeat.iter_elements()
        for sub_step in self.steps:
            yield from sub_step.iter_elements()
        final_picture = self.diagram
        if final_picture:
            yield from final_picture.iter_elements()
569

570

571
class Step(LegoPageElement):
    """One instruction step on the page.

    Positional context: Steps are stacked vertically on the page, usually 1-2
    per page. Inside each step:
    - PartsList is at the top-left
    - StepNumber is below the PartsList (left-aligned)
    - Main Diagram is on the right side, taking most of the space

    See layout diagram: lego_page_layout.png
    """

    tag: Literal["Step"] = Field(
        default="Step", frozen=True, alias="__tag__"
    )

    step_number: StepNumber
    parts_list: PartsList | None = None
    diagram: Diagram | None = None
    rotation_symbol: RotationSymbol | None = None
    """Rotation symbol telling the builder to rotate the model, when present."""

    arrows: list[Arrow] = Field(default_factory=list)
    """Arrows showing a direction or a relationship between elements.

    They usually lead from subassembly callout boxes to the main diagram,
    or show the direction of motion/insertion for parts.
    """

    subassemblies: list[SubAssembly] = Field(default_factory=list)
    """Sub-assemblies drawn in callout boxes inside this step.

    SubAssemblies depict smaller sub-assemblies that may have to be built
    several times before being attached to the main assembly.
    """

    def __str__(self) -> str:
        """Give a one-line summary: step number, part count, optional extras."""
        parts_total = len(self.parts_list.parts) if self.parts_list else 0
        fragments = [
            f"number={self.step_number.value}",
            f"parts={parts_total}",
        ]
        if self.rotation_symbol:
            fragments.append("rotation")
        if self.arrows:
            fragments.append(f"arrows={len(self.arrows)}")
        if self.subassemblies:
            fragments.append(f"subassemblies={len(self.subassemblies)}")
        return f"Step({', '.join(fragments)})"

    @property
    def value(self) -> int:
        """Convenience accessor for the value of this step's step number."""
        return self.step_number.value

    def iter_elements(self) -> Iterator[LegoPageElement]:
        """Yield this Step, then every child element in declaration order."""
        yield self
        yield from self.step_number.iter_elements()

        parts_list = self.parts_list
        if parts_list:
            yield from parts_list.iter_elements()

        picture = self.diagram
        if picture:
            yield from picture.iter_elements()

        rotation = self.rotation_symbol
        if rotation:
            yield from rotation.iter_elements()

        for pointer in self.arrows:
            yield from pointer.iter_elements()
        for callout in self.subassemblies:
            yield from callout.iter_elements()
637

638

639
class Page(LegoPageElement):
    """A complete page of LEGO instructions.

    This is the top-level element that contains all other elements on a page.
    It represents the structured, hierarchical view of the page after classification
    and hierarchy building.

    Attributes:
        pdf_page_number: The 1-indexed page number from the original PDF
        categories: Set of PageType values this page belongs to
        page_number: The LEGO page number element (printed on the page), if found
        progress_bar: The progress bar element, if found
        new_bags: List of NewBag elements on the page
        steps: List of Step elements on the page (for INSTRUCTION pages)
        catalog: Parts list for catalog/inventory pages (for CATALOG pages)
    """

    class PageType(Enum):
        """Type of LEGO instruction page."""

        INSTRUCTION = "instruction"
        CATALOG = "catalog"
        INFO = "info"

    tag: Literal["Page"] = Field(default="Page", alias="__tag__", frozen=True)

    pdf_page_number: int
    """The 1-indexed page number from the original PDF."""

    categories: Annotated[
        set[PageType],
        PlainSerializer(
            lambda cats: sorted(cat.value for cat in cats), return_type=list[str]
        ),
    ] = Field(default_factory=set)
    """Set of categories this page belongs to. A page can have multiple categories.

    For example, a page might be both INSTRUCTION and CATALOG if it contains
    both building steps and a parts catalog.

    Note: Serialized as a sorted list for deterministic JSON output.
    """

    page_number: PageNumber | None = None
    progress_bar: ProgressBar | None = None

    new_bags: list[NewBag] = Field(default_factory=list)
    steps: list[Step] = Field(default_factory=list)
    catalog: list[Part] = Field(default_factory=list)
    """List of parts for catalog pages. Empty list for non-catalog pages."""

    @property
    def is_instruction(self) -> bool:
        """Check if this page is an instruction page."""
        return Page.PageType.INSTRUCTION in self.categories

    @property
    def is_catalog(self) -> bool:
        """Check if this page is a catalog page."""
        return Page.PageType.CATALOG in self.categories

    @property
    def is_info(self) -> bool:
        """Check if this page is an info page."""
        return Page.PageType.INFO in self.categories

    def __str__(self) -> str:
        """Return a single-line string representation with key information.

        Shows the printed page number (or "unknown"), followed by the
        categories, bag count, catalog size and step count when non-empty.
        """
        page_num = self.page_number.value if self.page_number is not None else "unknown"
        # Sort the names: ``categories`` is a set, so its iteration order is
        # not stable between runs. Sorting keeps the text deterministic, just
        # like the sorted JSON serialization of this field.
        categories_str = (
            f", categories=[{', '.join(sorted(c.name for c in self.categories))}]"
            if self.categories
            else ""
        )
        bags_str = f", bags={len(self.new_bags)}" if self.new_bags else ""
        catalog_str = f", catalog={len(self.catalog)} parts" if self.catalog else ""
        steps_str = f", steps={len(self.steps)}" if self.steps else ""
        return (
            f"Page(number={page_num}{categories_str}{bags_str}{catalog_str}{steps_str})"
        )

    def iter_elements(self) -> Iterator[LegoPageElement]:
        """Iterate over this Page and all child elements.

        Yields all elements in depth-first order: the Page itself, then the
        contained elements (page_number, progress_bar, new_bags, catalog parts
        and steps, each followed by its own children).

        Yields:
            This element and all descendant LegoPageElements
        """
        yield self

        if self.page_number is not None:
            yield from self.page_number.iter_elements()
        if self.progress_bar is not None:
            yield from self.progress_bar.iter_elements()

        for new_bag in self.new_bags:
            yield from new_bag.iter_elements()

        for part in self.catalog:
            yield from part.iter_elements()

        for step in self.steps:
            yield from step.iter_elements()
744

745

746
# Union of every concrete LegoPageElement subclass defined above, discriminated
# on each member's ``tag`` field (aliased to ``__tag__``) via
# Discriminator("tag"). Manual is not a member of this union.
LegoPageElements = Annotated[
    PageNumber
    | StepNumber
    | StepCount
    | PartCount
    | PartNumber
    | PieceLength
    | PartImage
    | Shine
    | ProgressBar
    | RotationSymbol
    | Arrow
    | Part
    | PartsList
    | BagNumber
    | NewBag
    | Diagram
    | SubAssemblyStep
    | SubAssembly
    | Step
    | Page,
    Discriminator("tag"),
]
769

770

771
class Manual(SerializationMixin, BaseModel):
    """A complete LEGO instruction manual containing all pages.

    This is the top-level container that holds all pages from a PDF and provides
    cross-page analysis capabilities like finding unique parts, matching parts
    across pages by image digest, and navigating between pages.

    Pages are automatically sorted by PDF page number when the Manual is created.

    Attributes:
        pages: List of Page objects, sorted by pdf_page_number
        set_number: Optional LEGO set number (e.g., "75375")
        name: Optional name of the set (e.g., "Millennium Falcon")
    """

    model_config = ConfigDict(populate_by_name=True)

    tag: Literal["Manual"] = Field(default="Manual", alias="__tag__", frozen=True)

    # Set information
    set_number: str | None = None
    name: str | None = None

    # Source PDF metadata
    source_pdf: str | None = None
    """Path to the source PDF file."""

    source_size: int | None = None
    """Size of the source PDF file in bytes."""

    source_hash: str | None = None
    """Hash of the source PDF file (e.g. SHA256)."""

    # Main parsed contents
    pages: list[Page] = Field(default_factory=list)
    """List of Page objects, sorted by pdf_page_number."""

    unsupported_reason: str | None = None
    """If present, indicates why this manual could not be fully processed."""

    @model_validator(mode="after")
    def sort_pages(self) -> Manual:
        """Order ``pages`` in place by ascending ``pdf_page_number``.

        Registered as an "after" model validator.

        Returns:
            This Manual, with ``pages`` sorted.
        """
        # In-place sort keeps the identity of the list object held in ``pages``.
        self.pages.sort(key=lambda page: page.pdf_page_number)
        return self

    def get_page(self, pdf_page_number: int) -> Page | None:
        """Look up a page by its PDF page number.

        Args:
            pdf_page_number: The PDF page number to find (1-indexed)

        Returns:
            The first Page whose ``pdf_page_number`` matches, or None when no
            page matches.
        """
        candidates = (
            page for page in self.pages if page.pdf_page_number == pdf_page_number
        )
        return next(candidates, None)

    def get_page_by_lego_number(self, lego_page_number: int) -> Page | None:
        """Look up a page by its LEGO page number (the number printed on the page).

        Pages without a ``page_number`` element are never matched.

        Args:
            lego_page_number: The LEGO page number to find

        Returns:
            The first Page whose ``page_number.value`` matches, or None when no
            page matches.
        """
        candidates = (
            page
            for page in self.pages
            if page.page_number and page.page_number.value == lego_page_number
        )
        return next(candidates, None)

    @property
    def instruction_pages(self) -> list[Page]:
        """Pages whose ``is_instruction`` flag is truthy, in page order."""
        return [page for page in self.pages if page.is_instruction]

    @property
    def catalog_pages(self) -> list[Page]:
        """Pages whose ``is_catalog`` flag is truthy, in page order."""
        return [page for page in self.pages if page.is_catalog]

    @property
    def info_pages(self) -> list[Page]:
        """Pages whose ``is_info`` flag is truthy, in page order."""
        return [page for page in self.pages if page.is_info]

    @property
    def catalog_parts(self) -> list[Part]:
        """Collect the parts listed on every catalog page.

        Returns:
            A new flat list with the ``catalog`` entries of each page in
            ``catalog_pages``, concatenated in page order.
        """
        return [part for page in self.catalog_pages for part in page.catalog]

    @property
    def all_steps(self) -> list[Step]:
        """Collect the steps of every instruction page.

        Returns:
            A new flat list with the ``steps`` of each page in
            ``instruction_pages``, concatenated in page order.
        """
        return [step for page in self.instruction_pages for step in page.steps]

    @property
    def total_parts_count(self) -> int:
        """Sum ``parts_list.total_items`` over every step that has a parts list.

        Steps whose ``parts_list`` is falsy contribute nothing.
        """
        return sum(
            step.parts_list.total_items
            for step in self.all_steps
            if step.parts_list
        )

    def __str__(self) -> str:
        """Return a one-line summary: optional set/name, then page/step/part counts."""
        # Optional leading fields; each carries its own trailing separator.
        prefix = ""
        if self.set_number:
            prefix += f"set={self.set_number}, "
        if self.name:
            prefix += f'"{self.name}", '

        return (
            f"Manual({prefix}"
            f"pages={len(self.pages)}, "
            f"steps={len(self.all_steps)}, "
            f"catalog_parts={len(self.catalog_parts)})"
        )

    def to_json(self, *, indent: str | int | None = None, **kwargs: Any) -> str:
        """Serialize this manual to a JSON string.

        The data comes from the mixin's ``to_dict()`` and is passed through
        ``remove_empty_lists()`` before being encoded with ``json.dumps``.

        Args:
            indent: Indentation handed to ``json.dumps`` (a string, an int, or
                None for compact single-line output)
            **kwargs: Additional arguments forwarded to ``to_dict()``

        Returns:
            The JSON text.
        """
        payload = remove_empty_lists(self.to_dict(**kwargs))

        # Compact item/key separators when not pretty-printing; otherwise keep
        # a space after the colon.
        if indent is None:
            separators = (",", ":")
        else:
            separators = (",", ": ")
        return json.dumps(payload, indent=indent, separators=separators)
925

926

927
# TODO Add sub-assembly (or sub-step) element.
928
# TODO Add a final preview element.
929
# TODO Add an "information" element (for facts about the set).
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc