/src/chunklet/code_chunker/code_chunker.py

"""
Author: Speedyk-005 | Copyright (c) 2025 | License: MIT

Language-Agnostic Code Chunking Utility

This module provides a robust, convention-aware engine for segmenting source code into
semantic units ("chunks") such as functions, classes, namespaces, and logical blocks.
Unlike purely heuristic or grammar-dependent parsers, the `CodeChunker` relies on
anchored, multi-language regex patterns and indentation rules to identify structures
consistently across a variety of programming languages.

Limitations
-----------
`CodeChunker` assumes syntactically conventional code. Highly obfuscated, minified,
or macro-generated sources may not fully respect its boundary patterns, though such
cases fall outside its intended domain.

Inspired by:
    - Camel.utils.chunker.CodeChunker (@ CAMEL-AI.org)
    - code-chunker by JimAiMoment
    - whats_that_code by matthewdeanmartin
    - CintraAI Code Chunker
"""

import sys
from pathlib import Path
from typing import Any, Literal, Callable, Generator, Annotated
from functools import partial
from itertools import chain

from more_itertools import unique_everseen
from pydantic import Field
from box import Box

try:
    from charset_normalizer import from_path
    from littletree import Node
    import defusedxml.ElementTree as ET
except ImportError:
    from_path, Node, ET = None, None, None

from loguru import logger

from chunklet.base_chunker import BaseChunker
from chunklet.code_chunker._code_structure_extractor import CodeStructureExtractor
from chunklet.common.path_utils import is_path_like
from chunklet.common.batch_runner import run_in_batch
from chunklet.common.validation import validate_input, restricted_iterable
from chunklet.common.token_utils import count_tokens
from chunklet.exceptions import (
    InvalidInputError,
    MissingTokenCounterError,
    TokenLimitError,
)


class CodeChunker(BaseChunker):
    """
    Language-agnostic code chunking utility for semantic code segmentation.

    Extracts structural units (functions, classes, namespaces) from source code
    across multiple programming languages using pattern-based detection and
    token-aware segmentation.

    Key Features:
        - Cross-language support (Python, C/C++, Java, C#, JavaScript, Go, etc.)
        - Structural analysis with namespace hierarchy tracking
        - Configurable token limits with strict/lenient overflow handling
        - Flexible docstring and comment processing modes
        - Accurate line number preservation and source tracking
        - Parallel batch processing for multiple files
        - Comprehensive logging and progress tracking
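
    Example:
        A minimal usage sketch (illustrative only; the whitespace token
        counter and the file name are assumptions, not fixtures of this
        module):

            from pathlib import Path

            chunker = CodeChunker(token_counter=lambda s: len(s.split()))
            chunks = chunker.chunk(Path("example.py"), max_lines=30)
            for chunk in chunks:
                print(chunk.metadata.start_line, chunk.metadata.end_line)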
    """

    @validate_input
    def __init__(
        self,
        verbose: bool = False,
        token_counter: Callable[[str], int] | None = None,
    ):
        """
        Initialize the CodeChunker with optional token counter and verbosity control.

        Args:
            verbose (bool): Enable verbose logging.
            token_counter (Callable[[str], int] | None): Function that counts tokens in text.
                If None, a counter must be provided when calling the chunking methods.
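
        Example:
            Any ``Callable[[str], int]`` can serve as the counter. A
            tiktoken-based counter is sketched below as one assumed
            possibility (tiktoken is not a dependency of this module):

                import tiktoken

                enc = tiktoken.get_encoding("cl100k_base")
                chunker = CodeChunker(token_counter=lambda s: len(enc.encode(s)))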
        """
        self.token_counter = token_counter
        self._verbose = verbose
        self.extractor = CodeStructureExtractor(verbose=self._verbose)

    @property
    def verbose(self) -> bool:
        """Get the verbose setting."""
        return self._verbose

    @verbose.setter
    def verbose(self, value: bool) -> None:
        """Set the verbose setting and propagate it to the extractor."""
        self._verbose = value
        self.extractor.verbose = value

    def _merge_tree(self, relations_list: list[list]) -> str:
        """
        Merge multiple lists of parent-child relation dictionaries into a
        single tree, then return its string representation.

        Args:
            relations_list (list[list]): A list containing relation lists.

        Returns:
            str: The string representation of the tree.
        """
        if not relations_list:
            return "global"

        # Flatten the list of relation lists into a single iterable
        all_relations_flat = chain.from_iterable(relations_list)

        # Deduplicate relations by their sorted (key, value) pairs
        def relation_key(relation: dict):
            return tuple(sorted(relation.items()))

        unique_relations = list(unique_everseen(all_relations_flat, key=relation_key))

        if not unique_relations:
            return "global"

        merged_tree = Node.from_relations(unique_relations, root="global")

        return merged_tree.to_string()

    def _format_limit_msg(
        self,
        box_tokens: int,
        max_tokens: int,
        box_lines: int,
        max_lines: int,
        function_count: int,
        max_functions: int,
        content_preview: str,
    ) -> str:
        """
        Format a limit-exceeded error message, including only the limits that
        are not sys.maxsize.

        Args:
            box_tokens: Actual token count in the block.
            max_tokens: Maximum allowed tokens.
            box_lines: Actual line count in the block.
            max_lines: Maximum allowed lines.
            function_count: Actual function count in the block.
            max_functions: Maximum allowed functions.
            content_preview: Preview of the content that exceeded the limits.

        Returns:
            Formatted error message with the applicable limits.
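
        Example:
            With only a token limit configured (the other limits left at
            ``sys.maxsize``), the message reports just the token overrun:

                msg = self._format_limit_msg(
                    box_tokens=150,
                    max_tokens=100,
                    box_lines=10,
                    max_lines=sys.maxsize,
                    function_count=1,
                    max_functions=sys.maxsize,
                    content_preview="def big(): ...",
                )
                # msg begins with:
                # "Structural block exceeds maximum limit (tokens: 150 > 100)."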
        """
        limits = []

        if max_tokens != sys.maxsize:
            limits.append(f"tokens: {box_tokens} > {max_tokens}")
        if max_lines != sys.maxsize:
            limits.append(f"lines: {box_lines} > {max_lines}")
        if max_functions != sys.maxsize:
            limits.append(f"functions: {function_count} > {max_functions}")

        limits_str = ", ".join(limits)

        return (
            f"Structural block exceeds maximum limit ({limits_str}).\n"
            f"Content starting with: \n```\n{content_preview}...\n```\n"
            "Reason: prevents splitting inside points of interest (function, class, region, ...)\n"
            "💡Hint: Consider increasing 'max_tokens', 'max_lines', or 'max_functions', "
            "refactoring the oversized block, or setting 'strict=False' to allow automatic splitting of oversized blocks."
        )

    def _split_oversized(
        self,
        snippet_dict: dict,
        max_tokens: int,
        max_lines: int,
        source: str | Path,
        token_counter: Callable | None,
        cumulative_lengths: tuple[int, ...],
    ):
        """
        Split an oversized structural block into smaller sub-chunks.

        This helper is used when a single code block exceeds the maximum
        token limit and `strict_mode` is disabled. It divides the block's
        content into token-bounded fragments while preserving line order
        and basic metadata.

        Args:
            snippet_dict (dict): The oversized snippet to split.
            max_tokens (int): Maximum tokens per sub-chunk.
            max_lines (int): Maximum lines per sub-chunk.
            source (str | Path): The source of the code.
            token_counter (Callable | None): The token counting function.
            cumulative_lengths (tuple[int, ...]): The cumulative character lengths of the lines in the source code.

        Returns:
            list[Box]: A list of sub-chunks derived from the original block.
        """
        sub_boxes = []
        curr_chunk = []
        token_count = 0
        line_count = 0

        # Iterate through each line in the snippet_dict content
        for line_no, line in enumerate(
            snippet_dict["content"].splitlines(), start=snippet_dict["start_line"]
        ):
            line_tokens = (
                count_tokens(line, token_counter) if max_tokens != sys.maxsize else 0
            )

            # If adding this line would exceed either max_tokens or max_lines, commit current chunk
            if (token_count + line_tokens > max_tokens) or (line_count + 1 > max_lines):
                start_line = line_no - len(curr_chunk)
                end_line = line_no - 1
                start_span = (
                    0 if start_line == 1 else cumulative_lengths[start_line - 2]
                )
                end_span = cumulative_lengths[end_line - 1]
                tree = Node.from_relations(snippet_dict["relations"]).to_string()
                sub_boxes.append(
                    Box(
                        {
                            "content": "\n".join(curr_chunk),
                            "metadata": {
                                "tree": tree,
                                "start_line": start_line,
                                "end_line": end_line,
                                "span": (start_span, end_span),
                                "source": (
                                    str(source)
                                    if isinstance(source, Path)
                                    or (
                                        isinstance(source, str) and is_path_like(source)
                                    )
                                    else "N/A"
                                ),
                            },
                        }
                    )
                )
                # Start a fresh chunk; the overflow line itself is appended
                # below, so it must not be pre-seeded here (doing so would
                # duplicate it and skew the start_line arithmetic).
                curr_chunk = []
                token_count = 0
                line_count = 0

            curr_chunk.append(line)
            token_count += line_tokens
            line_count += 1

        # Add any remaining chunk at the end
        if curr_chunk:
            start_line = snippet_dict["end_line"] - len(curr_chunk) + 1
            end_line = snippet_dict["end_line"]
            start_span = 0 if start_line == 1 else cumulative_lengths[start_line - 2]
            end_span = cumulative_lengths[end_line - 1]
            tree = Node.from_relations(snippet_dict["relations"]).to_string()
            sub_boxes.append(
                Box(
                    {
                        "content": "\n".join(curr_chunk),
                        "metadata": {
                            "tree": tree,
                            "start_line": start_line,
                            "end_line": end_line,
                            "span": (start_span, end_span),
                            "source": (
                                str(source)
                                if (isinstance(source, Path) or is_path_like(source))
                                else "N/A"
                            ),
                        },
                    }
                )
            )

        return sub_boxes

    def _group_by_chunk(
        self,
        snippet_dicts: list[dict],
        cumulative_lengths: tuple[int, ...],
        token_counter: Callable[[str], int] | None,
        max_tokens: int,
        max_lines: int,
        max_functions: int,
        strict: bool,
        source: str | Path,
    ) -> list[Box]:
        """
        Group code snippets into chunks based on specified constraints.

        Iteratively merges snippets into chunks while respecting token, line, and function limits.
        Handles oversized snippets by splitting them if strict mode is disabled.

        Args:
            snippet_dicts (list[dict]): List of extracted code snippet dictionaries.
            cumulative_lengths (tuple[int, ...]): Cumulative character lengths for span calculation.
            token_counter (Callable[[str], int] | None): Function to count tokens in text.
            max_tokens (int): Maximum tokens per chunk.
            max_lines (int): Maximum lines per chunk.
            max_functions (int): Maximum functions per chunk.
            strict (bool): If True, raise an error on oversized snippets; if False, split them.
            source (str | Path): Original source for metadata.

        Returns:
            list[Box]: List of chunk boxes with content and metadata.
        """
        merged_content = []
        relations_list = []
        start_line = None
        end_line = None
        token_count = 0
        line_count = 0
        function_count = 0
        result_chunks = []

        index = 0
        while index < len(snippet_dicts):
            snippet_dict = snippet_dicts[index]
            box_tokens = (
                count_tokens(snippet_dict["content"], token_counter)
                if max_tokens != sys.maxsize
                else 0
            )
            box_lines = snippet_dict["content"].count("\n") + (
                1 if snippet_dict["content"] else 0
            )
            is_function = bool(snippet_dict.get("func_partial_signature"))

            # Check if adding this snippet exceeds any limits
            token_limit_reached = token_count + box_tokens > max_tokens
            line_limit_reached = line_count + box_lines > max_lines
            function_limit_reached = is_function and (
                function_count + 1 > max_functions
            )

            if not (
                token_limit_reached or line_limit_reached or function_limit_reached
            ):
                # Fits: merge normally
                merged_content.append(snippet_dict["content"])
                relations_list.append(snippet_dict["relations"])
                token_count += box_tokens
                line_count += box_lines
                if is_function:
                    function_count += 1

                if start_line is None:
                    start_line = snippet_dict["start_line"]
                end_line = snippet_dict["end_line"]
                index += 1

            elif not merged_content:
                # Too big and nothing merged yet: handle oversize
                if strict:
                    raise TokenLimitError(
                        self._format_limit_msg(
                            box_tokens,
                            max_tokens,
                            box_lines,
                            max_lines,
                            function_count,
                            max_functions,
                            snippet_dict["content"][:100],
                        )
                    )
                else:  # Else split further
                    logger.warning(
                        "Splitting oversized block (tokens: {} lines: {}) into sub-chunks",
                        box_tokens,
                        box_lines,
                    )

                    sub_chunks = self._split_oversized(
                        snippet_dict,
                        max_tokens,
                        max_lines,
                        source,
                        token_counter,
                        cumulative_lengths,
                    )

                    for sub_chunk in sub_chunks:
                        sub_chunk.metadata.chunk_num = len(result_chunks) + 1
                        result_chunks.append(sub_chunk)
                    index += 1
            else:
                # Flush current merged content as a chunk
                start_span = (
                    0 if start_line == 1 else cumulative_lengths[start_line - 2]
                )
                end_span = cumulative_lengths[end_line - 1]
                merged_chunk = Box(
                    {
                        "content": "\n".join(merged_content),
                        "metadata": {
                            "chunk_num": len(result_chunks) + 1,
                            "tree": self._merge_tree(relations_list),
                            "start_line": start_line,
                            "end_line": end_line,
                            "span": (start_span, end_span),
                            "source": (
                                str(source)
                                if (isinstance(source, Path) or is_path_like(source))
                                else "N/A"
                            ),
                        },
                    }
                )
                result_chunks.append(merged_chunk)

                # Reset for next chunk
                merged_content.clear()
                relations_list.clear()
                start_line = None
                end_line = None
                token_count = 0
                line_count = 0
                function_count = 0

        # Flush remaining content
        if merged_content:
            start_span = 0 if start_line == 1 else cumulative_lengths[start_line - 2]
            end_span = cumulative_lengths[end_line - 1]
            merged_chunk = Box(
                {
                    "content": "\n".join(merged_content),
                    "metadata": {
                        "chunk_num": len(result_chunks) + 1,
                        "tree": self._merge_tree(relations_list),
                        "start_line": start_line,
                        "end_line": end_line,
                        "span": (start_span, end_span),
                        "source": (
                            str(source)
                            if isinstance(source, Path)
                            or (isinstance(source, str) and is_path_like(source))
                            else "N/A"
                        ),
                    },
                }
            )
            result_chunks.append(merged_chunk)

        return result_chunks

    def _validate_constraints(
        self,
        max_tokens: int | None,
        max_lines: int | None,
        max_functions: int | None,
        token_counter: Callable[[str], int] | None,
    ):
        """
        Validate that at least one chunking constraint is provided and that a
        token counter is available whenever a token limit is set.

        Args:
            max_tokens (int | None): Maximum number of tokens per chunk.
            max_lines (int | None): Maximum number of lines per chunk.
            max_functions (int | None): Maximum number of functions per chunk.
            token_counter (Callable[[str], int] | None): Function that counts tokens in text.

        Raises:
            InvalidInputError: If no chunking constraint is provided.
            MissingTokenCounterError: If `max_tokens` is provided but no token counter
                is available, neither as an argument nor on the instance.
        """
        if not any((max_tokens, max_lines, max_functions)):
            raise InvalidInputError(
                "At least one of 'max_tokens', 'max_lines', or 'max_functions' must be provided."
            )

        # A token counter is required whenever a token limit is set
        if max_tokens is not None and not (token_counter or self.token_counter):
            raise MissingTokenCounterError()

    @validate_input
    def chunk(
        self,
        source: str | Path,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
    ) -> list[Box]:
        """
        Extract semantic code chunks from source using multi-dimensional analysis.

        Processes source code by identifying structural boundaries (functions, classes,
        namespaces) and grouping content based on multiple constraints including
        tokens, lines, and logical units while preserving semantic coherence.

        Args:
            source (str | Path): Raw code string or file path to process.
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
            token_counter (Callable, optional): Token counting function. Uses the instance
                counter if None. Required for token-based chunking.
            include_comments (bool): Include comments in output chunks. Default: True.
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:
                - "summary": Include only the first line of each docstring
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all".
            strict (bool): If True, raise an error when structural blocks exceed
                max_tokens. If False, split oversized blocks. Default: True.

        Returns:
            list[Box]: List of code chunks. Each Box contains:
                - content (str): Code content
                - metadata (Box): Chunk metadata with:
                    - chunk_num (int): 1-based chunk number
                    - tree (str): Namespace hierarchy
                    - start_line (int): Starting line in the original source
                    - end_line (int): Ending line in the original source
                    - span (tuple[int, int]): Character-level span (start and end offsets) in the original source
                    - source (str): Source file path or "N/A"

        Raises:
            InvalidInputError: Invalid configuration parameters.
            MissingTokenCounterError: No token counter available.
            FileProcessingError: Source file cannot be read.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
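
        Example:
            A hedged sketch (the file name and the line limit are
            placeholders):

                chunks = chunker.chunk(Path("utils.py"), max_lines=40)
                total_lines = sum(
                    chunk.metadata.end_line - chunk.metadata.start_line + 1
                    for chunk in chunks
                )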
        """
        self._validate_constraints(max_tokens, max_lines, max_functions, token_counter)

        # Adjust limits for internal use
        if max_tokens is None:
            max_tokens = sys.maxsize
        if max_lines is None:
            max_lines = sys.maxsize
        if max_functions is None:
            max_functions = sys.maxsize

        token_counter = token_counter or self.token_counter

        if isinstance(source, str) and not source.strip():
            self.log_info("Input source is empty. Returning empty list.")
            return []

        self.log_info(
            "Starting chunk processing for {}",
            (
                f"source: {source}"
                if isinstance(source, Path)
                or (isinstance(source, str) and is_path_like(source))
                else f"code starting with:\n```\n{source[:100]}...\n```\n"
            ),
        )

        snippet_dicts, cumulative_lengths = self.extractor.extract_code_structure(
            source, include_comments, docstring_mode
        )

        result_chunks = self._group_by_chunk(
            snippet_dicts=snippet_dicts,
            cumulative_lengths=cumulative_lengths,
            token_counter=token_counter,
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            strict=strict,
            source=source,
        )

        self.log_info(
            "Generated {} chunk(s) for the {}",
            len(result_chunks),
            (
                f"source: {source}"
                if isinstance(source, Path)
                or (isinstance(source, str) and is_path_like(source))
                else f"code starting with:\n```\n{source[:100]}...\n```\n"
            ),
        )

        return result_chunks

    @validate_input
    def batch_chunk(
        self,
        sources: restricted_iterable(str | Path),
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        separator: Any = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
        show_progress: bool = True,
        on_errors: Literal["raise", "skip", "break"] = "raise",
    ) -> Generator[Box, None, None]:
        """
        Process multiple source files or code strings in parallel.

        Leverages multiprocessing to efficiently chunk multiple code sources,
        applying consistent chunking rules across all inputs.

        Args:
            sources (restricted_iterable[str | Path]): A restricted iterable of file paths or raw code strings to process.
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
            token_counter (Callable | None): Token counting function. Uses the instance
                counter if None. Required for token-based chunking.
            separator (Any): A value yielded after each source's chunks have been processed.
                Note: None cannot be used as a separator.
            include_comments (bool): Include comments in output chunks. Default: True.
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:
                - "summary": Include only the first line of each docstring
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all".
            strict (bool): If True, raise an error when structural blocks exceed
                max_tokens. If False, split oversized blocks. Default: True.
            n_jobs (int | None): Number of parallel workers. Uses all available CPUs if None.
            show_progress (bool): Display a progress bar during processing. Defaults to True.
            on_errors (Literal["raise", "skip", "break"]):
                How to handle errors during processing. Defaults to 'raise'.

        Yields:
            Box: A `Box` object representing a chunk with its content and metadata.
                Each Box contains:
                - content (str): Code content
                - metadata (Box): Chunk metadata with:
                    - tree (str): Namespace hierarchy
                    - start_line (int): Starting line in the original source
                    - end_line (int): Ending line in the original source
                    - span (tuple[int, int]): Character-level span (start and end offsets) in the original source
                    - source (str): Source file path or "N/A"

        Raises:
            InvalidInputError: Invalid input parameters.
            MissingTokenCounterError: No token counter available.
            FileProcessingError: Source file cannot be read.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
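
        Example:
            A hedged sketch (file names and the separator value are
            placeholders):

                for item in chunker.batch_chunk(
                    [Path("a.py"), Path("b.py")],
                    max_lines=40,
                    separator="<EOF>",
                ):
                    if item == "<EOF>":
                        print("--- end of one source ---")
                    else:
                        print(item.metadata.source, item.metadata.start_line)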
        """
        chunk_func = partial(
            self.chunk,
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )

        yield from run_in_batch(
            func=chunk_func,
            iterable_of_args=sources,
            iterable_name="sources",
            separator=separator,
            n_jobs=n_jobs,
            show_progress=show_progress,
            on_errors=on_errors,
            verbose=self.verbose,
        )