speedyk-005 / chunklet-py / 20357264638

19 Dec 2025 01:56AM UTC coverage: 81.036% (-0.3%) from 81.333%
Event: push | CI: github | Committer: speedyk-005
Commit: fix: Add python-multipart dependency for visualizer

1299 of 1603 relevant lines covered (81.04%)

4.05 hits per line

Source File: /src/chunklet/code_chunker/code_chunker.py (94.05% covered)
1
"""
2
Author: Speedyk-005 | Copyright (c) 2025 | License: MIT
3

4
Language-Agnostic Code Chunking Utility
5

6
This module provides a robust, convention-aware engine for segmenting source code into
7
semantic units ("chunks") such as functions, classes, namespaces, and logical blocks.
8
Unlike purely heuristic or grammar-dependent parsers, the `CodeChunker` relies on
9
anchored, multi-language regex patterns and indentation rules to identify structures
10
consistently across a variety of programming languages.
11

12
Limitations
13
-----------
14
`CodeChunker` assumes syntactically conventional code. Highly obfuscated, minified,
15
or macro-generated sources may not fully respect its boundary patterns, though such
16
cases fall outside its intended domain.
17

18
Inspired by:
19
    - Camel.utils.chunker.CodeChunker (@ CAMEL-AI.org)
20
    - code-chunker by JimAiMoment
21
    - whats_that_code by matthewdeanmartin
22
    - CintraAI Code Chunker
23
"""
24

25
import sys
5✔
26
from pathlib import Path
5✔
27
from typing import Any, Literal, Callable, Generator, Annotated
5✔
28
from functools import partial
5✔
29
from itertools import chain
5✔
30

31
from more_itertools import unique_everseen
5✔
32
from pydantic import Field
5✔
33
from box import Box
5✔
34

35
try:
5✔
36
    from charset_normalizer import from_path
5✔
37
    from littletree import Node
5✔
38
    import defusedxml.ElementTree as ET
5✔
39
except ImportError:
×
40
    from_path, Node, ET = None, None, None
×
41

42
from loguru import logger
5✔
43

44
from chunklet.base_chunker import BaseChunker
5✔
45
from chunklet.code_chunker._code_structure_extractor import CodeStructureExtractor
5✔
46
from chunklet.common.path_utils import is_path_like
5✔
47
from chunklet.common.batch_runner import run_in_batch
5✔
48
from chunklet.common.validation import validate_input, restricted_iterable
5✔
49
from chunklet.common.token_utils import count_tokens
5✔
50
from chunklet.exceptions import (
5✔
51
    InvalidInputError,
52
    MissingTokenCounterError,
53
    TokenLimitError,
54
)
55

56

57
class CodeChunker(BaseChunker):
5✔
58
    """
59
    Language-agnostic code chunking utility for semantic code segmentation.
60

61
    Extracts structural units (functions, classes, namespaces) from source code
62
    across multiple programming languages using pattern-based detection and
63
    token-aware segmentation.
64

65
    Key Features:
66
        - Cross-language support (Python, C/C++, Java, C#, JavaScript, Go, etc.)
67
        - Structural analysis with namespace hierarchy tracking
68
        - Configurable token limits with strict/lenient overflow handling
69
        - Flexible docstring and comment processing modes
70
        - Accurate line number preservation and source tracking
71
        - Parallel batch processing for multiple files
72
        - Comprehensive logging and progress tracking
73
    """
74

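    # Illustrative usage sketch (not part of the library source; the whitespace
    # token counter below is an assumed stand-in for any Callable[[str], int]):
    #
    #     chunker = CodeChunker(verbose=True, token_counter=lambda text: len(text.split()))
    #     chunks = chunker.chunk(Path("example.py"), max_tokens=200, strict=False)
    #     for chunk in chunks:
    #         print(chunk.metadata.start_line, chunk.metadata.end_line)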
75
    @validate_input
5✔
76
    def __init__(
5✔
77
        self,
78
        verbose: bool = False,
79
        token_counter: Callable[[str], int] | None = None,
80
    ):
81
        """
82
        Initialize the CodeChunker with optional token counter and verbosity control.
83

84
        Args:
85
            verbose (bool): Enable verbose logging.
86
            token_counter (Callable[[str], int] | None): Function that counts tokens in text.
87
                If None, must be provided when calling chunk() methods.
88
        """
89
        self.token_counter = token_counter
5✔
90
        self._verbose = verbose
5✔
91
        self.extractor = CodeStructureExtractor(verbose=self._verbose)
5✔
92

93
    @property
5✔
94
    def verbose(self) -> bool:
5✔
95
        """Get the verbose setting."""
96
        return self._verbose
5✔
97

98
    @verbose.setter
5✔
99
    def verbose(self, value: bool) -> None:
5✔
100
        """Set the verbose setting and propagate to the extractor."""
101
        self._verbose = value
×
102
        self.extractor.verbose = value
×
103

104
    def _merge_tree(self, relations_list: list[list]) -> str:
5✔
105
        """
106
        Merges multiple sets of parent-child relation dictionaries into a single tree,
107
        then returns its string representation.
108

109
        Args:
110
            relations_list (list[list]): A list containing relation lists.
111

112
        Returns:
113
            str: The string representation of the tree
114
        """
115
        if not relations_list:
5✔
116
            return "global"
×
117

118
        # Flatten the set of lists into a single iterable
119
        all_relations_flat = chain.from_iterable(relations_list)
5✔
120

121
        # Deduplicate relations
122
        def relation_key(relation: dict):
5✔
123
            return tuple(sorted(relation.items()))
5✔
124

125
        unique_relations = list(unique_everseen(all_relations_flat, key=relation_key))
5✔
126

127
        if not unique_relations:
5✔
128
            return "global"
5✔
129

130
        merged_tree = Node.from_relations(unique_relations, root="global")
5✔
131

132
        return merged_tree.to_string()
5✔
133

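    # Worked sketch of the merge step above (the "parent"/"child" key names are
    # an assumption about the relation dicts emitted by the extractor):
    #
    #     relations_list = [
    #         [{"parent": "global", "child": "Parser"}],
    #         [{"parent": "global", "child": "Parser"},   # duplicate, dropped
    #          {"parent": "Parser", "child": "parse"}],
    #     ]
    #     # Flattening and deduplicating leaves two unique relations, and
    #     # Node.from_relations(..., root="global") renders a tree roughly like:
    #     #     global
    #     #     └── Parser
    #     #         └── parse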
134
    def _format_limit_msg(
5✔
135
        self,
136
        box_tokens: int,
137
        max_tokens: int,
138
        box_lines: int,
139
        max_lines: int,
140
        function_count: int,
141
        max_functions: int,
142
        content_preview: str,
143
    ) -> str:
144
        """
145
        Format a limit exceeded error message, only including limits that are not sys.maxsize.
146

147
        Args:
148
            box_tokens: Actual token count in the block
149
            max_tokens: Maximum allowed tokens
150
            box_lines: Actual line count in the block
151
            max_lines: Maximum allowed lines
152
            function_count: Actual function count in the block
153
            max_functions: Maximum allowed functions
154
            content_preview: Preview of the content that exceeded limits
155

156
        Returns:
157
            Formatted error message with applicable limits
158
        """
159
        limits = []
5✔
160

161
        if max_tokens != sys.maxsize:
5✔
162
            limits.append(f"tokens: {box_tokens} > {max_tokens}")
5✔
163
        if max_lines != sys.maxsize:
5✔
164
            limits.append(f"lines: {box_lines} > {max_lines}")
×
165
        if max_functions != sys.maxsize:
5✔
166
            limits.append(f"functions: {function_count} > {max_functions}")
×
167

168
        if not limits:
5✔
169
            return "Block exceeds unspecified limits"
×
170

171
        limits_str = ", ".join(limits)
5✔
172

173
        return (
5✔
174
            f"Structural block exceeds maximum limit ({limits_str}).\n"
175
            f"Content starting with: \n```\n{content_preview}...\n```\n"
176
            "Reason: Prevent splitting inside interest points (function, class, region, ...)\n"
177
            "💡Hint: Consider increasing 'max_tokens', 'max_lines', or 'max_functions', "
178
            "refactoring the oversized block, or setting 'strict=False' to allow automatic splitting of oversized blocks."
179
        )
180

181
    def _split_oversized(
5✔
182
        self,
183
        snippet_dict: dict,
184
        max_tokens: int,
185
        max_lines: int,
186
        source: str | Path,
187
        token_counter: Callable | None,
188
        cumulative_lengths: tuple[int, ...],
189
    ):
190
        """
191
        Split an oversized structural block into smaller sub-chunks.
192

193
        This helper is used when a single code block exceeds the maximum
194
        token limit and `strict_mode` is disabled. It divides the block's
195
        content into token-bounded fragments while preserving line order
196
        and basic metadata.
197

198
        Args:
199
            snippet_dict (dict): The oversized snippet to split.
200
            max_tokens (int): Maximum tokens per sub-chunk.
201
            max_lines (int): Maximum lines per sub-chunk.
202
            source (str | Path): The source of the code.
203
            token_counter (Callable | None): The token counting function.
204
            cumulative_lengths (tuple[int, ...]): The cumulative lengths of the lines in the source code.
205

206
        Returns:
207
            list[Box]: A list of sub-chunks derived from the original block.
208
        """
209
        sub_boxes = []
5✔
210
        curr_chunk = []
5✔
211
        token_count = 0
5✔
212
        line_count = 0
5✔
213

214
        # Iterate through each line in the snippet_dict content
215
        for line_no, line in enumerate(
5✔
216
            snippet_dict["content"].splitlines(), start=snippet_dict["start_line"]
217
        ):
218
            line_tokens = (
5✔
219
                count_tokens(line, token_counter) if max_tokens != sys.maxsize else 0
220
            )
221

222
            # If adding this line would exceed either max_tokens or max_lines, commit current chunk
223
            if (token_count + line_tokens > max_tokens) or (line_count + 1 > max_lines):
5✔
224
                start_line = line_no - len(curr_chunk)
5✔
225
                end_line = line_no - 1
5✔
226
                start_span = (
5✔
227
                    0 if start_line == 1 else cumulative_lengths[start_line - 2]
228
                )
229
                end_span = cumulative_lengths[end_line - 1]
5✔
230
                tree = Node.from_relations(snippet_dict["relations"]).to_string()
5✔
231
                sub_boxes.append(
5✔
232
                    Box(
233
                        {
234
                            "content": "\n".join(curr_chunk),
235
                            "metadata": {
236
                                "tree": tree,
237
                                "start_line": start_line,
238
                                "end_line": end_line,
239
                                "span": (start_span, end_span),
240
                                "source": (
241
                                    str(source)
242
                                    if (isinstance(source, Path) or is_path_like(source))
243
                                    else "N/A"
244
                                ),
245
                            },
246
                        }
247
                    )
248
                )
249
                curr_chunk = []  # The overflow line is appended below with the rest
5✔
250
                token_count = 0
5✔
251
                line_count = 0
5✔
252

253
            curr_chunk.append(line)
5✔
254
            token_count += line_tokens
5✔
255
            line_count += 1
5✔
256

257
        # Add any remaining chunk at the end
258
        if curr_chunk:
5✔
259
            start_line = snippet_dict["end_line"] - len(curr_chunk) + 1
5✔
260
            end_line = snippet_dict["end_line"]
5✔
261
            start_span = 0 if start_line == 1 else cumulative_lengths[start_line - 2]
5✔
262
            end_span = cumulative_lengths[end_line - 1]
5✔
263
            tree = Node.from_relations(snippet_dict["relations"]).to_string()
5✔
264
            sub_boxes.append(
5✔
265
                Box(
266
                    {
267
                        "content": "\n".join(curr_chunk),
268
                        "metadata": {
269
                            "tree": tree,
270
                            "start_line": start_line,
271
                            "end_line": end_line,
272
                            "span": (start_span, end_span),
273
                            "source": (
274
                                str(source)
275
                                if (isinstance(source, Path) or is_path_like(source))
276
                                else "N/A"
277
                            ),
278
                        },
279
                    }
280
                )
281
            )
282

283
        return sub_boxes
5✔
284

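    # Worked sketch of the splitting above (values are illustrative;
    # `cumulative_lengths` stands for the tuple returned by the extractor):
    # a six-line block with max_lines=3 and no token limit yields two
    # sub-chunks covering lines 1-3 and 4-6.
    #
    #     snippet_dict = {
    #         "content": "a = 1\nb = 2\nc = 3\nd = 4\ne = 5\nf = 6",
    #         "start_line": 1,
    #         "end_line": 6,
    #         "relations": [{"parent": "global", "child": "module"}],
    #     }
    #     sub_boxes = CodeChunker()._split_oversized(
    #         snippet_dict, sys.maxsize, 3, "example.py", None, cumulative_lengths
    #     )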
285
    def _group_by_chunk(
5✔
286
        self,
287
        snippet_dicts: list[dict],
288
        cumulative_lengths: tuple[int, ...],
289
        token_counter: Callable[[str], int] | None,
290
        max_tokens: int,
291
        max_lines: int,
292
        max_functions: int,
293
        strict: bool,
294
        source: str | Path,
295
    ) -> list[Box]:
296
        """
297
        Group code snippets into chunks based on specified constraints.
298

299
        Iteratively merges snippets into chunks while respecting token, line, and function limits.
300
        Handles oversized snippets by splitting them if strict mode is disabled.
301

302
        Args:
303
            snippet_dicts (list[dict]): List of extracted code snippet dictionaries.
304
            cumulative_lengths (tuple[int, ...]): Cumulative character lengths for span calculation.
305
            token_counter (Callable[[str], int] | None): Function to count tokens in text.
306
            max_tokens (int): Maximum tokens per chunk.
307
            max_lines (int): Maximum lines per chunk.
308
            max_functions (int): Maximum functions per chunk.
309
            strict (bool): If True, raise error on oversized snippets; if False, split them.
310
            source (str | Path): Original source for metadata.
311

312
        Returns:
313
            list[Box]: List of chunk boxes with content and metadata.
314
        """
315
        merged_content = []
5✔
316
        relations_list = []
5✔
317
        start_line = None
5✔
318
        end_line = None
5✔
319
        token_count = 0
5✔
320
        line_count = 0
5✔
321
        function_count = 0
5✔
322
        result_chunks = []
5✔
323

324
        index = 0
5✔
325
        while index < len(snippet_dicts):
5✔
326
            snippet_dict = snippet_dicts[index]
5✔
327
            box_tokens = (
5✔
328
                count_tokens(snippet_dict["content"], token_counter)
329
                if max_tokens != sys.maxsize
330
                else 0
331
            )
332
            box_lines = snippet_dict["content"].count("\n") + (
5✔
333
                1 if snippet_dict["content"] else 0
334
            )
335
            is_function = bool(snippet_dict.get("func_partial_signature"))
5✔
336

337
            # Check if adding this snippet exceeds any limits
338
            token_limit_reached = token_count + box_tokens > max_tokens
5✔
339
            line_limit_reached = line_count + box_lines > max_lines
5✔
340
            function_limit_reached = is_function and (
5✔
341
                function_count + 1 > max_functions
342
            )
343

344
            if not (
5✔
345
                token_limit_reached or line_limit_reached or function_limit_reached
346
            ):
347
                # Fits: merge normally
348
                merged_content.append(snippet_dict["content"])
5✔
349
                relations_list.append(snippet_dict["relations"])
5✔
350
                token_count += box_tokens
5✔
351
                line_count += box_lines
5✔
352
                if is_function:
5✔
353
                    function_count += 1
5✔
354

355
                if start_line is None:
5✔
356
                    start_line = snippet_dict["start_line"]
5✔
357
                end_line = snippet_dict["end_line"]
5✔
358
                index += 1
5✔
359

360
            elif not merged_content:
5✔
361
                # Too big and nothing merged yet: handle oversize
362
                if strict:
5✔
363
                    raise TokenLimitError(
5✔
364
                        self._format_limit_msg(
365
                            box_tokens,
366
                            max_tokens,
367
                            box_lines,
368
                            max_lines,
369
                            function_count,
370
                            max_functions,
371
                            snippet_dict["content"][:100],
372
                        )
373
                    )
374
                else:  # Else split further
375
                    logger.warning(
5✔
376
                        "Splitting oversized block (tokens: {} lines: {}) into sub-chunks",
377
                        box_tokens,
378
                        box_lines,
379
                    )
380

381
                    sub_chunks = self._split_oversized(
5✔
382
                        snippet_dict,
383
                        max_tokens,
384
                        max_lines,
385
                        source,
386
                        token_counter,
387
                        cumulative_lengths,
388
                    )
389

390
                    for sub_chunk in sub_chunks:
5✔
391
                        sub_chunk.metadata.chunk_num = len(result_chunks) + 1
5✔
392
                        result_chunks.append(sub_chunk)
5✔
393
                    index += 1
5✔
394
            else:
395
                # Flush current merged content as a chunk
396
                start_span = (
5✔
397
                    0 if start_line == 1 else cumulative_lengths[start_line - 2]
398
                )
399
                end_span = cumulative_lengths[end_line - 1]
5✔
400
                merged_chunk = Box(
5✔
401
                    {
402
                        "content": "\n".join(merged_content),
403
                        "metadata": {
404
                            "chunk_num": len(result_chunks) + 1,
405
                            "tree": self._merge_tree(relations_list),
406
                            "start_line": start_line,
407
                            "end_line": end_line,
408
                            "span": (start_span, end_span),
409
                            "source": (
410
                                str(source)
411
                                if (isinstance(source, Path) or is_path_like(source))
412
                                else "N/A"
413
                            ),
414
                        },
415
                    }
416
                )
417
                result_chunks.append(merged_chunk)
5✔
418

419
                # Reset for next chunk
420
                merged_content.clear()
5✔
421
                relations_list.clear()
5✔
422
                start_line = None
5✔
423
                end_line = None
5✔
424
                token_count = 0
5✔
425
                line_count = 0
5✔
426
                function_count = 0
5✔
427

428
        # Flush remaining content
429
        if merged_content:
5✔
430
            start_span = 0 if start_line == 1 else cumulative_lengths[start_line - 2]
5✔
431
            end_span = cumulative_lengths[end_line - 1]
5✔
432
            merged_chunk = Box(
5✔
433
                {
434
                    "content": "\n".join(merged_content),
435
                    "metadata": {
436
                        "chunk_num": len(result_chunks) + 1,
437
                        "tree": self._merge_tree(relations_list),
438
                        "start_line": start_line,
439
                        "end_line": end_line,
440
                        "span": (start_span, end_span),
441
                        "source": (
442
                            str(source)
443
                            if (isinstance(source, Path) or is_path_like(source))
444
                            else "N/A"
445
                        ),
446
                    },
447
                }
448
            )
449
            result_chunks.append(merged_chunk)
5✔
450

451
        return result_chunks
5✔
452

453
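    # Behavioural sketch of the grouping above (`big_source` is a placeholder
    # for code containing a structural block longer than five lines):
    # strict=True raises, strict=False falls back to _split_oversized.
    #
    #     try:
    #         chunks = CodeChunker().chunk(big_source, max_lines=5, strict=True)
    #     except TokenLimitError:
    #         chunks = CodeChunker().chunk(big_source, max_lines=5, strict=False)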
    def _validate_constraints(
5✔
454
        self,
455
        max_tokens: int | None,
456
        max_lines: int | None,
457
        max_functions: int | None,
458
        token_counter: Callable[[str], int] | None,
459
    ):
460
        """
461
        Validates that at least one chunking constraint is provided and sets default values.
462

463
        Args:
464
            max_tokens (int | None): Maximum number of tokens per chunk.
465
            max_lines (int | None): Maximum number of lines per chunk.
466
            max_functions (int | None): Maximum number of functions per chunk.
467
            token_counter (Callable[[str], int] | None): Function that counts tokens in text.
468

469
        Raises:
470
            InvalidInputError: If no chunking constraints are provided.
471
            MissingTokenCounterError: If `max_tokens` is provided but no `token_counter` is provided.
472
        """
473
        if not any((max_tokens, max_lines, max_functions)):
5✔
474
            raise InvalidInputError(
5✔
475
                "At least one of 'max_tokens', 'max_lines', or 'max_functions' must be provided."
476
            )
477

478
        # If token_counter is required but not provided
479
        if max_tokens is not None and not (token_counter or self.token_counter):
5✔
480
            raise MissingTokenCounterError()
5✔
481

482
    @validate_input
5✔
483
    def chunk(
5✔
484
        self,
485
        source: str | Path,
486
        *,
487
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
488
        max_lines: Annotated[int | None, Field(ge=5)] = None,
489
        max_functions: Annotated[int | None, Field(ge=1)] = None,
490
        token_counter: Callable[[str], int] | None = None,
491
        include_comments: bool = True,
492
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
493
        strict: bool = True,
494
    ) -> list[Box]:
495
        """
496
        Extract semantic code chunks from source using multi-dimensional analysis.
497

498
        Processes source code by identifying structural boundaries (functions, classes,
499
        namespaces) and grouping content based on multiple constraints including
500
        tokens, lines, and logical units while preserving semantic coherence.
501

502
        Args:
503
            source (str | Path): Raw code string or file path to process.
504
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
505
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
506
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
507
            token_counter (Callable, optional): Token counting function. Uses instance
508
                counter if None. Required for token-based chunking.
509
            include_comments (bool): Include comments in output chunks. Default: True.
510
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:
511
                - "summary": Include only first line of docstrings
512
                - "all": Include complete docstrings
513
                - "excluded": Remove all docstrings
514
                Defaults to "all"
515
            strict (bool): If True, raise error when structural blocks exceed
516
                max_tokens. If False, split oversized blocks. Default: True.
517

518
        Returns:
519
            list[Box]: List of code chunks with metadata. Each Box contains:
520
                - content (str): Code content
521
                - tree (str): Namespace hierarchy
522
                - start_line (int): Starting line in original source
523
                - end_line (int): Ending line in original source
524
                - span (tuple[int, int]): Character-level span (start and end offsets) in the original source.
525
                - source (str): Source file path or "N/A"
526

527
        Raises:
528
            InvalidInputError: Invalid configuration parameters.
529
            MissingTokenCounterError: No token counter available.
530
            FileProcessingError: Source file cannot be read.
531
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
532
            CallbackError: If the token counter fails or returns an invalid type.
533
        """
534
        self._validate_constraints(max_tokens, max_lines, max_functions, token_counter)
5✔
535

536
        # Adjust limits for internal use
537
        if max_tokens is None:
5✔
538
            max_tokens = sys.maxsize
5✔
539
        if max_lines is None:
5✔
540
            max_lines = sys.maxsize
5✔
541
        if max_functions is None:
5✔
542
            max_functions = sys.maxsize
5✔
543

544
        token_counter = token_counter or self.token_counter
5✔
545

546
        if not source.strip():
5✔
547
            self.log_info("Input source is empty. Returning empty list.")
×
548
            return []
×
549

550
        self.log_info(
5✔
551
            "Starting chunk processing for {}",
552
            (
553
                f"source: {str(Path)}"
554
                if (isinstance(str, Path) or is_path_like(source))
555
                else f"code starting with:\n```\n{source[:100]}...\n```\n"
556
            ),
557
        )
558

559
        snippet_dicts, cumulative_lengths = self.extractor.extract_code_structure(
5✔
560
            source, include_comments, docstring_mode
561
        )
562

563
        result_chunks = self._group_by_chunk(
5✔
564
            snippet_dicts=snippet_dicts,
565
            cumulative_lengths=cumulative_lengths,
566
            token_counter=token_counter,
567
            max_tokens=max_tokens,
568
            max_lines=max_lines,
569
            max_functions=max_functions,
570
            strict=strict,
571
            source=source,
572
        )
573

574
        self.log_info(
5✔
575
            "Generated {} chunk(s) for the {}",
576
            len(result_chunks),
577
            (
578
                f"source: {str(Path)}"
579
                if (isinstance(str, Path) or is_path_like(source))
580
                else f"code starting with:\n```\n{source[:100]}...\n```\n"
581
            ),
582
        )
583

584
        return result_chunks
5✔
585

586
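    # Illustrative call of chunk() (the path and whitespace token counter are
    # assumed stand-ins; any Callable[[str], int] works):
    #
    #     code = Path("src/app.py").read_text()
    #     chunks = CodeChunker().chunk(
    #         code,
    #         max_tokens=128,
    #         token_counter=lambda text: len(text.split()),
    #         docstring_mode="summary",
    #         strict=False,
    #     )
    #     for chunk in chunks:
    #         print(chunk.metadata.chunk_num, chunk.metadata.span, chunk.content[:40])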
    @validate_input
5✔
587
    def batch_chunk(
5✔
588
        self,
589
        sources: restricted_iterable(str | Path),
590
        *,
591
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
592
        max_lines: Annotated[int | None, Field(ge=5)] = None,
593
        max_functions: Annotated[int | None, Field(ge=1)] = None,
594
        token_counter: Callable[[str], int] | None = None,
595
        separator: Any = None,
596
        include_comments: bool = True,
597
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
598
        strict: bool = True,
599
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
600
        show_progress: bool = True,
601
        on_errors: Literal["raise", "skip", "break"] = "raise",
602
    ) -> Generator[Box, None, None]:
603
        """
604
        Process multiple source files or code strings in parallel.
605

606
        Leverages multiprocessing to efficiently chunk multiple code sources,
607
        applying consistent chunking rules across all inputs.
608

609
        Args:
610
            sources (restricted_iterable[str | Path]): A restricted iterable of file paths or raw code strings to process.
611
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
612
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
613
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
614
            token_counter (Callable | None): Token counting function. Uses instance
615
                counter if None. Required for token-based chunking.
616
            separator (Any): A value to be yielded after the chunks of each text are processed.
617
                Note: None cannot be used as a separator.
618
            include_comments (bool): Include comments in output chunks. Default: True.
619
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:
620
                - "summary": Include only first line of docstrings
621
                - "all": Include complete docstrings
622
                - "excluded": Remove all docstrings
623
                Defaults to "all"
624
            strict (bool): If True, raise error when structural blocks exceed
625
                max_tokens. If False, split oversized blocks. Default: True.
626
            n_jobs (int | None): Number of parallel workers. Uses all available CPUs if None.
627
            show_progress (bool): Display progress bar during processing. Defaults to True.
628
            on_errors (Literal["raise", "skip", "break"]):
629
                How to handle errors during processing. Defaults to 'raise'.
630

631
        Yields:
632
            Box: `Box` object, representing a chunk with its content and metadata.
633
                Includes:
634
                - content (str): Code content
635
                - tree (str): Namespace hierarchy
636
                - start_line (int): Starting line in original source
637
                - end_line (int): Ending line in original source
638
                - span (tuple[int, int]): Character-level span (start and end offsets) in the original source.
639
                - source (str): Source file path or "N/A"
640

641
        Raises:
642
            InvalidInputError: Invalid input parameters.
643
            MissingTokenCounterError: No token counter available.
644
            FileProcessingError: Source file cannot be read.
645
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
646
            CallbackError: If the token counter fails or returns an invalid type.
647
        """
648
        chunk_func = partial(
5✔
649
            self.chunk,
650
            max_tokens=max_tokens,
651
            max_lines=max_lines,
652
            max_functions=max_functions,
653
            token_counter=token_counter or self.token_counter,
654
            include_comments=include_comments,
655
            docstring_mode=docstring_mode,
656
            strict=strict,
657
        )
658

659
        yield from run_in_batch(
5✔
660
            func=chunk_func,
661
            iterable_of_args=sources,
662
            iterable_name="sources",
663
            separator=separator,
664
            n_jobs=n_jobs,
665
            show_progress=show_progress,
666
            on_errors=on_errors,
667
            verbose=self.verbose,
668
        )
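
# Illustrative batch usage (file names and the sentinel separator are
# assumptions for this sketch):
#
#     chunker = CodeChunker(token_counter=lambda text: len(text.split()))
#     stream = chunker.batch_chunk(
#         [Path("a.py"), Path("b.py")],
#         max_tokens=256,
#         separator="<EOF>",
#         n_jobs=2,
#         on_errors="skip",
#     )
#     for item in stream:
#         if item == "<EOF>":
#             print("---- next source ----")
#         else:
#             print(item.metadata.source, item.metadata.start_line)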