• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

speedyk-005 / chunklet-py / 22647183911

03 Mar 2026 11:15PM UTC coverage: 90.659% (-0.01%) from 90.671%
22647183911

Pull #14

github

web-flow
Merge ab58ffaaf into 4c6b47c93
Pull Request #14: Refactor method ordering to follow Step-down Rule

378 of 397 new or added lines in 7 files covered. (95.21%)

2 existing lines in 2 files now uncovered.

1349 of 1488 relevant lines covered (90.66%)

4.53 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.24
/src/chunklet/code_chunker/code_chunker.py
1
"""
2
Author: Speedyk-005 | Copyright (c) 2025 | License: MIT
3

4
Language-Agnostic Code Chunking Utility
5

6
This module provides a robust, convention-aware engine for segmenting source code into
7
semantic units ("chunks") such as functions, classes, namespaces, and logical blocks.
8

9
The chunking process uses pattern-based line-by-line processing to identify code structures
10
and track context through indentation levels, enabling accurate detection of nested structures
11
while respecting language-specific syntax.
12

13
Limitations
14
-----------
15
`CodeChunker` assumes syntactically conventional code. Highly obfuscated, minified,
16
or macro-generated sources may not fully respect its boundary patterns, though such
17
cases fall outside its intended domain.
18

19
Inspired by:
20
    - Camel.utils.chunker.CodeChunker (@ CAMEL-AI.org)
21
    - code-chunker by JimAiMoment
22
    - whats_that_code by matthewdeanmartin
23
    - CintraAI Code Chunker
24
"""
25

26
import sys
5✔
27
from functools import partial
5✔
28
from itertools import chain
5✔
29
from pathlib import Path
5✔
30
from typing import Annotated, Any, Callable, Generator, Literal
5✔
31

32
from box import Box
5✔
33
from more_itertools import unique_everseen
5✔
34
from pydantic import Field
5✔
35

36
try:
5✔
37
    import defusedxml.ElementTree as ET
5✔
38
    from charset_normalizer import from_path
5✔
39
    from littletree import Node
5✔
40
except ImportError:  # pragma: no cover
41
    from_path, Node, ET = None, None, None
42

43
from loguru import logger
5✔
44

45
from chunklet.base_chunker import BaseChunker
5✔
46
from chunklet.code_chunker._code_structure_extractor import CodeStructureExtractor
5✔
47
from chunklet.code_chunker.utils import is_python_code
5✔
48
from chunklet.common.batch_runner import run_in_batch
5✔
49
from chunklet.common.deprecation import deprecated_callable
5✔
50
from chunklet.common.logging_utils import log_info
5✔
51
from chunklet.common.path_utils import is_path_like, read_text_file
5✔
52
from chunklet.common.token_utils import count_tokens
5✔
53
from chunklet.common.validation import restricted_iterable, validate_input
5✔
54
from chunklet.exceptions import (
5✔
55
    InvalidInputError,
56
    MissingTokenCounterError,
57
    TokenLimitError,
58
)
59

60

61
class CodeChunker(BaseChunker):
5✔
62
    """
63
    Language-agnostic code chunking utility for semantic code segmentation.
64

65
    Extracts structural units (functions, classes, namespaces) from source code
66
    across multiple programming languages using pattern-based detection and
67
    token-aware segmentation.
68

69
    Key Features:
70
        - Cross-language support (Python, C/C++, Java, C#, JavaScript, Go, etc.)
71
        - Structural analysis with namespace hierarchy tracking
72
        - Configurable token limits with strict/lenient overflow handling
73
        - Flexible docstring and comment processing modes
74
        - Accurate line number preservation and source tracking
75
        - Parallel batch processing for multiple files
76
        - Comprehensive logging and progress tracking
77
    """
78

79
    @validate_input
    def __init__(
        self,
        verbose: bool = False,
        token_counter: Callable[[str], int] | None = None,
    ):
        """
        Create a CodeChunker instance.

        Args:
            verbose (bool): When True, emit informational log messages.
            token_counter (Callable[[str], int] | None): Default function used to
                count tokens in text. If omitted here, one must be supplied to the
                chunking methods whenever a token-based limit is requested.
        """
        self._verbose = verbose
        self.token_counter = token_counter
        # The extractor mirrors this instance's verbosity (kept in sync by the
        # `verbose` setter).
        self.extractor = CodeStructureExtractor(verbose=verbose)
96

97
    @property
    def verbose(self) -> bool:
        """Whether verbose logging is currently enabled."""
        return self._verbose

    @verbose.setter
    def verbose(self, value: bool) -> None:
        """Update the verbose flag, keeping the extractor's setting in sync."""
        self._verbose = value
        self.extractor.verbose = value
107

108
    @validate_input
    def chunk_text(
        self,
        code: str,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
    ) -> list[Box]:
        """
        Extract semantic code chunks from a source string.

        Identifies structural boundaries (functions, classes, namespaces) and
        groups content under the given token/line/function constraints while
        preserving semantic coherence.

        Args:
            code (str): Raw code string to process. A path-like string is
                recorded as the chunk source in the metadata.
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
            token_counter (Callable, optional): Token counting function. Falls back
                to the instance counter. Required for token-based chunking.
            include_comments (bool): Include comments in output chunks. Default: True.
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all".
            strict (bool): If True, raise when a structural block exceeds the
                limits; if False, split oversized blocks. Default: True.

        Returns:
            list[Box]: Chunk boxes. Each carries content plus metadata
            (tree, start_line, end_line, span, source).

        Raises:
            InvalidInputError: Invalid configuration parameters.
            MissingTokenCounterError: No token counter available.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        self._validate_constraints(max_tokens, max_lines, max_functions, token_counter)

        # Absent limits are treated as effectively unbounded.
        max_tokens = sys.maxsize if max_tokens is None else max_tokens
        max_lines = sys.maxsize if max_lines is None else max_lines
        max_functions = sys.maxsize if max_functions is None else max_functions

        token_counter = token_counter or self.token_counter

        if not code.strip():
            log_info(self.verbose, "Input code is empty. Returning empty list.")
            return []

        log_info(
            self.verbose,
            "Starting chunk processing for code starting with:\n```\n{}...\n```",
            code[:100],
        )

        snippet_dicts, cumulative_lengths = self.extractor.extract_code_structure(
            code, include_comments, docstring_mode, is_python_code(code)
        )

        result_chunks = self._group_by_chunk(
            snippet_dicts=snippet_dicts,
            cumulative_lengths=cumulative_lengths,
            token_counter=token_counter,
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            strict=strict,
            source=code,
        )

        log_info(self.verbose, "Generated {} chunk(s) for the code", len(result_chunks))

        return result_chunks
199

200
    def _validate_constraints(
        self,
        max_tokens: int | None,
        max_lines: int | None,
        max_functions: int | None,
        token_counter: Callable[[str], int] | None,
    ):
        """
        Validate that a usable combination of chunking constraints was supplied.

        Note: this method only validates; default values for absent limits are
        applied by the caller (`chunk_text`), not here.

        Args:
            max_tokens (int | None): Maximum number of tokens per chunk.
            max_lines (int | None): Maximum number of lines per chunk.
            max_functions (int | None): Maximum number of functions per chunk.
            token_counter (Callable[[str], int] | None): Function that counts tokens in text.

        Raises:
            InvalidInputError: If no chunking constraint is provided.
            MissingTokenCounterError: If `max_tokens` is provided but no token
                counter is available (neither passed in nor set on the instance).
        """
        # Explicit None checks instead of truthiness: None is the only way to
        # express "no constraint" (zero values are rejected upstream by the
        # `Field(ge=...)` validation on the public methods).
        if max_tokens is None and max_lines is None and max_functions is None:
            raise InvalidInputError(
                "At least one of 'max_tokens', 'max_lines', or 'max_functions' must be provided."
            )

        # A token limit is meaningless without a way to count tokens.
        if max_tokens is not None and not (token_counter or self.token_counter):
            raise MissingTokenCounterError()
228

229
    def _group_by_chunk(
        self,
        snippet_dicts: list[dict],
        cumulative_lengths: tuple[int, ...],
        token_counter: Callable[[str], int] | None,
        max_tokens: int,
        max_lines: int,
        max_functions: int,
        strict: bool,
        source: str | Path,
    ) -> list[Box]:
        """
        Group code snippets into chunks based on the configured constraints.

        Iteratively merges snippets into chunks while respecting token, line, and
        function limits. A snippet that is too large to fit even in an empty chunk
        is either rejected (strict mode) or split into sub-chunks.

        Args:
            snippet_dicts (list[dict]): Extracted code snippet dictionaries.
            cumulative_lengths (tuple[int, ...]): Cumulative character lengths for span calculation.
            token_counter (Callable[[str], int] | None): Function to count tokens in text.
            max_tokens (int): Maximum tokens per chunk (sys.maxsize = unbounded).
            max_lines (int): Maximum lines per chunk (sys.maxsize = unbounded).
            max_functions (int): Maximum functions per chunk (sys.maxsize = unbounded).
            strict (bool): If True, raise on oversized snippets; if False, split them.
            source (str | Path): Original source for metadata.

        Returns:
            list[Box]: List of chunk boxes with content and metadata.

        Raises:
            TokenLimitError: An oversized structural block was found in strict mode.
        """
        source = (
            str(source) if (isinstance(source, Path) or is_path_like(source)) else "N/A"
        )

        merged_content = []
        relations_list = []
        start_line = None
        end_line = None
        token_count = 0
        line_count = 0
        function_count = 0
        result_chunks = []

        def flush_merged() -> None:
            """Emit the currently merged snippets as one chunk box."""
            start_span = cumulative_lengths[start_line - 1]
            end_span = cumulative_lengths[end_line]
            result_chunks.append(
                Box(
                    {
                        "content": "\n".join(merged_content),
                        "metadata": {
                            "chunk_num": len(result_chunks) + 1,
                            "tree": self._merge_tree(relations_list),
                            "start_line": start_line,
                            "end_line": end_line,
                            "span": (start_span, end_span),
                            "source": source,
                        },
                    }
                )
            )

        index = 0
        while index < len(snippet_dicts):
            snippet_dict = snippet_dicts[index]
            # Skip token counting entirely when no token limit is in effect.
            box_tokens = (
                count_tokens(snippet_dict["content"], token_counter)
                if max_tokens != sys.maxsize
                else 0
            )
            box_lines = snippet_dict["content"].count("\n") + bool(
                snippet_dict["content"]
            )
            is_function = bool(snippet_dict.get("func_partial_signature"))

            # Check if adding this snippet exceeds any limits
            token_limit_reached = token_count + box_tokens > max_tokens
            line_limit_reached = line_count + box_lines > max_lines
            function_limit_reached = is_function and (
                function_count + 1 > max_functions
            )

            if not (
                token_limit_reached or line_limit_reached or function_limit_reached
            ):
                # Fits: merge normally
                merged_content.append(snippet_dict["content"])
                relations_list.append(snippet_dict["relations"])
                token_count += box_tokens
                line_count += box_lines
                if is_function:
                    function_count += 1

                if start_line is None:
                    start_line = snippet_dict["start_line"]
                end_line = snippet_dict["end_line"]
                index += 1

            elif not merged_content:
                # Too big and nothing merged yet: handle oversize
                limit_msg = self._format_limit_msg(
                    box_tokens,
                    max_tokens,
                    box_lines,
                    max_lines,
                    function_count,
                    max_functions,
                    snippet_dict["content"][:100],
                )
                if strict:
                    raise TokenLimitError(
                        f"Structural block exceeds maximum limit.\n{limit_msg}\n"
                        "Reason: Prevent splitting inside interest points (function, class, region, ...)\n"
                        "💡Hint: Consider increasing 'max_tokens', 'max_lines', or 'max_functions', "
                        "refactoring the oversized block, or setting 'strict=False' to allow automatic splitting of oversized blocks."
                    )
                # BUG FIX: loguru formats messages with "{}" placeholders via
                # str.format, not printf-style "%s"; the original "%s" was never
                # interpolated and limit_msg was silently dropped.
                logger.warning(
                    "Splitting oversized block into sub-chunks.\n({})",
                    limit_msg,
                )

                sub_chunks = self._split_oversized(
                    snippet_dict,
                    max_tokens,
                    max_lines,
                    source,
                    token_counter,
                    cumulative_lengths,
                )

                for sub_chunk in sub_chunks:
                    sub_chunk.metadata.chunk_num = len(result_chunks) + 1
                    result_chunks.append(sub_chunk)
                index += 1
            else:
                # Flush current merged content as a chunk, then retry this
                # snippet against a fresh (empty) chunk on the next iteration.
                flush_merged()

                # Reset accumulators for the next chunk
                merged_content.clear()
                relations_list.clear()
                start_line = None
                end_line = None
                token_count = 0
                line_count = 0
                function_count = 0

        # Flush remaining content
        if merged_content:
            flush_merged()

        return result_chunks
393

394
    def _format_limit_msg(
        self,
        box_tokens: int,
        max_tokens: int,
        box_lines: int,
        max_lines: int,
        function_count: int,
        max_functions: int,
        content_preview: str,
    ) -> str:
        """
        Build a limit-exceeded message listing only the limits that are active.

        A limit equal to ``sys.maxsize`` means "unbounded" and is omitted.

        Args:
            box_tokens: Actual token count in the block.
            max_tokens: Maximum allowed tokens.
            box_lines: Actual line count in the block.
            max_lines: Maximum allowed lines.
            function_count: Actual function count in the block.
            max_functions: Maximum allowed functions.
            content_preview: Preview of the content that exceeded limits.

        Returns:
            Formatted message showing the applicable limits and a content preview.
        """
        # (label, actual, cap) triples; caps left at sys.maxsize were never set.
        checks = (
            ("tokens", box_tokens, max_tokens),
            ("lines", box_lines, max_lines),
            ("functions", function_count, max_functions),
        )
        limits = [
            f"{label}: {actual} > {cap}"
            for label, actual, cap in checks
            if cap != sys.maxsize
        ]

        return (
            f"Limits: {', '.join(limits)}\n"
            f"Content starting with: \n```\n{content_preview}...\n```"
        )
432

433
    def _split_oversized(
        self,
        snippet_dict: dict,
        max_tokens: int,
        max_lines: int,
        source: str | Path,
        token_counter: Callable | None,
        cumulative_lengths: tuple[int, ...],
    ):
        """
        Split an oversized structural block into smaller sub-chunks.

        Used when a single code block exceeds the maximum limits and strict mode
        is disabled. Divides the block's content line by line into bounded
        fragments while preserving line order and basic metadata.

        Args:
            snippet_dict (dict): The oversized snippet to split.
            max_tokens (int): Maximum tokens per sub-chunk (sys.maxsize = unbounded).
            max_lines (int): Maximum lines per sub-chunk (sys.maxsize = unbounded).
            source (str | Path): The source of the code.
            token_counter (Callable | None): The token counting function.
            cumulative_lengths (tuple[int, ...]): Cumulative line lengths of the source code.

        Returns:
            list[Box]: A list of sub-chunks derived from the original block.
        """
        # Normalize the source label once instead of per emitted sub-chunk.
        source_str = (
            str(source) if (isinstance(source, Path) or is_path_like(source)) else "N/A"
        )

        def build_box(start_line: int, end_line: int, lines: list[str]) -> Box:
            """Package a run of lines as a sub-chunk Box with span metadata."""
            return Box(
                {
                    "content": "\n".join(lines),
                    "metadata": {
                        "tree": Node.from_relations(
                            snippet_dict["relations"]
                        ).to_string(),
                        "start_line": start_line,
                        "end_line": end_line,
                        "span": (
                            cumulative_lengths[start_line - 1],
                            cumulative_lengths[end_line],
                        ),
                        "source": source_str,
                    },
                }
            )

        sub_boxes = []
        curr_chunk = []
        token_count = 0
        line_count = 0

        for line_no, line in enumerate(
            snippet_dict["content"].splitlines(), start=snippet_dict["start_line"]
        ):
            line_tokens = (
                count_tokens(line, token_counter) if max_tokens != sys.maxsize else 0
            )

            # Commit the accumulated chunk before a line that would overflow it.
            # The `curr_chunk` guard avoids emitting an empty sub-chunk when the
            # very first line already exceeds a limit on its own.
            if curr_chunk and (
                token_count + line_tokens > max_tokens or line_count + 1 > max_lines
            ):
                start_line = line_no - len(curr_chunk)
                end_line = line_no - 1
                sub_boxes.append(build_box(start_line, end_line, curr_chunk))
                # BUG FIX: the original seeded the new chunk with the overflow
                # line (`curr_chunk = [line]`) and then appended the same line
                # again below, duplicating it and corrupting the
                # len(curr_chunk)-based line accounting. Reset to empty; the
                # unconditional append below adds the line exactly once.
                curr_chunk = []
                token_count = 0
                line_count = 0

            curr_chunk.append(line)
            token_count += line_tokens
            line_count += 1

        # Add any remaining chunk at the end
        if curr_chunk:
            end_line = snippet_dict["end_line"]
            start_line = end_line - len(curr_chunk) + 1
            sub_boxes.append(build_box(start_line, end_line, curr_chunk))

        return sub_boxes
537

538
    def _merge_tree(self, relations_list: list[list]) -> str:
        """
        Merge several lists of parent-child relation dicts into one tree and
        return its string rendering.

        Relations are flattened, deduplicated (first occurrence wins), and
        attached under a shared "global" root.

        Args:
            relations_list (list[list]): A list containing relation lists.

        Returns:
            str: The string representation of the merged tree, or "global"
                when there are no relations at all.
        """
        if not relations_list:
            return "global"

        # Flatten and deduplicate, keeping first-seen order. Dicts are not
        # hashable, so a sorted item-tuple serves as the identity key.
        seen = set()
        deduped = []
        for relation in chain.from_iterable(relations_list):
            key = tuple(sorted(relation.items()))
            if key not in seen:
                seen.add(key)
                deduped.append(relation)

        if not deduped:
            return "global"

        return Node.from_relations(deduped, root="global").to_string()
567

568
    @validate_input
    def chunk_file(
        self,
        path: str | Path,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
    ) -> list[Box]:
        """
        Extract semantic code chunks from a source file.

        Reads the file, then delegates to :meth:`chunk_text` with identical
        constraint handling.

        Args:
            path (str | Path): File path to process.
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
            token_counter (Callable, optional): Token counting function. Falls back
                to the instance counter. Required for token-based chunking.
            include_comments (bool): Include comments in output chunks. Default: True.
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all".
            strict (bool): If True, raise when a structural block exceeds the
                limits; if False, split oversized blocks. Default: True.

        Returns:
            list[Box]: Chunk boxes with content and metadata (tree, start_line,
            end_line, span, source path).

        Raises:
            InvalidInputError: Invalid configuration parameters.
            MissingTokenCounterError: No token counter available.
            FileProcessingError: Source file cannot be read.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        file_path = Path(path)
        code = read_text_file(file_path)

        if not code.strip():
            log_info(self.verbose, "Input code is empty. Returning empty list.")
            return []

        log_info(self.verbose, "Starting chunk processing for file: {}", file_path)

        return self.chunk_text(
            code=code,
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )
640

641
    @validate_input
    def chunk_texts(
        self,
        codes: "restricted_iterable(str)",  # pyright: ignore
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        separator: Any = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
        show_progress: bool = True,
        on_errors: Literal["raise", "skip", "break"] = "raise",
    ) -> Generator[Box, None, None]:
        """
        Chunk multiple code strings in parallel.

        Applies the same chunking configuration to every input, distributing the
        work across processes via the batch runner.

        Args:
            codes (restricted_iterable[str]): A restricted iterable of raw code strings.
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
            token_counter (Callable | None): Token counting function. Falls back to
                the instance counter. Required for token-based chunking.
            separator (Any): A value yielded after each text's chunks are processed.
                Note: None cannot be used as a separator.
            include_comments (bool): Include comments in output chunks. Default: True.
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all".
            strict (bool): If True, raise when a structural block exceeds the
                limits; if False, split oversized blocks. Default: True.
            n_jobs (int | None): Number of parallel workers. Uses all available CPUs if None.
            show_progress (bool): Display progress bar during processing. Defaults to True.
            on_errors (Literal["raise", "skip", "break"]):
                How to handle errors during processing. Defaults to 'raise'.

        Yields:
            Box: A chunk with its content and metadata (tree, start_line,
            end_line, span, source).

        Raises:
            InvalidInputError: Invalid input parameters.
            MissingTokenCounterError: No token counter available.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        # Freeze the per-text configuration once; each batch item only varies
        # in the code string itself.
        shared_kwargs = dict(
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )
        chunk_one = partial(self.chunk_text, **shared_kwargs)

        yield from run_in_batch(
            func=chunk_one,
            iterable_of_args=codes,
            iterable_name="codes",
            separator=separator,
            n_jobs=n_jobs,
            show_progress=show_progress,
            on_errors=on_errors,
            verbose=self.verbose,
        )
723

724
    @validate_input
    def chunk_files(
        self,
        paths: "restricted_iterable(str | Path)",  # pyright: ignore
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        separator: Any = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
        show_progress: bool = True,
        on_errors: Literal["raise", "skip", "break"] = "raise",
    ) -> Generator[Box, None, None]:
        """
        Chunk several source files in parallel.

        Applies the same chunking configuration to every file, delegating the
        per-file work to ``chunk_file`` via a multiprocessing batch runner.

        Args:
            paths (restricted_iterable[str | Path]): File paths to process.
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
            max_lines (int, optional): Maximum lines per chunk. Must be >= 5.
            max_functions (int, optional): Maximum functions per chunk. Must be >= 1.
            token_counter (Callable | None): Token counting function. Falls back
                to the instance counter when None. Required for token-based chunking.
            separator (Any): A value yielded after each file's chunks are emitted.
                Note: None cannot be used as a separator.
            include_comments (bool): Include comments in output chunks. Default: True.
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all"
            strict (bool): If True, raise when a structural block exceeds
                max_tokens; if False, split oversized blocks. Default: True.
            n_jobs (int | None): Number of parallel workers. All available CPUs if None.
            show_progress (bool): Display a progress bar. Defaults to True.
            on_errors (Literal["raise", "skip", "break"]):
                Error-handling policy during processing. Defaults to 'raise'.

        Yields:
            Box: A chunk with its content and metadata:

                - content (str): Code content
                - tree (str): Namespace hierarchy
                - start_line (int): Starting line in original source
                - end_line (int): Ending line in original source
                - span (tuple[int, int]): Character-level start/end offsets in the source.
                - source_path (str): Source file path

        Raises:
            InvalidInputError: Invalid input parameters.
            MissingTokenCounterError: No token counter available.
            FileProcessingError: Source file cannot be read.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        # Shared per-file settings; the batch runner supplies each path.
        shared_settings = dict(
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )
        per_file = partial(self.chunk_file, **shared_settings)

        yield from run_in_batch(
            func=per_file,
            iterable_of_args=paths,
            iterable_name="paths",
            separator=separator,
            n_jobs=n_jobs,
            show_progress=show_progress,
            on_errors=on_errors,
            verbose=self.verbose,
        )
807

808
    @deprecated_callable(
        use_instead="chunk_text or chunk_file",
        deprecated_in="2.2.0",
        removed_in="3.0.0",
    )
    def chunk(  # pragma: no cover
        self,
        source: str | Path,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
    ) -> list[Box]:
        """
        Chunk a single code source (raw text or file path) into semantic pieces.

        Dispatches to ``chunk_file`` when *source* looks like a filesystem path,
        otherwise to ``chunk_text``.

        Note:
            Deprecated since v2.2.0. Will be removed in v3.0.0. Use `chunk_file`
            or `chunk_text` instead.
        """
        # Options forwarded verbatim to whichever method handles the source.
        forwarded = dict(
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )
        # A Path instance, or a string that looks like one, is treated as a file.
        looks_like_path = isinstance(source, Path) or (
            isinstance(source, str) and is_path_like(source)
        )
        if looks_like_path:
            return self.chunk_file(path=source, **forwarded)
        return self.chunk_text(code=source, **forwarded)
854

855
    @deprecated_callable(
        use_instead="chunk_texts or chunk_files",
        deprecated_in="2.2.0",
        removed_in="3.0.0",
    )
    def batch_chunk(  # pragma: no cover
        self,
        sources: "restricted_iterable(str | Path)",  # pyright: ignore
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        separator: Any = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
        show_progress: bool = True,
        on_errors: Literal["raise", "skip", "break"] = "raise",
    ) -> Generator[Box, None, None]:
        """
        Chunk multiple code sources (raw text or file paths) in parallel.

        Each source is dispatched through the deprecated ``chunk`` method,
        which decides per item whether it is a path or raw code.

        Note:
            Deprecated since v2.2.0. Will be removed in v3.0.0. Use `chunk_files`
            instead.
        """
        # One settings dict shared across every source in the batch.
        chunk_settings = dict(
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )

        yield from run_in_batch(
            func=partial(self.chunk, **chunk_settings),
            iterable_of_args=sources,
            iterable_name="sources",
            separator=separator,
            n_jobs=n_jobs,
            show_progress=show_progress,
            on_errors=on_errors,
            verbose=self.verbose,
        )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc