• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

speedyk-005 / chunklet-py / 22282014157

22 Feb 2026 05:38PM UTC coverage: 87.005%. First build
22282014157

Pull #12

github

web-flow
Merge 9cbe634aa into 83dda3c2e
Pull Request #12: # v2.2.0: The Unification Edition

285 of 336 new or added lines in 24 files covered. (84.82%)

1406 of 1616 relevant lines covered (87.0%)

4.35 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.75
/src/chunklet/code_chunker/code_chunker.py
1
"""
2
Author: Speedyk-005 | Copyright (c) 2025 | License: MIT
3

4
Language-Agnostic Code Chunking Utility
5

6
This module provides a robust, convention-aware engine for segmenting source code into
7
semantic units ("chunks") such as functions, classes, namespaces, and logical blocks.
8

9
The chunking process uses pattern-based line-by-line processing to identify code structures
10
and track context through indentation levels, enabling accurate detection of nested structures
11
while respecting language-specific syntax.
12

13
Limitations
14
-----------
15
`CodeChunker` assumes syntactically conventional code. Highly obfuscated, minified,
16
or macro-generated sources may not fully respect its boundary patterns, though such
17
cases fall outside its intended domain.
18

19
Inspired by:
20
    - Camel.utils.chunker.CodeChunker (@ CAMEL-AI.org)
21
    - code-chunker by JimAiMoment
22
    - whats_that_code by matthewdeanmartin
23
    - CintraAI Code Chunker
24
"""
25

26
import sys
5✔
27
from functools import partial
5✔
28
from itertools import chain
5✔
29
from pathlib import Path
5✔
30
from typing import Annotated, Any, Callable, Generator, Literal
5✔
31

32
from box import Box
5✔
33
from more_itertools import unique_everseen
5✔
34
from pydantic import Field
5✔
35

36
try:
5✔
37
    import defusedxml.ElementTree as ET
5✔
38
    from charset_normalizer import from_path
5✔
39
    from littletree import Node
5✔
40
except ImportError:
×
41
    from_path, Node, ET = None, None, None
×
42

43
from loguru import logger
5✔
44

45
from chunklet.base_chunker import BaseChunker
5✔
46
from chunklet.code_chunker._code_structure_extractor import CodeStructureExtractor
5✔
47
from chunklet.code_chunker.utils import is_python_code
5✔
48
from chunklet.common.batch_runner import run_in_batch
5✔
49
from chunklet.common.deprecation import deprecated_callable
5✔
50
from chunklet.common.logging_utils import log_info
5✔
51
from chunklet.common.path_utils import is_path_like, read_text_file
5✔
52
from chunklet.common.token_utils import count_tokens
5✔
53
from chunklet.common.validation import restricted_iterable, validate_input
5✔
54
from chunklet.exceptions import (
5✔
55
    InvalidInputError,
56
    MissingTokenCounterError,
57
    TokenLimitError,
58
)
59

60

61
class CodeChunker(BaseChunker):
5✔
62
    """
63
    Language-agnostic code chunking utility for semantic code segmentation.
64

65
    Extracts structural units (functions, classes, namespaces) from source code
66
    across multiple programming languages using pattern-based detection and
67
    token-aware segmentation.
68

69
    Key Features:
70
        - Cross-language support (Python, C/C++, Java, C#, JavaScript, Go, etc.)
71
        - Structural analysis with namespace hierarchy tracking
72
        - Configurable token limits with strict/lenient overflow handling
73
        - Flexible docstring and comment processing modes
74
        - Accurate line number preservation and source tracking
75
        - Parallel batch processing for multiple files
76
        - Comprehensive logging and progress tracking
77
    """
78

79
    @validate_input
    def __init__(
        self,
        verbose: bool = False,
        token_counter: Callable[[str], int] | None = None,
    ):
        """
        Create a CodeChunker.

        Args:
            verbose (bool): When True, emit verbose log output.
            token_counter (Callable[[str], int] | None): Optional function used
                to count tokens in text. When omitted here, a counter must be
                supplied to the chunk() methods that need token limits.
        """
        self.token_counter = token_counter
        self._verbose = verbose
        # The extractor mirrors our verbosity; the `verbose` setter keeps
        # the two in sync after construction.
        self.extractor = CodeStructureExtractor(verbose=self._verbose)
96

97
    @property
    def verbose(self) -> bool:
        """Whether verbose logging is currently enabled."""
        return self._verbose
101

102
    @verbose.setter
    def verbose(self, value: bool) -> None:
        """Update verbosity, propagating the new value to the extractor."""
        self._verbose = value
        self.extractor.verbose = value
107

108
    def _merge_tree(self, relations_list: list[list]) -> str:
        """
        Combine several parent-child relation lists into one tree and render it.

        Args:
            relations_list (list[list]): A list of relation lists, one per
                snippet.

        Returns:
            str: The string rendering of the merged tree, or "global" when
                there is nothing to merge.
        """
        if not relations_list:
            return "global"

        # Deduplicate relations while preserving first-seen order, keying each
        # relation dict by its sorted (key, value) pairs so equal dicts with
        # different insertion orders collapse to one entry.
        seen_keys = set()
        deduped = []
        for relation in chain.from_iterable(relations_list):
            key = tuple(sorted(relation.items()))
            if key not in seen_keys:
                seen_keys.add(key)
                deduped.append(relation)

        if not deduped:
            return "global"

        return Node.from_relations(deduped, root="global").to_string()
137

138
    def _split_oversized(
        self,
        snippet_dict: dict,
        max_tokens: int,
        max_lines: int,
        source: str | Path,
        token_counter: Callable | None,
        cumulative_lengths: tuple[int, ...],
    ):
        """
        Split an oversized structural block into smaller sub-chunks.

        This helper is used when a single code block exceeds the maximum
        token limit and `strict_mode` is disabled. It divides the block's
        content into token-bounded fragments while preserving line order
        and basic metadata.

        Args:
            snippet_dict (dict): The oversized snippet to split.
            max_tokens (int): Maximum tokens per sub-chunk.
            max_lines (int): Maximum lines per sub-chunk.
            source (str | Path): The source of the code.
            token_counter (Callable | None): The token counting function.
            cumulative_lengths (tuple[int, ...]): The cumulative lengths of the lines in the source code.

        Returns:
            list[Box]: A list of sub-chunks derived from the original block.
        """
        # Loop invariants hoisted: the tree rendering and the normalized
        # source label are identical for every sub-chunk.
        tree = Node.from_relations(snippet_dict["relations"]).to_string()
        source_label = (
            str(source)
            if (isinstance(source, Path) or is_path_like(source))
            else "N/A"
        )

        def build_box(lines: list[str], start_line: int, end_line: int) -> Box:
            # One sub-chunk with its line range and character-span metadata.
            return Box(
                {
                    "content": "\n".join(lines),
                    "metadata": {
                        "tree": tree,
                        "start_line": start_line,
                        "end_line": end_line,
                        "span": (
                            cumulative_lengths[start_line - 1],
                            cumulative_lengths[end_line],
                        ),
                        "source": source_label,
                    },
                }
            )

        sub_boxes = []
        curr_chunk = []
        token_count = 0
        line_count = 0

        # Iterate through each line in the snippet_dict content
        for line_no, line in enumerate(
            snippet_dict["content"].splitlines(), start=snippet_dict["start_line"]
        ):
            line_tokens = (
                count_tokens(line, token_counter) if max_tokens != sys.maxsize else 0
            )

            # If adding this line would exceed either max_tokens or max_lines,
            # commit the current chunk first. The `curr_chunk` guard avoids
            # emitting an empty chunk when a single line alone exceeds limits.
            if curr_chunk and (
                token_count + line_tokens > max_tokens
                or line_count + 1 > max_lines
            ):
                sub_boxes.append(
                    build_box(curr_chunk, line_no - len(curr_chunk), line_no - 1)
                )
                # BUGFIX: the overflow line used to be seeded here via
                # `curr_chunk = [line]` AND appended again below, duplicating
                # the first line of every continuation chunk. Reset empty and
                # let the shared append below add it exactly once.
                curr_chunk = []
                token_count = 0
                line_count = 0

            curr_chunk.append(line)
            token_count += line_tokens
            line_count += 1

        # Add any remaining chunk at the end
        if curr_chunk:
            end_line = snippet_dict["end_line"]
            sub_boxes.append(
                build_box(curr_chunk, end_line - len(curr_chunk) + 1, end_line)
            )

        return sub_boxes
242

243
    def _format_limit_msg(
5✔
244
        self,
245
        box_tokens: int,
246
        max_tokens: int,
247
        box_lines: int,
248
        max_lines: int,
249
        function_count: int,
250
        max_functions: int,
251
        content_preview: str,
252
    ) -> str:
253
        """
254
        Format a limit exceeded error message, only including limits that are not sys.maxsize.
255

256
        Args:
257
            box_tokens: Actual token count in the block
258
            max_tokens: Maximum allowed tokens
259
            box_lines: Actual line count in the block
260
            max_lines: Maximum allowed lines
261
            function_count: Actual function count in the block
262
            max_functions: Maximum allowed functions
263
            content_preview: Preview of the content that exceeded limits
264

265
        Returns:
266
            Formatted error message with applicable limits
267
        """
268
        limits = []
5✔
269

270
        if max_tokens != sys.maxsize:
5✔
271
            limits.append(f"tokens: {box_tokens} > {max_tokens}")
5✔
272
        if max_lines != sys.maxsize:
5✔
273
            limits.append(f"lines: {box_lines} > {max_lines}")
5✔
274
        if max_functions != sys.maxsize:
5✔
275
            limits.append(f"functions: {function_count} > {max_functions}")
5✔
276

277
        return (
5✔
278
            f"Limits: {', '.join(limits)}\n"
279
            f"Content starting with: \n```\n{content_preview}...\n```"
280
        )
281

282
    def _group_by_chunk(
        self,
        snippet_dicts: list[dict],
        cumulative_lengths: tuple[int, ...],
        token_counter: Callable[[str], int] | None,
        max_tokens: int,
        max_lines: int,
        max_functions: int,
        strict: bool,
        source: str | Path,
    ) -> list[Box]:
        """
        Group code snippets into chunks based on specified constraints.

        Iteratively merges snippets into chunks while respecting token, line, and function limits.
        Handles oversized snippets by splitting them if strict mode is disabled.

        Args:
            snippet_dicts (list[dict]): List of extracted code snippet dictionaries.
            cumulative_lengths (tuple[int, ...]): Cumulative character lengths for span calculation.
            token_counter (Callable[[str], int] | None): Function to count tokens in text.
            max_tokens (int): Maximum tokens per chunk.
            max_lines (int): Maximum lines per chunk.
            max_functions (int): Maximum functions per chunk.
            strict (bool): If True, raise error on oversized snippets; if False, split them.
            source (str | Path): Original source for metadata.

        Returns:
            list[Box]: List of chunk boxes with content and metadata.
        """
        source = (
            str(source) if (isinstance(source, Path) or is_path_like(source)) else "N/A"
        )

        merged_content = []
        relations_list = []
        start_line = None
        end_line = None
        token_count = 0
        line_count = 0
        function_count = 0
        result_chunks = []

        def flush_merged() -> None:
            # Emit the currently accumulated snippets as one chunk box.
            # Reads the accumulator state from the enclosing scope.
            start_span = cumulative_lengths[start_line - 1]
            end_span = cumulative_lengths[end_line]
            result_chunks.append(
                Box(
                    {
                        "content": "\n".join(merged_content),
                        "metadata": {
                            "chunk_num": len(result_chunks) + 1,
                            "tree": self._merge_tree(relations_list),
                            "start_line": start_line,
                            "end_line": end_line,
                            "span": (start_span, end_span),
                            "source": source,
                        },
                    }
                )
            )

        index = 0
        while index < len(snippet_dicts):
            snippet_dict = snippet_dicts[index]
            box_tokens = (
                count_tokens(snippet_dict["content"], token_counter)
                if max_tokens != sys.maxsize
                else 0
            )
            # Line count: newlines plus one for a non-empty trailing line.
            box_lines = snippet_dict["content"].count("\n") + bool(
                snippet_dict["content"]
            )
            is_function = bool(snippet_dict.get("func_partial_signature"))

            # Check if adding this snippet exceeds any limits
            token_limit_reached = token_count + box_tokens > max_tokens
            line_limit_reached = line_count + box_lines > max_lines
            function_limit_reached = is_function and (
                function_count + 1 > max_functions
            )

            if not (
                token_limit_reached or line_limit_reached or function_limit_reached
            ):
                # Fits: merge normally
                merged_content.append(snippet_dict["content"])
                relations_list.append(snippet_dict["relations"])
                token_count += box_tokens
                line_count += box_lines
                if is_function:
                    function_count += 1

                if start_line is None:
                    start_line = snippet_dict["start_line"]
                end_line = snippet_dict["end_line"]
                index += 1

            elif not merged_content:
                # Too big and nothing merged yet: handle oversize
                limit_msg = self._format_limit_msg(
                    box_tokens,
                    max_tokens,
                    box_lines,
                    max_lines,
                    function_count,
                    max_functions,
                    snippet_dict["content"][:100],
                )
                if strict:
                    raise TokenLimitError(
                        f"Structural block exceeds maximum limit.\n{limit_msg}\n"
                        "Reason: Prevent splitting inside interest points (function, class, region, ...)\n"
                        "💡Hint: Consider increasing 'max_tokens', 'max_lines', or 'max_functions', "
                        "refactoring the oversized block, or setting 'strict=False' to allow automatic splitting of oversized blocks."
                    )
                else:  # Else split further
                    # BUGFIX: loguru formats positional args with str.format-style
                    # braces, not printf "%s"; the old "%s" placeholder was left
                    # literal and limit_msg was silently dropped from the log.
                    logger.warning(
                        "Splitting oversized block into sub-chunks.\n({})",
                        limit_msg,
                    )

                    sub_chunks = self._split_oversized(
                        snippet_dict,
                        max_tokens,
                        max_lines,
                        source,
                        token_counter,
                        cumulative_lengths,
                    )

                    for sub_chunk in sub_chunks:
                        sub_chunk.metadata.chunk_num = len(result_chunks) + 1
                        result_chunks.append(sub_chunk)
                    index += 1
            else:
                # Flush current merged content as a chunk; the snippet that did
                # not fit is retried on the next iteration (index unchanged).
                flush_merged()

                # Reset for next chunk
                merged_content.clear()
                relations_list.clear()
                start_line = None
                end_line = None
                token_count = 0
                line_count = 0
                function_count = 0

        # Flush remaining content
        if merged_content:
            flush_merged()

        return result_chunks
446

447
    def _validate_constraints(
5✔
448
        self,
449
        max_tokens: int | None,
450
        max_lines: int | None,
451
        max_functions: int | None,
452
        token_counter: Callable[[str], int] | None,
453
    ):
454
        """
455
        Validates that at least one chunking constraint is provided and sets default values.
456

457
        Args:
458
            max_tokens (int | None): Maximum number of tokens per chunk.
459
            max_lines (int | None): Maximum number of lines per chunk.
460
            max_functions (int | None): Maximum number of functions per chunk.
461
            token_counter (Callable[[str], int] | None): Function that counts tokens in text.
462

463
        Raises:
464
            InvalidInputError: If no chunking constraints are provided.
465
            MissingTokenCounterError: If `max_tokens` is provided but no `token_counter` is provided.
466
        """
467
        if not any((max_tokens, max_lines, max_functions)):
5✔
468
            raise InvalidInputError(
5✔
469
                "At least one of 'max_tokens', 'max_lines', or 'max_functions' must be provided."
470
            )
471

472
        # If token_counter is required but not provided
473
        if max_tokens is not None and not (token_counter or self.token_counter):
5✔
474
            raise MissingTokenCounterError()
5✔
475

476
    @validate_input
    def chunk_text(
        self,
        code: str,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
    ) -> list[Box]:
        """
        Split a code string into semantic chunks.

        Detects structural boundaries (functions, classes, namespaces) and
        groups content under the given token/line/function constraints while
        keeping semantically related lines together.

        Args:
            code (str | Path): Raw code string or file path to process.
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
            token_counter (Callable, optional): Token counting function; falls
                back to the instance counter. Required for token-based chunking.
            include_comments (bool): Include comments in output chunks. Default: True.
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all".
            strict (bool): If True, raise error when structural blocks exceed
                max_tokens. If False, split oversized blocks. Default: True.

        Returns:
            list[Box]: List of code chunks with metadata. Each Box contains:

                - content (str): Code content
                - tree (str): Namespace hierarchy
                - start_line (int): Starting line in original source
                - end_line (int): Ending line in original source
                - span (tuple[int, int]): Character-level span (start and end offsets) in the original source.
                - source_path (str): "N/A"

        Raises:
            InvalidInputError: Invalid configuration parameters.
            MissingTokenCounterError: No token counter available.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        self._validate_constraints(max_tokens, max_lines, max_functions, token_counter)

        # Unset limits become "effectively unlimited" sentinels.
        max_tokens = sys.maxsize if max_tokens is None else max_tokens
        max_lines = sys.maxsize if max_lines is None else max_lines
        max_functions = sys.maxsize if max_functions is None else max_functions

        token_counter = token_counter or self.token_counter

        if not code.strip():
            log_info(self.verbose, "Input code is empty. Returning empty list.")
            return []

        log_info(
            self.verbose,
            "Starting chunk processing for code starting with:\n```\n{}...\n```",
            code[:100],
        )

        snippet_dicts, cumulative_lengths = self.extractor.extract_code_structure(
            code, include_comments, docstring_mode, is_python_code(code)
        )

        result_chunks = self._group_by_chunk(
            snippet_dicts=snippet_dicts,
            cumulative_lengths=cumulative_lengths,
            token_counter=token_counter,
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            strict=strict,
            source=code,
        )

        log_info(self.verbose, "Generated {} chunk(s) for the code", len(result_chunks))

        return result_chunks
567

568
    @validate_input
    def chunk_file(
        self,
        path: str | Path,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
    ) -> list[Box]:
        """
        Read a source file and split it into semantic chunks.

        Loads the file content, then delegates to :meth:`chunk_text` with the
        same constraints and options.

        Args:
            path (str | Path): File path to process.
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
            token_counter (Callable, optional): Token counting function; falls
                back to the instance counter. Required for token-based chunking.
            include_comments (bool): Include comments in output chunks. Default: True.
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all".
            strict (bool): If True, raise error when structural blocks exceed
                max_tokens. If False, split oversized blocks. Default: True.

        Returns:
            list[Box]: List of code chunks with metadata. Each Box contains:

                - content (str): Code content
                - tree (str): Namespace hierarchy
                - start_line (int): Starting line in original source
                - end_line (int): Ending line in original source
                - span (tuple[int, int]): Character-level span (start and end offsets) in the original source.
                - source_path (str): Source file path

        Raises:
            InvalidInputError: Invalid configuration parameters.
            MissingTokenCounterError: No token counter available.
            FileProcessingError: Source file cannot be read.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        source_path = Path(path)
        code = read_text_file(source_path)

        # Bail out early on files with no meaningful content.
        if not code.strip():
            log_info(self.verbose, "Input code is empty. Returning empty list.")
            return []

        log_info(self.verbose, "Starting chunk processing for file: {}", source_path)

        return self.chunk_text(
            code=code,
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )
640

641
    @validate_input
    def chunk_texts(
        self,
        codes: "restricted_iterable(str)",  # pyright: ignore
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        separator: Any = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
        show_progress: bool = True,
        on_errors: Literal["raise", "skip", "break"] = "raise",
    ) -> Generator[Box, None, None]:
        """
        Chunk multiple code strings in parallel.

        Fans the inputs out over worker processes, applying the same chunking
        configuration to every code string.

        Args:
            codes (restricted_iterable[str]): A restricted iterable of raw code strings.
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
            token_counter (Callable | None): Token counting function; falls back
                to the instance counter. Required for token-based chunking.
            separator (Any): A value to be yielded after the chunks of each text are processed.
                Note: None cannot be used as a separator.
            include_comments (bool): Include comments in output chunks. Default: True.
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all"
            strict (bool): If True, raise error when structural blocks exceed max_tokens. If False, split oversized blocks. Default: True.
            n_jobs (int | None): Number of parallel workers. Uses all available CPUs if None.
            show_progress (bool): Display progress bar during processing. Defaults to True.
            on_errors (Literal["raise", "skip", "break"]):
                How to handle errors during processing. Defaults to 'raise'.

        Yields:
            Box: A chunk with its content and metadata. Includes:

                - content (str): Code content
                - tree (str): Namespace hierarchy
                - start_line (int): Starting line in original source
                - end_line (int): Ending line in original source
                - span (tuple[int, int]): Character-level span (start and end offsets) in the original source.
                - source_path (str): "N/A"

        Raises:
            InvalidInputError: Invalid input parameters.
            MissingTokenCounterError: No token counter available.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        # Freeze the per-text configuration once; every worker call shares it.
        shared_options = {
            "max_tokens": max_tokens,
            "max_lines": max_lines,
            "max_functions": max_functions,
            "token_counter": token_counter or self.token_counter,
            "include_comments": include_comments,
            "docstring_mode": docstring_mode,
            "strict": strict,
        }
        chunk_func = partial(self.chunk_text, **shared_options)

        yield from run_in_batch(
            func=chunk_func,
            iterable_of_args=codes,
            iterable_name="codes",
            separator=separator,
            n_jobs=n_jobs,
            show_progress=show_progress,
            on_errors=on_errors,
            verbose=self.verbose,
        )
723

724
    @validate_input
    def chunk_files(
        self,
        paths: "restricted_iterable(str | Path)",  # pyright: ignore
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        separator: Any = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
        show_progress: bool = True,
        on_errors: Literal["raise", "skip", "break"] = "raise",
    ) -> Generator[Box, None, None]:
        """
        Chunk multiple source files in parallel.

        Each path is handed to `chunk_file` with the same chunking options,
        and the resulting chunks are streamed back as they are produced.

        Args:
            paths (restricted_iterable[str | Path]): A restricted iterable of file paths to process.
            max_tokens (int, optional): Maximum tokens per chunk. Must be >= 12.
            max_lines (int, optional): Maximum number of lines per chunk. Must be >= 5.
            max_functions (int, optional): Maximum number of functions per chunk. Must be >= 1.
            token_counter (Callable | None): Token counting function. Falls back to the
                instance counter when None. Required for token-based chunking.
            separator (Any): A value yielded after each file's chunks are exhausted.
                Note: None cannot be used as a separator.
            include_comments (bool): Include comments in output chunks. Default: True.
            docstring_mode (Literal["summary", "all", "excluded"]): Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings

                Defaults to "all".
            strict (bool): If True, raise error when structural blocks exceed max_tokens.
                If False, split oversized blocks. Default: True.
            n_jobs (int | None): Number of parallel workers. Uses all available CPUs if None.
            show_progress (bool): Display progress bar during processing. Defaults to True.
            on_errors (Literal["raise", "skip", "break"]):
                How to handle errors during processing. Defaults to 'raise'.

        Yields:
            Box: `Box` object, representing a chunk with its content and metadata.
                Includes:

                - content (str): Code content
                - tree (str): Namespace hierarchy
                - start_line (int): Starting line in original source
                - end_line (int): Ending line in original source
                - span (tuple[int, int]): Character-level span (start and end offsets) in the original source.
                - source_path (str): Source file path

        Raises:
            InvalidInputError: Invalid input parameters.
            MissingTokenCounterError: No token counter available.
            FileProcessingError: Source file cannot be read.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        # Freeze the per-file chunking options once; run_in_batch then applies
        # the resulting callable to every path in parallel.
        per_file_options = dict(
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )

        yield from run_in_batch(
            func=partial(self.chunk_file, **per_file_options),
            iterable_of_args=paths,
            iterable_name="paths",
            separator=separator,
            n_jobs=n_jobs,
            show_progress=show_progress,
            on_errors=on_errors,
            verbose=self.verbose,
        )
807

808
    @deprecated_callable(
        use_instead="chunk_text or chunk_file",
        deprecated_in="2.2.0",
        removed_in="3.0.0",
    )
    def chunk(
        self,
        source: str | Path,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
    ) -> list[Box]:
        """
        Chunk a single code source (raw string or file path) into semantic pieces.

        Note:
            Deprecated since v2.2.0. Will be removed in v3.0.0. Use `chunk_file` or `chunk_text` instead.
        """
        # Options forwarded unchanged to whichever concrete method handles the source.
        shared_options = dict(
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )

        # Dispatch: Path objects (and strings that look like paths) go through
        # the file reader; everything else is treated as raw code.
        looks_like_path = isinstance(source, Path) or (
            isinstance(source, str) and is_path_like(source)
        )
        if looks_like_path:
            return self.chunk_file(path=source, **shared_options)
        return self.chunk_text(code=source, **shared_options)
854

855
    @deprecated_callable(
        use_instead="chunk_texts or chunk_files",
        deprecated_in="2.2.0",
        removed_in="3.0.0",
    )
    def batch_chunk(
        self,
        sources: "restricted_iterable(str | Path)",  # pyright: ignore
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        separator: Any = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
        show_progress: bool = True,
        on_errors: Literal["raise", "skip", "break"] = "raise",
    ) -> Generator[Box, None, None]:
        """
        Batch chunk multiple code sources (raw code strings or file paths).

        Note:
            Deprecated since v2.2.0. Will be removed in v3.0.0. Use
            `chunk_texts` or `chunk_files` instead.
        """
        # Bind the shared chunking options once; `chunk` decides per source
        # whether it is a file path or raw code.
        shared_options = dict(
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )

        yield from run_in_batch(
            func=partial(self.chunk, **shared_options),
            iterable_of_args=sources,
            iterable_name="sources",
            separator=separator,
            n_jobs=n_jobs,
            show_progress=show_progress,
            on_errors=on_errors,
            verbose=self.verbose,
        )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc