• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

speedyk-005 / chunklet-py / 24798516591

22 Apr 2026 07:19PM UTC coverage: 90.606% (-0.2%) from 90.758%
24798516591

push

github

speedyk-005
refactor: remove redundant type hints from docstrings

- Strip (type) from Args/Returns where signature already has types
- Simplify Returns format to prose description
- Run clean_docstrings.py on src/chunklet (26 files)
- Add ExtractionState TypedDict for type safety (from earlier refactor)

1360 of 1501 relevant lines covered (90.61%)

3.62 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.24
/src/chunklet/code_chunker/code_chunker.py
1
"""
2
Author: Speedyk-005 | Copyright (c) 2025 | License: MIT
3

4
Language-Agnostic Code Chunking Utility
5

6
This module provides a robust, convention-aware engine for segmenting source code into
7
semantic units ("chunks") such as functions, classes, namespaces, and logical blocks.
8

9
The chunking process uses pattern-based line-by-line processing to identify code structures
10
and track context through indentation levels, enabling accurate detection of nested structures
11
while respecting language-specific syntax.
12

13
Limitations
14
-----------
15
`CodeChunker` assumes syntactically conventional code. Highly obfuscated, minified,
16
or macro-generated sources may not fully respect its boundary patterns, though such
17
cases fall outside its intended domain.
18

19
Inspired by:
20
    - Camel.utils.chunker.CodeChunker (@ CAMEL-AI.org)
21
    - code-chunker by JimAiMoment
22
    - whats_that_code by matthewdeanmartin
23
    - CintraAI Code Chunker
24
"""
25

26
import sys
4✔
27
from functools import partial
4✔
28
from itertools import chain
4✔
29
from pathlib import Path
4✔
30
from typing import Annotated, Any, Callable, Generator, Literal
4✔
31

32
from dotdict3 import DotDict
4✔
33
from more_itertools import unique_everseen
4✔
34
from pydantic import Field
4✔
35

36
try:
4✔
37
    import defusedxml.ElementTree as ET
4✔
38
    from charset_normalizer import from_path
4✔
39
    from littletree import Node
4✔
40
except ImportError:  # pragma: no cover
41
    from_path, Node, ET = None, None, None
42

43
from loguru import logger
4✔
44

45
from chunklet.base_chunker import BaseChunker
4✔
46
from chunklet.code_chunker._code_structure_extractor import CodeStructureExtractor
4✔
47
from chunklet.code_chunker.utils import is_python_code
4✔
48
from chunklet.common.batch_runner import run_in_batch
4✔
49
from chunklet.common.deprecation import deprecated_callable
4✔
50
from chunklet.common.logging_utils import log_info
4✔
51
from chunklet.common.path_utils import is_path_like, read_text_file
4✔
52
from chunklet.common.token_utils import count_tokens
4✔
53
from chunklet.common.validation import IterableOfStr, IterableOfPath, validate_input
4✔
54
from chunklet.exceptions import (
4✔
55
    InvalidInputError,
56
    MissingTokenCounterError,
57
    TokenLimitError,
58
)
59

60

61
class CodeChunker(BaseChunker):
4✔
62
    """
63
    Language-agnostic code chunking utility for semantic code segmentation.
64

65
    Extracts structural units (functions, classes, namespaces) from source code
66
    across multiple programming languages using pattern-based detection and
67
    token-aware segmentation.
68

69
    Key Features:
70
        - Cross-language support (Python, C/C++, Java, C#, JavaScript, Go, etc.)
71
        - Structural analysis with namespace hierarchy tracking
72
        - Configurable token limits with strict/lenient overflow handling
73
        - Flexible docstring and comment processing modes
74
        - Accurate line number preservation and source tracking
75
        - Parallel batch processing for multiple files
76
        - Comprehensive logging and progress tracking
77
    """
78

79
    @validate_input
4✔
80
    def __init__(
        self,
        verbose: bool = False,
        token_counter: Callable[[str], int] | None = None,
    ):
        """
        Set up the chunker's verbosity flag, token counter and structure extractor.

        Args:
            verbose: Enable verbose logging. The same flag is handed to the
                `CodeStructureExtractor` created here.
            token_counter: Function that counts tokens in text.
                May be None, in which case one has to be passed to the
                chunk methods whenever a token limit is used.
        """
        self._verbose = verbose
        self.token_counter = token_counter
        # The extractor gets the same verbosity as the chunker itself.
        self.extractor = CodeStructureExtractor(verbose=verbose)
4✔
96

97
    @property
4✔
98
    def verbose(self) -> bool:
4✔
99
        """Get the verbosity status."""
100
        return self._verbose
4✔
101

102
    @verbose.setter
4✔
103
    def verbose(self, value: bool) -> None:
4✔
104
        """Set the verbosity and propagate to the extractor."""
105
        self._verbose = value
×
106
        self.extractor.verbose = value
×
107

108
    def _merge_tree(self, relations_list: list[list]) -> str:
4✔
109
        """
110
        Merges multiple sets of parent-child relation dictionaries into a single tree
111
        then returns its string representation.
112

113
        Args:
114
            relations_list: A list containing relation lists.
115

116
        Returns:
117
            The string representation of the tree
118
        """
119
        if not relations_list:
4✔
120
            return "global"
×
121

122
        # Flatten the set of lists into a single iterable
123
        all_relations_flat = chain.from_iterable(relations_list)
4✔
124

125
        # Deduplicate relations
126
        def relation_key(relation: dict):
4✔
127
            return tuple(sorted(relation.items()))
4✔
128
        unique_relations = list(unique_everseen(all_relations_flat, key=relation_key))
4✔
129

130
        if not unique_relations:
4✔
131
            return "global"
4✔
132

133
        merged_tree = Node.from_relations(unique_relations, root="global")
4✔
134

135
        return merged_tree.to_string()
4✔
136

137
    def _split_oversized(
4✔
138
        self,
139
        snippet_dict: dict,
140
        max_tokens: int,
141
        max_lines: int,
142
        source: str | Path,
143
        token_counter: Callable | None,
144
        cumulative_lengths: tuple[int, ...],
145
    ):
146
        """
147
        Split an oversized structural block into smaller sub-chunks.
148

149
        This helper is used when a single code block exceeds the maximum
150
        token limit and `strict_mode` is disabled. It divides the block's
151
        content into token-bounded fragments while preserving line order
152
        and basic metadata.
153

154
        Args:
155
            snippet_dict: The oversized snippet to split.
156
            max_tokens: Maximum tokens per sub-chunk.
157
            max_lines: Maximum lines per sub-chunk.
158
            source: The source of the code.
159
            token_counter: The token counting function.
160
            cumulative_lengths: The cumulative lengths of the lines in the source code.
161

162
        Returns:
163
            A list of sub-chunks derived from the original block.
164
        """
165
        sub_chunks = []
4✔
166
        curr_chunk = []
4✔
167
        token_count = 0
4✔
168
        line_count = 0
4✔
169

170
        # Iterate through each line in the snippet_dict content
171
        for line_no, line in enumerate(
4✔
172
            snippet_dict["content"].splitlines(), start=snippet_dict["start_line"]
173
        ):
174
            line_tokens = (
4✔
175
                count_tokens(line, token_counter) if max_tokens != sys.maxsize else 0
176
            )
177

178
            # If adding this line would exceed either max_tokens or max_lines, commit current chunk
179
            if (token_count + line_tokens > max_tokens) or (line_count + 1 > max_lines):
4✔
180
                start_line = line_no - len(curr_chunk)
4✔
181
                end_line = line_no - 1
4✔
182
                start_span = cumulative_lengths[start_line - 1]
4✔
183
                end_span = cumulative_lengths[end_line]
4✔
184
                tree = Node.from_relations(snippet_dict["relations"]).to_string()
4✔
185
                sub_chunks.append(
4✔
186
                    DotDict(
187
                        {
188
                            "content": "\n".join(curr_chunk),
189
                            "metadata": {
190
                                "tree": tree,
191
                                "start_line": start_line,
192
                                "end_line": end_line,
193
                                "span": (start_span, end_span),
194
                                "source": (
195
                                    str(source)
196
                                    if isinstance(source, Path)
197
                                    or (
198
                                        isinstance(source, str) and is_path_like(source)
199
                                    )
200
                                    else "N/A"
201
                                ),
202
                            },
203
                        }
204
                    )
205
                )
206
                curr_chunk = [line]  # Add the overflow line!
4✔
207
                token_count = 0
4✔
208
                line_count = 0
4✔
209

210
            curr_chunk.append(line)
4✔
211
            token_count += line_tokens
4✔
212
            line_count += 1
4✔
213

214
        # Add any remaining chunk at the end
215
        if curr_chunk:
4✔
216
            start_line = snippet_dict["end_line"] - len(curr_chunk) + 1
4✔
217
            end_line = snippet_dict["end_line"]
4✔
218
            start_span = cumulative_lengths[start_line - 1]
4✔
219
            end_span = cumulative_lengths[end_line]
4✔
220
            tree = Node.from_relations(snippet_dict["relations"]).to_string()
4✔
221
            sub_chunks.append(
4✔
222
                DotDict(
223
                    {
224
                        "content": "\n".join(curr_chunk),
225
                        "metadata": {
226
                            "tree": tree,
227
                            "start_line": start_line,
228
                            "end_line": end_line,
229
                            "span": (start_span, end_span),
230
                            "source": (
231
                                str(source)
232
                                if (isinstance(source, Path) or is_path_like(source))
233
                                else "N/A"
234
                            ),
235
                        },
236
                    }
237
                )
238
            )
239

240
        return sub_chunks
4✔
241

242
    def _format_limit_msg(
4✔
243
        self,
244
        box_tokens: int,
245
        max_tokens: int,
246
        box_lines: int,
247
        max_lines: int,
248
        function_count: int,
249
        max_functions: int,
250
        content_preview: str,
251
    ) -> str:
252
        """
253
        Format a limit exceeded error message, only including limits that are not sys.maxsize.
254

255
        Args:
256
            box_tokens: Actual token count in the block
257
            max_tokens: Maximum allowed tokens
258
            box_lines: Actual line count in the block
259
            max_lines: Maximum allowed lines
260
            function_count: Actual function count in the block
261
            max_functions: Maximum allowed functions
262
            content_preview: Preview of the content that exceeded limits
263

264
        Returns:
265
            Formatted error message with applicable limits
266
        """
267
        limits = []
4✔
268

269
        if max_tokens != sys.maxsize:
4✔
270
            limits.append(f"tokens: {box_tokens} > {max_tokens}")
4✔
271
        if max_lines != sys.maxsize:
4✔
272
            limits.append(f"lines: {box_lines} > {max_lines}")
4✔
273
        if max_functions != sys.maxsize:
4✔
274
            limits.append(f"functions: {function_count} > {max_functions}")
4✔
275

276
        return (
4✔
277
            f"Limits: {', '.join(limits)}\n"
278
            f"Content starting with: \n```\n{content_preview}...\n```"
279
        )
280

281
    def _group_by_chunk(
4✔
282
        self,
283
        snippet_dicts: list[dict],
284
        cumulative_lengths: tuple[int, ...],
285
        token_counter: Callable[[str], int] | None,
286
        max_tokens: int,
287
        max_lines: int,
288
        max_functions: int,
289
        strict: bool,
290
        source: str | Path,
291
    ) -> list[DotDict]:
292
        """
293
        Group code snippets into chunks based on specified constraints.
294

295
        Iteratively merges snippets into chunks while respecting token, line, and function limits.
296
        Handles oversized snippets by splitting them if strict mode is disabled.
297

298
        Args:
299
            snippet_dicts: List of extracted code snippet dictionaries.
300
            cumulative_lengths: Cumulative character lengths for span calculation.
301
            token_counter: Function to count tokens in text.
302
            max_tokens: Maximum tokens per chunk.
303
            max_lines: Maximum lines per chunk.
304
            max_functions: Maximum functions per chunk.
305
            strict: If True, raise error on oversized snippets; if False, split them.
306
            source: Original source for metadata.
307

308
        Returns:
309
            List of chunks with content and metadata.
310
        """
311
        source = (
4✔
312
            str(source) if (isinstance(source, Path) or is_path_like(source)) else "N/A"
313
        )
314

315
        merged_content = []
4✔
316
        relations_list = []
4✔
317
        start_line = None
4✔
318
        end_line = None
4✔
319
        token_count = 0
4✔
320
        line_count = 0
4✔
321
        function_count = 0
4✔
322
        result_chunks = []
4✔
323

324
        index = 0
4✔
325
        while index < len(snippet_dicts):
4✔
326
            snippet_dict = snippet_dicts[index]
4✔
327
            box_tokens = (
4✔
328
                count_tokens(snippet_dict["content"], token_counter)
329
                if max_tokens != sys.maxsize
330
                else 0
331
            )
332
            box_lines = snippet_dict["content"].count("\n") + bool(
4✔
333
                snippet_dict["content"]
334
            )
335
            is_function = bool(snippet_dict.get("func_partial_signature"))
4✔
336

337
            # Check if adding this snippet exceeds any limits
338
            token_limit_reached = token_count + box_tokens > max_tokens
4✔
339
            line_limit_reached = line_count + box_lines > max_lines
4✔
340
            function_limit_reached = is_function and (
4✔
341
                function_count + 1 > max_functions
342
            )
343

344
            if not (
4✔
345
                token_limit_reached or line_limit_reached or function_limit_reached
346
            ):
347
                # Fits: merge normally
348
                merged_content.append(snippet_dict["content"])
4✔
349
                relations_list.append(snippet_dict["relations"])
4✔
350
                token_count += box_tokens
4✔
351
                line_count += box_lines
4✔
352
                if is_function:
4✔
353
                    function_count += 1
4✔
354

355
                if start_line is None:
4✔
356
                    start_line = snippet_dict["start_line"]
4✔
357
                end_line = snippet_dict["end_line"]
4✔
358
                index += 1
4✔
359

360
            elif not merged_content:
4✔
361
                # Too big and nothing merged yet: handle oversize
362
                limit_msg = self._format_limit_msg(
4✔
363
                    box_tokens,
364
                    max_tokens,
365
                    box_lines,
366
                    max_lines,
367
                    function_count,
368
                    max_functions,
369
                    snippet_dict["content"][:100],
370
                )
371
                if strict:
4✔
372
                    raise TokenLimitError(
4✔
373
                        f"Structural block exceeds maximum limit.\n{limit_msg}\n"
374
                        "Reason: Prevent splitting inside interest points (function, class, region, ...)\n"
375
                        "💡Hint: Consider increasing 'max_tokens', 'max_lines', or 'max_functions', "
376
                        "refactoring the oversized block, or setting 'strict=False' to allow automatic splitting of oversized blocks."
377
                    )
378
                else:  # Else split further
379
                    logger.warning(
4✔
380
                        "Splitting oversized block into sub-chunks.\n(%s)",
381
                        limit_msg,
382
                    )
383

384
                    sub_chunks = self._split_oversized(
4✔
385
                        snippet_dict,
386
                        max_tokens,
387
                        max_lines,
388
                        source,
389
                        token_counter,
390
                        cumulative_lengths,
391
                    )
392

393
                    for sub_chunk in sub_chunks:
4✔
394
                        sub_chunk.metadata.chunk_num = len(result_chunks) + 1
4✔
395
                        result_chunks.append(sub_chunk)
4✔
396
                    index += 1
4✔
397
            else:
398
                # Flush current merged content as a chunk
399
                start_span = cumulative_lengths[start_line - 1]
4✔
400
                end_span = cumulative_lengths[end_line]
4✔
401
                merged_chunk = DotDict(
4✔
402
                    {
403
                        "content": "\n".join(merged_content),
404
                        "metadata": {
405
                            "chunk_num": len(result_chunks) + 1,
406
                            "tree": self._merge_tree(relations_list),
407
                            "start_line": start_line,
408
                            "end_line": end_line,
409
                            "span": (start_span, end_span),
410
                            "source": source,
411
                        },
412
                    }
413
                )
414
                result_chunks.append(merged_chunk)
4✔
415

416
                # Reset for next chunk
417
                merged_content.clear()
4✔
418
                relations_list.clear()
4✔
419
                start_line = None
4✔
420
                end_line = None
4✔
421
                token_count = 0
4✔
422
                line_count = 0
4✔
423
                function_count = 0
4✔
424

425
        # Flush remaining content
426
        if merged_content:
4✔
427
            start_span = cumulative_lengths[start_line - 1]
4✔
428
            end_span = cumulative_lengths[end_line]
4✔
429
            merged_chunk = DotDict(
4✔
430
                {
431
                    "content": "\n".join(merged_content),
432
                    "metadata": {
433
                        "chunk_num": len(result_chunks) + 1,
434
                        "tree": self._merge_tree(relations_list),
435
                        "start_line": start_line,
436
                        "end_line": end_line,
437
                        "span": (start_span, end_span),
438
                        "source": source,
439
                    },
440
                }
441
            )
442
            result_chunks.append(merged_chunk)
4✔
443

444
        return result_chunks
4✔
445

446
    def _validate_constraints(
4✔
447
        self,
448
        max_tokens: int | None,
449
        max_lines: int | None,
450
        max_functions: int | None,
451
        token_counter: Callable[[str], int] | None,
452
    ):
453
        """
454
        Validates that at least one chunking constraint is provided and sets default values.
455

456
        Args:
457
            max_tokens: Maximum number of tokens per chunk.
458
            max_lines: Maximum number of lines per chunk.
459
            max_functions: Maximum number of functions per chunk.
460
            token_counter: Function that counts tokens in text.
461

462
        Raises:
463
            InvalidInputError: If no chunking constraints are provided.
464
            MissingTokenCounterError: If `max_tokens` is provided but no `token_counter` is provided.
465
        """
466
        if not any((max_tokens, max_lines, max_functions)):
4✔
467
            raise InvalidInputError(
4✔
468
                "At least one of 'max_tokens', 'max_lines', or 'max_functions' must be provided."
469
            )
470

471
        # If token_counter is required but not provided
472
        if max_tokens is not None and not (token_counter or self.token_counter):
4✔
473
            raise MissingTokenCounterError()
4✔
474

475
    @validate_input
4✔
476
    def chunk_text(
        self,
        code: str,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
    ) -> list[DotDict]:
        """
        Chunk a code string along its structural boundaries.

        The code is first broken into structural snippets by the extractor,
        then the snippets are grouped into chunks that respect the token,
        line and function limits.

        Args:
            code: Raw code string to process.
            max_tokens: Maximum tokens per chunk. Must be >= 12.
            max_lines: Maximum number of lines per chunk. Must be >= 5.
            max_functions: Maximum number of functions per chunk. Must be >= 1.
            token_counter: Token counting function. Uses instance
                counter if None. Required for token-based chunking.
            include_comments: Include comments in output chunks. Default: True.
            docstring_mode: Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all".
            strict: If True, raise error when structural blocks exceed
                the limits. If False, split oversized blocks. Default: True.

        Returns:
            List of code chunks. Each DotDict has a `content` entry and a
            `metadata` mapping containing:

                - chunk_num: 1-based position of the chunk in the result
                - tree: Namespace hierarchy
                - start_line: Starting line in original source
                - end_line: Ending line in original source
                - span: Character-level span (start and end offsets) in the original source.
                - source: "N/A" unless the input itself is judged path-like

            An empty list is returned for blank input.

        Raises:
            InvalidInputError: Invalid configuration parameters.
            MissingTokenCounterError: No token counter available.
            TokenLimitError: Structural block exceeds the limits in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        self._validate_constraints(max_tokens, max_lines, max_functions, token_counter)

        # A limit that was not given becomes sys.maxsize, which the grouping
        # code treats as "no limit".
        max_tokens = sys.maxsize if max_tokens is None else max_tokens
        max_lines = sys.maxsize if max_lines is None else max_lines
        max_functions = sys.maxsize if max_functions is None else max_functions

        # A counter passed to this call wins over the instance-level one.
        token_counter = token_counter or self.token_counter

        if not code.strip():
            log_info(self.verbose, "Input code is empty. Returning empty list.")
            return []

        log_info(
            self.verbose,
            "Starting chunk processing for code starting with:\n```\n{}...\n```",
            code[:100],
        )

        snippets, line_offsets = self.extractor.extract_code_structure(
            code, include_comments, docstring_mode, is_python_code(code)
        )

        chunks = self._group_by_chunk(
            snippet_dicts=snippets,
            cumulative_lengths=line_offsets,
            token_counter=token_counter,
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            strict=strict,
            source=code,
        )

        log_info(self.verbose, "Generated {} chunk(s) for the code", len(chunks))

        return chunks
4✔
566

567
    @validate_input
4✔
568
    def chunk_file(
        self,
        path: str | Path,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
    ) -> list[DotDict]:
        """
        Read a source file and chunk it along its structural boundaries.

        The file's text is chunked with `chunk_text`; afterwards every chunk
        is labelled with the file path it came from.

        Args:
            path: File path to process.
            max_tokens: Maximum tokens per chunk. Must be >= 12.
            max_lines: Maximum number of lines per chunk. Must be >= 5.
            max_functions: Maximum number of functions per chunk. Must be >= 1.
            token_counter: Token counting function. Uses instance
                counter if None. Required for token-based chunking.
            include_comments: Include comments in output chunks. Default: True.
            docstring_mode: Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all".
            strict: If True, raise error when structural blocks exceed
                the limits. If False, split oversized blocks. Default: True.

        Returns:
            List of code chunks. Each DotDict has a `content` entry and a
            `metadata` mapping containing:

                - chunk_num: 1-based position of the chunk in the result
                - tree: Namespace hierarchy
                - start_line: Starting line in original source
                - end_line: Ending line in original source
                - span: Character-level span (start and end offsets) in the original source.
                - source: Source file path, as a string

            An empty list is returned when the file is blank.

        Raises:
            InvalidInputError: Invalid configuration parameters.
            MissingTokenCounterError: No token counter available.
            FileProcessingError: Source file cannot be read.
            TokenLimitError: Structural block exceeds the limits in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        path = Path(path)
        code = read_text_file(path)

        if not code.strip():
            log_info(self.verbose, "Input code is empty. Returning empty list.")
            return []

        log_info(self.verbose, "Starting chunk processing for file: {}", path)

        chunks = self.chunk_text(
            code=code,
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )

        # chunk_text only receives the file's text and derives the source
        # label from that text, so the originating path is recorded here.
        source = str(path)
        for chunk in chunks:
            chunk.metadata.source = source

        return chunks
639

640
    @validate_input
4✔
641
    def chunk_texts(
        self,
        codes: IterableOfStr,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        separator: Any = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
        show_progress: bool = True,
        on_errors: Literal["raise", "skip", "break"] = "raise",
    ) -> Generator[DotDict, None, None]:
        """
        Chunk several code strings in one batch.

        Every input is chunked with `chunk_text` using the same settings; the
        batch itself is driven by `run_in_batch`.

        Args:
            codes: A non-string iterable of raw code strings.
            max_tokens: Maximum tokens per chunk. Must be >= 12.
            max_lines: Maximum number of lines per chunk. Must be >= 5.
            max_functions: Maximum number of functions per chunk. Must be >= 1.
            token_counter: Token counting function. Uses instance
                counter if None. Required for token-based chunking.
            separator: A value to be yielded after the chunks of each text are processed.
                Note: None cannot be used as a separator.
            include_comments: Include comments in output chunks. Default: True.
            docstring_mode: Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all".
            strict: If True, raise error when structural blocks exceed
                the limits. If False, split oversized blocks. Default: True.
            n_jobs: Number of parallel workers. Uses all available CPUs if None.
            show_progress: Display progress bar during processing. Defaults to True.
            on_errors: How to handle errors during processing. Defaults to 'raise'.

        Yields:
            `DotDict` objects, each representing a chunk with its `content`
            and a `metadata` mapping containing:

                - chunk_num: 1-based position of the chunk within its text
                - tree: Namespace hierarchy
                - start_line: Starting line in original source
                - end_line: Ending line in original source
                - span: Character-level span (start and end offsets) in the original source.
                - source: "N/A" unless the input itself is judged path-like

        Raises:
            InvalidInputError: Invalid input parameters.
            MissingTokenCounterError: No token counter available.
            TokenLimitError: Structural block exceeds the limits in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        # Settings shared by every chunk_text call in the batch.
        shared_options = {
            "max_tokens": max_tokens,
            "max_lines": max_lines,
            "max_functions": max_functions,
            "token_counter": token_counter or self.token_counter,
            "include_comments": include_comments,
            "docstring_mode": docstring_mode,
            "strict": strict,
        }
        worker = partial(self.chunk_text, **shared_options)

        yield from run_in_batch(
            func=worker,
            iterable_of_args=codes,
            iterable_name="codes",
            separator=separator,
            n_jobs=n_jobs,
            show_progress=show_progress,
            on_errors=on_errors,
            verbose=self.verbose,
        )
722

723
    @validate_input
    def chunk_files(
        self,
        paths: IterableOfPath,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        separator: Any = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
        show_progress: bool = True,
        on_errors: Literal["raise", "skip", "break"] = "raise",
    ) -> Generator[DotDict, None, None]:
        """
        Chunk several source files in a single batched run.

        Every path is processed by `chunk_file` with the same limits and
        options; the calls are dispatched through `run_in_batch` and its
        output is yielded unchanged.

        Args:
            paths: A non-string iterable of file paths to process.
            max_tokens: Maximum tokens per chunk. Must be >= 12.
            max_lines: Maximum number of lines per chunk. Must be >= 5.
            max_functions: Maximum number of functions per chunk. Must be >= 1.
            token_counter: Token counting function. Falls back to the instance
                counter when not provided. Required for token-based chunking.
            separator: A value to be yielded after the chunks of each file are processed.
                Note: None cannot be used as a separator.
            include_comments: Include comments in output chunks. Default: True.
            docstring_mode: Docstring processing strategy:

                - "summary": Include only first line of docstrings
                - "all": Include complete docstrings
                - "excluded": Remove all docstrings
                Defaults to "all"
            strict: If True, raise error when structural blocks exceed max_tokens.
                If False, split oversized blocks. Default: True.
            n_jobs: Number of parallel workers. Uses all available CPUs if None.
            show_progress: Display progress bar during processing. Defaults to True.
            on_errors:
                How to handle errors during processing. Defaults to 'raise'.

        Yields:
            `DotDict` object, representing a chunk with its content and metadata.
                Includes:

                - content: Code content
                - tree: Namespace hierarchy
                - start_line: Starting line in original source
                - end_line: Ending line in original source
                - span: Character-level span (start and end offsets) in the original source.
                - source_path: Source file path

        Raises:
            InvalidInputError: Invalid input parameters.
            MissingTokenCounterError: No token counter available.
            FileProcessingError: Source file cannot be read.
            TokenLimitError: Structural block exceeds max_tokens in strict mode.
            CallbackError: If the token counter fails or returns an invalid type.
        """
        # Options that are identical for every file in the batch. An explicit
        # token counter wins; otherwise the instance-level one is used.
        per_file_options = dict(
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )
        file_chunker = partial(self.chunk_file, **per_file_options)

        # The batch runner receives the batching controls (workers, progress,
        # error policy, separator); its output is forwarded as-is.
        yield from run_in_batch(
            func=file_chunker,
            iterable_of_args=paths,
            iterable_name="paths",
            separator=separator,
            n_jobs=n_jobs,
            show_progress=show_progress,
            on_errors=on_errors,
            verbose=self.verbose,
        )
806

807
    @deprecated_callable(
        use_instead="chunk_text or chunk_file",
        deprecated_in="2.2.0",
        removed_in="3.0.0",
    )
    def chunk(  # pragma: no cover
        self,
        source: str | Path,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
    ) -> list[DotDict]:
        """
        Chunk code into semantic pieces, given either a file path or raw code.

        A `Path` instance, or a string that `is_path_like` accepts, is
        forwarded to `chunk_file`; any other value is forwarded to
        `chunk_text` as code. All keyword options are passed through unchanged.

        Note:
            Deprecated since v2.2.0. Will be removed in v3.0.0. Use `chunk_file` or `chunk_text` instead.
        """
        # Keyword options common to both delegates.
        shared_options = {
            "max_tokens": max_tokens,
            "max_lines": max_lines,
            "max_functions": max_functions,
            "token_counter": token_counter,
            "include_comments": include_comments,
            "docstring_mode": docstring_mode,
            "strict": strict,
        }

        # Path objects are always treated as files; strings only when they
        # look like a path. is_path_like is consulted for strings alone.
        if isinstance(source, Path):
            treat_as_file = True
        else:
            treat_as_file = isinstance(source, str) and is_path_like(source)

        if not treat_as_file:
            return self.chunk_text(code=source, **shared_options)
        return self.chunk_file(path=source, **shared_options)
853

854
    @deprecated_callable(
        use_instead="chunk_texts or chunk_files",
        deprecated_in="2.2.0",
        removed_in="3.0.0",
    )
    def batch_chunk(  # pragma: no cover
        self,
        sources: IterableOfPath,
        *,
        max_tokens: Annotated[int | None, Field(ge=12)] = None,
        max_lines: Annotated[int | None, Field(ge=5)] = None,
        max_functions: Annotated[int | None, Field(ge=1)] = None,
        token_counter: Callable[[str], int] | None = None,
        separator: Any = None,
        include_comments: bool = True,
        docstring_mode: Literal["summary", "all", "excluded"] = "all",
        strict: bool = True,
        n_jobs: Annotated[int, Field(ge=1)] | None = None,
        show_progress: bool = True,
        on_errors: Literal["raise", "skip", "break"] = "raise",
    ) -> Generator[DotDict, None, None]:
        """
        Batch chunk multiple code sources.

        Each source is processed by the deprecated `chunk` method with the
        same options; the calls are dispatched through `run_in_batch` and
        its output is yielded unchanged.

        Note:
            Deprecated since v2.2.0. Will be removed in v3.0.0. Use `chunk_texts` or `chunk_files` instead.
        """
        # Options that are identical for every source in the batch. An
        # explicit token counter wins; otherwise the instance-level one is used.
        per_source_options = dict(
            max_tokens=max_tokens,
            max_lines=max_lines,
            max_functions=max_functions,
            token_counter=token_counter or self.token_counter,
            include_comments=include_comments,
            docstring_mode=docstring_mode,
            strict=strict,
        )
        source_chunker = partial(self.chunk, **per_source_options)

        # Batching controls go to the runner; results are forwarded as-is.
        yield from run_in_batch(
            func=source_chunker,
            iterable_of_args=sources,
            iterable_name="sources",
            separator=separator,
            n_jobs=n_jobs,
            show_progress=show_progress,
            on_errors=on_errors,
            verbose=self.verbose,
        )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc