• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

georgia-tech-db / eva / #850

08 Nov 2023 08:36PM UTC coverage: 0.0% (-77.0%) from 76.982%
#850

push

circleci

americast
fix metrics logic

0 of 1 new or added line in 1 file covered. (0.0%)

9789 existing lines in 252 files now uncovered.

0 of 12428 relevant lines covered (0.0%)

0.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/evadb/readers/document/document_reader.py
1
# coding=utf-8
2
# Copyright 2018-2023 EvaDB
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
UNCOV
15
from pathlib import Path
×
UNCOV
16
from typing import Dict, Iterator
×
17

UNCOV
18
from evadb.catalog.sql_config import ROW_NUM_COLUMN
×
UNCOV
19
from evadb.configuration.constants import (
×
20
    DEFAULT_DOCUMENT_CHUNK_OVERLAP,
21
    DEFAULT_DOCUMENT_CHUNK_SIZE,
22
)
UNCOV
23
from evadb.readers.abstract_reader import AbstractReader
×
UNCOV
24
from evadb.readers.document.registry import (
×
25
    _lazy_import_loader,
26
    _lazy_import_text_splitter,
27
)
28

29

UNCOV
30
class DocumentReader(AbstractReader):
    """Reader that loads a document file and yields it as text chunks.

    The loader class is chosen from the file extension via the mapping
    returned by ``_lazy_import_loader()``; the text splitter class comes from
    ``_lazy_import_text_splitter()``.
    """

    def __init__(self, *args, chunk_params, **kwargs):
        """Set up the loader mapping, splitter class and chunking parameters.

        Args:
            *args: Positional arguments forwarded to ``AbstractReader``.
            chunk_params: Mapping that may contain ``"chunk_size"`` and
                ``"chunk_overlap"``. Missing keys (or ``None`` for the whole
                mapping) fall back to ``DEFAULT_DOCUMENT_CHUNK_SIZE`` and
                ``DEFAULT_DOCUMENT_CHUNK_OVERLAP``.
            **kwargs: Keyword arguments forwarded to ``AbstractReader``.
        """
        super().__init__(*args, **kwargs)
        self._LOADER_MAPPING = _lazy_import_loader()
        self._splitter_class = _lazy_import_text_splitter()

        # Treat an explicit ``None`` like an empty mapping so that the
        # defaults below apply instead of failing on ``None.get``.
        if chunk_params is None:
            chunk_params = {}

        # https://github.com/hwchase17/langchain/blob/5b6bbf4ab2a33ed0d33ff5d3cb3979a7edc15682/langchain/text_splitter.py#L570
        # by default we use chunk_size 4000 and overlap 200
        self._chunk_size = chunk_params.get("chunk_size", DEFAULT_DOCUMENT_CHUNK_SIZE)
        self._chunk_overlap = chunk_params.get(
            "chunk_overlap", DEFAULT_DOCUMENT_CHUNK_OVERLAP
        )

    def _read(self) -> Iterator[Dict]:
        """Yield one row per text chunk of the document at ``self.file_url``.

        Yields:
            A dict with three entries: ``"chunk_id"`` (index of the chunk
            within the item it was split from; restarts at 0 for every item
            returned by the loader), ``"data"`` (the chunk's
            ``page_content``) and ``ROW_NUM_COLUMN`` (a counter that keeps
            increasing across all items).

        Raises:
            AssertionError: If the file extension has no entry in the
                loader mapping.
        """
        # NOTE(review): ``self.file_url`` is presumably set by
        # ``AbstractReader.__init__`` -- confirm against the base class.
        ext = Path(self.file_url).suffix
        # Raise explicitly rather than using ``assert``: assert statements
        # are removed under ``python -O``, which would turn an unsupported
        # extension into a bare KeyError on the lookup below. The exception
        # type and message are kept identical to the former assert.
        if ext not in self._LOADER_MAPPING:
            raise AssertionError(f"File Format {ext} not supported")
        loader_class, loader_args = self._LOADER_MAPPING[ext]
        loader = loader_class(self.file_url, **loader_args)

        # todo: implement our own splitter
        langchain_text_splitter = self._splitter_class(
            chunk_size=self._chunk_size, chunk_overlap=self._chunk_overlap
        )

        row_num = 0
        for data in loader.load():
            # Split one loaded item at a time so chunk_id is relative to it.
            for chunk_id, row in enumerate(
                langchain_text_splitter.split_documents([data])
            ):
                yield {
                    "chunk_id": chunk_id,
                    "data": row.page_content,
                    ROW_NUM_COLUMN: row_num,
                }
                row_num += 1
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc