#850

Committed 08 Nov 2023 08:36PM UTC coverage: 0.0% (-77.0%) from 76.982%

Build # #850

Build Type

push

circleci

Committed by

americast

Commit Message

fix metrics logic

Run Details

0 of 1 new or added line in 1 file covered. (0.0%)

9789 existing lines in 252 files now uncovered.

0 of 12428 relevant lines covered (0.0%)

0.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/evadb/readers/document/document_reader.py

# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Dict, Iterator

from evadb.catalog.sql_config import ROW_NUM_COLUMN
from evadb.configuration.constants import (
    DEFAULT_DOCUMENT_CHUNK_OVERLAP,
    DEFAULT_DOCUMENT_CHUNK_SIZE,
)
from evadb.readers.abstract_reader import AbstractReader
from evadb.readers.document.registry import (
    _lazy_import_loader,
    _lazy_import_text_splitter,
)


class DocumentReader(AbstractReader):
    """Reader that loads a document and yields it as rows of text chunks.

    The loader is picked by the file extension of ``self.file_url`` from the
    mapping returned by ``_lazy_import_loader()``. Each loaded document is
    then split with the splitter class returned by
    ``_lazy_import_text_splitter()``.
    """

    def __init__(self, *args, chunk_params, **kwargs):
        """Set up the loader mapping, the splitter class and chunk settings.

        Args:
            *args: positional arguments forwarded to ``AbstractReader``.
            chunk_params: mapping that may carry ``"chunk_size"`` and
                ``"chunk_overlap"``; a missing key falls back to
                ``DEFAULT_DOCUMENT_CHUNK_SIZE`` or
                ``DEFAULT_DOCUMENT_CHUNK_OVERLAP`` respectively.
            **kwargs: keyword arguments forwarded to ``AbstractReader``.
        """
        super().__init__(*args, **kwargs)
        self._LOADER_MAPPING = _lazy_import_loader()
        self._splitter_class = _lazy_import_text_splitter()

        # https://github.com/hwchase17/langchain/blob/5b6bbf4ab2a33ed0d33ff5d3cb3979a7edc15682/langchain/text_splitter.py#L570
        # by default we use chunk_size 4000 and overlap 200
        self._chunk_size = chunk_params.get("chunk_size", DEFAULT_DOCUMENT_CHUNK_SIZE)
        self._chunk_overlap = chunk_params.get(
            "chunk_overlap", DEFAULT_DOCUMENT_CHUNK_OVERLAP
        )

    def _read(self) -> Iterator[Dict]:
        """Yield one dict per text chunk of the document at ``self.file_url``.

        Each yielded dict holds:
            * ``"chunk_id"``: index of the chunk within the loaded document it
              came from (restarts at 0 for every item of ``loader.load()``),
            * ``"data"``: the chunk's ``page_content``,
            * ``ROW_NUM_COLUMN``: running counter over all yielded chunks,
              starting at 0.

        Raises:
            AssertionError: if the file extension is not a key of the loader
                mapping.
        """
        ext = Path(self.file_url).suffix
        if ext not in self._LOADER_MAPPING:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped when Python runs with -O; the exception type and the
            # message are the same as before.
            raise AssertionError(f"File Format {ext} not supported")
        loader_class, loader_args = self._LOADER_MAPPING[ext]
        loader = loader_class(self.file_url, **loader_args)

        # todo: implement our own splitter
        langchain_text_splitter = self._splitter_class(
            chunk_size=self._chunk_size, chunk_overlap=self._chunk_overlap
        )

        row_num = 0
        for data in loader.load():
            # Split one loaded document at a time so chunk_id is relative to
            # that document, while row_num keeps counting across documents.
            for chunk_id, row in enumerate(
                langchain_text_splitter.split_documents([data])
            ):
                yield {
                    "chunk_id": chunk_id,
                    "data": row.page_content,
                    ROW_NUM_COLUMN: row_num,
                }
                row_num += 1

1	# coding=utf-8
2	# Copyright 2018-2023 EvaDB
3	#
4	# Licensed under the Apache License, Version 2.0 (the "License");
5	# you may not use this file except in compliance with the License.
6	# You may obtain a copy of the License at
7	#
8	# http://www.apache.org/licenses/LICENSE-2.0
9	#
10	# Unless required by applicable law or agreed to in writing, software
11	# distributed under the License is distributed on an "AS IS" BASIS,
12	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	# See the License for the specific language governing permissions and
14	# limitations under the License.
UNCOV 15	from pathlib import Path	×
UNCOV 16	from typing import Dict, Iterator	×
17
UNCOV 18	from evadb.catalog.sql_config import ROW_NUM_COLUMN	×
UNCOV 19	from evadb.configuration.constants import (	×
20	DEFAULT_DOCUMENT_CHUNK_OVERLAP,
21	DEFAULT_DOCUMENT_CHUNK_SIZE,
22	)
UNCOV 23	from evadb.readers.abstract_reader import AbstractReader	×
UNCOV 24	from evadb.readers.document.registry import (	×
25	_lazy_import_loader,
26	_lazy_import_text_splitter,
27	)
28
29
UNCOV 30	class DocumentReader(AbstractReader):	×
UNCOV 31	def __init__(self, *args, chunk_params, **kwargs):	×
32	super().__init__(*args, **kwargs)	×
33	self._LOADER_MAPPING = _lazy_import_loader()	×
34	self._splitter_class = _lazy_import_text_splitter()	×
35
36	# https://github.com/hwchase17/langchain/blob/5b6bbf4ab2a33ed0d33ff5d3cb3979a7edc15682/langchain/text_splitter.py#L570
37	# by default we use chunk_size 4000 and overlap 200
38	self._chunk_size = chunk_params.get("chunk_size", DEFAULT_DOCUMENT_CHUNK_SIZE)	×
39	self._chunk_overlap = chunk_params.get(	×
40	"chunk_overlap", DEFAULT_DOCUMENT_CHUNK_OVERLAP
41	)
42
UNCOV 43	def _read(self) -> Iterator[Dict]:	×
44	ext = Path(self.file_url).suffix	×
45	assert ext in self._LOADER_MAPPING, f"File Format {ext} not supported"	×
46	loader_class, loader_args = self._LOADER_MAPPING[ext]	×
47	loader = loader_class(self.file_url, **loader_args)	×
48
49	# todo: implement out own splitter
50	langchain_text_splitter = self._splitter_class(	×
51	chunk_size=self._chunk_size, chunk_overlap=self._chunk_overlap
52	)
53
54	row_num = 0	×
55	for data in loader.load():	×
56	for chunk_id, row in enumerate(	×
57	langchain_text_splitter.split_documents([data])
58	):
59	yield {	×
60	"chunk_id": chunk_id,
61	"data": row.page_content,
62	ROW_NUM_COLUMN: row_num,
63	}
64	row_num += 1	×

georgia-tech-db / eva / #850

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous