#754

Committed 04 Sep 2023 09:54PM UTC coverage: 74.807% (-5.5%) from 80.336%

Build # #754

Build Type

push

circle-ci

Committed by

jiashenC

Commit Message

update case

Run Details

8727 of 11666 relevant lines covered (74.81%)

0.75 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.91

/evadb/readers/csv_reader.py

# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Iterator

import numpy as np
import pandas as pd

from evadb.catalog.sql_config import IDENTIFIER_COLUMN
from evadb.readers.abstract_reader import AbstractReader
from evadb.utils.logging_manager import logger


class CSVReader(AbstractReader):
    """Reader that streams the rows of a CSV file, one row at a time."""

    def __init__(self, *args, column_list, **kwargs):
        """
        Reads a CSV file and yields frame data.
        Args:
            *args: positional arguments forwarded unchanged to AbstractReader
            column_list: list of columns (TupleValueExpression)
            to read from the CSV file
            **kwargs: keyword arguments forwarded unchanged to AbstractReader
        """

        self._column_list = column_list
        super().__init__(*args, **kwargs)

    def _read(self) -> Iterator[Dict]:
        """
        Yield the rows of the CSV file at ``self.file_url`` one by one.

        The file is read in chunks of 512 rows and restricted to the columns
        named in ``column_list`` (the identifier column is skipped). String
        cells of columns whose catalog type is ``NDARRAY`` are parsed into
        float32 numpy arrays; every other cell is passed through untouched.

        Yields:
            One row per CSV record, as produced by ``DataFrame.iterrows``
            (a pandas Series indexed by column name).
        """

        # TODO: What is a good location to put this code?
        def convert_csv_string_to_ndarray(row_string):
            """
            Convert a string of comma separated values to a numpy
            float array
            """
            return np.array([np.float32(val) for val in row_string.split(",")])

        def convert_cell(value):
            """
            Convert a single cell, leaving non-string values (e.g. NaN for
            a missing entry) as they are instead of failing on ``.split``
            """
            if isinstance(value, str):
                return convert_csv_string_to_ndarray(value)
            return value

        logger.info("Reading CSV frames")

        # TODO: Need to add strong sanity checks on the columns.

        # Read the csv in chunks, and only keep the columns we need.
        # Ignore _row_id that we don't need to take care of.
        col_list_names = [
            col.name for col in self._column_list if col.name != IDENTIFIER_COLUMN
        ]

        col_map = {col.name: col for col in self._column_list}
        for chunk in pd.read_csv(self.file_url, chunksize=512, usecols=col_list_names):
            # apply the required conversions
            for col in chunk.columns:
                column = chunk[col]
                # Look at every cell rather than only the first one: the
                # first row of a chunk may be missing or non-string while
                # later rows still hold serialized arrays. This also avoids
                # positional indexing into a chunk that has no rows.
                has_string_cell = any(isinstance(value, str) for value in column)
                if (
                    has_string_cell
                    and col_map[col].col_object.type.name == "NDARRAY"
                ):
                    # convert the strings to numpy arrays, cell by cell
                    chunk[col] = column.apply(convert_cell)

            # yield the chunk row by row; the row label is not needed
            for _, chunk_row in chunk.iterrows():
                yield chunk_row

1	# coding=utf-8
2	# Copyright 2018-2023 EvaDB
3	#
4	# Licensed under the Apache License, Version 2.0 (the "License");
5	# you may not use this file except in compliance with the License.
6	# You may obtain a copy of the License at
7	#
8	# http://www.apache.org/licenses/LICENSE-2.0
9	#
10	# Unless required by applicable law or agreed to in writing, software
11	# distributed under the License is distributed on an "AS IS" BASIS,
12	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	# See the License for the specific language governing permissions and
14	# limitations under the License.
15	from typing import Dict, Iterator	1✔
16
17	import numpy as np	1✔
18	import pandas as pd	1✔
19
20	from evadb.catalog.sql_config import IDENTIFIER_COLUMN	1✔
21	from evadb.readers.abstract_reader import AbstractReader	1✔
22	from evadb.utils.logging_manager import logger	1✔
23
24
25	class CSVReader(AbstractReader):	1✔
26	def __init__(self, *args, column_list, **kwargs):	1✔
27	"""
28	Reads a CSV file and yields frame data.
29	Args:
30	column_list: list of columns (TupleValueExpression)
31	to read from the CSV file
32	"""
33
34	self._column_list = column_list	1✔
35	super().__init__(*args, **kwargs)	1✔
36
37	def _read(self) -> Iterator[Dict]:	1✔
38	# TODO: What is a good location to put this code?
39	def convert_csv_string_to_ndarray(row_string):	1✔
40	"""
41	Convert a string of comma separated values to a numpy
42	float array
43	"""
44	return np.array([np.float32(val) for val in row_string.split(",")])	×
45
46	logger.info("Reading CSV frames")	1✔
47
48	# TODO: Need to add strong sanity checks on the columns.
49
50	# Read the csv in chunks, and only keep the columns we need.
51	# Ignore _row_id that we don't need to take care of.
52	col_list_names = [	1✔
53	col.name for col in self._column_list if col.name != IDENTIFIER_COLUMN
54	]
55
56	col_map = {col.name: col for col in self._column_list}	1✔
57	for chunk in pd.read_csv(self.file_url, chunksize=512, usecols=col_list_names):	1✔
58	# apply the required conversions
59	for col in chunk.columns:	1✔
60	# TODO: Is there a better way to do this?
61	if (	1✔
62	isinstance(chunk[col].iloc[0], str)
63	and col_map[col].col_object.type.name == "NDARRAY"
64	):
65	# convert the string to a numpy array
66	chunk[col] = chunk[col].apply(convert_csv_string_to_ndarray)	×
67
68	# yield the chunk
69	for chunk_index, chunk_row in chunk.iterrows():	1✔
70	yield chunk_row	1✔

georgia-tech-db / eva / #754

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous