• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

georgia-tech-db / eva / #754

04 Sep 2023 09:54PM UTC coverage: 74.807% (-5.5%) from 80.336%
#754

push

circle-ci

jiashenC
update case

8727 of 11666 relevant lines covered (74.81%)

0.75 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.91
/evadb/readers/csv_reader.py
1
# coding=utf-8
2
# Copyright 2018-2023 EvaDB
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
from typing import Dict, Iterator
1✔
16

17
import numpy as np
1✔
18
import pandas as pd
1✔
19

20
from evadb.catalog.sql_config import IDENTIFIER_COLUMN
1✔
21
from evadb.readers.abstract_reader import AbstractReader
1✔
22
from evadb.utils.logging_manager import logger
1✔
23

24

25
class CSVReader(AbstractReader):
    def __init__(self, *args, column_list, **kwargs):
        """
        Reads a CSV file and yields frame data.
        Args:
            column_list: list of columns (TupleValueExpression)
            to read from the CSV file
        """

        self._column_list = column_list
        super().__init__(*args, **kwargs)

    def _read(self) -> Iterator[Dict]:
        """
        Lazily read the CSV at ``self.file_url`` in chunks of 512 rows
        and yield it one row at a time.

        Only the columns named in ``column_list`` are loaded; the
        IDENTIFIER_COLUMN entry is skipped. String cells of columns whose
        catalog type is named "NDARRAY" are parsed into numpy float32
        arrays before the row is yielded.

        Yields:
            One pandas Series per CSV row (as produced by
            ``DataFrame.iterrows``), keyed by column name.
        """

        # TODO: What is a good location to put this code?
        def convert_csv_string_to_ndarray(row_string):
            """
            Convert a string of comma separated values to a numpy
            float array. Non-string values (e.g. NaN for an empty
            cell) are returned unchanged.
            """
            if not isinstance(row_string, str):
                # Missing cells are parsed by pandas as NaN (a float), which
                # has no ``split``; pass them through instead of crashing.
                return row_string
            return np.array([np.float32(val) for val in row_string.split(",")])

        logger.info("Reading CSV frames")

        # TODO: Need to add strong sanity checks on the columns.

        # Read the csv in chunks, and only keep the columns we need.
        # Ignore _row_id that we don't need to take care of.
        col_list_names = [
            col.name for col in self._column_list if col.name != IDENTIFIER_COLUMN
        ]

        col_map = {col.name: col for col in self._column_list}
        for chunk in pd.read_csv(self.file_url, chunksize=512, usecols=col_list_names):
            # apply the required conversions
            for col in chunk.columns:
                # Decide from the first *non-missing* value rather than the
                # first row: an empty chunk has no row 0, and a leading
                # empty cell must not hide the strings that follow it.
                present_values = chunk[col].dropna()
                if present_values.empty:
                    continue

                # TODO: Is there a better way to do this?
                if (
                    isinstance(present_values.iloc[0], str)
                    and col_map[col].col_object.type.name == "NDARRAY"
                ):
                    # convert the string to a numpy array
                    chunk[col] = chunk[col].apply(convert_csv_string_to_ndarray)

            # yield the chunk row by row; the row label is not needed
            for _, chunk_row in chunk.iterrows():
                yield chunk_row
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc