• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

georgia-tech-db / eva / #758

04 Sep 2023 08:37PM UTC coverage: 0.0% (-78.3%) from 78.333%
#758

push

circle-ci

hershd23
Increased underline length at line 75 in text_summarization.rst
	modified:   docs/source/benchmarks/text_summarization.rst

0 of 11303 relevant lines covered (0.0%)

0.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/evadb/catalog/catalog_utils.py
1
# coding=utf-8
2
# Copyright 2018-2023 EvaDB
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
import uuid
×
16
from pathlib import Path
×
17
from typing import Any, Dict, List
×
18

19
from evadb.catalog.catalog_type import (
×
20
    ColumnType,
21
    DocumentColumnName,
22
    ImageColumnName,
23
    NdArrayType,
24
    PDFColumnName,
25
    TableType,
26
    VideoColumnName,
27
)
28
from evadb.catalog.models.utils import (
×
29
    ColumnCatalogEntry,
30
    TableCatalogEntry,
31
    UdfCacheCatalogEntry,
32
    UdfCatalogEntry,
33
)
34
from evadb.catalog.sql_config import IDENTIFIER_COLUMN
×
35
from evadb.configuration.configuration_manager import ConfigurationManager
×
36
from evadb.expression.function_expression import FunctionExpression
×
37
from evadb.expression.tuple_value_expression import TupleValueExpression
×
38
from evadb.parser.create_statement import ColConstraintInfo, ColumnDefinition
×
39
from evadb.utils.generic_utils import get_str_hash, remove_directory_contents
×
40

41

42
def is_video_table(table: TableCatalogEntry):
    """Return True if the given table catalog entry stores video data."""
    return TableType.VIDEO_DATA == table.table_type
44

45

46
def is_document_table(table: TableCatalogEntry):
    """Return True if the given table catalog entry stores document data."""
    return TableType.DOCUMENT_DATA == table.table_type
48

49

50
def is_pdf_table(table: TableCatalogEntry):
    """Return True if the given table catalog entry stores PDF data."""
    return TableType.PDF_DATA == table.table_type
52

53

54
def is_string_col(col: ColumnCatalogEntry):
    """Return True if the column holds string data.

    A column is string-valued either when its type is TEXT or when it is
    an ndarray column whose array type is STR.
    """
    if col.type == ColumnType.TEXT:
        return True
    return col.array_type == NdArrayType.STR
56

57

58
def get_video_table_column_definitions() -> List[ColumnDefinition]:
    """Return the column definitions used for a video table.

    Columns:
        name: video path (unique per table)
        id: frame id
        data: frame data (3-dimensional uint8 ndarray)
        seconds: frame timestamp in seconds

    Returns:
        List[ColumnDefinition]: the video table columns.
    """
    columns = [
        ColumnDefinition(
            VideoColumnName.name.name,
            ColumnType.TEXT,
            None,
            None,
            ColConstraintInfo(unique=True),
        ),
        ColumnDefinition(VideoColumnName.id.name, ColumnType.INTEGER, None, None),
        ColumnDefinition(
            VideoColumnName.data.name,
            ColumnType.NDARRAY,
            NdArrayType.UINT8,
            (None, None, None),
        ),
        # NOTE(review): `seconds` passes [] for dimensions while the other
        # scalar columns pass None — confirm whether that asymmetry is
        # intentional before normalizing it.
        ColumnDefinition(VideoColumnName.seconds.name, ColumnType.FLOAT, None, []),
    ]
    return columns
83

84

85
def get_image_table_column_definitions() -> List[ColumnDefinition]:
    """Return the column definitions used for an image table.

    Columns:
        name: image path (unique per table)
        data: image decoded data (3-dimensional uint8 ndarray)
    """
    name_column = ColumnDefinition(
        ImageColumnName.name.name,
        ColumnType.TEXT,
        None,
        None,
        ColConstraintInfo(unique=True),
    )
    data_column = ColumnDefinition(
        ImageColumnName.data.name,
        ColumnType.NDARRAY,
        NdArrayType.UINT8,
        (None, None, None),
    )
    return [name_column, data_column]
106

107

108
def get_document_table_column_definitions() -> List[ColumnDefinition]:
    """Return the column definitions used for a document table.

    Columns:
        name: file path (unique per table)
        chunk_id: chunk id (0-indexed for each file)
        data: text data associated with the chunk
    """
    return [
        ColumnDefinition(
            DocumentColumnName.name.name,
            ColumnType.TEXT,
            None,
            None,
            ColConstraintInfo(unique=True),
        ),
        ColumnDefinition(
            DocumentColumnName.chunk_id.name, ColumnType.INTEGER, None, None
        ),
        ColumnDefinition(DocumentColumnName.data.name, ColumnType.TEXT, None, None),
    ]
133

134

135
def get_pdf_table_column_definitions() -> List[ColumnDefinition]:
    """Return the column definitions used for a PDF table.

    Columns:
        name: pdf name
        page: page no
        paragraph: paragraph no
        data: pdf paragraph data
    """
    text = ColumnType.TEXT
    integer = ColumnType.INTEGER
    return [
        ColumnDefinition(PDFColumnName.name.name, text, None, None),
        ColumnDefinition(PDFColumnName.page.name, integer, None, None),
        ColumnDefinition(PDFColumnName.paragraph.name, integer, None, None),
        ColumnDefinition(PDFColumnName.data.name, text, None, None),
    ]
154

155

156
def get_table_primary_columns(
    table_catalog_obj: TableCatalogEntry,
) -> List[ColumnDefinition]:
    """Get the primary columns for a table based on its table type.

    Every table type gets `_row_id`; for video, PDF, and document tables a
    second column (frame id, paragraph no, and chunk id, respectively) is
    also part of the unique key.

    Args:
        table_catalog_obj (TableCatalogEntry): The table catalog object.

    Returns:
        List[ColumnDefinition]: The list of primary columns for the table.
    """
    primary_columns = [
        ColumnDefinition(IDENTIFIER_COLUMN, ColumnType.INTEGER, None, None)
    ]
    # Name of the secondary unique-key column for each media table type.
    extra_key_columns = {
        TableType.VIDEO_DATA: VideoColumnName.id.name,
        TableType.PDF_DATA: PDFColumnName.paragraph.name,
        TableType.DOCUMENT_DATA: DocumentColumnName.chunk_id.name,
    }
    extra_name = extra_key_columns.get(table_catalog_obj.table_type)
    if extra_name is not None:
        primary_columns.append(
            ColumnDefinition(extra_name, ColumnType.INTEGER, None, None)
        )
    return primary_columns
195

196

197
def xform_column_definitions_to_catalog_entries(
    col_list: List[ColumnDefinition],
) -> List[ColumnCatalogEntry]:
    """Create column catalog entries for the input parsed column list.

    Args:
        col_list (List[ColumnDefinition]): parsed columns to convert.

    Returns:
        List[ColumnCatalogEntry]: one catalog entry per input column, in
        the same order as the input.
    """
    return [
        ColumnCatalogEntry(
            name=col.name,
            type=col.type,
            array_type=col.array_type,
            array_dimensions=col.dimension,
            is_nullable=col.cci.nullable,
        )
        for col in col_list
    ]
219

220

221
def construct_udf_cache_catalog_entry(
    func_expr: FunctionExpression, cache_dir: str
) -> UdfCacheCatalogEntry:
    """Construct a udf cache catalog entry from a given function expression.

    It is assumed that the function expression has already been bound using
    the binder. The entry's dependent udfs and columns are collected by
    traversing the expression tree, and the cache name is the function
    expression's signature.

    Args:
        func_expr (FunctionExpression): the function expression with which
            the cache is associated
        cache_dir (str): path to store the cache
    Returns:
        UdfCacheCatalogEntry: the udf cache catalog entry
    """
    cache_name = func_expr.signature()
    dependent_udfs = [
        expr.udf_obj.row_id for expr in func_expr.find_all(FunctionExpression)
    ]
    dependent_cols = [
        expr.col_object.row_id for expr in func_expr.find_all(TupleValueExpression)
    ]

    # Salt the hashed name with a random uuid so each cache gets a unique path.
    salted_hash = str(get_str_hash(cache_name + uuid.uuid4().hex))
    cache_path = str(Path(cache_dir) / Path(f"{salted_hash}_{func_expr.name}"))
    arg_signatures = tuple(child.signature() for child in func_expr.children)

    return UdfCacheCatalogEntry(
        name=cache_name,
        udf_id=func_expr.udf_obj.row_id,
        cache_path=cache_path,
        args=arg_signatures,
        udf_depends=dependent_udfs,
        col_depends=dependent_cols,
    )
257

258

259
def cleanup_storage(config):
    """Delete the contents of the index, cache, and dataset directories.

    Args:
        config: configuration object providing ``get_value(section, key)``.
    """
    for section, key in (
        ("storage", "index_dir"),
        ("storage", "cache_dir"),
        ("core", "datasets_dir"),
    ):
        remove_directory_contents(config.get_value(section, key))
263

264

265
def get_metadata_entry_or_val(
    udf_obj: UdfCatalogEntry, key: str, default_val: Any = None
) -> str:
    """Return the metadata value for the given key, or the default value if
    the key is not found.

    Args:
        udf_obj (UdfCatalogEntry): An object of type `UdfCatalogEntry` which
        is used to extract metadata information.
        key (str): The metadata key for which the corresponding value needs
        to be retrieved.
        default_val (Any): The default value to be returned if the metadata
        key is not found.

    Returns:
        str: metadata value
    """
    # First matching entry wins, mirroring a linear scan over the metadata.
    return next(
        (entry.value for entry in udf_obj.metadata if entry.key == key),
        default_val,
    )
285

286

287
def get_metadata_properties(udf_obj: UdfCatalogEntry) -> Dict:
    """Return all the metadata properties as key-value pairs.

    Args:
        udf_obj (UdfCatalogEntry): An object of type `UdfCatalogEntry` which
        is used to extract metadata information.

    Returns:
        Dict: key-value for each metadata entry. If a key occurs more than
        once, the last occurrence wins (same as the original loop).
    """
    return {metadata.key: metadata.value for metadata in udf_obj.metadata}
301

302

303
#### get catalog instance
304
# This function plays a crucial role in ensuring that different threads do
305
# not share the same catalog object, as it can result in serialization issues and
306
# incorrect behavior with SQLAlchemy. Therefore, whenever a catalog instance is
307
# required, we create a new one. One possible optimization is to share the catalog
308
# instance across all objects within the same thread. It is worth investigating whether
309
# SQLAlchemy already handles this optimization for us, which will be explored at a
310
# later time.
311
def get_catalog_instance(db_uri: str, config: ConfigurationManager):
    """Create and return a fresh CatalogManager for the given database URI.

    A new instance is created on every call; nothing is cached here.

    Args:
        db_uri (str): URI of the catalog database.
        config (ConfigurationManager): configuration passed to the manager.
    """
    # Imported locally rather than at module top level — presumably to avoid
    # a circular import with the catalog manager; confirm before hoisting.
    from evadb.catalog.catalog_manager import CatalogManager

    catalog = CatalogManager(db_uri, config)
    return catalog
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc