• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

georgia-tech-db / eva / #758

04 Sep 2023 08:37PM UTC coverage: 0.0% (-78.3%) from 78.333%
#758

push

circle-ci

hershd23
Increased underline length at line 75 in text_summarization.rst
	modified:   docs/source/benchmarks/text_summarization.rst

0 of 11303 relevant lines covered (0.0%)

0.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/evadb/catalog/catalog_utils.py
1
# coding=utf-8
2
# Copyright 2018-2023 EvaDB
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
import uuid
×
16
from pathlib import Path
×
17
from typing import Any, Dict, List
×
18

19
from evadb.catalog.catalog_type import (
×
20
    ColumnType,
21
    DocumentColumnName,
22
    ImageColumnName,
23
    NdArrayType,
24
    PDFColumnName,
25
    TableType,
26
    VideoColumnName,
27
)
28
from evadb.catalog.models.utils import (
×
29
    ColumnCatalogEntry,
30
    TableCatalogEntry,
31
    UdfCacheCatalogEntry,
32
    UdfCatalogEntry,
33
)
34
from evadb.catalog.sql_config import IDENTIFIER_COLUMN
×
35
from evadb.configuration.configuration_manager import ConfigurationManager
×
36
from evadb.expression.function_expression import FunctionExpression
×
37
from evadb.expression.tuple_value_expression import TupleValueExpression
×
38
from evadb.parser.create_statement import ColConstraintInfo, ColumnDefinition
×
39
from evadb.utils.generic_utils import get_str_hash, remove_directory_contents
×
40

41

42
def is_video_table(table: TableCatalogEntry):
    """Return True if the given table catalog entry stores video data."""
    return TableType.VIDEO_DATA == table.table_type
44

45

46
def is_document_table(table: TableCatalogEntry):
    """Return True if the given table catalog entry stores document data."""
    return TableType.DOCUMENT_DATA == table.table_type
48

49

50
def is_pdf_table(table: TableCatalogEntry):
    """Return True if the given table catalog entry stores PDF data."""
    return TableType.PDF_DATA == table.table_type
52

53

54
def is_string_col(col: ColumnCatalogEntry):
    """Return True if the column holds string data.

    A column is string-valued either when its type is TEXT or when it is
    an ndarray column whose array type is STR.
    """
    if col.type == ColumnType.TEXT:
        return True
    return col.array_type == NdArrayType.STR
56

57

58
def get_video_table_column_definitions() -> List[ColumnDefinition]:
    """Return the column definitions used for a video table.

    Columns:
        name: video path (unique per table)
        id: frame id
        data: frame data (3-dimensional uint8 ndarray)
        seconds: frame timestamp in seconds

    Returns:
        List[ColumnDefinition]: the video table columns.
    """
    columns = [
        ColumnDefinition(
            VideoColumnName.name.name,
            ColumnType.TEXT,
            None,
            None,
            ColConstraintInfo(unique=True),
        ),
        ColumnDefinition(VideoColumnName.id.name, ColumnType.INTEGER, None, None),
        ColumnDefinition(
            VideoColumnName.data.name,
            ColumnType.NDARRAY,
            NdArrayType.UINT8,
            (None, None, None),
        ),
        # NOTE(review): `seconds` passes [] for dimensions while the other
        # scalar columns pass None — confirm whether that asymmetry is
        # intentional before normalizing it.
        ColumnDefinition(VideoColumnName.seconds.name, ColumnType.FLOAT, None, []),
    ]
    return columns
83

84

85
def get_image_table_column_definitions() -> List[ColumnDefinition]:
    """Return the column definitions used for an image table.

    Columns:
        name: image path (unique per table)
        data: image decoded data (3-dimensional uint8 ndarray)
    """
    name_column = ColumnDefinition(
        ImageColumnName.name.name,
        ColumnType.TEXT,
        None,
        None,
        ColConstraintInfo(unique=True),
    )
    data_column = ColumnDefinition(
        ImageColumnName.data.name,
        ColumnType.NDARRAY,
        NdArrayType.UINT8,
        (None, None, None),
    )
    return [name_column, data_column]
106

107

108
def get_document_table_column_definitions() -> List[ColumnDefinition]:
    """Return the column definitions used for a document table.

    Columns:
        name: file path (unique per table)
        chunk_id: chunk id (0-indexed for each file)
        data: text data associated with the chunk
    """
    return [
        ColumnDefinition(
            DocumentColumnName.name.name,
            ColumnType.TEXT,
            None,
            None,
            ColConstraintInfo(unique=True),
        ),
        ColumnDefinition(
            DocumentColumnName.chunk_id.name, ColumnType.INTEGER, None, None
        ),
        ColumnDefinition(DocumentColumnName.data.name, ColumnType.TEXT, None, None),
    ]
133

134

135
def get_pdf_table_column_definitions() -> List[ColumnDefinition]:
    """Return the column definitions used for a PDF table.

    Columns:
        name: pdf name
        page: page no
        paragraph: paragraph no
        data: pdf paragraph data
    """
    text = ColumnType.TEXT
    integer = ColumnType.INTEGER
    return [
        ColumnDefinition(PDFColumnName.name.name, text, None, None),
        ColumnDefinition(PDFColumnName.page.name, integer, None, None),
        ColumnDefinition(PDFColumnName.paragraph.name, integer, None, None),
        ColumnDefinition(PDFColumnName.data.name, text, None, None),
    ]
154

155

156
def get_table_primary_columns(
    table_catalog_obj: TableCatalogEntry,
) -> List[ColumnDefinition]:
    """Get the primary columns for a table based on its table type.

    Every table type gets `_row_id`; for video, PDF, and document tables a
    second column (frame id, paragraph no, and chunk id, respectively) is
    also part of the unique key.

    Args:
        table_catalog_obj (TableCatalogEntry): The table catalog object.

    Returns:
        List[ColumnDefinition]: The list of primary columns for the table.
    """
    primary_columns = [
        ColumnDefinition(IDENTIFIER_COLUMN, ColumnType.INTEGER, None, None)
    ]
    # Name of the secondary unique-key column for each media table type.
    extra_key_columns = {
        TableType.VIDEO_DATA: VideoColumnName.id.name,
        TableType.PDF_DATA: PDFColumnName.paragraph.name,
        TableType.DOCUMENT_DATA: DocumentColumnName.chunk_id.name,
    }
    extra_name = extra_key_columns.get(table_catalog_obj.table_type)
    if extra_name is not None:
        primary_columns.append(
            ColumnDefinition(extra_name, ColumnType.INTEGER, None, None)
        )
    return primary_columns
195

196

197
def xform_column_definitions_to_catalog_entries(
    col_list: List[ColumnDefinition],
) -> List[ColumnCatalogEntry]:
    """Create column catalog entries for the input parsed column list.

    Args:
        col_list (List[ColumnDefinition]): parsed columns to convert.

    Returns:
        List[ColumnCatalogEntry]: one catalog entry per input column, in
        the same order as the input.
    """
    return [
        ColumnCatalogEntry(
            name=col.name,
            type=col.type,
            array_type=col.array_type,
            array_dimensions=col.dimension,
            is_nullable=col.cci.nullable,
        )
        for col in col_list
    ]
219

220

221
def construct_udf_cache_catalog_entry(
    func_expr: FunctionExpression, cache_dir: str
) -> UdfCacheCatalogEntry:
    """Construct a udf cache catalog entry from a given function expression.

    It is assumed that the function expression has already been bound using
    the binder. The entry's dependent udfs and columns are collected by
    traversing the expression tree, and the cache name is the function
    expression's signature.

    Args:
        func_expr (FunctionExpression): the function expression with which
            the cache is associated
        cache_dir (str): path to store the cache
    Returns:
        UdfCacheCatalogEntry: the udf cache catalog entry
    """
    cache_name = func_expr.signature()
    dependent_udfs = [
        expr.udf_obj.row_id for expr in func_expr.find_all(FunctionExpression)
    ]
    dependent_cols = [
        expr.col_object.row_id for expr in func_expr.find_all(TupleValueExpression)
    ]

    # Salt the hashed name with a random uuid so each cache gets a unique path.
    salted_hash = str(get_str_hash(cache_name + uuid.uuid4().hex))
    cache_path = str(Path(cache_dir) / Path(f"{salted_hash}_{func_expr.name}"))
    arg_signatures = tuple(child.signature() for child in func_expr.children)

    return UdfCacheCatalogEntry(
        name=cache_name,
        udf_id=func_expr.udf_obj.row_id,
        cache_path=cache_path,
        args=arg_signatures,
        udf_depends=dependent_udfs,
        col_depends=dependent_cols,
    )
257

258

259
def cleanup_storage(config):
    """Delete the contents of the index, cache, and dataset directories.

    Args:
        config: configuration object providing ``get_value(section, key)``.
    """
    for section, key in (
        ("storage", "index_dir"),
        ("storage", "cache_dir"),
        ("core", "datasets_dir"),
    ):
        remove_directory_contents(config.get_value(section, key))
263

264

265
def get_metadata_entry_or_val(
    udf_obj: UdfCatalogEntry, key: str, default_val: Any = None
) -> str:
    """Return the metadata value for the given key, or the default value if
    the key is not found.

    Args:
        udf_obj (UdfCatalogEntry): An object of type `UdfCatalogEntry` which
        is used to extract metadata information.
        key (str): The metadata key for which the corresponding value needs
        to be retrieved.
        default_val (Any): The default value to be returned if the metadata
        key is not found.

    Returns:
        str: metadata value
    """
    # First matching entry wins, mirroring a linear scan over the metadata.
    return next(
        (entry.value for entry in udf_obj.metadata if entry.key == key),
        default_val,
    )
285

286

287
def get_metadata_properties(udf_obj: UdfCatalogEntry) -> Dict:
    """Return all the metadata properties as key-value pairs.

    Args:
        udf_obj (UdfCatalogEntry): An object of type `UdfCatalogEntry` which
        is used to extract metadata information.

    Returns:
        Dict: key-value for each metadata entry. If a key occurs more than
        once, the last occurrence wins (same as the original loop).
    """
    return {metadata.key: metadata.value for metadata in udf_obj.metadata}
301

302

303
#### get catalog instance
304
# This function plays a crucial role in ensuring that different threads do
305
# not share the same catalog object, as it can result in serialization issues and
306
# incorrect behavior with SQLAlchemy. Therefore, whenever a catalog instance is
307
# required, we create a new one. One possible optimization is to share the catalog
308
# instance across all objects within the same thread. It is worth investigating whether
309
# SQLAlchemy already handles this optimization for us, which will be explored at a
310
# later time.
311
def get_catalog_instance(db_uri: str, config: ConfigurationManager):
    """Create and return a fresh CatalogManager for the given database URI.

    A new instance is created on every call; nothing is cached here.

    Args:
        db_uri (str): URI of the catalog database.
        config (ConfigurationManager): configuration passed to the manager.
    """
    # Imported locally rather than at module top level — presumably to avoid
    # a circular import with the catalog manager; confirm before hoisting.
    from evadb.catalog.catalog_manager import CatalogManager

    catalog = CatalogManager(db_uri, config)
    return catalog
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc