• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

georgia-tech-db / eva / #758

04 Sep 2023 08:37PM UTC coverage: 0.0% (-78.3%) from 78.333%
#758

push

circle-ci

hershd23
Increased underline length at line 75 in text_summarization.rst
	modified:   docs/source/benchmarks/text_summarization.rst

0 of 11303 relevant lines covered (0.0%)

0.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/evadb/third_party/huggingface/create.py
1
# coding=utf-8
2
# Copyright 2018-2023 EvaDB
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
from typing import Dict, List, Type, Union
×
16

17
import numpy as np
×
18

19
from evadb.catalog.catalog_type import ColumnType, NdArrayType
×
20
from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry
×
21
from evadb.catalog.models.udf_metadata_catalog import UdfMetadataCatalogEntry
×
22
from evadb.third_party.huggingface.model import (
×
23
    ASRHFModel,
24
    AudioHFModel,
25
    HFInputTypes,
26
    ImageHFModel,
27
    TextHFModel,
28
)
29
from evadb.utils.generic_utils import try_to_import_transformers
×
30

31
"""
We currently support the following tasks from HuggingFace.
Each task is mapped to the type of input it expects.
"""
# Maps a HuggingFace pipeline task name to the HFInputTypes member that
# gen_sample_input() uses to fabricate a probe input for that task.
INPUT_TYPE_FOR_SUPPORTED_TASKS = {
    # Audio tasks
    "audio-classification": HFInputTypes.AUDIO,
    "automatic-speech-recognition": HFInputTypes.AUDIO,
    # Text tasks
    "text-classification": HFInputTypes.TEXT,
    "summarization": HFInputTypes.TEXT,
    "translation": HFInputTypes.TEXT,
    "text2text-generation": HFInputTypes.TEXT,
    "text-generation": HFInputTypes.TEXT,
    "ner": HFInputTypes.TEXT,
    # Image tasks
    "image-classification": HFInputTypes.IMAGE,
    "image-segmentation": HFInputTypes.IMAGE,
    "image-to-text": HFInputTypes.IMAGE,
    "object-detection": HFInputTypes.IMAGE,
    "depth-estimation": HFInputTypes.IMAGE,
}
50

51
# Maps each supported task to the EvaDB HF model wrapper class that knows how
# to feed it the corresponding input modality. Keys mirror
# INPUT_TYPE_FOR_SUPPORTED_TASKS.
MODEL_FOR_TASK = {
    # Audio tasks
    "audio-classification": AudioHFModel,
    "automatic-speech-recognition": ASRHFModel,
    # Text tasks
    "text-classification": TextHFModel,
    "summarization": TextHFModel,
    "translation": TextHFModel,
    "text2text-generation": TextHFModel,
    "text-generation": TextHFModel,
    "ner": TextHFModel,
    # Image tasks
    "image-classification": ImageHFModel,
    "image-segmentation": ImageHFModel,
    "image-to-text": ImageHFModel,
    "object-detection": ImageHFModel,
    "depth-estimation": ImageHFModel,
}
66

67

68
def sample_text():
    """Return a fixed English sentence used as a dummy text input."""
    dummy_sentence = "My name is Sarah and I live in London"
    return dummy_sentence
70

71

72
def sample_image():
    """Return a 224x224 white RGB image with a centered yellow circle.

    Used as a dummy input to probe image-based HuggingFace pipelines.
    """
    from PIL import Image, ImageDraw

    width, height = 224, 224
    canvas = Image.new("RGB", (width, height), "white")
    painter = ImageDraw.Draw(canvas)

    # The circle's diameter spans half of the smaller image dimension.
    radius = min(width, height) // 4
    center_x, center_y = width // 2, height // 2
    bounding_box = (
        center_x - radius,
        center_y - radius,
        center_x + radius,
        center_y + radius,
    )
    painter.ellipse(bounding_box, fill="yellow")
    return canvas
89

90

91
def sample_audio():
    """Return one second of random audio samples at a 16 kHz sample rate.

    Values are uniform floats in [0, 1); used as a dummy input to probe
    audio-based HuggingFace pipelines.
    """
    duration_ms = 1000
    sample_rate_hz = 16000
    total_samples = duration_ms * sample_rate_hz // 1000
    return np.random.rand(total_samples)
96

97

98
def gen_sample_input(input_type: HFInputTypes):
    """Generate a dummy model input for the given HuggingFace input type.

    Args:
        input_type: one of HFInputTypes.TEXT / IMAGE / AUDIO.

    Returns:
        A sample text string, PIL image, or numpy audio array, matching
        what the corresponding pipeline expects.

    Raises:
        ValueError: if `input_type` is not a supported HFInputTypes member.
    """
    if input_type == HFInputTypes.TEXT:
        return sample_text()
    elif input_type == HFInputTypes.IMAGE:
        return sample_image()
    elif input_type == HFInputTypes.AUDIO:
        return sample_audio()
    # `assert False` is stripped under `python -O`, which would let an
    # unsupported type fall through and return None; raise explicitly.
    raise ValueError(f"Invalid Input Type {input_type} for UDF")
106

107

108
def infer_output_name_and_type(**pipeline_args):
    """
    Infer the name and type for each output of the HuggingFace UDF

    Constructs the pipeline from `pipeline_args`, runs it on a generated
    sample input, and maps each key of the sample output to the python
    type of its value.
    """
    assert "task" in pipeline_args, "Task Not Found In Model Definition"
    task = pipeline_args["task"]
    assert (
        task in INPUT_TYPE_FOR_SUPPORTED_TASKS
    ), f"Task {task} not supported in EvaDB currently"

    # Construct the pipeline; transformers is imported lazily since it is
    # an optional dependency.
    try_to_import_transformers()
    from transformers import pipeline

    pipe = pipeline(**pipeline_args)

    # Probe the pipeline with a dummy input to obtain a sample output.
    input_type = INPUT_TYPE_FOR_SUPPORTED_TASKS[task]
    model_output = pipe(gen_sample_input(input_type))

    # Pipelines may return either a list of dicts or a single dict;
    # a single element is representative of the output schema.
    sample_out = model_output[0] if isinstance(model_output, list) else model_output

    # Map every output field name to the python type of its value.
    output_types = {key: type(value) for key, value in sample_out.items()}

    return input_type, output_types
140

141

142
def io_entry_for_inputs(udf_name: str, udf_input: Union[str, List]):
    """
    Generates the IO Catalog Entry for the inputs to HF UDFs
    Input is one of ["text", "image", "audio", "video", "multimodal"]

    Args:
        udf_name: name of the UDF; used to prefix each input column name.
        udf_input: a single HFInputTypes value or a list of them.

    Returns:
        A list of UdfIOCatalogEntry objects, one per input type.
    """
    if isinstance(udf_input, HFInputTypes):
        udf_input = [udf_input]
    inputs = []
    for input_type in udf_input:
        array_type = NdArrayType.ANYTYPE
        if input_type == HFInputTypes.TEXT:
            array_type = NdArrayType.STR
        # BUG FIX: the audio branch previously compared the whole
        # `udf_input` list (`udf_input == HFInputTypes.AUDIO`), which is
        # always False at this point, so AUDIO inputs incorrectly fell
        # through to ANYTYPE. Compare the loop variable instead.
        elif input_type in (HFInputTypes.IMAGE, HFInputTypes.AUDIO):
            array_type = NdArrayType.FLOAT32
        inputs.append(
            UdfIOCatalogEntry(
                name=f"{udf_name}_{input_type}",
                type=ColumnType.NDARRAY,
                is_nullable=False,
                array_type=array_type,
                is_input=True,
            )
        )
    return inputs
166

167

168
def ptype_to_ndarray_type(col_type: type):
    """
    Helper function that maps python types to ndarray types
    """
    # Types without an explicit mapping fall back to ANYTYPE.
    python_to_ndarray = {
        str: NdArrayType.STR,
        float: NdArrayType.FLOAT32,
    }
    return python_to_ndarray.get(col_type, NdArrayType.ANYTYPE)
178

179

180
def io_entry_for_outputs(udf_outputs: Dict[str, Type]):
    """
    Generates the IO Catalog Entry for the output

    Each (column name, python type) pair becomes one UdfIOCatalogEntry
    whose array type is derived via ptype_to_ndarray_type.
    """
    return [
        UdfIOCatalogEntry(
            name=col_name,
            type=ColumnType.NDARRAY,
            array_type=ptype_to_ndarray_type(col_type),
            is_input=False,
        )
        for col_name, col_type in udf_outputs.items()
    ]
195

196

197
def gen_hf_io_catalog_entries(udf_name: str, metadata: List[UdfMetadataCatalogEntry]):
    """
    Generates IO Catalog Entries for a HuggingFace UDF.
    The attributes of the huggingface model can be extracted from metadata.
    """
    # Metadata rows are key/value pairs forming the pipeline arguments.
    pipeline_args = {entry.key: entry.value for entry in metadata}
    udf_input, udf_output = infer_output_name_and_type(**pipeline_args)
    annotated_inputs = io_entry_for_inputs(udf_name, udf_input)
    annotated_outputs = io_entry_for_outputs(udf_output)
    return annotated_inputs + annotated_outputs
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc