13040707031

Committed 29 Jan 2025 09:23PM UTC coverage: 96.412% (+15.9%) from 80.557%

Build # 13040707031

Build Type

Pull #84

github

Specific Base 24322d

Committed by

kevdevg

Commit Message

feat: multimodal output for openai/litellm

Pull Request #84: feat: multimodal output audio for OpenAi and Litellm

Run Details

33 of 34 new or added lines in 3 files covered. (97.06%)

54 existing lines in 10 files now uncovered.

2472 of 2564 relevant lines covered (96.41%)

3.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.23

/scope3ai/tracers/utils/multimodal.py

import base64
import logging
from io import BytesIO

from scope3ai.api.types import ImpactRow
from scope3ai.api.typesgen import Image as RootImage
from scope3ai.tracers.utils.audio import MUTAGEN_MAPPING, _get_audio_duration


def aggregate_multimodal_image(content: dict, row: ImpactRow) -> None:
    """Record the pixel dimensions of an inline image on ``row``.

    Only ``data:`` URLs are handled: the base64 payload after the first
    comma is decoded, opened with Pillow, and its size is added to
    ``row.input_images`` as a ``"<width>x<height>"`` entry. Any other URL is
    ignored, because measuring it would require downloading the file.

    Args:
        content: Message part holding the image URL under
            ``content["image_url"]["url"]``.
        row: Impact row whose ``input_images`` list is created or extended.

    Raises:
        KeyError: If ``content`` lacks the ``image_url`` / ``url`` keys.
        ValueError: If a ``data:`` URL has no comma separating the header
            from the payload.

    Errors raised while decoding the payload or opening the image are not
    handled here and propagate to the caller.
    """
    url = content["image_url"]["url"]
    if not url.startswith("data:"):
        # TODO: not supported yet.
        # Should we actually download the file here just to have the size ??
        return

    # Imported lazily, and only on the path that needs it, so that remote
    # URLs (which are skipped) do not require Pillow to be importable.
    from PIL import Image

    # example: data:image/jpeg;base64,....
    # Only the payload is needed; the media-type header is discarded.
    _header, encoded = url.split(",", 1)
    # The context manager releases the image once its size has been read.
    with Image.open(BytesIO(base64.b64decode(encoded))) as image:
        width, height = image.size
    size = RootImage(root=f"{width}x{height}")

    if row.input_images is None:
        row.input_images = [size]
    else:
        row.input_images.append(size)


def aggregate_multimodal_audio(content: dict, row: ImpactRow) -> None:
    """Add the duration of an inline input audio clip to ``row``.

    The clip is read from ``content["input_audio"]``, which must provide a
    ``format`` present in ``MUTAGEN_MAPPING`` and base64-encoded ``data``.
    The decoded bytes are measured with ``_get_audio_duration`` and the
    result is accumulated into ``row.input_audio_seconds``. A falsy duration
    leaves ``row`` untouched.

    Args:
        content: Message part holding the ``input_audio`` mapping.
        row: Impact row whose ``input_audio_seconds`` is set or incremented.

    Raises:
        KeyError: If ``input_audio``, ``format`` or ``data`` is missing.
        ValueError: If the format is not a key of ``MUTAGEN_MAPPING``.
    """
    input_audio = content["input_audio"]
    audio_format = input_audio["format"]
    b64data = input_audio["data"]
    # Explicit check rather than ``assert`` so the validation still runs
    # when Python is started with -O.
    if audio_format not in MUTAGEN_MAPPING:
        raise ValueError(f"Unsupported audio format: {audio_format!r}")

    # decode the base64 data
    audio_data = base64.b64decode(b64data)
    # TODO: accept audio duration as float in AiApi
    duration = _get_audio_duration(audio_format, audio_data)
    if not duration:
        return

    if row.input_audio_seconds is None:
        row.input_audio_seconds = duration
    else:
        row.input_audio_seconds += duration


def aggregate_multimodal_audio_content_output(
    content: str, audio_format: str, row: ImpactRow
) -> None:
    """Add the duration of a generated audio clip to ``row``.

    ``content`` is base64-decoded, measured with ``_get_audio_duration`` and
    the result is accumulated into ``row.output_audio_seconds``. A falsy
    duration leaves ``row`` untouched.

    Args:
        content: Base64-encoded audio payload.
        audio_format: Audio format name; must be a key of
            ``MUTAGEN_MAPPING``.
        row: Impact row whose ``output_audio_seconds`` is set or incremented.

    Raises:
        ValueError: If ``audio_format`` is not a key of ``MUTAGEN_MAPPING``.
    """
    # Explicit check rather than ``assert`` so the validation still runs
    # when Python is started with -O.
    if audio_format not in MUTAGEN_MAPPING:
        raise ValueError(f"Unsupported audio format: {audio_format!r}")

    audio_data = base64.b64decode(content)
    duration = _get_audio_duration(audio_format, audio_data)
    if not duration:
        return

    if row.output_audio_seconds is None:
        row.output_audio_seconds = duration
    else:
        row.output_audio_seconds += duration


def aggregate_multimodal_content(
    content: dict, row: ImpactRow, logger: logging.Logger
) -> None:
    """Route one message part to the aggregator matching its ``type``.

    Parts typed ``"image_url"`` or ``"input_audio"`` are forwarded to the
    corresponding aggregator; every other type is ignored. This is a
    best-effort step: any exception raised along the way is reported through
    ``logger.error`` and not re-raised.

    Args:
        content: A single message part, read through ``content.get("type")``.
        row: Impact row handed to the selected aggregator.
        logger: Logger that receives the error message on failure.
    """
    try:
        kind = content.get("type")
        if kind == "input_audio":
            aggregate_multimodal_audio(content, row)
        elif kind == "image_url":
            aggregate_multimodal_image(content, row)
    except Exception as exc:
        logger.error(f"Error processing multimodal content: {exc}")


def aggregate_multimodal(message: dict, row: ImpactRow, logger: logging.Logger) -> None:
    """Aggregate every multimodal part of ``message`` into ``row``.

    The message's ``content`` entry (an empty list when absent) is inspected:
    when it is a list or tuple, each element is passed to
    ``aggregate_multimodal_content``; anything else is treated as plain text
    and skipped.

    Args:
        message: Chat message read through ``message.get("content", [])``.
        row: Impact row forwarded to the per-part aggregator.
        logger: Logger forwarded to the per-part aggregator.
    """
    parts = message.get("content", [])
    if not isinstance(parts, (list, tuple)):
        # Non-sequence content is just text: nothing multimodal to collect.
        return
    for part in parts:
        aggregate_multimodal_content(part, row, logger)

1	import base64	4✔
2	import logging	4✔
3	from io import BytesIO	4✔
4
5	from scope3ai.api.types import ImpactRow	4✔
6	from scope3ai.api.typesgen import Image as RootImage	4✔
7	from scope3ai.tracers.utils.audio import MUTAGEN_MAPPING, _get_audio_duration	4✔
8
9
10	def aggregate_multimodal_image(content: dict, row: ImpactRow) -> None:	4✔
11	from PIL import Image	4✔
12
13	url = content["image_url"]["url"]	4✔
14	if url.startswith("data:"):	4✔
15	# extract content type, and data part
16	# example: data:image/jpeg;base64,....
17	content_type, data = url.split(",", 1)	4✔
18	image_data = BytesIO(base64.b64decode(data))	4✔
19	image = Image.open(image_data)	4✔
20	width, height = image.size	4✔
21	size = RootImage(root=f"{width}x{height}")	4✔
22
23	if row.input_images is None:	4✔
24	row.input_images = [size]	4✔
25	else:
26	row.input_images.append(size)	4✔
27
28	else:
29	# TODO: not supported yet.
30	# Should we actually download the file here just to have the size ??
31	pass	1✔
32
33
34	def aggregate_multimodal_audio(content: dict, row: ImpactRow) -> None:	4✔
35	input_audio = content["input_audio"]	4✔
36	format = input_audio["format"]	4✔
37	b64data = input_audio["data"]	4✔
38	assert format in MUTAGEN_MAPPING	4✔
39
40	# decode the base64 data
41	audio_data = base64.b64decode(b64data)	4✔
42	# TODO: accept audio duration as float in AiApi
43	duration = _get_audio_duration(format, audio_data)	4✔
44
45	if duration:	4✔
46	if row.input_audio_seconds is None:	4✔
47	row.input_audio_seconds = duration	4✔
48	else:
49	row.input_audio_seconds += duration	4✔
50
51
52	def aggregate_multimodal_audio_content_output(	4✔
53	content: str, audio_format: str, row: ImpactRow
54	) -> None:
55	assert audio_format in MUTAGEN_MAPPING	4✔
56
57	audio_data = base64.b64decode(content)	4✔
58	duration = _get_audio_duration(audio_format, audio_data)	4✔
59	if duration:	4✔
60	if row.output_audio_seconds is None:	4✔
61	row.output_audio_seconds = duration	4✔
62	else:
NEW 63	row.output_audio_seconds += duration	×
64
65
66	def aggregate_multimodal_content(	4✔
67	content: dict, row: ImpactRow, logger: logging.Logger
68	) -> None:
69	try:	4✔
70	content_type = content.get("type")	4✔
71	if content_type == "image_url":	4✔
72	aggregate_multimodal_image(content, row)	4✔
73	elif content_type == "input_audio":	4✔
74	aggregate_multimodal_audio(content, row)	4✔
75	except Exception as e:	×
76	logger.error(f"Error processing multimodal content: {e}")	×
77
78
79	def aggregate_multimodal(message: dict, row: ImpactRow, logger: logging.Logger) -> None:	4✔
80	# if the message content is not a tuple/list, it's just text.
81	# so there is nothing multimodal in it, we can just forget about it.
82	content = message.get("content", [])	4✔
83	if isinstance(content, (tuple, list)):	4✔
84	for item in content:	4✔
85	aggregate_multimodal_content(item, row, logger)	4✔

scope3data / scope3ai-py / 13040707031

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous