
deepset-ai / haystack / build 13972131258

20 Mar 2025 02:43PM UTC coverage: 90.021% (-0.03%) from 90.054%

Pull Request #9069: refactor!: `ChatMessage` serialization-deserialization updates
Merge 8371761b0 into 67ab3788e (github / web-flow)

9833 of 10923 relevant lines covered (90.02%)
0.9 hits per line

Source File: haystack/components/audio/whisper_local.py (92.06% covered)
Uncovered lines in this file: the two RuntimeError raises guarding a missing warm_up() call, and the ByteStream tempfile fallback in _raw_transcribe.
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import tempfile
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Union, get_args

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
from haystack.utils import ComponentDevice

with LazyImport("Run 'pip install \"openai-whisper>=20231106\"' to install whisper.") as whisper_import:
    import whisper

WhisperLocalModel = Literal[
    "base",
    "base.en",
    "large",
    "large-v1",
    "large-v2",
    "large-v3",
    "medium",
    "medium.en",
    "small",
    "small.en",
    "tiny",
    "tiny.en",
]


@component
class LocalWhisperTranscriber:
    """
    Transcribes audio files using OpenAI's Whisper model on your local machine.

    For the supported audio formats, languages, and other parameters, see the
    [Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
    [GitHub repository](https://github.com/openai/whisper).

    ### Usage example

    ```python
    from haystack.components.audio import LocalWhisperTranscriber

    whisper = LocalWhisperTranscriber(model="small")
    whisper.warm_up()
    transcription = whisper.run(sources=["path/to/audio/file"])
    ```
    """

    def __init__(
        self,
        model: WhisperLocalModel = "large",
        device: Optional[ComponentDevice] = None,
        whisper_params: Optional[Dict[str, Any]] = None,
    ):
        """
        Creates an instance of the LocalWhisperTranscriber component.

        :param model:
            The name of the model to use. Set to one of the following models:
            "tiny", "base", "small", "medium", "large" (default).
            For details on the models and their variants, see the
            [Whisper documentation](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages).
        :param device:
            The device for loading the model. If `None`, automatically selects the default device.
        :param whisper_params:
            Default keyword arguments to pass to the Whisper `transcribe` call. Can be overridden
            per call through the `whisper_params` argument of `run`.
        """
        whisper_import.check()
        if model not in get_args(WhisperLocalModel):
            raise ValueError(
                f"Model name '{model}' not recognized. Choose one among: {', '.join(get_args(WhisperLocalModel))}."
            )
        self.model = model
        self.whisper_params = whisper_params or {}
        self.device = ComponentDevice.resolve_device(device)
        self._model = None

    def warm_up(self) -> None:
        """
        Loads the model in memory.
        """
        if not self._model:
            self._model = whisper.load_model(self.model, device=self.device.to_torch())

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(self, model=self.model, device=self.device.to_dict(), whisper_params=self.whisper_params)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LocalWhisperTranscriber":
        """
        Deserializes the component from a dictionary.

        :param data:
            The dictionary to deserialize from.
        :returns:
            The deserialized component.
        """
        init_params = data["init_parameters"]
        if init_params.get("device") is not None:
            init_params["device"] = ComponentDevice.from_dict(init_params["device"])
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(self, sources: List[Union[str, Path, ByteStream]], whisper_params: Optional[Dict[str, Any]] = None):
        """
        Transcribes a list of audio files into a list of documents.

        For the supported audio formats, languages, and other parameters, see the
        [Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
        [GitHub repository](https://github.com/openai/whisper).

        :param sources:
            A list of paths or binary streams to transcribe.

        :returns: A dictionary with the following keys:
            - `documents`: A list of documents where each document is a transcribed audio file. The content of
                the document is the transcription text, and the document's metadata contains the values returned by
                the Whisper model, such as the alignment data and the path to the audio file used
                for the transcription.
        """
        if self._model is None:
            raise RuntimeError(
                "The component LocalWhisperTranscriber was not warmed up. Run 'warm_up()' before calling 'run()'."
            )

        if whisper_params is None:
            whisper_params = self.whisper_params

        documents = self.transcribe(sources, **whisper_params)
        return {"documents": documents}

    def transcribe(self, sources: List[Union[str, Path, ByteStream]], **kwargs) -> List[Document]:
        """
        Transcribes the audio files into a list of Documents, one for each input file.

        For the supported audio formats, languages, and other parameters, see the
        [Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
        [GitHub repository](https://github.com/openai/whisper).

        :param sources:
            A list of paths or binary streams to transcribe.
        :returns:
            A list of Documents, one for each file.
        """
        transcriptions = self._raw_transcribe(sources, **kwargs)
        documents = []
        for path, transcript in transcriptions.items():
            content = transcript.pop("text")
            doc = Document(content=content, meta={"audio_file": path, **transcript})
            documents.append(doc)
        return documents

    def _raw_transcribe(self, sources: List[Union[str, Path, ByteStream]], **kwargs) -> Dict[Path, Any]:
        """
        Transcribes the given audio files. Returns the output of the model, a dictionary, for each input file.

        For the supported audio formats, languages, and other parameters, see the
        [Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
        [GitHub repository](https://github.com/openai/whisper).

        :param sources:
            A list of paths or binary streams to transcribe.
        :returns:
            A dictionary mapping 'file_path' to 'transcription'.
        """
        if self._model is None:
            raise RuntimeError("Model is not loaded, please run 'warm_up()' before calling 'run()'")

        return_segments = kwargs.pop("return_segments", False)
        transcriptions = {}

        for source in sources:
            path = Path(source) if not isinstance(source, ByteStream) else source.meta.get("file_path")

            if isinstance(source, ByteStream) and path is None:
                with tempfile.NamedTemporaryFile(delete=False) as fp:
                    path = Path(fp.name)
                    source.to_file(path)

            transcription = self._model.transcribe(str(path), **kwargs)

            if not return_segments:
                transcription.pop("segments", None)

            transcriptions[path] = transcription

        return transcriptions
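
The file above fully defines the component's public API, so a short end-to-end sketch may help tie the pieces together. This is illustrative only: `audio/sample.wav` is a hypothetical path, and it assumes `openai-whisper` is installed locally.

```python
from haystack.components.audio import LocalWhisperTranscriber
from haystack.dataclasses import ByteStream

# "audio/sample.wav" is a placeholder path, not a file shipped with Haystack.
transcriber = LocalWhisperTranscriber(model="tiny", whisper_params={"language": "en"})
transcriber.warm_up()  # loads the Whisper weights; run() raises a RuntimeError without this

# Per-call whisper_params replace (not merge with) the defaults passed at init.
result = transcriber.run(sources=["audio/sample.wav"], whisper_params={"return_segments": True})
for doc in result["documents"]:
    print(doc.content)               # the transcription text
    print(doc.meta["audio_file"])    # path of the transcribed file
    print(doc.meta.get("segments"))  # alignment data, kept because return_segments=True

# Binary streams also work; without a "file_path" entry in meta, the component
# writes the bytes to a temporary file before transcribing.
with open("audio/sample.wav", "rb") as f:
    stream = ByteStream(data=f.read(), meta={"file_path": "audio/sample.wav"})
result = transcriber.run(sources=[stream])

# Serialization round-trip: the device is stored as a dict and restored by from_dict.
restored = LocalWhisperTranscriber.from_dict(transcriber.to_dict())
assert restored.model == "tiny"
```

Note that a per-call `whisper_params` replaces the init-time defaults wholesale rather than merging with them, so pass every option you need on each call.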