From 8bad51d8f90ae97ddd76e42b23d5d804203b97e3 Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 6 Dec 2024 08:38:15 +0900 Subject: [PATCH 01/10] [refactor] Distinguish Whisper between OpenAI and Groq --- tests/recognizers/{test_whisper.py => test_openai.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/recognizers/{test_whisper.py => test_openai.py} (100%) diff --git a/tests/recognizers/test_whisper.py b/tests/recognizers/test_openai.py similarity index 100% rename from tests/recognizers/test_whisper.py rename to tests/recognizers/test_openai.py From 21acf32a02ea3db68e4b3b23a6fafe6d2a088bfd Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 6 Dec 2024 08:45:49 +0900 Subject: [PATCH 02/10] [test] Implementation with RESPX --- tests/recognizers/test_openai.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/recognizers/test_openai.py b/tests/recognizers/test_openai.py index e84d0503..09a3f633 100644 --- a/tests/recognizers/test_openai.py +++ b/tests/recognizers/test_openai.py @@ -1,10 +1,34 @@ from unittest import TestCase from unittest.mock import MagicMock, patch +import httpx +import respx + from speech_recognition import AudioData, Recognizer from speech_recognition.recognizers import whisper +@respx.mock(assert_all_called=True, assert_all_mocked=True) +def test_transcribe_with_openai_whisper(respx_mock, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "sk_openai_api_key") + + respx_mock.post("https://api.openai.com/v1/audio/transcriptions").mock( + return_value=httpx.Response( + 200, + json={"text": "Transcription by OpenAI Whisper"}, + ) + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"audio_data" + + actual = whisper.recognize_whisper_api( + MagicMock(spec=Recognizer), audio_data, model="whisper-1" + ) + + assert actual == "Transcription by OpenAI Whisper" + + @patch("speech_recognition.recognizers.whisper.os.environ") 
@patch("speech_recognition.recognizers.whisper.BytesIO") @patch("openai.OpenAI") From ee7234d942c620410357aa2a364b1f211eb835df Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 6 Dec 2024 08:52:43 +0900 Subject: [PATCH 03/10] [refactor] Use OpenAICompatibleRecognizer --- speech_recognition/recognizers/whisper.py | 22 +++++++++++----------- tests/recognizers/test_openai.py | 2 ++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/speech_recognition/recognizers/whisper.py b/speech_recognition/recognizers/whisper.py index 31a8e43e..d4d63923 100644 --- a/speech_recognition/recognizers/whisper.py +++ b/speech_recognition/recognizers/whisper.py @@ -1,19 +1,25 @@ from __future__ import annotations import os -from io import BytesIO +from typing import Literal from speech_recognition.audio import AudioData from speech_recognition.exceptions import SetupError +from speech_recognition.recognizers.whisper_api import ( + OpenAICompatibleRecognizer, +) + +# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-model +WhisperModel = Literal["whisper-1"] def recognize_whisper_api( recognizer, audio_data: "AudioData", *, - model: str = "whisper-1", + model: WhisperModel = "whisper-1", api_key: str | None = None, -): +) -> str: """ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API. @@ -23,8 +29,6 @@ def recognize_whisper_api( Raises a ``speech_recognition.exceptions.SetupError`` exception if there are any issues with the openai installation, or the environment variable is missing. """ - if not isinstance(audio_data, AudioData): - raise ValueError("``audio_data`` must be an ``AudioData`` instance") if api_key is None and os.environ.get("OPENAI_API_KEY") is None: raise SetupError("Set environment variable ``OPENAI_API_KEY``") @@ -35,9 +39,5 @@ def recognize_whisper_api( "missing openai module: ensure that openai is set up correctly." 
) - wav_data = BytesIO(audio_data.get_wav_data()) - wav_data.name = "SpeechRecognition_audio.wav" - - client = openai.OpenAI(api_key=api_key) - transcript = client.audio.transcriptions.create(file=wav_data, model=model) - return transcript.text + recognizer = OpenAICompatibleRecognizer(openai.OpenAI(api_key=api_key)) + return recognizer.recognize(audio_data, model) diff --git a/tests/recognizers/test_openai.py b/tests/recognizers/test_openai.py index 09a3f633..bd4ed1c6 100644 --- a/tests/recognizers/test_openai.py +++ b/tests/recognizers/test_openai.py @@ -2,6 +2,7 @@ from unittest.mock import MagicMock, patch import httpx +import pytest import respx from speech_recognition import AudioData, Recognizer @@ -29,6 +30,7 @@ def test_transcribe_with_openai_whisper(respx_mock, monkeypatch): assert actual == "Transcription by OpenAI Whisper" +@pytest.mark.skip @patch("speech_recognition.recognizers.whisper.os.environ") @patch("speech_recognition.recognizers.whisper.BytesIO") @patch("openai.OpenAI") From 26e5165f267463f0619c06d571597887c24a53da Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 7 Dec 2024 12:53:04 +0900 Subject: [PATCH 04/10] [test] Refine RESPX mock --- tests/recognizers/test_openai.py | 48 ++++---------------------------- 1 file changed, 6 insertions(+), 42 deletions(-) diff --git a/tests/recognizers/test_openai.py b/tests/recognizers/test_openai.py index bd4ed1c6..55135a4d 100644 --- a/tests/recognizers/test_openai.py +++ b/tests/recognizers/test_openai.py @@ -1,8 +1,6 @@ -from unittest import TestCase -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import httpx -import pytest import respx from speech_recognition import AudioData, Recognizer @@ -13,7 +11,11 @@ def test_transcribe_with_openai_whisper(respx_mock, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "sk_openai_api_key") - respx_mock.post("https://api.openai.com/v1/audio/transcriptions").mock( + respx_mock.post( + 
"https://api.openai.com/v1/audio/transcriptions", + headers__contains={"Authorization": "Bearer sk_openai_api_key"}, + data__contains={"model": "whisper-1"}, + ).mock( return_value=httpx.Response( 200, json={"text": "Transcription by OpenAI Whisper"}, @@ -28,41 +30,3 @@ def test_transcribe_with_openai_whisper(respx_mock, monkeypatch): ) assert actual == "Transcription by OpenAI Whisper" - - -@pytest.mark.skip -@patch("speech_recognition.recognizers.whisper.os.environ") -@patch("speech_recognition.recognizers.whisper.BytesIO") -@patch("openai.OpenAI") -class RecognizeWhisperApiTestCase(TestCase): - def test_recognize_default_arguments(self, OpenAI, BytesIO, environ): - client = OpenAI.return_value - transcript = client.audio.transcriptions.create.return_value - - recognizer = MagicMock(spec=Recognizer) - audio_data = MagicMock(spec=AudioData) - - actual = whisper.recognize_whisper_api(recognizer, audio_data) - - self.assertEqual(actual, transcript.text) - audio_data.get_wav_data.assert_called_once_with() - BytesIO.assert_called_once_with(audio_data.get_wav_data.return_value) - OpenAI.assert_called_once_with(api_key=None) - client.audio.transcriptions.create.assert_called_once_with( - file=BytesIO.return_value, model="whisper-1" - ) - - def test_recognize_pass_arguments(self, OpenAI, BytesIO, environ): - client = OpenAI.return_value - - recognizer = MagicMock(spec=Recognizer) - audio_data = MagicMock(spec=AudioData) - - _ = whisper.recognize_whisper_api( - recognizer, audio_data, model="x-whisper", api_key="OPENAI_API_KEY" - ) - - OpenAI.assert_called_once_with(api_key="OPENAI_API_KEY") - client.audio.transcriptions.create.assert_called_once_with( - file=BytesIO.return_value, model="x-whisper" - ) From ee849562d42b14f8d210a8052b482fb4c57861e5 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 7 Dec 2024 12:55:16 +0900 Subject: [PATCH 05/10] [test] OpenAI provides whisper-1 only --- tests/recognizers/test_openai.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
diff --git a/tests/recognizers/test_openai.py b/tests/recognizers/test_openai.py index 55135a4d..6b43f5d3 100644 --- a/tests/recognizers/test_openai.py +++ b/tests/recognizers/test_openai.py @@ -26,7 +26,8 @@ def test_transcribe_with_openai_whisper(respx_mock, monkeypatch): audio_data.get_wav_data.return_value = b"audio_data" actual = whisper.recognize_whisper_api( - MagicMock(spec=Recognizer), audio_data, model="whisper-1" + MagicMock(spec=Recognizer), audio_data ) assert actual == "Transcription by OpenAI Whisper" + audio_data.get_wav_data.assert_called_once() From bbf6753a01da27049789fa2f000b2e9885f54992 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 7 Dec 2024 13:10:45 +0900 Subject: [PATCH 06/10] [feat] Partially support OpenAI's optional parameters --- speech_recognition/recognizers/whisper.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/speech_recognition/recognizers/whisper.py b/speech_recognition/recognizers/whisper.py index d4d63923..0ab22dc6 100644 --- a/speech_recognition/recognizers/whisper.py +++ b/speech_recognition/recognizers/whisper.py @@ -3,6 +3,8 @@ import os from typing import Literal +from typing_extensions import TypedDict, Unpack + from speech_recognition.audio import AudioData from speech_recognition.exceptions import SetupError from speech_recognition.recognizers.whisper_api import ( @@ -13,12 +15,27 @@ WhisperModel = Literal["whisper-1"] +class OpenAIOptionalParameters(TypedDict, total=False): + """OpenAI speech transcription's optional parameters.
+ + https://platform.openai.com/docs/api-reference/audio/createTranscription + """ + + language: str + prompt: str + # TODO Add support `Literal["text", "srt", "verbose_json", "vtt"]` + response_format: Literal["json"] + temperature: float + # timestamp_granularities # TODO support + + def recognize_whisper_api( recognizer, audio_data: "AudioData", *, model: WhisperModel = "whisper-1", api_key: str | None = None, + **kwargs: Unpack[OpenAIOptionalParameters], ) -> str: """ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API. @@ -40,4 +57,4 @@ def recognize_whisper_api( ) recognizer = OpenAICompatibleRecognizer(openai.OpenAI(api_key=api_key)) - return recognizer.recognize(audio_data, model) + return recognizer.recognize(audio_data, model, **kwargs) From 822a7c63e2c0e6290734d1d8b0481db076e10a48 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 7 Dec 2024 13:11:11 +0900 Subject: [PATCH 07/10] [chore] Ignore .python-version --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5fc9aabb..93e8d09f 100644 --- a/.gitignore +++ b/.gitignore @@ -88,7 +88,7 @@ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: -# .python-version +.python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
From 26f58d166f69528eb95e337575016f496f91d217 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 7 Dec 2024 13:16:41 +0900 Subject: [PATCH 08/10] [refactor] Rename OpenAI Whisper module --- speech_recognition/__init__.py | 4 ++-- speech_recognition/recognizers/{whisper.py => openai.py} | 2 +- tests/recognizers/test_openai.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) rename speech_recognition/recognizers/{whisper.py => openai.py} (98%) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 238d5e50..d4d45555 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1506,12 +1506,12 @@ def flush(self, *args, **kwargs): # At this time, the dependencies are not yet installed, resulting in a ModuleNotFoundError. # This is a workaround to resolve this issue try: - from .recognizers import google, groq, whisper + from .recognizers import google, openai, groq except (ModuleNotFoundError, ImportError): pass else: Recognizer.recognize_google = google.recognize_legacy - Recognizer.recognize_whisper_api = whisper.recognize_whisper_api + Recognizer.recognize_whisper_api = openai.recognize_openai Recognizer.recognize_groq = groq.recognize_groq diff --git a/speech_recognition/recognizers/whisper.py b/speech_recognition/recognizers/openai.py similarity index 98% rename from speech_recognition/recognizers/whisper.py rename to speech_recognition/recognizers/openai.py index 0ab22dc6..277d2900 100644 --- a/speech_recognition/recognizers/whisper.py +++ b/speech_recognition/recognizers/openai.py @@ -29,7 +29,7 @@ class OpenAIOptionalParameters: # timestamp_granularities # TODO support -def recognize_whisper_api( +def recognize_openai( recognizer, audio_data: "AudioData", *, diff --git a/tests/recognizers/test_openai.py b/tests/recognizers/test_openai.py index 6b43f5d3..c5ddaeca 100644 --- a/tests/recognizers/test_openai.py +++ b/tests/recognizers/test_openai.py @@ -4,7 +4,7 @@ import respx from speech_recognition 
import AudioData, Recognizer -from speech_recognition.recognizers import whisper +from speech_recognition.recognizers import openai @respx.mock(assert_all_called=True, assert_all_mocked=True) @@ -25,7 +25,7 @@ def test_transcribe_with_openai_whisper(respx_mock, monkeypatch): audio_data = MagicMock(spec=AudioData) audio_data.get_wav_data.return_value = b"audio_data" - actual = whisper.recognize_whisper_api( + actual = openai.recognize_openai( MagicMock(spec=Recognizer), audio_data ) From 8ba44ef778b571e645fa8d4a012a9789e79b7dc7 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 7 Dec 2024 13:18:18 +0900 Subject: [PATCH 09/10] [refactor] Clear name --- speech_recognition/__init__.py | 2 +- speech_recognition/recognizers/openai.py | 2 +- tests/recognizers/test_openai.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index d4d45555..7e2736b9 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1511,7 +1511,7 @@ def flush(self, *args, **kwargs): pass else: Recognizer.recognize_google = google.recognize_legacy - Recognizer.recognize_whisper_api = openai.recognize_openai + Recognizer.recognize_whisper_api = openai.recognize Recognizer.recognize_groq = groq.recognize_groq diff --git a/speech_recognition/recognizers/openai.py b/speech_recognition/recognizers/openai.py index 277d2900..79843d69 100644 --- a/speech_recognition/recognizers/openai.py +++ b/speech_recognition/recognizers/openai.py @@ -29,7 +29,7 @@ class OpenAIOptionalParameters: # timestamp_granularities # TODO support -def recognize_openai( +def recognize( recognizer, audio_data: "AudioData", *, diff --git a/tests/recognizers/test_openai.py b/tests/recognizers/test_openai.py index c5ddaeca..21c2b04e 100644 --- a/tests/recognizers/test_openai.py +++ b/tests/recognizers/test_openai.py @@ -25,9 +25,7 @@ def test_transcribe_with_openai_whisper(respx_mock, monkeypatch): audio_data = 
MagicMock(spec=AudioData) audio_data.get_wav_data.return_value = b"audio_data" - actual = openai.recognize_openai( - MagicMock(spec=Recognizer), audio_data - ) + actual = openai.recognize(MagicMock(spec=Recognizer), audio_data) assert actual == "Transcription by OpenAI Whisper" audio_data.get_wav_data.assert_called_once() From 81f48a23254b4e82c06095aadfe07567b4500d6b Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 7 Dec 2024 13:24:18 +0900 Subject: [PATCH 10/10] [feat] Rename to recognize_openai (deprecate recognize_whisper_api) This library currently supports OpenAI and Groq. recognize_whisper_api becomes confusing --- README.rst | 6 +++--- examples/microphone_recognition.py | 2 +- reference/library-reference.rst | 4 ++-- speech_recognition/__init__.py | 3 ++- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index 337f6873..69beebcf 100644 --- a/README.rst +++ b/README.rst @@ -97,7 +97,7 @@ To use all of the functionality of the library, you should have: * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X) * **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``) * **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``) -* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_whisper_api``) +* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_openai``) * **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``) The following requirements are optional, but can improve or extend functionality in some situations: @@ -176,9 +176,9 @@ You can install it with ``python3 -m pip install SpeechRecognition[whisper-local OpenAI Whisper API (for OpenAI Whisper API users) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The 
library `openai `__ is **required if and only if you want to use OpenAI Whisper API** (``recognizer_instance.recognize_whisper_api``). +The library `openai `__ is **required if and only if you want to use OpenAI Whisper API** (``recognizer_instance.recognize_openai``). -If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_whisper_api`` will raise an ``RequestError``. +If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_openai`` will raise an ``RequestError``. You can install it with ``python3 -m pip install SpeechRecognition[whisper-api]``. diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py index a5d8a688..38ef95bd 100644 --- a/examples/microphone_recognition.py +++ b/examples/microphone_recognition.py @@ -96,6 +96,6 @@ # recognize speech using Whisper API OPENAI_API_KEY = "INSERT OPENAI API KEY HERE" try: - print(f"Whisper API thinks you said {r.recognize_whisper_api(audio, api_key=OPENAI_API_KEY)}") + print(f"Whisper API thinks you said {r.recognize_openai(audio, api_key=OPENAI_API_KEY)}") except sr.RequestError as e: print(f"Could not request results from Whisper API; {e}") diff --git a/reference/library-reference.rst b/reference/library-reference.rst index e8b6c7e0..e245e819 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -314,8 +314,8 @@ You can translate the result to english with Whisper by passing translate=True Other values are passed directly to whisper. 
See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options -``recognizer_instance.recognize_whisper_api(audio_data: AudioData, model: str = "whisper-1", api_key: str | None = None)`` --------------------------------------------------------------------------------------------------------------------------- +``recognizer_instance.recognize_openai(audio_data: AudioData, model: str = "whisper-1", api_key: str | None = None)`` +--------------------------------------------------------------------------------------------------------------------- Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API. diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 7e2736b9..94345ccb 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1511,7 +1511,8 @@ def flush(self, *args, **kwargs): pass else: Recognizer.recognize_google = google.recognize_legacy - Recognizer.recognize_whisper_api = openai.recognize + Recognizer.recognize_openai = openai.recognize + Recognizer.recognize_whisper_api = openai.recognize # Deprecated Recognizer.recognize_groq = groq.recognize_groq