Merge pull request #797 from ftnext/feature/groq-support

ftnext · web-flow · commit c4cb90f6f8a1 · 2024-12-06T08:33:08.000+09:00
Support Groq whisper
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
@@ -44,16 +44,16 @@ jobs:
       - name: Install Python dependencies (Ubuntu, <=3.12)
         if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13'
         run: |
-          python -m pip install .[dev,audio,pocketsphinx,whisper-local,whisper-api]
+          python -m pip install .[dev,audio,pocketsphinx,whisper-local,whisper-api,groq]
       - name: Install Python dependencies (Ubuntu, 3.13)
         if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13'
         run: |
           python -m pip install standard-aifc setuptools
-          python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,whisper-api]
+          python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,whisper-api,groq]
       - name: Install Python dependencies (Windows)
         if: matrix.os == 'windows-latest'
         run: |
-          python -m pip install .[dev,whisper-local,whisper-api]
+          python -m pip install .[dev,whisper-local,whisper-api,groq]
       - name: Test with unittest
         run: |
           pytest --doctest-modules -v speech_recognition/recognizers/ tests/
diff --git a/README.rst b/README.rst
@@ -39,7 +39,8 @@ Speech recognition engine/API support:
 * `Tensorflow <https://www.tensorflow.org/>`__
 * `Vosk API <https://github.com/alphacep/vosk-api/>`__ (works offline)
 * `OpenAI whisper <https://github.com/openai/whisper>`__ (works offline)
-* `Whisper API <https://platform.openai.com/docs/guides/speech-to-text>`__
+* `OpenAI Whisper API <https://platform.openai.com/docs/guides/speech-to-text>`__
+* `Groq Whisper API <https://console.groq.com/docs/speech-text>`__
 
 **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.
 
@@ -96,7 +97,8 @@ To use all of the functionality of the library, you should have:
 * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X)
 * **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``)
 * **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``)
-* **openai** (required only if you need to use Whisper API speech recognition ``recognizer_instance.recognize_whisper_api``)
+* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_whisper_api``)
+* **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``)
 
 The following requirements are optional, but can improve or extend functionality in some situations:
 
@@ -171,15 +173,24 @@ Whisper is **required if and only if you want to use whisper** (``recognizer_ins
 
 You can install it with ``python3 -m pip install SpeechRecognition[whisper-local]``.
 
-Whisper API (for Whisper API users) 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+OpenAI Whisper API (for OpenAI Whisper API users) 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The library `openai <https://pypi.org/project/openai/>`__ is **required if and only if you want to use Whisper API** (``recognizer_instance.recognize_whisper_api``).
+The library `openai <https://pypi.org/project/openai/>`__ is **required if and only if you want to use OpenAI Whisper API** (``recognizer_instance.recognize_whisper_api``).
 
 If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_whisper_api`` will raise an ``RequestError``.
 
 You can install it with ``python3 -m pip install SpeechRecognition[whisper-api]``.
 
+Groq Whisper API (for Groq Whisper API users)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The library `groq <https://pypi.org/project/groq/>`__ is **required if and only if you want to use Groq Whisper API** (``recognizer_instance.recognize_groq``).
+
+If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_groq`` will raise a ``SetupError``.
+
+You can install it with ``python3 -m pip install SpeechRecognition[groq]``.
+
 Troubleshooting
 ---------------
 
diff --git a/setup.cfg b/setup.cfg
@@ -4,6 +4,7 @@ dev =
     rstcheck
     pytest
     pytest-randomly
+    respx
 audio =
     PyAudio >= 0.2.11
 pocketsphinx =
@@ -13,5 +14,9 @@ whisper-local =
     soundfile
 whisper-api =
     openai
+    httpx < 0.28
+groq =
+    groq
+    httpx < 0.28
 assemblyai =
     requests
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
@@ -1506,12 +1506,13 @@ def flush(self, *args, **kwargs):
 # At this time, the dependencies are not yet installed, resulting in a ModuleNotFoundError.
 # This is a workaround to resolve this issue
 try:
-    from .recognizers import google, whisper
+    from .recognizers import google, groq, whisper
 except (ModuleNotFoundError, ImportError):
     pass
 else:
     Recognizer.recognize_google = google.recognize_legacy
     Recognizer.recognize_whisper_api = whisper.recognize_whisper_api
+    Recognizer.recognize_groq = groq.recognize_groq
 
 
 # ===============================
diff --git a/speech_recognition/recognizers/groq.py b/speech_recognition/recognizers/groq.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import os
+from typing import Literal, TypedDict
+from typing_extensions import Unpack
+
+from speech_recognition.audio import AudioData
+from speech_recognition.exceptions import SetupError
+from speech_recognition.recognizers.whisper_api import (
+    OpenAICompatibleRecognizer,
+)
+
+# https://console.groq.com/docs/speech-text#supported-models
+GroqModel = Literal[
+    "whisper-large-v3-turbo", "whisper-large-v3", "distil-whisper-large-v3-en"
+]
+
+
+class GroqOptionalParameters(TypedDict, total=False):
+    """Groq speech transcription's optional parameters; every key may be omitted.
+
+    https://console.groq.com/docs/speech-text#transcription-endpoint-usage
+    """
+
+    prompt: str
+    response_format: str
+    temperature: float
+    language: str
+
+
+def recognize_groq(
+    recognizer,
+    audio_data: "AudioData",
+    *,
+    model: GroqModel = "whisper-large-v3-turbo",
+    **kwargs: Unpack[GroqOptionalParameters],
+) -> str:
+    """
+    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API, and returns the transcribed text.
+
+    This function requires login to Groq; visit https://console.groq.com/login, then generate an API key in the `API Keys <https://console.groq.com/keys>`__ menu and set it as the environment variable ``GROQ_API_KEY``.
+
+    ``model`` selects the Groq Whisper model; the optional keyword arguments (``prompt``, ``response_format``, ``temperature``, ``language``) are forwarded to the transcription request. Detail: https://console.groq.com/docs/speech-text
+
+    Raises a ``speech_recognition.exceptions.SetupError`` exception if there are any issues with the groq installation, or the environment variable is missing.
+    """
+    if os.environ.get("GROQ_API_KEY") is None:
+        raise SetupError("Set environment variable ``GROQ_API_KEY``")
+
+    try:
+        import groq
+    except ImportError:
+        raise SetupError(
+            "missing groq module: ensure that groq is set up correctly."
+        )
+
+    groq_recognizer = OpenAICompatibleRecognizer(groq.Groq())
+    return groq_recognizer.recognize(audio_data, model, **kwargs)
diff --git a/speech_recognition/recognizers/whisper_api.py b/speech_recognition/recognizers/whisper_api.py
@@ -0,0 +1,22 @@
+from io import BytesIO
+
+from speech_recognition.audio import AudioData
+
+
+class OpenAICompatibleRecognizer:
+    """Transcribes audio through a client that exposes ``audio.transcriptions.create``."""
+
+    def __init__(self, client) -> None:
+        self.client = client
+
+    def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str:
+        """Send ``audio_data`` as WAV to ``model`` and return the transcript text; ``kwargs`` go to the API call."""
+        if not isinstance(audio_data, AudioData):
+            message = "``audio_data`` must be an ``AudioData`` instance"
+            raise ValueError(message)
+
+        wav_file = BytesIO(audio_data.get_wav_data())
+        wav_file.name = "SpeechRecognition_audio.wav"
+        request = self.client.audio.transcriptions.create
+
+        return request(file=wav_file, model=model, **kwargs).text
diff --git a/tests/recognizers/test_groq.py b/tests/recognizers/test_groq.py
@@ -0,0 +1,33 @@
+from unittest.mock import MagicMock
+
+import httpx
+import respx
+
+from speech_recognition import AudioData, Recognizer
+from speech_recognition.recognizers import groq
+
+
+@respx.mock(assert_all_called=True, assert_all_mocked=True)
+def test_transcribe_with_groq_whisper(respx_mock, monkeypatch):
+    """recognize_groq returns the ``text`` field of the mocked Groq response."""
+    monkeypatch.setenv("GROQ_API_KEY", "gsk_grok_api_key")
+
+    # Stub the Groq transcription endpoint so no real HTTP request is made.
+    payload = {
+        "text": "Transcription by Groq Whisper",
+        "x_groq": {"id": "req_unique_id"},
+    }
+    endpoint = "https://api.groq.com/openai/v1/audio/transcriptions"
+    respx_mock.post(endpoint).mock(
+        return_value=httpx.Response(200, json=payload)
+    )
+
+    # A spec'd mock stands in for real audio and still passes isinstance checks.
+    fake_audio = MagicMock(spec=AudioData)
+    fake_audio.get_wav_data.return_value = b"audio_data"
+    fake_recognizer = MagicMock(spec=Recognizer)
+
+    transcript = groq.recognize_groq(
+        fake_recognizer, fake_audio, model="whisper-large-v3"
+    )
+    assert transcript == "Transcription by Groq Whisper"