Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions examples/audio_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,9 @@
AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "english.wav")
# AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "french.aiff")
# AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "chinese.flac")
audio = sr.AudioData.from_file(AUDIO_FILE)

# use the audio file as the audio source
r = sr.Recognizer()
with sr.AudioFile(AUDIO_FILE) as source:
audio = r.record(source) # read the entire audio file

# recognize speech using Sphinx
try:
Expand Down
4 changes: 1 addition & 3 deletions examples/extended_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,9 @@
AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "english.wav")
# AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "french.aiff")
# AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "chinese.flac")
audio = sr.AudioData.from_file(AUDIO_FILE)

# use the audio file as the audio source
r = sr.Recognizer()
with sr.AudioFile(AUDIO_FILE) as source:
audio = r.record(source) # read the entire audio file

# recognize speech using Sphinx
try:
Expand Down
7 changes: 2 additions & 5 deletions examples/special_recognizer_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,10 @@
from os import path
AUDIO_FILE_EN = path.join(path.dirname(path.realpath(__file__)), "english.wav")
AUDIO_FILE_FR = path.join(path.dirname(path.realpath(__file__)), "french.aiff")
audio_en = sr.AudioData.from_file(AUDIO_FILE_EN)
audio_fr = sr.AudioData.from_file(AUDIO_FILE_FR)

# use the audio file as the audio source
r = sr.Recognizer()
with sr.AudioFile(AUDIO_FILE_EN) as source:
audio_en = r.record(source) # read the entire audio file
with sr.AudioFile(AUDIO_FILE_FR) as source:
audio_fr = r.record(source) # read the entire audio file

# recognize keywords using Sphinx
try:
Expand Down
11 changes: 11 additions & 0 deletions speech_recognition/audio.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import aifc
import audioop
import io
Expand Down Expand Up @@ -31,6 +33,15 @@ def __init__(self, frame_data, sample_rate, sample_width):
self.sample_rate = sample_rate
self.sample_width = int(sample_width)

@classmethod
def from_file(cls, file_path: str) -> AudioData:
    """Build an ``AudioData`` instance from the contents of an audio file.

    The file at ``file_path`` is opened as an ``AudioFile`` source and read
    in full with ``Recognizer.record``; whatever that call produces is
    handed back to the caller unchanged.

    Note that ``cls`` is not consulted: the object returned is the one
    created by ``Recognizer.record``, even when this method is invoked on a
    subclass.

    :param file_path: path of the audio file to load
    :returns: the recorded audio for the whole file
    """
    # Imported at call time rather than at module level. NOTE(review):
    # presumably this avoids a circular import, since this module is itself
    # part of the ``speech_recognition`` package -- confirm.
    import speech_recognition as sr

    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as audio_source:
        recorded = recognizer.record(audio_source)
    return recorded

def get_segment(self, start_ms=None, end_ms=None):
"""
Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in.
Expand Down
5 changes: 1 addition & 4 deletions speech_recognition/recognizers/whisper_api/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,7 @@ def recognize(
parser.add_argument("-l", "--language")
args = parser.parse_args()

r = sr.Recognizer()
with sr.AudioFile(args.audio_file) as source:
audio_data = r.listen(source)

audio_data = sr.AudioData.from_file(args.audio_file)
if args.language:
transcription = recognize(
None, audio_data, model=args.model, language=args.language
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,6 @@ def recognize(
parser.add_argument("audio_file")
args = parser.parse_args()

r = sr.Recognizer()
with sr.AudioFile(args.audio_file) as source:
audio_data = r.listen(source)

audio_data = sr.AudioData.from_file(args.audio_file)
transcription = recognize(None, audio_data)
print(transcription)
46 changes: 15 additions & 31 deletions tests/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,32 +13,28 @@ def assertSimilar(self, bytes_1, bytes_2):
raise AssertionError("{} is really different from {} at index {}".format(bytes_1, bytes_2, i))

def test_get_segment(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-32-bit-44100Hz.wav")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-mono-32-bit-44100Hz.wav"))
self.assertEqual(audio.get_raw_data(), audio.get_segment().get_raw_data())
self.assertEqual(audio.get_raw_data()[8:], audio.get_segment(0.022675738 * 2).get_raw_data())
self.assertEqual(audio.get_raw_data()[:16], audio.get_segment(None, 0.022675738 * 4).get_raw_data())
self.assertEqual(audio.get_raw_data()[8:16], audio.get_segment(0.022675738 * 2, 0.022675738 * 4).get_raw_data())

def test_wav_mono_8_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-8-bit-44100Hz.wav")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-mono-8-bit-44100Hz.wav"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 1)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\xff\x00\xff\x00\xff\xff\x00\xff\x00\xff\x00\xff\x00\x00\xff\x00\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\xff")

def test_wav_mono_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.wav")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.wav"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\xff\xff\x01\x00\xff\xff\x00\x00\x01\x00\xfe\xff\x01\x00\xfe\xff\x04\x00\xfc\xff\x04\x00\xfe\xff\xff\xff\x03\x00\xfe\xff")

def test_wav_mono_24_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-24-bit-44100Hz.wav")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-mono-24-bit-44100Hz.wav"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
if audio.sample_width == 3:
Expand All @@ -47,33 +43,28 @@ def test_wav_mono_24_bit(self):
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x01\x00\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\xfe\xff\x00\x00\x01\x00")

def test_wav_mono_32_bit(self):
r = sr.Recognizer()
audio_file_path = path.join(path.dirname(path.realpath(__file__)), "audio-mono-32-bit-44100Hz.wav")
with sr.AudioFile(audio_file_path) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-mono-32-bit-44100Hz.wav"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 4)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x01\x00\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\xfe\xff\x00\x00\x01\x00")

def test_wav_stereo_8_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-8-bit-44100Hz.wav")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-8-bit-44100Hz.wav"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 1)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\xff\x00\xff\x00\x00\xff\x7f\x7f\x00\xff\x00\xff\x00\x00\xff\x00\x7f\x7f\x7f\x00\x00\xff\x00\xff\x00\xff\x00\x7f\x7f\x7f\x7f")

def test_wav_stereo_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.wav")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.wav"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\x02\x00\xfb\xff\x04\x00\xfe\xff\xfe\xff\x07\x00\xf6\xff\x07\x00\xf9\xff\t\x00\xf5\xff\x0c\x00\xf8\xff\x02\x00\x04\x00\xfa\xff")

def test_wav_stereo_24_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-24-bit-44100Hz.wav")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-24-bit-44100Hz.wav"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
if audio.sample_width == 3:
Expand All @@ -82,40 +73,35 @@ def test_wav_stereo_24_bit(self):
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xfe\xff\x00\x00\x02\x00\x00\x00\xfe\xff\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\xfc\xff\x00\x00\x02\x00")

def test_wav_stereo_32_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-32-bit-44100Hz.wav")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-32-bit-44100Hz.wav"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 4)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xfe\xff\x00\x00\x02\x00\x00\x00\xfe\xff\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\xfc\xff\x00\x00\x02\x00")

def test_aiff_mono_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.aiff")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.aiff"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\xff\xff\x01\x00\xff\xff\x01\x00\xfe\xff\x02\x00\xfd\xff\x04\x00\xfc\xff\x03\x00\x00\x00\xfe\xff\x03\x00\xfd\xff")

def test_aiff_stereo_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.aiff")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.aiff"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\xfe\xff\x02\x00\xfe\xff\xff\xff\x04\x00\xfa\xff\x04\x00\xfa\xff\t\x00\xf6\xff\n\x00\xfa\xff\xff\xff\x08\x00\xf5\xff")

def test_flac_mono_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.flac")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.flac"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\xff\xff\x01\x00\xff\xff\x00\x00\x01\x00\xfe\xff\x02\x00\xfc\xff\x06\x00\xf9\xff\x06\x00\xfe\xff\xfe\xff\x05\x00\xfa\xff")

def test_flac_mono_24_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-24-bit-44100Hz.flac")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-mono-24-bit-44100Hz.flac"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
if audio.sample_width == 3:
Expand All @@ -124,16 +110,14 @@ def test_flac_mono_24_bit(self):
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\xff\xfe\xff\x00\x02\x01\x00\x00\xfd\xfe\xff\x00\x04\x00\x00\x00\xfc\x00\x00\x00\x04\xfe\xff\x00\xfb\x00\x00")

def test_flac_stereo_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.flac")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.flac"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\xff\xff\xff\xff\x02\x00\xfe\xff\x00\x00\x01\x00\xfd\xff\x01\x00\xff\xff\x04\x00\xfa\xff\x05\x00\xff\xff\xfd\xff\x08\x00\xf6\xff")

def test_flac_stereo_24_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-24-bit-44100Hz.flac")) as source: audio = r.record(source)
audio = sr.AudioData.from_file(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-24-bit-44100Hz.flac"))
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
if audio.sample_width == 3:
Expand Down
18 changes: 9 additions & 9 deletions tests/test_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,56 +31,56 @@ def test_recognizer_attributes(self):

@unittest.skipIf(sys.platform.startswith("win"), "skip on Windows")
def test_sphinx_english(self):
audio = sr.AudioData.from_file(self.AUDIO_FILE_EN)
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_sphinx(audio), "one two three")

@unittest.skipUnless("WIT_AI_KEY" in os.environ, "requires Wit.ai key to be specified in WIT_AI_KEY environment variable")
def test_wit_english(self):
audio = sr.AudioData.from_file(self.AUDIO_FILE_EN)
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_wit(audio, key=os.environ["WIT_AI_KEY"]), "one two three")

@unittest.skipUnless("BING_KEY" in os.environ, "requires Microsoft Bing Voice Recognition key to be specified in BING_KEY environment variable")
def test_bing_english(self):
audio = sr.AudioData.from_file(self.AUDIO_FILE_EN)
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_bing(audio, key=os.environ["BING_KEY"]), "123.")

@unittest.skipUnless("BING_KEY" in os.environ, "requires Microsoft Bing Voice Recognition key to be specified in BING_KEY environment variable")
def test_bing_french(self):
audio = sr.AudioData.from_file(self.AUDIO_FILE_FR)
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source)
self.assertEqual(r.recognize_bing(audio, key=os.environ["BING_KEY"], language="fr-FR"), u"Essaye la dictée numéro un.")

@unittest.skipUnless("BING_KEY" in os.environ, "requires Microsoft Bing Voice Recognition key to be specified in BING_KEY environment variable")
def test_bing_chinese(self):
audio = sr.AudioData.from_file(self.AUDIO_FILE_ZH)
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_bing(audio, key=os.environ["BING_KEY"], language="zh-CN"), u"砸自己的脚。")

@unittest.skipUnless("HOUNDIFY_CLIENT_ID" in os.environ and "HOUNDIFY_CLIENT_KEY" in os.environ, "requires Houndify client ID and client key to be specified in HOUNDIFY_CLIENT_ID and HOUNDIFY_CLIENT_KEY environment variables")
def test_houndify_english(self):
audio = sr.AudioData.from_file(self.AUDIO_FILE_EN)
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_houndify(audio, client_id=os.environ["HOUNDIFY_CLIENT_ID"], client_key=os.environ["HOUNDIFY_CLIENT_KEY"]), "one two three")

@unittest.skipUnless("IBM_USERNAME" in os.environ and "IBM_PASSWORD" in os.environ, "requires IBM Speech to Text username and password to be specified in IBM_USERNAME and IBM_PASSWORD environment variables")
def test_ibm_english(self):
audio = sr.AudioData.from_file(self.AUDIO_FILE_EN)
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"]), "one two three ")

@unittest.skipUnless("IBM_USERNAME" in os.environ and "IBM_PASSWORD" in os.environ, "requires IBM Speech to Text username and password to be specified in IBM_USERNAME and IBM_PASSWORD environment variables")
def test_ibm_french(self):
audio = sr.AudioData.from_file(self.AUDIO_FILE_FR)
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source)
self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"], language="fr-FR"), u"si la dictée numéro un ")

@unittest.skipUnless("IBM_USERNAME" in os.environ and "IBM_PASSWORD" in os.environ, "requires IBM Speech to Text username and password to be specified in IBM_USERNAME and IBM_PASSWORD environment variables")
def test_ibm_chinese(self):
audio = sr.AudioData.from_file(self.AUDIO_FILE_ZH)
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"], language="zh-CN"), u"砸 自己 的 脚 ")


Expand Down
2 changes: 1 addition & 1 deletion tests/test_special_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ def setUp(self):

@unittest.skipIf(sys.platform.startswith("win"), "skip on Windows")
def test_sphinx_keywords(self):
audio = sr.AudioData.from_file(self.AUDIO_FILE_EN)
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_sphinx(audio, keyword_entries=[("one", 1.0), ("two", 1.0), ("three", 1.0)]), "three two one")
# pocketsphinx < 5 recognizes tree but pocketsphinx >= 5 ignores it (TODO need to research why)
self.assertEqual(r.recognize_sphinx(audio, keyword_entries=[("wan", 0.95), ("too", 1.0), ("tree", 1.0)]), "too wan")
Expand Down