Merged
19 changes: 8 additions & 11 deletions speech/cloud-client/README.rst
@@ -221,25 +221,22 @@ To run this sample:

$ python beta_snippets.py

usage: beta_snippets.py [-h] command path [first] [second]
usage: beta_snippets.py [-h] command

Google Cloud Speech API sample that demonstrates enhanced models
and recognition metadata.

Example usage:
python beta_snippets.py enhanced-model resources/commercial_mono.wav
python beta_snippets.py metadata resources/commercial_mono.wav
python beta_snippets.py punctuation resources/commercial_mono.wav
python beta_snippets.py diarization resources/commercial_mono.wav
python beta_snippets.py multi-channel resources/commercial_mono.wav
python beta_snippets.py multi-language resources/multi.wav en-US es
python beta_snippets.py word-level-conf resources/commercial_mono.wav
python beta_snippets.py enhanced-model
python beta_snippets.py metadata
python beta_snippets.py punctuation
python beta_snippets.py diarization
python beta_snippets.py multi-channel
python beta_snippets.py multi-language
python beta_snippets.py word-level-conf

positional arguments:
command
path File for audio file to be recognized
first First language in audio file to be recognized
second Second language in audio file to be recognized

optional arguments:
-h, --help show this help message and exit
98 changes: 50 additions & 48 deletions speech/cloud-client/beta_snippets.py
@@ -18,34 +18,34 @@
and recognition metadata.

Example usage:
python beta_snippets.py enhanced-model resources/commercial_mono.wav
python beta_snippets.py metadata resources/commercial_mono.wav
python beta_snippets.py punctuation resources/commercial_mono.wav
python beta_snippets.py diarization resources/commercial_mono.wav
python beta_snippets.py multi-channel resources/commercial_mono.wav
python beta_snippets.py multi-language resources/multi.wav en-US es
python beta_snippets.py word-level-conf resources/commercial_mono.wav
python beta_snippets.py enhanced-model
python beta_snippets.py metadata
python beta_snippets.py punctuation
python beta_snippets.py diarization
python beta_snippets.py multi-channel
python beta_snippets.py multi-language
python beta_snippets.py word-level-conf
"""

import argparse
import io


def transcribe_file_with_enhanced_model(speech_file):
def transcribe_file_with_enhanced_model():
"""Transcribe the given audio file using an enhanced model."""
# [START speech_transcribe_file_with_enhanced_model]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with io.open(speech_file, 'rb') as audio_file:
content = audio_file.read()

audio = speech.types.RecognitionAudio(content=content)
config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=8000,
language_code='en-US',
# Enhanced models are only available to projects that
# opt in for audio data collection.
@@ -63,14 +63,13 @@ def transcribe_file_with_enhanced_model(speech_file):
# [END speech_transcribe_file_with_enhanced_model]


def transcribe_file_with_metadata(speech_file):
def transcribe_file_with_metadata():
"""Send a request that includes recognition metadata."""
# [START speech_transcribe_file_with_metadata]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with io.open(speech_file, 'rb') as audio_file:
content = audio_file.read()
@@ -94,6 +93,7 @@ def transcribe_file_with_metadata(speech_file):
audio = speech.types.RecognitionAudio(content=content)
config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=8000,
language_code='en-US',
# Add this in the request to send metadata.
metadata=metadata)
@@ -108,21 +108,21 @@ def transcribe_file_with_metadata(speech_file):
# [END speech_transcribe_file_with_metadata]


def transcribe_file_with_auto_punctuation(speech_file):
def transcribe_file_with_auto_punctuation():
"""Transcribe the given audio file with auto punctuation enabled."""
# [START speech_transcribe_file_with_auto_punctuation]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with io.open(speech_file, 'rb') as audio_file:
content = audio_file.read()

audio = speech.types.RecognitionAudio(content=content)
config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=8000,
language_code='en-US',
# Enable automatic punctuation
enable_automatic_punctuation=True)
@@ -137,14 +137,13 @@ def transcribe_file_with_auto_punctuation(speech_file):
# [END speech_transcribe_file_with_auto_punctuation]


def transcribe_file_with_diarization(speech_file):
def transcribe_file_with_diarization():
"""Transcribe the given audio file synchronously with diarization."""
# [START speech_transcribe_diarization]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/commercial_mono.wav'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
@@ -153,30 +152,42 @@ def transcribe_file_with_diarization(speech_file):

config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=8000,
language_code='en-US',
enable_speaker_diarization=True,
diarization_speaker_count=2)

print('Waiting for operation to complete...')
Review comment (Contributor): Consider using the Python logging facility. Understandably, for this sample it might be overkill, so take it or leave it.

Review comment (Contributor): FWIW, nearly all our other Python samples do print(). It's true that it's not always the recommended practice in production, but it's easy to understand. With logging there's always the risk that the developer has some weird config where the logs end up where they don't expect.
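
A minimal sketch of the logging-based variant the first comment suggests (illustrative only; the sample itself keeps print()):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Plays the role of print('Waiting for operation to complete...'):
    logger.info('Waiting for operation to complete...')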

response = client.recognize(config, audio)

# response.results contains partial results with the last item
# containing the entire result:
Review comment (Contributor): Mm... not quite. The transcript within each result is separate and sequential per result. However, the words list within an alternative (for whatever reason) includes all the words from all the results thus far. Thus, to get all the words with speaker tags, you only have to take the words list from the last result.

Review comment (Contributor Author): I see. Thanks for the clarification. Let me update the comment.

Review comment (Contributor): I'm not really understanding your comment, @jerjou, but this sounds like something that needs to be documented in the cloud.google.com docs, with a briefer explanation in the sample itself.
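
To illustrate the behavior described above (a sketch based on the reviewer's description, reusing the response object from the snippet; not code from this PR):

    # Each result carries its own sequential chunk of the transcript...
    for result in response.results:
        print(result.alternatives[0].transcript)

    # ...but the words list of the last result accumulates every word
    # (with its speaker tag) seen so far, so reading it once suffices:
    all_words = response.results[-1].alternatives[0].words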

result = response.results[-1]
Review comment (Contributor): A comment here explaining why you're only taking the last result (instead of all of them) would probably be helpful.

Review comment (Contributor Author): Good idea. Adding it.


words_info = result.alternatives[0].words
pieces = ['%s (%s)' % (word_info.word, word_info.speaker_tag)
for word_info in words_info]
print(' '.join(pieces))

# Separating the words by who said what:
speakers_words = []
for word_info in words_info:
if speakers_words and speakers_words[-1][0] == word_info.speaker_tag:
Review comment (Contributor): If I understand correctly what this piece of the for loop is doing (building a list of words per speaker tag), isn't it better to use a hash map? Does this loop do what it's supposed to do?

Review comment (Contributor): Jerjou is already reviewing this code, so I should stop reviewing. You are already in great hands :)

speakers_words[-1][1].append(word_info.word)
else:
speakers_words.append((word_info.speaker_tag, [word_info.word, ]))
Review comment (Contributor): This is a bit hard to read. An intermediate variable and a namedtuple would go a long way toward making this clearer:

    import collections

    Speaker = collections.namedtuple('Speaker', ['tag', 'words'])
    speaker_words = [Speaker(tag=0, words=[])]
    for word_info in words_info:
        current_speaker = speaker_words[-1]
        if current_speaker.tag == word_info.speaker_tag:
            current_speaker.words.append(word_info.word)
        else:
            speaker_words.append(
                Speaker(tag=word_info.speaker_tag, words=[word_info.word]))

Review comment (Contributor): Also, maybe speaker_sequence or something, to make it clear it's not just a speaker->words_they_spoke mapping, and is actually the conversation / words spoken in sequence.

Review comment (Contributor Author, @happyhuman, Jul 20, 2018): Interesting idea. However, I think the readability gain mainly comes from the intermediate Speaker(tag=0, words=[]) placed as the first element in the list, which lets us define current_speaker every time. The downside is that we are introducing a new object that is not returned by the API, and we will have to handle it separately in the next step (either by removing it or by skipping it), which reduces the code's cleanness in another way. So, while I am not against the suggested solution, I am also not sure it really helps readability that much. What do you think, @jerjou?

Review comment (Contributor Author, @happyhuman, Jul 20, 2018): What about this:

    speakers = []
    words = []
    for word_info in words_info:
        if (not speakers) or speakers[-1] != word_info.speaker_tag:
            speakers.append(word_info.speaker_tag)
            words.append([])
        words[-1].append(word_info.word)

I think this is more readable than the current code, without introducing the intermediate variable.
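
Yet another option (a sketch, not code from this PR): itertools.groupby groups consecutive words by speaker tag, which keeps the turn order of the conversation that a plain dict keyed by speaker would lose:

    import itertools

    # Each group is one uninterrupted turn by a single speaker.
    for tag, turn in itertools.groupby(
            words_info, key=lambda info: info.speaker_tag):
        print('Speaker #{}: {}'.format(
            tag, ' '.join(info.word for info in turn)))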


# Printing the output based on who said what:
for speaker_tag, words in speakers_words:
print('Speaker #{}: {}'.format(speaker_tag, ' '.join(words)))
# [END speech_transcribe_diarization]


def transcribe_file_with_multichannel(speech_file):
def transcribe_file_with_multichannel():
"""Transcribe the given audio file synchronously with
multi channel."""
# [START speech_transcribe_multichannel]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/Google_Gnome.wav'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
@@ -201,17 +212,16 @@ def transcribe_file_with_multichannel(speech_file):
# [END speech_transcribe_multichannel]


def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
def transcribe_file_with_multilanguage():
"""Transcribe the given audio file synchronously with
multi language."""
# [START speech_transcribe_multilanguage]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
# first_lang = first language code, e.g. 'en-US'
# second_lang = second language code, e.g. 'es'
speech_file = 'resources/multi.wav'
first_lang = 'en-US'
second_lang = 'es'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
@@ -220,6 +230,7 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):

config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=44100,
audio_channel_count=2,
language_code=first_lang,
alternative_language_codes=[second_lang])
@@ -235,15 +246,14 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
# [END speech_transcribe_multilanguage]


def transcribe_file_with_word_level_confidence(speech_file):
def transcribe_file_with_word_level_confidence():
"""Transcribe the given audio file synchronously with
word level confidence."""
# [START speech_transcribe_word_level_confidence]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()

# TODO(developer): Uncomment and set to a path to your audio file.
# speech_file = 'path/to/file.wav'
speech_file = 'resources/Google_Gnome.wav'

with open(speech_file, 'rb') as audio_file:
content = audio_file.read()
@@ -273,28 +283,20 @@ def transcribe_file_with_word_level_confidence(speech_file):
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('command')
parser.add_argument(
'path', help='File for audio file to be recognized')
parser.add_argument(
'first', help='First language in audio file to be recognized',
nargs='?')
parser.add_argument(
'second', help='Second language in audio file to be recognized',
nargs='?')

args = parser.parse_args()

if args.command == 'enhanced-model':
transcribe_file_with_enhanced_model(args.path)
transcribe_file_with_enhanced_model()
elif args.command == 'metadata':
transcribe_file_with_metadata(args.path)
transcribe_file_with_metadata()
elif args.command == 'punctuation':
transcribe_file_with_auto_punctuation(args.path)
transcribe_file_with_auto_punctuation()
elif args.command == 'diarization':
transcribe_file_with_diarization(args.path)
transcribe_file_with_diarization()
elif args.command == 'multi-channel':
transcribe_file_with_multichannel(args.path)
transcribe_file_with_multichannel()
elif args.command == 'multi-language':
transcribe_file_with_multilanguage(args.path, args.first, args.second)
Review comment (Contributor): For future reference, argparse's sub-commands feature would be helpful to avoid having args that only matter for one command or another.

Review comment (Contributor Author): Sounds good.
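
A minimal sketch of that sub-commands approach (hypothetical; not part of this PR), so that only the commands that need language arguments define them:

    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    subparsers = parser.add_subparsers(dest='command')

    # Commands that take no extra arguments:
    subparsers.add_parser('diarization')

    # Only this sub-command declares language arguments:
    multi = subparsers.add_parser('multi-language')
    multi.add_argument('first', help='First language code, e.g. en-US')
    multi.add_argument('second', help='Second language code, e.g. es')

    args = parser.parse_args()
    print(args.command)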

transcribe_file_with_multilanguage()
elif args.command == 'word-level-conf':
transcribe_file_with_word_level_confidence(args.path)
transcribe_file_with_word_level_confidence()
23 changes: 8 additions & 15 deletions speech/cloud-client/beta_snippets_test.py
@@ -26,56 +26,49 @@


def test_transcribe_file_with_enhanced_model(capsys):
transcribe_file_with_enhanced_model(
os.path.join(RESOURCES, 'commercial_mono.wav'))
transcribe_file_with_enhanced_model()
out, _ = capsys.readouterr()

assert 'Chrome' in out


def test_transcribe_file_with_metadata(capsys):
transcribe_file_with_metadata(
os.path.join(RESOURCES, 'commercial_mono.wav'))
transcribe_file_with_metadata()
out, _ = capsys.readouterr()

assert 'Chrome' in out


def test_transcribe_file_with_auto_punctuation(capsys):
transcribe_file_with_auto_punctuation(
os.path.join(RESOURCES, 'commercial_mono.wav'))
transcribe_file_with_auto_punctuation()
out, _ = capsys.readouterr()

assert 'Okay. Sure.' in out


def test_transcribe_diarization(capsys):
transcribe_file_with_diarization(
os.path.join(RESOURCES, 'commercial_mono.wav'))
transcribe_file_with_diarization()
out, err = capsys.readouterr()

assert "I'm (1) here (1) hi (2)" in out
assert "Speaker #1: I'm here" in out


def test_transcribe_multichannel_file(capsys):
transcribe_file_with_multichannel(
os.path.join(RESOURCES, 'Google_Gnome.wav'))
transcribe_file_with_multichannel()
out, err = capsys.readouterr()

assert 'OK Google stream stranger things from Netflix to my TV' in out


def test_transcribe_multilanguage_file(capsys):
transcribe_file_with_multilanguage(
os.path.join(RESOURCES, 'multi.wav'), 'en-US', 'es')
transcribe_file_with_multilanguage()
out, err = capsys.readouterr()

assert 'how are you doing estoy bien e tu' in out


def test_transcribe_word_level_confidence(capsys):
transcribe_file_with_word_level_confidence(
os.path.join(RESOURCES, 'Google_Gnome.wav'))
transcribe_file_with_word_level_confidence()
out, err = capsys.readouterr()

assert 'OK Google stream stranger things from Netflix to my TV' in out
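
These are ordinary pytest functions (note the capsys fixture), so assuming GOOGLE_APPLICATION_CREDENTIALS is set and the resources/ files are present, something like the following should run them:

    $ pytest beta_snippets_test.py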