Diarization Output Modified #1586
Changes from 12 commits to beta_snippets.py:
@@ -18,27 +18,26 @@
 and recognition metadata.

 Example usage:
-    python beta_snippets.py enhanced-model resources/commercial_mono.wav
-    python beta_snippets.py metadata resources/commercial_mono.wav
-    python beta_snippets.py punctuation resources/commercial_mono.wav
-    python beta_snippets.py diarization resources/commercial_mono.wav
-    python beta_snippets.py multi-channel resources/commercial_mono.wav
-    python beta_snippets.py multi-language resources/multi.wav en-US es
-    python beta_snippets.py word-level-conf resources/commercial_mono.wav
+    python beta_snippets.py enhanced-model
+    python beta_snippets.py metadata
+    python beta_snippets.py punctuation
+    python beta_snippets.py diarization
+    python beta_snippets.py multi-channel
+    python beta_snippets.py multi-language
+    python beta_snippets.py word-level-conf
 """

 import argparse
 import io


-def transcribe_file_with_enhanced_model(speech_file):
+def transcribe_file_with_enhanced_model():
     """Transcribe the given audio file using an enhanced model."""
     # [START speech_transcribe_file_with_enhanced_model]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

+    # TODO(developer): Uncomment and set to a path to your audio file.
+    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/commercial_mono.wav'

     with io.open(speech_file, 'rb') as audio_file:
         content = audio_file.read()

@@ -64,14 +63,13 @@ def transcribe_file_with_enhanced_model(speech_file):
     # [END speech_transcribe_file_with_enhanced_model]


-def transcribe_file_with_metadata(speech_file):
+def transcribe_file_with_metadata():
     """Send a request that includes recognition metadata."""
     # [START speech_transcribe_file_with_metadata]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

+    # TODO(developer): Uncomment and set to a path to your audio file.
+    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/commercial_mono.wav'

     with io.open(speech_file, 'rb') as audio_file:
         content = audio_file.read()

@@ -110,14 +108,13 @@ def transcribe_file_with_metadata(speech_file):
     # [END speech_transcribe_file_with_metadata]


-def transcribe_file_with_auto_punctuation(speech_file):
+def transcribe_file_with_auto_punctuation():
     """Transcribe the given audio file with auto punctuation enabled."""
     # [START speech_transcribe_file_with_auto_punctuation]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

+    # TODO(developer): Uncomment and set to a path to your audio file.
+    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/commercial_mono.wav'

     with io.open(speech_file, 'rb') as audio_file:
         content = audio_file.read()

@@ -140,14 +137,13 @@ def transcribe_file_with_auto_punctuation(speech_file):
     # [END speech_transcribe_file_with_auto_punctuation]


-def transcribe_file_with_diarization(speech_file):
+def transcribe_file_with_diarization():
     """Transcribe the given audio file synchronously with diarization."""
     # [START speech_transcribe_diarization]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

+    # TODO(developer): Uncomment and set to a path to your audio file.
+    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/commercial_mono.wav'

     with open(speech_file, 'rb') as audio_file:
         content = audio_file.read()

@@ -156,33 +152,36 @@ def transcribe_file_with_diarization(speech_file):

     config = speech.types.RecognitionConfig(
         encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
-        sample_rate_hertz=16000,
+        sample_rate_hertz=8000,
         language_code='en-US',
         enable_speaker_diarization=True,
         diarization_speaker_count=2)

     print('Waiting for operation to complete...')
     response = client.recognize(config, audio)

-    for i, result in enumerate(response.results):
-        alternative = result.alternatives[0]
-        print('-' * 20)
-        print('First alternative of result {}: {}'
-              .format(i, alternative.transcript))
-        print('Speaker Tag for the first word: {}'
-              .format(alternative.words[0].speaker_tag))
+    # The transcript within each result is separate and sequential per result.
+    # However, the words list within an alternative includes all the words
+    # from all the results thus far. Thus, to get all the words with speaker
+    # tags, you only have to take the words list from the last result:
+    result = response.results[-1]
Contributor: A comment here explaining why you're only taking the last result (instead of all of them) would probably be helpful.

Author: Good idea. Adding it.
+    words_info = result.alternatives[0].words

+    # Printing out the output:
+    for word_info in words_info:
+        print("word: '{}', speaker_tag: {}".format(word_info.word, word_info.speaker_tag))
     # [END speech_transcribe_diarization]
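The comment added in the diarization snippet is the heart of this change: the words list on the last result accumulates every word recognized so far, each carrying a speaker_tag, so only that final list needs to be read. For readers who want more than the word-by-word printout, that list can be folded into per-speaker segments. The sketch below is illustrative only and not part of this PR: build_speaker_segments is a hypothetical helper, and the namedtuple merely stands in for the word objects in result.alternatives[0].words (the real objects expose .word and .speaker_tag the same way).

```python
from collections import namedtuple

# Hypothetical stand-in for the API's word objects; only .word and
# .speaker_tag are used, matching what the snippet prints.
WordInfo = namedtuple('WordInfo', ['word', 'speaker_tag'])


def build_speaker_segments(words_info):
    """Group consecutive words with the same speaker_tag into segments."""
    segments = []  # list of (speaker_tag, joined_text) tuples
    current_tag, current_words = None, []
    for word_info in words_info:
        if current_words and word_info.speaker_tag != current_tag:
            segments.append((current_tag, ' '.join(current_words)))
            current_words = []
        current_tag = word_info.speaker_tag
        current_words.append(word_info.word)
    if current_words:
        segments.append((current_tag, ' '.join(current_words)))
    return segments


if __name__ == '__main__':
    sample = [WordInfo('hi', 1), WordInfo('there', 1),
              WordInfo('hello', 2), WordInfo('again', 1)]
    for tag, text in build_speaker_segments(sample):
        print('Speaker {}: {}'.format(tag, text))
    # Speaker 1: hi there
    # Speaker 2: hello
    # Speaker 1: again
```

The snippet itself sticks with the plain print() loop, which keeps the focus on the shape of the API response.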


-def transcribe_file_with_multichannel(speech_file):
+def transcribe_file_with_multichannel():
     """Transcribe the given audio file synchronously with
     multi channel."""
     # [START speech_transcribe_multichannel]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

+    # TODO(developer): Uncomment and set to a path to your audio file.
+    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/Google_Gnome.wav'

     with open(speech_file, 'rb') as audio_file:
         content = audio_file.read()

@@ -207,17 +206,16 @@ def transcribe_file_with_multichannel(speech_file):
     # [END speech_transcribe_multichannel]


-def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
+def transcribe_file_with_multilanguage():
     """Transcribe the given audio file synchronously with
     multi language."""
     # [START speech_transcribe_multilanguage]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

+    # TODO(developer): Uncomment and set to a path to your audio file.
+    # speech_file = 'path/to/file.wav'
+    # first_lang = first language code, e,g, 'en-US'
+    # second_lang = first language code, e,g, 'es'
+    speech_file = 'resources/multi.wav'
+    first_lang = 'en-US'
+    second_lang = 'es'

     with open(speech_file, 'rb') as audio_file:
         content = audio_file.read()

@@ -226,6 +224,7 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):

     config = speech.types.RecognitionConfig(
         encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
         sample_rate_hertz=44100,
+        audio_channel_count=2,
         language_code=first_lang,
         alternative_language_codes=[second_lang])

@@ -241,15 +240,14 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
     # [END speech_transcribe_multilanguage]


-def transcribe_file_with_word_level_confidence(speech_file):
+def transcribe_file_with_word_level_confidence():
     """Transcribe the given audio file synchronously with
     word level confidence."""
     # [START speech_transcribe_word_level_confidence]
     from google.cloud import speech_v1p1beta1 as speech
     client = speech.SpeechClient()

+    # TODO(developer): Uncomment and set to a path to your audio file.
+    # speech_file = 'path/to/file.wav'
+    speech_file = 'resources/Google_Gnome.wav'

     with open(speech_file, 'rb') as audio_file:
         content = audio_file.read()

@@ -279,28 +277,20 @@
         description=__doc__,
         formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument('command')
-    parser.add_argument(
-        'path', help='File for audio file to be recognized')
-    parser.add_argument(
-        'first', help='First language in audio file to be recognized',
-        nargs='?')
-    parser.add_argument(
-        'second', help='Second language in audio file to be recognized',
-        nargs='?')

     args = parser.parse_args()

     if args.command == 'enhanced-model':
-        transcribe_file_with_enhanced_model(args.path)
+        transcribe_file_with_enhanced_model()
     elif args.command == 'metadata':
-        transcribe_file_with_metadata(args.path)
+        transcribe_file_with_metadata()
     elif args.command == 'punctuation':
-        transcribe_file_with_auto_punctuation(args.path)
+        transcribe_file_with_auto_punctuation()
     elif args.command == 'diarization':
-        transcribe_file_with_diarization(args.path)
+        transcribe_file_with_diarization()
     elif args.command == 'multi-channel':
-        transcribe_file_with_multichannel(args.path)
+        transcribe_file_with_multichannel()
     elif args.command == 'multi-language':
-        transcribe_file_with_multilanguage(args.path, args.first, args.second)
Contributor: For future reference, argparse's sub-commands feature would be helpful to avoid having args that only matter for one command or another.

Author: Sounds good.
+        transcribe_file_with_multilanguage()
     elif args.command == 'word-level-conf':
-        transcribe_file_with_word_level_confidence(args.path)
+        transcribe_file_with_word_level_confidence()
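On the sub-commands suggestion above: argparse subparsers let each command declare only the arguments it needs, so commands that take no input stay argument-free while multi-language can still accept language codes. The sketch below is a hypothetical illustration of the pattern rather than part of this PR; run_diarization and run_multi_language are stand-in stubs, not the sample's functions.

```python
import argparse


def run_diarization():
    # Stand-in for a command that needs no extra arguments.
    print('diarization')


def run_multi_language(first_lang, second_lang):
    # Stand-in for a command with its own arguments.
    print('multi-language: {} {}'.format(first_lang, second_lang))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    subparsers = parser.add_subparsers(dest='command')

    # Commands without extra arguments get a bare sub-parser.
    subparsers.add_parser('diarization')

    # Only the multi-language command declares language arguments.
    multi = subparsers.add_parser('multi-language')
    multi.add_argument('first', help='First language code, e.g. en-US')
    multi.add_argument('second', help='Second language code, e.g. es')

    args = parser.parse_args()

    if args.command == 'diarization':
        run_diarization()
    elif args.command == 'multi-language':
        run_multi_language(args.first, args.second)
```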
Comment: Consider using the Python logging facility. Understandably, for this sample it might be overkill, so take it or leave it.

Reply: FWIW, nearly all our other Python samples use print(). It's true that it's not always the recommended practice in production, but it's easy to understand. With logging there's always the risk that the developer has some weird config where the logs end up somewhere they don't expect.
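For anyone who does want logging instead of print() in a sample like this, a minimal, hypothetical setup is sketched below (it is not part of this PR). It also illustrates the configuration concern raised above: a bare basicConfig() sends records to stderr rather than stdout, so where output lands depends on handler configuration.

```python
import logging

# Minimal configuration: without a filename or handlers, records go to
# stderr; the format string keeps sample output readable.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

logger = logging.getLogger(__name__)


def report_word(word, speaker_tag):
    """Hypothetical replacement for the snippet's print() call."""
    logger.info("word: '%s', speaker_tag: %s", word, speaker_tag)


if __name__ == '__main__':
    report_word('hello', 1)
    # Writes "INFO: word: 'hello', speaker_tag: 1" to stderr.
```

Keeping print() in the published snippet sidesteps that dependence on logging configuration, which is the rationale given in the reply above.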