1- // Copyright 2018 Google Inc .
1+ // Copyright 2018 Google LLC.
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
1111// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212// See the License for the specific language governing permissions and
1313// limitations under the License.
14+ //
1415
1516syntax = "proto3" ;
1617
@@ -20,6 +21,7 @@ import "google/api/annotations.proto";
2021import "google/longrunning/operations.proto" ;
2122import "google/protobuf/any.proto" ;
2223import "google/protobuf/duration.proto" ;
24+ import "google/protobuf/empty.proto" ;
2325import "google/protobuf/timestamp.proto" ;
2426import "google/rpc/status.proto" ;
2527
@@ -54,7 +56,8 @@ service Speech {
5456
5557 // Performs bidirectional streaming speech recognition: receive results while
5658 // sending audio. This method is only available via the gRPC API (not REST).
57- rpc StreamingRecognize (stream StreamingRecognizeRequest ) returns (stream StreamingRecognizeResponse );
59+ rpc StreamingRecognize (stream StreamingRecognizeRequest ) returns (stream StreamingRecognizeResponse ) {
60+ }
5861}
5962
6063// The top-level message sent by the client for the `Recognize` method.
@@ -98,7 +101,7 @@ message StreamingRecognizeRequest {
98101 // `audio_content` data. The audio bytes must be encoded as specified in
99102 // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
100103 // pure binary representation (not base64). See
101- // [audio limits](https://cloud.google.com/ speech/limits #content).
104+ // [content limits](/speech-to-text/quotas#content).
102105 bytes audio_content = 2 ;
103106 }
104107}
@@ -218,36 +221,36 @@ message RecognitionConfig {
218221 // Valid values for OGG_OPUS are '1'-'254'.
219222 // Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
220223 // If `0` or omitted, defaults to one channel (mono).
221- // NOTE : We only recognize the first channel by default.
224+ // Note: We only recognize the first channel by default.
222225 // To perform independent recognition on each channel set
223- // enable_separate_recognition_per_channel to 'true'.
226+ // `enable_separate_recognition_per_channel` to 'true'.
224227 int32 audio_channel_count = 7 ;
225228
226- // This needs to be set to ‘true’ explicitly and audio_channel_count > 1
229+ // This needs to be set to `true` explicitly and `audio_channel_count` > 1
227230 // to get each channel recognized separately. The recognition result will
228- // contain a channel_tag field to state which channel that result belongs to.
229- // If this is not ‘ true’ , we will only recognize the first channel.
230- // NOTE: The request is also billed cumulatively for all channels recognized:
231- // (audio_channel_count times the audio length)
231+ // contain a `channel_tag` field to state which channel that result belongs
232+ // to. If this is not true, we will only recognize the first channel. The
233+ // request is billed cumulatively for all channels recognized:
234+ // `audio_channel_count` multiplied by the length of the audio.
232235 bool enable_separate_recognition_per_channel = 12 ;
233236
234237 // *Required* The language of the supplied audio as a
235238 // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
236239 // Example: "en-US".
237- // See [Language Support](https://cloud.google.com/ speech/docs/languages)
237- // See [Language Support](https://cloud.google.com/ speech/docs/languages)
240+ // See [Language Support](/speech-to-text/docs/languages)
238241 // for a list of the currently supported language codes.
239242 string language_code = 3 ;
240243
241244 // *Optional* A list of up to 3 additional
242245 // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
243246 // listing possible alternative languages of the supplied audio.
244- // See [Language Support](https://cloud.google.com/ speech/docs/languages)
244- // See [Language Support](https://cloud.google.com/ speech/docs/languages)
247+ // See [Language Support](/speech-to-text/docs/languages)
245248 // for a list of the currently supported language codes.
246249 // If alternative languages are listed, recognition result will contain
247250 // recognition in the most likely language detected including the main
248251 // language_code. The recognition result will include the language tag
249252 // of the language detected in the audio.
250- // NOTE : This feature is only supported for Voice Command and Voice Search
250- // NOTE : This feature is only supported for Voice Command and Voice Search
253+ // Note: This feature is only supported for Voice Command and Voice Search
251254 // use cases and performance may vary for other use cases (e.g., phone call
252255 // transcription).
253256 repeated string alternative_language_codes = 18 ;
@@ -266,7 +269,9 @@ message RecognitionConfig {
266269 // won't be filtered out.
267270 bool profanity_filter = 5 ;
268271
269- // *Optional* A means to provide context to assist the speech recognition.
272+ // *Optional* array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext].
273+ // A means to provide context to assist the speech recognition. For more
274+ // information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
270275 repeated SpeechContext speech_contexts = 6 ;
271276
272277 // *Optional* If `true`, the top result includes a list of words and
@@ -284,18 +289,20 @@ message RecognitionConfig {
284289 // This feature is only available in select languages. Setting this for
285290 // requests in other languages has no effect at all.
286291 // The default 'false' value does not add punctuation to result hypotheses.
287- // NOTE: " This is currently offered as an experimental service, complimentary
292+ // Note: This is currently offered as an experimental service, complimentary
288293 // to all users. In the future this may be exclusively available as a
289- // premium feature."
294+ // premium feature.
290295 bool enable_automatic_punctuation = 11 ;
291296
292297 // *Optional* If 'true', enables speaker detection for each recognized word in
293298 // the top alternative of the recognition result using a speaker_tag provided
294299 // in the WordInfo.
295300 // Note: When this is true, we send all the words from the beginning of the
296- // audio for the top alternative in every consecutive responses.
296- // audio for the top alternative in every consecutive responses.
301+ // audio for the top alternative in every consecutive STREAMING response.
297302 // This is done in order to improve our speaker tags as our models learn to
298303 // identify the speakers in the conversation over time.
304+ // For non-streaming requests, the diarization results will be provided only
305+ // in the top alternative of the FINAL SpeechRecognitionResult.
299306 bool enable_speaker_diarization = 16 ;
300307
301308 // *Optional*
@@ -342,14 +349,18 @@ message RecognitionConfig {
342349 string model = 13 ;
343350
344351 // *Optional* Set to true to use an enhanced model for speech recognition.
345- // You must also set the `model` field to a valid, enhanced model. If
346- // `use_enhanced` is set to true and the `model` field is not set, then
347- // `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced
348- // version of the specified model does not exist, then the speech is
349- // recognized using the standard version of the specified model.
352+ // If `use_enhanced` is set to true and the `model` field is not set, then
353+ // an appropriate enhanced model is chosen if:
354+ // 1. project is eligible for requesting enhanced models
355+ // 2. an enhanced model exists for the audio
356+ //
357+ // If `use_enhanced` is true and an enhanced version of the specified model
358+ // does not exist, then the speech is recognized using the standard version
359+ // of the specified model.
350360 //
351- // Enhanced speech models require that you opt-in to the audio logging using
352- // instructions in the [alpha documentation](/speech/data-sharing). If you set
361+ // Enhanced speech models require that you opt-in to data logging using
362+ // instructions in the
363+ // [documentation](/speech-to-text/docs/enable-data-logging). If you set
353364 // `use_enhanced` to true and you have not enabled audio logging, then you
354365 // will receive an error.
355366 bool use_enhanced = 14 ;
@@ -494,14 +505,14 @@ message SpeechContext {
494505 // to improve the accuracy for specific words and phrases, for example, if
495506 // specific commands are typically spoken by the user. This can also be used
496507 // to add additional words to the vocabulary of the recognizer. See
497- // [usage limits](https://cloud.google.com/ speech/limits #content).
508+ // [usage limits](/ speech-to-text/quotas #content).
498509 repeated string phrases = 1 ;
499510}
500511
501512// Contains audio data in the encoding specified in the `RecognitionConfig`.
502513// Either `content` or `uri` must be supplied. Supplying both or neither
503514// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
504- // [audio limits](https://cloud.google.com/ speech/limits #content).
515+ // [content limits](/ speech-to-text/quotas #content).
505516message RecognitionAudio {
506517 // The audio source, which is either inline content or a Google Cloud
507518 // Storage uri.
@@ -512,7 +523,8 @@ message RecognitionAudio {
512523 bytes content = 1 ;
513524
514525 // URI that points to a file that contains audio data bytes as specified in
515- // `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
526+ // `RecognitionConfig`. The file must not be compressed (for example, gzip).
527+ // Currently, only Google Cloud Storage URIs are
516528 // supported, which must be specified in the following format:
517529 // `gs://bucket_name/object_name` (other URI formats return
518530 // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
@@ -658,6 +670,10 @@ message StreamingRecognitionResult {
658670 // The default of 0.0 is a sentinel value indicating `stability` was not set.
659671 float stability = 3 ;
660672
673+ // Output only. Time offset of the end of this result relative to the
674+ // beginning of the audio.
675+ google.protobuf.Duration result_end_time = 4 ;
676+
661677 // For multi-channel audio, this is the channel number corresponding to the
662678 // recognized result for the audio from that channel.
663679 // For audio_channel_count = N, its output values can range from '1' to 'N'.
@@ -705,7 +721,7 @@ message SpeechRecognitionAlternative {
705721 float confidence = 2 ;
706722
707723 // Output only. A list of word-specific information for each recognized word.
708- // Note: When enable_speaker_diarization is true, you will see all the words
708- // Note: When enable_speaker_diarization is true, you will see all the words
724+ // Note: When `enable_speaker_diarization` is true, you will see all the words
709725 // from the beginning of the audio.
710726 repeated WordInfo words = 3 ;
711727}
@@ -746,5 +762,4 @@ message WordInfo {
746762 // speaker_tag is set if enable_speaker_diarization = 'true' and only in the
747763 // top alternative.
748764 int32 speaker_tag = 5 ;
749-
750765}
0 commit comments