diff --git a/docs/index.rst b/docs/index.rst index 7a888a1c9cb9..902c0ad80eee 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -177,6 +177,7 @@ speech-encoding speech-metadata speech-operation + speech-streaming speech-sample speech-transcript diff --git a/docs/speech-streaming.rst b/docs/speech-streaming.rst new file mode 100644 index 000000000000..4a04b8ba79ae --- /dev/null +++ b/docs/speech-streaming.rst @@ -0,0 +1,15 @@ +Streaming Speech Response +========================= + +.. automodule:: google.cloud.speech.streaming_response + :members: + :undoc-members: + :show-inheritance: + +Streaming Speech Result +======================= + +.. automodule:: google.cloud.speech.streaming_result + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/speech-usage.rst b/docs/speech-usage.rst index e3341051d128..e1909057dfa2 100644 --- a/docs/speech-usage.rst +++ b/docs/speech-usage.rst @@ -51,10 +51,9 @@ See: `Speech Asynchronous Recognize`_ >>> import time >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.LINEAR16, + ... encoding=speech.Encoding.LINEAR16, ... sample_rate=44100) >>> operation = client.async_recognize(sample, max_alternatives=2) >>> retry_count = 100 @@ -82,10 +81,9 @@ Great Britian. .. code-block:: python >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, + ... encoding=speech.Encoding.FLAC, ... sample_rate=44100) >>> operation = client.async_recognize(sample, max_alternatives=2) >>> alternatives = client.sync_recognize( @@ -107,10 +105,9 @@ Example of using the profanity filter. .. 
code-block:: python >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, + ... encoding=speech.Encoding.FLAC, ... sample_rate=44100) >>> alternatives = client.sync_recognize(sample, max_alternatives=1, ... profanity_filter=True) @@ -129,10 +126,9 @@ words to the vocabulary of the recognizer. .. code-block:: python >>> from google.cloud import speech - >>> from google.cloud.speech.encoding import Encoding >>> client = speech.Client() >>> sample = client.sample(source_uri='gs://my-bucket/recording.flac', - ... encoding=Encoding.FLAC, + ... encoding=speech.Encoding.FLAC, ... sample_rate=44100) >>> hints = ['hi', 'good afternoon'] >>> alternatives = client.sync_recognize(sample, max_alternatives=2, @@ -145,5 +141,81 @@ words to the vocabulary of the recognizer. transcript: Hello, this is a test confidence: 0.81 + +Streaming Recognition +--------------------- + +The :meth:`~google.cloud.speech.Client.stream_recognize` method converts speech +data to possible text alternatives on the fly. + +.. note:: + Streaming recognition requests are limited to 1 minute of audio. + + See: https://cloud.google.com/speech/limits#content + +.. code-block:: python + + >>> from google.cloud import speech + >>> client = speech.Client() + >>> with open('./hello.wav', 'rb') as stream: + ... sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, + ... sample_rate=16000) + ... for response in client.stream_recognize(sample): + ... print(response.transcript) + ... print(response.is_final) + hello + True + + +By setting ``interim_results`` to :data:`True`, interim results (tentative hypotheses) +may be returned as they become available (these interim results are indicated +with the ``is_final=false`` flag). If :data:`False` or omitted, only ``is_final=true`` +result(s) are returned. + +.. 
code-block:: python + + >>> from google.cloud import speech + >>> client = speech.Client() + >>> with open('./hello.wav', 'rb') as stream: + ... sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, + ... sample_rate=16000) + ... for response in client.stream_recognize(sample, + ... interim_results=True): + ... print('====Response====') + ... print(response.transcript) + ... print(response.is_final) + ====Response==== + he + False + ====Response==== + hell + False + ====Response==== + hello + True + + +By default the recognizer will perform continuous recognition +(continuing to process audio even if the user pauses speaking) until the client +closes the output stream or when the maximum time limit has been reached. + +If you only want to recognize a single utterance you can set +``single_utterance`` to ``True`` and only one result will be returned. + +See: `Single Utterance`_ + +.. code-block:: python + + >>> with open('./hello_pause_goodbye.wav', 'rb') as stream: + ... sample = client.sample(stream=stream, encoding=speech.Encoding.LINEAR16, + ... sample_rate=16000) + ... for response in client.stream_recognize(sample, + ... single_utterance=True): + ... print(response.transcript) + ... print(response.is_final) + hello + True + +.. _Single Utterance: https://cloud.google.com/speech/reference/rpc/google.cloud.speech.v1beta1#streamingrecognitionconfig .. _sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize .. 
_Speech Asynchronous Recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize diff --git a/scripts/verify_included_modules.py b/scripts/verify_included_modules.py index d0791f0807b0..9e7f2963b99d 100644 --- a/scripts/verify_included_modules.py +++ b/scripts/verify_included_modules.py @@ -44,6 +44,7 @@ 'google.cloud.pubsub.__init__', 'google.cloud.resource_manager.__init__', 'google.cloud.speech.__init__', + 'google.cloud.speech.streaming.__init__', 'google.cloud.storage.__init__', 'google.cloud.streaming.__init__', 'google.cloud.streaming.buffered_stream', diff --git a/speech/google/cloud/speech/__init__.py b/speech/google/cloud/speech/__init__.py index ef55810893a7..4a9e4e4f6fc6 100644 --- a/speech/google/cloud/speech/__init__.py +++ b/speech/google/cloud/speech/__init__.py @@ -16,3 +16,4 @@ from google.cloud.speech.client import Client from google.cloud.speech.connection import Connection +from google.cloud.speech.encoding import Encoding diff --git a/speech/google/cloud/speech/client.py b/speech/google/cloud/speech/client.py index 553927d237cd..b5622575b3c9 100644 --- a/speech/google/cloud/speech/client.py +++ b/speech/google/cloud/speech/client.py @@ -14,15 +14,35 @@ """Basic client for Google Cloud Speech API.""" +import os from base64 import b64encode from google.cloud._helpers import _to_bytes from google.cloud._helpers import _bytes_to_unicode from google.cloud import client as client_module +from google.cloud.environment_vars import DISABLE_GRPC from google.cloud.speech.connection import Connection from google.cloud.speech.encoding import Encoding from google.cloud.speech.operation import Operation from google.cloud.speech.sample import Sample +from google.cloud.speech.streaming_response import StreamingSpeechResponse + +try: + from google.cloud.gapic.speech.v1beta1.speech_api import SpeechApi + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + RecognitionConfig) + from 
google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognitionConfig) + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognizeRequest) +except ImportError: # pragma: NO COVER + _HAVE_GAX = False +else: + _HAVE_GAX = True + + +_DISABLE_GAX = os.getenv(DISABLE_GRPC, False) +_USE_GAX = _HAVE_GAX and not _DISABLE_GAX class Client(client_module.Client): @@ -47,6 +67,7 @@ class Client(client_module.Client): """ _connection_class = Connection + _speech_api = None def async_recognize(self, sample, language_code=None, max_alternatives=None, profanity_filter=None, @@ -104,7 +125,7 @@ def async_recognize(self, sample, language_code=None, return Operation.from_api_repr(self, api_response) @staticmethod - def sample(content=None, source_uri=None, encoding=None, + def sample(content=None, source_uri=None, stream=None, encoding=None, sample_rate=None): """Factory: construct Sample to use when making recognize requests. @@ -118,6 +139,9 @@ def sample(content=None, source_uri=None, encoding=None, supported, which must be specified in the following format: ``gs://bucket_name/object_name``. + :type stream: :class:`io.BufferedReader` + :param stream: File like object to read audio data from. + :type encoding: str :param encoding: encoding of audio data sent in all RecognitionAudio messages, can be one of: :attr:`~.Encoding.LINEAR16`, @@ -135,7 +159,7 @@ def sample(content=None, source_uri=None, encoding=None, :rtype: :class:`~google.cloud.speech.sample.Sample` :returns: Instance of ``Sample``. 
""" - return Sample(content=content, source_uri=source_uri, + return Sample(content=content, source_uri=source_uri, stream=stream, encoding=encoding, sample_rate=sample_rate) def sync_recognize(self, sample, language_code=None, @@ -199,6 +223,103 @@ def sync_recognize(self, sample, language_code=None, else: raise ValueError('result in api should have length 1') + def stream_recognize(self, sample, language_code=None, + max_alternatives=None, profanity_filter=None, + speech_context=None, single_utterance=False, + interim_results=False): + """Streaming speech recognition. + + .. note:: + Streaming recognition requests are limited to 1 minute of audio. + + See: https://cloud.google.com/speech/limits#content + + Yields :class:`~streaming_response.StreamingSpeechResponse` containing + results and metadata from the streaming request. + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. + + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. + + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than maxAlternatives. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1 + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. This can be + used to improve the accuracy for specific words + and phrases. 
This can also be used to add new + words to the vocabulary of the recognizer. + + :type single_utterance: bool + :param single_utterance: (Optional) If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + + If true, the recognizer will detect a single + spoken utterance. When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. + + :type interim_results: bool + :param interim_results: (Optional) If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the is_final=false flag). If false or + omitted, only is_final=true result(s) are + returned. + + :raises: :class:`EnvironmentError` if gRPC is not enabled. + """ + if not _USE_GAX: + raise EnvironmentError('gRPC is required to use this API.') + + requests = _make_request_stream(sample, language_code=language_code, + max_alternatives=max_alternatives, + profanity_filter=profanity_filter, + speech_context=speech_context, + single_utterance=single_utterance, + interim_results=interim_results) + + for response in self.speech_api.streaming_recognize(requests): + if getattr(response, 'results', None) or interim_results: + yield StreamingSpeechResponse.from_pb(response) + + @property + def speech_api(self): + """Instance of Speech API. + + :rtype: :class:`google.cloud.gapic.speech.v1beta1.speech_api.SpeechApi` + :returns: Instance of ``SpeechApi``. 
+ """ + if not self._speech_api: + self._speech_api = SpeechApi() + return self._speech_api + def _build_request_data(sample, language_code=None, max_alternatives=None, profanity_filter=None, speech_context=None): @@ -261,3 +382,159 @@ def _build_request_data(sample, language_code=None, max_alternatives=None, } return data + + +def _make_request_stream(sample, language_code=None, max_alternatives=None, + profanity_filter=None, speech_context=None, + single_utterance=None, interim_results=None): + """Generate stream of requests from sample. + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. + + :type language_code: str + :param language_code: (Optional) The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. + + :type max_alternatives: int + :param max_alternatives: (Optional) Maximum number of recognition + hypotheses to be returned. The server may + return fewer than maxAlternatives. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1 + + :type profanity_filter: bool + :param profanity_filter: (Optional) If True, the server will attempt to + filter out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: (Optional) A list of strings (max 50) containing + words and phrases "hints" so that the speech + recognition is more likely to recognize them. + This can be used to improve the accuracy for + specific words and phrases. This can also be used to + add new words to the vocabulary of the recognizer. 
+ + :type single_utterance: bool + :param single_utterance: (Optional) If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + + If true, the recognizer will detect a single + spoken utterance. When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. + + :type interim_results: bool + :param interim_results: (Optional) If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the is_final=false flag). If false or + omitted, only is_final=true result(s) are + returned. + """ + config_request = _make_streaming_config( + sample, language_code=language_code, max_alternatives=max_alternatives, + profanity_filter=profanity_filter, speech_context=speech_context, + single_utterance=single_utterance, interim_results=interim_results) + + # The config request MUST go first and not contain any audio data. + yield config_request + + while True: + data = sample.stream.read(sample.chunk_size) + if not data: + break + yield StreamingRecognizeRequest(audio_content=data) + + +def _make_streaming_config(sample, language_code, + max_alternatives, profanity_filter, + speech_context, single_utterance, + interim_results): + """Build streaming configuration. + + :type sample: :class:`~google.cloud.speech.sample.Sample` + :param sample: Instance of ``Sample`` containing audio information. + + :type language_code: str + :param language_code: The language of the supplied audio as + BCP-47 language tag. Example: ``'en-GB'``. + If omitted, defaults to ``'en-US'``. 
+ + :type max_alternatives: int + :param max_alternatives: Maximum number of recognition + hypotheses to be returned. The server may + return fewer than maxAlternatives. + Valid values are 0-30. A value of 0 or 1 + will return a maximum of 1. Defaults to 1 + + :type profanity_filter: bool + :param profanity_filter: If True, the server will attempt to filter + out profanities, replacing all but the + initial character in each filtered word with + asterisks, e.g. ``'f***'``. If False or + omitted, profanities won't be filtered out. + + :type speech_context: list + :param speech_context: A list of strings (max 50) containing words and + phrases "hints" so that the speech recognition + is more likely to recognize them. This can be + used to improve the accuracy for specific words + and phrases. This can also be used to add new + words to the vocabulary of the recognizer. + + :type single_utterance: bool + :param single_utterance: If false or omitted, the recognizer + will perform continuous recognition + (continuing to process audio even if the user + pauses speaking) until the client closes the + output stream (gRPC API) or when the maximum + time limit has been reached. Multiple + SpeechRecognitionResults with the is_final + flag set to true may be returned. + + If true, the recognizer will detect a single + spoken utterance. When it detects that the + user has paused or stopped speaking, it will + return an END_OF_UTTERANCE event and cease + recognition. It will return no more than one + SpeechRecognitionResult with the is_final flag + set to true. + + :type interim_results: bool + :param interim_results: If true, interim results (tentative + hypotheses) may be returned as they become + available (these interim results are indicated + with the is_final=false flag). If false or + omitted, only is_final=true result(s) are + returned. + + :rtype: :class:`~StreamingRecognitionConfig` + :returns: Instance of ``StreamingRecognitionConfig``. 
+ """ + config = RecognitionConfig( + encoding=sample.encoding, sample_rate=sample.sample_rate, + language_code=language_code, max_alternatives=max_alternatives, + profanity_filter=profanity_filter, speech_context=speech_context) + + streaming_config = StreamingRecognitionConfig( + config=config, single_utterance=single_utterance, + interim_results=interim_results) + + config_request = StreamingRecognizeRequest( + streaming_config=streaming_config) + + return config_request diff --git a/speech/google/cloud/speech/operation.py b/speech/google/cloud/speech/operation.py index 69614b16cb7f..e7abbf88636d 100644 --- a/speech/google/cloud/speech/operation.py +++ b/speech/google/cloud/speech/operation.py @@ -124,7 +124,8 @@ def _update(self, response): results = [] if raw_results: for result in raw_results[0]['alternatives']: - results.append(Transcript(result)) + results.append(Transcript(result.get('transcript'), + result.get('confidence'))) if metadata: self._metadata = Metadata.from_api_repr(metadata) diff --git a/speech/google/cloud/speech/sample.py b/speech/google/cloud/speech/sample.py index a197f20372f6..ef82f3a8f485 100644 --- a/speech/google/cloud/speech/sample.py +++ b/speech/google/cloud/speech/sample.py @@ -30,6 +30,9 @@ class Sample(object): supported, which must be specified in the following format: ``gs://bucket_name/object_name``. + :type stream: :class:`io.BufferedReader` + :param stream: File like object to read audio data from. 
+ :type encoding: str :param encoding: encoding of audio data sent in all RecognitionAudio messages, can be one of: :attr:`~.Encoding.LINEAR16`, @@ -47,16 +50,15 @@ class Sample(object): default_encoding = Encoding.FLAC default_sample_rate = 16000 - def __init__(self, content=None, source_uri=None, + def __init__(self, content=None, source_uri=None, stream=None, encoding=None, sample_rate=None): - - no_source = content is None and source_uri is None - both_source = content is not None and source_uri is not None - if no_source or both_source: - raise ValueError('Supply one of \'content\' or \'source_uri\'') + if (content, source_uri, stream).count(None) != 2: + raise ValueError('Supply only one of \'content\', \'source_uri\'' + ' or stream.') self._content = content self._source_uri = source_uri + self._stream = stream if sample_rate is not None and not 8000 <= sample_rate <= 48000: raise ValueError('The value of sample_rate must be between 8000' @@ -68,6 +70,15 @@ def __init__(self, content=None, source_uri=None, else: raise ValueError('Invalid encoding: %s' % (encoding,)) + @property + def chunk_size(self): + """Chunk size to send over GRPC. ~100ms + + :rtype: int + :returns: Optimized chunk size. + """ + return int(self.sample_rate / 10.0) + @property def source_uri(self): """Google Cloud Storage URI of audio source. @@ -77,6 +88,15 @@ def source_uri(self): """ return self._source_uri + @property + def stream(self): + """Stream of audio data. + + :rtype: :class:`io.BufferedReader` + :returns: File like object to read audio data from. + """ + return self._stream + @property def content(self): """Bytes of audio content. diff --git a/speech/google/cloud/speech/streaming_response.py b/speech/google/cloud/speech/streaming_response.py new file mode 100644 index 000000000000..a720870c5e89 --- /dev/null +++ b/speech/google/cloud/speech/streaming_response.py @@ -0,0 +1,107 @@ +# Copyright 2016 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Representation of a GAPIC Speech API response.""" + +from google.cloud.speech.streaming_result import StreamingSpeechResult +from google.cloud.gapic.speech.v1beta1.enums import StreamingRecognizeResponse + +_REVERSE_MAP = { + value: key for key, value + in StreamingRecognizeResponse.EndpointerType.__dict__.items() + if not key.startswith('__')} + + +class StreamingSpeechResponse(object): + """Representation of a Speech API protobuf streaming response. + + :type error: :class:`google.grpc.Status` + :param error: Instance of ``Status`` + + :type endpointer_type: int + :param endpointer_type: Integer value of endpointer event. + + :type results: list of + :class:`google.cloud.speech.v1beta1.StreamingRecognitionResult` + :param results: List of protobuf ``StreamingRecognitionResult``. + + :type result_index: int + :param result_index: Index for specific result set. Used for updating with + ``interim_results``. + """ + def __init__(self, error=None, endpointer_type=None, results=None, + result_index=None): + results = results or [] + self._error = error + self._endpointer_type = _REVERSE_MAP.get(endpointer_type) + self._result_index = result_index + self._results = [StreamingSpeechResult.from_pb(result) + for result in results] + + @classmethod + def from_pb(cls, pb_response): + """Factory: construct a ``StreamingSpeechResponse`` from protobuf. 
+ + :type pb_response: + :class:`google.cloud.speech.v1beta1.StreamingRecognizeResponse` + :param pb_response: Instance of protobuf + ``StreamingRecognizeResponse``. + :rtype: :class:`~StreamingSpeechResponse` + :returns: Instance of ``StreamingSpeechResponse``. + """ + error = pb_response.error + endpointer_type = pb_response.endpointer_type + results = pb_response.results + result_index = pb_response.result_index + return cls(error=error, endpointer_type=endpointer_type, + results=results, result_index=result_index) + + @property + def endpointer_type(self): + """Endpointer indicating the state of the speech detection. + + :rtype: str + :returns: String derived from :class:`~endpointer_type.EndpointerType`. + """ + return self._endpointer_type + + @property + def is_final(self): + """Represents an interim result that may change. + + :rtype: bool + :returns: True if the result has completed its processing. + """ + if self.results: + return self.results[0].is_final + else: + return False + + @property + def result_index(self): + """Result index associated with this response. + + :rtype: int + :returns: Result index of this response. + """ + return self._result_index + + @property + def results(self): + """List of results for this response. + + :rtype: list of :class:`~result.StreamingSpeechResult` + :returns: List of ``StreamingSpeechResult`` in this response. + """ + return self._results diff --git a/speech/google/cloud/speech/streaming_result.py b/speech/google/cloud/speech/streaming_result.py new file mode 100644 index 000000000000..104916eda9e0 --- /dev/null +++ b/speech/google/cloud/speech/streaming_result.py @@ -0,0 +1,73 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Representation of Speech GAPIC API result.""" + +from google.cloud.speech.transcript import Transcript + + +class StreamingSpeechResult(object): + """Factory: construct streaming speech result. + + :type alternatives: + :class:`google.cloud.speech.v1beta1.SpeechRecognitionAlternative` + :param alternatives: List of ``SpeechRecognitionAlternative``. + + :type is_final: bool + :param is_final: Indicates if the transcription is complete. + + :type stability: float + :param stability: An estimate of the probability that the recognizer will + not change its guess about this interim result. + """ + + def __init__(self, alternatives, is_final, stability): + self._alternatives = [Transcript.from_pb(alternative) + for alternative in alternatives] + self._is_final = is_final + self._stability = stability + + @classmethod + def from_pb(cls, pb_response): + """Factory: construct StreamingSpeechResult from protobuf response. + + :type pb_response: + :class:`google.cloud.speech.v1beta1.StreamingRecognitionResult` + :param pb_response: Instance of ``StreamingRecognitionResult``. + + :rtype: :class:`~result.StreamingSpeechResult` + :returns: Instance of ``StreamingSpeechResult``. + """ + alternatives = pb_response.alternatives + is_final = pb_response.is_final + stability = pb_response.stability + return cls(alternatives, is_final, stability) + + @property + def alternatives(self): + """List of alternative transcripts. + + :rtype: list of :class:`~google.cloud.speech.transcript.Transcript` + :returns: List of ``Transcript`` objects. 
""" + return self._alternatives + + @property + def is_final(self): + """Represents an interim result that may change. + + :rtype: bool + :returns: True if the result has completed its processing. + """ + return self._is_final diff --git a/speech/google/cloud/speech/transcript.py b/speech/google/cloud/speech/transcript.py index bbe915396c5c..800f4e26d45c 100644 --- a/speech/google/cloud/speech/transcript.py +++ b/speech/google/cloud/speech/transcript.py @@ -16,14 +16,30 @@ class Transcript(object): - """Representation of Speech Transcripts + """Representation of Speech Transcripts. - :type result: dict - :param result: Dictionary of transcript and confidence of recognition. + :type transcript: str + :param transcript: String of transcribed data. + + :type confidence: float + :param confidence: The confidence estimate between 0.0 and 1.0. """ - def __init__(self, result): - self._transcript = result.get('transcript') - self._confidence = result.get('confidence') + def __init__(self, transcript, confidence): + self._transcript = transcript + self._confidence = confidence + + @classmethod + def from_pb(cls, transcript): + """Factory: construct ``Transcript`` from protobuf response + + :type transcript: :class:`~SpeechRecognitionAlternative` + :param transcript: Instance of ``SpeechRecognitionAlternative`` + from protobuf. + + :rtype: :class:`~Transcript` + :returns: Instance of ``Transcript``. 
+ """ + return cls(transcript.transcript, transcript.confidence) @property def transcript(self): diff --git a/speech/setup.py b/speech/setup.py index c02aeaad3e9d..c7504e1beac4 100644 --- a/speech/setup.py +++ b/speech/setup.py @@ -51,6 +51,7 @@ REQUIREMENTS = [ 'google-cloud-core >= 0.20.0', + 'gapic-google-cloud-speech-v1beta1 >= 0.11.1' ] setup( diff --git a/speech/unit_tests/test_client.py b/speech/unit_tests/test_client.py index 5972a0014eb3..5fe8c3107fb7 100644 --- a/speech/unit_tests/test_client.py +++ b/speech/unit_tests/test_client.py @@ -195,6 +195,97 @@ def test_async_recognize(self): self.assertFalse(operation.complete) self.assertIsNone(operation.metadata) + def test_streaming_depends_on_gax(self): + from google.cloud.speech import client as MUT + from google.cloud._testing import _Monkey + creds = _Credentials() + client = self._makeOne(credentials=creds) + client.connection = _Connection() + + with _Monkey(MUT, _USE_GAX=False): + with self.assertRaises(EnvironmentError): + next(client.stream_recognize({})) + + def test_set_speech_api(self): + from google.cloud.speech import client as MUT + from google.cloud._testing import _Monkey + creds = _Credentials() + client = self._makeOne(credentials=creds) + client.connection = _Connection() + + with _Monkey(MUT, SpeechApi=_MockGAPICSpeechAPI): + client._speech_api = None + speech_api = client.speech_api + self.assertIsInstance(speech_api, _MockGAPICSpeechAPI) + + def test_streaming_with_empty_response(self): + from io import BytesIO + from google.cloud.speech.encoding import Encoding + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client._speech_api = _MockGAPICSpeechAPI() + client._speech_api._responses = [] + + sample = client.sample(stream=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + results = client.stream_recognize(sample) + with self.assertRaises(StopIteration): 
+ next(results) + + def test_stream_recognize(self): + from io import BytesIO + from google.cloud.speech.encoding import Encoding + from google.cloud.speech.streaming_response import ( + StreamingSpeechResponse) + + stream = BytesIO(b'Some audio data...') + credentials = _Credentials() + client = self._makeOne(credentials=credentials) + client.connection = _Connection() + client._speech_api = _MockGAPICSpeechAPI() + + sample = client.sample(stream=stream, + encoding=Encoding.LINEAR16, + sample_rate=self.SAMPLE_RATE) + responses = client.stream_recognize(sample) + + self.assertIsInstance(next(responses), StreamingSpeechResponse) + requests = [] + for req in client.speech_api._requests: + requests.append(req) + self.assertEqual(len(requests), 2) + + +class _MockSpeechGAPICAlternative(object): + transcript = 'hello there!' + confidence = 0.9704365 + + +class _MockSpeechGAPICResult(object): + alternatives = [_MockSpeechGAPICAlternative()] + is_final = False + stability = 0.0 + + +class _MockGAPICSpeechResponse(object): + error = None + endpointer_type = None + results = [_MockSpeechGAPICResult()] + result_index = 0 + + +class _MockGAPICSpeechAPI(object): + _requests = None + _responses = [None, _MockGAPICSpeechResponse()] + + def streaming_recognize(self, requests): + self._requests = requests + return self._responses + class _Credentials(object): diff --git a/speech/unit_tests/test_request.py b/speech/unit_tests/test_request.py new file mode 100644 index 000000000000..b536c661aecb --- /dev/null +++ b/speech/unit_tests/test_request.py @@ -0,0 +1,50 @@ +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + + +class TestStreamingSpeechRequestHelpers(unittest.TestCase): + def test_make_request_stream(self): + from io import BytesIO + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognizeRequest) + from google.cloud.speech.client import _make_request_stream + from google.cloud.speech.sample import Sample + + stream = BytesIO(b'g' * 1702) # Something bigger than a chunk. + sample = Sample(stream=stream, encoding='LINEAR16') + + request_count = 0 + for req in _make_request_stream(sample): + request_count += 1 + self.assertIsInstance(req, StreamingRecognizeRequest) + self.assertEqual(request_count, 3) + + def test_make_request_stream_short(self): + from io import BytesIO + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( + StreamingRecognizeRequest) + from google.cloud.speech.client import _make_request_stream + from google.cloud.speech.sample import Sample + + stream = BytesIO(b'g' * (1599 * 4)) # Something bigger than a chunk. + sample = Sample(stream=stream, encoding='LINEAR16') + + request_count = 0 + for req in _make_request_stream(sample): + request_count += 1 + self.assertIsInstance(req, StreamingRecognizeRequest) + + self.assertEqual(request_count, 5) diff --git a/speech/unit_tests/test_response.py b/speech/unit_tests/test_response.py new file mode 100644 index 000000000000..5a156b74d4b3 --- /dev/null +++ b/speech/unit_tests/test_response.py @@ -0,0 +1,59 @@ +# Copyright 2016 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + + +class TestStreamingSpeechResponse(unittest.TestCase): + def _getTargetClass(self): + from google.cloud.speech.streaming_response import ( + StreamingSpeechResponse) + return StreamingSpeechResponse + + def _makeOne(self, *args, **kw): + return self._getTargetClass()(*args, **kw) + + def test_ctor(self): + response = self._makeOne({}, 'END_OF_UTTERANCE', [], 0) + self.assertEqual(response.result_index, 0) + self.assertEqual(response.endpointer_type, None) + self.assertEqual(response.results, []) + self.assertFalse(response.is_final) + + def test_from_pb(self): + response = self._makeOne() + res = response.from_pb(_MockSpeechPBResponse) + self.assertFalse(res.is_final) + self.assertEqual(res.endpointer_type, 'END_OF_AUDIO') + self.assertEqual(res.results[0].alternatives[0].transcript, + 'hello there!') + self.assertEqual(res.results[0].alternatives[0].confidence, 0.9704365) + + +class _MockSpeechPBAlternative(object): + transcript = 'hello there!' 
+ confidence = 0.9704365 + + +class _MockSpeechPBResult(object): + alternatives = [_MockSpeechPBAlternative()] + is_final = False + stability = 0.0 + + +class _MockSpeechPBResponse(object): + error = {} + endpointer_type = 3 + result_index = 0 + results = [_MockSpeechPBResult, _MockSpeechPBResult] diff --git a/speech/unit_tests/test_transcript.py b/speech/unit_tests/test_transcript.py index b585d6e7429c..6cbf038546b4 100644 --- a/speech/unit_tests/test_transcript.py +++ b/speech/unit_tests/test_transcript.py @@ -26,7 +26,8 @@ def _makeOne(self, *args, **kwargs): def test_ctor(self): from unit_tests._fixtures import OPERATION_COMPLETE_RESPONSE as DATA TRANSCRIPT_DATA = DATA['response']['results'][0]['alternatives'][0] - transcript = self._makeOne(TRANSCRIPT_DATA) + transcript = self._makeOne(TRANSCRIPT_DATA['transcript'], + TRANSCRIPT_DATA['confidence']) self.assertEqual('how old is the Brooklyn Bridge', transcript.transcript) self.assertEqual(0.98267895, transcript.confidence)