From 8e0c33b59e6bc0c9d003ebf41cab5ec8c105a165 Mon Sep 17 00:00:00 2001 From: nirupa-kumar Date: Thu, 16 Aug 2018 15:35:59 -0700 Subject: [PATCH 1/4] Microphone streaming with a 1 minute duration. --- speech/cloud-client/README.md | 5 + .../java/com/example/speech/Recognize.java | 119 +++++++++++++++++- 2 files changed, 119 insertions(+), 5 deletions(-) diff --git a/speech/cloud-client/README.md b/speech/cloud-client/README.md index 9a7055f32ff..98d6c69cfc4 100644 --- a/speech/cloud-client/README.md +++ b/speech/cloud-client/README.md @@ -92,6 +92,11 @@ Performing streaming speech transcription and punctuation on an audio file mvn exec:java -DRecognize -Dexec.args="stream-punctuation ./resources/audio.raw" ``` +Perform microphone streaming speech recognition +``` +mvn exec:java -DRecognize -Dexec.args="micstreamrecognize" +``` + ## Enhanced Model Transcribe an audio file using an enhanced model ``` diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java index 9771ad2a8e9..29cd1f45712 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java @@ -19,6 +19,9 @@ import com.google.api.gax.longrunning.OperationFuture; import com.google.api.gax.rpc.ApiStreamObserver; import com.google.api.gax.rpc.BidiStreamingCallable; +import com.google.api.gax.rpc.ClientStream; +import com.google.api.gax.rpc.ResponseObserver; +import com.google.api.gax.rpc.StreamController; import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata; import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse; import com.google.cloud.speech.v1p1beta1.RecognitionAudio; @@ -47,6 +50,13 @@ import java.util.ArrayList; import java.util.List; +import javax.sound.sampled.AudioFormat; +import javax.sound.sampled.AudioInputStream; +import javax.sound.sampled.AudioSystem; +import javax.sound.sampled.DataLine; +import javax.sound.sampled.DataLine.Info; +import javax.sound.sampled.TargetDataLine; + public class Recognize { /** Run speech recognition tasks. */ @@ -56,7 +66,7 @@ public static void main(String... args) throws Exception { System.out.printf( "\tjava %s \"\" \"\"\n" + "Commands:\n" - + "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets\n" + + "\tsyncrecognize | asyncrecognize | streamrecognize | micstreamrecognize | wordoffsets\n" + "\t| model-selection | auto-punctuation | stream-punctuation | enhanced-model\n" + "\t| metadata | diarization | multi-channel | multi-language | word-level-conf" + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI " @@ -88,6 +98,8 @@ public static void main(String... args) throws Exception { } } else if (command.equals("streamrecognize")) { streamingRecognizeFile(path); + } else if (command.equals("micstreamrecognize")) { + streamingMicRecognize(); } else if (command.equals("model-selection")) { if (path.startsWith("gs://")) { transcribeModelSelectionGcs(path); @@ -704,6 +716,101 @@ public SettableFuture> future() { } // [END speech_stream_recognize_punctuation] + // [START speech_streaming_mic_recognize] + + /** + * Performs microphone streaming speech recognition with a duration of 1 minute. + * + * @throws Exception + */ + public static void streamingMicRecognize() throws Exception { + AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false); + DataLine.Info targetInfo = new Info(TargetDataLine.class, audioFormat); + TargetDataLine targetDataLine; + int BYTES_PER_BUFFER = 6400; // buffer size in bytes + int durationMillSec = 60 * 1000; // 60 seconds + if (!AudioSystem.isLineSupported(targetInfo)) { + System.out.println("Microphone not supported"); + System.exit(0); + } + + ResponseObserver responseObserver = null; + try (SpeechClient client = SpeechClient.create()) { + + responseObserver = + new ResponseObserver() { + ArrayList responses = new ArrayList<>(); + + public void onStart(StreamController controller) {} + + public void onResponse(StreamingRecognizeResponse response) { + responses.add(response); + } + + public void onComplete() { + for (StreamingRecognizeResponse response : responses) { + StreamingRecognitionResult result = response.getResultsList().get(0); + SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcript : %s\n", alternative.getTranscript()); + } + } + + public void onError(Throwable t) { + System.out.println(t); + } + }; + + ClientStream clientStream = + client.streamingRecognizeCallable().splitCall(responseObserver); + + RecognitionConfig recConfig = + RecognitionConfig.newBuilder() + .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .build(); + StreamingRecognitionConfig config = + StreamingRecognitionConfig.newBuilder().setConfig(recConfig).build(); + + StreamingRecognizeRequest request = + StreamingRecognizeRequest.newBuilder() + .setStreamingConfig(config) + .build(); // The first request in a streaming call has to be a config + + clientStream.send(request); + + // Get the target data line + targetDataLine = (TargetDataLine) AudioSystem.getLine(targetInfo); + targetDataLine.open(audioFormat); + targetDataLine.start(); + System.out.println("Start speaking"); + long startTime = System.currentTimeMillis(); + // Audio Input Stream + AudioInputStream audio = new AudioInputStream(targetDataLine); + while (true) { + long estimatedTime = System.currentTimeMillis() - startTime; + byte[] data = new byte[BYTES_PER_BUFFER]; + audio.read(data); + if (estimatedTime > durationMillSec) { + System.out.println("Stop speaking."); + targetDataLine.stop(); + targetDataLine.close(); + break; + } + request = + StreamingRecognizeRequest.newBuilder() + .setAudioContent(ByteString.copyFrom(data)) + .build(); + clientStream.send(request); + } + } catch (Exception e) { + System.out.println(e); + } + responseObserver.onComplete(); + } + + // [END speech_streaming_mic_recognize] + // [START speech_transcribe_file_with_enhanced_model] /** * Transcribe the given audio file using an enhanced model. @@ -833,8 +940,9 @@ public static void transcribeDiarization(String fileName) throws Exception { SpeechRecognitionAlternative alternative = result.getAlternatives(0); System.out.format("Transcript : %s\n", alternative.getTranscript()); // The words array contains the entire transcript up until that point. - //Referencing the last spoken word to get the associated Speaker tag - System.out.format("Speaker Tag %s: %s\n", + // Referencing the last spoken word to get the associated Speaker tag + System.out.format( + "Speaker Tag %s: %s\n", alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(), alternative.getTranscript()); } @@ -877,8 +985,9 @@ public static void transcribeDiarizationGcs(String gcsUri) throws Exception { // use the first (most likely) one here. SpeechRecognitionAlternative alternative = result.getAlternatives(0); // The words array contains the entire transcript up until that point. - //Referencing the last spoken word to get the associated Speaker tag - System.out.format("Speaker Tag %s:%s\n", + // Referencing the last spoken word to get the associated Speaker tag + System.out.format( + "Speaker Tag %s:%s\n", alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(), alternative.getTranscript()); } From 7b6b10806aacba05453e79519910668696517dcc Mon Sep 17 00:00:00 2001 From: nirupa-kumar Date: Thu, 16 Aug 2018 15:58:43 -0700 Subject: [PATCH 2/4] Fixed audit issues. --- .../src/main/java/com/example/speech/Recognize.java | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java index 29cd1f45712..61dbd1b95aa 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java @@ -66,9 +66,10 @@ public static void main(String... args) throws Exception { System.out.printf( "\tjava %s \"\" \"\"\n" + "Commands:\n" - + "\tsyncrecognize | asyncrecognize | streamrecognize | micstreamrecognize | wordoffsets\n" - + "\t| model-selection | auto-punctuation | stream-punctuation | enhanced-model\n" - + "\t| metadata | diarization | multi-channel | multi-language | word-level-conf" + + "\tsyncrecognize | asyncrecognize | streamrecognize | micstreamrecognize \n" + + "\t| wordoffsets | model-selection | auto-punctuation | stream-punctuation \n" + + "\t| enhanced-model| metadata | diarization | multi-channel | multi-language \n" + + "\t | word-level-conf" + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI " + "for a Cloud Storage resource (gs://...)\n", Recognize.class.getCanonicalName()); @@ -721,13 +722,11 @@ public SettableFuture> future() { /** * Performs microphone streaming speech recognition with a duration of 1 minute. * - * @throws Exception */ public static void streamingMicRecognize() throws Exception { AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false); DataLine.Info targetInfo = new Info(TargetDataLine.class, audioFormat); TargetDataLine targetDataLine; - int BYTES_PER_BUFFER = 6400; // buffer size in bytes int durationMillSec = 60 * 1000; // 60 seconds if (!AudioSystem.isLineSupported(targetInfo)) { System.out.println("Microphone not supported"); @@ -789,7 +788,7 @@ public void onError(Throwable t) { AudioInputStream audio = new AudioInputStream(targetDataLine); while (true) { long estimatedTime = System.currentTimeMillis() - startTime; - byte[] data = new byte[BYTES_PER_BUFFER]; + byte[] data = new byte[6400]; audio.read(data); if (estimatedTime > durationMillSec) { System.out.println("Stop speaking."); From 838f7a3133f2dcdb5434e91efa00b2b2f5c3f7c6 Mon Sep 17 00:00:00 2001 From: nirupa-kumar Date: Fri, 17 Aug 2018 11:10:09 -0700 Subject: [PATCH 3/4] Fixing issues after review. --- .../java/com/example/speech/Recognize.java | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java index 61dbd1b95aa..fa29233313c 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java @@ -718,20 +718,8 @@ public SettableFuture> future() { // [END speech_stream_recognize_punctuation] // [START speech_streaming_mic_recognize] - - /** - * Performs microphone streaming speech recognition with a duration of 1 minute. - * - */ + /** Performs microphone streaming speech recognition with a duration of 1 minute. */ public static void streamingMicRecognize() throws Exception { - AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false); - DataLine.Info targetInfo = new Info(TargetDataLine.class, audioFormat); - TargetDataLine targetDataLine; - int durationMillSec = 60 * 1000; // 60 seconds - if (!AudioSystem.isLineSupported(targetInfo)) { - System.out.println("Microphone not supported"); - System.exit(0); - } ResponseObserver responseObserver = null; try (SpeechClient client = SpeechClient.create()) { @@ -762,23 +750,35 @@ public void onError(Throwable t) { ClientStream clientStream = client.streamingRecognizeCallable().splitCall(responseObserver); - RecognitionConfig recConfig = + RecognitionConfig recognitionConfig = RecognitionConfig.newBuilder() .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16) .setLanguageCode("en-US") .setSampleRateHertz(16000) .build(); - StreamingRecognitionConfig config = - StreamingRecognitionConfig.newBuilder().setConfig(recConfig).build(); + StreamingRecognitionConfig streamingRecognitionConfig = + StreamingRecognitionConfig.newBuilder().setConfig(recognitionConfig).build(); StreamingRecognizeRequest request = StreamingRecognizeRequest.newBuilder() - .setStreamingConfig(config) + .setStreamingConfig(streamingRecognitionConfig) .build(); // The first request in a streaming call has to be a config clientStream.send(request); - - // Get the target data line + // SampleRate:16000Hz, SampleSizeInBits: 16, Number of channels: 1, Signed: true, + // bigEndian: false + AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false); + DataLine.Info targetInfo = + new Info( + TargetDataLine.class, + audioFormat); // Set the system information to read from the microphone audio stream + TargetDataLine targetDataLine; + int durationMillSec = 60 * 1000; // 60 seconds + if (!AudioSystem.isLineSupported(targetInfo)) { + System.out.println("Microphone not supported"); + System.exit(0); + } + //Target data line captures the audio stream the microphone produces. targetDataLine = (TargetDataLine) AudioSystem.getLine(targetInfo); targetDataLine.open(audioFormat); targetDataLine.start(); @@ -807,7 +807,6 @@ public void onError(Throwable t) { } responseObserver.onComplete(); } - // [END speech_streaming_mic_recognize] // [START speech_transcribe_file_with_enhanced_model] From 5cf46b90e8c395d646ff32c7d92e6b68fd1c21d2 Mon Sep 17 00:00:00 2001 From: nirupa-kumar Date: Fri, 17 Aug 2018 11:46:40 -0700 Subject: [PATCH 4/4] Fixing review issues. --- .../src/main/java/com/example/speech/Recognize.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java index fa29233313c..15beaba3e1a 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java @@ -772,14 +772,13 @@ public void onError(Throwable t) { new Info( TargetDataLine.class, audioFormat); // Set the system information to read from the microphone audio stream - TargetDataLine targetDataLine; - int durationMillSec = 60 * 1000; // 60 seconds + if (!AudioSystem.isLineSupported(targetInfo)) { System.out.println("Microphone not supported"); System.exit(0); } - //Target data line captures the audio stream the microphone produces. - targetDataLine = (TargetDataLine) AudioSystem.getLine(targetInfo); + // Target data line captures the audio stream the microphone produces. + TargetDataLine targetDataLine = (TargetDataLine) AudioSystem.getLine(targetInfo); targetDataLine.open(audioFormat); targetDataLine.start(); System.out.println("Start speaking"); @@ -790,7 +789,7 @@ public void onError(Throwable t) { long estimatedTime = System.currentTimeMillis() - startTime; byte[] data = new byte[6400]; audio.read(data); - if (estimatedTime > durationMillSec) { + if (estimatedTime > 60000) { // 60 seconds System.out.println("Stop speaking."); targetDataLine.stop(); targetDataLine.close();