-
Notifications
You must be signed in to change notification settings - Fork 2.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Create test for a speech sample (InfiniteStreamRecognize) #9336
base: main
Are you sure you want to change the base?
Changes from 9 commits
42ec95c
eb7f110
13d1dda
a019754
f7be9d6
c6e8b71
380e357
c548fda
de30b87
b3e2da4
502d2b0
66b61c8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,6 +30,7 @@ | |
import com.google.cloud.speech.v1p1beta1.StreamingRecognizeResponse; | ||
import com.google.protobuf.ByteString; | ||
import com.google.protobuf.Duration; | ||
import java.nio.charset.StandardCharsets; | ||
import java.text.DecimalFormat; | ||
import java.util.ArrayList; | ||
import java.util.concurrent.BlockingQueue; | ||
|
@@ -39,19 +40,15 @@ | |
import javax.sound.sampled.AudioSystem; | ||
import javax.sound.sampled.DataLine; | ||
import javax.sound.sampled.DataLine.Info; | ||
import javax.sound.sampled.LineUnavailableException; | ||
import javax.sound.sampled.TargetDataLine; | ||
|
||
public class InfiniteStreamRecognize { | ||
|
||
private static final int STREAMING_LIMIT = 290000; // ~5 minutes | ||
|
||
public static final String RED = "\033[0;31m"; | ||
public static final String GREEN = "\033[0;32m"; | ||
public static final String YELLOW = "\033[0;33m"; | ||
|
||
// Creating shared object | ||
private static volatile BlockingQueue<byte[]> sharedQueue = new LinkedBlockingQueue<byte[]>(); | ||
private static TargetDataLine targetDataLine; | ||
private static int BYTES_PER_BUFFER = 6400; // buffer size in bytes | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Consider selecting multiple of 2 for the buffer size as a general guideline. Is it possible to explain how to select a certain value? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't explain how to select a certain value. |
||
|
||
private static int restartCounter = 0; | ||
|
@@ -63,19 +60,35 @@ public class InfiniteStreamRecognize { | |
private static boolean newStream = true; | ||
private static double bridgingOffset = 0; | ||
private static boolean lastTranscriptWasFinal = false; | ||
private static boolean stopRecognition = false; | ||
private static StreamController referenceToStreamController; | ||
private static ByteString tempByteString; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All constants in Java should use UPPER_SNAKE_CASE There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't have google acc :( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @minherz By constant, do you mean the variable marked as final or static is constant too? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I looked into the case closer. It looks like you made changes to the code sample itself and not only added the test case. I will ask a Googler who opened the issue to review the changes you made to the code sample itself.
I mean "constants" in Java. It should be literals declared with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeap, I refactored this sample to have a possibility to replace the voice recognition thread with a file or another source of an audio byte stream |
||
|
||
public static void main(String... args) { | ||
public static void main(String... args) throws LineUnavailableException { | ||
InfiniteStreamRecognizeOptions options = InfiniteStreamRecognizeOptions.fromFlags(args); | ||
if (options == null) { | ||
// Could not parse. | ||
System.out.println("Failed to parse options."); | ||
System.exit(1); | ||
} | ||
|
||
// TODO(developer): Replace the variables before running the sample or use default | ||
// the number of samples per second | ||
int sampleRate = 16000; | ||
// the number of bits in each sample | ||
int sampleSizeInBits = 16; | ||
// the number of channels (1 for mono, 2 for stereo, and so on) | ||
int channels = 1; | ||
// indicates whether the data is signed or unsigned | ||
boolean signed = true; | ||
// indicates whether the data for a single sample is stored in big-endian byte | ||
// order (false means little-endian) | ||
boolean bigEndian = false; | ||
|
||
MicBuffer micBuffer = new MicBuffer(sampleRate, sampleSizeInBits, channels, signed, bigEndian); | ||
try { | ||
infiniteStreamingRecognize(options.langCode); | ||
infiniteStreamingRecognize(options.langCode, micBuffer, sampleRate, | ||
RecognitionConfig.AudioEncoding.LINEAR16); | ||
Comment on lines
+90
to
+91
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand that this method exits when a stream has a word "exit". There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added a comment on how to stop, and this will also be printed when the application runs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have guidelines how code sample should look like. This repository is intended to host code used mainly for documentation code snippets. See go/code-snippets-101#what-is-a-code-snippet for the definition. It is not intended for demo applications or other complex solutions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please send a direct link, or if this link needs google account, could you please copy it here a phrase There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The documentation is available for Googlers. This is why we experience challenges when external contributors introduce complex code samples or changes to them in this repo. I would also like to better understand reasoning behind changing the code sample when the PR description explains changes as a test for the already existing code sample. |
||
} catch (Exception e) { | ||
System.out.println("Exception caught: " + e); | ||
} | ||
|
@@ -94,87 +107,22 @@ public static String convertMillisToDate(double milliSeconds) { | |
} | ||
|
||
/** Performs infinite streaming speech recognition */ | ||
public static void infiniteStreamingRecognize(String languageCode) throws Exception { | ||
|
||
// Microphone Input buffering | ||
class MicBuffer implements Runnable { | ||
|
||
@Override | ||
public void run() { | ||
System.out.println(YELLOW); | ||
System.out.println("Start speaking...Press Ctrl-C to stop"); | ||
targetDataLine.start(); | ||
byte[] data = new byte[BYTES_PER_BUFFER]; | ||
while (targetDataLine.isOpen()) { | ||
try { | ||
int numBytesRead = targetDataLine.read(data, 0, data.length); | ||
if ((numBytesRead <= 0) && (targetDataLine.isOpen())) { | ||
continue; | ||
} | ||
sharedQueue.put(data.clone()); | ||
} catch (InterruptedException e) { | ||
System.out.println("Microphone input buffering interrupted : " + e.getMessage()); | ||
} | ||
} | ||
} | ||
} | ||
|
||
public static void infiniteStreamingRecognize(String languageCode, Runnable micBuffer, | ||
int sampleRateHertz, | ||
RecognitionConfig.AudioEncoding encoding) | ||
throws Exception { | ||
// Creating microphone input buffer thread | ||
MicBuffer micrunnable = new MicBuffer(); | ||
Thread micThread = new Thread(micrunnable); | ||
ResponseObserver<StreamingRecognizeResponse> responseObserver = null; | ||
Thread micThread = new Thread(micBuffer); | ||
try (SpeechClient client = SpeechClient.create()) { | ||
ClientStream<StreamingRecognizeRequest> clientStream; | ||
responseObserver = | ||
new ResponseObserver<StreamingRecognizeResponse>() { | ||
|
||
ArrayList<StreamingRecognizeResponse> responses = new ArrayList<>(); | ||
|
||
public void onStart(StreamController controller) { | ||
referenceToStreamController = controller; | ||
} | ||
|
||
public void onResponse(StreamingRecognizeResponse response) { | ||
responses.add(response); | ||
StreamingRecognitionResult result = response.getResultsList().get(0); | ||
Duration resultEndTime = result.getResultEndTime(); | ||
resultEndTimeInMS = | ||
(int) | ||
((resultEndTime.getSeconds() * 1000) + (resultEndTime.getNanos() / 1000000)); | ||
double correctedTime = | ||
resultEndTimeInMS - bridgingOffset + (STREAMING_LIMIT * restartCounter); | ||
|
||
SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); | ||
if (result.getIsFinal()) { | ||
System.out.print(GREEN); | ||
System.out.print("\033[2K\r"); | ||
System.out.printf( | ||
"%s: %s [confidence: %.2f]\n", | ||
convertMillisToDate(correctedTime), | ||
alternative.getTranscript(), | ||
alternative.getConfidence()); | ||
isFinalEndTime = resultEndTimeInMS; | ||
lastTranscriptWasFinal = true; | ||
} else { | ||
System.out.print(RED); | ||
System.out.print("\033[2K\r"); | ||
System.out.printf( | ||
"%s: %s", convertMillisToDate(correctedTime), alternative.getTranscript()); | ||
lastTranscriptWasFinal = false; | ||
} | ||
} | ||
|
||
public void onComplete() {} | ||
|
||
public void onError(Throwable t) {} | ||
}; | ||
clientStream = client.streamingRecognizeCallable().splitCall(responseObserver); | ||
ResponseObserver<StreamingRecognizeResponse> responseObserver = getResponseObserver(); | ||
ClientStream<StreamingRecognizeRequest> clientStream = client.streamingRecognizeCallable() | ||
.splitCall(responseObserver); | ||
|
||
RecognitionConfig recognitionConfig = | ||
RecognitionConfig.newBuilder() | ||
.setEncoding(RecognitionConfig.AudioEncoding.LINEAR16) | ||
.setEncoding(encoding) | ||
.setLanguageCode(languageCode) | ||
.setSampleRateHertz(16000) | ||
.setSampleRateHertz(sampleRateHertz) | ||
.build(); | ||
|
||
StreamingRecognitionConfig streamingRecognitionConfig = | ||
|
@@ -191,28 +139,11 @@ public void onError(Throwable t) {} | |
clientStream.send(request); | ||
|
||
try { | ||
// SampleRate:16000Hz, SampleSizeInBits: 16, Number of channels: 1, Signed: true, | ||
// bigEndian: false | ||
AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false); | ||
DataLine.Info targetInfo = | ||
new Info( | ||
TargetDataLine.class, | ||
audioFormat); // Set the system information to read from the microphone audio | ||
// stream | ||
|
||
if (!AudioSystem.isLineSupported(targetInfo)) { | ||
System.out.println("Microphone not supported"); | ||
System.exit(0); | ||
} | ||
// Target data line captures the audio stream the microphone produces. | ||
targetDataLine = (TargetDataLine) AudioSystem.getLine(targetInfo); | ||
targetDataLine.open(audioFormat); | ||
micThread.setDaemon(true); | ||
micThread.start(); | ||
|
||
long startTime = System.currentTimeMillis(); | ||
|
||
while (true) { | ||
|
||
while (!stopRecognition) { | ||
long estimatedTime = System.currentTimeMillis() - startTime; | ||
|
||
if (estimatedTime >= STREAMING_LIMIT) { | ||
|
@@ -244,7 +175,6 @@ public void onError(Throwable t) {} | |
.setStreamingConfig(streamingRecognitionConfig) | ||
.build(); | ||
|
||
System.out.println(YELLOW); | ||
System.out.printf("%d: RESTARTING REQUEST\n", restartCounter * STREAMING_LIMIT); | ||
|
||
startTime = System.currentTimeMillis(); | ||
|
@@ -285,18 +215,121 @@ public void onError(Throwable t) {} | |
|
||
tempByteString = ByteString.copyFrom(sharedQueue.take()); | ||
|
||
checkStopRecognitionFlag(tempByteString.toByteArray()); | ||
|
||
request = | ||
StreamingRecognizeRequest.newBuilder().setAudioContent(tempByteString).build(); | ||
|
||
audioInput.add(tempByteString); | ||
} | ||
|
||
Thread.sleep(100); | ||
ssvir marked this conversation as resolved.
Show resolved
Hide resolved
|
||
clientStream.send(request); | ||
} | ||
clientStream.closeSend(); | ||
} catch (Exception e) { | ||
System.out.println(e); | ||
} | ||
} | ||
} | ||
|
||
public static ResponseObserver<StreamingRecognizeResponse> getResponseObserver() { | ||
return new ResponseObserver<StreamingRecognizeResponse>() { | ||
|
||
final ArrayList<StreamingRecognizeResponse> responses = new ArrayList<>(); | ||
|
||
public void onStart(StreamController controller) { | ||
referenceToStreamController = controller; | ||
} | ||
|
||
public void onResponse(StreamingRecognizeResponse response) { | ||
responses.add(response); | ||
StreamingRecognitionResult result = response.getResultsList().get(0); | ||
Duration resultEndTime = result.getResultEndTime(); | ||
resultEndTimeInMS = | ||
(int) ((resultEndTime.getSeconds() * 1000) + (resultEndTime.getNanos() / 1000000)); | ||
double correctedTime = | ||
resultEndTimeInMS - bridgingOffset + (STREAMING_LIMIT * restartCounter); | ||
|
||
SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); | ||
minherz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (result.getIsFinal()) { | ||
System.out.print("\r"); | ||
System.out.printf( | ||
"%s: %s [confidence: %.2f]\n", | ||
convertMillisToDate(correctedTime), | ||
alternative.getTranscript(), | ||
alternative.getConfidence()); | ||
isFinalEndTime = resultEndTimeInMS; | ||
lastTranscriptWasFinal = true; | ||
} else { | ||
System.out.print("\r"); | ||
System.out.printf( | ||
"%s: %s", convertMillisToDate(correctedTime), alternative.getTranscript()); | ||
lastTranscriptWasFinal = false; | ||
} | ||
checkStopRecognitionFlag(alternative.getTranscript().getBytes(StandardCharsets.UTF_8)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this method will fail if the "exit" word is not in the first element. can such thing happen? can the code reflect that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this code reflects only if the word "exit" will be present in a sentence , that's why we check element length before comparing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My question was related to line 256. Is it possible that other, less confident, alternatives interpret "exit" while the first one does not? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we have some possibility, so in this case, we should repeat "exit". There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMHO it does not look like a good code sample then. Code samples should be deterministic and easy to comprehend. |
||
} | ||
|
||
public void onComplete() { | ||
System.out.println("Recognition was stopped"); | ||
} | ||
|
||
public void onError(Throwable t) {} | ||
}; | ||
} | ||
|
||
private static void checkStopRecognitionFlag(byte[] flag) { | ||
if (flag.length < 10) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: it would be nice to make more clear why |
||
stopRecognition = new String(flag).trim().equalsIgnoreCase("exit"); | ||
putDataToSharedQueue("exit".getBytes(StandardCharsets.UTF_8)); | ||
} | ||
} | ||
|
||
// Microphone Input buffering | ||
static class MicBuffer implements Runnable { | ||
TargetDataLine targetDataLine; | ||
|
||
public MicBuffer(int sampleRate, int sampleSizeInBits, | ||
int channels, boolean signed, boolean bigEndian) | ||
throws LineUnavailableException { | ||
AudioFormat audioFormat | ||
= new AudioFormat(sampleRate, sampleSizeInBits, channels, signed, bigEndian); | ||
DataLine.Info targetInfo = | ||
new Info( | ||
TargetDataLine.class, | ||
audioFormat); // Set the system information to read from the microphone audio | ||
// stream | ||
|
||
if (!AudioSystem.isLineSupported(targetInfo)) { | ||
System.out.println("Microphone not supported"); | ||
System.exit(0); | ||
} | ||
// Target data line captures the audio stream the microphone produces. | ||
targetDataLine = (TargetDataLine) AudioSystem.getLine(targetInfo); | ||
targetDataLine.open(audioFormat); | ||
} | ||
|
||
@Override | ||
public void run() { | ||
System.out.println("Start speaking...Say `exit` to stop"); | ||
targetDataLine.start(); | ||
byte[] data = new byte[BYTES_PER_BUFFER]; | ||
while (targetDataLine.isOpen()) { | ||
int numBytesRead = targetDataLine.read(data, 0, data.length); | ||
if ((numBytesRead <= 0) && (targetDataLine.isOpen())) { | ||
continue; | ||
} | ||
putDataToSharedQueue(data.clone()); | ||
} | ||
} | ||
} | ||
|
||
public static void putDataToSharedQueue(byte[] data) { | ||
try { | ||
sharedQueue.put(data.clone()); | ||
} catch (InterruptedException e) { | ||
System.out.printf("Can't insert data to shared queue. Caused by : %s", e.getMessage()); | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
} | ||
// [END speech_transcribe_infinite_streaming] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: This code sample is called infinite streaming. The name "streaming limit" can be potentially confusing. Consider providing more meaningful name.