Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add samples for enhanced models and metadata #1093

Merged
merged 2 commits into from
Apr 30, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions speech/cloud-client/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,15 @@ Performing streaming speech transcription and punctuation on an audio file
```
mvn exec:java -DRecognize -Dexec.args="stream-punctuation ./resources/audio.raw"
```

## Enhanced Model
Transcribe an audio file using an enhanced model
```
mvn exec:java -DRecognize -Dexec.args="enhanced-model ./resources/commercial_mono.wav"
```

## Recognition Metadata
Transcribe an audio file with recognition metadata
```
mvn exec:java -DRecognize -Dexec.args="metadata ./resources/commercial_mono.wav"
```
2 changes: 1 addition & 1 deletion speech/cloud-client/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-speech</artifactId>
<version>0.42.0-alpha</version>
<version>0.46.0-alpha</version>
</dependency>
<!-- [END dependencies] -->

Expand Down
Binary file not shown.
103 changes: 102 additions & 1 deletion speech/cloud-client/src/main/java/com/example/speech/Recognize.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding;
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata;
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType;
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
import com.google.cloud.speech.v1p1beta1.SpeechClient;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
Expand Down Expand Up @@ -53,7 +57,7 @@ public static void main(String... args) throws Exception {
"\tjava %s \"<command>\" \"<path-to-image>\"\n"
+ "Commands:\n"
+ "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets | model-selection\n"
+ "\t| auto-punctuation | stream-punctuation\n"
+ "\t| auto-punctuation | stream-punctuation | enhanced-model | metadata\n"
+ "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
+ "for a Cloud Storage resource (gs://...)\n",
Recognize.class.getCanonicalName());
Expand Down Expand Up @@ -97,6 +101,10 @@ public static void main(String... args) throws Exception {
}
} else if (command.equals("stream-punctuation")) {
streamingTranscribeWithAutomaticPunctuation(path);
} else if (command.equals("enhanced-model")) {
transcribeFileWithEnhancedModel(path);
} else if (command.equals("metadata")) {
transcribeFileWithMetadata(path);
}
}

Expand Down Expand Up @@ -678,4 +686,97 @@ public SettableFuture<List<T>> future() {
}
}
// [END speech_stream_recognize_punctuation]

// [START speech_transcribe_file_with_enhanced_model]
/**
 * Transcribes a local audio file using one of the premium "enhanced" recognition models
 * (here the {@code phone_call} model) and prints the resulting transcript to stdout.
 *
 * @param fileName the path to an audio file.
 */
public static void transcribeFileWithEnhancedModel(String fileName) throws Exception {
  byte[] audioBytes = Files.readAllBytes(Paths.get(fileName));

  try (SpeechClient speechClient = SpeechClient.create()) {
    // Wrap the raw bytes of the local audio file for the recognition request.
    RecognitionAudio audio =
        RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(audioBytes)).build();

    // Build a request that opts in to an enhanced model. Enhanced models are only
    // available to projects that opt in for audio data collection, and a specific
    // model name must be supplied alongside setUseEnhanced(true).
    RecognitionConfig recognitionConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(8000)
            .setUseEnhanced(true)
            .setModel("phone_call")
            .build();

    // Issue the synchronous recognition request.
    RecognizeResponse response = speechClient.recognize(recognitionConfig, audio);

    // Each result may contain several alternative transcripts for a chunk of speech;
    // print only the first (most likely) alternative.
    for (SpeechRecognitionResult result : response.getResultsList()) {
      SpeechRecognitionAlternative alternative = result.getAlternatives(0);
      System.out.format("Transcript: %s\n\n", alternative.getTranscript());
    }
  }
}
// [END speech_transcribe_file_with_enhanced_model]

// [START speech_transcribe_file_with_metadata]
/**
 * Transcribes a local audio file, attaching {@link RecognitionMetadata} to the request
 * to describe the recording context, and prints the resulting transcript to stdout.
 *
 * @param fileName the path to an audio file.
 */
public static void transcribeFileWithMetadata(String fileName) throws Exception {
  byte[] audioBytes = Files.readAllBytes(Paths.get(fileName));

  try (SpeechClient speechClient = SpeechClient.create()) {
    // Wrap the raw bytes of the local audio file for the recognition request.
    RecognitionAudio audio =
        RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(audioBytes)).build();

    // Describe how the audio was captured. Most metadata fields are enums nested in
    // RecognitionMetadata; some are free-form strings (the device name) and some are
    // integers, e.g. the 6-digit NAICS industry code (https://www.naics.com/search/).
    RecognitionMetadata metadata =
        RecognitionMetadata.newBuilder()
            .setInteractionType(InteractionType.DISCUSSION)
            .setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
            .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
            .setRecordingDeviceName("Pixel 2 XL")
            .setIndustryNaicsCodeOfAudio(519190)
            .build();

    // Attach the metadata to the recognition configuration.
    RecognitionConfig recognitionConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(8000)
            .setMetadata(metadata)
            .build();

    // Issue the synchronous recognition request.
    RecognizeResponse response = speechClient.recognize(recognitionConfig, audio);

    // Each result may contain several alternative transcripts for a chunk of speech;
    // print only the first (most likely) alternative.
    for (SpeechRecognitionResult result : response.getResultsList()) {
      SpeechRecognitionAlternative alternative = result.getAlternatives(0);
      System.out.format("Transcript: %s\n\n", alternative.getTranscript());
    }
  }
}
// [END speech_transcribe_file_with_metadata]
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ public class RecognizeIT {
private String videoFileName = "./resources/Google_Gnome.wav";
private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav";

private String recognitionAudioFile = "./resources/commercial_mono.wav";

@Before
public void setUp() {
bout = new ByteArrayOutputStream();
Expand Down Expand Up @@ -145,4 +147,18 @@ public void testStreamAutoPunctuation() throws Exception {
String got = bout.toString();
assertThat(got).contains("How old is the Brooklyn Bridge?");
}

@Test
public void testEnhancedModel() throws Exception {
  // Run the enhanced-model sample against the bundled phone-call recording and
  // verify that the captured stdout contains an expected word from the transcript.
  Recognize.transcribeFileWithEnhancedModel(recognitionAudioFile);
  assertThat(bout.toString()).contains("Chrome");
}

@Test
public void testMetadata() throws Exception {
  // Run the recognition-metadata sample against the bundled phone-call recording and
  // verify that the captured stdout contains an expected word from the transcript.
  Recognize.transcribeFileWithMetadata(recognitionAudioFile);
  assertThat(bout.toString()).contains("Chrome");
}
}