Added voice control
Former-commit-id: 6f69079bf44f0d8f9ae40de6b0f1638d103464c2
This commit is contained in:
parent 35c92407a3
commit 53da641909
863 changed files with 192681 additions and 0 deletions
58  lib/sphinx4-5prealpha-src/sphinx4-samples/pom.xml  Normal file
@@ -0,0 +1,58 @@
<project
    xmlns="http://maven.apache.org/POM/4.0.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
                        http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <parent>
    <groupId>edu.cmu.sphinx</groupId>
    <artifactId>sphinx4-parent</artifactId>
    <version>1.0-SNAPSHOT</version>
  </parent>

  <artifactId>sphinx4-samples</artifactId>
  <packaging>jar</packaging>

  <name>Sphinx4 demo applications</name>

  <dependencies>
    <dependency>
      <groupId>edu.cmu.sphinx</groupId>
      <artifactId>sphinx4-core</artifactId>
      <version>1.0-SNAPSHOT</version>
    </dependency>
    <dependency>
      <groupId>edu.cmu.sphinx</groupId>
      <artifactId>sphinx4-data</artifactId>
      <version>1.0-SNAPSHOT</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <addClasspath>true</addClasspath>
              <mainClass>edu.cmu.sphinx.demo.DemoRunner</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
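With the assembly plugin bound to the package phase and DemoRunner set as the manifest main class, building this module should yield a self-contained demo jar. A build sketch; the exact artifact name is an assumption derived from the artifactId, version, and the jar-with-dependencies descriptor, so check target/ for the real name:

    mvn package
    java -jar target/sphinx4-samples-1.0-SNAPSHOT-jar-with-dependencies.jar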
@@ -0,0 +1,49 @@
package edu.cmu.sphinx.demo;

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.Map;
import java.util.TreeMap;

import edu.cmu.sphinx.demo.aligner.AlignerDemo;
import edu.cmu.sphinx.demo.dialog.DialogDemo;
import edu.cmu.sphinx.demo.speakerid.SpeakerIdentificationDemo;
import edu.cmu.sphinx.demo.transcriber.TranscriberDemo;

import static java.util.Arrays.copyOfRange;

public class DemoRunner {

    static final Class<?>[] paramTypes = new Class<?>[] {String[].class};
    private static final Map<String, Class<?>> classes =
            new TreeMap<String, Class<?>>();

    static {
        classes.put("aligner", AlignerDemo.class);
        classes.put("dialog", DialogDemo.class);
        classes.put("speakerid", SpeakerIdentificationDemo.class);
        classes.put("transcriber", TranscriberDemo.class);
    }

    public static void printUsage() {
        System.err.println("Usage: DemoRunner <DEMO> [<ARG> ...]\n");
        System.err.println("Demo names:");

        for (String name : classes.keySet())
            System.err.println("  " + name);
    }

    public static void main(String[] args) throws Throwable {
        if (0 == args.length || !classes.containsKey(args[0])) {
            printUsage();
            System.exit(1);
        }

        try {
            Method main = classes.get(args[0]).getMethod("main", paramTypes);
            main.invoke(null, new Object[]{copyOfRange(args, 1, args.length)});
        } catch (InvocationTargetException e) {
            throw e.getCause();
        }
    }
}
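Because DemoRunner is the jar's entry point, each sample can be launched by its registered name, with any remaining arguments forwarded to the selected demo's own main method. For example (the jar name is assumed as above; audio.wav and transcript.txt are placeholder file names):

    java -jar sphinx4-samples-1.0-SNAPSHOT-jar-with-dependencies.jar transcriber
    java -jar sphinx4-samples-1.0-SNAPSHOT-jar-with-dependencies.jar aligner audio.wav transcript.txt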
@@ -0,0 +1,108 @@
/*
 * Copyright 1999-2013 Carnegie Mellon University.
 * Portions Copyright 2004 Sun Microsystems, Inc.
 * Portions Copyright 2004 Mitsubishi Electric Research Laboratories.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 *
 */
package edu.cmu.sphinx.demo.aligner;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

import edu.cmu.sphinx.alignment.LongTextAligner;
import edu.cmu.sphinx.api.SpeechAligner;
import edu.cmu.sphinx.result.WordResult;

/**
 * This class demonstrates how to align audio to an existing transcription
 * and receive word timestamps.
 * <br>
 * In order to initialize the aligner you need to specify several data files
 * which can be found on the CMUSphinx website: an acoustic model for your
 * language, a dictionary, and an optional G2P model to convert word strings
 * to pronunciations.
 * <br>
 * Currently the audio must be in a specific format (16 kHz, 16 bit, mono),
 * but other formats will be supported in the future.
 * <br>
 * The text should be clean lower-case text, stripped of punctuation marks,
 * numbers and other non-speakable content. Automatic cleanup will be
 * supported in the future.
 */
public class AlignerDemo {
    private static final String ACOUSTIC_MODEL_PATH =
            "resource:/edu/cmu/sphinx/models/en-us/en-us";
    private static final String DICTIONARY_PATH =
            "resource:/edu/cmu/sphinx/models/en-us/cmudict-en-us.dict";
    private static final String TEXT = "one zero zero zero one nine oh two "
            + "one oh zero one eight zero three";

    public static void main(String args[]) throws Exception {
        URL audioUrl;
        String transcript;
        if (args.length > 1) {
            audioUrl = new File(args[0]).toURI().toURL();
            Scanner scanner = new Scanner(new File(args[1]));
            scanner.useDelimiter("\\Z");
            transcript = scanner.next();
            scanner.close();
        } else {
            audioUrl = AlignerDemo.class.getResource("10001-90210-01803.wav");
            transcript = TEXT;
        }
        String acousticModelPath =
                (args.length > 2) ? args[2] : ACOUSTIC_MODEL_PATH;
        String dictionaryPath = (args.length > 3) ? args[3] : DICTIONARY_PATH;
        String g2pPath = (args.length > 4) ? args[4] : null;
        SpeechAligner aligner =
                new SpeechAligner(acousticModelPath, dictionaryPath, g2pPath);

        List<WordResult> results = aligner.align(audioUrl, transcript);
        List<String> stringResults = new ArrayList<String>();
        for (WordResult wr : results) {
            stringResults.add(wr.getWord().getSpelling());
        }

        LongTextAligner textAligner =
                new LongTextAligner(stringResults, 2);
        List<String> sentences = aligner.getTokenizer().expand(transcript);
        List<String> words = aligner.sentenceToWords(sentences);

        int[] aid = textAligner.align(words);

        int lastId = -1;
        for (int i = 0; i < aid.length; ++i) {
            if (aid[i] == -1) {
                System.out.format("- %s\n", words.get(i));
            } else {
                if (aid[i] - lastId > 1) {
                    for (WordResult result : results.subList(lastId + 1,
                            aid[i])) {
                        System.out.format("+ %-25s [%s]\n", result.getWord()
                                .getSpelling(), result.getTimeFrame());
                    }
                }
                System.out.format("  %-25s [%s]\n", results.get(aid[i])
                        .getWord().getSpelling(), results.get(aid[i])
                        .getTimeFrame());
                lastId = aid[i];
            }
        }

        if (lastId >= 0 && results.size() - lastId > 1) {
            for (WordResult result : results.subList(lastId + 1,
                    results.size())) {
                System.out.format("+ %-25s [%s]\n", result.getWord()
                        .getSpelling(), result.getTimeFrame());
            }
        }
    }
}
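The Javadoc above describes what the aligner needs; here is a stripped-down sketch of the same API calls AlignerDemo makes, using its default acoustic model, dictionary and bundled audio. The class name MinimalAlignExample is hypothetical and the snippet is illustrative only, not part of the commit:

import java.net.URL;

import edu.cmu.sphinx.api.SpeechAligner;
import edu.cmu.sphinx.result.WordResult;

public class MinimalAlignExample {
    public static void main(String[] args) throws Exception {
        // Same defaults as AlignerDemo: US English acoustic model and dictionary, no G2P model.
        SpeechAligner aligner = new SpeechAligner(
                "resource:/edu/cmu/sphinx/models/en-us/en-us",
                "resource:/edu/cmu/sphinx/models/en-us/cmudict-en-us.dict",
                null);

        URL audio = MinimalAlignExample.class
                .getResource("/edu/cmu/sphinx/demo/aligner/10001-90210-01803.wav");
        String transcript = "one zero zero zero one nine oh two one oh zero one eight zero three";

        // align() returns one WordResult per recognized word, each carrying its time frame.
        for (WordResult wr : aligner.align(audio, transcript)) {
            System.out.println(wr.getWord().getSpelling() + " " + wr.getTimeFrame());
        }
    }
}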
@@ -0,0 +1,68 @@
/*
 * Copyright 2014 Carnegie Mellon University.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 */

package edu.cmu.sphinx.demo.allphone;

import java.io.InputStream;

import edu.cmu.sphinx.api.Configuration;
import edu.cmu.sphinx.api.Context;
import edu.cmu.sphinx.api.SpeechResult;
import edu.cmu.sphinx.recognizer.Recognizer;
import edu.cmu.sphinx.result.Result;
import edu.cmu.sphinx.result.WordResult;
import edu.cmu.sphinx.util.TimeFrame;

/**
 * A simple example that shows how to run phonetic (allphone) recognition
 * on a continuous audio file that has multiple utterances in it.
 */
public class AllphoneDemo {

    public static void main(String[] args) throws Exception {
        System.out.println("Loading models...");

        Configuration configuration = new Configuration();

        // Load model from the jar
        configuration
                .setAcousticModelPath("resource:/edu/cmu/sphinx/models/en-us/en-us");

        // You can also load model from folder
        // configuration.setAcousticModelPath("file:en-us");

        configuration
                .setDictionaryPath("resource:/edu/cmu/sphinx/models/en-us/cmudict-en-us.dict");
        Context context = new Context(configuration);
        context.setLocalProperty("decoder->searchManager", "allphoneSearchManager");
        Recognizer recognizer = context.getInstance(Recognizer.class);
        InputStream stream = AllphoneDemo.class
                .getResourceAsStream("/edu/cmu/sphinx/demo/aligner/10001-90210-01803.wav");
        stream.skip(44);

        // Simple recognition with generic model
        recognizer.allocate();
        context.setSpeechSource(stream, TimeFrame.INFINITE);
        Result result;
        while ((result = recognizer.recognize()) != null) {
            SpeechResult speechResult = new SpeechResult(result);
            System.out.format("Hypothesis: %s\n", speechResult.getHypothesis());

            System.out.println("List of recognized words and their times:");
            for (WordResult r : speechResult.getWords()) {
                System.out.println(r);
            }

            System.out.println("Lattice contains "
                    + speechResult.getLattice().getNodes().size() + " nodes");
        }
        recognizer.deallocate();

    }
}
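AllphoneDemo skips the first 44 bytes of the bundled resource to jump past the canonical WAV header before handing raw PCM to the recognizer (the transcriber demo below does the same). If hard-coding that offset feels fragile, Java Sound can strip the header instead; a sketch under the assumption that the recognizer accepts any InputStream positioned at the PCM data, which is how the demos use it. The class name is hypothetical:

import java.io.BufferedInputStream;
import java.io.InputStream;

import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;

public class WavHeaderSkipExample {
    public static void main(String[] args) throws Exception {
        InputStream raw = WavHeaderSkipExample.class
                .getResourceAsStream("/edu/cmu/sphinx/demo/aligner/10001-90210-01803.wav");

        // AudioSystem needs mark/reset support to sniff the container format, hence the buffer.
        // The returned AudioInputStream starts at the first PCM frame, so no manual skip(44).
        AudioInputStream pcm = AudioSystem.getAudioInputStream(new BufferedInputStream(raw));
        System.out.println("Format: " + pcm.getFormat());

        // pcm could now replace the manually skipped stream, e.g. in
        // context.setSpeechSource(pcm, TimeFrame.INFINITE) or recognizer.startRecognition(pcm).
        pcm.close();
    }
}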
@@ -0,0 +1,186 @@
/*
 * Copyright 2013 Carnegie Mellon University.
 * Portions Copyright 2004 Sun Microsystems, Inc.
 * Portions Copyright 2004 Mitsubishi Electric Research Laboratories.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 */

package edu.cmu.sphinx.demo.dialog;

import java.util.HashMap;
import java.util.Map;

import edu.cmu.sphinx.api.Configuration;
import edu.cmu.sphinx.api.LiveSpeechRecognizer;


public class DialogDemo {

    private static final String ACOUSTIC_MODEL =
            "resource:/edu/cmu/sphinx/models/en-us/en-us";
    private static final String DICTIONARY_PATH =
            "resource:/edu/cmu/sphinx/models/en-us/cmudict-en-us.dict";
    private static final String GRAMMAR_PATH =
            "resource:/edu/cmu/sphinx/demo/dialog/";
    private static final String LANGUAGE_MODEL =
            "resource:/edu/cmu/sphinx/demo/dialog/weather.lm";

    private static final Map<String, Integer> DIGITS =
            new HashMap<String, Integer>();

    static {
        DIGITS.put("oh", 0);
        DIGITS.put("zero", 0);
        DIGITS.put("one", 1);
        DIGITS.put("two", 2);
        DIGITS.put("three", 3);
        DIGITS.put("four", 4);
        DIGITS.put("five", 5);
        DIGITS.put("six", 6);
        DIGITS.put("seven", 7);
        DIGITS.put("eight", 8);
        DIGITS.put("nine", 9);
    }

    private static double parseNumber(String[] tokens) {
        StringBuilder sb = new StringBuilder();

        for (int i = 1; i < tokens.length; ++i) {
            if (tokens[i].equals("point"))
                sb.append(".");
            else
                sb.append(DIGITS.get(tokens[i]));
        }

        return Double.parseDouble(sb.toString());
    }

    private static void recognizeDigits(LiveSpeechRecognizer recognizer) {
        System.out.println("Digits recognition (using GrXML)");
        System.out.println("--------------------------------");
        System.out.println("Example: one two three");
        System.out.println("Say \"101\" to exit");
        System.out.println("--------------------------------");

        recognizer.startRecognition(true);
        while (true) {
            String utterance = recognizer.getResult().getHypothesis();
            if (utterance.equals("one zero one")
                    || utterance.equals("one oh one"))
                break;
            else
                System.out.println(utterance);
        }
        recognizer.stopRecognition();
    }

    private static void recognizerBankAccount(LiveSpeechRecognizer recognizer) {
        System.out.println("This is bank account voice menu");
        System.out.println("-------------------------------");
        System.out.println("Example: balance");
        System.out.println("Example: withdraw zero point five");
        System.out.println("Example: deposit one two three");
        System.out.println("Example: back");
        System.out.println("-------------------------------");

        double savings = .0;
        recognizer.startRecognition(true);

        while (true) {
            String utterance = recognizer.getResult().getHypothesis();
            if (utterance.endsWith("back")) {
                break;
            } else if (utterance.startsWith("deposit")) {
                double deposit = parseNumber(utterance.split("\\s"));
                savings += deposit;
                System.out.format("Deposited: $%.2f\n", deposit);
            } else if (utterance.startsWith("withdraw")) {
                double withdraw = parseNumber(utterance.split("\\s"));
                savings -= withdraw;
                System.out.format("Withdrawn: $%.2f\n", withdraw);
            } else if (!utterance.endsWith("balance")) {
                System.out.println("Unrecognized command: " + utterance);
            }

            System.out.format("Your savings: $%.2f\n", savings);
        }

        recognizer.stopRecognition();
    }

    private static void recognizeWeather(LiveSpeechRecognizer recognizer) {
        System.out.println("Try some forecast. End with \"the end\"");
        System.out.println("-------------------------------------");
        System.out.println("Example: mostly dry some fog patches tonight");
        System.out.println("Example: sunny spells on wednesday");
        System.out.println("-------------------------------------");

        recognizer.startRecognition(true);
        while (true) {
            String utterance = recognizer.getResult().getHypothesis();
            if (utterance.equals("the end"))
                break;
            else
                System.out.println(utterance);
        }
        recognizer.stopRecognition();
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        configuration.setAcousticModelPath(ACOUSTIC_MODEL);
        configuration.setDictionaryPath(DICTIONARY_PATH);
        configuration.setGrammarPath(GRAMMAR_PATH);
        configuration.setUseGrammar(true);

        configuration.setGrammarName("dialog");
        LiveSpeechRecognizer jsgfRecognizer =
                new LiveSpeechRecognizer(configuration);

        configuration.setGrammarName("digits.grxml");
        LiveSpeechRecognizer grxmlRecognizer =
                new LiveSpeechRecognizer(configuration);

        configuration.setUseGrammar(false);
        configuration.setLanguageModelPath(LANGUAGE_MODEL);
        LiveSpeechRecognizer lmRecognizer =
                new LiveSpeechRecognizer(configuration);

        jsgfRecognizer.startRecognition(true);
        while (true) {
            System.out.println("Choose menu item:");
            System.out.println("Example: go to the bank account");
            System.out.println("Example: exit the program");
            System.out.println("Example: weather forecast");
            System.out.println("Example: digits\n");

            String utterance = jsgfRecognizer.getResult().getHypothesis();

            if (utterance.startsWith("exit"))
                break;

            if (utterance.equals("digits")) {
                jsgfRecognizer.stopRecognition();
                recognizeDigits(grxmlRecognizer);
                jsgfRecognizer.startRecognition(true);
            }

            if (utterance.equals("bank account")) {
                jsgfRecognizer.stopRecognition();
                recognizerBankAccount(jsgfRecognizer);
                jsgfRecognizer.startRecognition(true);
            }

            if (utterance.endsWith("weather forecast")) {
                jsgfRecognizer.stopRecognition();
                recognizeWeather(lmRecognizer);
                jsgfRecognizer.startRecognition(true);
            }
        }

        jsgfRecognizer.stopRecognition();
    }
}
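For clarity on how DialogDemo turns spoken amounts into numbers: parseNumber skips the command word at index 0 and concatenates digit words, so "withdraw zero point five" becomes "0.5" and "deposit one two three" becomes "123". A self-contained rendition of the same logic, duplicated here purely for illustration (the class name is hypothetical):

import java.util.HashMap;
import java.util.Map;

public class ParseNumberExample {

    private static final Map<String, Integer> DIGITS = new HashMap<String, Integer>();

    static {
        String[] words = {"oh", "zero", "one", "two", "three",
                          "four", "five", "six", "seven", "eight", "nine"};
        int[] values = {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
        for (int i = 0; i < words.length; i++)
            DIGITS.put(words[i], values[i]);
    }

    // Same shape as DialogDemo.parseNumber: tokens[0] is the command word and is ignored.
    static double parseNumber(String[] tokens) {
        StringBuilder sb = new StringBuilder();
        for (int i = 1; i < tokens.length; ++i) {
            if (tokens[i].equals("point"))
                sb.append(".");
            else
                sb.append(DIGITS.get(tokens[i]));
        }
        return Double.parseDouble(sb.toString());
    }

    public static void main(String[] args) {
        System.out.println(parseNumber("withdraw zero point five".split("\\s"))); // 0.5
        System.out.println(parseNumber("deposit one two three".split("\\s")));    // 123.0
    }
}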
@@ -0,0 +1,123 @@
package edu.cmu.sphinx.demo.speakerid;

import java.net.URL;
import java.util.ArrayList;

import edu.cmu.sphinx.api.Configuration;
import edu.cmu.sphinx.api.SpeechResult;
import edu.cmu.sphinx.api.StreamSpeechRecognizer;
import edu.cmu.sphinx.decoder.adaptation.Stats;
import edu.cmu.sphinx.decoder.adaptation.Transform;
import edu.cmu.sphinx.speakerid.Segment;
import edu.cmu.sphinx.speakerid.SpeakerCluster;
import edu.cmu.sphinx.speakerid.SpeakerIdentification;
import edu.cmu.sphinx.util.TimeFrame;

public class SpeakerIdentificationDemo {

    /**
     * Returns a string version of the given time in milliseconds.
     *
     * @param milliseconds time in milliseconds
     * @return the time formatted as minutes:seconds
     */
    public static String time(int milliseconds) {
        return (milliseconds / 60000) + ":"
                + (Math.round((double) (milliseconds % 60000) / 1000));
    }

    /**
     * @param speakers
     *            A list of speaker clusters whose intervals should be printed
     * @param fileName
     *            The name of the file being processed
     */
    public static void printSpeakerIntervals(
            ArrayList<SpeakerCluster> speakers, String fileName) {
        int idx = 0;
        for (SpeakerCluster spk : speakers) {
            idx++;
            ArrayList<Segment> segments = spk.getSpeakerIntervals();
            for (Segment seg : segments)
                System.out.println(fileName + " " + " "
                        + time(seg.getStartTime()) + " "
                        + time(seg.getLength()) + " Speaker" + idx);
        }
    }

    /**
     * @param speakers
     *            A list of speaker clusters whose intervals are decoded with
     *            per-speaker adaptation based on the diarization
     * @param url
     *            URL of the audio file
     * @throws Exception if something went wrong
     */
    public static void speakerAdaptiveDecoding(ArrayList<SpeakerCluster> speakers,
            URL url) throws Exception {

        Configuration configuration = new Configuration();

        // Load model from the jar
        configuration
                .setAcousticModelPath("resource:/edu/cmu/sphinx/models/en-us/en-us");
        configuration
                .setDictionaryPath("resource:/edu/cmu/sphinx/models/en-us/cmudict-en-us.dict");
        configuration
                .setLanguageModelPath("resource:/edu/cmu/sphinx/models/en-us/en-us.lm.dmp");

        StreamSpeechRecognizer recognizer = new StreamSpeechRecognizer(
                configuration);

        TimeFrame t;
        SpeechResult result;

        for (SpeakerCluster spk : speakers) {
            Stats stats = recognizer.createStats(1);
            ArrayList<Segment> segments = spk.getSpeakerIntervals();

            for (Segment s : segments) {
                long startTime = s.getStartTime();
                long endTime = s.getStartTime() + s.getLength();
                t = new TimeFrame(startTime, endTime);

                recognizer.startRecognition(url.openStream(), t);
                while ((result = recognizer.getResult()) != null) {
                    stats.collect(result);
                }
                recognizer.stopRecognition();
            }

            Transform profile;
            // Create the Transformation
            profile = stats.createTransform();
            recognizer.setTransform(profile);

            for (Segment seg : segments) {
                long startTime = seg.getStartTime();
                long endTime = seg.getStartTime() + seg.getLength();
                t = new TimeFrame(startTime, endTime);

                // Decode again with updated SpeakerProfile
                recognizer.startRecognition(url.openStream(), t);
                while ((result = recognizer.getResult()) != null) {
                    System.out.format("Hypothesis: %s\n",
                            result.getHypothesis());
                }
                recognizer.stopRecognition();
            }
        }
    }

    public static void main(String[] args) throws Exception {
        SpeakerIdentification sd = new SpeakerIdentification();
        URL url = SpeakerIdentificationDemo.class.getResource("test.wav");
        ArrayList<SpeakerCluster> clusters = sd.cluster(url.openStream());

        printSpeakerIntervals(clusters, url.getPath());
        speakerAdaptiveDecoding(clusters, url);
    }
}
@@ -0,0 +1,100 @@
/*
 * Copyright 1999-2013 Carnegie Mellon University.
 * Portions Copyright 2004 Sun Microsystems, Inc.
 * Portions Copyright 2004 Mitsubishi Electric Research Laboratories.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 */

package edu.cmu.sphinx.demo.transcriber;

import java.io.InputStream;

import edu.cmu.sphinx.api.Configuration;
import edu.cmu.sphinx.api.SpeechResult;
import edu.cmu.sphinx.api.StreamSpeechRecognizer;
import edu.cmu.sphinx.decoder.adaptation.Stats;
import edu.cmu.sphinx.decoder.adaptation.Transform;
import edu.cmu.sphinx.result.WordResult;

/**
 * A simple example that shows how to transcribe a continuous audio file that
 * has multiple utterances in it.
 */
public class TranscriberDemo {

    public static void main(String[] args) throws Exception {
        System.out.println("Loading models...");

        Configuration configuration = new Configuration();

        // Load model from the jar
        configuration
                .setAcousticModelPath("resource:/edu/cmu/sphinx/models/en-us/en-us");

        // You can also load model from folder
        // configuration.setAcousticModelPath("file:en-us");

        configuration
                .setDictionaryPath("resource:/edu/cmu/sphinx/models/en-us/cmudict-en-us.dict");
        configuration
                .setLanguageModelPath("resource:/edu/cmu/sphinx/models/en-us/en-us.lm.dmp");

        StreamSpeechRecognizer recognizer = new StreamSpeechRecognizer(
                configuration);
        InputStream stream = TranscriberDemo.class
                .getResourceAsStream("/edu/cmu/sphinx/demo/aligner/10001-90210-01803.wav");
        stream.skip(44);

        // Simple recognition with generic model
        recognizer.startRecognition(stream);
        SpeechResult result;
        while ((result = recognizer.getResult()) != null) {

            System.out.format("Hypothesis: %s\n", result.getHypothesis());

            System.out.println("List of recognized words and their times:");
            for (WordResult r : result.getWords()) {
                System.out.println(r);
            }

            System.out.println("Best 3 hypothesis:");
            for (String s : result.getNbest(3))
                System.out.println(s);

        }
        recognizer.stopRecognition();

        // Live adaptation to speaker with speaker profiles

        stream = TranscriberDemo.class
                .getResourceAsStream("/edu/cmu/sphinx/demo/aligner/10001-90210-01803.wav");
        stream.skip(44);

        // Stats class is used to collect speaker-specific data
        Stats stats = recognizer.createStats(1);
        recognizer.startRecognition(stream);
        while ((result = recognizer.getResult()) != null) {
            stats.collect(result);
        }
        recognizer.stopRecognition();

        // Transform represents the speech profile
        Transform transform = stats.createTransform();
        recognizer.setTransform(transform);

        // Decode again with updated transform
        stream = TranscriberDemo.class
                .getResourceAsStream("/edu/cmu/sphinx/demo/aligner/10001-90210-01803.wav");
        stream.skip(44);
        recognizer.startRecognition(stream);
        while ((result = recognizer.getResult()) != null) {
            System.out.format("Hypothesis: %s\n", result.getHypothesis());
        }
        recognizer.stopRecognition();

    }
}
Binary file not shown.
@@ -0,0 +1,42 @@
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">

<!--

/**
 * Copyright 1999-2004 Carnegie Mellon University.
 * Portions Copyright 2004 Sun Microsystems, Inc.
 * Portions Copyright 2004 Mitsubishi Electric Research Laboratories.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 *
 */

-->

<html>
<head><title>Sphinx-4 Aligner Demo</title></head>
<style TYPE="text/css">
pre { font-size: medium; background: #f0f8ff; padding: 2mm;
      border-style: ridge ; color: teal }
code { font-size: medium; color: teal }
</style></head>
<body>
<span style="font-family: Times New Roman; ">
<div style="text-align: center;">
<table bgcolor="#99CCFF" width="100%">
<tr>
<td align=center width="100%">
<h1><i>Sphinx-4</i> Aligner Demo</h1>
</td>
</tr>
</table>
</div>
</span>

Aligns an audio file to its transcription and reports the times of the
words. Can be useful for closed captioning.
</body>
</html>
@@ -0,0 +1,29 @@
#JSGF V1.0;

grammar dialog;

<digit> = oh    |
          zero  |
          one   |
          two   |
          three |
          four  |
          five  |
          six   |
          seven |
          eight |
          nine  ;

<number> = <digit>+ [point <digit>+];

<menu_command> = digits |
                 [go to [the]] bank account |
                 weather forecast |
                 exit [[the] program] ;

<bank_command> = [show | check] balance |
                 deposit <number> |
                 withdraw <number> |
                 back ;

public <command> = <menu_command> | <bank_command>;
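This is the JSGF grammar that DialogDemo selects with setGrammarName("dialog"). The wiring below is copied from DialogDemo's main method and wrapped into a runnable sketch; the class name is hypothetical, and the assumption is that the grammar name is resolved to this file under the configured grammar path:

import edu.cmu.sphinx.api.Configuration;
import edu.cmu.sphinx.api.LiveSpeechRecognizer;

public class DialogGrammarWiring {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        configuration.setAcousticModelPath("resource:/edu/cmu/sphinx/models/en-us/en-us");
        configuration.setDictionaryPath("resource:/edu/cmu/sphinx/models/en-us/cmudict-en-us.dict");
        configuration.setGrammarPath("resource:/edu/cmu/sphinx/demo/dialog/");
        configuration.setUseGrammar(true);
        configuration.setGrammarName("dialog"); // resolved against the grammar path above

        // Recognize one utterance constrained by the <command> rule and print it.
        LiveSpeechRecognizer recognizer = new LiveSpeechRecognizer(configuration);
        recognizer.startRecognition(true);
        System.out.println(recognizer.getResult().getHypothesis());
        recognizer.stopRecognition();
    }
}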
@@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE grammar PUBLIC "-//W3C//DTD GRAMMAR 1.0//EN" "http://www.w3.org/TR/speech-grammar/grammar.dtd">
<grammar xmlns="http://www.w3.org/2001/06/grammar" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xml:lang="en" xsi:schemaLocation="http://www.w3.org/2001/06/grammar http://www.w3.org/TR/speech-grammar/grammar.xsd" version="1.0" mode="voice" root="digits">
  <rule id="digits" scope="public">
    <item repeat="1-">
      <one-of>
        <item> one </item>
        <item> two </item>
        <item> three </item>
        <item> four </item>
        <item> five </item>
        <item> six </item>
        <item> seven </item>
        <item> eight </item>
        <item> nine </item>
        <item> zero </item>
        <item> oh </item>
      </one-of>
    </item>
  </rule>
</grammar>
@@ -0,0 +1 @@
ec4cda3a0b3a0fcaa4d8685188f1f79f6d7f5bcf
@@ -0,0 +1,80 @@
Will remain dry apart from perhaps a little drizzle near the northwest
coast.

Extensive mist and fog patches expected also.

Some fog also.

Frost likely in many places.

Any rain or drizzle will die out tomorrow and sunny spells will develop.

Cloudy tonight in the north and northeast with some light rain or drizzle
in places.

Fresh or strong gusty southwest to west winds gradually veering
northwesterly.

Mostly dry calm and clear overnight with little or no wind.

Severe ground frost developing.

Rain or drizzle in the north and northeast will clear tomorrow leaving
a dry day with sunny spells in most areas.

Scattered blustery showers largely dying out later.

Fair weather in all areas, rain in the south and west later.

Showers will slowly become more isolated tonight.

A very cold night.

Continuing mild with a moderate to fresh southerly breeze.

Scattered showers with a risk of thunder later.

Becoming windy overnight but very mild.

Widespread haze and mist with scattered outbreaks of rain.

A few sunny breaks will develop by afternoon principally in the western
half of the country.

A second low pressure centre will move in across northern areas tomorrow.

Cold and windy with occasional showers.

Continuing rather cloudy over most parts of the country.

Tomorrow any fog, low cloud and drizzle will clear with most places dry
and sunny.

It will be cold overnight in most parts of the country with some clear
spells.

A weak ridge of high pressure will cross the country tonight.

Rain spreading to all areas from the west followed later by heavy showers.

Weather mainly fair if rather hazy.

The rest of the night will be dry in most areas though a few showers
are still possibly across northern and northeastern counties.

Windy with rain extending to all parts during the morning.

The rain becoming heavy in many areas.

Very windy with strong southerly winds gusting up to fifty or sixty mph.

Rain in the south and west will spread to remaining north east areas
overnight, heavy and persistent in places.

Clouds will increase from the west as the day goes on.

Misty in the south and southeast with patchy drizzle and a risk of fog.

The rest of the country will be overcast with outbreaks of rain heaviest
and most persistent in the south and southwest.
Binary file not shown.
@@ -0,0 +1,104 @@
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">

<!--

/**
 * Copyright 1999-2004 Carnegie Mellon University.
 * Portions Copyright 2004 Sun Microsystems, Inc.
 * Portions Copyright 2004 Mitsubishi Electric Research Laboratories.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 *
 */

-->

<html>
<head><title>Sphinx-4 Transcriber Demo</title></head>
<style TYPE="text/css">
pre { font-size: medium; background: #f0f8ff; padding: 2mm;
      border-style: ridge ; color: teal }
code { font-size: medium; color: teal }
</style></head>
<body>
<span style="font-family: Times New Roman; ">
<div style="text-align: center;">
<table bgcolor="#99CCFF" width="100%">
<tr>
<td align=center width="100%">
<h1><i>Sphinx-4</i> Transcriber Demo</h1>
</td>
</tr>
</table>
</div>
</span>

<span style="font-family: Arial; font-size: x-small; ">
<p>
A simple Sphinx-4 application that transcribes a continuous audio file
that has multiple utterances. The audio file should contain connected
digits data. The default file, called "10001-90210-01803.wav", contains
three utterances, separated by silences.
People who want to transcribe non-digits data should
modify the <code>config.xml</code> file to use the correct grammar,
language model, and linguist to do so. Please refer to the
<a href="../../../../../../../doc/ProgrammersGuide.html">Programmer's Guide</a>
on how to modify the configuration file for your purposes.
</p>

<h3>Building</h3>
<p>
Check if the <code>bin</code> directory already has the
<code>Transcriber.jar</code> file. If not, type the following in the top
level directory:
</p>
<code>ant -find demo.xml</code>
<h3>Running</h3>
<p>
To run the demo, type:
</p>
<code>sphinx4 > java -jar bin/Transcriber.jar</code>
<p>
You will see the following result, with each utterance on its own line:
<pre>
one zero zero zero one
nine oh two one oh
zero one eight zero three
</pre>
<p>
<span style="color: FF0000; "><b>NOTE:</b></span>
<ol>
<li>
Make sure that you are using Java<sup>TM</sup> 2 SDK, Standard Edition,
v1.4 or higher.
</li>
<li>
If you have the source distribution, make sure that the JAR file
<code>lib/sphinx4.jar</code> is built. If not, go to the top level
directory and type: <code>ant</code>
</li>
<li>
You can supply your own test files, but they must be digits data.
Just make sure that the audio format is the same as in the
config.xml file, which is 16-bit signed PCM-linear, 16kHz, little-endian.
The audio file format can be any format readable by Java Sound,
e.g., .wav, .au. To test your own file, supply it as an argument.
Suppose your test file is called <code>test.wav</code>, then:
<p><code>java -jar bin/Transcriber.jar test.wav</code>
</li>
</ol>
</p>
</span>
<hr>
Copyright 1999-2004 Carnegie Mellon University.
<br>
Portions Copyright 2002-2004 Sun Microsystems, Inc.
<br>
Portions Copyright 2002-2004 Mitsubishi Electric Research Laboratories.
<br>
All Rights Reserved.  Usage is subject to <a href="../../../../../../../license.terms">license terms</a>.
</body>
</html>