Android Alibaba Cloud Paraformer Speech Recognition (verified working; copy, paste, and run)
Paraformer's real-time transcription API continuously recognizes long streams of speech and returns results incrementally, which makes it suitable for scenarios such as conference talks and live video streams. The API provides rich output options, including intermediate text results, sentence-level text, words, and timestamps.
The local Java demo provided by the official documentation does not run on Android; the code below has been ported to Android.
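To build this on Android, the project needs the DashScope Java SDK (typically the com.alibaba:dashscope-sdk-java artifact) and RxJava 2 on the classpath, matching the com.alibaba.dashscope and io.reactivex imports below. The app must also declare and be granted the android.permission.RECORD_AUDIO permission, because audio is captured with AudioRecord. Config.Aliyun.dashScopeApiKey is a project-specific configuration field holding the DashScope API key; substitute your own configuration.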
package cn.netkiller.conference.ai.aliyun;
import android.media.AudioFormat;
import android.media.AudioRecord;
import android.media.MediaRecorder;
import android.util.Log;
import com.alibaba.dashscope.audio.asr.recognition.Recognition;
import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam;
import com.alibaba.dashscope.exception.NoApiKeyException;
import java.nio.ByteBuffer;
import cn.aigcsst.conference.ai.aigc.AigcSpeechRecognizer;
import cn.aigcsst.conference.config.Config;
import io.reactivex.BackpressureStrategy;
import io.reactivex.Flowable;
public class AliyunParaformerSpeechRecognizer implements AigcSpeechRecognizer {
    private static AudioRecord audioRecord = null;
    private final int bufferSizeInBytes = AudioRecord.getMinBufferSize(16000, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT);
    // Exit flags shared between the caller and the recording thread
    private final boolean[] shouldExit = {false};
    private final Object exitFlag = new Object();
    private static void startRecordingAndRecognition(Object exitFlag, boolean[] shouldExit) throws NoApiKeyException {
        // Create a Flowable<ByteBuffer> that streams microphone audio
        Flowable<ByteBuffer> audioSource = createAudioSourceWithControl(exitFlag, shouldExit);
        // Create the speech recognizer
        Recognition recognizer = new Recognition();
        // Create RecognitionParam; the Flowable<ByteBuffer> is passed to streamCall() below
        RecognitionParam param = RecognitionParam.builder()
                .model("paraformer-realtime-v2")
                .format("pcm")
                .sampleRate(16000)
                .apiKey(Config.Aliyun.dashScopeApiKey) // set your API key in Config.Aliyun.dashScopeApiKey
                .parameter("semantic_punctuation_enabled", false)
                .build();
        // Stream the audio to the recognizer and subscribe to the recognition results
        recognizer.streamCall(param, audioSource).blockingForEach(
                result -> {
                    if (result.isSentenceEnd()) {
                        System.out.println("Final Result: " + result.getSentence().getText());
                    } else {
                        System.out.println("Intermediate Result: " + result.getSentence().getText());
                    }
                });
        System.out.println("Recognition onComplete!");
        System.out.println(
                "[Metric] requestId: "
                        + recognizer.getLastRequestId()
                        + ", first package delay ms: "
                        + recognizer.getFirstPackageDelay()
                        + ", last package delay ms: "
                        + recognizer.getLastPackageDelay());
        // Unlike the official console demo, do not call System.exit() here: on Android it would kill the whole app process.
    }
    private static Flowable<ByteBuffer> createAudioSourceWithControl(
            Object exitFlag, boolean[] shouldExit) {
        // Create a Flowable<ByteBuffer> that emits raw PCM frames read from the AudioRecord
        return Flowable.create(
                emitter -> {
                    try {
                        audioRecord.startRecording();
                        ByteBuffer buffer = ByteBuffer.allocate(1024);
                        while (!shouldExit[0]) {
                            int read = audioRecord.read(buffer.array(), 0, buffer.capacity());
                            if (read > 0) {
                                buffer.limit(read);
                                emitter.onNext(buffer);
                                buffer = ByteBuffer.allocate(1024);
                                Thread.sleep(20); // Small delay to control CPU usage
                            }
                            synchronized (exitFlag) {
                                if (shouldExit[0]) {
                                    break;
                                }
                            }
                        }
                        // Signal the end of the audio stream so the recognizer can finish
                        emitter.onComplete();
                    } catch (Exception e) {
                        emitter.onError(e);
                    }
                },
                BackpressureStrategy.BUFFER);
    }
    public void startSpeechRecognizer() {
        // Requires the android.permission.RECORD_AUDIO runtime permission to already be granted
        audioRecord = new AudioRecord(MediaRecorder.AudioSource.MIC, 16000, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSizeInBytes);
        Log.i("audioRecordTest", "size->" + bufferSizeInBytes);
        // Recording and recognition block, so run them on a worker thread
        new Thread(
                () -> {
                    try {
                        startRecordingAndRecognition(exitFlag, shouldExit);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                })
                .start();
    }
    @Override
    public void stopSpeechRecognizer() {
        synchronized (exitFlag) {
            shouldExit[0] = true;
            exitFlag.notifyAll(); // Notify the recording thread to exit
        }
        if (audioRecord != null) {
            audioRecord.stop();
            Log.i("audioRecordTest", "Recording stopped");
            audioRecord.release();
        }
        System.out.println("Exit signal received. Exiting...");
    }
}
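A minimal usage sketch follows, assuming a hypothetical MainActivity in the same package, API level 23 or higher for runtime permissions, and android.permission.RECORD_AUDIO declared in the manifest; the class name and lifecycle wiring are illustrative only and simply show how startSpeechRecognizer() and stopSpeechRecognizer() are intended to be called.
// Hypothetical Activity: request the microphone permission, then start/stop the recognizer.
public class MainActivity extends android.app.Activity {
    private final AliyunParaformerSpeechRecognizer recognizer = new AliyunParaformerSpeechRecognizer();

    @Override
    protected void onCreate(android.os.Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        // Ask for the microphone permission if it has not been granted yet
        if (checkSelfPermission(android.Manifest.permission.RECORD_AUDIO)
                != android.content.pm.PackageManager.PERMISSION_GRANTED) {
            requestPermissions(new String[]{android.Manifest.permission.RECORD_AUDIO}, 1);
        } else {
            recognizer.startSpeechRecognizer();
        }
    }

    @Override
    public void onRequestPermissionsResult(int requestCode, String[] permissions, int[] grantResults) {
        super.onRequestPermissionsResult(requestCode, permissions, grantResults);
        // Start recognition once the user grants the microphone permission
        if (requestCode == 1 && grantResults.length > 0
                && grantResults[0] == android.content.pm.PackageManager.PERMISSION_GRANTED) {
            recognizer.startSpeechRecognizer();
        }
    }

    @Override
    protected void onDestroy() {
        super.onDestroy();
        // Stop the stream and release the AudioRecord when the Activity goes away
        recognizer.stopSpeechRecognizer();
    }
}
stopSpeechRecognizer() is called from onDestroy() so the microphone and the recognition stream are released together with the Activity.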