Compress recorded audio with Speex before transmitting it to the server for recognition.
BUG=none
TEST=none
Review URL: http://codereview.chromium.org/3189007
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@56525 0039d316-1c4b-4281-b951-d872f2087c98
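
For reference, the POST body produced by this change is a sequence of Speex frames, each prefixed with a single byte holding that frame's length (the packet format built in SpeexEncoder::Encode() below). The following is a minimal, hypothetical sketch of how such a payload could be split back into frames; it is not part of this change, and the helper name is made up for illustration:

  #include <string>
  #include <vector>

  // Illustrative only: split an "audio/x-speex-with-header-byte" payload into
  // the individual Speex frames that SpeexEncoder::Encode() concatenates.
  std::vector<std::string> SplitSpeexPayload(const std::string& payload) {
    std::vector<std::string> frames;
    size_t pos = 0;
    while (pos < payload.size()) {
      // The first byte of each packet holds the length of the frame following.
      size_t frame_length = static_cast<unsigned char>(payload[pos++]);
      if (pos + frame_length > payload.size())
        break;  // Truncated payload; stop instead of reading past the end.
      frames.push_back(payload.substr(pos, frame_length));
      pos += frame_length;
    }
    return frames;
  }
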
diff --git a/chrome/browser/speech/speech_recognition_request.cc b/chrome/browser/speech/speech_recognition_request.cc
index 9e6cc41..1bb2efe 100644
--- a/chrome/browser/speech/speech_recognition_request.cc
+++ b/chrome/browser/speech/speech_recognition_request.cc
@@ -14,9 +14,6 @@
namespace {
-// TODO(satish): Change this once speex compression is enabled for audio.
-const char kMimeRawAudio[] = "audio/l16; rate=8000";
-
const char* const kHypothesesString = "hypotheses";
const char* const kUtteranceString = "utterance";
@@ -101,12 +98,13 @@
DCHECK(delegate);
}
-bool SpeechRecognitionRequest::Send(const std::string& audio_data) {
+bool SpeechRecognitionRequest::Send(const std::string& content_type,
+ const std::string& audio_data) {
DCHECK(!url_fetcher_.get());
url_fetcher_.reset(URLFetcher::Create(
url_fetcher_id_for_tests, url_, URLFetcher::POST, this));
- url_fetcher_->set_upload_data(kMimeRawAudio, audio_data);
+ url_fetcher_->set_upload_data(content_type, audio_data);
url_fetcher_->set_request_context(url_context_);
// The speech recognition API does not require user identification as part
diff --git a/chrome/browser/speech/speech_recognition_request.h b/chrome/browser/speech/speech_recognition_request.h
index db12d34..a8ffe2f 100644
--- a/chrome/browser/speech/speech_recognition_request.h
+++ b/chrome/browser/speech/speech_recognition_request.h
@@ -41,7 +41,7 @@
// Sends a new request with the given audio data; returns true if successful.
// The same object can be used to send multiple requests but only after the
// previous request has completed.
- bool Send(const std::string& audio_data);
+ bool Send(const std::string& content_type, const std::string& audio_data);
bool HasPendingRequest() { return url_fetcher_ != NULL; }
diff --git a/chrome/browser/speech/speech_recognition_request_unittest.cc b/chrome/browser/speech/speech_recognition_request_unittest.cc
index 0e3559e6..303735f 100644
--- a/chrome/browser/speech/speech_recognition_request_unittest.cc
+++ b/chrome/browser/speech/speech_recognition_request_unittest.cc
@@ -44,7 +44,7 @@
void SpeechRecognitionRequestTest::CreateAndTestRequest(
bool success, const std::string& http_response) {
SpeechRecognitionRequest request(NULL, GURL(""), this);
- request.Send(std::string());
+ request.Send(std::string(), std::string());
TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
ASSERT_TRUE(fetcher);
URLRequestStatus status;
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc
index 02c67a3..2e8f353 100644
--- a/chrome/browser/speech/speech_recognizer.cc
+++ b/chrome/browser/speech/speech_recognizer.cc
@@ -9,14 +9,25 @@
#include "chrome/browser/chrome_thread.h"
#include "chrome/browser/profile.h"
#include "chrome/common/net/url_request_context_getter.h"
+#include "third_party/speex/include/speex/speex.h"
using media::AudioInputController;
using std::list;
using std::string;
namespace {
-const char* kDefaultSpeechRecognitionUrl =
+const char* const kDefaultSpeechRecognitionUrl =
"http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium";
+const char* const kContentTypeSpeex =
+ "audio/x-speex-with-header-byte; rate=16000";
+const int kAudioSampleRate = 16000;
+const int kSpeexEncodingQuality = 8;
+const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).
+
+// Since the frame length gets written out as a byte in the encoded packet,
+// make sure it is within the byte range.
+COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
+
const int kAudioPacketIntervalMs = 100; // Record 100ms long audio packets.
const int kNumAudioChannels = 1; // Speech is recorded as mono.
const int kNumBitsPerAudioSample = 16;
@@ -24,10 +35,72 @@
namespace speech_input {
+// Provides a simple interface to encode raw audio using the Speex codec.
+class SpeexEncoder {
+ public:
+ SpeexEncoder();
+ ~SpeexEncoder();
+
+ int samples_per_frame() const { return samples_per_frame_; }
+
+ // Encodes each frame of raw audio in |raw_samples| and adds the
+ // encoded frames as a set of strings to the |encoded_frames| list.
+ // Ownership of the newly added strings is transferred to the caller.
+ void Encode(const string& raw_samples,
+ std::list<std::string*>* encoded_frames);
+
+ private:
+ SpeexBits bits_;
+ void* encoder_state_;
+ int samples_per_frame_;
+ char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size.
+};
+
+SpeexEncoder::SpeexEncoder() {
+ speex_bits_init(&bits_);
+ encoder_state_ = speex_encoder_init(&speex_wb_mode);
+ DCHECK(encoder_state_);
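+  // Ask the codec how many 16-bit samples make up one frame (20 ms of audio).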
+ speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
+ DCHECK(samples_per_frame_ > 0);
+ int quality = kSpeexEncodingQuality;
+ speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
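+  // Enable variable bit rate so quieter frames are encoded with fewer bits.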
+ int vbr = 1;
+ speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
+}
+
+SpeexEncoder::~SpeexEncoder() {
+ speex_bits_destroy(&bits_);
+ speex_encoder_destroy(encoder_state_);
+}
+
+void SpeexEncoder::Encode(const string& raw_samples,
+ std::list<std::string*>* encoded_frames) {
+ const short* samples = reinterpret_cast<const short*>(raw_samples.data());
+ DCHECK((raw_samples.length() % sizeof(short)) == 0);
+ int num_samples = raw_samples.length() / sizeof(short);
+
+ // Drop incomplete frames, typically those which come in when recording stops.
+ num_samples -= (num_samples % samples_per_frame_);
+ for (int i = 0; i < num_samples; i += samples_per_frame_) {
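+    // Reset the bit buffer so each frame becomes an independent packet.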
+ speex_bits_reset(&bits_);
+ speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),
+ &bits_);
+
+    // Write out the encoded frame, prefixed by its size in the first byte.
+    // This is the packet format for the x-speex-with-header-byte MIME type.
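+    // Layout: [1-byte frame length][frame_length bytes of Speex data].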
+ int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
+ kMaxSpeexFrameLength);
+ encoded_frame_data_[0] = static_cast<char>(frame_length);
+ encoded_frames->push_back(new string(encoded_frame_data_,
+ frame_length + 1));
+ }
+}
+
SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
const SpeechInputCallerId& caller_id)
: delegate_(delegate),
- caller_id_(caller_id) {
+ caller_id_(caller_id),
+ encoder_(new SpeexEncoder()) {
}
SpeechRecognizer::~SpeechRecognizer() {
@@ -43,10 +116,12 @@
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
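+  // Each 100 ms audio packet should hold a whole number of codec frames so
+  // that Encode() never has to drop trailing samples from a packet.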
+ int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
+ DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
audio_controller_ = AudioInputController::Create(this,
AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels,
- AudioManager::kTelephoneSampleRate, kNumBitsPerAudioSample,
- (AudioManager::kTelephoneSampleRate * kAudioPacketIntervalMs) / 1000);
+ kAudioSampleRate, kNumBitsPerAudioSample,
+ samples_per_packet);
DCHECK(audio_controller_.get());
LOG(INFO) << "SpeechRecognizer starting record.";
audio_controller_->Record();
@@ -110,7 +185,7 @@
Profile::GetDefaultRequestContext(),
GURL(kDefaultSpeechRecognitionUrl),
this));
- request_->Send(data);
+ request_->Send(kContentTypeSpeex, data);
ReleaseAudioBuffers(); // No need to keep the audio anymore.
}
@@ -165,9 +240,11 @@
return;
}
+ encoder_->Encode(*data, &audio_buffers_);
+ delete data;
+
// TODO(satish): Once we have streaming POST, start sending the data received
// here as POST chunks.
- audio_buffers_.push_back(data);
}
void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {
diff --git a/chrome/browser/speech/speech_recognizer.h b/chrome/browser/speech/speech_recognizer.h
index 2a13c60..5d51b02 100644
--- a/chrome/browser/speech/speech_recognizer.h
+++ b/chrome/browser/speech/speech_recognizer.h
@@ -20,6 +20,8 @@
// SpeechInputCallerId::second holds the request ID given by the element.
typedef std::pair<int, int> SpeechInputCallerId;
+class SpeexEncoder;
+
// Records audio, sends recorded audio to server and translates server response
// to recognition result.
class SpeechRecognizer
@@ -92,6 +94,7 @@
scoped_ptr<SpeechRecognitionRequest> request_;
scoped_refptr<media::AudioInputController> audio_controller_;
+ scoped_ptr<SpeexEncoder> encoder_;
DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer);
};
diff --git a/chrome/browser/speech/speech_recognizer_unittest.cc b/chrome/browser/speech/speech_recognizer_unittest.cc
index 5d2c845..29f211a 100644
--- a/chrome/browser/speech/speech_recognizer_unittest.cc
+++ b/chrome/browser/speech/speech_recognizer_unittest.cc
@@ -13,6 +13,10 @@
using media::TestAudioInputController;
using media::TestAudioInputControllerFactory;
+namespace {
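+// 1000 bytes of 16-bit samples is enough audio for at least one Speex frame.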
+const int kAudioPacketLengthBytes = 1000;
+}
+
namespace speech_input {
class SpeechRecognizerTest : public SpeechRecognizerDelegate,
@@ -87,7 +91,7 @@
}
TEST_F(SpeechRecognizerTest, StopWithData) {
- uint8 data[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+ uint8 data[kAudioPacketLengthBytes] = { 0 };
// Start recording, give some data and then stop. This should wait for the
// network callback to arrive before completion.
@@ -116,7 +120,7 @@
}
TEST_F(SpeechRecognizerTest, CancelWithData) {
- uint8 data[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+ uint8 data[kAudioPacketLengthBytes] = { 0 };
// Start recording, give some data and then cancel. This should not create
// a network request and finish immediately.
@@ -147,7 +151,7 @@
}
TEST_F(SpeechRecognizerTest, AudioControllerErrorWithData) {
- uint8 data[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+ uint8 data[kAudioPacketLengthBytes] = { 0 };
// Check if things tear down properly if AudioInputController threw an error
// after giving some audio data.
diff --git a/chrome/chrome_browser.gypi b/chrome/chrome_browser.gypi
index 772005a..2db7ee5 100644
--- a/chrome/chrome_browser.gypi
+++ b/chrome/chrome_browser.gypi
@@ -36,6 +36,7 @@
'../third_party/libxml/libxml.gyp:libxml',
'../third_party/npapi/npapi.gyp:npapi',
'../third_party/hunspell/hunspell.gyp:hunspell',
+ '../third_party/speex/speex.gyp:libspeex',
'../webkit/support/webkit_support.gyp:appcache',
'../webkit/support/webkit_support.gyp:database',
'../webkit/support/webkit_support.gyp:glue',