Compress recorded audio with Speex before transmitting it to the server for recognition.
BUG=none
TEST=none
Review URL: http://codereview.chromium.org/3189007
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@56525 0039d316-1c4b-4281-b951-d872f2087c98
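
For reference, the POST body produced by this change is a sequence of Speex frames, each prefixed with a single byte holding that frame's length (the packet format built in SpeexEncoder::Encode() below). The following is a minimal, hypothetical sketch of how such a payload could be split back into frames; it is not part of this change, and the helper name is made up for illustration:

  #include <string>
  #include <vector>

  // Illustrative only: split an "audio/x-speex-with-header-byte" payload into
  // the individual Speex frames that SpeexEncoder::Encode() concatenates.
  std::vector<std::string> SplitSpeexPayload(const std::string& payload) {
    std::vector<std::string> frames;
    size_t pos = 0;
    while (pos < payload.size()) {
      // The first byte of each packet holds the length of the frame following.
      size_t frame_length = static_cast<unsigned char>(payload[pos++]);
      if (pos + frame_length > payload.size())
        break;  // Truncated payload; stop instead of reading past the end.
      frames.push_back(payload.substr(pos, frame_length));
      pos += frame_length;
    }
    return frames;
  }
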
diff --git a/chrome/browser/speech/speech_recognition_request.cc b/chrome/browser/speech/speech_recognition_request.cc
index 9e6cc41..1bb2efe 100644
--- a/chrome/browser/speech/speech_recognition_request.cc
+++ b/chrome/browser/speech/speech_recognition_request.cc
@@ -14,9 +14,6 @@
namespace {
-// TODO(satish): Change this once speex compression is enabled for audio.
-const char kMimeRawAudio[] = "audio/l16; rate=8000";
-
const char* const kHypothesesString = "hypotheses";
const char* const kUtteranceString = "utterance";
@@ -101,12 +98,13 @@
DCHECK(delegate);
}
-bool SpeechRecognitionRequest::Send(const std::string& audio_data) {
+bool SpeechRecognitionRequest::Send(const std::string& content_type,
+ const std::string& audio_data) {
DCHECK(!url_fetcher_.get());
url_fetcher_.reset(URLFetcher::Create(
url_fetcher_id_for_tests, url_, URLFetcher::POST, this));
- url_fetcher_->set_upload_data(kMimeRawAudio, audio_data);
+ url_fetcher_->set_upload_data(content_type, audio_data);
url_fetcher_->set_request_context(url_context_);
// The speech recognition API does not require user identification as part
diff --git a/chrome/browser/speech/speech_recognition_request.h b/chrome/browser/speech/speech_recognition_request.h
index db12d34..a8ffe2f 100644
--- a/chrome/browser/speech/speech_recognition_request.h
+++ b/chrome/browser/speech/speech_recognition_request.h
@@ -41,7 +41,7 @@
// Sends a new request with the given audio data; returns true if successful.
// The same object can be used to send multiple requests but only after the
// previous request has completed.
- bool Send(const std::string& audio_data);
+ bool Send(const std::string& content_type, const std::string& audio_data);
bool HasPendingRequest() { return url_fetcher_ != NULL; }
diff --git a/chrome/browser/speech/speech_recognition_request_unittest.cc b/chrome/browser/speech/speech_recognition_request_unittest.cc
index 0e3559e6..303735f 100644
--- a/chrome/browser/speech/speech_recognition_request_unittest.cc
+++ b/chrome/browser/speech/speech_recognition_request_unittest.cc
@@ -44,7 +44,7 @@
void SpeechRecognitionRequestTest::CreateAndTestRequest(
bool success, const std::string& http_response) {
SpeechRecognitionRequest request(NULL, GURL(""), this);
- request.Send(std::string());
+ request.Send(std::string(), std::string());
TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
ASSERT_TRUE(fetcher);
URLRequestStatus status;
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc
index 02c67a3..2e8f353 100644
--- a/chrome/browser/speech/speech_recognizer.cc
+++ b/chrome/browser/speech/speech_recognizer.cc
@@ -9,14 +9,25 @@
#include "chrome/browser/chrome_thread.h"
#include "chrome/browser/profile.h"
#include "chrome/common/net/url_request_context_getter.h"
+#include "third_party/speex/include/speex/speex.h"
using media::AudioInputController;
using std::list;
using std::string;
namespace {
-const char* kDefaultSpeechRecognitionUrl =
+const char* const kDefaultSpeechRecognitionUrl =
"http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium";
+const char* const kContentTypeSpeex =
+ "audio/x-speex-with-header-byte; rate=16000";
+const int kAudioSampleRate = 16000;
+const int kSpeexEncodingQuality = 8;
+const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).
+
+// Since the frame length gets written out as a byte in the encoded packet,
+// make sure it is within the byte range.
+COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
+
const int kAudioPacketIntervalMs = 100; // Record 100ms long audio packets.
const int kNumAudioChannels = 1; // Speech is recorded as mono.
const int kNumBitsPerAudioSample = 16;
@@ -24,10 +35,72 @@
namespace speech_input {
+// Provides a simple interface to encode raw audio using the Speex codec.
+class SpeexEncoder {
+ public:
+ SpeexEncoder();
+ ~SpeexEncoder();
+
+ int samples_per_frame() const { return samples_per_frame_; }
+
+ // Encodes each frame of raw audio in |raw_samples| and adds the
+ // encoded frames as a set of strings to the |encoded_frames| list.
+ // Ownership of the newly added strings is transferred to the caller.
+ void Encode(const string& raw_samples,
+ std::list<std::string*>* encoded_frames);
+
+ private:
+ SpeexBits bits_;
+ void* encoder_state_;
+ int samples_per_frame_;
+ char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size.
+};
+
+SpeexEncoder::SpeexEncoder() {
+ speex_bits_init(&bits_);
+ encoder_state_ = speex_encoder_init(&speex_wb_mode);
+ DCHECK(encoder_state_);
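+  // Ask the codec how many 16-bit samples make up one frame (20 ms of audio).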
+ speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
+ DCHECK(samples_per_frame_ > 0);
+ int quality = kSpeexEncodingQuality;
+ speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
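+  // Enable variable bit rate so quieter frames are encoded with fewer bits.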
+ int vbr = 1;
+ speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
+}
+
+SpeexEncoder::~SpeexEncoder() {
+ speex_bits_destroy(&bits_);
+ speex_encoder_destroy(encoder_state_);
+}
+
+void SpeexEncoder::Encode(const string& raw_samples,
+ std::list<std::string*>* encoded_frames) {
+ const short* samples = reinterpret_cast<const short*>(raw_samples.data());
+ DCHECK((raw_samples.length() % sizeof(short)) == 0);
+ int num_samples = raw_samples.length() / sizeof(short);
+
+ // Drop incomplete frames, typically those which come in when recording stops.
+ num_samples -= (num_samples % samples_per_frame_);
+ for (int i = 0; i < num_samples; i += samples_per_frame_) {
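+    // Reset the bit buffer so each frame becomes an independent packet.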
+ speex_bits_reset(&bits_);
+ speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),
+ &bits_);
+
+    // Write out the encoded frame, prefixed by its size in the first byte.
+    // This is the packet format for the x-speex-with-header-byte MIME type.
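+    // Layout: [1-byte frame length][frame_length bytes of Speex data].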
+ int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
+ kMaxSpeexFrameLength);
+ encoded_frame_data_[0] = static_cast<char>(frame_length);
+ encoded_frames->push_back(new string(encoded_frame_data_,
+ frame_length + 1));
+ }
+}
+
SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
const SpeechInputCallerId& caller_id)
: delegate_(delegate),
- caller_id_(caller_id) {
+ caller_id_(caller_id),
+ encoder_(new SpeexEncoder()) {
}
SpeechRecognizer::~SpeechRecognizer() {
@@ -43,10 +116,12 @@
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
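+  // Each 100 ms audio packet should hold a whole number of codec frames so
+  // that Encode() never has to drop trailing samples from a packet.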
+ int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
+ DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
audio_controller_ = AudioInputController::Create(this,
AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels,
- AudioManager::kTelephoneSampleRate, kNumBitsPerAudioSample,
- (AudioManager::kTelephoneSampleRate * kAudioPacketIntervalMs) / 1000);
+ kAudioSampleRate, kNumBitsPerAudioSample,
+ samples_per_packet);
DCHECK(audio_controller_.get());
LOG(INFO) << "SpeechRecognizer starting record.";
audio_controller_->Record();
@@ -110,7 +185,7 @@
Profile::GetDefaultRequestContext(),
GURL(kDefaultSpeechRecognitionUrl),
this));
- request_->Send(data);
+ request_->Send(kContentTypeSpeex, data);
ReleaseAudioBuffers(); // No need to keep the audio anymore.
}
@@ -165,9 +240,11 @@
return;
}
+ encoder_->Encode(*data, &audio_buffers_);
+ delete data;
+
// TODO(satish): Once we have streaming POST, start sending the data received
// here as POST chunks.
- audio_buffers_.push_back(data);
}
void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {
diff --git a/chrome/browser/speech/speech_recognizer.h b/chrome/browser/speech/speech_recognizer.h
index 2a13c60..5d51b02 100644
--- a/chrome/browser/speech/speech_recognizer.h
+++ b/chrome/browser/speech/speech_recognizer.h
@@ -20,6 +20,8 @@
// SpeechInputCallerId::second holds the request ID given by the element.
typedef std::pair<int, int> SpeechInputCallerId;
+class SpeexEncoder;
+
// Records audio, sends recorded audio to server and translates server response
// to recognition result.
class SpeechRecognizer
@@ -92,6 +94,7 @@
scoped_ptr<SpeechRecognitionRequest> request_;
scoped_refptr<media::AudioInputController> audio_controller_;
+ scoped_ptr<SpeexEncoder> encoder_;
DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer);
};
diff --git a/chrome/browser/speech/speech_recognizer_unittest.cc b/chrome/browser/speech/speech_recognizer_unittest.cc
index 5d2c845..29f211a 100644
--- a/chrome/browser/speech/speech_recognizer_unittest.cc
+++ b/chrome/browser/speech/speech_recognizer_unittest.cc
@@ -13,6 +13,10 @@
using media::TestAudioInputController;
using media::TestAudioInputControllerFactory;
+namespace {
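+// 1000 bytes of 16-bit samples is enough audio for at least one Speex frame.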
+const int kAudioPacketLengthBytes = 1000;
+}
+
namespace speech_input {
class SpeechRecognizerTest : public SpeechRecognizerDelegate,
@@ -87,7 +91,7 @@
}
TEST_F(SpeechRecognizerTest, StopWithData) {
- uint8 data[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+ uint8 data[kAudioPacketLengthBytes] = { 0 };
// Start recording, give some data and then stop. This should wait for the
// network callback to arrive before completion.
@@ -116,7 +120,7 @@
}
TEST_F(SpeechRecognizerTest, CancelWithData) {
- uint8 data[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+ uint8 data[kAudioPacketLengthBytes] = { 0 };
// Start recording, give some data and then cancel. This should not create
// a network request and finish immediately.
@@ -147,7 +151,7 @@
}
TEST_F(SpeechRecognizerTest, AudioControllerErrorWithData) {
- uint8 data[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+ uint8 data[kAudioPacketLengthBytes] = { 0 };
// Check if things tear down properly if AudioInputController threw an error
// after giving some audio data.
diff --git a/chrome/chrome_browser.gypi b/chrome/chrome_browser.gypi
index 772005a..2db7ee5 100644
--- a/chrome/chrome_browser.gypi
+++ b/chrome/chrome_browser.gypi
@@ -36,6 +36,7 @@
'../third_party/libxml/libxml.gyp:libxml',
'../third_party/npapi/npapi.gyp:npapi',
'../third_party/hunspell/hunspell.gyp:hunspell',
+ '../third_party/speex/speex.gyp:libspeex',
'../webkit/support/webkit_support.gyp:appcache',
'../webkit/support/webkit_support.gyp:database',
'../webkit/support/webkit_support.gyp:glue',