iPhoneアプリから送った音声データをJuliusで音声認識

June 22, 2016

今回のロボットシステムの音声認識ではフリーソフトのJuliusを使用することにしました。最初はディクテーションキットを使用していたのですが、あまりにも認識率が低いため、記述文法音声認識キットを使用して、特定の文章だけを認識させるようにしたところ、かなり認識率は良くなりました。このキットは自分で文法や語彙のデータを作成する必要があり、面倒なのですが、非常に認識率は良くなりますし、認識速度も速いです。iPhone からの音声データの受け取りは、WAVファイルとして受け取るようにしています。下記がWAVを認識してテキストとして返却するWEBAPIのソースコードです。

<?php
// Web API endpoint: receives an uploaded audio file, runs it through Julius
// and answers with a JSON document.
header("Content-Type: application/json; charset=utf-8");
 
// Directory holding the Julius configuration in use (grammar kit).
$confpath   = "/usr/local/share/julius/grammar-kit";
// Alternative: dictation kit configuration.
//$confpath   = "/usr/local/share/julius/dictkit";
$juliuspath = "/usr/local/bin/julius";
// Julius command line. "{$var}" is used instead of "${var}" because the
// latter interpolation form is deprecated since PHP 8.2; the resulting
// strings are identical.
$cmd = "{$juliuspath} -C {$confpath}/hmm_mono.jconf -gram {$confpath}/MbedGrammar/mbed";
//$cmd = "{$juliuspath} -C {$confpath}/am-gmm.jconf -C {$confpath}/wav.jconf";
 
// Multipart field names accepted for the uploaded audio.
$wname = 'wavfile';
$mname = 'aacfile';
// Working directory used for files that need conversion before recognition.
$tmppath = "/var/www/html/voice/tmp";
 
/**
 * Turn "key: value" text lines into an associative array.
 *
 * Each line is split at its first colon; the part before it (trimmed) becomes
 * the key and everything after it (trimmed, further colons kept) becomes the
 * value. Lines without a colon and lines whose key is "STAT" are skipped.
 * When a key occurs more than once, the last occurrence wins.
 *
 * @param array $lines Output lines, e.g. as collected by exec().
 * @return array Map of key => value.
 */
function processOutput($lines) {
    $parsed = array();
    foreach ($lines as $line) {
        // Limit of 2 keeps any additional colons inside the value part.
        $parts = explode(":", $line, 2);
        if (count($parts) === 2) {
            $name = trim($parts[0]);
            if ($name !== "STAT") {
                $parsed[$name] = trim($parts[1]);
            }
        }
    }
    return $parsed;
}
 
/*
 * Request handling.
 *
 * Accepts either a WAV upload (field $wname) or an AAC upload (field $mname;
 * converted to 16 kHz mono 16-bit PCM WAV with ffmpeg), pipes the WAV into
 * the Julius command built in $cmd and returns the recognised sentence.
 *
 * Response: {"status":"success","voicesize":<bytes>,"result":<text|null>}
 *           or {"status":"error"} when no usable upload was received or the
 *           AAC file could not be stored/converted.
 */
$upload_key = null;
if (isset($_FILES[$wname])) {
    // A WAV upload takes precedence when both fields are present.
    $upload_key = $wname;
} elseif (isset($_FILES[$mname])) {
    $upload_key = $mname;
}

$res_ary = array("status" => "error");
$cleanup = array();

if ($upload_key !== null && $_FILES[$upload_key]['error'] === UPLOAD_ERR_OK) {
    $file_size = $_FILES[$upload_key]['size'];
    $tmp_wav = null;

    if ($upload_key === $wname) {
        // WAV uploads are fed to Julius straight from PHP's upload temp file.
        $tmp_wav = $_FILES[$wname]['tmp_name'];
    } else {
        // Per-request file names so that concurrent requests cannot
        // overwrite each other's audio.
        $base = $tmppath . "/rec_" . uniqid("", true);
        $dl_aac = $base . ".aac";
        $conv_wav = $base . ".wav";
        $cleanup[] = $dl_aac;
        $cleanup[] = $conv_wav;

        if (move_uploaded_file($_FILES[$mname]['tmp_name'], $dl_aac)) {
            $ffmpeg_out = array();
            $ffmpeg_rc = 1;
            exec(
                "ffmpeg -loglevel quiet -y -i " . escapeshellarg($dl_aac)
                . " -ac 1 -ar 16000 -acodec pcm_s16le " . escapeshellarg($conv_wav),
                $ffmpeg_out,
                $ffmpeg_rc
            );
            // Only continue when the conversion really produced a file.
            if ($ffmpeg_rc === 0 && is_file($conv_wav)) {
                $tmp_wav = $conv_wav;
            }
        }
    }

    if ($tmp_wav !== null) {
        $output = array();
        exec("cat " . escapeshellarg($tmp_wav) . " | " . $cmd, $output);
        $po = processOutput($output);
        $res_ary = array(
            "status" => "success",
            "voicesize" => $file_size,
            // The 'sentence1' key is absent when nothing was recognised;
            // report null instead of raising an undefined-index notice
            // that could corrupt the JSON body.
            "result" => isset($po['sentence1']) ? $po['sentence1'] : null);
    }
}

// Remove the per-request working files (no-op for WAV uploads).
foreach ($cleanup as $path) {
    @unlink($path);
}

echo json_encode($res_ary);

iPhoneの方のソースは下記のような感じです。ボタンを押すと音声ファイル（下記のコードではAAC形式のm4aファイル）として録音し、もう一度押すと録音したファイルを送信し、認識結果（テキスト）を受け取ったらそれをsendVoiceというhubotにSocket.ioでメッセージを送るメソッドにそのまま渡しています。

import UIKit
import AVFoundation
 
/// Screen with a single "speak" button that toggles audio recording.
///
/// The first tap starts recording to `recording.m4a`; the second tap stops it.
/// When the recorder reports a successful finish, `AppDelegate.sendVoice()` is
/// called (defined outside this file; presumably it uploads the recording —
/// confirm against AppDelegate).
class ViewController: UIViewController, AVAudioRecorderDelegate  {
 
    /// Recorder of the recording in progress; `nil` while idle.
    var audioRecorder: AVAudioRecorder?
 
    @IBOutlet var speakButton: UIButton!

    override func viewDidLoad() {
        super.viewDidLoad()
        prepareRecording()
    }
 
    override func didReceiveMemoryWarning() {
        super.didReceiveMemoryWarning()
    }
 
    /// Toggles recording: starts when idle, stops when a recorder exists.
    @IBAction func touchSpeak(sender: AnyObject) {
        if audioRecorder == nil {
            startRecording()
            // Only switch to the "stop" title if recording really started,
            // so the button never claims to be recording when it is not.
            if audioRecorder != nil {
                speakButton.setTitle("ストップ", forState: .Normal)
            }
        } else {
            stopRecording()
            speakButton.setTitle("話しかける", forState: .Normal)
        }
    }
 
    /// Configures and activates the shared audio session for recording and
    /// asks the user for microphone permission.
    func prepareRecording() {
        // Put the session into a category that allows recording.
        let session = AVAudioSession.sharedInstance()
        do {
            try session.setCategory(AVAudioSessionCategoryRecord)
        } catch {
            fatalError("カテゴリ設定失敗")
        }
 
        // Activate the session; failure is treated as fatal here.
        do {
            try session.setActive(true)
        } catch {
            fatalError("session有効化失敗")
        }
 
        session.requestRecordPermission() { [weak self] (allowed: Bool) -> Void in
            // Hop to the main queue before touching UI.
            dispatch_async(dispatch_get_main_queue()) {
                if allowed {
                    print("record permitted")
                } else {
                    // Denying the microphone is a legitimate user choice, not
                    // a programming error: disable recording instead of
                    // crashing the app.
                    print("record permission denied")
                    self?.speakButton.enabled = false
                }
            }
        }
    }
 
    /// Creates a recorder writing mono 16 kHz AAC to `recording.m4a` and
    /// starts it. On failure `audioRecorder` stays `nil`.
    func startRecording() {
        // Destination URL for the recording.
        let recordingsURL = AppDelegate.localFileURL("recording.m4a")
 
        // Recorder settings.
        let recordSettings: [String: AnyObject] =
            [AVEncoderAudioQualityKey: AVAudioQuality.Min.rawValue,
             AVNumberOfChannelsKey: 1,
             AVSampleRateKey: 16000.0,
             AVFormatIDKey: Int(kAudioFormatMPEG4AAC)]
 
        do {
            let recorder = try AVAudioRecorder(URL: recordingsURL, settings: recordSettings)
            recorder.delegate = self
            recorder.prepareToRecord()
            // record() returns false when recording could not be started;
            // keep the idle state in that case.
            if recorder.record() {
                audioRecorder = recorder
            } else {
                print("レコーダー開始失敗")
            }
        } catch {
            print("レコーダー取得失敗: \(error)")
        }
    }
 
    /// Stops the current recording, if any. Completion is reported through
    /// `audioRecorderDidFinishRecording(_:successfully:)`.
    func stopRecording() {
        audioRecorder?.stop()
    }
 
    /// AVAudioRecorderDelegate: hands a successful recording over to the
    /// app delegate and returns to the idle state.
    func audioRecorderDidFinishRecording(recorder: AVAudioRecorder, successfully flag: Bool) {
        if flag {
            if let dg = UIApplication.sharedApplication().delegate as? AppDelegate {
                dg.sendVoice()
            }
        } else {
            print("recording did not finish successfully")
        }
        audioRecorder = nil
        // Also covers recordings that ended without a tap on the button.
        speakButton.setTitle("話しかける", forState: .Normal)
    }
 
 
}