语音识别

最新推荐文章于 2020-10-27 18:20:31 发布

bbzz2

最新推荐文章于 2020-10-27 18:20:31 发布

阅读量2.6k

点赞数

分类专栏：查看语音库

查看语音库专栏收录该内容

4 篇文章 0 订阅

订阅专栏

语音识别（英文名为 Speech Recognition）的应用领域非常广泛。它所要解决的问题是让计算机能够“听懂”人类的语音，将语音中包含的文字信息“提取”出来。

语音识别是前文《聊天机器人》必不可少的一个组件，本帖就使用TensorFlow做一个中文语音识别。

使用的数据集

THCHS30是Dong Wang, Xuewei Zhang, Zhiyong Zhang这几位大神发布的开放语音数据集，可用于开发中文语音识别系统。

为了感谢这几位大神，我是跪在电脑前写的本帖代码。

下载中文语音数据集（5G+）：

[python]view plaincopy 
   
 $ wget http://data.cslt.org/thchs30/zip/wav.tgz  
 $ wget http://data.cslt.org/thchs30/zip/doc.tgz  
 $ wget http://data.cslt.org/thchs30/zip/lm.tgz  
 # 解压  
 $ tar xvf wav.tgz  
 $ tar xvf doc.tgz  
 $ tar xvf lm.tgz  

在开始之前，先好好检视一下数据集。

训练

[python]view plaincopy 
   
 import tensorflow as tf  # 0.12  
 import numpy as np  
 import os  
 from collections import Counter  
 import librosa  # https://github.com/librosa/librosa  
    
 # Location of the training audio and of the transcript file
 wav_path = 'data/wav/train'
 label_file = 'data/doc/trans/train.word.txt'


 # Collect the paths of the wav files used for training
 def get_wav_files(wav_path=wav_path, min_size=240000):
     """Return the paths of all usable ``.wav`` files below *wav_path*.

     Args:
         wav_path: Directory that is searched recursively.
         min_size: Files smaller than this many bytes are skipped. The
             default keeps the cutoff the script has always used.

     Returns:
         A list of file paths, in the order ``os.walk`` yields them. The
         list is empty when *wav_path* does not exist.
     """
     wav_files = []
     for dirpath, _dirnames, filenames in os.walk(wav_path):
         for filename in filenames:
             # Compare the extension case-insensitively so that names such
             # as 'x.Wav' are not silently ignored.
             if not filename.lower().endswith('.wav'):
                 continue
             filename_path = os.path.join(dirpath, filename)
             # Drop very small files
             if os.stat(filename_path).st_size >= min_size:
                 wav_files.append(filename_path)
     return wav_files


 wav_files = get_wav_files()
    
 # Look up the transcript that belongs to each wav file
 def get_wav_lable(wav_files=wav_files, label_file=label_file):
     """Pair every wav file with its transcript line.

     Each line of *label_file* is read as ``<id> <text>``; the id is matched
     against the wav file name up to its first dot.

     Args:
         wav_files: Paths of the candidate wav files.
         label_file: Path of the transcript file.

     Returns:
         A tuple ``(new_wav_files, labels)`` of equal length: the wav files
         that have a transcript, and the transcript text for each of them.
         Wav files without a transcript are left out.
     """
     labels_dict = {}
     # The transcripts are Chinese text, so do not depend on the platform's
     # default encoding.
     # NOTE(review): UTF-8 is assumed for the transcript file - confirm.
     with open(label_file, 'r', encoding='utf-8') as f:
         for line in f:
             label_id, separator, label_text = line.strip('\n').partition(' ')
             if not separator:
                 # Blank line or an id without any text: nothing to learn from.
                 continue
             labels_dict[label_id] = label_text

     labels = []
     new_wav_files = []
     for wav_file in wav_files:
         wav_id = os.path.basename(wav_file).split('.')[0]
         if wav_id in labels_dict:
             labels.append(labels_dict[wav_id])
             new_wav_files.append(wav_file)

     return new_wav_files, labels
    
 wav_files, labels = get_wav_lable()
 print("样本数:", len(wav_files))  # 8911 in the author's run
 # Example pair from the author's run:
 # wav/train/A11/A11_0.WAV -> 绿 是 阳春 烟 景 大块 文章 的 底色 四月 的 林 峦 更是 绿 得 鲜活 秀媚 诗意 盎然

 # Vocabulary: every single character of every transcript is a token
 # (this includes the spaces between words), ordered by falling frequency.
 all_words = []
 for label in labels:
     all_words.extend(label)
 counter = Counter(all_words)
 count_pairs = counter.most_common()

 words, _ = zip(*count_pairs)
 words_size = len(words)
 print('词汇表大小:', words_size)

 word_num_map = {word: index for index, word in enumerate(words)}


 def to_num(word):
     """Return the vocabulary id of *word*; unknown tokens get ``len(words)``."""
     return word_num_map.get(word, len(words))


 labels_vector = [[to_num(char) for char in sentence] for sentence in labels]
 # Example from the author's run: id 0 is the space, and words[479] is 绿
 # wav/train/A11/A11_0.WAV -> [479, 0, 7, 0, 138, 268, 0, 222, 0, 714, 0, 23, 261, 0, 28, 1191, 0, 1, 0, 442, 199, 0, 72, 38, 0, 1, 0, 463, 0, 1184, 0, 269, 7, 0, 479, 0, 70, 0, 816, 254, 0, 675, 1707, 0, 1255, 136, 0, 2020, 91]
 label_max_len = np.max([len(vector) for vector in labels_vector])
 print('最长句子的字数:', label_max_len)
    
 # Length of the longest utterance, counted in MFCC frames. Batches are later
 # zero-padded to this length.
 wav_max_len = 0  # 673 in the author's run
 for wav_file in wav_files:
     # Use a separate name for the samples instead of rebinding the loop
     # variable, and pass y/sr by keyword: newer librosa releases no longer
     # accept them positionally.
     audio, sr = librosa.load(wav_file, mono=True)
     n_frames = librosa.feature.mfcc(y=audio, sr=sr).shape[1]
     wav_max_len = max(wav_max_len, n_frames)
 print("最长的语音:", wav_max_len)

 batch_size = 16
 n_batch = len(wav_files) // batch_size
    
 # Fetch one batch
 pointer = 0
 def get_next_batches(batch_size):
     """Load the next *batch_size* samples, starting at the global ``pointer``.

     ``pointer`` is advanced by *batch_size*; the caller resets it at the
     start of every epoch.

     Args:
         batch_size: Number of consecutive samples to load.

     Returns:
         A tuple ``(batches_wavs, batches_labels)``. ``batches_wavs`` holds
         one list of MFCC frames per sample, zero-padded to ``wav_max_len``
         frames. ``batches_labels`` holds one list of vocabulary ids per
         sample, zero-padded to ``label_max_len`` entries.

     Raises:
         IndexError: If fewer than *batch_size* samples remain after
             ``pointer``.
     """
     global pointer
     batches_wavs = []
     batches_labels = []
     for _ in range(batch_size):
         audio, sr = librosa.load(wav_files[pointer], mono=True)
         # Keyword arguments: newer librosa releases reject positional y/sr.
         mfcc = np.transpose(librosa.feature.mfcc(y=audio, sr=sr), [1, 0])
         batches_wavs.append(mfcc.tolist())
         # Copy the label: the padding below must not permanently extend the
         # lists stored in the global labels_vector.
         batches_labels.append(list(labels_vector[pointer]))
         pointer += 1

     # Zero-pad every sample to the common length
     for frames in batches_wavs:
         missing = int(wav_max_len) - len(frames)
         frames.extend([0] * 20 for _ in range(missing))
     for label in batches_labels:
         missing = int(label_max_len) - len(label)
         label.extend([0] * missing)
     return batches_wavs, batches_labels
    
 # MFCC input: (batch_size, frames, 20)
 X = tf.placeholder(dtype=tf.float32, shape=[batch_size, None, 20])
 # Number of frames per sample whose 20 values do not sum to zero; the frames
 # appended by the batch padding are all-zero and therefore not counted.
 frame_sum = tf.reduce_sum(X, reduction_indices=2)
 frame_is_used = tf.cast(tf.not_equal(frame_sum, 0.), tf.int32)
 sequence_len = tf.reduce_sum(frame_is_used, reduction_indices=1)
 # Label ids: (batch_size, label length)
 Y = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])
    
 # conv1d_layer
 # Counts the layers built so far; used to give each one its own variable scope.
 conv1d_index = 0
 def conv1d_layer(input_tensor, size, dim, activation, scale, bias):
     """Add a stride-1, SAME-padded 1-D convolution to the graph.

     Variables are created in the scope ``conv1d_<n>``, where ``n`` is the
     current value of the global ``conv1d_index``; the counter is incremented
     before returning.

     Args:
         input_tensor: Input tensor; its last dimension is the channel count.
         size: Filter width.
         dim: Number of output channels.
         activation: 'tanh' or 'sigmoid'; any other value applies none.
         scale: The filter is initialised uniformly in [-scale, scale].
         bias: If true, a zero-initialised bias is added. If false, the
             batch-normalisation variables are created instead (see the
             review note in the body).

     Returns:
         The output tensor of the layer.
     """
     global conv1d_index
     with tf.variable_scope('conv1d_' + str(conv1d_index)):
         # Filter shape: (width, input channels, output channels)
         W = tf.get_variable('W', (size, input_tensor.get_shape().as_list()[-1], dim), dtype=tf.float32, initializer=tf.random_uniform_initializer(minval=-scale, maxval=scale))
         if bias:
             b = tf.get_variable('b', [dim], dtype=tf.float32, initializer=tf.constant_initializer(0))
         out = tf.nn.conv1d(input_tensor, W, stride=1, padding='SAME') + (b if bias else 0)
         if not bias:
             beta = tf.get_variable('beta', dim, dtype=tf.float32, initializer=tf.constant_initializer(0))
             gamma = tf.get_variable('gamma', dim, dtype=tf.float32, initializer=tf.constant_initializer(1))
             mean_running = tf.get_variable('mean', dim, dtype=tf.float32, initializer=tf.constant_initializer(0))
             variance_running = tf.get_variable('variance', dim, dtype=tf.float32, initializer=tf.constant_initializer(1))
             # Moments over every axis except the last (channel) axis
             mean, variance = tf.nn.moments(out, axes=range(len(out.get_shape()) - 1))
             def update_running_stat():
                 decay = 0.99
                 update_op = [mean_running.assign(mean_running * decay + mean * (1 - decay)), variance_running.assign(variance_running * decay + variance * (1 - decay))]
                 with tf.control_dependencies(update_op):
                     return tf.identity(mean), tf.identity(variance)
                 # NOTE(review): the two statements below are indented inside
                 # update_running_stat, after its return, so they never run,
                 # and update_running_stat itself is never called. As written
                 # no batch normalisation is applied and beta/gamma/mean/
                 # variance are created but unused. They look like they were
                 # meant to sit at the `if not bias:` level. Moving them there
                 # would also require initialising the local predicate
                 # variable before the first sess.run - confirm before changing.
                 m, v = tf.cond(tf.Variable(False, trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES]), update_running_stat, lambda: (mean_running, variance_running))
                 out = tf.nn.batch_normalization(out, m, v, beta, gamma, 1e-8)
         if activation == 'tanh':
             out = tf.nn.tanh(out)
         if activation == 'sigmoid':
             out = tf.nn.sigmoid(out)

         conv1d_index += 1
         return out
 # aconv1d_layer
 # Counts the layers built so far; used to give each one its own variable scope.
 aconv1d_index = 0
 def aconv1d_layer(input_tensor, size, rate, activation, scale, bias):
     """Add a SAME-padded atrous (dilated) 1-D convolution to the graph.

     The input gets an extra dimension at position 1 so that
     ``tf.nn.atrous_conv2d`` can be used with a filter of height 1; that
     dimension is squeezed out again afterwards. The channel count is kept
     unchanged. Variables are created in the scope ``aconv1d_<n>``, where
     ``n`` is the current value of the global ``aconv1d_index``; the counter
     is incremented before returning.

     Args:
         input_tensor: Input tensor; its last dimension is the channel count.
         size: Filter width.
         rate: Dilation rate passed to ``tf.nn.atrous_conv2d``.
         activation: 'tanh' or 'sigmoid'; any other value applies none.
         scale: The filter is initialised uniformly in [-scale, scale].
         bias: If true, a bias variable is created; if false, the
             batch-normalisation variables are created (see the review notes
             in the body for both cases).

     Returns:
         The output tensor of the layer.
     """
     global aconv1d_index
     with tf.variable_scope('aconv1d_' + str(aconv1d_index)):
         shape = input_tensor.get_shape().as_list()
         # Filter shape: (1, width, channels, channels)
         W = tf.get_variable('W', (1, size, shape[-1], shape[-1]), dtype=tf.float32, initializer=tf.random_uniform_initializer(minval=-scale, maxval=scale))
         if bias:
             # NOTE(review): b is created here but never added to out below.
             # Every call in this file passes bias=False.
             b = tf.get_variable('b', [shape[-1]], dtype=tf.float32, initializer=tf.constant_initializer(0))
         out = tf.nn.atrous_conv2d(tf.expand_dims(input_tensor, dim=1), W, rate=rate, padding='SAME')
         out = tf.squeeze(out, [1])
         if not bias:
             beta = tf.get_variable('beta', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0))
             gamma = tf.get_variable('gamma', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1))
             mean_running = tf.get_variable('mean', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0))
             variance_running = tf.get_variable('variance', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1))
             # Moments over every axis except the last (channel) axis
             mean, variance = tf.nn.moments(out, axes=range(len(out.get_shape()) - 1))
             def update_running_stat():
                 decay = 0.99
                 update_op = [mean_running.assign(mean_running * decay + mean * (1 - decay)), variance_running.assign(variance_running * decay + variance * (1 - decay))]
                 with tf.control_dependencies(update_op):
                     return tf.identity(mean), tf.identity(variance)
                 # NOTE(review): the two statements below are indented inside
                 # update_running_stat, after its return, so they never run,
                 # and update_running_stat itself is never called. As written
                 # no batch normalisation is applied and beta/gamma/mean/
                 # variance are created but unused. They look like they were
                 # meant to sit at the `if not bias:` level. Moving them there
                 # would also require initialising the local predicate
                 # variable before the first sess.run - confirm before changing.
                 m, v = tf.cond(tf.Variable(False, trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES]), update_running_stat, lambda: (mean_running, variance_running))
                 out = tf.nn.batch_normalization(out, m, v, beta, gamma, 1e-8)
         if activation == 'tanh':
             out = tf.nn.tanh(out)
         if activation == 'sigmoid':
             out = tf.nn.sigmoid(out)

         aconv1d_index += 1
         return out
 # Build the network
 def speech_to_text_network(n_dim=128, n_blocks=3):
     """Build the stack of gated dilated convolutions on top of ``X``.

     Args:
         n_dim: Channel count used throughout the stack.
         n_blocks: How many times the dilation sequence 1, 2, 4, 8, 16 is
             repeated.

     Returns:
         The logit tensor, with ``words_size`` channels.
     """
     def residual_block(block_input, size, rate):
         # The layers are created in this fixed order (filter, gate,
         # projection) because the global layer counters determine the
         # variable scope names.
         filter_branch = aconv1d_layer(block_input, size=size, rate=rate, activation='tanh', scale=0.03, bias=False)
         gate_branch = aconv1d_layer(block_input, size=size, rate=rate, activation='sigmoid', scale=0.03, bias=False)
         projected = conv1d_layer(filter_branch * gate_branch, size=1, dim=n_dim, activation='tanh', scale=0.08, bias=False)
         # First value feeds the next block, second value is the skip output.
         return projected + block_input, projected

     out = conv1d_layer(input_tensor=X, size=1, dim=n_dim, activation='tanh', scale=0.14, bias=False)

     # Skip connections: the skip outputs of all blocks are summed.
     skip = 0
     dilation_rates = (1, 2, 4, 8, 16)
     for _ in range(n_blocks):
         for rate in dilation_rates:
             out, block_skip = residual_block(out, size=7, rate=rate)
             skip = skip + block_skip

     skip_channels = skip.get_shape().as_list()[-1]
     hidden = conv1d_layer(skip, size=1, dim=skip_channels, activation='tanh', scale=0.08, bias=False)
     return conv1d_layer(hidden, size=1, dim=words_size, activation=None, scale=0.04, bias=True)
    
 class MaxPropOptimizer(tf.train.Optimizer):
     """Optimizer that divides each gradient by a decaying maximum of its magnitude.

     For every variable a slot ``m`` is kept and updated as
     ``m <- max(beta2 * m + eps, |grad|)``; the variable is then updated as
     ``var <- var - learning_rate * grad / m``.
     """

     def __init__(self, learning_rate=0.001, beta2=0.999, use_locking=False, name="MaxProp"):
         super(MaxPropOptimizer, self).__init__(use_locking, name)
         self._lr, self._beta2 = learning_rate, beta2
         # Tensor versions of the hyper-parameters, filled in by _prepare().
         self._lr_t = self._beta2_t = None

     def _prepare(self):
         self._lr_t = tf.convert_to_tensor(self._lr, name="learning_rate")
         self._beta2_t = tf.convert_to_tensor(self._beta2, name="beta2")

     def _create_slots(self, var_list):
         # One zero-initialised "m" slot per variable.
         for variable in var_list:
             self._zeros_slot(variable, "m", self._name)

     def _apply_dense(self, grad, var):
         dtype = var.dtype.base_dtype
         lr_t = tf.cast(self._lr_t, dtype)
         beta2_t = tf.cast(self._beta2_t, dtype)
         # A larger epsilon is used for half precision.
         eps = 1e-7 if dtype == tf.float16 else 1e-8
         m = self.get_slot(var, "m")
         m_t = m.assign(tf.maximum(beta2_t * m + eps, tf.abs(grad)))
         var_update = tf.assign_sub(var, lr_t * (grad / m_t))
         return tf.group(var_update, m_t)

     def _apply_sparse(self, grad, var):
         # Sparse gradients take the same path as dense ones.
         return self._apply_dense(grad, var)
    
 def train_speech_to_text_network():
     """Build the training graph and train the network for 16 epochs.

     The learning rate starts at 0.001 and is multiplied by 0.97 for every
     completed epoch. A checkpoint named 'speech.module' is written after
     every fifth epoch (0, 5, 10, 15). The loss of every batch is printed.
     The global batch ``pointer`` is reset at the start of each epoch.
     """
     global pointer

     logit = speech_to_text_network()

     # CTC loss: label id 0 (used for padding) is dropped, and the remaining
     # ids are shifted down by one to form the sparse target.
     indices = tf.where(tf.not_equal(tf.cast(Y, tf.float32), 0.))
     target = tf.SparseTensor(indices=indices, values=tf.gather_nd(Y, indices) - 1, shape=tf.cast(tf.shape(Y), tf.int64))
     loss = tf.nn.ctc_loss(logit, target, sequence_len, time_major=False)

     # Optimizer
     lr = tf.Variable(0.001, dtype=tf.float32, trainable=False)
     # Build the learning-rate update once and feed the value per epoch,
     # instead of adding a new assign op to the graph in every epoch.
     lr_value = tf.placeholder(dtype=tf.float32, shape=[])
     lr_update = tf.assign(lr, lr_value)
     optimizer = MaxPropOptimizer(learning_rate=lr, beta2=0.99)
     var_list = [t for t in tf.trainable_variables()]
     gradient = optimizer.compute_gradients(loss, var_list=var_list)
     optimizer_op = optimizer.apply_gradients(gradient)

     with tf.Session() as sess:
         sess.run(tf.global_variables_initializer())

         saver = tf.train.Saver(tf.global_variables())

         for epoch in range(16):
             sess.run(lr_update, feed_dict={lr_value: 0.001 * (0.97 ** epoch)})

             pointer = 0
             for batch in range(n_batch):
                 batches_wavs, batches_labels = get_next_batches(batch_size)
                 train_loss, _ = sess.run([loss, optimizer_op], feed_dict={X: batches_wavs, Y: batches_labels})
                 print(epoch, batch, train_loss)
             if epoch % 5 == 0:
                 saver.save(sess, 'speech.module', global_step=epoch)

 # Train
 train_speech_to_text_network()
    
 # Speech recognition
 # Set batch_size to 1 before using this: X is declared with a fixed batch
 # dimension of batch_size, and a single file is fed as a batch of one.
 def speech_to_text(wav_file):
     """Recognise the speech in *wav_file* with the latest checkpoint.

     The network is rebuilt, its variables are restored from the most recent
     checkpoint in the current directory, and the logits are decoded with a
     CTC beam search.

     Args:
         wav_file: Path of the audio file to recognise.

     Returns:
         The recognised text: the vocabulary entries of the decoded ids,
         joined without separators.
     """
     wav, sr = librosa.load(wav_file, mono=True)
     # Keyword arguments: newer librosa releases reject positional y/sr.
     # Shape after the transpose: (1, frames, coefficients).
     mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr), axis=0), [0, 2, 1])

     logit = speech_to_text_network()

     # The decoder takes the logits with the time axis first.
     decoded = tf.transpose(logit, perm=[1, 0, 2])
     decoded, _ = tf.nn.ctc_beam_search_decoder(decoded, sequence_len, merge_repeated=False)
     # Training shifted the label ids down by one; shift them back up.
     predict = tf.sparse_to_dense(decoded[0].indices, decoded[0].shape, decoded[0].values) + 1

     saver = tf.train.Saver()
     with tf.Session() as sess:
         saver.restore(sess, tf.train.latest_checkpoint('.'))
         # Evaluate the id tensor itself; previously the raw decoder output
         # was run and the result was thrown away.
         output = sess.run(predict, feed_dict={X: mfcc})

     return ''.join(words[i] for i in output[0])
         #print(output)  

后续：从麦克风获得语音输入，使用上面的模型进行识别。

Android 轻松实现语音识别的完整代码 (转)

博客分类：
Android

苹果 iPhone 的语音识别用的是 Google 的技术；作为 Google 力推的平台，Android 自然也会把这一核心技术植入系统，并结合 Google 的云端技术将其发扬光大。

所以Google Voice Recognition在Android 的实现就变得极其轻松。

语音识别，借助于云端技术可以识别用户的语音输入，包括语音控制等技术，下面我们将利用Google 提供的Api 实现这一功能。

功能点为：识别用户输入的语音，并将识别结果显示在列表中。

功能界面如下：

用户通过点击speak按钮显示界面：

用户说完话后，将提交到云端搜索：

在云端搜索完成后，返回打印数据：

/*
* Copyright (C) 2008 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.example.android.apis.app;

import com.example.android.apis.R;

import android.app.Activity;
import android.content.Intent;
import android.content.pm.PackageManager;
import android.content.pm.ResolveInfo;
import android.os.Bundle;
import android.speech.RecognizerIntent;
import android.view.View;
import android.view.View.OnClickListener;
import android.widget.ArrayAdapter;
import android.widget.Button;
import android.widget.ListView;

import java.util.ArrayList;
import java.util.List;

/**
 * Sample code that invokes the speech recognition intent API.
 *
 * <p>Pressing the speak button starts the recognition activity; the strings
 * it returns are shown in a list.
 */
public class VoiceRecognition extends Activity implements OnClickListener {

    /** Request code that identifies the result of the recognition activity. */
    private static final int VOICE_RECOGNITION_REQUEST_CODE = 1234;

    /** List view that displays the recognition results. */
    private ListView mList;

    /**
     * Called when the activity is first created.
     */
    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);

        // Inflate our UI from its XML layout description.
        setContentView(R.layout.voice_recognition);

        // Get display items for later interaction
        Button speakButton = (Button) findViewById(R.id.btn_speak);

        mList = (ListView) findViewById(R.id.list);

        // Check to see if a recognition activity is present
        PackageManager pm = getPackageManager();
        List<ResolveInfo> activities = pm.queryIntentActivities(
                new Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH), 0);
        if (!activities.isEmpty()) {
            speakButton.setOnClickListener(this);
        } else {
            // No recognizer installed: leave the button visible but unusable.
            speakButton.setEnabled(false);
            speakButton.setText("Recognizer not present");
        }
    }

    /**
     * Handle the click on the start recognition button.
     */
    public void onClick(View v) {
        if (v.getId() == R.id.btn_speak) {
            startVoiceRecognitionActivity();
        }
    }

    /**
     * Fire an intent to start the speech recognition activity.
     */
    private void startVoiceRecognitionActivity() {
        Intent intent = new Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH);
        intent.putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL,
                RecognizerIntent.LANGUAGE_MODEL_FREE_FORM);
        intent.putExtra(RecognizerIntent.EXTRA_PROMPT, "Speech recognition demo");
        startActivityForResult(intent, VOICE_RECOGNITION_REQUEST_CODE);
    }

    /**
     * Handle the results from the recognition activity.
     */
    @Override
    protected void onActivityResult(int requestCode, int resultCode, Intent data) {
        if (requestCode == VOICE_RECOGNITION_REQUEST_CODE && resultCode == RESULT_OK
                && data != null) {
            // Fill the list view with the strings the recognizer thought it could have heard
            ArrayList<String> matches = data.getStringArrayListExtra(
                    RecognizerIntent.EXTRA_RESULTS);
            // The extra may be absent; do not hand a null list to the adapter.
            if (matches != null) {
                mList.setAdapter(new ArrayAdapter<String>(this,
                        android.R.layout.simple_list_item_1, matches));
            }
        }

        super.onActivityResult(requestCode, resultCode, data);
    }
}