wxpy识别语音消息
1.wxpy
wxpy是一个模拟微信网页版登录,从而实现的一系列将微信网页版功能自动化的一个模块。github:https://github.com/youfou/wxpy
2.前期准备
目前的wxpy还没有内置语音处理的功能,接入wxpy之后,别人发送语音消息,是以一个语音文件的形式发送来的
不同于手机微信语音的是,手机上语音格式默认是amr格式,而网页版微信的语音发过来是mp3格式
经过筛选,我选择了使用百度语音识别的API来进行语音识别文字,在使用之前,需要先注册百度语音识别的应用,获得 APP_ID, API_KEY, SECRET_KEY,然后安装
pip install baidu-aip
语音识别模块。但是问题来了:百度语音识别支持的格式有限,而且限定了采样率和单声道,mp3格式不符合上面任何一项,所以需要进行格式转换
3.语音格式转换
我翻了好几页google, 找到一些免费的网站进行格式转换的,本来想提取API的,但都是上传文件,然后再进行格式转换的。流程是这样的:
字节码–>保存文件–>上传到API–>下载文件–>读取字节码–>上传到百度API
这种形式太浪费IO,经过查找,我找到一个轮子 github: pydub 这样就简单多了:
字节码–>pydub格式转换–>上传到百度API
安装 pydub
还需要安装 ffmpeg:
apt-get install ffmpeg libavcodec-extra-53
然后再安装python库
pip install pydub
开始语音识别
from pydub import AudioSegment
from io import BytesIO
from aip import AipSpeech
...
if msg.type == 'Recording':
    # WeChat Web delivers voice messages as mp3 bytes; decode them
    # in memory (no temp file of our own) into an AudioSegment.
    audio = AudioSegment.from_mp3(BytesIO(msg.get_file()))
    # Re-encode as AMR-NB at 12.20 kbit/s — this codec is fixed at
    # 8 kHz mono, which is exactly what Baidu's ASR endpoint accepts.
    export = audio.export(format="amr", bitrate="12.20k")
    # Ask Baidu speech-to-text for a Mandarin ('zh') transcription;
    # the 8000 matches the AMR-NB sample rate above.
    transform = aipSpeech.asr(export.read(), 'amr', 8000, {'lan': 'zh',})
...
不出意外的话,变量 transform 就是百度识别返回的 JSON 结果。
问题探讨
在写这个代码的过程中,我发现了一个性能问题。
audio = AudioSegment.from_mp3(BytesIO(msg.get_file()))
这个是把音频文档转换成AudioSegment对象,看一下这个模块的源代码:@classmethod def from_file(cls, file, format=None, codec=None, parameters=None, **kwargs): orig_file = file file = _fd_or_path_or_tempfile(file, 'rb', tempfile=False) if format: format = format.lower() format = AUDIO_FILE_EXT_ALIASES.get(format, format) def is_format(f): f = f.lower() if format == f: return True if isinstance(orig_file, basestring): return orig_file.lower().endswith(".{0}".format(f)) return False if is_format("wav"): try: return cls._from_safe_wav(file) except: file.seek(0) elif is_format("raw") or is_format("pcm"): sample_width = kwargs['sample_width'] frame_rate = kwargs['frame_rate'] channels = kwargs['channels'] metadata = { 'sample_width': sample_width, 'frame_rate': frame_rate, 'channels': channels, 'frame_width': channels * sample_width } return cls(data=file.read(), metadata=metadata) input_file = NamedTemporaryFile(mode='wb', delete=False) try: input_file.write(file.read()) except(OSError): input_file.flush() input_file.close() input_file = NamedTemporaryFile(mode='wb', delete=False, buffering=2**31-1) file = open(orig_file, buffering=2**13-1, mode='rb') reader = file.read(2**31-1) while reader: input_file.write(reader) reader = file.read(2**31-1) input_file.flush() output = NamedTemporaryFile(mode="rb", delete=False) conversion_command = [cls.converter, '-y', # always overwrite existing files ] # If format is not defined # ffmpeg/avconv will detect it automatically if format: conversion_command += ["-f", format] if codec: # force audio decoder conversion_command += ["-acodec", codec] conversion_command += [ "-i", input_file.name, # input_file options (filename last) "-vn", # Drop any video streams if there are any "-f", "wav", # output options (filename last) output.name ] if parameters is not None: # extend arguments with arbitrary set conversion_command.extend(parameters) log_conversion(conversion_command) p = subprocess.Popen(conversion_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 
p_out, p_err = p.communicate() if p.returncode != 0: raise CouldntDecodeError("Decoding failed. ffmpeg returned error code: {0}\n\nOutput from ffmpeg/avlib:\n\n{1}".format(p.returncode, p_err)) obj = cls._from_safe_wav(output) input_file.close() output.close() os.unlink(input_file.name) os.unlink(output.name) return obj
- 这个源代码里,是先把传入的字节对象写入临时文件,再从文件中读出来交给 ffmpeg。就是说我先把字节转成文件,再把文件转成字节。。。心累。。。
- 再来看一下输出的源代码:
def export(self, out_f=None, format='mp3', codec=None, bitrate=None, parameters=None, tags=None, id3v2_version='4', cover=None):
    """
    Export an AudioSegment to a file with given options

    out_f (string):
        Path to destination audio file
    format (string)
        Format for destination audio file.
        ('mp3', 'wav', 'raw', 'ogg' or other ffmpeg/avconv supported files)
    codec (string)
        Codec used to encoding for the destination.
    bitrate (string)
        Bitrate used when encoding destination file. (64, 92, 128, 256, 312k...)
        Each codec accepts different bitrate arguments so take a look at the
        ffmpeg documentation for details (bitrate usually shown as -b, -ba or
        -a:b).
    parameters (string)
        Additional ffmpeg/avconv parameters
    tags (dict)
        Set metadata information to destination files
        usually used as tags. ({title='Song Title', artist='Song Artist'})
    id3v2_version (string)
        Set ID3v2 version for tags. (default: '4')
    cover (file)
        Set cover for audio file from image file. (png or jpg)
    """
    id3v2_allowed_versions = ['3', '4']

    out_f = _fd_or_path_or_tempfile(out_f, 'wb+')
    out_f.seek(0)

    if format == "raw":
        # raw = the bare PCM payload, no conversion needed.
        out_f.write(self._data)
        out_f.seek(0)
        return out_f

    # for wav output we can just write the data directly to out_f
    if format == "wav":
        data = out_f
    else:
        # Any other format goes through a temp wav file that ffmpeg
        # then converts (the extra IO hop discussed in this article).
        data = NamedTemporaryFile(mode="wb", delete=False)

    wave_data = wave.open(data, 'wb')
    wave_data.setnchannels(self.channels)
    wave_data.setsampwidth(self.sample_width)
    wave_data.setframerate(self.frame_rate)
    # For some reason packing the wave header struct with
    # a float in python 2 doesn't throw an exception
    wave_data.setnframes(int(self.frame_count()))
    wave_data.writeframesraw(self._data)
    wave_data.close()

    # for wav files, we're done (wav data is written directly to out_f)
    if format == 'wav':
        return out_f

    output = NamedTemporaryFile(mode="w+b", delete=False)

    # build converter command to export
    conversion_command = [
        self.converter,
        '-y',  # always overwrite existing files
        "-f", "wav", "-i", data.name,  # input options (filename last)
    ]

    if codec is None:
        codec = self.DEFAULT_CODECS.get(format, None)

    if cover is not None:
        if cover.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')) and format == "mp3":
            conversion_command.extend(["-i", cover, "-map", "0", "-map", "1", "-c:v", "mjpeg"])
        else:
            raise AttributeError("Currently cover images are only supported by MP3 files. The allowed image formats are: .tif, .jpg, .bmp, .jpeg and .png.")

    if codec is not None:
        # force audio encoder
        conversion_command.extend(["-acodec", codec])

    if bitrate is not None:
        conversion_command.extend(["-b:a", bitrate])

    if parameters is not None:
        # extend arguments with arbitrary set
        conversion_command.extend(parameters)

    if tags is not None:
        if not isinstance(tags, dict):
            raise InvalidTag("Tags must be a dictionary.")
        else:
            # Extend converter command with tags
            # print(tags)
            for key, value in tags.items():
                conversion_command.extend(
                    ['-metadata', '{0}={1}'.format(key, value)])

            if format == 'mp3':
                # set id3v2 tag version
                if id3v2_version not in id3v2_allowed_versions:
                    raise InvalidID3TagVersion(
                        "id3v2_version not allowed, allowed versions: %s" % id3v2_allowed_versions)
                conversion_command.extend([
                    "-id3v2_version", id3v2_version
                ])

    if sys.platform == 'darwin':
        conversion_command.extend(["-write_xing", "0"])

    conversion_command.extend([
        "-f", format, output.name,  # output options (filename last)
    ])

    log_conversion(conversion_command)

    # read stdin / write stdout
    p = subprocess.Popen(conversion_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    p_out, p_err = p.communicate()

    if p.returncode != 0:
        raise CouldntEncodeError("Encoding failed. ffmpeg/avlib returned error code: {0}\n\nCommand:{1}\n\nOutput from ffmpeg/avlib:\n\n{2}".format(p.returncode, conversion_command, p_err))

    # Copy the converted bytes back into the caller's destination,
    # then clean up both temp files.
    output.seek(0)
    out_f.write(output.read())

    data.close()
    output.close()

    os.unlink(data.name)
    os.unlink(output.name)

    out_f.seek(0)
    return out_f
先把转化后的字节码写入文档,再从文档中读取字节码。。。心累。。。
可以看到,读取原音频和读取新音频,都多进行了一步IO,所以说可以改善这个代码。因为原作者写这个模块的需求就是可以批量或者单个修改音频文件,汇集了改变长度,音轨,声调,音量等功能,可谓是十分强大。
所以没有给直接读入字节码,和直接输出字节码提供接口。如果对性能要求特别高,或者对于同时处理多条语音消息,建议修改源文件的函数,增加一个字节码处理函数,和一个字节码输出函数。这样性能会提升非常多。