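# This file covers OCR, ASR for short audio, and ASR for long audio (multiple utterances).
# Calling ASR directly on a whole recording gives poor results, so long audio is split
# into short segments that are recognized one at a time, which works noticeably better.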
import aip
from aip import ocr
from aip import speech
import os
import wave #音频文件处理
import urllib.request, pycurl
#import base64
import json
# get access token by api key & secret key
import time
import re
# ---- OCR ----
# Create an OCR application in the Baidu AI Cloud console
# (https://console.bce.baidu.com/ai) and paste its credentials here.
ocr_APP_ID = 'XXXXXXX'
ocr_API_KEY = 'XXXXXXXXXXXXXXXXXXXX'
ocr_SECRET_KEY = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXX'
# Build the client from the ocr_-prefixed credentials defined above; the
# unprefixed names APP_ID / API_KEY / SECRET_KEY are not defined in this file.
ocr_client = ocr.AipOcr(ocr_APP_ID, ocr_API_KEY, ocr_SECRET_KEY)
def get_file_content(filePath):
    """Return the raw bytes of the file at *filePath*.

    Args:
        filePath: Path of the file to read.

    Returns:
        The complete file content as ``bytes``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the content is handed to the API as-is, without decoding.
    with open(filePath, 'rb') as fp:
        return fp.read()
def image2text(fileName):
    """Run OCR on an image file and return the recognized text as one string.

    The image bytes are sent to ``ocr_client.basicGeneral``. The ``'words'``
    value of every entry in the response's ``'words_result'`` list is
    concatenated in order, with no separator.

    Args:
        fileName: Path of the image file to recognize.

    Returns:
        The concatenated text, or ``''`` when the response carries no
        ``'words_result'`` entry.
    """
    image = get_file_content(fileName)
    dic_result = ocr_client.basicGeneral(image)
    try:
        res = dic_result['words_result']
    except (KeyError, TypeError):
        # Best effort: a response without 'words_result' (or a non-dict
        # response) is treated as "nothing recognized" instead of failing.
        res = []
    return ''.join(str(m['words']) for m in res)
# ---- ASR (short audio) ----
# Credentials of the ASR application created in the Baidu AI Cloud console
# (https://console.bce.baidu.com/ai).
asr_APP_ID, asr_API_KEY, asr_SECRET_KEY = (
    'XXXXXXXXXXX',
    "XXXXXXXXXXXXXXXXXXXXXXXXXX",
    "XXXXXXXXXXXXXXXXXXXXXXXXXXXX",
)
# Module-level speech client built from the credentials above.
asr_client = speech.AipSpeech(asr_APP_ID, asr_API_KEY, asr_SECRET_KEY)
# def wav2text(fileName):
# fp = wave.open(fileName, 'rb')
# nf = fp.getnframes()
# # f_len = nf * 2
# wav = fp.readframes(nf) #audio_data
# dic_result = asr_client.asr(wav)
# # try:
# # res = dic_result['words_result']
# # except:
# # res = []
# # result = ''
# # for m in res:
# # result = result + str(m['words'])
# return dic_result
# 5.短时间举例子
# 语音参数 必须符合16k或8K采样率、16bit采样位数、单声道
# 语音格式 PCM、WAV、AMR
def baidu_Speech_To_Text(filePath):  # Baidu speech recognition
    """Recognize the speech in one audio file and return the text.

    The file is read as raw bytes and submitted to the Baidu ASR client as
    'wav' audio at 16000 Hz with the option ``{'lan': 'zh'}``. The raw
    response and a success/failure message are printed.

    Args:
        filePath: Path of the audio file to recognize.

    Returns:
        The first entry of the response's ``'result'`` list when the
        response's ``'err_msg'`` contains ``'success'``; otherwise the
        placeholder string ``'=====识别失败====='``.
    """
    asr_APP_ID = 'XXXXXXXX'
    asr_API_KEY = "XXXXXXXXXXXXXXXXXXXXXX"  # API key and secret key authenticate the app
    asr_SECRET_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXX"
    asr_client = speech.AipSpeech(asr_APP_ID, asr_API_KEY, asr_SECRET_KEY)

    # Read the whole audio file into memory.
    with open(filePath, 'rb') as fp:
        audio_data = fp.read()

    # Named `response`, not `json`, so the imported json module is not shadowed.
    response = asr_client.asr(audio_data, 'wav', 16000, {'lan': 'zh', })
    print(response)

    # Guard both fields: a response lacking 'err_msg' or carrying an empty
    # 'result' list is reported as a failure instead of raising.
    results = response.get('result')
    if 'success' in response.get('err_msg', '') and results:
        context = results[0]
        print('成功,返回结果为:', context)
    else:
        context = '=====识别失败====='
        print('识别失败!')
    return context
# ---- ASR (long audio, i.e. multiple utterances recognized segment by segment) ----
from pydub import AudioSegment
from pydub.utils import mediainfo
from aip import speech
def baidu_Speech_To_Text(filePath):  # Baidu speech recognition
    """Recognize the speech in one audio file and return the text.

    The file is read as raw bytes and submitted to the Baidu ASR client as
    'wav' audio at 16000 Hz with the option ``{'lan': 'zh'}``. The raw
    response and a success/failure message are printed.

    Args:
        filePath: Path of the audio file to recognize.

    Returns:
        The first entry of the response's ``'result'`` list when the
        response's ``'err_msg'`` contains ``'success'``; otherwise the
        placeholder string ``'=====识别失败====='``.
    """
    asr_APP_ID = 'XXXXXXXX'
    asr_API_KEY = "XXXXXXXXXXXXXXXXXXXXXX"  # API key and secret key authenticate the app
    asr_SECRET_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXX"
    asr_client = speech.AipSpeech(asr_APP_ID, asr_API_KEY, asr_SECRET_KEY)

    # Read the whole audio file into memory.
    with open(filePath, 'rb') as fp:
        audio_data = fp.read()

    # Named `response`, not `json`, so the imported json module is not shadowed.
    response = asr_client.asr(audio_data, 'wav', 16000, {'lan': 'zh', })
    print(response)

    # Guard both fields: a response lacking 'err_msg' or carrying an empty
    # 'result' list is reported as a failure instead of raising.
    results = response.get('result')
    if 'success' in response.get('err_msg', '') and results:
        context = results[0]
        print('成功,返回结果为:', context)
    else:
        context = '=====识别失败====='
        print('识别失败!')
    return context
def sound_cut(file_name, segment_seconds=59):
    """Split an MP3 into short segments, recognize each, and save the text.

    Any existing '识别结果.txt' in the current directory is deleted first.
    The audio is then cut into consecutive segments of at most
    *segment_seconds* seconds. Each segment is converted to 16000 Hz mono
    and exported as a temporary 'temp-<n>.wav' file, which is passed to
    ``baidu_Speech_To_Text``. The returned text is appended to
    '识别结果.txt' (UTF-8) and the temporary file is removed.

    Args:
        file_name: Path of the MP3 file to transcribe.
        segment_seconds: Maximum length of each segment, in seconds.

    Raises:
        ValueError: If *segment_seconds* is shorter than one millisecond.
    """
    segment_ms = int(segment_seconds * 1000)
    if segment_ms <= 0:
        raise ValueError('segment_seconds must be positive')

    # Start from a clean result file; segments are appended to it below.
    if os.path.exists('识别结果.txt'):
        os.remove(r'识别结果.txt')

    # Report basic facts about the input file.
    song = mediainfo(file_name)
    song_length = str(int(float(song['duration'])))  # duration in whole seconds
    song_size = str(round(float(int(song['size']) / 1024 / 1024), 2)) + 'M'  # size in MiB, 2 decimals
    song_filename = song['filename']
    song_format_name = song['format_name']
    print('\t长度', song_length, '\t文件大小', song_size, '\t文件路径', song_filename, '\t文件格式', song_format_name)

    sound = AudioSegment.from_mp3(file_name)
    total_ms = len(sound)  # the length of an AudioSegment is in milliseconds

    # Derive the segment starts from the decoded length, so a duration that
    # is an exact multiple of the segment size yields no empty last segment.
    segment_starts = range(0, total_ms, segment_ms)
    cut_song_num = len(segment_starts)
    print('切割次数', cut_song_num)

    for i, start_ms in enumerate(segment_starts):
        end_ms = min(start_ms + segment_ms, total_ms)
        cut_song = sound[start_ms:end_ms]  # slice indices are milliseconds

        # Convert the slice directly to 16 kHz mono WAV; an intermediate MP3
        # would only add a lossy re-encode before recognition.
        save_name_pcm = r"temp-" + str(i + 1) + '.wav'
        mono = cut_song.set_frame_rate(16000).set_channels(1)
        try:
            mono.export(save_name_pcm, format='wav', codec='pcm_s16le')
            context = baidu_Speech_To_Text(save_name_pcm)
        finally:
            # Remove the temporary file even if export or recognition fails.
            if os.path.exists(save_name_pcm):
                os.remove(save_name_pcm)

        with open(r'识别结果.txt', 'a', encoding='utf-8') as f:
            f.write(context)
        print(save_name_pcm, 'start_time=', start_ms / 1000, 'end_time=', end_ms / 1000)
# Reference: https://blog.csdn.net/qq_40584593/article/details/110311540