由于工作需求,一些文件不能在网上操作,无法使用微信文字识别和讯飞语音识别等工具,于是心血来潮,打算开发一个离线的OCR、ASR工具箱,UI采用PyQt5编写,OCR部分采用cnocr模块实现,ASR部分使用Whisper实现,另外加入了一个TTS文字转语音功能,用于边听边校对文本内容。
一、OCR部分
OCR部分的界面如下:
其中,用于OCR识别的相关模型文件放入了mylibs/cnocr文件夹中,该部分代码如下:
# 实现从图像中识别文字的功能
import os
from PIL import ImageGrab, Image
from PyQt5.QtCore import QObject, Qt
from PyQt5.QtWidgets import QFileDialog, QMessageBox
from cnocr import CnOcr
from pyglet.window import mouse
from pynput import mouse
# --- Mouse-listener state shared with OCR.on_click ---
# NOTE: the original file declared `global start_x` etc. at module level;
# `global` is a no-op at module scope (module-level assignments are already
# global), so those statements were removed. The start/end coordinates are
# created on first assignment inside OCR.on_click.
m_global_leftclick_num = 0  # number of left clicks seen in the current screen capture
LOG_FILE = "processed_files.log"
class OCR(QObject):
    """OCR tab controller.

    Recognizes text either from an image file chosen via a file dialog or
    from a screen region marked by two left mouse clicks. Results are shown
    in the tab's info label. Model files live under mylibs/ so the tool
    works fully offline.
    """
    def __init__(self, ui):
        super(OCR, self).__init__()
        self.ui = ui
        # Let the user select/copy the recognized text with the mouse.
        self.ui.label_Tab2_infoshow.setTextInteractionFlags(Qt.TextSelectableByMouse)
        ui.pushButton_Tab2_selectimage.clicked.connect(self.m_select_img)  # choose an image file and recognize it
        ui.pushButton_Tab2_PrcScreen.clicked.connect(self.m_prtsc)  # capture a screen region and recognize it
    def m_select_img(self):
        """Open a file dialog, echo the chosen path in the UI and run OCR on it."""
        file_path, _ = QFileDialog.getOpenFileName(None, "选择文件", "./", "files(*.png *.jpg)")
        self.ui.lineEdit_Tab2_imagepath.setText(file_path)  # show the selected image path in the UI
        try:
            self.m_img_recognize(file_path)
        except Exception:
            # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
            # are not swallowed, while OCR failures still just show a dialog.
            QMessageBox.warning(None, "错误", "未能正确识别图像,请重试!", QMessageBox.Ok)
    # Recognize the text in an image file.
    # Parameter img_fp: path of the image to recognize.
    def m_img_recognize(self, img_fp):
        """Run OCR on ``img_fp`` and display the text in the info label.

        The image is first re-saved as tmp/ocrtemp.jpg (Pillow handles the
        format conversion) so CnOcr always receives a readable JPEG; the
        temp file is removed afterwards on a best-effort basis.
        """
        if not img_fp:
            QMessageBox.warning(None, "警告", "没有可识别的图像文件!", QMessageBox.Ok)
            return
        save_dir = 'tmp'
        # exist_ok avoids the exists()+makedirs race of the original.
        os.makedirs(save_dir, exist_ok=True)
        save_fp = os.path.join(save_dir, "ocrtemp.jpg")
        with Image.open(img_fp) as img:
            img.save(save_fp, 'JPEG')  # convert to JPEG if needed
        # Offline model files are kept under mylibs/cnocr and mylibs/cnstd.
        ocr = CnOcr(
            rec_model_name='densenet_lite_136-fc',
            rec_model_backend='onnx',
            rec_root='mylibs/cnocr',
            det_model_name='ch_PP-OCRv3_det',
            det_model_backend='onnx',
            det_root='mylibs/cnstd'
        )
        out = ocr.ocr(save_fp)
        # Strip line breaks when the "remove newline" checkbox is ticked.
        extra_char = '' if self.ui.DelEnter_checkBox.isChecked() else '\n'
        txt_out = extra_char.join(item['text'] for item in out)
        if out and extra_char:
            txt_out += extra_char  # the original appended the separator after every line
        self.ui.label_Tab2_infoshow.setText(txt_out)
        try:
            os.remove(save_fp)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file is harmless
    # Mouse click handler; x, y are the click coordinates.
    def on_click(self, x, y, button, pressed):
        """pynput callback: 1st left click marks the top-left corner, 2nd the
        bottom-right; a screenshot of that region is saved to
        ./tmp/screenshot.png and the listener is stopped (return False).

        If the 2nd click is not below/right of the 1st, the click counter is
        reset to 1 so the user can pick the end point again.
        """
        global m_global_leftclick_num
        if not pressed:
            return
        if m_global_leftclick_num == 0:
            # First click: remember the start corner.
            m_global_leftclick_num += 1
            global start_x
            global start_y
            start_x = x
            start_y = y
        elif m_global_leftclick_num == 1:
            # Second click: remember the end corner.
            m_global_leftclick_num += 1
            global m_global_end_x
            global m_global_end_y
            m_global_end_x = x
            m_global_end_y = y
            # Logical `and` instead of bitwise `&` on the comparison results.
            if (m_global_end_x > start_x) and (m_global_end_y > start_y):
                os.makedirs('./tmp/', exist_ok=True)
                screenshot = ImageGrab.grab(bbox=(start_x, start_y, m_global_end_x, m_global_end_y))
                screenshot.save('./tmp/screenshot.png')
                m_global_leftclick_num = 0
                return False  # stop the listener
            else:
                m_global_leftclick_num = 1  # invalid region: wait for a new end point
    # Capture a screen region selected with the mouse, save it as
    # ./tmp/screenshot.png, then recognize it.
    def m_prtsc(self):
        """Capture a screen region (two left clicks) and OCR it.

        NOTE: ``listener.join()`` blocks the calling (UI) thread until the
        region has been selected.
        """
        listener = mouse.Listener(on_click=self.on_click)
        listener.start()  # non-blocking start
        listener.join()   # block until on_click returns False
        shot_path = './tmp/screenshot.png'
        self.m_img_recognize(shot_path)
        if os.path.exists(shot_path):  # guard: don't crash if the grab failed
            os.remove(shot_path)
二、ASR部分
ASR部分的界面如下:
此部分在识别语音时,首先以2min为单位将长语音进行切分,然后对每一段语音分别进行识别,最后再拼接各个文本片段,得到最终的识别结果。模型有3个,分别是Whisper的medium、small、small.en。模型文件存放于mylibs/whisper文件夹下。此部分代码如下:
import os
from pydub import AudioSegment
from threading import Thread
import whisper
from PyQt5.QtCore import QObject
from PyQt5.QtWidgets import QFileDialog, QMessageBox
# Log file recording audio files that have already been processed.
LOG_FILE = "processed_files.log"
# Shared progress state: current file, per-segment details, completion flag.
status = {"current_file": "", "details": [], "done": False}
# Set to True (elsewhere) to ask the recognition loop to stop after the
# current segment; the loop resets it to False when it honors the request.
stop_requested = False
class MAudioText(QObject):
    """ASR tab controller.

    Splits a long audio file into 2-minute wav segments, transcribes each
    segment with Whisper, then merges the per-segment texts into one file
    named after the source audio.
    """
    def __init__(self, ui):
        super(MAudioText, self).__init__()
        self.ui = ui
        ui.pushButton_Tab1_selectaudio.clicked.connect(self.select_audio)  # "select audio" button
        ui.pushButton_Tab1_startrecognize.clicked.connect(self.recognize_audio_thread)  # "start" button
        ui.model_comboBox.addItems(['中文精准', '中文快速', '英文快速'])
    # Let the user pick the source audio file.
    def select_audio(self):
        """Open a file dialog and show the chosen audio path in the UI."""
        file_path, _ = QFileDialog.getOpenFileName(None, "选择文件", "./", "files(*.mp3 *.wav *.mp4)")
        self.ui.lineEdit_Tab1_audiopath.setText(file_path)
    def get_processed_files(self):
        """Return the set of file paths already recorded in LOG_FILE."""
        if os.path.exists(LOG_FILE):
            with open(LOG_FILE, 'r') as f:
                return set(f.read().splitlines())
        return set()
    def log_processed_file(self, file_path):
        """Append ``file_path`` to the processed-files log."""
        with open(LOG_FILE, 'a') as f:
            f.write(f"{file_path}\n")
    @staticmethod
    def _segment_key(filename):
        """Sort key for 'segment_<n>.<ext>' names: numeric index first.

        Plain lexicographic order puts segment_10 before segment_2, which
        would scramble the merged transcript.
        """
        stem = os.path.splitext(filename)[0]
        try:
            return (0, int(stem.rsplit('_', 1)[-1]), filename)
        except ValueError:
            return (1, 0, filename)
    def _select_model(self):
        """Map the UI model choice to a (whisper model name, initial prompt) pair."""
        choice = self.ui.model_comboBox.currentText()
        if choice == "中文精准":
            return "medium", "以下是普通话句子。"
        if choice == "中文快速":
            return "small", "以下是普通话句子。"
        if choice == "英文快速":
            return "small.en", "Here are the English sentences."
        return "small", ""
    def process_wav_files(self, directory, info_txt):
        """Transcribe every wav segment under ``directory + 'tmp/'``.

        Writes a matching segment_<i>.txt next to each wav and updates the
        progress label. The Whisper model is loaded ONCE before the loop
        (the original reloaded it for every segment). Honors the module
        ``stop_requested`` flag between segments.
        """
        global stop_requested
        status["details"] = []
        status["done"] = False
        info_txt.label_Tab1_infoshow.setText(info_txt.label_Tab1_infoshow.text() + '\n开始文字识别')
        tmp_soundfile_path = directory + 'tmp/'
        # os.listdir order is arbitrary: sort by numeric segment index so
        # the transcription (and txt numbering) follows the audio timeline.
        wav_files = sorted(
            (name for name in os.listdir(tmp_soundfile_path) if name.lower().endswith('.wav')),
            key=self._segment_key,
        )
        total_file_num = len(wav_files)
        modeltype, prompt = self._select_model()
        # Model files are kept in mylibs/whisper so the tool works offline.
        model = whisper.load_model(
            name=modeltype,
            download_root='mylibs/whisper',
        )
        recognized_file_num = 0
        for wav_name in wav_files:
            file_path = os.path.join(tmp_soundfile_path, wav_name)
            output = model.transcribe(
                audio=file_path,
                initial_prompt=prompt
            )['text']
            # `out_f` (not `file`): the original shadowed both the loop
            # variable and the builtin here.
            with open(f'{tmp_soundfile_path}segment_{recognized_file_num}.txt', "w", encoding="utf-8") as out_f:
                out_f.write(output)
            recognized_file_num += 1
            progress_num = round(float(recognized_file_num) / total_file_num * 100)
            info_txt.label_Tab1_infoshow.setText(info_txt.label_Tab1_infoshow.text() + f"\n已完成{progress_num}%")
            if stop_requested:
                stop_requested = False
                break
        status["done"] = True
    # Create (next to the input file) a folder named after the audio file,
    # where the segment files will be stored.
    def get_save_filefolder(self, src_filepath):
        """Return '<audio name>/' next to the source file, creating it if needed."""
        output_folder = src_filepath.split('.')[0] + "/"
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        return output_folder
    def get_tmp_filefolder(self, tmp_filepath):
        """Create ``tmp_filepath`` if missing and return it."""
        if not os.path.exists(tmp_filepath):
            os.makedirs(tmp_filepath)
        return tmp_filepath
    def split_audio(self, file_path):
        """Slice ``file_path`` into 2-minute wav segments under <folder>/tmp/.

        Returns the <folder>/ path, or None if decoding failed (unsupported
        codec / missing ffmpeg) — callers must check for None.
        """
        file_path = file_path.strip('\n')  # drop a trailing newline, if any
        output_folder = self.get_save_filefolder(file_path)
        output_folder_tmp = self.get_tmp_filefolder(output_folder + 'tmp/')
        segment_length = 120000  # segment length in ms; 120000 ms = 2 min
        try:
            audio = AudioSegment.from_file(file_path)
            for i, start in enumerate(range(0, len(audio), segment_length)):
                audio[start:start + segment_length].export(
                    output_folder_tmp + f"segment_{i}.wav", format="wav")
            return output_folder
        except Exception:
            # Signal failure to the caller instead of silently returning
            # None from a swallowed exception (the original crashed later
            # with a TypeError in the worker thread).
            return None
    # Worker: split, transcribe, merge; updates the UI labels.
    def recognize_audio(self, file_path):
        """Full recognition pipeline for ``file_path`` (runs on a worker thread)."""
        output_folder = self.split_audio(file_path)
        if output_folder is None:
            self.ui.label_Tab1_infoshow.setText('音频切片失败,请重试!')
            return
        self.ui.label_Tab1_infoshow.setText('音频切片完毕')
        self.process_wav_files(output_folder, self.ui)
        self.hebing_txt(output_folder + 'tmp/', output_folder)
        self.ui.lineEdit_Tab1_textpath.setText(output_folder)
    def recognize_audio_thread(self):
        """Start recognition on a background thread so the UI stays responsive."""
        file_path = self.ui.lineEdit_Tab1_audiopath.text()
        if (file_path == '\n') or (file_path == '\t') or (len(file_path) == 0):
            QMessageBox.warning(None, "警告", "尚未选择音频文件!", QMessageBox.Yes | QMessageBox.No, QMessageBox.Yes)
        else:
            t1 = Thread(target=self.recognize_audio, args=(file_path,))
            t1.start()
    # Merge the per-segment txt files into the final transcript.
    def hebing_txt(self, folder_path, output_file):
        """Concatenate segment txts under ``folder_path`` (in numeric order)
        into <output_file>/<name>.txt, then delete the segment folder."""
        parts = output_file.split("/")
        txt_name = parts[len(parts) - 2]  # folder name == original audio name
        output_file = output_file + txt_name + '.txt'
        # Numeric sort keeps the transcript in chronological order
        # (lexicographic order would put segment_10 before segment_2).
        txt_files = [os.path.join(folder_path, name)
                     for name in sorted(os.listdir(folder_path), key=self._segment_key)
                     if name.endswith('.txt')]
        try:
            self.ui.label_Tab1_infoshow.setText(self.ui.label_Tab1_infoshow.text() + '\n开始合并文本')
            # Explicit utf-8: segments were written as utf-8 and the platform
            # default encoding may not handle Chinese text.
            with open(output_file, 'w', encoding='utf-8') as outfile:
                for txt_file in txt_files:
                    with open(txt_file, 'r', encoding='utf-8-sig') as infile:
                        outfile.write(infile.read())
                    outfile.write('\n')  # one segment per line
            self.ui.label_Tab1_infoshow.setText(self.ui.label_Tab1_infoshow.text() + '\n文本合并完毕')
            self.remove_folder(folder_path)
        except Exception:
            # Best-effort: leave the segment files in place if merging fails.
            pass
    # Recursively delete a path.
    def remove_folder(self, path):
        """Delete ``path`` — a file, symlink or directory tree."""
        if os.path.exists(path):
            if os.path.isfile(path) or os.path.islink(path):
                os.remove(path)
            else:
                for filename in os.listdir(path):
                    self.remove_folder(os.path.join(path, filename))
                os.rmdir(path)
三、TTS部分
TTS采用Python的pyttsx3库实现,支持将合成的语音导出为音频文件。该部分的界面和代码如下:
import datetime
import os
import shutil
import pyttsx3
from PyQt5.QtCore import QObject
from PyQt5.QtWidgets import QFileDialog, QMessageBox
from pygame import mixer
# --- TTS module state ---
# NOTE: the original declared `global tts_path` etc. at module level, which
# is a no-op in Python — removed. `tts_path` is created on first assignment
# inside MTTS.m_tts_play.
cansaveornot = 'no'  # 'yes' once a TTS wav has been generated and may be exported
# 'unplayed' = never played, 'playing' = currently playing, 'pause' = paused.
playorpause = 'unplayed'
class MTTS(QObject):
    """TTS tab controller.

    Synthesizes the edited text to a temporary wav with pyttsx3, plays it
    through pygame's mixer, and can export the wav with a timestamped name.
    """
    def __init__(self, ui):
        super(MTTS, self).__init__()
        self.ui = ui
        ui.pushButton_read.clicked.connect(self.m_tts_play)  # "read aloud" button
        ui.pushButton_pause.clicked.connect(self.m_stopaudio)  # pause/resume button
        ui.pushButton_saveaudio.clicked.connect(self.m_saveaudio)  # export button
    def m_generatetts(self, input_text):
        """Synthesize ``input_text`` to ./tmp/outputtts.wav.

        Returns the output path, or None on failure — callers must check.
        Sets the module flag ``cansaveornot`` to 'yes' only after a
        successful save, which gates the export button.
        """
        engine = pyttsx3.init()
        global cansaveornot
        try:
            engine.setProperty('rate', 150)  # default is 200
            engine.setProperty('volume', 0.8)  # default is 1.0
            os.makedirs('./tmp/', exist_ok=True)
            output_path = './tmp/outputtts.wav'
            engine.save_to_file(input_text, output_path)
            engine.runAndWait()
            cansaveornot = 'yes'  # export allowed only after a successful save
            return output_path
        except Exception:
            # Narrowed from a silent broad except; m_tts_play handles the
            # None return instead of crashing in the mixer.
            return None
    def m_tts_play(self):
        """Generate speech for the current text and start playback."""
        input_text = self.ui.plainTextEdit_Tab3_textshow.toPlainText()
        if (input_text == '\n') or (input_text == '\t') or (len(input_text) == 0):
            QMessageBox.warning(None, "警告", "请先输出文本!", QMessageBox.Yes | QMessageBox.No, QMessageBox.Yes)
        else:
            filepath = './tmp/outputtts.wav'
            # Release any previous playback before replacing the file
            # (Windows cannot delete a wav the mixer still holds open).
            mixer.quit()
            if os.path.exists(filepath):
                os.remove(filepath)
            global playorpause
            global tts_path
            tts_path = self.m_generatetts(input_text)
            if tts_path is None:
                # The original passed None straight to mixer.music.load.
                QMessageBox.warning(None, "错误", "未能生成语音,请重试!", QMessageBox.Ok)
                return
            self.m_playaudio(tts_path)  # play the generated wav
            playorpause = 'playing'
            self.ui.pushButton_pause.setText('暂停')
    def m_playaudio(self, filepath):
        """Play ``filepath`` through pygame's mixer at fixed volume."""
        input_text = self.ui.plainTextEdit_Tab3_textshow.toPlainText()
        if (input_text == '\n') or (input_text == '\t') or (len(input_text) == 0):
            QMessageBox.warning(None, "警告", "请先输出文本!", QMessageBox.Yes | QMessageBox.No, QMessageBox.Yes)
        else:
            mixer.init()
            mixer.music.set_volume(0.7)
            mixer.music.load(filepath)
            mixer.music.play()
    def m_stopaudio(self):
        """Toggle pause/resume; the button label tracks the state."""
        input_text = self.ui.plainTextEdit_Tab3_textshow.toPlainText()
        if (input_text == '\n') or (input_text == '\t') or (len(input_text) == 0):
            QMessageBox.warning(None, "警告", "请先输出文本!", QMessageBox.Yes | QMessageBox.No, QMessageBox.Yes)
        else:
            global playorpause
            if playorpause == 'unplayed':
                playorpause = 'playing'
                self.ui.pushButton_pause.setText('暂停')
            elif playorpause == 'playing':
                playorpause = 'pause'
                mixer.music.pause()
                self.ui.pushButton_pause.setText('继续播放')
            elif playorpause == 'pause':
                mixer.music.unpause()
                playorpause = 'playing'
                self.ui.pushButton_pause.setText('暂停')
    def m_cancelaudio(self):
        """Stop playback entirely."""
        mixer.music.stop()
    # Export the generated wav to a user-chosen folder.
    def m_saveaudio(self):
        """Copy the generated wav to a chosen folder, renamed with a timestamp."""
        if cansaveornot == 'yes':
            old_file_path = tts_path
            new_file_path = QFileDialog.getExistingDirectory(None, "选择保存文件夹", "./")
            if not new_file_path:
                return  # dialog cancelled; the original crashed in shutil.copy
            shutil.copy(old_file_path, new_file_path)
            now = datetime.datetime.now()
            # Filesystem-safe timestamp, e.g. 2024-08-09_103015.
            now = str(now).split('.')[0].replace(':', '').replace(' ', '_')
            new_name = 'outputtts' + '_' + now + '.wav'
            os.rename(new_file_path + '/' + 'outputtts.wav', new_file_path + '/' + new_name)
            QMessageBox.information(None, "提示", "音频文件导出成功!", QMessageBox.Ok)
        elif cansaveornot == 'no':
            QMessageBox.warning(None, "警告", "没有可导出的音频文件!", QMessageBox.Ok)
四、声明及链接
编写软件纯属业余爱好,也是为了更好地学习编程,上面的代码中有很多不规范的地方,各路大神莫见怪。以下是软件下载链接,仅用于学习交流,欢迎大家指点与反馈。下载软件文件,然后下载模型文件,解压至应用程序所在的文件夹的根目录下即可。
https://download.csdn.net/download/weixin_40646871/89614148
模型文件地址:
链接:https://pan.baidu.com/s/1V2Ga6IONjB5xswXc2LJdWw?pwd=3b5s
提取码:3b5s