思路:获取文件内的中文字符串(筛选规则:引号内的字符串如果以 \u4e00 到 \u9fa5 范围内的中文字符开头,则提取引号内的内容作为目标文本;正则中的 (.*?) 为非贪婪匹配)
用法:先将 main.py 中的 root_dir 改为你的项目 lib 目录,然后运行 main.py,在当前工作目录生成 result.xls 文件。
library requirements
pip3 install xlwt
main.py
import os
import datetime
import re
import xlwt
class FileRead:
    """Collect Chinese string literals from the files under a root directory."""

    # A quoted literal is captured only when its first character lies in
    # U+4E00..U+9FA5; the lazy `.*?` stops at the nearest closing quote on
    # the same line. Compiled once here instead of on every call.
    _SINGLE_QUOTED = re.compile(r'\'([\u4e00-\u9fa5].*?)\'')
    _DOUBLE_QUOTED = re.compile(r'"([\u4e00-\u9fa5].*?)"')

    def __init__(self):
        # Default scan root; edit this to point at the project to scan.
        self.root_dir = r"C:\Users\Administrator\Desktop\flutter\your_project\lib"

    def find_file(self, dir_path):
        """Recursively list every regular file below ``dir_path``.

        Args:
            dir_path: Directory to scan. An empty value means
                ``self.root_dir``.

        Returns:
            A list of file paths in ``os.listdir`` order; the files of a
            sub-directory appear at the position where that directory was
            listed.

        Raises:
            OSError: If a directory cannot be listed.
        """
        path = dir_path or self.root_dir
        file_list = []
        for name in os.listdir(path):
            full_path = os.path.join(path, name)
            if os.path.isfile(full_path):
                file_list.append(full_path)
            elif os.path.isdir(full_path):
                file_list.extend(self.find_file(full_path))
            # Entries that are neither a file nor a directory (e.g. a broken
            # symlink) are skipped; recursing into them would make
            # os.listdir raise.
        return file_list

    def find_chinese_in_file(self, file):
        """Return the quoted strings in ``file`` that start with a Chinese character.

        The whole file text is scanned, so a matching literal inside a
        source comment is returned as well.

        Args:
            file: Path of a UTF-8 encoded text file.

        Returns:
            A set with the contents (quotes removed) of every single- or
            double-quoted string whose first character is in the range
            U+4E00..U+9FA5.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # The context manager closes the handle even when read() raises.
        with open(file, mode='r', encoding='UTF-8') as source:
            file_content = source.read()

        result = set(self._SINGLE_QUOTED.findall(file_content))
        result.update(self._DOUBLE_QUOTED.findall(file_content))
        # No blank-entry cleanup is needed: every capture begins with a CJK
        # character, so it can never be empty or whitespace-only.
        return result
class ExeclHelper:
    """Write extracted strings to an ``.xls`` translation sheet using xlwt."""

    def set_style(self, name, height, bold=False):
        """Build an xlwt cell style with the given font settings.

        Args:
            name: Font name, e.g. ``'Times New Roman'``.
            height: Font height in xlwt units (1/20 of a point, so 220 is
                11 pt).
            bold: Whether the font is bold.

        Returns:
            An ``xlwt.XFStyle`` whose font uses the requested settings.
        """
        style = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = name
        font.bold = bold
        # xlwt names this attribute ``colour_index``. The earlier
        # ``color_index`` spelling only created an unused attribute, so the
        # colour was never applied.
        # NOTE(review): index 4 should be blue in the default palette - confirm.
        font.colour_index = 4
        font.height = height
        style.font = font
        return style

    def write_execl(self, result, file_name="result.xls"):
        """Save ``result`` as the first column of a two-column worksheet.

        Row 0 holds the headers ``chinese`` and ``english``; the strings
        follow from row 1 in column 0, and column 1 is left empty.

        Args:
            result: Iterable of strings to write (a list, a set, ...).
            file_name: Path of the workbook to create. Defaults to
                ``result.xls`` in the current working directory.
        """
        workbook = xlwt.Workbook()
        sheet = workbook.add_sheet(u'translate', cell_overwrite_ok=True)

        # Build the header style once and share it between both header cells.
        header_style = self.set_style('Times New Roman', 220, True)
        for col, title in enumerate([u'chinese', u'english']):
            sheet.write(0, col, title, header_style)

        # enumerate() avoids indexing, so non-sequence iterables work too.
        for row, text in enumerate(result, start=1):
            sheet.write(row, 0, text)

        # Column width is expressed in 1/256 of a character width.
        for col in (0, 1):
            sheet.col(col).width = 256 * 40

        workbook.save(file_name)
if __name__ == '__main__':
    # Timestamps are printed before and after the run to show how long it took.
    print(datetime.datetime.now())
    file_read = FileRead()
    # An empty path makes find_file scan file_read.root_dir.
    file_path_list = file_read.find_file("")
    # Gather the Chinese strings of every file. Dict keys keep first-seen
    # order while dropping strings that occur in more than one file, so each
    # string gets exactly one row in the sheet.
    unique_texts = {}
    for file_path in file_path_list:
        unique_texts.update(dict.fromkeys(file_read.find_chinese_in_file(file_path)))
    result = list(unique_texts)
    execl_helper = ExeclHelper()
    execl_helper.write_execl(result)
    print(datetime.datetime.now())