批量文件编码转换:从 UTF-8 到 GB2312
import os
import codecs
# Walk a directory tree and re-encode every .txt file from UTF-8 to GB2312.
def batch_encoding_conversion(path):
    """Convert all ``.txt`` files under *path* from UTF-8 to GB2312 in place.

    The tree rooted at *path* is walked with ``os.walk``. Each ``.txt`` file
    is decoded as UTF-8 and rewritten as GB2312. Characters that GB2312
    cannot represent are dropped (``errors='ignore'``), so the conversion is
    lossy for such text.

    A file that is not valid UTF-8 (for example, one already converted by an
    earlier run) is reported and left untouched instead of aborting the
    whole batch with ``UnicodeDecodeError``.

    Args:
        path: Root directory to scan.
    """
    for root, dirs, files in os.walk(path):
        print(f"dir:{dirs} files:{files}")
        for file in files:
            # Only .txt files are processed; adjust the suffix as needed.
            if not file.endswith('.txt'):
                continue
            file_path = os.path.join(root, file)
            try:
                with codecs.open(file_path, 'r', 'utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                # Not UTF-8: skip it so one bad file cannot stop the batch.
                print(f"skip (not utf-8): {file_path}")
                continue
            # Round-trip through GB2312 to drop characters it cannot encode,
            # so the strict write below cannot fail.
            content_gb = content.encode('gb2312', 'ignore').decode('gb2312')
            with codecs.open(file_path, 'w', 'gb2312') as f2:
                f2.write(content_gb)
# Run the conversion over the current working directory.
batch_encoding_conversion(path='.')
批量文本翻译:英译汉
import os
from googletrans import Translator # 导入Google Translate翻译库
import requests
# Root directory whose tree is scanned for files to translate.
path = '.'

# Proxy settings used for the translation requests. The host and port below
# are placeholders and must be replaced with real values.
proxies = dict(
    http='http://IP地址:端口号',
    https='https://IP地址:端口号',
)
def my_post(url, data=None, json=None, **kwargs):
    """Send a POST via ``requests`` with the module-level proxies forced in.

    Any ``proxies`` entry supplied by the caller is overwritten with the
    module-level ``proxies`` mapping before the request is issued.

    Args:
        url: Target URL.
        data: Request body, forwarded to ``requests.post``.
        json: JSON payload, forwarded to ``requests.post``.
        **kwargs: Extra keyword arguments forwarded to ``requests.post``.

    Returns:
        Whatever ``requests.post`` returns.
    """
    kwargs.update(proxies=proxies)
    # Debug aid: to check that the proxy is really in effect, temporarily
    # call requests.get('https://mybrowserinfo.com/', **kwargs) here and
    # print the response text. Not needed in normal use.
    response = requests.post(url, data=data, json=json, **kwargs)
    return response
def translate(s):
    """Translate *s* to Chinese (``zh-CN``) with googletrans.

    A new ``Translator`` configured with the module-level ``proxies`` is
    created on every call, and its client's ``post`` attribute is replaced
    with ``my_post``.

    Args:
        s: Text to translate.

    Returns:
        The ``text`` attribute of the translation result.
    """
    google = Translator(proxies=proxies)
    # Swap in my_post so POST requests carry the proxy settings.
    google.client.post = my_post
    translated = google.translate(s, dest='zh-CN')
    return translated.text
# Translate every ("...") string found in the .txt files under `path` and
# write the result back to the same file.
import re

# Matches a double-quoted string wrapped in parentheses, e.g. ("Hello").
# re.S lets the quoted text span several lines; the match is non-greedy.
en_pattern = re.compile(r"\(\".*?\"\)", re.S)

# Characters replaced in the translated text before it is re-inserted:
# typographic double quotes become ASCII double quotes.
# NOTE(review): the "#" -> "#" pair is a no-op as written; presumably the
# first element was meant to be a full-width number sign - confirm.
quote_fixes = [("“", "\""), ("”", "\""), ("#", "#")]

for root, dirs, files in os.walk(path):
    print(f"dir:{dirs} files:{files}")
    for file in files:
        # Only .txt files are processed; adjust the suffix as needed.
        if not file.endswith('.txt'):
            continue
        file_path = os.path.join(root, file)
        # Read the whole file.
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Translate each match and substitute it into the text.
        for en in en_pattern.findall(content):
            try:
                print(f"file:<{file}>\nfind:<{en}>")
                # Strip the leading (" and trailing ") before translating.
                cn = translate(en[2:-2])
                for old, new in quote_fixes:
                    cn = cn.replace(old, new)
                content = content.replace(en, f"(\"{cn}\")")
            except Exception as exc:
                # Best effort: keep the original text for this match, but
                # report the failure. KeyboardInterrupt and SystemExit are
                # no longer swallowed, so the run can be interrupted.
                print(f"translate failed, original text kept: {exc!r}")
        print(content)
        # Write the result back to the file.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
以上两个工具在项目迭代过程中比较常用,省时省力。