# Cleans up redundant punctuation marks in text.
#encoding=utf-8
import sys
import re
# Output path used when no second command-line argument is given.
DEFAULT_OUTFILE = 'result.txt'

# Delimiter between the two fields of each record (named `ch` and `en` in the
# original script).
SEPARATOR = '|||'

# Two quote characters, optionally separated by whitespace, that get collapsed
# into a single double quote: `""`, `'"` and `''`.
# NOTE(review): the original pattern listed `'"` twice and never covered a
# double quote followed by a single quote -- possibly a typo for that case.
# Matching behaviour is kept as it was; confirm the intent before extending.
_QUOTE_RUN = re.compile(r'"\s*"|\'\s*["\']')


def clean_line(line):
    """Normalise the quote characters of one ``first|||second`` record.

    Steps:
      1. Drop the trailing line terminator (the original called
         ``line.strip()`` without using the result, so every output record
         ended up followed by an empty line).
      2. Replace every backtick with a single quote.
      3. Collapse runs matched by ``_QUOTE_RUN`` into one double quote.
      4. Keep only the first two ``|||``-separated fields.

    Args:
        line: One raw line of input, with or without its trailing newline.

    Returns:
        The cleaned record terminated by exactly one ``"\\n"``, or ``None``
        when the line does not contain the ``|||`` separator.
    """
    text = line.rstrip('\r\n').replace('`', '\'')
    text = _QUOTE_RUN.sub('"', text)
    fields = text.split(SEPARATOR)
    if len(fields) < 2:
        return None
    # Anything after a second separator is dropped, as in the original.
    return fields[0] + SEPARATOR + fields[1] + '\n'


def clean_file(src, dst):
    """Clean every record of ``src`` and write the result to ``dst``.

    All cleaned records are buffered before ``dst`` is opened, so the input is
    fully read before any output is written.

    Args:
        src: Path of the UTF-8 input file.
        dst: Path of the UTF-8 output file (created or overwritten).

    Returns:
        The number of input lines skipped because they lacked the separator.
    """
    cleaned = []
    skipped = 0
    with open(src, 'r', encoding='UTF-8') as f:  # read the input file
        for lineno, raw in enumerate(f, start=1):
            backticks = raw.count('`')
            if backticks > 0:
                # Debug trace kept from the original script: reports how many
                # backticks are replaced on this line.
                print("eedddddd" + str(backticks))
            record = clean_line(raw)
            if record is None:
                # Previously an IndexError aborted the whole run here.
                skipped += 1
                print("line " + str(lineno) + " has no '" + SEPARATOR
                      + "' separator, skipped", file=sys.stderr)
                continue
            cleaned.append(record)
    with open(dst, 'w', encoding='utf-8') as g:  # write the output file
        g.writelines(cleaned)
    return skipped


def main(argv):
    """Command-line entry point: ``script INPUT [OUTPUT]``.

    Args:
        argv: Full argument vector, including the program name at index 0.

    Returns:
        Process exit status: 0 on success, 2 when the input path is missing.
    """
    if len(argv) < 2:
        print("Usage: " + (argv[0] if argv else "script") + " INPUT [OUTPUT]",
              file=sys.stderr)
        return 2
    src = argv[1]
    dst = argv[2] if len(argv) > 2 else DEFAULT_OUTFILE
    print("Reading " + src + " now...\n")
    clean_file(src, dst)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))