工作中遇到需要把中文标点转化成英文标点的需求。注意: 转化前要先把 notepad++ 中的 encoding 设置成 UTF-8, 否则会报 UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd2 in position 2: invalid continuation byte
#!/usr/bin/env python3
#coding=utf-8
import unicodedata
import os
import re
import sys
def punctuation_mend(string):
    """Convert Chinese / full-width punctuation to ASCII punctuation.

    ``string`` is either literal text or the path of a UTF-8 text file.

    * If it names an existing file, the file is rewritten in place with
      the converted text. No spacing is added in this mode.
    * Otherwise it is treated as text. After conversion, a space is
      inserted after every ``.`` or ``,`` that is directly followed by a
      non-whitespace character.

    Returns the converted text in both cases.

    Raises:
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # NFKC normalization already folds full-width forms to ASCII; the table
    # handles punctuation that NFKC leaves alone (e.g. the ideographic comma
    # and full stop, lenticular brackets, curly quotes).
    # Add other custom symbol mappings here.
    table = str.maketrans(
        u',、。!?【】()%#@&1234567890“”‘’',
        u',,.!?[]()%#@&1234567890""\'\'',
    )

    if os.path.isfile(string):
        with open(string, 'r', encoding='utf-8') as f:
            content = f.read()
        converted = unicodedata.normalize('NFKC', content).translate(table)
        with open(string, 'w', encoding='utf-8') as f:
            f.write(converted)
        return converted

    converted = unicodedata.normalize('NFKC', string).translate(table)
    # The substitution result must be kept: re.sub returns a new string and
    # never modifies its argument.
    return re.sub(r'(?<=[.,])(?=[^\s])', r' ', converted)
#def add_space(string, t_file):
# if os.path.isfile(string):
# with open(string, 'r', encoding='utf-8') as f:
# line = f.readline()
# tmp_f = open(t_file, 'w+', encoding='utf-8')
# while line:
# '''
# add space after `,.:`
# (?<=[.,:]) positive lookbehind that looks for dots or commas
# (?=[^\s\d]) positive lookahead that matches anything that isn't a space or a num
# '''
# line_new = re.sub(r'(?<=[:.,)])(?=[^\s\d,.:])', r' ', line)
# # remove space and tabs at the beginning of and at the end of line
# line_new = re.sub(r'(^[ \t]+|[ \t]+$)', r'', line_new)
# # write line except empty lines with or without space
# if not (re.match(r'^\s*$', line_new)):
# tmp_f.write(line_new)
# line = f.readline()
# tmp_f.close()
# os.rename(t_file, string);
# else:
# re.sub(r'(?<=[.,])(?=[^\s])', r' ', string)
# return string
def add_format(string, t_file, mergeFlag):
    """Normalize spacing in the UTF-8 text file ``string``, in place.

    The result is written to the temporary file ``t_file``, which then
    replaces ``string``. For every line:

    * a space is inserted after ``: ; , . )`` when the next character is
      not whitespace, a digit, or one of ``, ; : .``;
    * that space is removed again after a ``.`` followed by ``cpp``,
      ``h``, ``c``, ``hpp`` or ``X`` plus ``, . !`` (file-name suffixes);
    * leading and trailing spaces/tabs are stripped;
    * blank lines are dropped.

    When ``mergeFlag`` is the string ``'yes'``, leading ``*`` / ``/``
    comment markers (preceded by at least one space) are removed, each
    line break becomes a space so consecutive lines merge, and a blank
    line is written as a single newline.

    If ``string`` is not an existing file, a message is printed and
    nothing is written. Returns None.
    """
    if not os.path.isfile(string):
        print("wrong " + string)
        return

    merge = 'yes' == mergeFlag
    # Both handles are managed by ``with`` so they are closed even if a
    # read, write or decode error interrupts the loop.
    with open(t_file, 'w', encoding='utf-8') as tmp_f, \
            open(string, 'r', encoding='utf-8') as f_:
        for line in f_:
            line_new = line
            if merge:
                # Remove '*' or '/' markers at the front of the line.
                line_new = re.sub(r'^ +[*/]+', r'', line_new)
            # Add a space after punctuation:
            #   (?<=[:;,.)])       lookbehind for the punctuation mark
            #   (?=[^\s\d,;:.])    lookahead for anything that is not
            #                      whitespace, a digit or more punctuation
            line_new = re.sub(r'(?<=[:;,.)])(?=[^\s\d,;:.])', r' ', line_new)
            # Undo the space inside file names such as "main.cpp".
            # NOTE(review): there is no word boundary after the suffix, so
            # any word starting with "c" or "h" after ". " is re-joined too.
            line_new = re.sub(r'\. (?=(cpp|h|c|hpp|X[,.!]))', u'.', line_new)
            # Remove spaces and tabs at the beginning and end of the line.
            line_new = re.sub(r'(^[ \t]+|[ \t]+$)', r'', line_new)
            if merge:
                # Convert the line break to a space so lines are joined.
                line_new = re.sub(r'(\n|\n\r)', r' ', line_new)
            if re.match(r'^\s*$', line_new):
                # Blank line: dropped, or kept as a break when merging.
                if merge:
                    tmp_f.write('\n')
            else:
                tmp_f.write(line_new)
    # os.replace overwrites an existing destination on every platform;
    # os.rename raises FileExistsError on Windows in that situation.
    os.replace(t_file, string)
def rm_mutiple_space(string, t_file):
    """Collapse every run of spaces in the file ``string`` to one space.

    The file is read as UTF-8 line by line, the result is written to the
    temporary file ``t_file``, and ``t_file`` then replaces ``string``.
    Only the space character is affected; tabs and line breaks are kept.
    Returns None.
    """
    # ``with`` closes both handles even if reading or writing fails.
    with open(t_file, 'w', encoding='utf-8') as tmp_f, \
            open(string, 'r', encoding='utf-8') as f_:
        for line in f_:
            tmp_f.write(re.sub(r' +', r' ', line))
    # os.replace overwrites an existing destination on every platform;
    # os.rename raises FileExistsError on Windows in that situation.
    os.replace(t_file, string)
# Path of the file to convert and of the scratch file used while rewriting
# it; both are filled in from the command line when run as a script.
convert_file = ''
tmp_file = ''
if __name__ == '__main__':
    # Usage: script.py FILE [MERGE]
    #   FILE   text file to convert in place
    #   MERGE  pass 'yes' to merge consecutive lines (default 'no')
    mergeLine = 'no'
    if len(sys.argv) == 1:
        print("Nothing need to be done!")
        sys.exit()
    if len(sys.argv) > 3:
        # Stop here instead of running the pipeline on an empty path.
        print("wrong parameter")
        sys.exit(1)
    convert_file = sys.argv[1]
    if len(sys.argv) == 3:
        mergeLine = sys.argv[2]
    # os.path.join keeps the scratch file next to the input. Concatenating
    # '/tmp.txt' pointed at the filesystem root whenever the input was
    # given as a bare file name (empty dirname).
    tmp_file = os.path.join(os.path.dirname(convert_file), 'tmp.txt')
    punctuation_mend(convert_file)
    add_format(convert_file, tmp_file, mergeLine)
以上代码就能转化常用的中文标点, 并且在逗号、句号和冒号后加空格,
再去掉文件中的空行 (包括只含空格的行).