本程序增加了一步处理:把英文句子中混入的中文标点符号转换成对应的英文标点符号。
import re
def qipy100:
txtdata = [‘@#%1、That‘s the first row。!第一行。’,
‘-=+20.That‘s second row.!第二行。’,
‘&……203.That‘s third row.?第三行。’]
p0 = ‘^.*\d{1,3}[、.]’
p1 = ‘[。!.?]{2}’
p2 = ‘(.?)([\u4e00-\u9fa5]+.?)’
pattern = re.compile(p2,re.S)
cmark =
emark =
mdict = dict(zip(cmark, emark))
p3 = “[” + “”.join(cmark) + “]”
for s in txtdata:
s1 = re.sub(p0, “”, s)
print(‘s1:’, s1)
m1 = re.search(p1, s1)
s2 = s1.replace(m1.group, m1.group[0])
s3 = list(pattern.findall(s2)[0])
s4 = s3[0]
m3 = re.findall(p3, s4)
for punctuation in m3:
print(‘punctuation:’, punctuation)
s4 = s4.replace(punctuation, mdict[punctuation])
s3[0] = s4
print(‘s3:’, s3)
return
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    qipy100()
执行结果
s1: That‘s the first row。!第一行。
s3: ["That's the first row.", '第一行']
s1: That‘s second row.!第二行。
s3: ["That's second row.", '第二行']
s1: That‘s third row.?第三行。
s3: ["That's third row.", '第三行']