doc1.txt
原始语料:
doc2.txt
预处理后:
doc3.txt
结果为:
代码为:
import re
# Characters that end a fragment: ASCII , . ? ! : ; ~ and space, plus the
# full-width comma (U+FF0C), colon (U+FF1A), ideographic full stop (U+3002),
# exclamation mark (U+FF01), semicolon (U+FF1B) and question mark (U+FF1F).
# A character class is used because the previous alternation contained a bare
# "?" alternative, which `re` rejects with "nothing to repeat".  The non-ASCII
# members are written as escapes so they survive width/encoding normalisation.
# NOTE(review): the full-width members are presumed to be what the duplicated
# ASCII entries of the old pattern were meant to be -- confirm.
_DELIMITERS = re.compile(
    "[,.?!:;~ \uff0c\uff1a\u3002\uff01\uff1b\uff1f]"
)

# Whitespace at the very start or end of a fragment, never crossing a newline
# (newlines inside a fragment are real line breaks and must be kept).
_EDGE_BLANKS = re.compile(r"\A[^\S\n]+|[^\S\n]+\Z")


def _split_fragments(text):
    """Return *text* cut at every delimiter and newline, one item per line."""
    lines = []
    for segment in _DELIMITERS.split(text):
        # Trim blanks at the fragment's edges only, then break it on the
        # newlines it still contains.
        lines.extend(_EDGE_BLANKS.sub("", segment).split("\n"))
    return lines


def yuchuli(
    src_path: str = './中间结果/doc1.txt',
    split_path: str = './中间结果/doc2.txt',
    out_path: str = './中间结果/doc3.txt',
    min_length: int = 5,
) -> None:
    """Split a raw corpus into short fragments and keep the long ones.

    The text in ``src_path`` is cut at punctuation marks, spaces and line
    breaks.  Every resulting fragment (empty ones included) is written on its
    own line to ``split_path``; the fragments that are at least
    ``min_length`` characters long are written, one per line, to
    ``out_path``.

    Args:
        src_path: UTF-8 text file holding the raw corpus.
        split_path: Destination for the full list of fragments.
        out_path: Destination for the length-filtered fragments.
        min_length: Minimum number of characters (newline excluded) a
            fragment needs in order to be kept in ``out_path``.

    Raises:
        OSError: If a file cannot be opened, read or written.
    """
    with open(src_path, 'r', encoding='utf-8') as src:
        fragments = _split_fragments(src.read())

    # Write plain "\n" and let the text layer translate it; emitting "\r\n"
    # here would produce mixed or doubled line endings depending on platform.
    with open(split_path, 'w', encoding='utf-8') as split_file:
        split_file.writelines(line + "\n" for line in fragments)

    # The in-memory fragments are exactly the lines of ``split_path``, so the
    # filter does not need to read that file back.
    with open(out_path, 'w', encoding='utf-8') as out:
        out.writelines(
            line + "\n" for line in fragments if len(line) >= min_length
        )
# Module-level call: runs the preprocessing as soon as this file is executed.
yuchuli()