对于英文文本,分句比较简单,只要根据终结符"."划分就好;中文文本的分句看似简单,但实现时会遇到很多麻烦,尤其是在处理社交媒体数据时,常会遇到文本格式不规范等问题。
下面的代码针对由一段一段短文本组成的文档进行分句:
import re
def cut_sent(infile, outfile):
cutLineFlag = ["?", "!", "。","…"] #本文使用的终结符,可以修改
sentenceList = []
with open(infile, "r", encoding="UTF-8") as file:
oneSentence = ""
for line in file:
if len(oneSentence)!=0:
sentenceList.append(oneSentence.strip() + "\r")
oneSentence=""
# oneSentence = ""
for word in words:
if word not in cutLineFlag:
oneSentence = oneSentence + word
else:
oneSentence = oneSentence + word
if oneSentence.__len__() > 4:
sentenceList.append(oneSentence.strip() + &#