数据预处理:
1,符号处理:
def go_split(s, min_len):
    """Split ``s`` on punctuation and keep only the longer fragments.

    The text is cut at every run of one or more delimiter characters (the
    characters listed in ``symbol`` below); several adjacent delimiters
    count as a single cut.

    Args:
        s: The text to split.
        min_len: Length threshold. A fragment is kept only when its length
            is strictly greater than this value, so fragments of exactly
            ``min_len`` characters are dropped.

    Returns:
        A list of the surviving fragments, in their original order.
    """
    # Imported locally so this snippet runs on its own: nothing visible
    # next to this function imports ``re``.
    import re

    # Build a character class matching a run of delimiter characters.
    symbol = ',;。!、?!'
    symbol = "[" + symbol + "]+"
    # Split the whole string in a single pass.
    result = re.split(symbol, s)
    return [x for x in result if len(x) > min_len]
def is_dup(s, min_len):
    """Report whether ``s`` contains the same fragment more than once.

    ``s`` is split with ``go_split(s, min_len)``; the fragments it returns
    are then checked for duplicates.

    Args:
        s: The text to inspect.
        min_len: Length threshold forwarded unchanged to ``go_split``.

    Returns:
        True if at least one fragment occurs two or more times, else False.
    """
    result = go_split(s, min_len)
    # A set drops repeated fragments, so a smaller size means a duplicate.
    return len(result) != len(set(result))
def is_neg_symbol(uchar):
    """Return True when ``uchar`` is one of the listed symbols.

    Args:
        uchar: The value to test; it is compared as a whole against each
            entry, so a multi-character string never matches.

    Returns:
        True if ``uchar`` equals one of the entries in ``neg_symbol``,
        otherwise False.
    """
    # NOTE(review): '0' (the digit zero) sits among punctuation marks here;
    # it may be a typo for another symbol -- confirm before changing it.
    neg_symbol = ('!', '0', ';', '?', '、', '。', ',')
    return uchar in neg_symbol
2,分词
#encoding=utf-8
#author linxinzhu
import jieba
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again
# (it is removed from the sys module during interpreter start-up), so the
# default encoding used for implicit str/unicode conversion can be set to
# UTF-8. Neither name exists in Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
# Input file; it is iterated line by line by the loop that follows.
f = open('3.txt','r')
# Output file, opened in write mode, so an existing '4.txt' is truncated.
outf = open('4.txt','w')
for line in f:
line = line.decode("utf-8").encode("utf-8