导入包
import pandas as pd
import jieba
from gensim.models import word2vec
分词
# 分词
def tokenizer(text):
    """Tokenize Chinese text with jieba and drop stop words.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list[str]
        Tokens produced by ``jieba.lcut`` that are not in the
        module-level ``stop_words`` collection (defined elsewhere
        in this file via ``get_stop_words``).
    """
    # NOTE(review): an earlier version filtered tokens to CJK characters
    # with a regex, but digits carry semantic meaning too, so no
    # character-class filtering is applied here.
    return [word for word in jieba.lcut(text) if word not in stop_words]
注释掉的部分原本是用正则表达式只保留汉字的,但后来考虑到数字也是有语义的,所以没有采用。
去停用词
# 去停用词
def get_stop_words():
file_object = open('data/stopwords.txt',encoding='utf-8')