TFIDF补充
上一篇文章中已经使用了 sklearn 的 TF-IDF。
这次作为补充,改用 gensim 来处理,数据采用高校大数据预选赛的数据集。
from gensim import corpora
import jieba
import jieba.analyse
import pandas as pd
import numpy as np
# Load the training set; only "\n" is treated as the record terminator.
text = pd.read_csv(
    "../train.csv",
    lineterminator="\n",
)
def word_cut(mytext):
    """Segment *mytext* with jieba and return the tokens joined by spaces.

    After joining, surrounding whitespace is stripped and every literal
    ``...`` and ``?`` substring is removed from the result, in that order.
    """
    joined = " ".join(jieba.cut(mytext)).strip()
    for noise in ("...", "?"):
        joined = joined.replace(noise, "")
    return joined
def word_split(content):
    """Return the lower-cased, whitespace-separated tokens of *content*.

    Splitting is done on runs of whitespace rather than on a single space,
    so consecutive spaces never produce empty-string tokens and an empty or
    blank input yields an empty list instead of ``['']``. Empty tokens would
    otherwise be registered as a term when the token lists are fed to a
    dictionary / bag-of-words builder.
    """
    # str.split() with no argument already ignores leading/trailing
    # whitespace, so no explicit strip() is needed.
    return content.lower().split()
# Tokenize every review and keep the token lists in a new column.
text["split_review"] = text["review"].apply(word_split)

# Plain Python list of token lists, one entry per review.
dic = text["split_review"].values.tolist()

# Build the gensim dictionary from the token lists.
dictionary = corpora.Dictionary(dic)

# Convert each token list to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, dic))