https://github.com/RaRe-Technologies/gensim
pip install -U gensim
代码示例
import nltk
from nltk import collections
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans, MiniBatchKMeans
import gensim
from gensim import corpora, models, similarities
from itertools import chain
from operator import itemgetter
import re
# 文本清洗预处理
def preprocessing(text):
# text = text.encode("gbk").decode("utf8")
# tokens to words: 句子标记解析, 单词标记解析
tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
# print("单词标记解析:", tokens)
# 停用词删除
stop = stopwords.words('english')
tokens = [token for token in tokens if token not in stop]
# print("停用词删除", tokens)
# 单词字数小于3删除 并转换成小写
tokens = [word.lower() for