import re
import numpy
import jieba
import jieba.analyse # 提取关键内容
import jieba.posseg as pseg # 词性标注
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from snownlp import SnowNLP
from scipy.misc import imread
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def load_file():
    """Load a user dictionary, fetch documents from MongoDB and strip punctuation.

    Original docstring: load the external dictionary, remove all punctuation
    with a regex, and return the plain text.

    NOTE(review): this function is truncated in the visible source — the loop
    body and the return statement continue past the last line shown here, so
    only the visible portion is documented.
    """
    jieba.load_userdict("G:/anaconda/dict_lzf.txt")  # load external user-defined dictionary into jieba
    client = MongoClient('localhost', 27017)  # connect to local MongoDB
    db = client['Taoguba']  # select the Taoguba database
    news = db.Taoguba.find()  # cursor over every document in the Taoguba collection
    # Alternative data source (toggle by commenting/uncommenting):
    # db = client['Eastmoney']  # select the Eastmoney database
    # news = db.Eastmoney.find()
    all_new = []  # NOTE(review): these accumulators are never filled in the visible lines —
    N_new = []    # presumably populated further down in the truncated part of the loop
    emo = []
    sum = []  # NOTE(review): shadows the builtin `sum` — rename when the full body is available
    for i in news:
        new = (i["Content"])  # raw article text; assumes each document carries a "Content" field
        # new = (i["massage"])  # field name used by the Eastmoney collection
        # Character class of Chinese (full-width) and ASCII punctuation to remove
        r = '[’!@#~¥%……&*() ——+|}{“:”?》《,。、‘;’、】【!"#$%&\'()*+,-./:; <=>?@[\\]^_`{|}~]+'
        news1 = re.sub(r, '', new)  # strip all punctuation, keeping only plain text
        news  # NOTE(review): source is truncated here — this bare statement is an incomplete fragment
# --- scrape residue from the source blog page, commented out so the file parses ---
# Article title: 结巴分词,文本聚类,情感分析,词云图可视化
#   (Jieba word segmentation, text clustering, sentiment analysis, word-cloud visualization)
# Page banner: 最新推荐文章于 2024-05-14 14:56:05 发布
#   (latest recommended article published 2024-05-14 14:56:05)