import re
import numpy
import jieba
import jieba.analyse  # keyword-extraction helpers (unused in this pipeline)
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
def load_file():
    '''
    Load the external user dictionary, strip punctuation from each post,
    segment the text, then vectorize, cluster, and write the results
    back to MongoDB.
    '''
    jieba.load_userdict("G:/anaconda/dict_lzf.txt")  # load the external user-defined dictionary
    client = MongoClient('localhost', 27017)  # connect to MongoDB
    db = client['Taoguba']  # select the Taoguba database
    news = db.Taoguba.find()
    N_content = []
    All_content = []
    for i in news:
        new = i["Content"]
        # strip Chinese and ASCII punctuation, then letters and digits
        r = '[’!@#~¥%……&*() ——+|}{“:”?》《,。、‘;’、】【!"#$%&\'()*+,-./:; <=>?@[\\]^_`{|}~]+'
        news1 = re.sub(r, '', new)
        news1 = re.sub('[a-zA-Z0-9]', '', news1)
        stop_new = stop_dict(news1)
        cut = cut_package(stop_new)
        N_content.append(cut)
        All_content.append(new)
    world_Arry = world_arry(N_content)
    cosine_Similarities = cosine_similarities(world_Arry)
    k_data = K_means(cosine_Similarities)
    print("Printing clustered data:")
    data_arry = numpy.array(All_content)  # build once, outside the loop
    for i in range(5):
        print("--------------- printing cluster %d ---------------" % (i + 1))
        data = data_arry[k_data == i]
        id_ = 0
        collection_name = "List" + str(i + 1)  # e.g. List1 ... List5
        for content in data:  # renamed so it no longer shadows the outer i
            All_data = find_DB(content)
            id_ += 1
            All_data.append(id_)  # All_data = [title, author, skim, talk, content, id_]
            print(All_data)
            write_to_DB(collection_name, All_data[5], All_data[0],
                        All_data[1], All_data[2], All_data[3],
                        All_data[4])
def find_DB(content):
    '''
    Look up a post by its content and return
    [title, author, skim, talk, content].
    '''
    client = MongoClient('localhost', 27017)  # connect to MongoDB
    db = client['Taoguba']  # select the Taoguba database
    # find_one() keeps the result at exactly five fields even if several
    # posts share the same content (find() would append them all)
    i = db.Taoguba.find_one({"Content": content})
    massage = [i['Title'], i['Author'], i['Skim'],
               i['Talk'], i['Content']]
    return massage
def write_to_DB(name, doc_id, title, author, skim, talk, content):
    '''
    Save one record into the named collection.
    '''
    client = MongoClient('localhost', 27017)  # connect to MongoDB
    db = client['Taoguba']
    collection = db[name]
    # Collection.save() is deprecated and was removed in PyMongo 4;
    # replace_one(..., upsert=True) keeps its insert-or-overwrite-by-_id behaviour
    collection.replace_one({"_id": doc_id},
                           {"Title": title, "Author": author, "Skim": skim,
                            "Talk": talk, "Content": content},
                           upsert=True)
def stop_dict(news):
    '''
    Remove stopword characters from the raw text.
    '''
    # Character-by-character filter: any character that appears anywhere
    # in the stopword file is dropped from the text.
    with open("G:/anaconda/stopwords.txt", 'r', encoding='utf-8') as f:
        stopwords = f.read()
    outstr = ''
    for word in news:
        if word not in stopwords:
            outstr += word
    return outstr
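# A word-level variant is a common alternative: filter stopwords *after*
# jieba segmentation instead of dropping individual characters. A minimal
# sketch (stop_words is our name, not part of the original pipeline;
# it assumes one stopword per line in the same file):
def stop_words(seg_text):
    with open("G:/anaconda/stopwords.txt", 'r', encoding='utf-8') as f:
        stopwords = set(line.strip() for line in f)
    return ' '.join(w for w in seg_text.split() if w not in stopwords)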
def cut_package(news):
    '''
    Segment the text with jieba (accurate mode; alternatives below).
    '''
    seg_list = jieba.cut(news, cut_all=False)  # accurate mode (the default)
    seg = ' '.join(seg_list)
    return seg

# Alternative segmentation modes:
# seg_list = jieba.cut(news, cut_all=True)  # full mode
# print("Full Mode:", ' '.join(seg_list))
# seg_list = jieba.cut_for_search(news)  # search-engine mode
# print("Search Mode:", ' '.join(seg_list))
def world_arry(corpus):
    '''
    Build the term-frequency matrix.
    '''
    vectorizer = CountVectorizer()  # map the segmented texts to a sparse term-frequency matrix
    x = vectorizer.fit_transform(corpus)
    return x
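# TF-IDF weighting is a common drop-in replacement for raw counts when
# post lengths vary a lot. A minimal sketch (world_arry_tfidf is our
# name; the pipeline above uses plain counts):
def world_arry_tfidf(corpus):
    from sklearn.feature_extraction.text import TfidfVectorizer
    return TfidfVectorizer().fit_transform(corpus)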
def cosine_similarities(x):
    '''
    Pairwise cosine similarity between all documents.
    '''
    # cosine_similarity(x) alone already computes x against itself;
    # passing x twice is equivalent
    sim = cosine_similarity(x, x)  # renamed so the local no longer shadows the function
    return sim
def K_means(weight):
    '''
    Cluster the documents with k-means; each row of the similarity
    matrix serves as that document's feature vector.
    '''
    clf = KMeans(n_clusters=5, init='k-means++', random_state=123)
    k_data = clf.fit_predict(weight)
    return k_data
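# The cluster count 5 is hard-coded above. The silhouette score is one
# common way to sanity-check it; a minimal sketch (choose_k and the
# 2..9 range are our assumptions, not part of the original script):
def choose_k(weight):
    from sklearn.metrics import silhouette_score
    scores = {}
    for k in range(2, 10):
        labels = KMeans(n_clusters=k, init='k-means++',
                        random_state=123).fit_predict(weight)
        scores[k] = silhouette_score(weight, labels)
    return max(scores, key=scores.get)  # the k with the best silhouette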
def main():
    load_file()


if __name__ == '__main__':
    main()