Text Clustering
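The script below pulls forum posts from a local MongoDB instance, strips punctuation, digits and stop words, segments the text with jieba, builds a term-frequency matrix with CountVectorizer, computes pairwise cosine similarity, clusters the documents into five groups with K-means, and finally writes each group back to its own MongoDB collection.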

import re
import numpy
import jieba
import jieba.analyse                        # keyword extraction (imported but not used below)
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

def load_file():
    '''
        Load the external dictionary, strip punctuation from each post,
        segment and cluster the posts, and write every cluster back to
        the database.
    '''
    jieba.load_userdict("G:/anaconda/dict_lzf.txt")   # load external user dictionary
    client = MongoClient('localhost', 27017)  # connect to the database
    db = client['Taoguba']  # Taoguba database
    news = db.Taoguba.find()
    N_content = []
    All_content = []
    punctuation = '[’!@#~¥%……&*() ——+|}{“:”?》《,。、‘;’、】【!"#$%&\'()*+,-./:; <=>?@[\\]^_`{|}~]+'
    for i in news:
        new = i["Content"]
        news1 = re.sub(punctuation, '', new)       # strip punctuation
        news1 = re.sub('[a-zA-Z0-9]', '', news1)   # strip letters and digits
        stop_new = stop_dict(news1)                # remove stop words
        cut = cut_package(stop_new)                # segment with jieba
        N_content.append(cut)
        All_content.append(new)
    word_arry = word_array(N_content)
    cosine_Similarities = cosine_similarities(word_arry)
    k_data = K_means(cosine_Similarities)
    print("Printing clustered data:")
    data_arry = numpy.array(All_content)
    for i in range(5):
        print("--------------- printing cluster %d ---------------" % (i + 1))
        data = data_arry[k_data == i]
        id_ = 0
        collection_name = "List" + str(i + 1)
        for content in data:        # do not shadow the outer loop variable i
            All_data = find_DB(content)
            id_ += 1
            All_data.append(id_)
            print(All_data)
            write_to_DB(collection_name, All_data[5], All_data[0],
                        All_data[1], All_data[2], All_data[3],
                        All_data[4])

def find_DB(content):
    '''
        Look up a post by its content and return its fields as a list.
    '''
    client = MongoClient('localhost', 27017)  # connect to the database
    db = client['Taoguba']                    # Taoguba database
    data = db.Taoguba.find({"Content": content})    # query by content
    message = []
    for i in data:          # assumes the content matches exactly one document
        message.append(i['Title'])
        message.append(i['Author'])
        message.append(i['Skim'])
        message.append(i['Talk'])
        message.append(i['Content'])
    return message

def write_to_DB(name, doc_id, title, author, skim, talk, content):
    '''
        Save one record to the given collection.
    '''
    client = MongoClient('localhost', 27017)  # connect to the database
    db = client['Taoguba']
    collection = db[name]
    # Collection.save() is deprecated and removed in pymongo 4.x;
    # replace_one with upsert=True is the modern equivalent.
    collection.replace_one({"_id": doc_id},
                           {"Title": title, "Author": author,
                            "Skim": skim, "Talk": talk, "Content": content},
                           upsert=True)

def stop_dict(news):
    '''
        Remove stop words from the raw text, character by character.
    '''
    with open("G:/anaconda/stopwords.txt", 'r', encoding='utf-8') as f:
        stopwords = f.read()
    outstr = ''
    for word in news:               # iterates characters, not whole words
        if word not in stopwords:
            outstr += word
    return outstr
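The filter above removes stop words character by character from the raw text. Stop-word removal is more commonly applied to whole tokens after segmentation; a minimal sketch of that variant, assuming stopwords.txt holds one word per line (stop_dict_tokens is a hypothetical helper, not part of the original pipeline):

def stop_dict_tokens(seg_text):
    '''
        Drop whole stop words from a space-separated segmented string.
    '''
    with open("G:/anaconda/stopwords.txt", 'r', encoding='utf-8') as f:
        stopwords = set(f.read().splitlines())
    return ' '.join(w for w in seg_text.split() if w not in stopwords)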

def cut_package(news):
    '''
       Segment the text with jieba (several modes are available).
    '''
    seg_list = jieba.cut(news, cut_all=False)         # accurate mode (the default)
    seg = ' '.join(seg_list)
    return seg

    # seg_list = jieba.cut(news, cut_all=True)         # full mode
    # print("Full Mode:", ' '.join(seg_list))

    # seg_list = jieba.cut_for_search(news)            # search-engine mode
    # print("Search Mode:", ' '.join(seg_list))

def word_array(corpus):
    '''
        Build the term-frequency matrix.
    '''
    vectorizer = CountVectorizer()  # turn the texts into a term-frequency matrix
    x = vectorizer.fit_transform(corpus)
    return x
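One caveat worth knowing: CountVectorizer's default token_pattern, (?u)\b\w\w+\b, only keeps tokens of two or more characters, so single-character Chinese words in the jieba output are silently dropped. A minimal sketch with toy documents (get_feature_names_out requires scikit-learn >= 1.0):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["我 爱 北京", "我 爱 清华大学"]          # space-joined jieba output
vec_default = CountVectorizer()
vec_default.fit(docs)
print(vec_default.get_feature_names_out())      # ['北京' '清华大学'] -- 我 and 爱 are dropped
vec_keep = CountVectorizer(token_pattern=r'(?u)\b\w+\b')   # keep 1-char tokens
vec_keep.fit(docs)
print(vec_keep.get_feature_names_out())         # ['北京' '我' '清华大学' '爱']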

def cosine_similarities(x):
    '''
        Pairwise cosine similarity between all documents.
    '''
    cosine_similarities = cosine_similarity(x, x)
    return cosine_similarities
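Note that K_means below is fed this n x n similarity matrix, so each document is represented by its vector of similarities to every other document rather than by raw term counts. Clustering the term matrix (often TF-IDF weighted) directly is the more common choice; a minimal sketch of that variant, assuming N_content is the list of segmented texts built in load_file:

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer().fit_transform(N_content)
k_data = KMeans(n_clusters=5, init='k-means++', random_state=123).fit_predict(tfidf)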

def K_means(weight):
    '''
        Cluster the documents with K-means.
    '''
    clf = KMeans(n_clusters=5, init='k-means++', random_state=123)
    k_data = clf.fit_predict(weight)
    return k_data
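The number of clusters is hard-coded to 5. A minimal sketch for choosing it with silhouette scores instead (pick_k is a hypothetical helper; weight is the same matrix passed to K_means):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_k(weight, k_range=range(2, 10)):
    '''
        Return the k in k_range with the highest silhouette score.
    '''
    scores = {}
    for k in k_range:
        labels = KMeans(n_clusters=k, init='k-means++',
                        random_state=123).fit_predict(weight)
        scores[k] = silhouette_score(weight, labels)
    return max(scores, key=scores.get)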

def main():
    load_file()

if __name__ == '__main__':
    main()
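To run the script you need a MongoDB instance on localhost:27017 whose Taoguba database contains a Taoguba collection with Title, Author, Skim, Talk and Content fields, plus the user dictionary and stop-word files at the G:/anaconda/ paths above; the five result collections List1 through List5 are created on the fly.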

 
