Implementing K-means Text Clustering in Python

Contents

1. Data

2. Code

        2.1 Loading stop words

        2.2 Loading the data

        2.3 Computing TF-IDF vectors

        2.4 Training

3. Complete code


1. Data

        The data was scraped from Tieba with a crawler (the scraping itself is not covered here) and saved to a txt file, one sentence per line. Each sentence is then tokenized and turned into a vector, and K-means finally clusters the vectors and outputs the result.

2. Code

        2.1 Loading stop words

                The stop_words directory contains several stop-word lists, so we loop over all of them and merge the words into one list.

def defined_stop_words():
    all_stop_words = []
    for file in os.listdir(r'D:\Gitlab\extract_key\stop_words'):
        # Read each stop-word file under the stop_words directory
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            all_line = fp.readlines()
            for line in all_line:
                all_stop_words.append(line.replace('\n',''))
    return all_stop_words
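
                A quick usage sketch: membership tests against a list are linear in its length, so if the merged lists are large, wrapping the result in a set speeds up the per-token lookups done later in loadDataset(). The set conversion is an optional tweak, not part of the original flow.

stop_words = set(defined_stop_words())   # set lookup is O(1) per token
print(len(stop_words))                   # number of distinct stop words
print('的' in stop_words)                # True if '的' appears in any of the lists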

        2.2 Loading the data

                This step filters the raw data, tokenizes it with jieba, and removes stop words; it is a fairly standard preprocessing pipeline.


def loadDataset(filepath):
    '''Load the text dataset: keep sentences containing a keyword, tokenize them and drop stop words.'''
    dataset = []
    key_list = ['公司','项目','专业投资团队','元宇宙投资项目','养老项目','养老服务','老年产品','高回报','理财','募集','贷款','抵押','利息','保险','包赔','高利贷']
    with open(filepath,'r',encoding='utf-8') as fp:
        all_line = fp.readlines()
        for line in all_line:
            dataset.append(line.replace('\n',''))

    # print(len(dataset))
    # # Optional: randomly sample a subset of the data
    # dataset = random.sample(dataset,10000)
    # print(len(dataset))

    # Load stop words
    stop_words = defined_stop_words()
    all_sen = []
    original_sen = []
    for sen in list(set(dataset)):
        # Keep only sentences that contain at least one keyword
        for key in key_list:
            if operator.contains(sen,key):
                sentence = ""
                # Tokenize with jieba (POS tagging)
                word = jieba_postag(sen)
                for w in word:
                    # Drop stop words
                    if w.word not in stop_words:
                        sentence += w.word + ' '
                all_sen.append(sentence)
                original_sen.append(sen)
                break
    #      original sentences   tokenized sentences
    return original_sen,all_sen
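
                The nested loop with break works, but the same filter reads a little cleaner with any(). A minimal sketch of the equivalent logic, reusing key_list, stop_words and jieba_postag() from above (filter_and_cut is just an illustrative name):

def filter_and_cut(sentences, key_list, stop_words):
    original_sen, all_sen = [], []
    for sen in set(sentences):
        # Keep the sentence only if it contains at least one keyword
        if any(key in sen for key in key_list):
            words = [w.word for w in jieba_postag(sen) if w.word not in stop_words]
            original_sen.append(sen)
            all_sen.append(' '.join(words))
    return original_sen, all_sen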

        2.3 Computing TF-IDF vectors

                transform() returns the TF-IDF matrix X for the input dataset; tune max_df, min_df and max_features to suit your data.

def transform(dataset, n_features=1000):
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer
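
                A quick way to sanity-check these parameters is to look at the shape of X and the learned vocabulary. A sketch, assuming all_sen is the tokenized output of loadDataset(); get_feature_names_out() needs a recent scikit-learn (older versions use get_feature_names()):

X, vectorizer = transform(all_sen, n_features=500)
print(X.shape)                                  # (number of sentences, number of features)
print(vectorizer.get_feature_names_out()[:20])  # first 20 terms kept by max_df/min_df/max_features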

        2.4 Training

                K-means is used here with a hand-picked k value, which is a somewhat under-considered choice (see Section 3).

def train(X, vectorizer, true_k=10, minibatch=False):
    # Train with MiniBatchKMeans (mini-batches) or plain KMeans
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    # Save the trained model
    save_model_file(km,'Kmeans.pkl')

    result = list(km.predict(X))
    print('Cluster distribution:')
    print(dict([(i, result.count(i)) for i in result]))
    return -km.score(X), result   # -score(X) is the inertia (sum of squared distances)
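
                The vectorizer argument is not used inside train(); a common follow-up, sketched below after the scikit-learn text-clustering examples, is to use it to print the highest-weighted terms of each cluster centroid:

import numpy as np

def print_top_terms(km, vectorizer, n_terms=10):
    # Sort each centroid's term weights in descending order
    order = np.argsort(km.cluster_centers_, axis=1)[:, ::-1]
    terms = vectorizer.get_feature_names_out()   # or get_feature_names() on older scikit-learn
    for i, row in enumerate(order):
        print(f'cluster {i}:', ' '.join(terms[j] for j in row[:n_terms]))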

3. Complete code

        Some parts need adjusting for your own data; what is done here is plain text clustering. Blindly fixing k at 100 is not very reasonable. If you are interested, have a look at the Canopy algorithm, which can derive a suitable number of clusters from the dataset itself; Canopy + K-means may well work better.
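        Before the full script, here is a minimal sketch of one alternative: scanning a few candidate k values with the silhouette score. scan_k and the candidate list are illustrative; X is the TF-IDF matrix returned by transform().

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def scan_k(X, candidates=(10, 20, 50, 100)):
    # Higher silhouette generally means better-separated clusters;
    # sample_size keeps the pairwise-distance cost manageable on large corpora
    for k in candidates:
        labels = KMeans(n_clusters=k, init='k-means++', n_init=1).fit_predict(X)
        print(k, silhouette_score(X, labels, sample_size=2000, random_state=0))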

from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
import pandas as pd
import sys
import os
import jieba.posseg as pseg
import operator
import random
import joblib   # sklearn.externals.joblib was removed in newer scikit-learn versions



def save_model_file(model,save_model_name):
    joblib.dump(model, save_model_name)

def jieba_postag(text):
    words = pseg.cut(text)
    return words


def defined_stop_words():
    all_stop_words = []
    for file in os.listdir(r'D:\Gitlab\extract_key\stop_words'):
        # Read each stop-word file under the stop_words directory
        filepath = fr'D:\Gitlab\extract_key\stop_words\{file}'
        with open(filepath, 'r', encoding='utf-8') as fp:
            all_line = fp.readlines()
            for line in all_line:
                all_stop_words.append(line.replace('\n',''))
    return all_stop_words


def loadDataset(filepath):
    '''Load the text dataset: keep sentences containing a keyword, tokenize them and drop stop words.'''
    dataset = []
    key_list = ['公司','项目','专业投资团队','元宇宙投资项目','养老项目','养老服务','老年产品','高回报','理财','募集','贷款','抵押','利息','保险','包赔','高利贷']
    with open(filepath,'r',encoding='utf-8') as fp:
        all_line = fp.readlines()
        for line in all_line:
            dataset.append(line.replace('\n','' ))

    # print(len(dataset))
    # # Optional: randomly sample a subset of the data
    # dataset = random.sample(dataset,10000)
    # print(len(dataset))

    stop_words = defined_stop_words()
    all_sen = []
    original_sen = []
    for sen in list(set(dataset)):
        # Keep only sentences that contain at least one keyword
        for key in key_list:
            if operator.contains(sen,key):
                sentence = ""
                # Tokenize with jieba (POS tagging)
                word = jieba_postag(sen)
                for w in word:
                    # Drop stop words
                    if w.word not in stop_words:
                        sentence += w.word + ' '
                all_sen.append(sentence)
                original_sen.append(sen)
                break

    return original_sen,all_sen


def transform(dataset, n_features=1000):
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer


def train(X, vectorizer, true_k=10, minibatch=False):
    # Train with MiniBatchKMeans (mini-batches) or plain KMeans
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    # Save the trained model
    save_model_file(km,'Kmeans.pkl')


    result = list(km.predict(X))
    print('Cluster distribution:')
    print(dict([(i, result.count(i)) for i in result]))
    return -km.score(X), result   # -score(X) is the inertia (sum of squared distances)


def test():
    '''Run the full pipeline: load data, vectorize, cluster, and save the result to CSV.'''
    # Load the data
    filepath = r'D:\Gitlab\extract_key\all.txt'
    original_data,dataset = loadDataset(filepath)

    X, vectorizer = transform(dataset, n_features=500)
    train_score,class_result = train(X, vectorizer, true_k=100)
    score = train_score / len(dataset)   # average inertia per sentence
    print(score)


    abc_dict = {
        'original_sentence':original_data,
        'class':class_result,
        'cut_words':dataset
    }
    result = pd.DataFrame(abc_dict)
    # print(result)

    result.to_csv('result.csv',index=False)






if __name__ == '__main__':

    test()
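
        One caveat with the script above: it persists only the K-means model, but labelling new text later also needs the fitted TfidfVectorizer. A sketch of how that could look; Tfidf.pkl and label_new_sentences are illustrative names, and it assumes the vectorizer was saved after training with joblib.dump(vectorizer, 'Tfidf.pkl'):

import joblib

def label_new_sentences(tokenized_sentences, km_path='Kmeans.pkl', vec_path='Tfidf.pkl'):
    # Load the saved model and vectorizer, then assign clusters to new,
    # already-tokenized sentences (space-separated words, as in loadDataset)
    km = joblib.load(km_path)
    vectorizer = joblib.load(vec_path)
    return km.predict(vectorizer.transform(tokenized_sentences))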

