Compute TF-IDF for a text corpus and then cluster the documents

The script below tokenizes each document with jieba, computes TF-IDF by hand (so that manually tuned IDF values can be injected), reduces the matrix with truncated SVD, and clusters the result with DBSCAN (KMeans/MiniBatchKMeans alternatives are left commented out).

import jieba  
import jieba.analyse  
import math  
import operator  
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation, DBSCAN  
from sklearn.cluster import MeanShift, estimate_bandwidth  
from collections import Counter  
from sklearn.manifold import TSNE  
from sklearn.decomposition import TruncatedSVD  
from sklearn.decomposition import PCA  
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.feature_extraction.text import HashingVectorizer  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.pipeline import make_pipeline  
from sklearn.preprocessing import Normalizer  
import numpy as np  
import matplotlib.pyplot as plt  
from collections import defaultdict  
import os  
#np.set_printoptions(threshold=np.inf)  
  
# Load manually-set IDF values for certain words
def load_idf_file(path):
    idf_dict = {}
    handle = open(path, 'r', encoding='utf8')
    line = handle.readline()               # read the file line by line
    while line:
        line = line.strip()
        if len(line) > 0:
            line_arr = line.split(' ')
            idf_dict[line_arr[0]] = float(line_arr[1])
        line = handle.readline()
    handle.close()
    return idf_dict
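
The manual IDF file is assumed to be plain text with one "word idf_value" pair per line, separated by a single space (the same format that save() below writes out). A hypothetical example of two lines:

    some_word 7.21
    another_word 5.36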
  
def cal_idf(data_set,idf_dict):  
    doc_num = len(data_set)  
    word_doc_count=defaultdict(int)  
    for word_str in data_set:  
        word_list = word_str.split(' ')  
        word_list = list(set(word_list))  
        for item in word_list:  
            if item and item.strip()!='':  
                word_doc_count[item]+=1  
  
    word_idf = {}  
    default_idf_keys = idf_dict.keys()  
    for k,v in word_doc_count.items():    
        idf = math.log(doc_num*1.0 / v)    
        if k in default_idf_keys: word_idf[k] = idf_dict[k]  
        else:word_idf[k] = idf  
    #path = "idf.txt"    
    #save(word_idf, path)    
    return word_idf  
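
cal_idf counts, for each word, the number of documents it appears in and computes idf = log(N / df), where N is the number of documents; a manually set value from the IDF file overrides the computed one. A quick toy check (hypothetical words, meant to be run separately):

docs = ["apple banana", "apple cherry", "banana cherry"]
idf = cal_idf(docs, {})          # no manual overrides
print(idf["apple"])              # log(3 / 2) ≈ 0.405, since "apple" appears in 2 of 3 docs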
def cal_tfidf(data_set,idf_ret):  
    doc_word_tfidf = []      
    i = 0  
    for word_str in data_set:  
        word_list = word_str.split(' ')  
        doc_word_total = len(word_list)  
        doc_word_dict = defaultdict(int)  
        doc_word_tfidf_dict = defaultdict(int)  
        for item in word_list:  
            if item and item.strip()!='':  
                doc_word_dict[item]+=1  
        for k,v in doc_word_dict.items():  
            doc_word_tfidf_dict[k]=(v/doc_word_total)*idf_ret[k]  
        doc_word_tfidf.append(doc_word_tfidf_dict)  
        i=i+1  
    return doc_word_tfidf      
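
cal_tfidf then computes tf(w) = count(w) / len(doc) per document and multiplies it by the IDF from cal_idf. Continuing the toy corpus from the sketch above:

tfidf = cal_tfidf(docs, idf)     # docs and idf from the cal_idf sketch above
print(tfidf[0]["apple"])         # (1/2) * log(3/2) ≈ 0.203 for "apple" in the first doc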
  
  
def save(idf_dict, path):
    # Overwrite the file with one "word idf" pair per line ('w' clears any previous content)
    f = open(path, 'w', encoding='utf8')
    for key in idf_dict.keys():
        f.write(str(key) + " " + str(idf_dict[key]) + "\n")
    f.close()
  
# Tokenize: extract the top keywords of each document with jieba, keep at most
# the top 60%, and drop stop words and single characters
# (uses the module-level datas and the stop-word list lines)
def jieba_tokenize():
    jieba_need = []
    for item in datas:
        temp_list1 = jieba.analyse.extract_tags(item, topK=10)
        w_len = len(temp_list1)
        if w_len > 2:
            w_num = math.ceil(float(w_len)*0.6)
            temp_list1 = temp_list1[0:w_num]
        temp_list = [e for e in temp_list1 if e not in lines + [' '] and len(e) > 1]
        jieba_need.append(" ".join(temp_list))
    return jieba_need
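
jieba.analyse.extract_tags returns the top-K keywords of a sentence ranked by jieba's built-in TF-IDF weighting (it uses the custom IDF file registered via set_idf_path below). A standalone example with a made-up sentence:

import jieba.analyse
print(jieba.analyse.extract_tags("今天天气不错，我们去公园散步", topK=5))
# e.g. ['散步', '公园', '天气', ...] -- the exact list depends on jieba's dictionary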
  
  
# Dimensionality reduction: LSA, i.e. TruncatedSVD down to 50 components
# followed by L2 normalization of each document vector
def reduction(matrix):
    svd = TruncatedSVD(50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(matrix)
    return X
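
TfidfVectorizer is imported above but never used; the TF-IDF matrix is built by hand, presumably so the manually tuned IDF values can be injected. For reference, the same space-joined token strings could also be vectorized directly with scikit-learn (a sketch only; it uses sklearn's own IDF formula, so the overrides from ./jieba/idf.txt would not apply):

def vectorize_with_sklearn(token_strings):
    # token_strings: list of documents as space-joined tokens, e.g. the output of jieba_tokenize()
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")   # keep 1-char tokens too
    return vectorizer.fit_transform(token_strings)               # sparse matrix [num_docs, num_terms]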
  
jieba.analyse.set_idf_path("./jieba/idf.txt")  
data_num = 10000  
k_num = 200  
  
i = 0  
handle = open('../data.txt', 'r', encoding='utf8')
line = handle.readline()               # read the corpus line by line, up to data_num documents
datas = []
while line:
    line = line.strip()
    if len(line) > 0:
        i = i+1
        datas.append(line)
        if i >= data_num: break
    line = handle.readline()
handle.close()
  
  
npyfile = "data.npy"  
if os.path.exists(npyfile):
    X = np.load(npyfile)
else:  
    # Load stop words
    with open('./stop_words.txt', encoding='utf-8') as f:
        entities = list(f)
        lines = []
        for line in entities:
            line1 = line.strip()
            lines.append(line1)
    # Load the manually-set IDF values
    default_idf_dict = load_idf_file("./jieba/idf.txt")
    my_train = jieba_tokenize()  
  
    idf_ret = cal_idf(my_train,default_idf_dict)  
    tfidf_ret = cal_tfidf(my_train,idf_ret)  
    doc_rows = len(datas)  
    word_rows = len(idf_ret)  
    # Turn each document's per-word TF-IDF values into a matrix of shape [num_docs, num_words]
    X = np.zeros([doc_rows, word_rows])
    for i in range(doc_rows):
        j = 0
        for k, v in idf_ret.items():
            X[i][j] = tfidf_ret[i][k]      # 0 for words absent from this document
            j = j+1
    np.save("data.npy",X)  
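
With data_num = 10000 documents and a vocabulary that can easily reach tens of thousands of words, the dense matrix above gets memory-hungry. TruncatedSVD also accepts scipy sparse input, so a sparse construction is a drop-in alternative (a sketch, assuming scipy is installed; idf_ret and tfidf_ret are the dictionaries built above):

from scipy.sparse import lil_matrix

def build_sparse_matrix(idf_ret, tfidf_ret, doc_rows):
    # Same [num_docs, num_words] layout as the dense version, storing only non-zero entries
    word_index = {k: j for j, k in enumerate(idf_ret.keys())}
    M = lil_matrix((doc_rows, len(word_index)))
    for i in range(doc_rows):
        for k, v in tfidf_ret[i].items():
            M[i, word_index[k]] = v
    return M.tocsr()   # CSR feeds TruncatedSVD directly; persist with scipy.sparse.save_npz instead of np.save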
# Dimensionality reduction
X = reduction(X)
#cl = MiniBatchKMeans(n_clusters=k_num, init='k-means++', n_init=1,init_size=1000, batch_size=10000, verbose=False)  
#cl = KMeans(n_clusters=k_num, init='k-means++', random_state=30, n_init=1,verbose=False)  
cl = DBSCAN(eps=0.2, min_samples=30)
result = cl.fit_predict(X)            # DBSCAN labels noise points as -1
num_clusters = len(set(result))       # counts the noise bucket too when -1 is present
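
Before writing the results it is worth checking the label distribution: with DBSCAN a badly chosen eps can send most points to the noise bucket (-1). Counter, already imported above, is enough for a quick look:

print("clusters (including noise):", num_clusters)
print(Counter(result))                 # label -> number of documents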
  
# Write the results: group documents by cluster label (DBSCAN noise, label -1, gets its own file)
ret = defaultdict(list)
for i in range(len(datas)):
    ret[result[i]].append(datas[i])

os.makedirs("./result", exist_ok=True)
for label, docs in ret.items():
    name = "noise" if label == -1 else str(label)
    handle = open("./result/result_" + name + ".txt", 'w', encoding='utf8')
    for doc in docs:
        handle.write(doc + "\n")
    handle.close()
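
To compare parameter settings (eps/min_samples, or DBSCAN against the commented-out KMeans variants), one simple option is a silhouette score over the reduced vectors. A sketch, ignoring noise points since they do not form a real cluster:

from sklearn.metrics import silhouette_score

mask = result != -1                       # drop DBSCAN noise points
if len(set(result[mask])) > 1:            # silhouette needs at least 2 clusters
    print("silhouette:", silhouette_score(X[mask], result[mask]))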