Text Analysis: Topic Modeling of Text with gensim

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import time

import jieba.analyse

# Keyword extraction
def post_cut():
    fr = open("post_data.txt", encoding="utf-8")  # source file: one post per line, tab-separated
    fo = open("post_key.txt", "a+", encoding="utf-8")  # output file for the keywords
    for line in fr:
        term = line.strip().split("\t")
        if len(term) == 3 and term[2] != "":
            # let jieba extract the top 30 keywords directly, ranked by weight
            key_list = jieba.analyse.extract_tags(term[2], topK=30)
            fo.write(term[0] + "\t" + " ".join(key_list) + "\n")
    fr.close()
    fo.close()
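
# For reference, extract_tags can also return each keyword's TF-IDF weight;
# a minimal illustrative helper (not part of the original pipeline):
def show_weighted_keywords(text):
    for word, weight in jieba.analyse.extract_tags(text, topK=30, withWeight=True):
        print(word, weight)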


# TF-IDF weights via the hashing trick
def post_tfidf():
    from sklearn.feature_extraction.text import HashingVectorizer
    fr = open("post_key.txt", encoding="utf-8")
    id_list = []
    data_list = []
    for line in fr:
        term = line.strip().split("\t")
        if len(term) == 2:
            id_list.append(term[0])
            data_list.append(term[1])
    fr.close()

    # HashingVectorizer implements the hashing trick: features are hashed into
    # 10000 buckets, so no vocabulary needs to be held in memory.
    # (alternate_sign=False replaces the removed non_negative=True option.)
    hv = HashingVectorizer(n_features=10000, alternate_sign=False)
    post_tfidf = hv.fit_transform(data_list)  # feature matrix of shape [n_samples, n_features]
    print("Size of fea_train: " + repr(post_tfidf.shape))
    print(post_tfidf.nnz)  # number of non-zero entries
    post_cluster(id_list, post_tfidf)
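
# Note: HashingVectorizer on its own produces (L2-normalized) term counts, not
# true TF-IDF. A minimal sketch that adds the IDF step, assuming the same
# data_list as above:
#   from sklearn.feature_extraction.text import TfidfTransformer
#   counts = hv.fit_transform(data_list)
#   weighted = TfidfTransformer().fit_transform(counts)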

# K-means clustering
def post_cluster(id_list, tfidf_vec):
    from sklearn.cluster import KMeans
    kmean = KMeans(n_clusters=300)  # cluster into 300 groups
    print("kmeans")
    kmean.fit(tfidf_vec)

    pred = kmean.predict(tfidf_vec)
    fo = open("cluster.txt", "a+", encoding="utf-8")  # write out the cluster assignments
    for i in range(len(pred)):
        fo.write(id_list[i] + "\t" + str(pred[i]) + "\n")
    fo.close()
    print("%d documents clustered" % len(pred))

# LDA topic modeling (optionally followed by clustering)
def post_lda(cluster):
    from gensim import corpora, models, matutils
    fr = open("post_key.txt", encoding="utf-8")
    fo2 = open("post_vec_lda.txt", "a+", encoding="utf-8")  # output file for the LDA vectors
    id_list = []
    data_list = []

    for line in fr:
        term = line.strip().split("\t")
        if len(term) == 2:
            id_list.append(term[0])
            words = term[1].strip().split()
            data_list.append(words)
    print("lda: building dictionary")
    dic = corpora.Dictionary(data_list)  # build the dictionary
    corpus = [dic.doc2bow(text) for text in data_list]  # sparse bag-of-words vector for each text
    tfidf = models.TfidfModel(corpus)  # fit the TF-IDF statistics
    print("lda: weighting corpus")
    corpus_tfidf = tfidf[corpus]  # sparse TF-IDF vector for each text
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=200)
    # LDA vector for each text: sparse, each entry is the membership weight of the topic with that index
    corpus_lda = lda[corpus_tfidf]
    print("lda: model trained")

    num = 0
    for doc in corpus_lda:
        # serialize each document as "topic,weight/topic,weight/..."
        wstr = ""
        for topic_id, weight in doc:
            wstr += str(topic_id) + "," + str(weight)[0:7] + "/"
        fo2.write(id_list[num] + "\t" + wstr[0:-1] + "\n")
        num += 1
    fr.close()
    fo2.close()
    print(num)

    if cluster:
        # corpus2csc yields a terms-x-documents scipy sparse matrix,
        # so transpose to get documents x topics for clustering
        lda_csc_matrix = matutils.corpus2csc(corpus_lda).transpose()
        post_cluster(id_list, lda_csc_matrix)


if __name__ == "__main__":
    # url = "path"
    start = time.time()
    post_cut()
    post_tfidf()
    lda_cluster = False
    post_lda(lda_cluster)

    print(time.time() - start)
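
To sanity-check the learned topics, gensim's LdaModel can print the most
probable words per topic; a minimal sketch, assuming the lda model built in
post_lda is still in scope:

    for topic_id, topic_words in lda.print_topics(num_topics=10, num_words=8):
        print(topic_id, topic_words)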