beautifulsoup爬取不同类百度新闻 LDA尝试做新闻类型分类

最新推荐文章于 2023-03-13 09:11:16 发布

weixin_41044499

最新推荐文章于 2023-03-13 09:11:16 发布

阅读量601

点赞数

文章标签： LDA 机器学习文本分类

本文链接：https://blog.csdn.net/weixin_41044499/article/details/94578559

版权

1 beautifulsoup爬取不同类百度新闻的方法，参见如下链接

https://blog.csdn.net/weixin_41044499/article/details/94382539

beautifulsoup爬取百度新闻，方法参见之前的方案

https://blog.csdn.net/weixin_41044499/article/details/94382539

整理如下：

2 爬取军事、汽车、娱乐三类的新闻，借助gensim的LDA方法做主题分类

# !/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
from gensim import corpora, models, similarities
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import jieba
import jieba.posseg


def load_stopword(path='stopword.txt', encoding=None):
    """Load the stop-word list, one entry per line.

    Args:
        path: File to read. Defaults to ``stopword.txt`` in the current
            working directory, the location that used to be hard-coded.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is what the function used before.

    Returns:
        A list holding each line of the file with surrounding whitespace
        stripped, in file order. Blank lines are kept as empty strings.
    """
    # The context manager closes the handle even if reading or decoding
    # raises part-way through the file.
    with open(path, encoding=encoding) as f_stop:
        return [line.strip() for line in f_stop]


def segment():
    """Tokenise the three crawled news workbooks into one labelled corpus.

    Reads the ``新闻内容`` column of ``learningNotebook-auto.xlsx``,
    ``learningNotebook-ent.xlsx`` and ``learningNotebook-mli.xlsx``, cuts
    each article with ``jieba.posseg``, drops stop words and whitespace-only
    tokens, and writes the result to ``learning.xlsx`` with two columns:
    ``新闻类型`` (position of the source workbook in the list: 0, 1 or 2)
    and ``新闻内容`` (the remaining tokens joined by single spaces).
    Articles that end up with no tokens are not written.
    """
    # A set gives O(1) membership tests; the list was scanned once per token.
    stopwords = set(load_stopword())
    sources = [
        'learningNotebook-auto.xlsx',
        'learningNotebook-ent.xlsx',
        'learningNotebook-mli.xlsx',
    ]

    uinfo = []
    for title_index, source in enumerate(sources):
        # ``encoding`` is not passed: current pandas rejects it for
        # read_excel with a TypeError.
        data = pd.read_excel(source)
        for info in data['新闻内容']:
            if not isinstance(info, str):
                # Empty cells come back as NaN (a float), which jieba cannot cut.
                continue
            info_words = [
                word
                for word, _pos in jieba.posseg.cut(info)
                # Whitespace-only tokens would turn into empty "words" once
                # the space-joined text is split again downstream.
                if word.strip() and word not in stopwords
            ]
            if info_words:
                uinfo.append([title_index, ' '.join(info_words)])

    path = "learning.xlsx"
    uinfo = pd.DataFrame(uinfo, columns=['新闻类型', '新闻内容'])
    uinfo.to_excel(path, index=False)

def _dense_topic_vector(doc_topic, num_topics):
    """Expand sparse ``(topic_id, probability)`` pairs into a dense array."""
    dense = np.zeros(num_topics)
    for topic_id, prob in doc_topic:
        dense[topic_id] = prob
    return dense


def lda(num_topics=3, random_state=None):
    """Fit an LDA topic model on ``learning.xlsx`` and report the results.

    Loads the space-separated token strings from the ``新闻内容`` column,
    builds a gensim dictionary and TF-IDF corpus, trains ``LdaModel`` and
    then:

    * prints the top terms (and their weights) of every topic,
    * prints and plots the topic distribution of up to 10 randomly picked
      articles,
    * prints and plots the mean topic probability over all articles.

    Args:
        num_topics: Number of topics to fit. Defaults to 3, the value that
            used to be hard-coded.
        random_state: Forwarded to ``LdaModel``. ``None`` (the default)
            keeps the previous behaviour of an unseeded model.

    Returns:
        None. Output goes to stdout and to two matplotlib figures.
    """
    np.set_printoptions(linewidth=300)
    # ``encoding`` is not passed: current pandas rejects it for read_excel.
    data = pd.read_excel('learning.xlsx')
    print(data['新闻类型'].value_counts())
    # ``split()`` without an argument drops the empty strings that
    # ``split(' ')`` produced for consecutive spaces; ``str`` guards against
    # cells that Excel stored as numbers.
    texts = [str(info).split() for info in data['新闻内容']]
    M = len(texts)
    print('文档数目：%d个' % M)
    print('正在建立词典 --')
    dictionary = corpora.Dictionary(texts)
    print('正在计算文本向量 --')
    corpus = [dictionary.doc2bow(text) for text in texts]
    print('正在计算文档TF-IDF --')
    t_start = time.time()
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    print('建立文档TF-IDF完成，用时%.3f秒' % (time.time() - t_start))
    print('LDA模型拟合推断 --')
    t_start = time.time()
    model = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                            update_every=1, chunksize=20, iterations=1000,
                            random_state=random_state)
    print('LDA模型完成，训练时间为\t%.3f秒' % (time.time() - t_start))

    num_show_term = 20  # number of terms shown per topic
    print('每个主题的词分布：')
    for topic_id in range(num_topics):
        print('主题#%d：\t' % topic_id, end=' ')
        # Ask for ``num_show_term`` terms explicitly: the default ``topn`` is
        # 10, so slicing the default result to 20 never showed more than 10.
        top_terms = model.get_topic_terms(topicid=topic_id, topn=num_show_term)
        for term_id, _weight in top_terms:
            # ``dictionary[...]`` fills the lazily built id->token map;
            # reading ``dictionary.id2token`` directly can hit an empty dict.
            print(dictionary[term_id], end=' ')
        print('\n概率：\t', np.array([weight for _term, weight in top_terms], dtype=float))

    # Infer every document once, as dense vectors of length ``num_topics``.
    # ``minimum_probability=0`` keeps low-probability topics, and the dense
    # expansion guarantees position j always means topic j.
    doc_topics = np.array([
        _dense_topic_vector(doc_topic, num_topics)
        for doc_topic in model.get_document_topics(corpus_tfidf, minimum_probability=0)
    ]).reshape(M, num_topics)

    # Print the topics of up to 10 randomly chosen news documents.
    np.set_printoptions(linewidth=200, suppress=True)
    num_show_topic = 10  # how many leading topics to show per document
    print('10个新闻的主题分布：')
    idx = np.arange(M)
    np.random.shuffle(idx)
    idx = idx[:10]
    for i in idx:
        topic_distribute = doc_topics[i]
        # Topic ids ordered by descending probability.
        topic_idx = topic_distribute.argsort()[:-num_show_topic - 1:-1]
        print(('第%d个用户的前%d个主题：' % (i, num_show_topic)), topic_idx)
        print(topic_distribute[topic_idx])

    # Plot the topic distribution of those same documents.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(8, 7), facecolor='w')
    for i, k in enumerate(idx):
        ax = plt.subplot(5, 2, i + 1)
        # ``i`` is only the subplot position; ``k`` is the sampled document,
        # so the data must be looked up with ``k`` to match the title.
        ax.stem(doc_topics[k], linefmt='g-', markerfmt='ro')
        ax.set_xlim(-1, num_topics)
        ax.set_ylim(0, 1)
        ax.set_ylabel("概率")
        ax.set_title("文章 {0}主题{1}".format(k, data['新闻类型'].iloc[k]))
        # Positional ``True`` works whether the keyword is ``b`` or ``visible``.
        ax.grid(True, axis='both', ls=':', color='#606060')
    plt.xlabel("主题", fontsize=13)
    plt.suptitle('文章的主题分布', fontsize=15)
    # ``pad`` is passed by keyword; current matplotlib no longer takes it
    # positionally.
    plt.tight_layout(pad=1, rect=(0, 0, 1, 0.95))
    plt.show()

    # Topic strength: mean probability of each topic over all documents.
    print('\n各个主题的强度:\n')
    topic_all = doc_topics.sum(axis=0) / M
    order = topic_all.argsort()
    topic_sort = topic_all[order]
    print(topic_sort)
    plt.figure(facecolor='w')
    plt.stem(topic_sort, linefmt='g-', markerfmt='ro')
    plt.xticks(np.arange(order.size), order)
    plt.xlabel("主题", fontsize=13)
    plt.ylabel("主题出现概率", fontsize=13)
    plt.title('主题强度', fontsize=15)
    plt.grid(True, axis='both', ls=':', color='#606060')
    plt.show()


if __name__ == '__main__':
    # Run the pipeline in order, announcing each stage before it starts:
    # tokenise the crawled workbooks first, then fit and plot the LDA model.
    for stage_name, stage in (('segment', segment), ('lda', lda)):
        print(stage_name)
        stage()

三种新闻类型的数量分别为58、19、17。将LDA程序反复执行多次后发现，结果并不稳定：每次运行得到的主题强度和各主题的词分布都不相同，三个主题的强度也不均匀；有些结果比较准确，有些则不太靠谱。可能的原因是LDA模型采用随机初始化，而且样本数量较少、各类别数量不均衡；如需复现某次结果，可以在gensim的LdaModel中通过random_state参数固定随机种子。

#各个主题的强度:[0.05099506 0.30505012 0.64395482]
#各个主题的强度:[0.07306102 0.22705589 0.6998831 ]
#各个主题的强度:[0.06512089 0.15694532 0.77793379]
#各个主题的强度:[0.12991019 0.26014361 0.6099462 ]
每个主题的词分布：
主题#0：       伊朗 美国 步枪 无人机 将军 美军 缅甸 体能 机油 97 
主题#1：       俄罗斯 量子 米格 F16 无人机 直升机 29 美国 通讯 拦截
主题#2：       黄轩 杨采钰 芸 Pro 超能 秦  轩逸 比亚迪 燃油
每个主题的词分布：
主题#0：       迈凯伦 对面 一辆 事故 轿车 奥迪车 小轿车 跑车 驾驶员 
主题#1：       俄罗斯 美国 无人机 伊朗 中国 我国 机油 拦截 步枪 量子
主题#2：       娱君 煮 文 险 保险 涉水 盗抢险 王某利 交通事故 划痕
每个主题的词分布：
主题#0：       娱君 煮 文 保险 险 王某利 公交车 交通事故 专班 驾驶
主题#1：       俄罗斯 美国 无人机 伊朗 我国 中国 步枪 量子 米格 通讯
主题#2：       特朗普 伊朗 EXO 弹道导弹 险 黄轩 划痕 车辆  杨采钰
每个主题的词分布：
主题#0：       险 划痕 轿车 对面 刮痕 事故 一辆 责任 机动车 保险
主题#1：       俄罗斯 美国 无人机 伊朗 我国 中国 拦截 机油 步枪 量子
主题#2：       煮 娱君 黄轩 杨采钰 文 芸  电影 剧照 新西兰

weixin_41044499

关注

0
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
beautifulsoup爬取不同类百度新闻 LDA尝试做新闻类型分类

1beautifulsoup爬取不同类百度新闻的方法，参见如下链接https://blog.csdn.net/weixin_41044499/article/details/94382539beautifulsoup爬取百度新闻，方法参见之前的方案https://blog.csdn.net/weixin_41044499/article/details/94382539整理如下：...
复制链接

扫一扫