新闻类型分类

最新推荐文章于 2022-07-04 11:47:35 发布

NightCharm

最新推荐文章于 2022-07-04 11:47:35 发布

阅读量6.7k

点赞数 4

分类专栏： python学习文章标签： python 分类问题

本文链接：https://blog.csdn.net/NightCharm/article/details/100879477

版权

python学习专栏收录该内容

13 篇文章 0 订阅

订阅专栏

分类问题Demo

这应该是我能想到最简单的方法惹，应该是我能想明白的 emm
这种方法不仅仅用于新闻分类，还可以扩展为标签分类。这里仅用新闻做实验
~~鬼知道我参考了多少博客，一个能直接跑起来的都没有~~ ，非常抱歉博客参考太多找不到了

思路

通过不同种类新闻提出关键词
使用 gensim 训练建立语义向量
通过向量对比做语义匹配

数据源

由于爬虫爬取的新闻还需要自己分类 =。= ，于是偷懒使用 Sogou（搜狗）2008年的数据（http://www.sogou.com/labs/resource/cs.php）

使用三方库

jieba
gensim
爬虫相关的就先不提了

代码逻辑

对搜狗实验室获取到的数据进行处理

将不同类型的文章通过 jieba 提取关键词, 将关键词进行存储

(ps: 下载下来的数据，字符集编码是 gb18030的，手动转了一次 utf-8)

代码

import os
import jieba
from jieba import analyse
from bs4 import BeautifulSoup


def jieba_content(contnet):
    """Extract keywords from a piece of text with jieba.

    ``contnet`` is handed to ``analyse.extract_tags`` with no extra
    arguments, and whatever that call yields is returned as a list.
    """
    return list(analyse.extract_tags(contnet))


def file_read(file_dir):
    """Extract keywords from every corpus file found under ``file_dir``.

    Each file is parsed for ``<doc>`` elements. For every document whose
    ``<content>`` is non-empty, the ``<url>`` text is split on ``/`` and each
    segment present in the module-level ``dicurl`` mapping selects a
    category; the document's keywords are then appended to that category's
    keyword file through ``data_write_csv``.

    Args:
        file_dir: Root directory that is walked recursively. Files are read
            as UTF-8.
    """
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            # Use the directory that is actually being walked rather than a
            # hard-coded "/a/" sub-directory of file_dir.
            file_path = os.path.join(root, name)
            print(file_path)
            with open(file_path, encoding="utf-8") as fh:
                markup = fh.read()
            # The markup is already decoded text, so no from_encoding hint
            # is passed to the parser.
            soup = BeautifulSoup(markup, "html.parser")
            for doc in soup.find_all('doc'):
                content_tag = doc.find('content')
                url_tag = doc.find('url')
                # Skip malformed documents instead of failing on None.
                if content_tag is None or url_tag is None:
                    continue
                content = content_tag.text.strip()
                if content == '':
                    continue
                for item in url_tag.text.split('/'):
                    if item in dicurl:
                        category = str(dicurl[item].strip())
                        keys = jieba_content(content)
                        data = {'type': category, 'key_list': keys}
                        data_write_csv(category, data)


def data_write_csv(filename, datas):
    """Append one document's keywords to ``test/<filename>.txt``.

    The file holds a single comma-separated keyword list. ``filename`` is
    also recorded in the module-level ``file_list`` the first time it is
    seen.

    Args:
        filename: Category name, used as the file's base name.
        datas: Mapping whose ``'key_list'`` entry is the list of keywords.
    """
    if filename not in file_list:
        file_list.append(filename)
    # Make sure the output directory exists before the first write.
    os.makedirs('test', exist_ok=True)
    target = os.path.join('test', filename + '.txt')
    keywords = datas['key_list']
    # Without a separator between successive appends, the last keyword of
    # one document and the first keyword of the next would be fused into a
    # single bogus token.
    needs_separator = (
        bool(keywords)
        and os.path.exists(target)
        and os.path.getsize(target) > 0
    )
    with open(target, 'a+', encoding='utf-8') as f:
        if needs_separator:
            f.write(',')
        f.write(','.join(keywords))


def write_file_list(file_list):
    """Write the collected category names to ``filename.txt``.

    Args:
        file_list: Iterable of names, written one per line. A plain string
            is written unchanged.
    """
    with open('filename.txt', 'w', encoding='utf-8') as f:
        if isinstance(file_list, str):
            f.write(file_list)
        else:
            # file.write() only accepts str, so passing the list itself
            # raises TypeError; emit one name per line instead.
            f.writelines(str(name) + '\n' for name in file_list)


if __name__ == "__main__":
    # Names passed to data_write_csv are accumulated here during the run.
    file_list = []
    path = "SogouCS"
    # Map each URL host segment to its category name.
    dicurl = {
        'auto.sohu.com': 'qiche',
        'it.sohu.com': 'hulianwang',
        'health.sohu.com': 'jiankang',
        'sports.sohu.com': 'tiyu',
        'travel.sohu.com': 'lvyou',
        'learning.sohu.com': 'jiaoyu',
        'career.sohu.com': 'zhaopin',
        'cul.sohu.com': 'wenhua',
        'mil.news.sohu.com': 'junshi',
        'house.sohu.com': 'fangchan',
        'yule.sohu.com': 'yule',
        'women.sohu.com': 'shishang',
        'media.sohu.com': 'chuanmei',
        'gongyi.sohu.com': 'gongyi',
        '2008.sohu.com': 'aoyun',
        'business.sohu.com': 'shangye',
        'news.sohu.com': 'other',
    }

    # Configure jieba before any keyword extraction takes place.
    jieba.load_userdict("user.txt")
    analyse.set_stop_words("stopword.txt")

    file_read(path)
    write_file_list(file_list)

通过 gensim 训练语料库

通过上一步处理过的数据进行语料库训练

代码

import os
import jieba
from jieba import analyse
from collections import defaultdict
from gensim import corpora, models, similarities


def file_read(file_dir):
    """Load the comma-separated keyword files found under ``file_dir``.

    For every file, its name is appended to the module-level ``key_list``
    and its keywords are stored in the module-level ``dic`` under that name.

    Args:
        file_dir: Directory that is walked recursively. Files are read as
            UTF-8.
    """
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            key_list.append(name)
            # Join with the directory being walked so that files in nested
            # directories resolve to their real location.
            file_path = os.path.join(root, name)
            # Read as UTF-8 explicitly instead of relying on the platform's
            # default encoding.
            with open(file_path, encoding='utf-8') as fh:
                content = fh.read()
            # Drop blank tokens (empty file, doubled or trailing commas) so
            # they never become vocabulary entries.
            tokens = (piece.strip() for piece in content.split(','))
            dic[name] = [token for token in tokens if token]


def jieba_content(contnet):
    """Return the keywords that jieba extracts from ``contnet`` as a list.

    The text is passed to ``analyse.extract_tags`` without any additional
    arguments.
    """
    keywords = list(analyse.extract_tags(contnet))
    return keywords


if __name__ == '__main__':
    dic = {}
    key_list = []

    file_read('test')
    # One keyword list per file loaded by file_read; materialised once so
    # the same sequence feeds both the dictionary and the corpus.
    texts = list(dic.values())

    # Count how often each keyword occurs across all keyword lists. The
    # counter must be incremented; assigning "+1" left every count at 1.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    # NOTE(review): the counts are not used to filter anything below, so
    # every keyword currently stays in the dictionary — confirm whether
    # low-frequency filtering is intended.

    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary.txt')

    # Convert every keyword list to bag-of-words form and persist the corpus.
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize("XinYU.mm", corpus)

通过语料库比对获取新闻的类型

加载语料库
将新内容通过 jieba 获取对应关键词
获取该组关键词和语料库中的匹配率最高的类型

代码

import os
from jieba import analyse
from gensim import corpora, models, similarities


def jieba_content(contnet):
    """Turn ``contnet`` into a list of keywords.

    Delegates to ``analyse.extract_tags`` with only the text as argument.
    """
    tags = analyse.extract_tags(contnet)
    return [*tags]


def load():
    """Load the saved dictionary, the saved corpus and the keyword file names.

    Returns:
        A ``(dictionary, corpus, file_list)`` tuple, where ``file_list`` is
        the list of file names that ``os.walk`` reports for the top level of
        the ``test`` directory.
    """
    dictionary = corpora.Dictionary.load('dictionary.txt')
    corpus = corpora.MmCorpus('XinYU.mm')

    # Collect the file names of every walked directory, then keep only the
    # first entry (the top level of 'test').
    names_per_dir = []
    for _root, _dirs, names in os.walk('test'):
        names_per_dir.append(names)
    # NOTE(review): get2 indexes this list by corpus position, so the order
    # presumably has to match the order used when the corpus was built —
    # confirm.
    file_list = names_per_dir[0]

    return dictionary, corpus, file_list



def get2(dictionary, corpus, content, file_list):
    """Return the label of the corpus document most similar to ``content``.

    Args:
        dictionary: Dictionary used to convert keywords to bag-of-words.
        corpus: Bag-of-words corpus to compare against.
        content: Text to classify.
        file_list: File names, indexed by position in ``corpus``.

    Returns:
        The entry of the module-level ``dic`` for the best-matching file
        name, or ``None`` when ``dic`` has no such key. On ties the first
        highest-scoring document wins.
    """
    # Bag-of-words vector for the keywords extracted from the new text.
    query_bow = dictionary.doc2bow(jieba_content(content))
    # TF-IDF model fitted on the stored corpus.
    tfidf_model = models.TfidfModel(corpus)
    # The number of features equals the number of known tokens.
    feature_count = len(dictionary.token2id)
    # Sparse similarity index over the TF-IDF weighted corpus.
    sim_index = similarities.SparseMatrixSimilarity(
        tfidf_model[corpus],
        num_features=feature_count,
    )
    # One similarity score per corpus document.
    scores = list(sim_index[tfidf_model[query_bow]])
    best_position = scores.index(max(scores))
    return dic.get(file_list[best_position])


if __name__ == '__main__':

    # Map each keyword file name to the label shown to the user.
    # Two labels were wrong: 'zhaopin' is recruitment (招聘), not photos
    # (照片), and 'shishang' is fashion (时尚), not market (市场).
    dic = {
        'qiche.txt': '汽车',
        'hulianwang.txt': '互联网',
        'jiankang.txt': '健康',
        'tiyu.txt': '体育',
        'lvyou.txt': '旅游',
        'jiaoyu.txt': '教育',
        'zhaopin.txt': '招聘',
        'wenhua.txt': '文化',
        'junshi.txt': '军事',
        'fangchan.txt': '房产',
        'yule.txt': '娱乐',
        'shishang.txt': '时尚',
        'chuanmei.txt': '传媒',
        'gongyi.txt': '公益',
        'aoyun.txt': '奥运',
        'shangye.txt': '商业',
        'other.txt': 'other',
    }

    dictionary, corpus, file_list = load()

    content = """
        凡孕卵在子宫腔以外的任何部位着床者，统称为异位妊娠，习称为宫外孕。根据着床部位不同，有输卵管妊娠、卵巢妊娠、腹腔妊娠、宫颈妊娠及子宫残角妊娠等。    """
    key = get2(dictionary, corpus, content, file_list)
    print('内容: {}\n 预测属于： {} 类新闻'.format(content, key))