python词频统计（re和jieba模块的使用），获取词频TOP50的词组

最新推荐文章于 2022-12-21 12:20:10 发布

w.ang.jie

最新推荐文章于 2022-12-21 12:20:10 发布

阅读量1.3k

点赞数 1

分类专栏： python 文章标签：正则表达式 python 自然语言处理 词频统计 TOP

本文链接：https://blog.csdn.net/qq_32392597/article/details/114841427

版权

python 专栏收录该内容

28 篇文章 5 订阅

订阅专栏

参考

https://www.jianshu.com/p/28718ba04bc9?from=groupmessage
https://blog.csdn.net/qq_32392597/article/details/96147620

爬虫内容

在这里插入图片描述
对应于

代码

# -*- coding: utf-8 -*-
# import requests
import re
from bs4 import BeautifulSoup
from  urllib.request import urlopen
import collections # 词频统计库
import jieba # 解霸分词

# Fetch a policy page, pull the body paragraphs out of its HTML, strip markup
# residue and punctuation, tokenize with jieba and print the 50 most frequent
# words.

# Page whose text is analysed.
SOURCE_URL = 'http://www.gov.cn/zhengce/2020-02/05/content_5474884.htm'

# Captures the inner HTML of every indented body paragraph.
PARAGRAPH_PATTERN = re.compile(r'<p style="text-indent.*?12pt;">(.*?)</p>', re.S)

# Characters and markup residue removed before tokenizing. "span" is matched
# as a literal word: the earlier "[span]" was a character class that deleted
# every single 's', 'p', 'a' and 'n' from the text.
NOISE_PATTERN = re.compile(r'\t|\n|“|>|<|”|）|（|\.|-|:|;|\)|\(|\?|"|span')

# Custom stop words excluded from the frequency count (a set gives O(1)
# membership tests).
STOP_WORDS = frozenset({
    u'的', u'，', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都',
    u'。', u' ', u'、', u'中', u'在', u'了', u'通常', u'如果', u'我们', u'需要',
})


def extract_paragraph_text(html):
    """Return the concatenated inner HTML of all matching body paragraphs.

    Args:
        html: HTML source as a string.

    Returns:
        The captured paragraph contents joined without a separator; an empty
        string when no paragraph matches.
    """
    return ''.join(PARAGRAPH_PATTERN.findall(html))


def clean_text(text):
    """Return *text* with everything matched by NOISE_PATTERN removed."""
    return NOISE_PATTERN.sub('', text)


def count_top_words(words, limit=50):
    """Count the words that are not stop words and return the most frequent.

    Args:
        words: Iterable of tokens (for example the output of ``jieba.cut``).
        limit: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least common.
    """
    counts = collections.Counter(w for w in words if w not in STOP_WORDS)
    return counts.most_common(limit)


def main():
    """Download the page and print the top-50 word frequencies per content node."""
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(SOURCE_URL) as response:
        soup = BeautifulSoup(response, 'lxml')

    for node in soup.select('.pages_content'):
        # re needs a plain string, not a bs4 Tag. The builtin ``str`` is no
        # longer shadowed, so this also works when several nodes match.
        html = str(node)
        text = clean_text(extract_paragraph_text(html))
        tokens = jieba.cut(text, cut_all=False)  # precise-mode segmentation
        print(count_top_words(tokens))


if __name__ == '__main__':
    main()