统计2016政府工作报告中的高频词

最新推荐文章于 2023-12-08 06:56:37 发布

Hunter_kai

最新推荐文章于 2023-12-08 06:56:37 发布

阅读量1k

点赞数

分类专栏： Python

本文链接：https://blog.csdn.net/Hunter_kai/article/details/50827636

版权

Python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

# -*- coding: utf-8 -*-

'''
  统计2016政府工作报告中的高频词
'''

import jieba
import requests
from bs4 import BeautifulSoup

'''
    从网上抓取政府工作报告的全文 extract_text(url)
'''
def extract_text(url, timeout=30):
    '''Download a web page and return the text of all its <p> elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests.get() can block forever on a stalled
            connection.

    Returns:
        The text content of every <p> tag, in document order, each one
        followed by a newline. An empty string if the page has no <p> tags.

    Raises:
        requests.RequestException: If the connection fails, times out, or
            the server answers with an HTTP error status.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently counting the words of an error page.
    response.raise_for_status()

    # Hand BeautifulSoup the raw bytes (response.content) so that it works
    # out the encoding itself; response.text produced garbled characters
    # for this page unless the encoding was forced to utf-8.
    # Naming the parser explicitly keeps the result independent of which
    # optional parsers happen to be installed and avoids the bs4 warning.
    soup = BeautifulSoup(response.content, 'html.parser')

    # find_all('p') returns a list of every <p> tag in the document.
    return ''.join(p.get_text() + '\n' for p in soup.find_all('p'))

'''
    利用jieba分词，并计算词频 word_frequency(text)
'''
def word_frequency(text, top_n=10):
    '''Segment text with jieba and print its most frequent words.

    Args:
        text: The text to analyse.
        top_n: How many of the most frequent words to print.

    Prints one "word count" pair per line, most frequent first.
    '''
    from collections import Counter

    # Keep only tokens of at least two characters, which drops punctuation
    # and single-character stop words. Surrounding whitespace is ignored
    # for the length check so whitespace-only tokens are not counted.
    # NOTE(review): cut_all=True is jieba's "full mode", which can emit
    # overlapping candidate words for the same span of text, so counts may
    # be higher than with the default precise mode - confirm this is wanted.
    words = [word for word in jieba.cut(text, cut_all=True)
             if len(word.strip()) >= 2]
    counts = Counter(words)  # maps each word to its number of occurrences

    # most_common(n) yields the n most frequent (word, count) pairs.
    for word, freq in counts.most_common(top_n):
        # A single formatted string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print('%s %d' % (word, freq))


# Script driver: fetch the 2016 Government Work Report from gov.cn, extract
# the text of its paragraphs, and print the most frequent words.
# NOTE: these statements run at import time and perform a network request.
url_2016 = 'http://www.gov.cn/guowuyuan/2016-03/05/content_5049372.htm'
text_2016 = extract_text(url_2016)
word_frequency(text_2016)