【无标题】报告实录文本挖掘与分析(简单版)

中国共产党第十八次、十九次、二十次全国代表大会

报告实录文本挖掘与分析(简单版)

爬取18大报告实录文本

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 

# Scrape the multi-page 18th Congress report transcript and store the text of
# every page, in order, in 18da_report_raw.txt.
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
content_xpath = '//*[@id="p_content"]'  # element whose text is saved on every page
try:
    driver.get('http://cpc.people.com.cn/n/2012/1118/c64094-19612151.html')  # open the first page

    # Read the first page. The text has to be fetched before it is written;
    # mode 'w' truncates any output left over from a previous run.
    data = wait.until(
        EC.presence_of_element_located((By.XPATH, content_xpath))
    ).text
    with open('18da_report_raw.txt', 'w', encoding='utf-8') as file:
        file.write(data)
    # On the first page the pager link is located with td/a (an XPath locator).
    wait.until(EC.presence_of_element_located(
        (By.XPATH, '//*[@id="p_content"]/center/table/tbody/tr/td/a')
    )).click()

    # Read the next ten pages one after another, appending each to the file.
    for _ in range(10):
        data = driver.find_element(By.XPATH, content_xpath).text
        with open('18da_report_raw.txt', 'a', encoding='utf-8') as file:
            file.write(data)
        time.sleep(2)  # pause between page requests
        # From the second page on the link is located with td[2]/a
        # (presumably the second pager cell is "next page" -- verify on the site).
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="p_content"]/center/table/tbody/tr/td[2]/a')
        )).click()

    # Read the last page.
    data = driver.find_element(By.XPATH, content_xpath).text
    with open('18da_report_raw.txt', 'a', encoding='utf-8') as file:
        file.write(data)
finally:
    # Always close the browser, even when a page fails to load.
    driver.quit()

爬取19大报告文本

from selenium import webdriver

# Scrape the 19th Congress report (single page) into 19da_report_raw.txt.
# Local import so this snippet also runs on its own.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    driver.get('http://finance.sina.com.cn/china/gncj/2017-10-18/doc-ifymvuyt4098830.shtml')
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
    # Selenium 4 no longer provides.
    data = driver.find_element(By.XPATH, '//*[@id="articleContent"]/div[1]').text
finally:
    # Close the browser whether or not the element was found.
    driver.quit()
with open('19da_report_raw.txt', 'w', encoding='utf-8') as file:
    file.write(data)

爬取20大报告文本

from selenium import webdriver

# Scrape the 20th Congress report (single page) into 20da_report_raw.txt.
# Local import so this snippet also runs on its own.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    driver.get('https://news.ifeng.com/c/8K9l4qcZtaw')
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
    # Selenium 4 no longer provides.
    data = driver.find_element(
        By.XPATH, '//*[@id="root"]/div/div[2]/div[2]/div/div[1]/div/div'
    ).text
finally:
    # Close the browser whether or not the element was found.
    driver.quit()

with open('20da_report_raw.txt', 'w', encoding='utf-8') as file:
    file.write(data)

收集数据后进行数据处理

高频词分析以及画词云图(分析其他报告时,替换代码中的 txt 文件名即可)

import jieba

from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Word-frequency table and word cloud for one report; swap "18.txt" for
# another report file to analyse it instead.
from collections import Counter  # local import keeps this snippet self-contained

with open("18.txt", encoding="utf-8") as report_file:
    txt = report_file.read()
# One stop word per line; strip() drops the trailing newline and surrounding
# whitespace. A set makes the membership test below O(1) per token.
with open("baidu_stopwords.txt", encoding="utf-8") as stopword_file:
    stopwords = {line.strip() for line in stopword_file}
words = jieba.lcut(txt, cut_all=False)  # accurate (non-full) segmentation mode

# Count every token that is not a stop word and is not a single character.
counts = Counter(
    word for word in words
    if word not in stopwords and len(word) != 1
)
items = counts.most_common()  # (word, count) pairs, most frequent first
# Slicing prints at most 30 rows and does not fail on short texts.
for word, count in items[:30]:
    print("{:<10}{:>7}".format(word, count))

text = ' '.join(words)
wc = WordCloud(
    background_color='white',  # canvas background colour
    font_path='msyh.ttc',      # a CJK font is required, otherwise Chinese shows up as empty boxes
    scale=2,                   # scaling factor applied to the canvas (2 -> twice the width and height)
    max_words=100,             # maximum number of words drawn
    max_font_size=80,          # largest font size used
    stopwords=stopwords,
)
wc.generate(text)
# Display the word cloud.
plt.imshow(wc)
plt.axis('off')
plt.show()
wc.to_file('词频统计.jpg')

关键词分析(简易版)

import jieba
import jieba.analyse as analyse
# Keyword extraction for one report: print the ten highest-weighted keywords.
# The encoding is explicit so the read does not depend on the platform
# default, matching how the other snippets open their files.
with open('20.txt', encoding='utf-8') as f:  # default mode 'r': read-only
    contents = f.read()  # whole file as one string
# Remove full-width spaces, line breaks and ordinary spaces, then trim any
# other trailing whitespace (the rstrip() result must be assigned to count).
contents = (
    contents.replace('\u3000', '')
    .replace('\n', '')
    .replace('\r', '')
    .replace(" ", "")
    .rstrip()
)

# Generic high-frequency words to drop from jieba's dictionary before
# segmenting (presumably so they stop showing up as keywords -- confirm
# against the jieba documentation for del_word).
for noise_word in (
    "我们", "实现", "伟大", "坚持", "加强", "全面", "中国",
    "完善", "加快", "健全", "社会", "推进", "推动", "提高",
):
    jieba.del_word(noise_word)

words = jieba.lcut(contents, cut_all=False)  # accurate-mode segmentation
text = ' '.join(words)

# withWeight=True yields (keyword, weight) pairs; allowPOS=() applies no
# part-of-speech filter.
keywords = analyse.extract_tags(text, topK=10, withWeight=True, allowPOS=())
for keyword, weight in keywords:
    print(keyword, weight)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值