Generating Word Clouds from PDFs

Lately I've been frantically collecting papers related to my research direction. After downloading them all, I realized that reading every one in full would take far too long, yet I still wanted to know what each paper is mainly about, which key methods it uses, and whether it would be useful to me. That's when word clouds came to mind.

A "word cloud" visually emphasizes the keywords that appear most frequently in a text, forming a "keyword cloud" or "keyword rendering". It filters out the bulk of the text, so a single glance is enough for a reader to grasp its gist.
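Under the hood this is just term-frequency counting: the most frequent tokens are drawn largest. A minimal sketch of that idea (the sample sentence below is my own made-up illustration, not taken from any paper):

from collections import Counter

# A word cloud is essentially a term-frequency table rendered visually:
# the most frequent tokens get the largest font.
sample = "gan model gan training gan loss model data"
counts = Counter(sample.split())
print(counts.most_common(3))  # [('gan', 3), ('model', 2), ('training', 1)]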

A quick online search turned up very little code for generating word clouds from PDFs, so, drawing on a few blog posts, I implemented a PDF-based word cloud myself, plus a web-scraper variant that builds a cloud from scraped pages. The full source is below (if it helps you, remember to quietly leave a like or a comment):

import requests
from bs4 import BeautifulSoup
import time
import random

from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import jieba

from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
# Note: process_pdf exists in the legacy pdfminer/pdfminer3k packages;
# pdfminer.six removed it (see the extract_text alternative after the listing).
from pdfminer.pdfinterp import PDFResourceManager, process_pdf


def wangye(url):
    # Build the page URLs from the pagination pattern observed on the target
    # site; range(0, 200, 20) yields 10 links, saved to the urls list.
    urls = [url.format(str(i)) for i in range(0, 200, 20)]
    print(urls)
    dic_h = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
    comments_list = []  # list that accumulates the scraped comments

    for url in urls:  # fetch each page and collect its comments
        r = requests.get(url=url, headers=dic_h).text

        soup = BeautifulSoup(r, 'lxml')
        comments_div = soup.find('div', id="comments")
        paragraphs = comments_div.find_all('p')

        page_comments = []
        for p in paragraphs:
            span = p.find('span')
            if span is not None and span.string:  # skip paragraphs without a text span
                page_comments.append(span.string)
        comments_list.extend(page_comments)
        time.sleep(random.randint(0, 3))  # pause 0-3 seconds between requests

    with open('ciyun_word.txt', 'w', encoding='utf-8') as f:
        # write each comment to the text file, one per line
        for comment in comments_list:
            f.write(comment + "\n")

def word():
    # read the collected text
    with open("ciyun_word.txt", encoding="utf-8") as f:
        s = f.read()

    # Chinese word segmentation with jieba
    text = ' '.join(jieba.cut(s))

    # load the mask image
    img = Image.open("changjinglu.jpg")  # open the mask image
    mask = np.array(img)  # convert the image to a NumPy array

    # common Chinese and English function words, plus domain filler, to drop
    stopwords = {"我", "你", "她", "的", "是", "了", "在", "也", "和", "就", "都", "这", "a", "we", "that", "and", "for", "this", "the", "of", "to", "are", "in",
                 "as", "our", "is", "but", "with", "by", "on", "it", "which", "all", "an", "each", "not", "from", "data", "here", "model", "method", "based",
                 "these", "their", "where", "be"}
    wc = WordCloud(font_path="msyh.ttc",
                   mask=mask,
                   width=1000,
                   height=700,
                   background_color='white',
                   max_words=200,
                   stopwords=stopwords).generate(text)

    # display the word cloud with matplotlib
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")  # hide the axes
    plt.show()

    # save the cloud to a file
    wc.to_file("ciyun.png")

def read_pdf(pdf):
    # pdfminer pipeline: resource manager -> TextConverter -> StringIO buffer
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    process_pdf(rsrcmgr, device, pdf)
    device.close()
    content = retstr.getvalue()
    retstr.close()
    # split the extracted text into lines
    lines = content.split("\n")
    with open('ciyun_word.txt', 'w', encoding='utf-8') as f:
        # write each line to the text file
        for line in lines:
            f.write(line + "\n")





if __name__ == '__main__':
    # url = 'the address of the site you want to scrape'
    # wangye(url)
    pdf_path = 'C:/Users/ASUS/Desktop/21-two stage GAN.pdf'
    with open(pdf_path, "rb") as my_pdf:
        read_pdf(my_pdf)
    word()
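One caveat: read_pdf above relies on process_pdf, which only exists in the legacy pdfminer/pdfminer3k packages. If you are on pdfminer.six instead, a rough equivalent (a sketch, not part of the original code; read_pdf_six is my own name) uses its high-level extract_text helper:

from pdfminer.high_level import extract_text  # pdfminer.six only

def read_pdf_six(pdf_path):
    # extract_text handles the resource-manager/converter plumbing internally
    # and accepts either a file path or a binary file object
    content = extract_text(pdf_path)
    with open('ciyun_word.txt', 'w', encoding='utf-8') as f:
        f.write(content)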

The generated word cloud:
[word cloud image]
I used a giraffe picture as the mask here; you can swap in any image you like. And since some of you may be as lazy as I am and not want to hunt for a picture, here is the adorable giraffe itself:
[giraffe mask image: changjinglu.jpg]
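Incidentally, if you would rather have the cloud take on the giraffe's own colors instead of the default palette, wordcloud's ImageColorGenerator can recolor a generated cloud from the mask; a short sketch, reusing the wc and mask variables from word() above:

from wordcloud import ImageColorGenerator

image_colors = ImageColorGenerator(mask)  # sample colors from the mask array
plt.imshow(wc.recolor(color_func=image_colors), interpolation='bilinear')
plt.axis("off")
plt.show()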
