一、无头浏览器
代码模板:
# 无可视化界面的操作
firefox_options = Options()
firefox_options.add_argument("--headless")
firefox_options.add_argument("--disable-gpu")
无头浏览器也就是使用selenium进行爬虫时不会出现浏览器界面
二、规避检测
代码模板:
# 实现规避检测
options = FirefoxOptions()
profile = FirefoxProfile()
ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0'
# profile.set_preference("network.proxy.type", 4) # 自动检测代理设置
profile.set_preference("dom.webdriver.enabled", False) # 设置非driver驱动
profile.set_preference('useAutomationExtension', False) # 关闭自动化提示
profile.update_preferences() # 更新设置
规避检测就是规避浏览器检测,使爬虫看起来不像是机器操作的程序
三、寻找爬取内容的xpath
这里的爬取内容以artificial intelligence为例:
在上面这张图片中,我们可以点击“PDF”进行论文的下载,当我们随便点击一个“PDF”之后,我们会进入如下页面:
该论文的URL:https://iopscience.iop.org/article/10.1088/1742-6596/1533/3/032093/pdf
通过观察此URL,我们会发现该URL的组成为三部分:https://iopscience.iop.org/article+/10.1088/1742-6596/1533/3/032093(论文的doi)+/pdf,其实这就是一种规律,我们可以利用此规律进行论文的批量爬取。这种规律是:当我们得知IOPScience网站上每一篇论文的doi之后,我们再将论文的doi与https://iopscience.iop.org/article和**/pdf**进行拼装得到相应论文PDF所对应的URL,再使用requests模块访问该URL,即可实现论文的批量爬取。
我们再回到刚刚的主页中:
此时注意红框框柱的title对应的右边的a标签,我们会惊奇地发现,该标签的href属性值竟然是:“/article/”+论文的doi,这不就是我们要找的东西嘛,获取该href属性对应的值,再将其与“https://iopscience.iop.org”和"/pdf"进行拼接,就可以完美地得到我们想要的URL。
上面这张图片红框框柱的内容对应着一页IOPScience上所有与artificial intelligence相关的论文所对应的全部div标签,我们只需要定位到class="art-list"的div标签,即可获得该标签所有的子标签,我们可以通过xpath来定位到class="art-list"的div标签
class="art-list"的div标签对应的xpath:/html/body/div[3]/div/div/main/div[2]/form/div[2]
class="art-list"的div标签的直接后代div标签(即class="art-list-item"的div标签)对应的xpath:/html/body/div[3]/div/div/main/div[2]/form/div[2]/div
红框框柱的title对应的a标签的xpath:/html/body/div[3]/div/div/main/div[2]/form/div[2]/div[1]/div/h2/a
a标签对应的href属性值为:paper.xpath("/html/body/div[3]/div/div/main/div[2]/form/div[2]/div[1]/div/h2/a/@href")[0]
对应的代码:
driver = webdriver.Firefox(executable_path='../driver/geckodriver.exe', firefox_profile=profile, options=options)
retrieve=input("请输入查询内容") # 输入内容最好是英文内容
retrieve = re.sub(r'\s', '+', retrieve)
iop_url = "https://iopscience.iop.org/nsearch?terms=" + retrieve
driver.get(url=iop_url)
time.sleep(2)
tree = etree.HTML(driver.page_source)
information = []
papers = tree.xpath("/html/body/div[2]/div/div/main/div[2]/form/div[2]/div")
for paper in papers:
title = paper.xpath("./div/h2/a/text()")[0]
href = 'https://iopscience.iop.org' + paper.xpath("./div/h2/a/@href")[0] + '/pdf'
title = re.sub(r"[.!+-=—,$%^,。?、~@#¥…&*《》<>「」{}【】()']", ' ', title)
title = re.sub(r'\n', '', title)
title = re.sub(r'\s\s+', " ", title)
title = title.strip()
print((title, href))
info = {'title': title, 'href': href}
information.append(info)
四、使用requests模块批量爬取论文
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'}
for info in information:
pdf_href = info['href']
paper = requests.get(url=pdf_href, headers=headers)
paper_title = './paper/' + info['title'] + '.pdf'
with open(paper_title, 'wb') as file:
file.write(paper.content)
print(f"{info['title']}爬取成功")
五、实现在IOPScience上根据论文标题爬取相应的论文完整代码
import tkinter
import time
from selenium import webdriver
# 实现无可视化界面的
from selenium.webdriver.firefox.options import Options
# 规避检测
from selenium.webdriver import FirefoxOptions
from selenium.webdriver import FirefoxProfile
import requests
import re
from bs4 import BeautifulSoup
import asyncio
from lxml import etree
import os
def start_crawl_papers():
    """Crawl paper PDFs from IOPScience matching the query typed into ``e1``.

    Reads the search terms from the module-level Tk entry ``e1`` and the
    number of papers to fetch from ``e2``, drives a Firefox webdriver through
    the paginated search results, collects ``{'title', 'href'}`` records, and
    finally downloads each PDF into ``./paper/`` with requests.  Errors and a
    completion message are appended to the module-level Text widget ``t``.
    """
    if not os.path.exists('./paper'):
        os.mkdir('./paper')
    # Headless operation (no visible browser window).
    firefox_options = Options()
    firefox_options.add_argument("--headless")
    firefox_options.add_argument("--disable-gpu")
    # Preferences that make the browser look less like an automated driver.
    options = FirefoxOptions()
    profile = FirefoxProfile()
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0'
    # profile.set_preference("network.proxy.type", 4)  # auto-detect proxy settings
    profile.set_preference("dom.webdriver.enabled", False)  # hide the webdriver flag
    profile.set_preference('useAutomationExtension', False)  # no automation extension
    profile.update_preferences()
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'}
    driver = webdriver.Firefox(executable_path='../driver/geckodriver.exe',
                               firefox_profile=profile, options=options)
    try:
        retrieve = re.sub(r'\s', '+', e1.get())
        iop_url = "https://iopscience.iop.org/nsearch?terms=" + retrieve
        driver.get(url=iop_url)
        time.sleep(2)
        information = []
        limit = int(e2.get())
        count = 0
        # "while count < limit" (instead of "while True") also terminates
        # cleanly when the user asks for 0 papers.
        while count < limit:
            # BUG FIX: re-parse the page source on every iteration.  The
            # original built the tree once before the loop, so each
            # "next page" click still scraped the stale first page.
            tree = etree.HTML(driver.page_source)
            papers = tree.xpath("/html/body/div[2]/div/div/main/div[2]/form/div[2]/div")
            index = 0  # entries consumed on the current result page
            for paper in papers:
                try:
                    title = paper.xpath("./div/h2/a/text()")[0]
                    href = ('https://iopscience.iop.org'
                            + paper.xpath("./div/h2/a/@href")[0] + '/pdf')
                    # BUG FIX: '-' is escaped.  The original class contained
                    # "+-=", an accidental range U+002B..U+003D that also
                    # wiped the digits 0-9 (and ,./:;<) out of every title.
                    title = re.sub(r"[.!+\-=—,$%^,。?、~@#¥…&*《》<>「」{}【】()']", ' ', title)
                    title = re.sub(r'\n', '', title)
                    title = re.sub(r'\s\s+', " ", title)
                    title = title.strip()
                    print((title, href))
                    information.append({'title': title, 'href': href})
                except Exception as e:
                    # Report the bad entry but keep crawling.  BUG FIX: the
                    # original called driver.quit() here and then kept using
                    # the dead driver on the next iteration.
                    t.insert('insert', str(e))
                index += 1
                count += 1
                if count == limit:
                    break
            if index == 10 and count != limit:
                # A full result page (10 entries) was consumed: advance to
                # the next page, then wait for it to load before re-parsing.
                driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
                # NOTE(review): selenium-3 API, matching executable_path above.
                driver.find_element_by_id('navsubmitnext').click()
                time.sleep(2)
            else:
                break
        # Download every collected PDF with plain requests.
        for info in information:
            pdf_href = info['href']
            paper = requests.get(url=pdf_href, headers=headers)
            paper_title = './paper/' + info['title'] + '.pdf'
            with open(paper_title, 'wb') as file:
                file.write(paper.content)
            print(f"{info['title']}爬取成功")
        t.insert('insert', '爬取结束\n')  # show completion in the log widget
    finally:
        # BUG FIX: always release the browser, even when an error escapes.
        driver.quit()
# --- Tkinter front end --------------------------------------------------
# Small control window: two labelled entry fields (search text and paper
# count), a start button wired to start_crawl_papers, a log Text widget,
# and a quit button.  e1, e2 and t stay module-level names because
# start_crawl_papers reads from / writes to them.
window = tkinter.Tk()
window.title('my window')
window.geometry('400x400')

tkinter.Label(window, text='输入待爬取的文本').pack()
e1 = tkinter.Entry(window, width=200)  # search-terms entry
e1.pack()

tkinter.Label(window, text='输入爬取的论文数目').pack()
e2 = tkinter.Entry(window, width=200)  # paper-count entry
e2.pack()

tkinter.Button(window, text="开始爬取", command=start_crawl_papers).pack()

# Log output area ('normal' keeps it editable so insert() works).
t = tkinter.Text(window,
                 state='normal',
                 width=200, height=10)
t.pack()

tkinter.Button(window, text='退出', command=window.quit).pack()

window.mainloop()