一、无头浏览器
代码模板:
# 无可视化界面的操作
firefox_options = Options()
firefox_options.add_argument("--headless")
firefox_options.add_argument("--disable-gpu")
无头浏览器也就是使用selenium进行爬虫时不会出现浏览器界面
二、规避检测
代码模板:
# 实现规避检测
options = FirefoxOptions()
profile = FirefoxProfile()
ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0'
# profile.set_preference("network.proxy.type", 4) # 自动检测代理设置
profile.set_preference("dom.webdriver.enabled", False) # 设置非driver驱动
profile.set_preference('useAutomationExtension', False) # 关闭自动化提示
profile.update_preferences() # 更新设置
规避检测就是规避浏览器检测,使爬虫看起来不像是机器操作的程序
三、寻找爬取内容的xpath
这里的爬取内容以artificial intelligence为例:
在上面这张图片中,我们可以点击“PDF”进行论文的下载,当我们随便点击一个“PDF”之后,我们会进入如下页面:
该论文的URL:https://iopscience.iop.org/article/10.1088/1742-6596/1533/3/032093/pdf
通过观察此URL,我们会发现该URL的组成为三部分:https://iopscience.iop.org/article+/10.1088/1742-6596/1533/3/032093(论文的doi)+/pdf,其实这就是一种规律,我们可以利用此规律进行论文的批量爬取。这种规律是:当我们得知IOPScience网站上每一篇论文的doi之后,我们再将论文的doi与https://iopscience.iop.org/article和**/pdf**进行拼装得到相应论文PDF所对应的URL,再使用requests模块访问该URL,即可实现论文的批量爬取。
我们再回到刚刚的主页中:
此时注意红框框柱的title对应的右边的a标签,我们会惊奇地发现,该标签的href属性值竟然是:“/article/”+论文的doi,这不就是我们要找的东西嘛,获取该href属性对应的值,再将其与“https://iopscience.iop.org”和"/pdf"进行拼接,就可以完美地得到我们想要的URL。
上面这张图片红框框柱的内容对应着一页IOPScience上所有与artificial intelligence相关的论文所对应的全部div标签,我们只需要定位到class="art-list"的div标签,即可获得该标签所有的子标签,我们可以通过xpath来定位到class="art-list"的div标签
class="art-list"的div标签对应的xpath:/html/body/div[3]/div/div/main/div[2]/form/div[2]
class="art-list"的div标签的直接后代div标签(即class="art-list-item"的div标签)对应的xpath:/html/body/div[3]/div/div/main/div[2]/form/div[2]/div
红框框柱的title对应的a标签的xpath:/html/body/div[3]/div/div/main/div[2]/form/div[2]/div[1]/div/h2/a
a标签对应的href属性值为:paper.xpath("/html/body/div[3]/div/div/main/div[2]/form/div[2]/div[1]/div/h2/a/@href")[0]
对应的代码:
driver = webdriver.Firefox(executable_path='../driver/geckodriver.exe', firefox_profile=profile, options=options)
retrieve=input("请输入查询内容") # 输入内容最好是英文内容
retrieve = re.sub(r'\s', '+', retrieve)
iop_url = "https://iopscience.iop.org/nsearch?terms=" + retrieve
driver.get(url=iop_url)
time.sleep(2)
tree = etree.HTML(driver.page_source)
information = []
papers = tree.xpath("/html/body/div[2]/div/div/main/div[2]/form/div[2]/div")
for paper in papers:
title = paper.xpath("./div/h2/a/text()")[0]
href = 'https://iopscience.iop.org' + paper.xpath("./div/h2/a/@href")[0] + '/pdf'
title = re.sub(r"[.!+-=—,$%^,。?、~@#¥…&*《》<>「」{}【】()']", ' ', title)
title = re.sub(r'\n', '', title)
title = re.sub(r'\s\s+', " ", title)
title = title.strip()
print((title, href))
info = {'title': title, 'href': href}
information.append(info)
四、使用requests模块批量爬取论文
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'}
for info in information:
pdf_href = info['href']
paper = requests.get(url=pdf_href, headers=headers)
paper_title = './paper/' + info['title'] + '.pdf'
with open(paper_title, 'wb') as file:
file.write(paper.content)
print(f"{info['title']}爬取成功")
五、实现在IOPScience上根据论文标题爬取相应的论文完整代码
import tkinter
import time
from selenium import webdriver
# 实现无可视化界面的
from selenium.webdriver.firefox.options import Options
# 规避检测
from selenium.webdriver import FirefoxOptions
from selenium.webdriver import FirefoxProfile
import requests
import re
from bs4 import BeautifulSoup
import asyncio
from lxml import etree
import os
def start_crawl_papers():
    """Crawl paper PDFs from IOPScience matching the query typed into ``e1``.

    Reads the search terms from the module-level Tk entry ``e1`` and the
    number of papers to fetch from ``e2``, drives a Firefox webdriver through
    the paginated search results, collects ``{'title', 'href'}`` records, and
    finally downloads each PDF into ``./paper/`` with requests.  Errors and a
    completion message are appended to the module-level Text widget ``t``.
    """
    if not os.path.exists('./paper'):
        os.mkdir('./paper')
    # Headless operation (no visible browser window).
    firefox_options = Options()
    firefox_options.add_argument("--headless")
    firefox_options.add_argument("--disable-gpu")
    # Preferences that make the browser look less like an automated driver.
    options = FirefoxOptions()
    profile = FirefoxProfile()
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0'
    # profile.set_preference("network.proxy.type", 4)  # auto-detect proxy settings
    profile.set_preference("dom.webdriver.enabled", False)  # hide the webdriver flag
    profile.set_preference('useAutomationExtension', False)  # no automation extension
    profile.update_preferences()
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'}
    driver = webdriver.Firefox(executable_path='../driver/geckodriver.exe',
                               firefox_profile=profile, options=options)
    try:
        retrieve = re.sub(r'\s', '+', e1.get())
        iop_url = "https://iopscience.iop.org/nsearch?terms=" + retrieve
        driver.get(url=iop_url)
        time.sleep(2)
        information = []
        limit = int(e2.get())
        count = 0
        # "while count < limit" (instead of "while True") also terminates
        # cleanly when the user asks for 0 papers.
        while count < limit:
            # BUG FIX: re-parse the page source on every iteration.  The
            # original built the tree once before the loop, so each
            # "next page" click still scraped the stale first page.
            tree = etree.HTML(driver.page_source)
            papers = tree.xpath("/html/body/div[2]/div/div/main/div[2]/form/div[2]/div")
            index = 0  # entries consumed on the current result page
            for paper in papers:
                try:
                    title = paper.xpath("./div/h2/a/text()")[0]
                    href = ('https://iopscience.iop.org'
                            + paper.xpath("./div/h2/a/@href")[0] + '/pdf')
                    # BUG FIX: '-' is escaped.  The original class contained
                    # "+-=", an accidental range U+002B..U+003D that also
                    # wiped the digits 0-9 (and ,./:;<) out of every title.
                    title = re.sub(r"[.!+\-=—,$%^,。?、~@#¥…&*《》<>「」{}【】()']", ' ', title)
                    title = re.sub(r'\n', '', title)
                    title = re.sub(r'\s\s+', " ", title)
                    title = title.strip()
                    print((title, href))
                    information.append({'title': title, 'href': href})
                except Exception as e:
                    # Report the bad entry but keep crawling.  BUG FIX: the
                    # original called driver.quit() here and then kept using
                    # the dead driver on the next iteration.
                    t.insert('insert', str(e))
                index += 1
                count += 1
                if count == limit:
                    break
            if index == 10 and count != limit:
                # A full result page (10 entries) was consumed: advance to
                # the next page, then wait for it to load before re-parsing.
                driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
                # NOTE(review): selenium-3 API, matching executable_path above.
                driver.find_element_by_id('navsubmitnext').click()
                time.sleep(2)
            else:
                break
        # Download every collected PDF with plain requests.
        for info in information:
            pdf_href = info['href']
            paper = requests.get(url=pdf_href, headers=headers)
            paper_title = './paper/' + info['title'] + '.pdf'
            with open(paper_title, 'wb') as file:
                file.write(paper.content)
            print(f"{info['title']}爬取成功")
        t.insert('insert', '爬取结束\n')  # show completion in the log widget
    finally:
        # BUG FIX: always release the browser, even when an error escapes.
        driver.quit()
# --- Tkinter front end --------------------------------------------------
# Small control window: two labelled entry fields (search text and paper
# count), a start button wired to start_crawl_papers, a log Text widget,
# and a quit button.  e1, e2 and t stay module-level names because
# start_crawl_papers reads from / writes to them.
window = tkinter.Tk()
window.title('my window')
window.geometry('400x400')

tkinter.Label(window, text='输入待爬取的文本').pack()
e1 = tkinter.Entry(window, width=200)  # search-terms entry
e1.pack()

tkinter.Label(window, text='输入爬取的论文数目').pack()
e2 = tkinter.Entry(window, width=200)  # paper-count entry
e2.pack()

tkinter.Button(window, text="开始爬取", command=start_crawl_papers).pack()

# Log output area ('normal' keeps it editable so insert() works).
t = tkinter.Text(window,
                 state='normal',
                 width=200, height=10)
t.pack()

tkinter.Button(window, text='退出', command=window.quit).pack()

window.mainloop()