爬取论文并整理到 Excel 表格中的代码:
import requests
from lxml import etree
import openpyxl as op
if __name__ == '__main__':
for k in range(4):
#论文的网址:https://openaccess.thecvf.com/CVPR2022?
url2 = f"https://openaccess.thecvf.com/CVPR2022?day=2022-06-2{k+1}"
headers = {
'User-Agent': 'Mozilla/5.0 ( Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.84'
}
wb = op.Workbook()
sheet = wb.active
m = 2
sheet['A1'] = "title"
sheet['B1'] = 'name'
sheet["C1"] = "pdf_url"
page_text = requests.get(url=url2, headers=headers).text
parser = etree.HTMLParser(encoding="utf-8")
tree = etree.HTML(page_text, parser=parser)
all = tree.xpath('//*[@id="content"]/dl/dt')
j = 1
for i in range(3,len(all),2):
title = tree.xpath(f'//*[@id="content"]/dl/dt[{j}]/a/text()')[0]
name = tree.xpath(f'/ html / body / div[3] / dl / dd[{i-1}] / form[1] / a/text()')[0]
#论文pdf网址
pdf = 'https://openaccess.thecvf.com/'+tree.xpath(f'//*[@id="content"]/dl/dd[{i}]/a[1]/@href')[0]
j+=1
# print(title)
# print(name)
# print(pdf)
# print("====================")
sheet[f'A{m}'] = title
sheet[f'B{m}'] = name
sheet[f'C{m}'] = pdf
m+=1
#我这将论文分成了四份,需要弄成一份的话把下面一行缩进一下即可
wb.save(f'CVPR论文6月2{k+1}号.xlsx')
print(f"6月2{k+1}号")
爬取效果:
网址都是可以打开的。