I needed paper metadata for certain disciplines and topics to run some analysis; arXiv is open, so the data is easy to get.
My first thought was to use the arXiv API (see the manual 👉arXiv-api),
but every run seemed to stall at 3,000 records, so I fell back to scraping the search pages directly with the classic lxml + bs4 combo,
which is still limited to 10,000 records.
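For reference, the API route I first tried looks roughly like this (a minimal sketch, assuming the `feedparser` package is installed; the query string `all:GAN` and the parameter values are illustrative, not the exact query used below):

```python
import feedparser

# arXiv's public Atom API endpoint; results are paged via start / max_results
API_URL = (
    "http://export.arxiv.org/api/query"
    "?search_query=cat:cs.LG+AND+all:GAN"   # illustrative query only
    "&start=0&max_results=200"
    "&sortBy=submittedDate&sortOrder=descending"
)

feed = feedparser.parse(API_URL)
for entry in feed.entries:
    print(entry.title)
    print([a.name for a in entry.authors])
    print(entry.published)       # submission date
    print(entry.summary[:100])   # abstract
```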
Here is the full scraping code; explanations follow after it.
```python
from lxml import html
import requests
import re
import math
import csv
import time
from bs4 import BeautifulSoup


def get_total_results(url):
    """Fetch the search page and extract the total number of results."""
    response = requests.get(url, timeout=30)
    tree = html.fromstring(response.content)
    # The header reads e.g. "Showing 1–200 of 1,645 results"
    result_string = ''.join(
        tree.xpath('//*[@id="main-container"]/div[1]/div[1]/h1/text()')
    ).strip()
    match = re.search(r'of ([\d,]+) results', result_string)
    if match:
        return int(match.group(1).replace(',', ''))
    print("No result count found.")
    return 0


def get_paper_info(url):
    """Scrape one results page and return a list of paper dicts."""
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    papers = []
    for article in soup.find_all('li', class_='arxiv-result'):
        title = article.find('p', class_='title').text.strip()
        authors_text = article.find('p', class_='authors').text.replace('Authors:', '').strip()
        authors = [author.strip() for author in authors_text.split(',')]
        # The full abstract span ends with a "△ Less" toggle link; drop it if present
        abstract = article.find('span', class_='abstract-full').text.replace('△ Less', '').strip()
        submitted = article.find('p', class_='is-size-7').text.strip()
        submission_date = submitted.split(';')[0].replace('Submitted', '').strip()
        pdf_link_element = article.find('a', string='pdf')
        pdf_link = pdf_link_element['href'] if pdf_link_element else 'No PDF link found'
        papers.append({
            'title': title,
            'authors': authors,
            'abstract': abstract,
            'submission_date': submission_date,
            'pdf_link': pdf_link,
        })
    return papers


def save_to_csv(papers, filename):
    """Write all scraped papers to a CSV file."""
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['title', 'authors', 'abstract', 'submission_date', 'pdf_link']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for paper in papers:
            writer.writerow(paper)


# Main program
base_url = "https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=GAN&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&classification-include_cross_list=include&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size=200&order=-announced_date_first"

total_results = get_total_results(base_url + "&start=0")
pages = math.ceil(total_results / 200)

all_papers = []
for page in range(pages):
    start = page * 200
    print(f"Crawling page {page+1}/{pages}, start={start}")
    page_url = base_url + f"&start={start}"
    all_papers.extend(get_paper_info(page_url))
    time.sleep(3)  # pause for three seconds to avoid hammering the server

# Save to CSV
save_to_csv(all_papers, '1645.csv')
print(f"Done! Scraped {len(all_papers)} records in total, saved to 1645.csv.")
```
- `get_total_results` fetches the total number of hits, which is used to compute the page count and advance `start` (I use 200 items per page).
- The fields scraped here are the paper's title, authors, abstract, submission date, and PDF link (the PDF links are for full-text analysis later).
- The URL is an arXiv advanced-search URL, since I only want CS papers. Just run an advanced search on the arXiv site and copy the resulting URL.
- Paging is done by adjusting `start`; again, 200 per page here.
- The results are finally saved to a CSV file.
- To use more search parameters, set them in the advanced search on the site, run the search, copy the URL, and update `start` as before. The arXiv API documentation also describes more parameter options.
- `start` does not have to be a multiple of 200 (200 items per page); it can be any value. So if you need more than 10,000 records, you can reset `start` and rerun to continue scraping, or add a simple check in the code (see the sketch below).
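Such a check might look like this (a minimal sketch that reuses `total_results`, `base_url`, `get_paper_info`, and `save_to_csv` from the script above; `START_OFFSET` and `DISPLAY_CAP` are names I made up for this illustration):

```python
START_OFFSET = 0      # set to where the previous run stopped to resume scraping
DISPLAY_CAP = 10_000  # the search pages stop returning results around here
PAGE_SIZE = 200

all_papers = []
for start in range(START_OFFSET, min(total_results, DISPLAY_CAP), PAGE_SIZE):
    print(f"Crawling start={start}")
    page_papers = get_paper_info(base_url + f"&start={start}")
    if not page_papers:   # empty page: we've hit the cap or the end, stop early
        break
    all_papers.extend(page_papers)
    time.sleep(3)

save_to_csv(all_papers, f'papers_from_{START_OFFSET}.csv')
```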