(I) Web crawler: collecting article DOIs
(1) The csv writer object
import csv

# 1. Create the file object
f = open('D:\\kon_data\\ase.csv', 'w', newline="", encoding='utf-8')
csv_writer = csv.writer(f)
# 2. Write the header row
csv_writer.writerow(["conference", "year", "proceedings", "title", "doi"])
# 3. Write one record per paper
csv_writer.writerow([conf, m_year, m_set, m_name, m_doi])
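The same block can also be written with a context manager, so the file is closed even if a write fails partway through; a minimal sketch with a sample record (the row values here are made up for illustration):

import csv

# Sketch: same CSV output, but the file is closed automatically.
header = ["conference", "year", "proceedings", "title", "doi"]
row = ["ASE", "2020", "ASE 2020", "An Example Paper Title",
       "https://doi.org/10.1145/0000000.0000000"]  # sample data only
with open('D:\\kon_data\\ase.csv', 'w', newline="", encoding='utf-8') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(header)
    csv_writer.writerow(row)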
(2) bs4 (BeautifulSoup)
import requests
from bs4 import BeautifulSoup

# 1. Fetch and parse the page
response = requests.get(URL)
content = BeautifulSoup(response.content.decode("utf-8"), "lxml")
# 2. Extract elements (adjust the selectors to the actual HTML)
for tag in content.find_all('cite', class_='data tts-content'):
    # proceedings title
    m_set = tag.find('span', class_='title').get_text()
    # publication year
    m_year = tag.find('span', itemprop='datePublished').get_text()
    # link to the table of contents
    m_url = tag.find('a', class_='toc-link').get('href')
    # .......
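To see what these selectors match, here is a self-contained sketch run against a hardcoded fragment shaped like a dblp `cite` element (the fragment is illustrative; the live markup on dblp.org may differ and can change over time):

from bs4 import BeautifulSoup

# Illustrative fragment only; real dblp pages may use different markup.
html = '''
<cite class="data tts-content">
  <span class="title">Proceedings of an example conference</span>
  <span itemprop="datePublished">2020</span>
  <a class="toc-link" href="https://dblp.org/db/conf/kbse/example.html">contents</a>
</cite>
'''
content = BeautifulSoup(html, "lxml")
for tag in content.find_all('cite', class_='data tts-content'):
    print(tag.find('span', class_='title').get_text())            # proceedings title
    print(tag.find('span', itemprop='datePublished').get_text())  # 2020
    print(tag.find('a', class_='toc-link').get('href'))           # table-of-contents URL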
(II) Downloading articles by DOI
(3) Downloading the PDF
import os
import re
import requests

def getPaperPdf(year, proc_set, name, url):
    # Sci-Hub embeds the PDF link somewhere in the page; grab candidate ".pdf" paths
    pattern = r'/.*?\.pdf'
    headers = {
        'Cookie': 'OCSSID=4df0bjva6j7ejussu8al3eqo03',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                      ' (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    content = requests.get(url, headers=headers)
    download_url = re.findall(pattern, content.text)
    print("url=", url)
    print("download_url=", download_url)
    # prefer a protocol-relative link ("//host/...") when one is present
    link_index = 0
    for i, res in enumerate(download_url):
        print(res, i)
        if res.startswith("//"):
            link_index = i
            break
    download_file = "https:" + download_url[link_index]
    print("download_file=", download_file)
    path = "D:/kon_data/ase/" + year + "/" + proc_set
    os.makedirs(path, exist_ok=True)
    r = requests.get(download_file, stream=True, headers=headers)
    print("r status code=", r.status_code)
    file_name = path + "/" + name + ".pdf"
    print(file_name)
    with open(file_name, 'wb') as f:
        f.write(r.content)
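Since the request already passes `stream=True`, the body can be written to disk in chunks instead of buffering the whole PDF in memory through `r.content`; a sketch of that variant (`save_pdf` is a hypothetical helper, not part of the script above):

import requests

def save_pdf(download_file, file_name, headers=None):
    # Hypothetical helper: stream the response to disk in 8 KiB chunks.
    with requests.get(download_file, stream=True, headers=headers, timeout=30) as r:
        r.raise_for_status()  # fail loudly instead of saving an HTML error page
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)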
(4) Ensuring the file name is valid
import re

def validateTitle(title):
    # characters illegal in Windows file names: / \ : * ? " < > | (plus '.')
    rstr = r"[\/\\\:\*\?\"\<\>\|\.]"
    new_title = re.sub(rstr, "_", title)  # replace each with an underscore
    return new_title.strip()
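A quick check of what the sanitizer does (assuming `validateTitle` from above is in scope):

print(validateTitle('C/C++: How? <Fast> I.O'))
# -> C_C++_ How_ _Fast_ I_O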
【Complete code】
paper_doi.py
import csv

import requests
from bs4 import BeautifulSoup

conf = "ASE"
URL = "https://dblp.org/db/conf/kbse/index.html"

# 1. Create the file object
f = open('D:\\kon_data\\ase.csv', 'w', newline="", encoding='utf-8')
csv_writer = csv.writer(f)
# 2. Write the header row
csv_writer.writerow(["conference", "year", "proceedings", "title", "doi"])

response = requests.get(URL)
# print(response.content)
content = BeautifulSoup(response.content.decode("utf-8"), "lxml")
for tag in content.find_all('cite', class_='data tts-content'):
    # proceedings title
    m_set = tag.find('span', class_='title').get_text()
    # publication year
    m_year = tag.find('span', itemprop='datePublished').get_text()
    m_url = tag.find('a', class_='toc-link').get('href')
    if int(m_year) >= 2018:
        # print(m_name, m_year, m_url)
        response = requests.get(m_url)
        contents_paper = BeautifulSoup(response.content.decode("utf-8"), "lxml")
        # follow the contents link; each paper is an <li class="entry inproceedings">
        for tag_publ in contents_paper.find_all('li', class_='entry inproceedings'):
            # paper title and DOI link
            m_name = tag_publ.find('cite', class_='data tts-content').find('span', class_='title').get_text()
            m_doi = tag_publ.find('nav', class_='publ').find('div', class_='head').find('a').get('href')
            print(m_name, '-----m_doi=', m_doi)
            # write the record to the CSV
            csv_writer.writerow([conf, m_year, m_set, m_name, m_doi])
    else:
        # dblp lists proceedings newest first, so stop at the first pre-2018 entry
        break
f.close()
doi_pdf.py
import os
import re

import requests

def getPaperPdf(year, proc_set, name, url):
    # Sci-Hub embeds the PDF link somewhere in the page; grab candidate ".pdf" paths
    pattern = r'/.*?\.pdf'
    headers = {
        'Cookie': 'OCSSID=4df0bjva6j7ejussu8al3eqo03',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                      ' (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    content = requests.get(url, headers=headers)
    download_url = re.findall(pattern, content.text)
    print("url=", url)
    print("download_url=", download_url)
    # prefer a protocol-relative link ("//host/...") when one is present
    link_index = 0
    for i, res in enumerate(download_url):
        print(res, i)
        if res.startswith("//"):
            link_index = i
            break
    download_file = "https:" + download_url[link_index]
    print("download_file=", download_file)
    path = "D:/kon_data/ase/" + year + "/" + proc_set
    os.makedirs(path, exist_ok=True)
    r = requests.get(download_file, stream=True, headers=headers)
    print("r status code=", r.status_code)
    file_name = path + "/" + name + ".pdf"
    print(file_name)
    with open(file_name, 'wb') as f:
        f.write(r.content)
    print("Successfully downloaded " + file_name)

def validateTitle(title):
    # characters illegal in Windows file names: / \ : * ? " < > | (plus '.')
    rstr = r"[\/\\\:\*\?\"\<\>\|\.]"
    new_title = re.sub(rstr, "_", title)  # replace each with an underscore
    return new_title.strip()
if __name__ == '__main__':
    import pandas as pd

    sci_Hub_Url = "https://sci-hub.ren/"
    data = pd.read_csv("D:\\kon_data\\ase.csv", header=0)
    print(data.head())
    failedpdf = []
    for idx in data.index:
        year = data.loc[idx].values[1]
        # shorten the proceedings name for use as a directory (cut at '/' and ',')
        proc_set = data.loc[idx].values[2]
        proc = proc_set.split('/')
        if len(proc) > 1:
            proc_set = proc[1].split(',')[0]
        else:
            proc_set = proc_set.split(',')[0]
        # sanitize illegal characters in the title and cap the file-name length
        name = validateTitle(data.loc[idx].values[3])
        if len(name) > 90:
            name = name[0:90]
        # keep only the DOI itself: the last two path segments of the doi.org URL
        doi = data.loc[idx].values[4].split('/')[3] + '/' + data.loc[idx].values[4].split('/')[4]
        paper_url = sci_Hub_Url + doi
        # download the PDF through the paper's Sci-Hub page
        try:
            getPaperPdf(str(year), str(proc_set), str(name), paper_url)
        except Exception:
            failedpdf.append(str(year) + "@@@@" + str(proc_set) + "@@@@" + str(name) + "@@@@" + str(paper_url))
            print("Failed to get pdf")
    # log the failures so they can be retried later
    file_name = "D:\\kon_data\\error.txt"
    with open(file_name, "w", encoding='utf-8') as f:
        for line in failedpdf:
            f.write(line + '\n')
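The `@@@@`-separated lines in error.txt keep enough information to retry later; a minimal hypothetical retry pass (assuming it is run next to doi_pdf.py so getPaperPdf can be imported):

from doi_pdf import getPaperPdf

# Hypothetical retry loop over the failure log written above.
with open("D:\\kon_data\\error.txt", encoding='utf-8') as f:
    for line in f:
        year, proc_set, name, paper_url = line.rstrip('\n').split("@@@@")
        try:
            getPaperPdf(year, proc_set, name, paper_url)
        except Exception:
            print("still failing:", paper_url)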