A paper downloader implemented in Python

(I) Web crawler: collecting article DOIs

(1) The csv writer
# 1. Create the file object
f = open('D:\\kon_data\\ase.csv', 'w', newline="", encoding='utf-8')
csv_writer = csv.writer(f)
# 2. Write the header row
csv_writer.writerow(["Conference", "Year", "Proceedings", "Title", "DOI"])
# 3. Write one record per paper
csv_writer.writerow([conf, m_year, m_set, m_name, m_doi])
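
The snippet above leaves the file open until f.close() is called at the end of paper_doi.py. A minimal alternative sketch using a with block, which closes the file automatically even if the crawl raises an exception (the row values here are placeholders, not real data):

import csv

with open('D:\\kon_data\\ase.csv', 'w', newline="", encoding='utf-8') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["Conference", "Year", "Proceedings", "Title", "DOI"])
    # placeholder record; the crawler fills in real values per paper
    csv_writer.writerow(["ASE", "2020", "ASE 2020", "An Example Paper Title",
                         "https://doi.org/10.0000/0000000"])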
(2) bs4 (BeautifulSoup)
# 1. Fetch the page and parse it
response = requests.get(URL)
content = BeautifulSoup(response.content.decode("utf-8"), "lxml")

# 2. Extract elements (adjust the selectors to the actual HTML)
for tag in content.find_all('cite', class_='data tts-content'):
    # proceedings title
    m_set = tag.find('span', class_='title').get_text()
    # publication year
    m_year = tag.find('span', itemprop='datePublished').get_text()
    # link to the table of contents of this proceedings
    m_url = tag.find('a', class_='toc-link').get('href')
    # ...
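
The class names used above (data tts-content, title, toc-link, datePublished) come from DBLP's markup at the time of writing and may change. One way to sanity-check the selectors is to run them against a small hand-written fragment; the HTML below is a simplified sketch of the structure the code assumes, not DBLP's exact markup:

from bs4 import BeautifulSoup

# Simplified sketch of the DBLP structure the selectors assume
html = """
<cite class="data tts-content">
  <span class="title">ASE 2020: 35th IEEE/ACM International Conference on Automated Software Engineering</span>
  <span itemprop="datePublished">2020</span>
  <a class="toc-link" href="https://dblp.org/db/conf/kbse/ase2020.html">contents</a>
</cite>
"""
content = BeautifulSoup(html, "lxml")
tag = content.find('cite', class_='data tts-content')
print(tag.find('span', class_='title').get_text())             # proceedings title
print(tag.find('span', itemprop='datePublished').get_text())   # year
print(tag.find('a', class_='toc-link').get('href'))            # TOC link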

(II) Downloading papers by DOI

(3) Downloading the PDF
def getPaperPdf(year, set, name, url):
    # Match relative links ending in ".pdf" in the Sci-Hub page source
    pattern = r'/.*?\.pdf'
    headers = {
        'Cookie': 'OCSSID=4df0bjva6j7ejussu8al3eqo03',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }

    content = requests.get(url, headers=headers)
    download_url = re.findall(pattern, content.text)
    print("url=", url)
    print("download_url=", download_url)

    # Prefer a protocol-relative link ("//...") if one is present
    link_index = 0
    for i, res in enumerate(download_url):
        print(res, i)
        if res.startswith("//"):
            link_index = i
            break

    download_file = "https:" + download_url[link_index]
    print("download_file=", download_file)

    # Save under /ase/<year>/<proceedings>
    path = "/ase/" + year + "/" + set
    if not os.path.exists(path):
        os.makedirs(path)

    r = requests.get(download_file, stream=True, headers=headers)
    print("r status code=", r.status_code)

    file_name = path + "/" + name + ".pdf"
    print(file_name)
    with open(file_name, 'wb') as f:
        f.write(r.content)
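
The function above passes stream=True but still reads the whole response into memory via r.content. A sketch of a chunked variant for large PDFs (download_pdf is a hypothetical helper name; the headers, URL, and file name are passed in by the caller):

import requests

def download_pdf(download_file, file_name, headers):
    # Chunked write; avoids holding the whole PDF in memory at once
    r = requests.get(download_file, stream=True, headers=headers, timeout=60)
    r.raise_for_status()  # fail loudly instead of writing an HTML error page as a .pdf
    with open(file_name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)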
(4) Keeping file names valid
def validateTitle(title):
    rstr = r"[\/\\\:\*\?\"\<\>\|\.]"  # / \ : * ? " < > | .
    new_title = re.sub(rstr, "_", title)  # replace illegal characters with underscores
    new_title = new_title.strip()
    return new_title
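
For example, a title containing characters that Windows forbids in file names comes out like this:

title = 'Static vs. Dynamic Analysis: What/Why/How?'
print(validateTitle(title))   # Static vs_ Dynamic Analysis_ What_Why_How_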

[Complete code]
paper_doi.py

import requests
from bs4 import BeautifulSoup
import csv
conf="ASE"
URL = "https://dblp.org/db/conf/kbse/index.html"
# 1. Create the file object
f = open('D:\\kon_data\\ase.csv', 'w', newline="", encoding='utf-8')
csv_writer = csv.writer(f)
# 2. Write the header row
csv_writer.writerow(["Conference", "Year", "Proceedings", "Title", "DOI"])
response = requests.get(URL)
# print(response.content)
content = BeautifulSoup(response.content.decode("utf-8"), "lxml")

for tag in content.find_all('cite', class_='data tts-content'):
    # proceedings title
    m_set = tag.find('span', class_='title').get_text()
    # publication year
    m_year = tag.find('span', itemprop='datePublished').get_text()
    # link to the table of contents of this proceedings
    m_url = tag.find('a', class_='toc-link').get('href')
    if int(m_year) >= 2018:
        # print(m_name, m_year, m_url)
        response = requests.get(m_url)
        contents_paper = BeautifulSoup(response.content.decode("utf-8"), "lxml")

        # each <li class="entry inproceedings"> is one paper in the proceedings
        for tag_publ in contents_paper.find_all('li', class_='entry inproceedings'):
            # paper title and DOI link
            m_name = tag_publ.find('cite', class_='data tts-content').find('span', class_='title').get_text()
            m_doi = tag_publ.find('nav', class_='publ').find('div', class_='head').find('a').get('href')
            print(m_name, '-----m_doi=', m_doi)
            # write one row to the csv
            csv_writer.writerow([conf, m_year, m_set, m_name, m_doi])
    else:
        # stop once a proceedings older than 2018 appears (the index is assumed to list newest first)
        break
f.close()
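
Before running the downloader it can help to confirm that the CSV was written as expected; a quick check with pandas (the same read_csv call doi_pdf.py uses):

import pandas as pd

data = pd.read_csv("D:\\kon_data\\ase.csv", header=0)
print(data.shape)   # (number of papers, 5)
print(data.head())  # Conference, Year, Proceedings, Title, DOI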

doi_pdf.py

import os
import re
import requests
import pandas as pd
def getPaperPdf(year, set, name, url):
    # Match relative links ending in ".pdf" in the Sci-Hub page source
    pattern = r'/.*?\.pdf'
    headers = {
        'Cookie': 'OCSSID=4df0bjva6j7ejussu8al3eqo03',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    content = requests.get(url, headers=headers)
    download_url = re.findall(pattern, content.text)
    print("url=", url)
    print("download_url=", download_url)
    # Prefer a protocol-relative link ("//...") if one is present
    link_index = 0
    for i, res in enumerate(download_url):
        print(res, i)
        if res.startswith("//"):
            link_index = i
            break
    download_file = "https:" + download_url[link_index]
    print("download_file=", download_file)
    # Save under D:/kon_data/ase/<year>/<proceedings>
    path = "D:/kon_data/ase/" + year + "/" + set
    if not os.path.exists(path):
        os.makedirs(path)
    r = requests.get(download_file, stream=True, headers=headers)
    print("r status code=", r.status_code)
    file_name = path + "/" + name + ".pdf"
    print(file_name)
    with open(file_name, 'wb') as f:
        f.write(r.content)
    print("Successfully downloaded " + file_name)
def validateTitle(title):
    rstr = r"[\/\\\:\*\?\"\<\>\|\.]"  # / \ : * ? " < > | .
    new_title = re.sub(rstr, "_", title)  # replace illegal characters with underscores
    new_title = new_title.strip()
    return new_title

if __name__ == '__main__':

    sci_Hub_Url = "https://sci-hub.ren/"
    data = pd.read_csv("D:\\kon_data\\ase.csv", header=0)
    print(data.head())
    failedpdf = []
    for indexs in data.index:
        year = data.loc[indexs].values[1]
        # proceedings: keep a short name (text before the first comma)
        set = data.loc[indexs].values[2]
        proc = set.split('/')
        if len(proc) > 1:
            set = proc[1].split(',')[0]
        else:
            set = set.split(',')[0]
        # title: replace characters that are illegal in file names, then truncate
        name = validateTitle(data.loc[indexs].values[3])
        if len(name) > 90:
            name = name[0:90]
        # doi: keep only the "10.xxxx/yyyy" suffix of the doi.org URL
        doi = data.loc[indexs].values[4].split('/')[3] + '/' + data.loc[indexs].values[4].split('/')[4]
        paper_url = sci_Hub_Url + doi
        # download the pdf from Sci-Hub via the constructed URL
        try:
            getPaperPdf(str(year), str(set), str(name), paper_url)
        except Exception:
            failedpdf.append(str(year) + "@@@@" + str(set) + "@@@@" + str(name) + "@@@@" + str(paper_url))
            print("Failed to get pdf")

    # record the papers that could not be downloaded
    file_name = "D:\\kon_data\\error.txt"
    f = open(file_name, "w", encoding='utf-8')
    for line in failedpdf:
        f.write(line + '\n')
    f.close()
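
The loop in doi_pdf.py fires one Sci-Hub request per paper back to back, which a mirror may throttle or block. A sketch of pausing between downloads (the time.sleep call and the 3-second interval are assumptions added here, not part of the original script):

import time

# inside the "for indexs in data.index" loop, after each download attempt:
try:
    getPaperPdf(str(year), str(set), str(name), paper_url)
except Exception:
    failedpdf.append(str(year) + "@@@@" + str(set) + "@@@@" + str(name) + "@@@@" + str(paper_url))
    print("Failed to get pdf")
time.sleep(3)  # assumed 3-second pause between requests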