(I) Web crawler: collecting article DOIs
(1) The csv writer object
import csv

# 1. Create the file object
f = open('D:\\kon_data\\ase.csv', 'w', newline="", encoding='utf-8')
csv_writer = csv.writer(f)
# 2. Write the header row
csv_writer.writerow(["conference", "year", "proceedings", "title", "doi"])
# 3. Write one record per paper
csv_writer.writerow([conf, m_year, m_set, m_name, m_doi])
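The same block can also be written with a context manager, so the file is closed even if a write fails partway through; a minimal sketch with a sample record (the row values here are made up for illustration):

import csv

# Sketch: same CSV output, but the file is closed automatically.
header = ["conference", "year", "proceedings", "title", "doi"]
row = ["ASE", "2020", "ASE 2020", "An Example Paper Title",
       "https://doi.org/10.1145/0000000.0000000"]  # sample data only
with open('D:\\kon_data\\ase.csv', 'w', newline="", encoding='utf-8') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(header)
    csv_writer.writerow(row)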
(2) bs4 (BeautifulSoup)
import requests
from bs4 import BeautifulSoup

# 1. Fetch and parse the page
response = requests.get(URL)
content = BeautifulSoup(response.content.decode("utf-8"), "lxml")
# 2. Extract elements (adjust the selectors to the actual HTML)
for tag in content.find_all('cite', class_='data tts-content'):
    # proceedings title
    m_set = tag.find('span', class_='title').get_text()
    # publication year
    m_year = tag.find('span', itemprop='datePublished').get_text()
    # link to the table of contents
    m_url = tag.find('a', class_='toc-link').get('href')
    # .......
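To see what these selectors match, here is a self-contained sketch run against a hardcoded fragment shaped like a dblp `cite` element (the fragment is illustrative; the live markup on dblp.org may differ and can change over time):

from bs4 import BeautifulSoup

# Illustrative fragment only; real dblp pages may use different markup.
html = '''
<cite class="data tts-content">
  <span class="title">Proceedings of an example conference</span>
  <span itemprop="datePublished">2020</span>
  <a class="toc-link" href="https://dblp.org/db/conf/kbse/example.html">contents</a>
</cite>
'''
content = BeautifulSoup(html, "lxml")
for tag in content.find_all('cite', class_='data tts-content'):
    print(tag.find('span', class_='title').get_text())            # proceedings title
    print(tag.find('span', itemprop='datePublished').get_text())  # 2020
    print(tag.find('a', class_='toc-link').get('href'))           # table-of-contents URL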
(II) Downloading articles by DOI
(3) Downloading the PDF
import os
import re
import requests

def getPaperPdf(year, proc_set, name, url):
    # Sci-Hub embeds the PDF link somewhere in the page; grab candidate ".pdf" paths
    pattern = r'/.*?\.pdf'
    headers = {
        'Cookie': 'OCSSID=4df0bjva6j7ejussu8al3eqo03',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                      ' (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    content = requests.get(url, headers=headers)
    download_url = re.findall(pattern, content.text)
    print("url=", url)
    print("download_url=", download_url)
    # prefer a protocol-relative link ("//host/...") when one is present
    link_index = 0
    for i, res in enumerate(download_url):
        print(res, i)
        if res.startswith("//"):
            link_index = i
            break
    download_file = "https:" + download_url[link_index]
    print("download_file=", download_file)
    path = "D:/kon_data/ase/" + year + "/" + proc_set
    os.makedirs(path, exist_ok=True)
    r = requests.get(download_file, stream=True, headers=headers)
    print("r status code=", r.status_code)
    file_name = path + "/" + name + ".pdf"
    print(file_name)
    with open(file_name, 'wb') as f:
        f.write(r.content)
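Since the request already passes `stream=True`, the body can be written to disk in chunks instead of buffering the whole PDF in memory through `r.content`; a sketch of that variant (`save_pdf` is a hypothetical helper, not part of the script above):

import requests

def save_pdf(download_file, file_name, headers=None):
    # Hypothetical helper: stream the response to disk in 8 KiB chunks.
    with requests.get(download_file, stream=True, headers=headers, timeout=30) as r:
        r.raise_for_status()  # fail loudly instead of saving an HTML error page
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)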
(4) Ensuring the file name is valid
import re

def validateTitle(title):
    # characters illegal in Windows file names: / \ : * ? " < > | (plus '.')
    rstr = r"[\/\\\:\*\?\"\<\>\|\.]"
    new_title = re.sub(rstr, "_", title)  # replace each with an underscore
    return new_title.strip()
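A quick check of what the sanitizer does (assuming `validateTitle` from above is in scope):

print(validateTitle('C/C++: How? <Fast> I.O'))
# -> C_C++_ How_ _Fast_ I_O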
【Complete code】
paper_doi.py
import csv

import requests
from bs4 import BeautifulSoup

conf = "ASE"
URL = "https://dblp.org/db/conf/kbse/index.html"

# 1. Create the file object
f = open('D:\\kon_data\\ase.csv', 'w', newline="", encoding='utf-8')
csv_writer = csv.writer(f)
# 2. Write the header row
csv_writer.writerow(["conference", "year", "proceedings", "title", "doi"])

response = requests.get(URL)
# print(response.content)
content = BeautifulSoup(response.content.decode("utf-8"), "lxml")
for tag in content.find_all('cite', class_='data tts-content'):
    # proceedings title
    m_set = tag.find('span', class_='title').get_text()
    # publication year
    m_year = tag.find('span', itemprop='datePublished').get_text()
    m_url = tag.find('a', class_='toc-link').get('href')
    if int(m_year) >= 2018:
        # print(m_name, m_year, m_url)
        response = requests.get(m_url)
        contents_paper = BeautifulSoup(response.content.decode("utf-8"), "lxml")
        # follow the contents link; each paper is an <li class="entry inproceedings">
        for tag_publ in contents_paper.find_all('li', class_='entry inproceedings'):
            # paper title and DOI link
            m_name = tag_publ.find('cite', class_='data tts-content').find('span', class_='title').get_text()
            m_doi = tag_publ.find('nav', class_='publ').find('div', class_='head').find('a').get('href')
            print(m_name, '-----m_doi=', m_doi)
            # write the record to the CSV
            csv_writer.writerow([conf, m_year, m_set, m_name, m_doi])
    else:
        # dblp lists proceedings newest first, so stop at the first pre-2018 entry
        break
f.close()
doi_pdf.py
import os
import re

import requests

def getPaperPdf(year, proc_set, name, url):
    # Sci-Hub embeds the PDF link somewhere in the page; grab candidate ".pdf" paths
    pattern = r'/.*?\.pdf'
    headers = {
        'Cookie': 'OCSSID=4df0bjva6j7ejussu8al3eqo03',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                      ' (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    content = requests.get(url, headers=headers)
    download_url = re.findall(pattern, content.text)
    print("url=", url)
    print("download_url=", download_url)
    # prefer a protocol-relative link ("//host/...") when one is present
    link_index = 0
    for i, res in enumerate(download_url):
        print(res, i)
        if res.startswith("//"):
            link_index = i
            break
    download_file = "https:" + download_url[link_index]
    print("download_file=", download_file)
    path = "D:/kon_data/ase/" + year + "/" + proc_set
    os.makedirs(path, exist_ok=True)
    r = requests.get(download_file, stream=True, headers=headers)
    print("r status code=", r.status_code)
    file_name = path + "/" + name + ".pdf"
    print(file_name)
    with open(file_name, 'wb') as f:
        f.write(r.content)
    print("Successfully downloaded " + file_name)

def validateTitle(title):
    # characters illegal in Windows file names: / \ : * ? " < > | (plus '.')
    rstr = r"[\/\\\:\*\?\"\<\>\|\.]"
    new_title = re.sub(rstr, "_", title)  # replace each with an underscore
    return new_title.strip()
if __name__ == '__main__':
    import pandas as pd

    sci_Hub_Url = "https://sci-hub.ren/"
    data = pd.read_csv("D:\\kon_data\\ase.csv", header=0)
    print(data.head())
    failedpdf = []
    for idx in data.index:
        year = data.loc[idx].values[1]
        # shorten the proceedings name for use as a directory (cut at '/' and ',')
        proc_set = data.loc[idx].values[2]
        proc = proc_set.split('/')
        if len(proc) > 1:
            proc_set = proc[1].split(',')[0]
        else:
            proc_set = proc_set.split(',')[0]
        # sanitize illegal characters in the title and cap the file-name length
        name = validateTitle(data.loc[idx].values[3])
        if len(name) > 90:
            name = name[0:90]
        # keep only the DOI itself: the last two path segments of the doi.org URL
        doi = data.loc[idx].values[4].split('/')[3] + '/' + data.loc[idx].values[4].split('/')[4]
        paper_url = sci_Hub_Url + doi
        # download the PDF through the paper's Sci-Hub page
        try:
            getPaperPdf(str(year), str(proc_set), str(name), paper_url)
        except Exception:
            failedpdf.append(str(year) + "@@@@" + str(proc_set) + "@@@@" + str(name) + "@@@@" + str(paper_url))
            print("Failed to get pdf")
    # log the failures so they can be retried later
    file_name = "D:\\kon_data\\error.txt"
    with open(file_name, "w", encoding='utf-8') as f:
        for line in failedpdf:
            f.write(line + '\n')
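The `@@@@`-separated lines in error.txt keep enough information to retry later; a minimal hypothetical retry pass (assuming it is run next to doi_pdf.py so getPaperPdf can be imported):

from doi_pdf import getPaperPdf

# Hypothetical retry loop over the failure log written above.
with open("D:\\kon_data\\error.txt", encoding='utf-8') as f:
    for line in f:
        year, proc_set, name, paper_url = line.rstrip('\n').split("@@@@")
        try:
            getPaperPdf(year, proc_set, name, paper_url)
        except Exception:
            print("still failing:", paper_url)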