Python爬取计算机领域文献并输出为csv表格

最新推荐文章于 2023-11-20 10:37:20 发布

神经元2020

最新推荐文章于 2023-11-20 10:37:20 发布

阅读量458

点赞数 1

文章标签： python csv

本文链接：https://blog.csdn.net/qq_46613531/article/details/107123095

版权

面试的导师是做图像处理的，面试后让我写一个程序，检索近几年顶会中与图像去雾相关的论文并输出成表格，于是我选择了 https://dblp.uni-trier.de/ 和 http://openaccess.thecvf.com/ 两个网站进行爬取。

import requests
import re
import csv
from requests.exceptions import RequestException

# Conference / journal names to search on dblp. Each venue appears once so
# that no paper is fetched (and written to the CSV) twice.
huiyi = ['CVPR', 'ECCV', 'ICIP', 'ICCV']
# Search keyword.
keyword = 'dehazing'
# Starting year of the search.
year = 2014

#获取一个网页的所有信息
def Get_html(url):
    """Fetch ``url`` and return the response, or ``None`` on failure.

    The request is sent with a browser-like ``User-Agent`` header and a
    10 second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The ``requests`` response (decoded as UTF-8) when the server answers
        with status 200; ``None`` when the status differs or the request
        raises a ``RequestException``. The reason is printed in both cases.
    """
    kv = {'User-Agent': 'Mozilla/5.0'}  # browser-like request header
    try:
        response = requests.get(url=url, headers=kv, timeout=10)
    except RequestException as err:
        # Report the failure instead of swallowing it silently; callers
        # still receive None exactly as before.
        print('request failed for {}: {}'.format(url, err))
        return None
    if response.status_code != 200:
        print('response.status_code =={}'.format(response.status_code))
        return None
    response.encoding = 'utf-8'
    return response


#获取每个文章对应的url链接
def Get_urllist(huiyi, keyword):
    """Collect the dblp XML record URLs of all papers matching the search.

    For every venue the dblp search page is fetched first to read the number
    of matches, then the incremental result listing is requested with that
    number as page size and every XML record link in it is extracted.

    Args:
        huiyi: Iterable of venue names.
        keyword: Search keyword, inserted into the query URL as-is.

    Returns:
        A list of record URLs in the order found, without duplicates.
        Venues whose pages cannot be fetched or parsed are skipped with a
        printed message instead of aborting the whole search.
    """
    count_pattern = re.compile(
        'data-matches="(.*?)"></div></div></div><div id="semantic-scholar-results" ')
    record_pattern = re.compile('(https://dblp.uni-trier.de/rec/xml/conf/.*?xml)')

    record_urls = []
    seen = set()
    for venue in huiyi:
        query = keyword + '%20venue%3A' + venue + '%3A'

        # Step 1: the search page carries the number of matching papers.
        search_page = Get_html('https://dblp.uni-trier.de/search?q=' + query)
        if search_page is None:
            print('skipping {}: search page could not be fetched'.format(venue))
            continue
        counts = count_pattern.findall(search_page.text)
        if not counts:
            print('skipping {}: match count not found'.format(venue))
            continue

        # Step 2: request all matches at once (h = number of hits).
        listing = Get_html('https://dblp.uni-trier.de/search/publ/inc?q=' + query
                           + '&h=' + counts[0] + '&f=0&s=ydvspc')
        if listing is None:
            print('skipping {}: result listing could not be fetched'.format(venue))
            continue

        for record_url in record_pattern.findall(listing.text):
            # A record can show up more than once (e.g. a venue listed
            # twice); keep only the first occurrence.
            if record_url not in seen:
                seen.add(record_url)
                record_urls.append(record_url)
    return record_urls


#将openaccess文章主页中的pdf下载链接获取出来
def Get_download_url(oaurl):
    """Return the PDF download link found on an openaccess.thecvf.com page.

    Args:
        oaurl: URL of the paper's page on the open-access site.

    Returns:
        The absolute PDF URL, built from the first relative ``pdf`` link on
        the page.

    Raises:
        ConnectionError: The page could not be fetched.
        IndexError: The page contains no matching ``pdf`` link.
    """
    r = Get_html(oaurl)
    if r is None:
        raise ConnectionError('could not fetch open-access page: {}'.format(oaurl))
    # Raw string: the pattern contains backslash escapes meant for the regex
    # engine, not for the Python string literal.
    download_url = re.findall(r'<a href="\.\./\.\./(.*?)">pdf</a>', r.text)
    if not download_url:
        raise IndexError('no pdf link found on {}'.format(oaurl))
    return 'http://openaccess.thecvf.com/' + download_url[0]

#如果xml中没有免费源，去openaccess的官网寻找
def Get_oaurl(title,book,year):
    """Look up a paper's page on openaccess.thecvf.com by its title.

    Args:
        title: Paper title; its last character is dropped before matching
            (presumably the trailing period of dblp titles -- confirm).
        book: Conference name, e.g. ``'ICCV'``.
        year: Conference year as a string, e.g. ``'2019'``.

    Returns:
        The absolute URL of the first link whose text equals the title.

    Raises:
        ConnectionError: The conference listing could not be fetched.
        IndexError: No link with that title exists in the listing.
    """
    listing_url = 'http://openaccess.thecvf.com/' + book + year + '.py'
    # Go through Get_html like every other request so that the same headers
    # and timeout apply.
    r = Get_html(listing_url)
    if r is None:
        raise ConnectionError('could not fetch conference listing: {}'.format(listing_url))
    # Escape the title: characters such as "(", "?" or "+" must match
    # literally instead of being interpreted as regex syntax.
    pattern = '><a href="(.*?)">' + re.escape(title[:-1]) + '</a>'
    oaurl = re.findall(pattern, r.text)
    if not oaurl:
        raise IndexError('title not found in {}'.format(listing_url))
    return 'http://openaccess.thecvf.com/' + oaurl[0]

#获取一条文章的信息
def Get_one_infor(url):
    """Extract one paper's metadata from its dblp XML record.

    Args:
        url: URL of the dblp XML record.

    Returns:
        ``[title, author, booktitle, year, download_url]``. ``author`` is the
        concatenation of all author names, each prefixed with ``'/'``.
        ``download_url`` starts with ``'[免费]'`` for a free link and with
        ``'[付费]'`` for the publisher link.

    Raises:
        ConnectionError: The record could not be fetched.
        IndexError: The record lacks a title, booktitle or year.
    """
    r = Get_html(url)
    if r is None:
        raise ConnectionError('could not fetch record: {}'.format(url))
    xml = r.text

    title = re.findall('<title>(.*?)</title>', xml)
    # The pattern drops digits that directly precede the closing tag.
    authors = re.findall('<author>([^0-9]*?)[0-9]*?</author>', xml)
    books = re.findall('<booktitle>(.*?)</booktitle>', xml)
    years = re.findall('<year>(.*?)</year>', xml)
    oaurl = re.findall('<ee type="oa">(.*?)</ee>', xml)   # open-access link, if any
    eeurl = re.findall('<ee>(.*?)</ee>', xml)             # publisher link, if any
    if not (title and books and years):
        raise IndexError('record is missing title, booktitle or year: {}'.format(url))

    author = ''.join('/' + name for name in authors)

    # Errors that mean "the free PDF could not be resolved": network/HTTP
    # failures, a page without the expected link, or an unusable pattern.
    lookup_errors = (OSError, LookupError, AttributeError, re.error)
    # Publisher link used when no free source works; empty if the record
    # has no plain <ee> element.
    paid_link = '[付费]' + (eeurl[0] if eeurl else '')

    if oaurl:
        try:
            download_url = '[免费]' + Get_download_url(oaurl[0])
        except lookup_errors:
            # The PDF link could not be extracted; fall back to the
            # open-access page itself rather than aborting.
            download_url = '[免费]' + oaurl[0]
    elif books[0] + years[0] in ['ICCV2019', 'ICCV2017', 'ICCV2015', 'WACV2020']:
        # No free source in the record: try the open-access site for the
        # conferences listed above.
        try:
            oaurl2 = Get_oaurl(title[0], books[0], years[0])
            download_url = '[免费]' + Get_download_url(oaurl2)
        except lookup_errors:
            download_url = paid_link
    else:
        download_url = paid_link

    return [title[0], author, books[0], years[0], download_url]


#主程序开始
if __name__ == '__main__':
    # Collect the record URLs of every paper matching the search settings.
    url_list = Get_urllist(huiyi, keyword)
    j = 0  # number of rows written so far

    # Open the CSV once in write mode so that re-running the script does not
    # append a second header and duplicate rows. errors='replace' keeps a
    # character that GBK cannot encode from aborting the run.
    with open(file=keyword + '.csv', mode='w', encoding="gbk",
              errors='replace', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['标题','作者','会议','年份','下载链接'])

        for url in url_list:
            try:
                infor = Get_one_infor(url)
                recent = int(infor[3]) > year      # keep only papers after `year`
            except Exception as err:
                # Top-level boundary: report the bad record and carry on
                # with the remaining ones.
                print('跳过 {}: {}'.format(url, err))
                continue
            if recent:
                j = j + 1
                writer.writerow(infor)
                f.flush()                          # persist each row as it is produced
                print('第{}条数据加载完成'.format(j))
    print('全部加载完成')

输出结果

在这里插入图片描述

神经元2020

关注

1
点赞
踩
10

收藏

觉得还不错? 一键收藏
0
评论
Python爬取计算机领域文献并输出为csv表格

面试的导师是做图像处理的，面试后让我写一个程序，检索近几年顶会中与图像去雾相关的论文并输出成表格，于是我选择了 https://dblp.uni-trier.de/ 和 http://openaccess.thecvf.com/ 两个网站进行爬取。
复制链接

扫一扫