Python爬取的论文

最新推荐文章于 2025-02-28 12:30:13 发布

阿黄Ahuang

最新推荐文章于 2025-02-28 12:30:13 发布

阅读量3.3k

点赞数 2

本文链接：https://blog.csdn.net/FZUMRWANG/article/details/82944100

版权

本文介绍如何使用Python的requests和BeautifulSoup库爬取CVPR2018会议的所有论文标题及摘要，并将结果保存至本地文件。通过Anaconda环境设置，逐步演示了爬虫程序的设计与实现过程。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Python小白爬虫入门实战

编写爬虫程序，爬取网站 http://openaccess.thecvf.com/CVPR2018.py 上 CVPR 2018 的所有论文（标题和摘要），并将结果输出至 result.txt
工具
- Anaconda
- Chrome浏览器
步骤一
- 安装Anaconda
- 安装并引入requests库和beautifulsoup4库
- 打开Anaconda中jupyter notebook
- 点击新建，选择python3编写爬虫程序

引入requests包获得网页响应内容

import requests #引入requests包
# Fetch the page with an HTTP GET request.
res = requests.get('https://news.sina.com.cn/')
# The Response attribute is spelled `encoding`. A misspelled name only creates
# an unused attribute and leaves the decoding of `res.text` unchanged.
# Setting it is needed for Chinese pages; pure-English pages can skip it.
res.encoding = 'utf-8'
# `res.text` is the response body decoded with the encoding set above.
print(res.text)

引入BeautifulSoup模块解析网页获得论文Title

import requests
from bs4 import BeautifulSoup #引入BeautifulSoup模块
# Download the CVPR 2018 index page (a plain GET is enough for this site).
res = requests.get('http://openaccess.thecvf.com/CVPR2018.py')
# NOTE: for pages containing Chinese text, set res.encoding to 'utf-8' first.
# Parse the downloaded HTML with the built-in html.parser backend.
soup = BeautifulSoup(res.text, 'html.parser')
# The hrefs on the index page may be relative, so prefix them with the site
# root (not every site needs this).
head = 'http://openaccess.thecvf.com/'
# Each paper title on the page sits in an element with class "ptitle"
# (found with Chrome's inspector). Use ".name" for a class, "#name" for an id.
for news in soup.select('.ptitle'):
    links = news.select('a')
    if not links:
        # Some entries may contain no <a> tag at all; skip them.
        continue
    first_link = links[0]
    h2 = first_link.text      # visible title text of the link
    a = first_link['href']    # link target of the paper's detail page
    print(h2, head + a)

引入BeautifulSoup模块解析网页获得论文详情页中论文Title

import requests #引入requests包
from bs4 import BeautifulSoup
# Fetch one paper's detail page and parse it.
res = requests.get(
    'http://openaccess.thecvf.com/content_cvpr_2018/html/Das_Embodied_Question_Answering_CVPR_2018_paper.html'
)
soup = BeautifulSoup(res.text, 'html.parser')
# The title lives in the element with id "papertitle"; select() returns a
# list, so take its first entry.
title_node = soup.select('#papertitle')[0]
# strip() removes the surrounding whitespace.
h2 = title_node.text.strip()
print(h2)

引入requests和BeautifulSoup获得论文详情页Title和Abstract

import requests #引入requests包
from bs4 import BeautifulSoup
# Fetch one paper's detail page and parse it.
res = requests.get(
    'http://openaccess.thecvf.com/content_cvpr_2018/html/Das_Embodied_Question_Answering_CVPR_2018_paper.html'
)
soup = BeautifulSoup(res.text, 'html.parser')
# Title and abstract are looked up by element id; select() returns a list,
# so the first match is taken in each case.
title_node = soup.select('#papertitle')[0]
abstract_node = soup.select('#abstract')[0]
h2 = title_node.text.strip()
article = abstract_node.text.strip()
print('Title:', h2)
print('Abstract:', article)

其他知识，无关本次任务

newsurl = 'http://openaccess.thecvf.com/content_cvpr_2018/html/Das_Embodied_Question_Answering_CVPR_2018_paper.html'
# Keep only the last path segment, then drop the ".html" extension.
# str.rstrip('.html') is not used here: it strips any trailing run of the
# characters '.', 'h', 't', 'm', 'l' rather than the literal suffix, so a
# name such as "detail.html" would be over-trimmed to "detai".
newsid = newsurl.split('/')[-1]
if newsid.endswith('.html'):
    newsid = newsid[:-len('.html')]

# URL template for a paper's detail page; "{}" is filled with the page id.
commentURL = "http://openaccess.thecvf.com/content_cvpr_2018/html/{}.html"
# Bare expression: the formatted URL is not stored (a notebook cell would
# display it as the cell's output).
commentURL.format(newsid)

import requests
from bs4 import BeautifulSoup
def getNewsDetail(newsurl):
    """Fetch a paper detail page and return its title and abstract.

    Args:
        newsurl: URL of the detail page to download.

    Returns:
        A dict with two entries: ``'Title: '`` holds the stripped text of the
        ``#papertitle`` element and ``'Abstract: '`` holds the stripped text
        of the ``#abstract`` element.

    Raises:
        IndexError: If the page has no ``#papertitle`` or ``#abstract``
            element.
    """
    result = {}
    res = requests.get(newsurl)
    # The attribute must be spelled `encoding`; a misspelled name would be
    # stored as an unrelated attribute and never affect how res.text decodes.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # select() returns a list of matches; only the first one is used.
    result['Title: '] = soup.select('#papertitle')[0].text.strip()
    result['Abstract: '] = soup.select('#abstract')[0].text.strip()
    return result

# Example call on a single CVPR 2018 paper page. The returned dict is not
# stored here (presumably shown as the notebook cell's output).
getNewsDetail('http://openaccess.thecvf.com/content_cvpr_2018/html/Das_Embodied_Question_Answering_CVPR_2018_paper.html')

import requests
from bs4 import BeautifulSoup #引入BeautifulSoup模块
# Running index of the papers printed so far.
i = 0
# Download the CVPR 2018 index page (a plain GET is enough for this site).
res = requests.get('http://openaccess.thecvf.com/CVPR2018.py')
# NOTE: for pages containing Chinese text, set res.encoding to 'utf-8' first.
soup = BeautifulSoup(res.text, 'html.parser')
# The hrefs on the index page may be relative, so prefix them with the site
# root (not every site needs this).
head = 'http://openaccess.thecvf.com/'
# Each paper title sits in an element with class "ptitle".
for news in soup.select('.ptitle'):
    links = news.select('a')
    if not links:
        # Entries without an <a> tag have no detail page to visit.
        continue
    # Absolute URL of this paper's detail page.
    a = head + links[0]['href']
    res2 = requests.get(a)
    soup2 = BeautifulSoup(res2.text, 'html.parser')
    title_node = soup2.select('#papertitle')[0]
    abstract_node = soup2.select('#abstract')[0]
    h2 = title_node.text.strip()
    article = abstract_node.text.strip()
    print(i)
    print('Title:', h2)
    print('Abstract:', article)
    # Two print('\n') calls leave blank lines between consecutive papers.
    print('\n')
    print('\n')
    i += 1

完整代码

import requests
from bs4 import BeautifulSoup #引入BeautifulSoup模块
# Running index of the papers written so far.
i = 0
# Download the CVPR 2018 index page (a plain GET is enough for this site).
res = requests.get('http://openaccess.thecvf.com/CVPR2018.py')
res.encoding = 'utf-8'  # decode the index page as UTF-8
soup = BeautifulSoup(res.text, 'html.parser')
# The hrefs on the index page may be relative, so prefix them with the site
# root (not every site needs this).
head = 'http://openaccess.thecvf.com/'
# Each paper title sits in an element with class "ptitle" (found with
# Chrome's inspector). Use ".name" for a class, "#name" for an id.
for news in soup.select('.ptitle'):
    links = news.select('a')
    if not links:
        # Entries without an <a> tag have no detail page to visit.
        continue
    # Absolute URL of this paper's detail page.
    a = head + links[0]['href']
    res2 = requests.get(a)
    # Set the encoding on the detail response that was just fetched; setting
    # it on `res` (the index page) here would not affect res2.text.
    res2.encoding = 'utf-8'
    soup2 = BeautifulSoup(res2.text, 'html.parser')
    h2 = soup2.select('#papertitle')[0].text.strip()
    article = soup2.select('#abstract')[0].text.strip()
    # Append one record per paper. The file is reopened for every record, so
    # each finished paper is on disk even if a later request fails.
    # errors='ignore' drops characters that gb18030 cannot encode.
    with open('D:\\result.txt', 'a', encoding='gb18030', errors='ignore') as f:
        f.write(str(i))
        f.write('\n')
        f.write('Title: ' + h2)
        f.write('\n')
        f.write('Abstract: ' + article)
        # Three newlines end the abstract line and leave two blank lines
        # before the next record.
        f.write('\n')
        f.write('\n')
        f.write('\n')
    i += 1