Fetching Springer Paper Information with an XPath Crawler
Foreword
Manually enter keywords and a page range; the crawler fetches the paper information and saves it all to a single list: title, link, journal, publication date, article type, and download link.
1. CODE
Main page
search_url = "https://www.springeropen.com/search?"
Full code
"获取主页下面翻页链接,进行逐页爬虫"
import pandas as pd
from lxml import etree
import requests
# 拼接后的链接样式
# url = 'https://www.springeropen.com/search?query=maritime+edge+computing&searchType=publisherSearch'
# url = 'https://www.springeropen.com/search?searchType=publisherSearch&sort=Relevance&query=maritime+edge+computing&page=1'
# 指向同一页面
# 定义不同页面对应参数的函数
def get_params(num, keywords):
    # url = 'https://www.springeropen.com/search?searchType=publisherSearch&sort=Relevance&query=maritime+edge+computing&page=1'
    data = {
        'searchType': 'publisherSearch',
        'sort': 'Relevance',
        'query': keywords,
        'page': num
    }
    return data
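
# Example: get_params(1, 'maritime edge computing') returns
# {'searchType': 'publisherSearch', 'sort': 'Relevance',
#  'query': 'maritime edge computing', 'page': 1},
# which requests.get() URL-encodes into the page-1 link shown above.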
# Take the page parameters and fetch the page content
def get_data(data):
    search_url = "https://www.springeropen.com/search?"
    try:
        # Request the page, combining the URL, the headers, and the page parameters
        # data=data and params=data are not the same: POST uses data, GET uses params
        resp = requests.get(search_url, headers=headers, params=data)
        # Raise an exception if the status code signals an error
        resp.raise_for_status()
        # Decode the response with the encoding the page itself appears to use
        resp.encoding = resp.apparent_encoding
        tree = etree.HTML(resp.text)  # parse the document into a tree
        # Nodes can now be located through the document tree
        # Grab every result <li> tag
        list_data = tree.xpath('//*[@id="main-content"]/div/main/div/ol/li')
        return list_data
    except Exception as e:
        print("crawl failed", e)
        return []  # return an empty list so the caller's loop keeps going
# Extract the fields of interest from each result
def get_info(li_list):
    cnt = 1
    # Loop over the <li> tags
    for li in li_list:
        # Paper title; [0] pulls the str title out of the list text() returns
        title = li.xpath('./article/h3/a/text()')[0]
        # Protocol-relative link to the paper
        title_link = li.xpath('./article/h3/a/@href')[0]
        # 'https://educationaltechnologyjournal.springeropen.com/articles/10.1186/s41239-021-00255-0'
        # Complete the title link
        full_text_link = 'https:' + str(title_link)
        # Journal name
        journal_title = li.xpath('./article/div[2]/em/text()')[0]
        # Article type
        article_type = li.xpath('./article/div[3]/span[1]/text()[1]')[0]
        # Publication date; normalize-space() strips spaces and line breaks
        published_on = li.xpath('normalize-space(./article/div[3]/span[2]/text())') + \
                       li.xpath('normalize-space(./article/div[3]/span[2]/span/text())')
        # PDF link
        pdf_link = li.xpath('./article/ul/li[2]/a/@href')[0]
        # 'https://educationaltechnologyjournal.springeropen.com/counter/pdf/10.1186/s41239-021-00255-0'
        # Assemble the full download link
        download_pdf_link = 'https:' + str(pdf_link)
        # Report that this title has been processed
        print(cnt, title, 'done!!!')
        cnt += 1
        # Append the record to the shared all_info list
        all_info.append([title, published_on, journal_title, article_type, full_text_link, download_pdf_link])
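
# Each appended row has the shape (placeholder values for illustration):
# ['<title>', '<published_on>', '<journal_title>', '<article_type>',
#  'https://<journal>.springeropen.com/articles/...',
#  'https://<journal>.springeropen.com/counter/pdf/...']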
# Save the data
def save_data(all_info):
    # Alternative: derive the file name from the first query parameter,
    # e.g. filename = list(pages(1).values())[0], where .values() takes the
    # dict values and [0] takes the first one
    # Convert the all_info list into a DataFrame and name the columns
    df = pd.DataFrame(all_info, columns=['title', 'published_on', 'journal_title',
                                         'article_type', 'full_text_link', 'download_pdf_link'])
    # Save the DataFrame as a csv with the chosen encoding
    df.to_csv(f'{keywords}.csv', encoding='utf-8')
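
# Note: to_csv also writes the DataFrame's integer index as an unnamed
# first column; pass index=False to keep only the six named columns.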
# Main entry point
if __name__ == '__main__':
    search_url = "https://www.springeropen.com/search?"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
    }
    # Enter the query keywords
    print('please enter the keywords you want to query')
    keywords = input('keywords:')
    print('please enter the pages range you want to query')
    start_page = int(input('start_page:'))
    end_page = int(input('end_page:')) + 1
    # Collect the <li> data returned for each requested page
    # page_link_list = get_page_link(keywords)
    all_info = []
    for i in range(start_page, end_page):
        print(f'page {i} started...')
        params_data = get_params(i, keywords)
        li_list = get_data(params_data)
        get_info(li_list)  # appends rows to all_info
        print(f'page {i} finished.')
    save_data(all_info)
    print('crawling finished!!!')
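An illustrative session (the result lines depend on the query and on the live site):

please enter the keywords you want to query
keywords:maritime edge computing
please enter the pages range you want to query
start_page:1
end_page:2
page 1 started...
1 <first result title> done!!!
...
page 2 finished.
crawling finished!!!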
2. TODO
Loop over each full-text link, fetch the article abstract, and save it (see the sketch below).
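A minimal sketch of this step, reusing the requests/lxml pattern above. The id="Abs1" XPath is an assumption (it matches many Springer Nature article pages) and should be verified against the live markup; get_abstract is a hypothetical helper, not part of the original script.

import requests
from lxml import etree

def get_abstract(full_text_link, headers):
    """Fetch one article page and return its abstract text ('' on failure)."""
    try:
        resp = requests.get(full_text_link, headers=headers)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        tree = etree.HTML(resp.text)
        # normalize-space() collapses the whitespace inside the abstract text;
        # id="Abs1" is an assumed selector, so inspect the page before trusting it
        return tree.xpath('normalize-space(//*[@id="Abs1"])')
    except Exception as e:
        print('abstract fetch failed:', full_text_link, e)
        return ''

# Usage against the rows collected above:
# for row in all_info:
#     row.append(get_abstract(row[4], headers))  # row[4] is full_text_link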
Summary
A simple crawler, written down as a quick note for exchange; advice from more experienced folks is welcome.
I intend to build a complete, systematic paper-crawling project, and anyone interested in discussing or collaborating on it is welcome.