Fetching Springer Paper Information with an XPath Crawler
Foreword
Manually enter keywords and a page range; the crawler fetches the paper information and saves it all to a single list: title, link, journal, publication date, article type, and download link.
1. CODE
Main page
search_url = "https://www.springeropen.com/search?"
Full code
"获取主页下面翻页链接,进行逐页爬虫"
import pandas as pd
from lxml import etree
import requests
# 拼接后的链接样式
# url = 'https://www.springeropen.com/search?query=maritime+edge+computing&searchType=publisherSearch'
# url = 'https://www.springeropen.com/search?searchType=publisherSearch&sort=Relevance&query=maritime+edge+computing&page=1'
# 指向同一页面
# 定义不同页面对应参数的函数
def get_params(num, keywords):
    # url = 'https://www.springeropen.com/search?searchType=publisherSearch&sort=Relevance&query=maritime+edge+computing&page=1'
    data = {
        'searchType': 'publisherSearch',
        'sort': 'Relevance',
        'query': keywords,
        'page': num
    }
    return data
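
# Example: get_params(1, 'maritime edge computing') returns
# {'searchType': 'publisherSearch', 'sort': 'Relevance',
#  'query': 'maritime edge computing', 'page': 1},
# which requests.get() URL-encodes into the page-1 link shown above.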
# Take the page parameters and fetch the page content
def get_data(data):
    search_url = "https://www.springeropen.com/search?"
    try:
        # Request the page, combining the URL, the headers, and the page parameters
        # data=data and params=data are not the same: POST uses data, GET uses params
        resp = requests.get(search_url, headers=headers, params=data)
        # Raise an exception if the status code signals an error
        resp.raise_for_status()
        # Decode the response with the encoding the page itself appears to use
        resp.encoding = resp.apparent_encoding
        tree = etree.HTML(resp.text)  # parse the document into a tree
        # Nodes can now be located through the document tree
        # Grab every result <li> tag
        list_data = tree.xpath('//*[@id="main-content"]/div/main/div/ol/li')
        return list_data
    except Exception as e:
        print("crawl failed", e)
        return []  # return an empty list so the caller's loop keeps going
# Extract the fields of interest from each result
def get_info(li_list):
    cnt = 1
    # Loop over the <li> tags
    for li in li_list:
        # Paper title; [0] pulls the str title out of the list text() returns
        title = li.xpath('./article/h3/a/text()')[0]
        # Protocol-relative link to the paper
        title_link = li.xpath('./article/h3/a/@href')[0]
        # 'https://educationaltechnologyjournal.springeropen.com/articles/10.1186/s41239-021-00255-0'
        # Complete the title link
        full_text_link = 'https:' + str(title_link)
        # Journal name
        journal_title = li.xpath('./article/div[2]/em/text()')[0]
        # Article type
        article_type = li.xpath('./article/div[3]/span[1]/text()[1]')[0]
        # Publication date; normalize-space() strips spaces and line breaks
        published_on = li.xpath('normalize-space(./article/div[3]/span[2]/text())') + \
                       li.xpath('normalize-space(./article/div[3]/span[2]/span/text())')
        # PDF link
        pdf_link = li.xpath('./article/ul/li[2]/a/@href')[0]
        # 'https://educationaltechnologyjournal.springeropen.com/counter/pdf/10.1186/s41239-021-00255-0'
        # Assemble the full download link
        download_pdf_link = 'https:' + str(pdf_link)
        # Report that this title has been processed
        print(cnt, title, 'done!!!')
        cnt += 1
        # Append the record to the shared all_info list
        all_info.append([title, published_on, journal_title, article_type, full_text_link, download_pdf_link])
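
# Each appended row has the shape (placeholder values for illustration):
# ['<title>', '<published_on>', '<journal_title>', '<article_type>',
#  'https://<journal>.springeropen.com/articles/...',
#  'https://<journal>.springeropen.com/counter/pdf/...']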
# Save the data
def save_data(all_info):
    # Alternative: derive the file name from the first query parameter,
    # e.g. filename = list(pages(1).values())[0], where .values() takes the
    # dict values and [0] takes the first one
    # Convert the all_info list into a DataFrame and name the columns
    df = pd.DataFrame(all_info, columns=['title', 'published_on', 'journal_title',
                                         'article_type', 'full_text_link', 'download_pdf_link'])
    # Save the DataFrame as a csv with the chosen encoding
    df.to_csv(f'{keywords}.csv', encoding='utf-8')
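
# Note: to_csv also writes the DataFrame's integer index as an unnamed
# first column; pass index=False to keep only the six named columns.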
# Main entry point
if __name__ == '__main__':
    search_url = "https://www.springeropen.com/search?"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
    }
    # Enter the query keywords
    print('please enter the keywords you want to query')
    keywords = input('keywords:')
    print('please enter the pages range you want to query')
    start_page = int(input('start_page:'))
    end_page = int(input('end_page:')) + 1
    # Collect the <li> data returned for each requested page
    # page_link_list = get_page_link(keywords)
    all_info = []
    for i in range(start_page, end_page):
        print(f'page {i} started...')
        params_data = get_params(i, keywords)
        li_list = get_data(params_data)
        get_info(li_list)  # appends rows to all_info
        print(f'page {i} finished.')
    save_data(all_info)
    print('crawling finished!!!')
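An illustrative session (the result lines depend on the query and on the live site):

please enter the keywords you want to query
keywords:maritime edge computing
please enter the pages range you want to query
start_page:1
end_page:2
page 1 started...
1 <first result title> done!!!
...
page 2 finished.
crawling finished!!!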
2. TODO
Loop over each full-text link, fetch the article abstract, and save it (see the sketch below).
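A minimal sketch of this step, reusing the requests/lxml pattern above. The id="Abs1" XPath is an assumption (it matches many Springer Nature article pages) and should be verified against the live markup; get_abstract is a hypothetical helper, not part of the original script.

import requests
from lxml import etree

def get_abstract(full_text_link, headers):
    """Fetch one article page and return its abstract text ('' on failure)."""
    try:
        resp = requests.get(full_text_link, headers=headers)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        tree = etree.HTML(resp.text)
        # normalize-space() collapses the whitespace inside the abstract text;
        # id="Abs1" is an assumed selector, so inspect the page before trusting it
        return tree.xpath('normalize-space(//*[@id="Abs1"])')
    except Exception as e:
        print('abstract fetch failed:', full_text_link, e)
        return ''

# Usage against the rows collected above:
# for row in all_info:
#     row.append(get_abstract(row[4], headers))  # row[4] is full_text_link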
Summary
A simple crawler, written down as a quick note for exchange; advice from more experienced folks is welcome.
I intend to build a complete, systematic paper-crawling project, and anyone interested in discussing or collaborating on it is welcome.