统计ieee, springer, arxiv, sciencedirect, acm论文中作者信息

先给代码https://github.com/lyq998/Authors
主要是从网页里面爬取论文的作者名字,最后进行数量的统计

arxiv

import requests
import re

# Desktop Chrome User-Agent so the target site serves the full HTML page
# instead of blocking the default python-requests agent.
my_headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def arxiv_authors(url, retries=3):
    """Scrape author names from an arXiv abstract page.

    Author names are read from the ``citation_author`` meta tags in the
    page HTML.

    Args:
        url: URL of the arXiv abstract page.
        retries: number of times to retry after a network error
            (new, backward-compatible parameter).

    Returns:
        List of author names with commas removed, or ``None`` if the page
        could not be fetched.
    """
    try:
        response = requests.get(url, headers=my_headers, timeout=4)
    except requests.RequestException:
        # Bug fix: the original bare `except:` recursed without returning
        # the result (callers always got None) and retried forever on a
        # persistent failure. Bound the retries and propagate the result.
        if retries > 0:
            return arxiv_authors(url, retries - 1)
        return None
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    meta_lines = [line for line in response.text.split('\n')
                  if 'citation_author' in line]
    # Strip everything up to content=" and everything from the closing "/.
    names = [re.sub('^.*content="', "", line) for line in meta_lines]
    names = [re.sub('"/.*', "", name) for name in names]
    # Drop commas so "Last, First" becomes "Last First".
    return [name.replace(',', '') for name in names]

ieee

import requests
import re
import json

# Desktop Chrome User-Agent so the target site serves the full HTML page
# instead of blocking the default python-requests agent.
my_headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def ieee_authors(url, retries=3):
    """Scrape author names from an IEEE Xplore article page.

    IEEE embeds the article metadata as a JavaScript assignment
    (``...document.metadata={...};``); the JSON payload is extracted and
    parsed, and the ``authors`` entries are read from it. This also works
    for IEEE Access pages, unlike the earlier ``"isbn"``-based slicing
    kept below as commented history.

    Args:
        url: URL of the IEEE article page.
        retries: number of times to retry after a network error
            (new, backward-compatible parameter).

    Returns:
        List of author names, or ``None`` if the page could not be
        fetched or its metadata could not be parsed.
    """
    try:
        response = requests.get(url, headers=my_headers, timeout=4)
    except requests.RequestException:
        # Bug fix: the original bare `except:` recursed without returning
        # the result and retried forever; bound retries and return the
        # recursive result. Parse errors no longer trigger a refetch.
        if retries > 0:
            return ieee_authors(url, retries - 1)
        return None
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    try:
        author_line = next(line for line in response.text.split('\n')
                           if '"authors"' in line)
        author_line = re.sub('^.*document.metadata=', '', author_line)
        # Delete the trailing ';' of the JS assignment before parsing.
        metadata = json.loads(author_line[:-1])
        return [author['name'] for author in metadata['authors']]
    except (StopIteration, ValueError, KeyError, TypeError):
        # Page layout changed or metadata line missing/malformed.
        return None

springer

import requests
import re

# Desktop Chrome User-Agent so the target site serves the full HTML page
# instead of blocking the default python-requests agent.
my_headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def springer_authors(url, retries=3):
    """Scrape author names from a Springer article page.

    Author names are read from the ``"citation_author"`` meta tags in the
    page HTML.

    Args:
        url: URL of the Springer article page.
        retries: number of times to retry after a network error
            (new, backward-compatible parameter).

    Returns:
        List of author names, or ``None`` if the page could not be
        fetched.
    """
    try:
        response = requests.get(url, headers=my_headers, timeout=4)
    except requests.RequestException:
        # Bug fix: the original bare `except:` recursed without returning
        # the result (callers always got None) and retried forever on a
        # persistent failure. Bound the retries and propagate the result.
        if retries > 0:
            return springer_authors(url, retries - 1)
        return None
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    meta_lines = [line for line in response.text.split('\n')
                  if '"citation_author"' in line]
    # Strip everything up to content=" and everything from the closing "/.
    names = [re.sub('^.*content="', "", line) for line in meta_lines]
    return [re.sub('"/.*', "", name) for name in names]

sciencedirect

import requests
import re
import json

# Desktop Chrome User-Agent so the target site serves the full HTML page
# instead of blocking the default python-requests agent.
my_headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def sciencedirect_authors(url, retries=3):
    """Scrape author names from a ScienceDirect article page.

    The page embeds article metadata as JSON inside a
    ``<script type="application/json">`` tag; each author's given-name
    and surname are pulled out of that structure and joined.

    Args:
        url: URL of the ScienceDirect article page.
        retries: number of times to retry after a network error
            (new, backward-compatible parameter).

    Returns:
        List of "First Last" author names, or ``None`` if the page could
        not be fetched or its metadata could not be parsed.
    """
    try:
        response = requests.get(url, headers=my_headers, timeout=4)
    except requests.RequestException:
        # Bug fix: the original bare `except:` recursed without returning
        # the result and retried forever; bound retries and return the
        # recursive result. Parse errors no longer trigger a refetch.
        if retries > 0:
            return sciencedirect_authors(url, retries - 1)
        return None
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    try:
        json_line = next(line for line in response.text.split('\n')
                         if 'application/json' in line)
        json_line = re.sub('^<script type="application/json".*">', '', json_line)
        json_line = re.sub('</script>$', '', json_line)
        json_dic = json.loads(json_line)
        author_infos = json_dic['authors']['content'][0]['$$']
    except (StopIteration, ValueError, KeyError, IndexError, TypeError):
        # Page layout changed or metadata missing/malformed.
        return None

    author_names = []
    for author_info in author_infos:
        if author_info.get('#name') != 'author':
            continue
        # Bug fix: the original left first_name/last_name unbound (NameError)
        # or carried over from the previous author when a part was missing;
        # reset them per author and tolerate absent fields.
        first_name = ''
        last_name = ''
        for part in author_info.get('$$', []):
            if part.get('#name') == 'given-name':
                first_name = part.get('_', '')
            elif part.get('#name') == 'surname':
                last_name = part.get('_', '')
        author_names.append((first_name + ' ' + last_name).strip())
    return author_names

acm

import requests
import re

# Desktop Chrome User-Agent so the target site serves the full HTML page
# instead of blocking the default python-requests agent.
my_headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def acm_authors(url, retries=3):
    """Scrape author names from an ACM Digital Library article page.

    The author list is an HTML ``<ul>`` whose class contains
    ``rlist--inline loa truncate-list``; names are the ``title``
    attributes of the ``author-name`` links inside it.

    Args:
        url: URL of the ACM article page.
        retries: number of times to retry after a network error
            (new, backward-compatible parameter).

    Returns:
        List of author names, or ``None`` if the page could not be
        fetched or the author list could not be located.
    """
    try:
        response = requests.get(url, headers=my_headers, timeout=4)
    except requests.RequestException:
        # Bug fix: the original bare `except:` recursed without returning
        # the result and retried forever; bound retries and return the
        # recursive result. Parse errors no longer trigger a refetch.
        if retries > 0:
            return acm_authors(url, retries - 1)
        return None
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    try:
        author_line = next(line for line in response.text.split('\n')
                           if 'rlist--inline loa truncate-list' in line)
    except StopIteration:
        # Page layout changed; the author <ul> was not found.
        return None
    author_line = re.sub('^.*</b></li>', '', author_line).strip()
    items = author_line.split('<li class="loa__item">')
    # items[0] is the empty string before the first <li>, so skip it.
    names = [re.sub('^.*"author-name" title="', '', item) for item in items[1:]]
    return [re.sub('"><span class="loa__author-info".*$', '', name) for name in names]

实例还是看我github里面的完整代码
第一次写这种代码有点烂,希望多跟大伙儿交流哈

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值