用爬虫在谷歌学术上查找文献的被引次数

最新推荐文章于 2023-07-13 14:42:04 发布

NullSite

最新推荐文章于 2023-07-13 14:42:04 发布

阅读量526

点赞数 1

文章标签：爬虫 python 开发语言

本文链接：https://blog.csdn.net/NULLSET1/article/details/130510838

版权

import re
import urllib.error
import urllib.parse
import urllib.request


def get_references(title, timeout=10, urlopen=None):
    """Return the "Cited by" count Google Scholar shows for a paper title.

    The title is submitted as a Google Scholar search query and the first
    ``Cited by <n>`` occurrence in the returned HTML is parsed.

    Args:
        title: Paper title to search for (plain text, any characters).
        timeout: Seconds to wait for the network before giving up.
        urlopen: Optional callable with the same calling convention as
            ``urllib.request.urlopen`` (``urlopen(request, timeout=...)``)
            returning a context manager with a ``read()`` method. Lets
            callers plug in a custom/proxy-aware opener. Defaults to
            ``urllib.request.urlopen``.

    Returns:
        The citation count as an ``int``; ``0`` when the page contains no
        ``Cited by`` marker; ``None`` when the request failed (the error is
        printed to stdout).
    """
    if urlopen is None:
        urlopen = urllib.request.urlopen

    # Percent-encode the whole title (spaces become '+') so characters such
    # as '&', '#', '+' or non-ASCII text cannot corrupt the query string.
    query = urllib.parse.quote_plus(title)

    # hl=en forces the English UI, which the 'Cited by' pattern relies on.
    search_url = f'https://scholar.google.com/scholar?hl=en&q={query}&btnG=&as_sdt=1%2C5&as_sdtp='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.google.com/'
    }
    req = urllib.request.Request(search_url, headers=headers)

    try:
        # The context manager guarantees the connection is closed.
        with urlopen(req, timeout=timeout) as response:
            # A stray invalid byte should not abort the lookup.
            html_content = response.read().decode('utf-8', errors='replace')
    except urllib.error.HTTPError as e:
        print(f'Error: {e.code} {e.reason}')
        return None
    except OSError as e:
        # URLError (DNS failure, refused connection, ...) and socket
        # timeouts are OSError subclasses without an HTTP status code.
        print(f'Error: {getattr(e, "reason", e)}')
        return None

    # The first match belongs to the first (most relevant) search result.
    m = re.search(r'Cited by\s(\d+)', html_content)
    return int(m.group(1)) if m else 0


# Demo run: look up each sample paper title and print its citation count.
titles = [
    'Experimental Study on the Autogenic Acid Fluid System of a High-Temperature Carbonate Reservoir by Acid Fracturing',
    'Experimental study on a new type of self-propping fracturing technology',
]
# map() is lazy, so each lookup still happens right before its own print.
for title, num_citations in zip(titles, map(get_references, titles)):
    print(f'{title}: {num_citations} 次被引用')