import urllib.request, urllib.error
import re
def get_references(title, timeout=10):
    """Return the "Cited by" count Google Scholar shows for a paper title.

    The title is URL-encoded into a Google Scholar search query, the result
    page is downloaded, and the first ``Cited by N`` occurrence in the
    returned HTML is parsed (presumably the top search hit -- the page
    structure is not inspected beyond that text match).

    Args:
        title: Paper title (str) to search for.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The citation count as an int, ``0`` when the page contains no
        "Cited by" text, or ``None`` when the request fails.
    """
    # Imported locally so the block does not rely on urllib.request having
    # loaded urllib.parse as a side effect.
    from urllib.parse import quote_plus

    # quote_plus still maps spaces to '+', and additionally percent-encodes
    # characters such as '&', '#', '+' and non-ASCII text that would otherwise
    # truncate the query or make the request line un-encodable.
    query = quote_plus(title)
    # Build the search URL.
    search_url = f'https://scholar.google.com/scholar?hl=en&q={query}&btnG=&as_sdt=1%2C5&as_sdtp='
    # Browser-like headers; presumably needed so the request is not rejected
    # as coming from a script -- TODO confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.google.com/',
    }
    # Send the HTTP request and read the response body.
    req = urllib.request.Request(search_url, headers=headers)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            html_content = response.read().decode('utf-8', errors='replace')
    except urllib.error.HTTPError as e:
        print(f'Error: {e.code} {e.reason}')
        return None
    except OSError as e:
        # URLError (DNS failure, refused connection) and socket timeouts are
        # OSError subclasses; report them instead of crashing the caller.
        print(f'Error: {e}')
        return None
    # Extract the citation count from the HTML page.
    m = re.search(r'Cited by\s(\d+)', html_content)
    return int(m.group(1)) if m else 0
# Smoke test: look up each sample title and print its citation count.
# Note that this runs at import time and performs live network requests.
titles = [
    'Experimental Study on the Autogenic Acid Fluid System of a High-Temperature Carbonate Reservoir by Acid Fracturing',
    'Experimental study on a new type of self-propping fracturing technology',
]
for title in titles:
    # get_references returns None on a failed request; it is printed as-is.
    num_citations = get_references(title)
    print(f'{title}: {num_citations} 次被引用')
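# Source article title: "A crawler that looks up a paper's citation count on Google Scholar".
# (Blog page footer: latest recommended article published 2024-02-06 13:46:27.)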