# Purpose: use Google Scholar to collect information on all papers that cite a given paper.
from time import sleep
from urllib import parse

from selenium import webdriver
from selenium.webdriver.common.by import By
class GetBibs():
    """Scrape Google Scholar for the papers that cite a given paper.

    Drives a real Chrome browser through Selenium, reusing the user's own
    Chrome profile so that installed extensions and login state are available.
    """

    def __init__(self, gg_search_url, user_data_dir=None) -> None:
        """
        :param gg_search_url: Google Scholar search-URL prefix; the URL-quoted
            paper title is appended to it (everything up to and including ``q=``).
        :param user_data_dir: Chrome user-data directory to reuse.  Defaults to
            the module-level ``option_path`` for backward compatibility.
        """
        self.gg_search_url = gg_search_url
        # Launch Chrome with a custom profile; a default (fresh) profile would
        # lose the user's plugins and settings.
        option = webdriver.ChromeOptions()
        if user_data_dir is None:
            user_data_dir = option_path  # module-level global, defined at script bottom
        option.add_argument("--user-data-dir=" + user_data_dir)
        self.browser = webdriver.Chrome(options=option)  # open the Chrome browser

    def get_title_to_google_scholar(self, paper_title):
        """Search for *paper_title* and open its 'cited by' results page.

        Polls for up to ~10 s (100 x 0.1 s) until the first search result is
        rendered, then clicks the third link in its footer row — on Google
        Scholar that is the "Cited by N" link — and hands off to
        :meth:`get_data_from_google_scholar`.
        """
        url = self.gg_search_url + parse.quote(paper_title)
        self.browser.get(url)
        # Wait for the first result entry to load, then follow its cited-by link.
        for _ in range(100):
            try:
                result = self.browser.find_element(By.CSS_SELECTOR, "[class='gs_r gs_or gs_scl']")
                footer = result.find_element(By.CSS_SELECTOR, "[class='gs_fl']")
                links = footer.find_elements(By.XPATH, "a")
                print(links[2].get_attribute('href'))
                links[2].click()
                break
            except Exception:
                # Element not rendered yet (or layout differs) — retry shortly.
                sleep(0.1)
        self.get_data_from_google_scholar()

    def get_data_from_google_scholar(self):
        """Walk up to 10 'cited by' result pages, printing each entry's fields.

        For every entry prints: title (gs_rt), venue and year (parsed from the
        gs_a byline), and the citation count (parsed from the gs_fl footer).
        Sleeps 10 s between pages to avoid hammering Google Scholar.
        """
        old_url = self.browser.current_url
        for page_count in range(0, 10):
            print(self.browser.current_url)
            print(self.browser.title)
            # Inject 'start=<offset>' into the query string to page through
            # results (Google Scholar shows 10 entries per page).
            url_head, url_tail = old_url.split('scholar?', 1)
            url = url_head + 'scholar?start=' + str(10 * page_count) + '&' + url_tail
            self.browser.get(url)
            print(self.browser.current_url)
            print(self.browser.title)
            # Wait for the entries to load, then extract and print their data.
            for _ in range(100):
                try:
                    entries = self.browser.find_elements(By.CSS_SELECTOR, "[class='gs_r gs_or gs_scl']")
                    for entry in entries:
                        title = entry.find_element(By.CSS_SELECTOR, "[class='gs_rt']").text
                        print('标题:', title)
                        byline = entry.find_element(By.CSS_SELECTOR, "[class='gs_a']").text
                        # Byline looks like "authors - venue, year - publisher";
                        # the second-to-last dash-separated chunk holds venue/year.
                        venue_year = byline.split(' - ')[-2]
                        print('期刊:', venue_year.split(', ')[0])
                        print('年月:', venue_year.split(', ')[1])
                        footer = entry.find_element(By.CSS_SELECTOR, "[class='gs_fl']").text
                        # First footer token is e.g. "被引用次数:123" — keep the number.
                        print('被引用:', footer.split(' ')[0].split(':')[-1])
                        print("\n")
                    break
                except Exception:
                    # Page not fully rendered (or an entry lacks a field) — retry.
                    sleep(0.1)
            sleep(10)  # throttle between pages
# Chrome profile directory to reuse, so the browser keeps your own settings and
# plugins instead of Selenium's default blank profile.
option_path = r"C:/Users/Administrator/AppData/Local/Google/Chrome/User Data - 副本/"
# Google Scholar search-URL prefix: open a search in your browser first and copy
# a similar URL here; the text after the final '=' is what gets searched.
gg_search_url = r'https://scholar.google.com/scholar?hl=zh-CN&as_sdt=0%2C5&inst=1597255436240989024&q='
get_bibs = GetBibs(gg_search_url)
# %% ********** crawler object defined above; crawling starts below **********
# Papers to crawl: key is a short tag for your own reference, value is the full
# paper title. Sample entry below.
paper_titles = {
    "DMCNN": 'Deep Multi-Scale Convolutional Neural Network for Dynamic Scene Deblurring',
}
# Iterate the titles directly instead of indexing the dict by key.
for title in paper_titles.values():
    get_bibs.get_title_to_google_scholar(title)
# Results: (see console output)
# Reference code:
#   使用Selenium从IEEE与谷歌学术批量爬取BibTex文献引用 — https://www.cnblogs.com/qizhou/ (CSDN/博客园 post)
#   https://github.com/Dao-zhi/GoogleScholarGUI
# Troubleshooting:
#   selenium之加载chrome配置 - 越来越努力 - 博客园