参考了 python自动获取整理参考文献的bib信息
实现效果
将如下格式的参考文献(每行形如“[序号]作者.文献名称[文献类型]…”)转换为 BibTeX 格式并保存到 txt 文件中,方便把 Word 中的参考文献迁移到 LaTeX。
代码
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium
import time
def read_txt(path):
    """Extract paper titles from a plain-text reference list.

    Each usable line is expected to look like
    ``[index]authors.title[type]...``: the title is taken as the text
    between the first ``.`` and the following ``.`` or ``[``, whichever
    comes first.

    Note: because the first ``.`` is used as the author/title separator,
    an author list that itself contains a ``.`` (e.g. initials) will be
    split in the wrong place.

    Args:
        path: Path of a UTF-8 text file with one reference per line.

    Returns:
        A list of titles in file order, with surrounding whitespace
        removed. Lines from which no title can be extracted (blank lines,
        lines without a ``.``, empty titles) are skipped instead of
        raising ``IndexError``.
    """
    papers = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # partition() never raises; an empty separator means "no dot".
            _, dot, after_authors = line.partition('.')
            if not dot:
                continue
            title = after_authors.split('.')[0].split('[')[0].strip()
            if title:
                papers.append(title)
    return papers
def notrobort(wd, File, index):
    """Repeat the search for one title after a manual human-verification step.

    Sleeps so the operator can pass the image challenge by hand, then types
    ``File[index]`` into the search box, follows the citation link and the
    BibTeX link, and prints the text of the resulting ``<pre>`` element.
    If that element cannot be found, a second pause is granted and the
    lookup is attempted once more. The text is only printed; nothing is
    returned or written to disk here.

    Args:
        wd: Selenium WebDriver. This function does not navigate first; it
            expects an element with id ``gs_hdr_tsi`` on the current page.
        File: Sequence of paper titles.
        index: Position in ``File`` of the title to search for.
    """
    def print_bibtex_and_rewind():
        # Read the first <pre> element, echo it, then walk back through
        # the browser history.
        bibtex_node = wd.find_element(By.TAG_NAME, 'pre')
        print(bibtex_node.text)
        time.sleep(2)
        wd.refresh()
        for _ in range(3):
            wd.back()
        wd.refresh()
        wd.back()

    time.sleep(60)  # 60 s for the operator to solve the image challenge
    search_box = wd.find_element(By.ID, 'gs_hdr_tsi')
    search_box.send_keys(File[index] + '\n')
    wd.find_element(By.LINK_TEXT, '引用').click()
    wd.find_element(By.LINK_TEXT, 'BibTeX').click()
    time.sleep(2)
    try:
        print_bibtex_and_rewind()
    except selenium.common.exceptions.NoSuchElementException:
        print('将进行第二次人机验证\n')
        time.sleep(60)  # another 60 s for a second image challenge
        print_bibtex_and_rewind()
def bib(wd, index, l, File):
    """Look up each title on Google Scholar and save its BibTeX to ``bib.txt``.

    For every ``File[i]`` with ``index <= i < l`` the function opens Google
    Scholar, searches for the title, follows the citation link and the
    BibTeX link, and writes the text of the ``<pre>`` element to
    ``bib.txt`` (the file is truncated on entry).

    * If the citation/BibTeX links or the ``<pre>`` element are missing,
      the title is skipped.
    * If the search box itself is missing, the browser is closed and the
      operator is prompted. Answering ``Y`` opens a fresh browser, runs
      ``notrobort`` for manual verification, and then retries the same
      title. Any other answer stops processing: the session has already
      been closed, so continuing would only fail on the next request.

    Args:
        wd: Selenium WebDriver used for browsing; closed before returning.
        index: Position in ``File`` of the first title to process.
        l: Exclusive upper bound on positions to process.
        File: Sequence of paper titles.
    """
    with open('bib.txt', 'w+', encoding='utf-8') as f:
        while index < l:
            try:
                wd.get('https://scholar.google.com/')
                wd.maximize_window()
                element = wd.find_element(By.ID, 'gs_hdr_tsi')
                element.send_keys(File[index] + '\n')
                try:
                    linkElem = wd.find_element(By.LINK_TEXT, '引用')
                    linkElem.click()
                    linkElem = wd.find_element(By.LINK_TEXT, 'BibTeX')
                    linkElem.click()
                    element = wd.find_element(By.TAG_NAME, 'pre')
                    print(element.text)
                    f.write('\n' + element.text)
                    time.sleep(2)
                    wd.refresh()
                    wd.back()
                    wd.back()
                    wd.back()
                    index += 1
                except selenium.common.exceptions.NoSuchElementException:
                    # No usable search result for this title: skip it.
                    index += 1
            except selenium.common.exceptions.NoSuchElementException:
                # Search box not found; the usage notes attribute this to a
                # human-verification page.
                wd.quit()
                key = input("请按 Y 进行人机验证\n")
                if key != 'Y':
                    # The session was closed above; looping again would
                    # raise on a dead driver, so stop here instead.
                    return
                print('将进行人机验证\n')
                option = webdriver.ChromeOptions()
                option.add_experimental_option("detach", True)
                wd = webdriver.Chrome(
                    executable_path=r'D:\anaconda\envs\tf1\chromedriver.exe',
                    options=option)
                wd.implicitly_wait(5)
                wd.get('https://scholar.google.com/')
                wd.maximize_window()
                # index is not advanced, so the next iteration retries the
                # same title and writes its BibTeX to the file.
                notrobort(wd, File, index)
    wd.quit()
def bibdownload(path):
    """Read the reference list at *path* and fetch BibTeX for every title.

    Starts Chrome on Google Scholar, hands the driver and the full list of
    titles to ``bib``, and afterwards opens a second browser window on
    https://www.apple.com.cn/ for 12 seconds before closing it.

    Args:
        path: Path of the reference text file passed to ``read_txt``.
    """
    titles = read_txt(path)

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option("detach", True)

    def open_browser(url):
        # Start Chrome with the shared options and load *url* maximized.
        browser = webdriver.Chrome(
            executable_path=r'D:\anaconda\envs\tf1\chromedriver.exe',
            options=chrome_options)
        browser.implicitly_wait(5)
        browser.get(url)
        browser.maximize_window()
        return browser

    bib(open_browser('https://scholar.google.com/'), 0, len(titles), titles)

    # NOTE(review): the purpose of this extra page is not evident from the
    # code -- presumably a visible "finished" signal; confirm.
    final_window = open_browser('https://www.apple.com.cn/')
    time.sleep(12)
    final_window.quit()
def paperrush(path):
    """Run the BibTeX download for the reference list stored at *path*.

    Thin wrapper that forwards to ``bibdownload``; returns nothing.
    """
    bibdownload(path)
if __name__ == "__main__":
    # Text file holding the reference list, one reference per line.
    path = r'D:\Documents\ref.txt'
    paperrush(path)
使用说明
1.需要科学上网
2.path = r'D:\Documents\ref.txt'
存放所有参考文献的txt文件,要求每行以“[序号]作者.文献名称[文献类型]”这种格式开头(其实就是谷歌学术导出时常用的 GB/T 7714 参考文献格式)
3.executable_path=r'D:\anaconda\envs\tf1\chromedriver.exe'
更换为自己环境的chromedriver.exe,而且要求和谷歌浏览器的版本一致,不然会报错
4.论文搜索多了会触发人机验证,这时需在控制台输入大写“Y”并回车,程序会重新打开浏览器,并留出一分钟时间供手动完成验证。验证可能会出现两次:一次是在搜索关键词时,一次是在打开 BibTeX 页面时
5.有时候直接搜索会无法找到文献,在代码中会自动跳过,需要在之后自行添加(不过这种情况还是比较少的,以中文文献居多,可能换成百度学术要好些)
6.最后结果保存在项目下的bib.txt中