应用场景:毕业论文的参考文献往往多达几十篇,逐条调整引用格式(如标点后需加空格;作者超过三人时,第三人之后需改为“,等”)十分繁琐。虽然有各种文献管理软件,但利用自己所学的技能批量获取文献引用也挺有趣。
核心:selenium + 知网。selenium 是 Python 写爬虫常用的库,不了解的小伙伴可以先在 CSDN 上搜索一下,做好 selenium 的前置准备(安装 selenium 以及与浏览器版本匹配的驱动)。本文 selenium 使用的浏览器为 Google Chrome。
具体逻辑:首先将文献下载保存至一个文件夹,文件命名格式为 " XXXXXX_作者.pdf "(知网下载pdf默认格式),然后通过python提取文件名,利用selenium访问知网实现搜索、引用的功能,然后抓取该文献引用格式。
详细代码如下:
其中 file_address 即为保存文献的文件夹路径,修改为自己的路径即可。另外,脚本会读取相对路径 JS/stealth.min.js(常用于规避网站对自动化浏览器的检测),运行前需自行准备该文件。
import os
import random
import re
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
# Randomised pause length, uniformly drawn from [2, 4) seconds.
time_s = 2 + 2 * random.random()

# Folder that holds the downloaded papers. To prompt for it instead, use:
#   file_address = input("文献文件夹:")
file_address = r"C:\Users\86156\Desktop\研二下\开题报告\文献\开挖变形\监测"

# Raw citation strings collected from CNKI, one per successfully matched paper.
cite_box = list()

# File names / titles for which no citation could be retrieved.
default = list()
# Locators for the CNKI home page, the result table and the quote dialog.
_SEARCH_INPUT_XPATH = "/html/body/div[2]/div[2]/div/div[1]/input[1]"
_SEARCH_BUTTON_XPATH = "/html/body/div[2]/div[2]/div/div[1]/input[2]"
# First link in the third column of result row {}; compared with the author
# taken from the file name.
_ROW_AUTHOR_XPATH = "/html/body/div[3]/div[2]/div[2]/div[2]/form/div/table/tbody/tr[{}]/td[3]/a[1]"
# "Quote" icon of result row {}.
_ROW_QUOTE_CSS = "#gridTable > table > tbody > tr:nth-child({}) > td.operat > a.icon-quote"
# Citation text in the first row of the quote dialog.
_CITATION_CSS = "#layui-layer1 > div.layui-layer-content > table > tbody > tr:nth-child(1) > td.quote-r"


def _split_filename(filename):
    """Split a ``<title>_<author>.pdf`` file name into ``(title, author)``.

    The text after the last underscore is the author (with ".pdf" removed);
    everything before it is the title, with any remaining underscores dropped.
    Returns ``None`` when the name contains no underscore.
    """
    stem, sep, tail = filename.rpartition("_")
    if not sep:
        return None
    return stem.replace("_", ""), tail.replace(".pdf", "")


def _fetch_citation(driver, title, author):
    """Search CNKI for *title* and return the citation of the matching row.

    Result rows are scanned from the top; the first row whose author link text
    equals *author* is used. Returns the citation text shown in the quote
    dialog, or ``None`` when no row matches.

    Raises:
        WebDriverException: if the search form, the quote icon or the quote
            dialog cannot be located or operated.
    """
    driver.get('https://www.cnki.net/')
    time.sleep(0.25)
    # Type the title into the search box and submit the search.
    search_box = driver.find_element(By.XPATH, _SEARCH_INPUT_XPATH)
    search_button = driver.find_element(By.XPATH, _SEARCH_BUTTON_XPATH)
    ActionChains(driver) \
        .send_keys_to_element(search_box, title) \
        .click(search_button) \
        .perform()
    time.sleep(0.5)

    row = 1
    while True:
        try:
            row_author = driver.find_element(By.XPATH, _ROW_AUTHOR_XPATH.format(row)).text
        except WebDriverException:
            # Ran past the last row (or there were no results at all).
            return None
        if row_author == author:
            # Address the quote icon by the same row position as the author
            # cell, so the result count no longer has to be read and parsed.
            quote_icon = driver.find_element(By.CSS_SELECTOR, _ROW_QUOTE_CSS.format(row))
            ActionChains(driver) \
                .click(quote_icon) \
                .perform()
            time.sleep(0.5)
            citation = driver.find_element(By.CSS_SELECTOR, _CITATION_CSS).text
            time.sleep(0.5)
            return citation
        row += 1


# Walk every file in the folder and collect one citation per paper.
file_names = os.listdir(file_address)

# The stealth script is identical for every browser session, so read it once.
with open('JS/stealth.min.js', encoding='utf-8') as js_file:
    stealth_js = js_file.read()

for f in file_names:
    parsed = _split_filename(f)
    if parsed is None:
        # Not in the "<title>_<author>" shape: report the file as failed.
        default.append(f)
        continue
    name, peo = parsed

    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(5)
        # Inject the stealth script before any page script runs.
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': stealth_js})
        cite_info = _fetch_citation(driver, name, peo)
    except WebDriverException:
        # A page-level failure for one paper must not abort the whole batch.
        cite_info = None
    finally:
        # Always close the browser; otherwise one Chrome instance leaks per file.
        driver.quit()

    if cite_info is None:
        default.append(name)
    else:
        cite_box.append(cite_info)
# Maps each half-width punctuation mark to itself followed by a space.
_PUNCTUATION_SPACING = str.maketrans({",": ", ", ".": ". ", ":": ": "})


def _format_citation(raw):
    """Return *raw* with the author list abbreviated and punctuation spaced.

    The citation is split on "."; the first segment is the comma-separated
    author list and the next two segments are kept as the body. Segments after
    the third period are dropped. With more than three authors, everything
    after the third is replaced by "等". A citation with fewer than three
    period-separated segments is returned unchanged instead of raising.
    """
    segments = raw.split(".")
    if len(segments) < 3:
        return raw
    authors = segments[0].split(",")
    if len(authors) > 3:
        authors = authors[:3] + ["等"]
    # Re-insert the period between the two body segments so that the spacing
    # pass below separates them as well.
    rebuilt = ",".join(authors) + "." + segments[1] + "." + segments[2] + "."
    return rebuilt.translate(_PUNCTUATION_SPACING)


print("获取引用文献 ", len(cite_box), " 篇。")
for ci in cite_box:
    print(_format_citation(ci))
# List every file name / title that yielded no citation, if there are any.
if default:
    print("============失败文件:===========")
    for failed in default:
        print(failed)
例:
文件夹内容如下:
获取结果如下:
缺点:缺点很多,包括英文文章没有考虑,文献命名格式固定等等,不过作为一个小demo玩玩还是可以的。