webofscience 半自动化爬虫
需要的包
下面列出本脚本需要导入的包。
import json
import ddddocr
import pandas as pd
import requests
import xlrd
import os
from seleniumwire import webdriver
import time
from selenium.webdriver.chrome.options import Options
主函数
def mains():
    """Run the scraper until every pending search term has been handled.

    Each browser session logs in, opens Web of Science and works through the
    terms returned by ``get_search_text()``.  When a term fails, the session
    is torn down and a fresh one is started (new proxy, new login), which is
    what the original recursive restart did.  A loop is used instead of
    recursion so repeated failures cannot hit the recursion limit.

    NOTE(review): a term that fails on every attempt is retried forever,
    exactly as before; consider adding a retry cap.
    """
    while _run_session():
        pass


def _run_session():
    """Drive one browser session; return True when a restart is required.

    Relies on ``get_proxy``, ``get_search_text``, ``is_use`` and the global
    ``name1``, none of which are defined in this part of the file.
    """
    url = 'http://www.80lib.com/user/index'
    chorme_options = Options()
    chorme_options.add_argument("disable-infobars")
    chorme_options.add_argument("--start-maximized")
    chorme_options.add_argument("--proxy-server={}".format(get_proxy()))
    driver = webdriver.Chrome(chrome_options=chorme_options, executable_path='chromedriver.exe')
    restart = False
    try:
        driver.get(url)
        login(driver)
        # Walk the portal menus that lead to the Web of Science entry link.
        driver.find_element_by_xpath('//*[@id="main-menu-navigation"]/li[8]/ul/li[3]/a/span[1]').click()
        time.sleep(3)
        driver.find_element_by_xpath(
            '/html/body/div[4]/div[2]/div[2]/section/div/div/div/div[3]/div/div/div[1]/div/a').click()
        time.sleep(3)
        driver.find_element_by_xpath(
            '/html/body/div[4]/div[2]/div[2]/section/div/div/div[1]/div[3]/div/div/div[1]/div/a').click()
        time.sleep(3)
        # The last click opens a new window; continue in the newest one.
        handles = driver.window_handles
        driver.switch_to.window(handles[-1])
        time.sleep(2)
        driver.find_element_by_xpath('//*[@id="p1"]/p[3]/a/font').click()
        time.sleep(5)

        # Index used in the id of the search input ("mat-input-<n>"); it is
        # advanced after every successful search.
        ids = 0
        while not restart:
            list1 = get_search_text()
            if list1 is None:
                break
            for i in list1:
                print(i)
                issue = is_use(i)
                # Keep the original "== False" test: is_use is defined
                # elsewhere and its return type is not visible here.
                if not (issue == False):  # noqa: E712
                    print('{}已处理'.format(i))
                    continue
                # Best-effort clicks; presumably the cookie-consent banner
                # and a button that resets the search form.
                _click_if_present(driver, '//*[@id="onetrust-accept-btn-handler"]')
                _click_if_present(driver, '//*[@id="snSearchType"]/div[3]/button[1]')
                try:
                    _export_term(driver, i, ids)
                    ids += 1
                except Exception as g:
                    with open('error.json', 'a', encoding='utf-8') as f:
                        error_message = {
                            '文件名称:': name1,
                            '错误标题': str(i)
                        }
                        f.write(str(error_message) + '\n')
                    print(str(g))
                    # Abandon this browser; the caller starts a new session.
                    restart = True
                    break
            else:
                # Every term of the current input file was handled.
                os.remove(r'C:\Users\郭晓林\Desktop\2022期刊数据\2022期刊数据\{}'.format(name1))
    finally:
        # quit() ends the whole browser and its driver process; close()
        # would only close the current window.
        driver.quit()
    return restart


def _click_if_present(driver, xpath):
    """Click the element at *xpath*; do nothing if it cannot be clicked."""
    try:
        driver.find_element_by_xpath(xpath).click()
    except Exception:
        # Deliberately ignored: the element is optional on the page.
        pass


def _export_term(driver, term, input_index):
    """Search for *term*, export the result list and save it as a workbook.

    *input_index* is the numeric suffix of the search input's element id.
    Any Selenium or network error propagates to the caller.
    """
    driver.find_element_by_xpath(
        '//*[@id="snSearchType"]/div[1]/app-search-row/div/div[1]/app-select-search-field/wos-select/button').click()
    time.sleep(3)
    driver.find_element_by_xpath('//*[@id="global-select"]/div[1]/div[2]/div[4]').click()
    time.sleep(3)
    driver.find_element_by_xpath('//*[@id="mat-input-{}"]'.format(input_index)).send_keys(term)
    time.sleep(3)
    driver.find_element_by_xpath(
        '/html/body/app-wos/div/div/main/div/div/div[2]/app-input-route/app-search-home/div[2]/div/mat-nav-list/div/span[1]/a/span').click()
    time.sleep(3)
    driver.find_element_by_css_selector('[data-ta="run-search"]').click()
    time.sleep(3)

    # The export API is called with requests, so take the session cookies,
    # the query id and the hit count from the browser.
    current_url = driver.current_url
    mes_dict = set_cookies(driver.get_cookies())
    indexs = driver.find_element_by_xpath(
        '/html/body/app-wos/div/div/main/div/div/div[2]/app-input-route/app-base-summary-component/app-search-friendly-display/div[1]/app-general-search-friendly-display/h1/span').text
    print(indexs)
    cookies = mes_dict['cookie']
    wossid = mes_dict['wossid']
    # The query id sits between "summary/" and "/relevance" in the URL.
    md5 = current_url.split('summary/')[-1].split('/relevance')[0]
    getmessage(cookies, md5, wossid, indexs)
    save_xlsl(term)
    driver.back()
    time.sleep(5)
登录函数
def login(driver):
    """Fill in the login form, solve the captcha with ddddocr and submit.

    The captcha image is taken from the requests selenium-wire captured
    while the page loaded, so the exact image shown in the browser is used.
    The bytes are also saved to ``yzm.png``.

    Args:
        driver: a selenium-wire WebDriver already showing the login page.

    Raises:
        RuntimeError: if no captured request with a response matches the
            captcha URL.  Previously a stale ``yzm.png`` from an earlier run
            was silently recognised instead.
    """
    form = '/html/body/div/div[1]/div/div/form/div'
    driver.find_element_by_xpath(form + '/input[1]').send_keys('账号')
    time.sleep(0.5)
    driver.find_element_by_xpath(form + '/input[2]').send_keys('密码')
    time.sleep(0.5)
    img_tag = driver.find_element_by_xpath(form + '/img').get_attribute('src')

    # Pick the captcha out of the captured traffic; the last match wins.
    captcha = None
    for r in driver.iter_requests():
        # Skip requests that never received a response.
        if r.url == img_tag and r.response is not None:
            captcha = r.response.body
    if captcha is None:
        raise RuntimeError('captcha image request was not captured: {}'.format(img_tag))
    with open('yzm.png', 'wb') as f:
        f.write(captcha)

    ocr = ddddocr.DdddOcr()
    res = ocr.classification(captcha)
    driver.find_element_by_xpath(form + '/input[3]').send_keys(res)
    time.sleep(2)
    driver.find_element_by_xpath(form + '/button').click()
请求数据
def getmessage(cookies, ids, wossid,indexs):
    """Download the export of one search into ``message.txt``.

    The export endpoint is called once per block of 1000 records
    (``markFrom``/``markTo``) and each successful response body is appended
    to ``message.txt``, which is emptied first.  Failed responses are logged
    to ``error.json``.

    Args:
        cookies: value for the ``cookie`` request header.
        ids: query id of the search, sent as ``parentQid``.
        wossid: session id, sent as the ``x-1p-wos-sid`` header.
        indexs: hit count text read from the result page, e.g. ``"12,345"``.

    Returns:
        None.

    NOTE(review): on a non-200 response the record range is not advanced,
    so the same range is requested again on the next iteration and the last
    range(s) are then never fetched.  This is unchanged from the original.
    NOTE(review): ``name1`` is a global defined outside this function.
    """
    page_size = 1000
    # Start from an empty output file.
    with open('message.txt', 'w', encoding='utf-8') as f:
        f.write('')
    num = _export_page_count(indexs, page_size)

    # The headers never change between pages, so build them once.
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'content-length': '5536',
        'content-type': 'application/json',
        'cookie': '{}'.format(cookies.strip()),
        'origin': 'https://www.webofscience.com',
        'pragma': 'no-cache',
        'referer': 'https://www.webofscience.com/wos/alldb/summary/19a7c3c2-d47d-4e6e-b1d6-f548823abb5d-465ceb70/relevance/1(overlay:export/exc)',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'x-1p-wos-sid': '{}'.format(wossid),
    }
    # Telemetry blob recorded from a real browser session and replayed as-is.
    # Adjacent literals are concatenated into one string.
    telemetry = (
        "7a74G7m23Vrp0o5c9362241.75-1,2,-94,-100,Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36,uaend,12147,20030107,zh-CN,Gecko,5,0,0,0,408358,1764935,1536,834,1536,864,1042,731,1536,,cpen:0,i1:0,dm:0,cwen:0,non:1,opc:0,fc:0,sc:0,wrc:1,isc:0,vib:1,bat:1,x11:0,x12:1,8101,0.596337456298,829835983799,0,loc:-1,2,-94,-131,Mozilla/5.0 (Windows;10.0.0;x86;64;) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36-1,2,-94,-101,do_en,dm_en,t_en-1,2,-94,-105,-1,2,-94,-102,0,-1,1,0,1023,831,0;0,-1,1,0,1024,622,0;0,-1,0,0,-1,1004,0;0,-1,1,0,1328,-1,0;0,-1,1,0,1650,-1,0;0,-1,0,0,2108,2108,0;-1,2,-94,-108,-1,2,-94,-110,0,1,8,814,117;1,1,139,814,119;2,1,189,814,120;3,1,226,814,121;4,1,236,813,121;5,1,254,813,122;6,1,261,813,123;7,1,276,812,125;8,1,302,812,126;9,1,318,812,127;10,1,323,812,128;11,1,335,812,129;12,1,339,812,131;13,1,355,811,133;14,1,369,811,134;15,1,372,811,136;16,1,404,811,138;17,1,406,811,141;18,1,417,811,143;19,1,427,811,145;20,1,436,811,147;21,1,449,811,148;22,1,455,811,149;23,1,459,811,150;24,1,471,811,151;25,1,475,811,153;26,1,502,811,155;27,1,507,811,156;28,1,519,811,157;29,1,523,811,158;30,1,540,811,159;31,1,553,811,160;32,1,557,811,161;33,1,564,811,164;34,1,573,811,165;35,1,590,811,168;36,1,599,812,171;37,1,605,812,173;38,1,622,813,177;39,1,628,814,180;40,1,638,816,181;41,1,644,816,184;42,1,658,818,186;43,1,659,820,189;44,1,675,821,189;45,1,676,822,192;46,1,691,824,193;47,1,692,825,195;48,1,706,828,197;49,1,713,830,199;50,1,722,832,200;51,1,751,840,207;52,1,757,843,209;53,1,776,847,210;54,1,788,850,212;55,1,788,855,213;56,1,798,859,214;57,1,807,864,215;58,1,814,868,216;59,1,819,875,217;60,1,839,890,220;61,1,850,898,222;62,1,851,908,223;63,1,860,917,224;64,1,870,925,224;65,1,884,936,224;66,1,884,946,224;67,1,901,958,224;68,1,902,972,224;69,1,914,984,224;70,1,918,998,224;71,1,940,1039,219;72,1,2350,1036,104;73,1,2359,1029,109;74,1,2366,1024,114;75,1,2372,1017,119;76,"
        "1,2379,1011,124;77,1,2389,1004,129;78,1,2395,998,133;79,1,2407,993,135;80,1,2411,988,139;81,1,2422,984,141;82,1,2427,979,145;83,1,2437,974,147;84,1,2443,968,150;85,1,2454,962,154;86,1,2459,955,158;87,1,2472,945,164;88,1,2476,933,169;89,1,2487,918,178;90,1,2491,902,188;91,1,2503,881,200;92,1,2507,860,212;93,1,2516,836,225;94,1,2524,815,237;95,1,2532,796,248;96,1,2539,784,257;97,1,2549,776,268;98,1,2555,768,277;99,1,2565,760,288;194,3,3856,610,400,-1;195,4,3925,610,400,-1;196,2,3925,610,400,-1;340,3,11005,652,239,-1;341,4,11086,652,239,-1;342,2,11087,652,239,-1;522,3,15636,370,582,-1;523,4,15721,370,582,-1;524,2,15721,370,582,-1;525,2,15725,370,582,1716;573,3,16253,448,717,-1;574,4,16332,448,717,-1;575,2,16332,448,717,-1;641,3,17449,428,788,-1;643,4,17508,426,787,-1;644,2,17508,426,787,-1;846,3,21455,374,783,-1;847,4,21560,374,783,-1;848,2,21560,374,783,-1;-1,2,-94,-117,-1,2,-94,-111,-1,2,-94,-109,-1,2,-94,-114,-1,2,-94,-103,2,1950;3,3854;2,19441;3,21454;-1,2,-94,-112,https://www.webofscience.com/wos/alldb/summary/47fd20d5-214b-4386-b211-e1856ecebefa-47364f23/date-descending/1(overlay:export/ext)-1,2,-94,-115,1,523226,32,0,0,0,523194,21566,0,1659671967598,31,17754,0,849,2959,12,0,21567,14529151,0,0658F8C6F8A7E46F8A18B464D4183E0C~0~YAAQfIzQF+wc22OCAQAAt10lbAgSxxizR8bO/ldpz0xVekrTRRE6wohjUVS4VbIkU4YakTFKeoLLOCFiX4hQP7UAp9bCNSK3uQ9c7kAZYLYx0LW0OQY3vaUth2FdHOP12MPKUi61HhyUKhTkzINWEP+euoBfaVSuaEx1drVyj8zwHCus14Ar0F/mGvRjR8dSWGsA32GY9OWALf0QoT4a+GzQ7GSpv3aWiJvgmMCq6eVGqv8AZn9kdGpONvBHssw65cIAJ4U/7gvr2/gFHb8o+WSeSduts2O0HuyXG0LV+bLQtigli/CeX/ZqAKGaGtP9zOcvONGrDOJwXrSF+lCgMc6KXn+/YvC3vI5gGPaA6ON4dgiy+MnMvyyTjSorjYAizDYMS1+Qx4ZfXNlc2ZMMbpXHot26VF6maC/DIIS+~-1~-1~-1,38566,490,1977982692,30261693,PiZtE,81584,41,0,-1-1,2,-94,-106,1,0-1,2,-94,-119,-1-1,2,-94,-122,0,0,0,0,1,0,0-1,2,-94,-123,-1,2,-94,-124,-1,2,-94,-126,-1,2,-94,-127,-1,2,-94,-70,-1752298761;-1489780605;dis;,7;true;true;true;-480;true;24;24;true;false;-1-1,2,-94,-80,5594-1,2,-94,-116,26474184-1,2,-94,-118,219267-1,2,"
        "-94,-129,-1,2,-94,-121,;6;10;0"
    )
    url = 'https://www.webofscience.com/api/wosnx/indic/export/saveToFile'

    markFrom = 1
    markTo = page_size
    for _ in range(num):
        payload = {"parentQid": "{}".format(ids), "sortBy": "date-descending",
                   "displayTimesCited": "true", "displayCitedRefs": "true", "product": "UA", "colName": "ALLDB",
                   "displayUsageInfo": "true", "fileOpt": "othersoftware", "action": "saveToTab",
                   "markFrom": str(markFrom),
                   "markTo": str(markTo), "view": "summary", "isRefQuery": "false", "locale": "en_US",
                   "fieldList": ["AUTHORS", "TITLE", "SOURCE", "CITTIMES", "ACCESSION_NUM", "ABSTRACT",
                                 "AUTHORSIDENTIFIERS", "ISSN_ISBN", "PMID", "CONFERENCE_INFO_SPONSORS", "USAGEIND"],
                   "bm-telemetry": telemetry}
        res = requests.post(url=url, headers=headers, data=json.dumps(payload))
        print(res.status_code)
        if res.status_code == 200:
            with open('message.txt', 'a', encoding='utf-8') as f:
                f.write(res.text)
            markFrom += page_size
            markTo += page_size
            time.sleep(2)
        else:
            with open('error.json', 'a', encoding='utf-8') as f:
                error_message = {
                    '文件名称:': name1,
                    '错误信息': res.text
                }
                f.write(str(error_message) + '\n')


def _export_page_count(indexs, page_size):
    """Return how many *page_size*-record requests cover the hit count text.

    The whole number is parsed, with thousands separators removed.  The
    original used only the first comma group, so ``"1,234,567"`` gave 2 pages
    and ``"1,000"`` gave 2 instead of 1.  Text that does not start with a
    number yields 1, matching the original fallback for short strings.
    """
    tokens = indexs.split()
    digits = tokens[0].replace(',', '') if tokens else ''
    if not digits.isdecimal():
        return 1
    total = int(digits)
    # Ceiling division, with at least one request.
    return max(1, (total + page_size - 1) // page_size)
设置cookies
def set_cookies(cookielist):
    """Build the ``cookie`` header value and pull out the WOSSID session id.

    Args:
        cookielist: iterable of cookie dicts with ``'name'`` and ``'value'``
            keys, as returned by ``driver.get_cookies()``.

    Returns:
        dict with two keys: ``'cookie'`` (the ``name=value;`` pairs joined
        into one string) and ``'wossid'`` (the value of the WOSSID cookie,
        or ``''`` if absent).

    Cookies with other names are ignored.  If a tracked name occurs more
    than once its values are concatenated, as in the original.
    """
    tracked = dict.fromkeys(
        ('_sp_id.840c', 'ak_bmsc', 'bm_sv', 'RT', 'OptanonConsent',
         'OptanonAlertBoxClosed', '_abck', 'bm_sz', 'WOSSID'),
        '',
    )
    for entry in cookielist:
        name = entry['name']
        if name in tracked:
            tracked[name] += entry['value']

    # Header order and the leading spaces on the two "_sp_*" names are kept
    # from the original.  The first, fourth and sixth values are fixed.
    pairs = (
        ('dotmatics.elementalKey', 'SLsLWlMhrHnTjDerSrlG'),
        ('ak_bmsc', tracked['ak_bmsc']),
        ('bm_sz', tracked['bm_sz']),
        (' _sp_ses.840c', '*'),
        ('_abck', tracked['_abck']),
        ('perf_dv5Tr4n', str(1)),
        ('OptanonAlertBoxClosed', tracked['OptanonAlertBoxClosed']),
        ('OptanonConsent', tracked['OptanonConsent']),
        ('RT', tracked['RT']),
        ('bm_sv', tracked['bm_sv']),
        # Fix: the original left out the "=" between this name and value.
        (' _sp_id.840c', tracked['_sp_id.840c']),
    )
    cookie = ''.join('{}={};'.format(key, value) for key, value in pairs)
    return {'cookie': cookie, 'wossid': tracked['WOSSID']}
保存text文件为表格
def save_xlsl(filename):
    """Convert ``message.txt`` into an .xls workbook and record the term as done.

    Args:
        filename: search term; used as the workbook name and appended as one
            line to ``success.txt``.

    NOTE(review): writing ``.xls`` and passing ``encoding`` to ``to_excel``
    are believed to work only on older pandas releases - confirm the pinned
    version before upgrading.
    """
    # quoting=3 equals csv.QUOTE_NONE: quote characters are read literally.
    table = pd.read_table('message.txt', encoding="utf-8", quoting=3)
    workbook_path = r'D:\webofscience\{}.xls'.format(filename)
    table.to_excel(workbook_path, encoding='utf-8')
    done_line = filename + '\n'
    with open('success.txt', 'a', encoding='utf-8') as done_log:
        done_log.write(done_line)
链接: link.
图片:
带尺寸的图片:
居中的图片:
居中并且带尺寸的图片:
当然,我们为了让用户更加便捷,我们增加了图片拖拽功能。
如何插入一段漂亮的代码片
去博客设置页面,选择一款你喜欢的代码片高亮样式,下面展示同样高亮的 代码片
.
// An highlighted block
var foo = 'bar';
生成一个适合你的列表
- 项目
- 项目
- 项目
- 项目
- 项目1
- 项目2
- 项目3
- 计划任务
- 完成任务
创建一个表格
一个简单的表格是这么创建的:
项目 | Value |
---|---|
电脑 | $1600 |
手机 | $12 |
导管 | $1 |
设定内容居中、居左、居右
使用:---------:
居中
使用:----------
居左
使用----------:
居右
第一列 | 第二列 | 第三列 |
---|---|---|
第一列文本居中 | 第二列文本居右 | 第三列文本居左 |
SmartyPants
SmartyPants将ASCII标点字符转换为“智能”印刷标点HTML实体。例如:
TYPE | ASCII | HTML |
---|---|---|
Single backticks | 'Isn't this fun?' | ‘Isn’t this fun?’ |
Quotes | "Isn't this fun?" | “Isn’t this fun?” |
Dashes | -- is en-dash, --- is em-dash | – is en-dash, — is em-dash |
创建一个自定义列表
-
Markdown
- Text-to- HTML conversion tool Authors
- John
- Luke
如何创建一个注脚
一个具有注脚的文本。1
注释也是必不可少的
Markdown将文本转换为 HTML。
KaTeX数学公式
您可以使用渲染LaTeX数学表达式 KaTeX:
Gamma公式展示 $\Gamma(n) = (n-1)!\quad\forall n\in\mathbb N$ 是通过欧拉积分
$$\Gamma(z) = \int_0^\infty t^{z-1}e^{-t}dt\,.$$
你可以在这里(here)找到更多关于 LaTeX 数学表达式的信息。
新的甘特图功能,丰富你的文章
- 关于 甘特图 语法,参考 这儿,
UML 图表
可以使用UML图表进行渲染。 Mermaid. 例如下面产生的一个序列图:
这将产生一个流程图:
- 关于 Mermaid 语法,参考 这儿,
Flowchart流程图
我们依旧会支持flowchart的流程图:
- 关于 Flowchart流程图 语法,参考 这儿.
导出与导入
导出
如果你想尝试使用此编辑器, 你可以在此篇文章任意编辑。当你完成了一篇文章的写作, 在上方工具栏找到 文章导出 ,生成一个.md文件或者.html文件进行本地保存。
导入
如果你想加载一篇你写过的.md文件,在上方工具栏可以选择导入功能进行对应扩展名的文件导入,
继续你的创作。
注脚的解释 ↩︎