import random
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import csv
import time
import re
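# Scrapes Web of Science full-record pages with Selenium: for each doc index in
# an existing search session it collects title, authors, source title, DOI,
# year, document type, keywords, addresses, funding, publisher, accession
# number, citation metrics, and author IDs, then appends one row per paper to a
# CSV. Assumes Selenium 3 (the find_element_by_* API), chromedriver.exe next to
# this script, and a live WOS session: the SID embedded in the URL below
# expires, so substitute your own.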
def start_spider():
    browser.get(url)
    time.sleep(2)  # let the record page finish loading before reading it
    page = browser.page_source
    title = browser.find_element_by_class_name("title").text
    print(title)  # paper title
    row.append(title)
    author_list = []
    # Author links carry the title "查找此作者的更多记录" ("find more records by
    # this author"); the <sup> tags hold the matching affiliation numbers.
    authors = browser.find_elements_by_xpath('.//a[@title="查找此作者的更多记录"]')
    authors_num = browser.find_elements_by_tag_name("sup")
    for i in range(len(authors)):
        try:
            author_list.append(authors[i].text + authors_num[i].text)
        except IndexError:
            continue  # fewer <sup> markers than author links
    row.append(author_list)
    # Detailed author info (display name / ResearcherID / ORCID) is pulled from
    # the raw page source further down; an earlier approach that walked the
    # #show_resc_blurb table cell by cell is retired here.
    detailauthor_list = []
    paperinfo = browser.find_element_by_class_name("block-record-info-source")
    sourceTitle = paperinfo.find_element_by_tag_name("value").text  # source (journal) title
    row.append(sourceTitle)
    # Volume/issue and DOI were once read from block-record-info-source-values
    # elements; regexes over the raw page source turned out to be simpler.
    doi = re.findall('DOI:</span>\n<value>(.*?)</value>', page, re.S)
    row.append(doi)
    year = re.findall('出版年:</span>\n<value>(.*?)</value>', page, re.S)  # publication year
    row.append(year)
    doc_type = re.findall('文献类型:</span>(.*?)<', page, re.S)  # document type
    row.append(doc_type)
    keywords_list = []
    try:
        keywords = browser.find_element_by_xpath("//*[@id='records_form']/div/div/div/div[1]/div/div[5]/p")
        for keyword in keywords.find_elements_by_tag_name("a"):
            keywords_list.append(keyword.text)
    except NoSuchElementException:
        pass  # some records have no keyword block
    row.append(keywords_list)
    # Corresponding-author name and address extraction stays disabled; only the
    # reprint-address rows below are collected.
    address_list = []
    all_address3 = browser.find_elements_by_class_name("fr_address_row2")
    for alladdress in all_address3:
        try:
            address_list.append(alladdress.find_element_by_tag_name("a").text)
        except NoSuchElementException:
            continue  # address row without a link
    row.append(address_list)
    fund_list = []
    fund = re.findall('<p class="uni_fund_links">(.*?)</p>', page, re.S)  # funding acknowledgements
    for d in range(len(fund)):
        fund_list.append(str(fund[d]))
    row.append(fund_list)
    publishers = re.findall('出版商</div>\n<p class="FR_field">\n<value>(.*?)</value>', page, re.S)  # publisher
    row.append(publishers)
    # PubMedID = re.findall('PubMed ID:</span>\n<value>(.*?)</value>', page, re.S)
    rucanghao = re.findall('入藏号:</span>\n<value>(.*?)<', page, re.S)  # accession number
    row.append(rucanghao)
    # Citation metrics from the right-hand panel, in page order: cited
    # references, times cited, usage in the last 180 days, usage since 2013.
    nums = browser.find_elements_by_class_name("large-number")
    yinyongdecankaowenxian = nums[0].text  # cited references
    beiyinpingci = nums[1].text  # times cited
    zuijin180 = nums[2].text  # usage count, last 180 days
    year2013 = nums[3].text  # usage count since 2013
    row.append(yinyongdecankaowenxian)
    row.append(beiyinpingci)
    row.append(zuijin180)
    row.append(year2013)
    name = re.findall('<display_name>(.*?)</display_name>', page, re.S)
    Web_of_Science_ResearcherID = re.findall(r'http://www\.researcherid\.com/rid/(.*?)\',\'ResearcherID', page, re.S)
    # (an ORCID regex was sketched here but never enabled)
    for d in range(len(name)):
        detailauthor_list.append(str(name[d]))
        try:
            detailauthor_list.append(Web_of_Science_ResearcherID[d])
        except IndexError:
            continue  # not every author has a ResearcherID
    row.append(detailauthor_list)
    print(row)
    with open(r'D:\duo1.csv', 'a', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(row)
    print("Row written to CSV")
    row.clear()
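
# Refactor sketch (not wired into start_spider above, so the original logic
# stays intact): every re.findall above returns a list, so list-valued cells
# land in the CSV as e.g. "['10.1234/abc']". A small helper returning the
# first match as a plain string gives cleaner cells; first_match is our own
# name, not part of any library.
def first_match(pattern, text, default=''):
    """Return the first regex match in text, stripped, or default if none."""
    matches = re.findall(pattern, text, re.S)
    return matches[0].strip() if matches else default
# Example: row.append(first_match('DOI:</span>\n<value>(.*?)</value>', page))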
if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2,  # 2 = block; skip images for speed
            'permissions.default.stylesheet': 2,  # skip stylesheets
            'javascript': 2  # skip JavaScript
        }
    }
    options.add_experimental_option('prefs', prefs)
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=options)
    browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        """
    })
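    # The CDP call above registers a script that runs before any page JS,
    # redefining navigator.webdriver to undefined so simple bot checks do not
    # see the usual Selenium fingerprint.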
    for index in range(1737, 2000):
        print(index)
        url = ('http://apps.webofknowledge.com/full_record.do?product=WOS'
               '&search_mode=AdvancedSearch&qid=1&SID=5EbBW5UmVxWKGpegUFp'
               '&page=1&doc=' + str(index))
        row = []
        start_spider()
        time.sleep(random.random())  # brief random pause between records
        # browser.close()
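    # Cleanup added here (the original never shut the driver down):
    browser.quit()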
    print("Crawl finished; check the output CSV!")
    # 10463
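
# A quick way to sanity-check the output (a sketch; list-valued fields are
# stored as their Python repr, e.g. "['10.1234/abc']"):
#   import csv
#   with open(r'D:\duo1.csv', encoding='utf-8', newline='') as f:
#       for record in csv.reader(f):
#           print(len(record), record[0])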