# 根据作者姓名在某年到某年所发表的文章，对文章的题目、期刊的影响因子进行爬取
# Crawl the titles and journal impact factors of the papers an author published within a given year range.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
import re
# from threading import Thread
from multiprocessing import Process
from multiprocessing import Manager
import requests
import time
import xlrd
from bs4 import BeautifulSoup
from lxml import etree
import os
# --- Cross-block bookkeeping counters (mutated as globals further down) ---
c = 0  # per-record parse/retry indicator
d = 0  # number of records found without a JCR quartile
e = 0  # consecutive-parse-failure counter driving the re-login fallback

# --- Chromium launch flags for the selenium re-login fallback ---
chrome_options = Options()
for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)
# chrome_options.add_argument('--headless')

# Seed full-record URL; url2 keeps a pristine copy used when url1 is unusable.
url1 = 'http://apps.webofknowledge.com/full_record.do?product=WOS&search_mode=GeneralSearch&qid=4&SID=5ArzJjzffBtmmVcFhzj&page=1&doc=1&cacheurlFromRightClick=no'
url2 = url1

# --- Target author / affiliation configuration ---
tx = 'Xie, X'             # author string matched inside "Author Information"
zz = 'Xie, X'             # author string matched against the "By:" byline
zs = 18282                # last record index to visit (loop stop value)
xm = '谢欣'                # author name written into the CSV output
dz = 'Chinese Acad Sci'   # affiliation substring that must appear in the address table
i = 13803                 # starting record index (substituted into doc=i)
class SpiderMain(object):
    """Drives one Web of Science "General Search" session.

    Holds the POST headers and the two form payloads for an author (AU)
    search restricted to publication years 2012-2021, submits the search,
    and returns the URL of the first full-record hit.
    """

    def __init__(self, sid, kanming):
        """sid: WoS session id embedded in every request.
        kanming: author-name query placed in the AU search field.
        """
        # NOTE(review): attribute name 'hearders' is misspelled but kept
        # unchanged so any external users of this class keep working.
        self.hearders = {
            'Origin': 'https://apps.webofknowledge.com',
            'Referer': 'https://apps.webofknowledge.com/UA_GeneralSearch_input.do?product=UA&search_mode=GeneralSearch&SID=R1ZsJrXOFAcTqsL6uqh&preferencesSaved=',
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        # Payload for the general search: author field, fixed 2012-2021 range.
        self.form_data = {
            'fieldCount': 1,
            'action': 'search',
            'product': 'WOS',
            'search_mode': 'GeneralSearch',
            'SID': sid,
            'max_field_count': 25,
            'formUpdated': 'true',
            'value(input1)': kanming,
            'value(select1)': 'AU',
            'value(hidInput1)': '',
            'limitStatus': 'collapsed',
            'ss_lemmatization': 'On',
            'ss_spellchecking': 'Suggest',
            'SinceLastVisit_UTC': '',
            'SinceLastVisit_DATE': '',
            'range': 'CUSTOM',
            'period': 'Year Range',
            'startYear': '2012',
            'endYear': '2021',
            'update_back2search_link_param': 'yes',
            'ssStatus': 'display:none',
            'ss_showsuggestions': 'ON',
            'ss_query_language': 'auto',
            'ss_numDefaultGeneralSearchFields': 1,
            'rs_sort_by': 'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A'
        }
        # Payload used by delete_history() to clear the session's search history.
        self.form_data2 = {
            'product': 'WOS',
            'prev_search_mode': 'CombineSearches',
            'search_mode': 'CombineSearches',
            'SID': sid,
            'action': 'remove',
            'goToPageLoc': 'SearchHistoryTableBanner',
            'currUrl': 'https://apps.webofknowledge.com/WOS_CombineSearches_input.do?SID=' + sid + '&product=WOS&search_mode=CombineSearches',
            'x': 48,
            'y': 9,
            'dSet': 1
        }

    def craw(self, root_url, i):
        """POST the search form to root_url and return the absolute URL of
        the first full-record link, or None when the request or the parse
        fails.

        i is accepted for interface compatibility but is not used.
        """
        try:
            s = requests.Session()
            r = s.post(root_url, data=self.form_data, headers=self.hearders)
            r.encoding = r.apparent_encoding
            # The first hit's record link is wrapped in <span class="smallV110">.
            match_list = re.findall(r'<span class="smallV110">[\s\S]*?value>', r.text)
            soup = BeautifulSoup(match_list[0], 'html.parser')
            prefix = "http://apps.webofknowledge.com"
            return prefix + soup.a['href']
        except Exception:
            # Best-effort: callers treat a None return as "search failed".
            # (Original had an unreachable tail after the return plus a silent
            # swallow; the dead code is removed, the None contract kept.)
            return None

    def delete_history(self):
        """Remove the accumulated search history for this session."""
        murl = 'https://apps.webofknowledge.com/WOS_CombineSearches.do'
        s = requests.Session()
        s.post(murl, data=self.form_data2, headers=self.hearders)
# Endpoint that SpiderMain.craw POSTs the general-search form to.
root_url = 'https://apps.webofknowledge.com/UA_GeneralSearch.do'
class Html_data:
    """Extracts title/author/year/IF/quartile fields from one WoS full-record page.

    All parsing happens in __init__. Besides filling instance attributes it
    mutates module globals:
      c - 1 when quartile / impact-factor parsing failed, 0 otherwise
      d - incremented when a record carries no JCR quartile
      e - reset to 0 on a successful parse; counts consecutive hard failures
    On a hard parse failure (presumably an expired SID page — TODO confirm)
    the except branch launches Chromium to obtain a fresh SID, redoes the
    search via SpiderMain, and rewrites globals url1 and i so the main loop
    retries the same record.
    """
    def __init__(self, soup):
        # Parsed fields default to '' / 0 when absent from the page.
        self.title = ''
        self.author = ''
        self.abstract = ''
        self.keywords = ''
        self.author_data = ''
        self.data = ''
        self.JCR_quartile=''
        self.Impact_Factor_table=''
        self.FR_field=''
        self.year=''
        self.soup = soup
        self.flag=0   # 1 when the target author string (global tx) appears in Author Information
        self.flag1=0  # 1 when the target affiliation (global dz) appears in the last address table
        global e
        try:
            self.title = soup.find(attrs={'class':'title'}).text.replace('\n','')
            soup1=soup.find_all('td',class_="JCR_quartile")
            r=None
            global d
            if len(soup1)>0:
                # Quartile value is the text content of the first JCR_quartile cell.
                r=re.search('>(.*)<', str(soup1[0]))
            global c
            if r==None:
                d=d+1  # record without a JCR quartile
                c=1
            else:
                c=0
                print(r.group(1))
                self.JCR_quartile=r.group(1)
            # Impact factor: first "<td> value </td>" inside the IF table markup.
            r=re.search('<td> (.*) </td>', str(soup.find_all('table',class_="Impact_Factor_table")))
            print(r)
            if r==None:
                c=1
            else:
                c=0
                self.Impact_Factor_table=r.group(1)
            try:
                # Source block holds the journal info; the publication year is
                # the last 4 characters of the entry after "Published:".
                self.data = soup.find(attrs={'class':'block-record-info block-record-info-source'}).text
                data1=self.data.split('\n')
                data2=data1[data1.index('Published:')+1]
                self.year=data2[-4:]
            except:
                pass
            items = soup.find_all(attrs={'class':'block-record-info'})
            for item in items:
                # Skip blocks carrying extra CSS classes (not plain info blocks).
                if len(item.attrs['class']) > 1:
                    continue
                if 'By:' in item.text:
                    item1=item.find_all('p',class_="FR_field")
                    # Normalize the byline: strip label, newlines and spaces.
                    self.author = item1[0].text.replace('By:','').replace('\n','').replace(' ','').replace(' ]',']')
                    continue
                elif 'Abstract' in item.text:
                    self.abstract = item.text
                    continue
                elif 'Keywords' in item.text:
                    self.keywords = item.text
                    continue
                elif 'Author Information' in item.text:
                    item2=item.find_all('table',class_="FR_table_noborders")
                    # Target author present anywhere in the FR_field paragraphs?
                    if tx in str(item.find_all('p',class_="FR_field")):
                        self.flag=1
                    try:
                        # Last noborders table presumably holds the reprint /
                        # final address — verify against live markup.
                        if dz in item2[len(item2)-1].text:
                            self.flag1=1
                    except:
                        self.author_data = item.text
                    continue
            e=0  # successful parse: clear the consecutive-failure counter
        except:
            # Hard failure: grab a fresh SID via a real browser, redo the
            # search, and step the loop index back so this record is retried.
            browser = webdriver.Chrome('/usr/lib/chromium-browser/chromedriver',options=chrome_options)
            browser.get('https://www.webofscience.com/wos/alldb/basic-search')
            soup = BeautifulSoup(browser.page_source,'lxml')
            r=re.search('"sid":"(.*)"};', str(soup))
            print(r[1])
            browser.quit()
            obj_spider = SpiderMain(r[1], zz)
            global url1
            url1=obj_spider.craw(root_url,0)
            global i
            i=i-1
            e=e+1
            if e>=2:
                # Failed twice in a row on the same record: skip it instead.
                i=i+1
import random

# Pool of User-Agent strings, one per line; entries that fail are pruned.
# (Fixed: file handles are now closed via `with` instead of leaking.)
with open('User-Agent.txt', "rb") as f:
    user_agents = f.readlines()

# Starting from the first record: reset the output CSV.
if i == 1:
    os.system('rm -r 1.csv')
    os.system('touch 1.csv')

# Walk record indices i..zs, fetching each full-record page, parsing it with
# Html_data, and appending matching rows to 1.csv.
while True:
    url = url1
    try:
        # Point the cached full-record URL at record number i.
        url = url.replace('doc=1', 'doc=' + str(i))
    except Exception:
        # url1 can be None after a failed re-login; fall back to the pristine copy.
        url = url2
        url = url.replace('doc=1', 'doc=' + str(i))
    print(url)
    respon = None
    c = 0
    while respon is None:
        # Keep retrying with a random UA until the page downloads.
        user_agent1 = random.choice(user_agents)
        user_agent = str(user_agent1)
        # [2:-5] strips the b'...' repr prefix and the trailing \r\n' tail.
        print(user_agent[2:-5])
        headers = {'User-Agent': user_agent[2:-5]}
        try:
            respon = requests.get(url, headers=headers, timeout=60)
        except Exception:
            # Drop the UA that failed and count the failure.
            user_agents.remove(user_agent1)
            print(len(user_agents))
            c = c + 1
            if c == 3:
                # Three failures in a row: give up on this record, move on.
                url = url1
                i = i + 1
                url = url.replace('doc=1', 'doc=' + str(i))
                print(url)
                c = 0
    if respon:
        html = respon.text
        soup = BeautifulSoup(html, 'lxml')
        html_data = Html_data(soup)
        # Pull the parsed fields off the record object.
        title = html_data.title
        authors = html_data.author
        abstract = html_data.abstract
        authors_data = html_data.author_data
        data = html_data.data
        keywords = html_data.keywords
        year = html_data.year
        # Find the target author's position (b) in the byline.
        authors1 = authors.split(';')
        b = 0
        for a in authors1:
            b = b + 1
            if zz in a:
                # Emit a row only when the affiliation also matched.
                if html_data.flag1 == 1:
                    if html_data.flag == 1:
                        cengci = '通讯'
                    else:
                        cengci = str(b)
                    csv_data = [xm, title, year, cengci, html_data.Impact_Factor_table, html_data.JCR_quartile]
                    c = 0
                    with open('1.csv', encoding='gbk', mode='a', newline='') as f:
                        csv_writer = csv.writer(f)
                        csv_writer.writerow(csv_data)
                    print(csv_data)
                break
    if i == zs:
        break
    i = i + 1