Scraping Web of Science with Python

Given an author's name and a range of publication years, the script crawls Web of Science and records each article's title and the journal's impact factor. It uses Selenium to harvest a fresh session ID (SID) from the Web of Science page, submits the author search form with requests, then steps through the result records one by one by rewriting the doc= parameter of full_record.do, parsing each page with BeautifulSoup and appending matched articles to 1.csv.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
import re
# from threading import Thread
from multiprocessing import Process
from multiprocessing import Manager
import requests
import time
import xlrd
from bs4 import BeautifulSoup
from lxml import etree
import os
#os.system('rm -r 1.csv')
#os.system('touch 1.csv')
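# Global state used throughout the script:
#   c, d, e    - ad-hoc flags/counters: c marks a missing field or failed fetch, d counts
#                records without a JCR quartile, e counts consecutive parse failures
#   url1, url2 - seed full_record.do URL (and a fallback copy); its doc= parameter is
#                rewritten for every record, and the SID inside it is refreshed via
#                Selenium whenever the session expires
#   tx, zz     - the target author's name as indexed by WoS (zz is the search term,
#                tx is matched inside each record page)
#   zs         - index of the last record to process (loop exit condition)
#   xm, dz     - the author's Chinese name (written to the CSV) and target affiliation
#   i          - index of the record to start from (resume point)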
c=0
d=0
e=0
chrome_options=Options()
chrome_options.add_argument('--no-sandbox')
#chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-dev-shm-usage')
url1='http://apps.webofknowledge.com/full_record.do?product=WOS&search_mode=GeneralSearch&qid=4&SID=5ArzJjzffBtmmVcFhzj&page=1&doc=1&cacheurlFromRightClick=no'
url2=url1
tx='Xie, X'
zz='Xie, X'
zs=18282
xm='谢欣'
dz='Chinese Acad Sci'
i=13803
class SpiderMain(object):
    def __init__(self, sid, kanming):
        self.headers = {
            'Origin': 'https://apps.webofknowledge.com',
            'Referer': 'https://apps.webofknowledge.com/UA_GeneralSearch_input.do?product=UA&search_mode=GeneralSearch&SID=R1ZsJrXOFAcTqsL6uqh&preferencesSaved=',
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
            'Content-Type': 'application/x-www-form-urlencoded'
        }
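        # Form payload for the WOS GeneralSearch POST: 'value(input1)' carries the author
        # name, 'value(select1)': 'AU' selects the Author search field, and startYear/endYear
        # bound the publication years; SID is the session id harvested from the WoS page.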
        self.form_data = {
            'fieldCount': 1,
            'action': 'search',
            'product': 'WOS',
            'search_mode': 'GeneralSearch',
            'SID': sid,
            'max_field_count': 25,
            'formUpdated': 'true',
            'value(input1)': kanming,
            'value(select1)': 'AU',
            'value(hidInput1)': '',
            'limitStatus': 'collapsed',
            'ss_lemmatization': 'On',
            'ss_spellchecking': 'Suggest',
            'SinceLastVisit_UTC': '',
            'SinceLastVisit_DATE': '',
            'range': 'CUSTOM',
            'period': 'Year Range',
            'startYear': '2012',
            'endYear': '2021',
            'update_back2search_link_param': 'yes',
            'ssStatus': 'display:none',
            'ss_showsuggestions': 'ON',
            'ss_query_language': 'auto',
            'ss_numDefaultGeneralSearchFields': 1,
            'rs_sort_by': 'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A'
        }
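        # Payload used by delete_history(): posted to WOS_CombineSearches.do with
        # action=remove to clear the search history accumulated for this SID.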
        self.form_data2 = {
            'product': 'WOS',
            'prev_search_mode': 'CombineSearches',
            'search_mode': 'CombineSearches',
            'SID': sid,
            'action': 'remove',
            'goToPageLoc': 'SearchHistoryTableBanner',
            'currUrl': 'https://apps.webofknowledge.com/WOS_CombineSearches_input.do?SID=' + sid + '&product=WOS&search_mode=CombineSearches',
            'x': 48,
            'y': 9,
            'dSet': 1
        }
 
    def craw(self, root_url,i):
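        # Submit the author search form and return the absolute URL of the first
        # full-record page (full_record.do with doc=1); returns None if anything fails.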
        try:
            s = requests.Session()
            r = s.post(root_url, data=self.form_data, headers=self.headers)
            #print(r)
            r.encoding = r.apparent_encoding
            #print(r.text)
            # pattern that captures the <span class="smallV110"> block wrapping the link to the first record
            re_text1 = r'<span class="smallV110">[\s\S]*?value>'
            match_list = re.findall(re_text1, r.text)
            #print(match_list[0])
            soup = BeautifulSoup(match_list[0], 'html.parser')
            prefix = "http://apps.webofknowledge.com"
            #print(prefix+soup.a['href'])
            return prefix+soup.a['href']
        except Exception:
            # any failure (e.g. an expired SID or an empty result page) makes craw()
            # return None; the main loop then falls back to url2
            pass
    def delete_history(self):
        murl = 'https://apps.webofknowledge.com/WOS_CombineSearches.do'
        s = requests.Session()
        s.post(murl, data=self.form_data2, headers=self.headers)
root_url = 'https://apps.webofknowledge.com/UA_GeneralSearch.do'
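# Html_data parses one full-record page: title, JCR quartile, impact factor, publication
# year, author list, abstract and keywords, plus two flags -- flag is set when the target
# author (tx) appears in the Author Information fields, flag1 when the target affiliation
# (dz) appears in the address table. If parsing fails, the except branch launches Chromium
# via Selenium to harvest a fresh SID, re-runs the search through SpiderMain to rebuild
# url1, and rewinds the record index i so the page is retried (after two consecutive
# failures, counted in e, the record is skipped).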
class Html_data:
    def __init__(self, soup):
        self.title = ''
        self.author = ''
        self.abstract = ''
        self.keywords = ''
        self.author_data = ''
        self.data = ''
        self.JCR_quartile=''
        self.Impact_Factor_table=''
        self.FR_field=''
        self.year=''
        self.soup = soup
        self.flag=0
        self.flag1=0
        global e
        try:
            self.title = soup.find(attrs={'class':'title'}).text.replace('\n','')  
            soup1=soup.find_all('td',class_="JCR_quartile")
            #print(len(soup1))   
            r=None
            global d
            if len(soup1)>0: 
                r=re.search('>(.*)<', str(soup1[0]))
        #print(str(soup.find_all('td',class_="JCR_quartile")))
            global c
            if r==None:
                d=d+1
                c=1
            else:
                c=0
                print(r.group(1))
                self.JCR_quartile=r.group(1)
            r=re.search('<td> (.*) </td>', str(soup.find_all('table',class_="Impact_Factor_table")))
            print(r)
            if r==None:
                c=1
            else:
                c=0
                self.Impact_Factor_table=r.group(1)    
            try:
                self.data = soup.find(attrs={'class':'block-record-info block-record-info-source'}).text
                data1=self.data.split('\n')
                data2=data1[data1.index('Published:')+1]
                self.year=data2[-4:]
            #print()
            except:
                pass
        
            items = soup.find_all(attrs={'class':'block-record-info'})
            for item in items:
                if len(item.attrs['class']) > 1:
                    continue
                if 'By:' in item.text:
                    item1=item.find_all('p',class_="FR_field")
                    self.author = item1[0].text.replace('By:','').replace('\n','').replace('  ','').replace(' ]',']')
                #self.author = item.text.replace('By:','').replace('\n','').replace('  ','').replace(' ]',']')            
                    continue
                elif 'Abstract' in item.text:
                    self.abstract = item.text
                    continue
                elif 'Keywords' in item.text:
                    self.keywords = item.text
                    continue
                elif 'Author Information' in item.text:
                    item2=item.find_all('table',class_="FR_table_noborders")
                #print(item.find_all('p',class_="FR_field")[1].text)
                    # heuristic: the target author (tx) appears in the Author Information
                    # (reprint-address) fields, so the row is later marked as corresponding author
                    if tx in str(item.find_all('p',class_="FR_field")):
                        self.flag=1
                    #print(self.flag)
                #if 'Tianjin Univ, Sch Mat Sci & Engn, Tianjin 300072, Peoples R China' in item2[len(item2)-1].text:
                #print(item2[len(item2)-1].text)
                    try:
                        if dz in item2[len(item2)-1].text:  # target affiliation (dz) appears in the address table
                            self.flag1=1
                        
                    except:
                        self.author_data = item.text 
                    continue
            e=0
        except:
            browser = webdriver.Chrome('/usr/lib/chromium-browser/chromedriver',options=chrome_options)
            browser.get('https://www.webofscience.com/wos/alldb/basic-search')
            #print(browser.page_source)
            soup = BeautifulSoup(browser.page_source,'lxml')
            r=re.search('"sid":"(.*)"};', str(soup))
            print(r[1])
            browser.quit()
            obj_spider = SpiderMain(r[1], zz)
            global url1
            url1=obj_spider.craw(root_url,0)
            global i
            i=i-1
            e=e+1
            if e>=2:
                i=i+1
import random
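# User-Agent.txt is expected to hold one User-Agent string per line; the file is read in
# binary mode, so each line is later wrapped in str() and sliced with [2:-5] to strip the
# leading b' and the trailing \r\n' of the bytes repr (this assumes Windows-style line endings).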
f=open('User-Agent.txt',"rb")
user_agents =f.readlines()
f.close()
if i==1:
    os.system('rm -r 1.csv')
    os.system('touch 1.csv')
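# Main loop: step through the search results record by record by rewriting the doc=
# parameter of the seed URL, fetch each full-record page with a randomly chosen
# User-Agent, parse it with Html_data, and append articles by the target author (with
# the target affiliation) to 1.csv; stop once record index zs is reached.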
while True:
    url=url1
    try:
        url=url.replace('doc=1','doc='+str(i))
    except:
        url=url2
        url=url.replace('doc=1','doc='+str(i))
    print(url)
    respon=None
    c=0
    while respon==None:
        user_agent1=random.choice(user_agents)
        user_agent=str(user_agent1)
        print(user_agent[2:-5])
        headers = {'User-Agent':user_agent[2:-5]}
        try:
            respon = requests.get(url, headers =headers,timeout=60)
        except:
            user_agents.remove(user_agent1)
            print(len(user_agents))
        c=c+1
        if c==3:
            url=url1
            i=i+1
            url=url.replace('doc=1','doc='+str(i))
            print(url)
            c=0
    #print(respon)
    #print(22222222222)
    if respon:
        html = respon.text
        soup = BeautifulSoup(html,'lxml')
        html_data = Html_data(soup)
        #print(soup) 
        # pull the parsed fields off the Html_data object
        title = html_data.title 
        authors = html_data.author
        abstract = html_data.abstract
        authors_data = html_data.author_data
        data = html_data.data
        keywords = html_data.keywords   
        year=html_data.year 
        authors1=authors.split(';')
        b=0
        for a in authors1:
            b=b+1
            #if 'Su, Yan Qing' in a or 'Su, Yanqing' in a or 'Su Yanqing' in a or 'Su, Yan-Qing' in a:  # example: matching several spellings of a different author's name
            if zz in a:
                if html_data.flag1==1:
                    if html_data.flag==1:
                        cengci='通讯'  # '通讯' = corresponding author
                    else:
                        cengci=str(b)
                    csv_data = [xm, title, year, cengci, html_data.Impact_Factor_table, html_data.JCR_quartile]  # output row: name, title, year, author position, impact factor, JCR quartile
                    c=0
                    f=open('1.csv', encoding='gbk', mode='a', newline='')
                    csv_writer = csv.writer(f)
                    csv_writer.writerow(csv_data)
                    f.close()
                    print(csv_data)
                break
    if i==zs:
        break
    i=i+1
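
Once a run finishes, the rows appended to 1.csv can be read back for a quick check. Below is a minimal sketch, not part of the original script: the file name, gbk encoding, and six-column layout are taken from the csv_data list above.

import csv

# read back the rows written by the crawler:
# name, title, year, author position, impact factor, JCR quartile
with open('1.csv', encoding='gbk', newline='') as f:
    for row in csv.reader(f):
        if len(row) == 6:
            name, title, year, position, impact_factor, quartile = row
            print(year, impact_factor, quartile, title)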