import random
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import csv
import time
import re
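# Scrapes Web of Science full-record pages with Selenium: for each doc index in
# an existing search session it collects title, authors, source title, DOI,
# year, document type, keywords, addresses, funding, publisher, accession
# number, citation metrics, and author IDs, then appends one row per paper to a
# CSV. Assumes Selenium 3 (the find_element_by_* API), chromedriver.exe next to
# this script, and a live WOS session: the SID embedded in the URL below
# expires, so substitute your own.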
def start_spider():
    browser.get(url)
    time.sleep(2)  # let the record page finish loading before reading it
    page = browser.page_source
    title = browser.find_element_by_class_name("title").text
    print(title)  # paper title
    row.append(title)
    author_list = []
    # Author links carry the title "查找此作者的更多记录" ("find more records by
    # this author"); the <sup> tags hold the matching affiliation numbers.
    authors = browser.find_elements_by_xpath('.//a[@title="查找此作者的更多记录"]')
    authors_num = browser.find_elements_by_tag_name("sup")
    for i in range(len(authors)):
        try:
            author_list.append(authors[i].text + authors_num[i].text)
        except IndexError:
            continue  # fewer <sup> markers than author links
    row.append(author_list)
    # Detailed author info (display name / ResearcherID / ORCID) is pulled from
    # the raw page source further down; an earlier approach that walked the
    # #show_resc_blurb table cell by cell is retired here.
    detailauthor_list = []
    paperinfo = browser.find_element_by_class_name("block-record-info-source")
    sourceTitle = paperinfo.find_element_by_tag_name("value").text  # source (journal) title
    row.append(sourceTitle)
    # Volume/issue and DOI were once read from block-record-info-source-values
    # elements; regexes over the raw page source turned out to be simpler.
    doi = re.findall('DOI:</span>\n<value>(.*?)</value>', page, re.S)
    row.append(doi)
    year = re.findall('出版年:</span>\n<value>(.*?)</value>', page, re.S)  # publication year
    row.append(year)
    doc_type = re.findall('文献类型:</span>(.*?)<', page, re.S)  # document type
    row.append(doc_type)
    keywords_list = []
    try:
        keywords = browser.find_element_by_xpath("//*[@id='records_form']/div/div/div/div[1]/div/div[5]/p")
        for keyword in keywords.find_elements_by_tag_name("a"):
            keywords_list.append(keyword.text)
    except NoSuchElementException:
        pass  # some records have no keyword block
    row.append(keywords_list)
    # Corresponding-author name and address extraction stays disabled; only the
    # reprint-address rows below are collected.
    address_list = []
    all_address3 = browser.find_elements_by_class_name("fr_address_row2")
    for alladdress in all_address3:
        try:
            address_list.append(alladdress.find_element_by_tag_name("a").text)
        except NoSuchElementException:
            continue  # address row without a link
    row.append(address_list)
    fund_list = []
    fund = re.findall('<p class="uni_fund_links">(.*?)</p>', page, re.S)  # funding acknowledgements
    for d in range(len(fund)):
        fund_list.append(str(fund[d]))
    row.append(fund_list)
    publishers = re.findall('出版商</div>\n<p class="FR_field">\n<value>(.*?)</value>', page, re.S)  # publisher
    row.append(publishers)
    # PubMedID = re.findall('PubMed ID:</span>\n<value>(.*?)</value>', page, re.S)
    rucanghao = re.findall('入藏号:</span>\n<value>(.*?)<', page, re.S)  # accession number
    row.append(rucanghao)
    # Citation metrics from the right-hand panel, in page order: cited
    # references, times cited, usage in the last 180 days, usage since 2013.
    nums = browser.find_elements_by_class_name("large-number")
    yinyongdecankaowenxian = nums[0].text  # cited references
    beiyinpingci = nums[1].text  # times cited
    zuijin180 = nums[2].text  # usage count, last 180 days
    year2013 = nums[3].text  # usage count since 2013
    row.append(yinyongdecankaowenxian)
    row.append(beiyinpingci)
    row.append(zuijin180)
    row.append(year2013)
    name = re.findall('<display_name>(.*?)</display_name>', page, re.S)
    Web_of_Science_ResearcherID = re.findall(r'http://www\.researcherid\.com/rid/(.*?)\',\'ResearcherID', page, re.S)
    # (an ORCID regex was sketched here but never enabled)
    for d in range(len(name)):
        detailauthor_list.append(str(name[d]))
        try:
            detailauthor_list.append(Web_of_Science_ResearcherID[d])
        except IndexError:
            continue  # not every author has a ResearcherID
    row.append(detailauthor_list)
    print(row)
    with open(r'D:\duo1.csv', 'a', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(row)
    print("Row written to CSV")
    row.clear()
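
# Refactor sketch (not wired into start_spider above, so the original logic
# stays intact): every re.findall above returns a list, so list-valued cells
# land in the CSV as e.g. "['10.1234/abc']". A small helper returning the
# first match as a plain string gives cleaner cells; first_match is our own
# name, not part of any library.
def first_match(pattern, text, default=''):
    """Return the first regex match in text, stripped, or default if none."""
    matches = re.findall(pattern, text, re.S)
    return matches[0].strip() if matches else default
# Example: row.append(first_match('DOI:</span>\n<value>(.*?)</value>', page))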
if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2,  # 2 = block; skip images for speed
            'permissions.default.stylesheet': 2,  # skip stylesheets
            'javascript': 2  # skip JavaScript
        }
    }
    options.add_experimental_option('prefs', prefs)
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=options)
    browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        """
    })
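    # The CDP call above registers a script that runs before any page JS,
    # redefining navigator.webdriver to undefined so simple bot checks do not
    # see the usual Selenium fingerprint.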
    for index in range(1737, 2000):
        print(index)
        url = ('http://apps.webofknowledge.com/full_record.do?product=WOS'
               '&search_mode=AdvancedSearch&qid=1&SID=5EbBW5UmVxWKGpegUFp'
               '&page=1&doc=' + str(index))
        row = []
        start_spider()
        time.sleep(random.random())  # brief random pause between records
        # browser.close()
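    # Cleanup added here (the original never shut the driver down):
    browser.quit()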
    print("Crawl finished; check the output CSV!")
    # 10463
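
# A quick way to sanity-check the output (a sketch; list-valued fields are
# stored as their Python repr, e.g. "['10.1234/abc']"):
#   import csv
#   with open(r'D:\duo1.csv', encoding='utf-8', newline='') as f:
#       for record in csv.reader(f):
#           print(len(record), record[0])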