使用 Selenium 按分类号爬取谷粉专利数据

方法:利用 Selenium 模拟人工点击页面,逐条打开搜索结果并爬取所需的数据

源代码

from selenium import webdriver
import os
import time
# Index of the results page the crawl loop starts from; the main loop resets
# it to 0 before each query and pang_page records the last finished page here.
page_index1 = 1

# URL of the results page most recently requested by pang_page.
page_path_new = ""

# Base delay (seconds) passed to time.sleep after page loads and clicks;
# pang_page raises it by 2 each time a page fails.
time_temp = 10

# Query terms to crawl, one full crawl per entry
# (these look like patent classification prefixes -- confirm).
i_index = [
    "H01",
    "H02",
    "H04",
    "G06",
]
def pang_page(page_path,i1):
    """Crawl the result pages for query ``i1`` and save each abstract to disk.

    Starting from the results page currently shown by the module-level
    ``browser``, every search result is opened in turn, its abstract is
    written to ``<publication number>.txt`` inside a folder named after
    ``i1`` under ``F:\\pangZL``, and the next results page is then loaded.

    The function reads and updates the module globals ``page_index1``
    (loop start / last finished page), ``page_path_new`` (last requested
    results URL) and ``time_temp`` (delay in seconds, raised by 2 after
    every failed page).

    Args:
        page_path: Not used by the crawl itself; only forwarded to the
            recursive retry call.
        i1: Query string placed in the results URL; also the name of the
            output sub-folder.

    Raises:
        Exception: When the current page contains no search-result items.
            The retry path relies on this to end the crawl for ``i1``.
    """
    global page_index1, page_path_new, time_temp

    searchResultItems = browser.find_elements_by_xpath('//*[@id="resultsContainer"]//search-result-item')
    print(searchResultItems)
    if not searchResultItems:
        raise Exception("集合为空,爬完了")

    # The output folder depends only on i1, so create it once instead of
    # re-checking it for every single search result.
    txt_path = 'F:\\pangZL\\' + str(i1)
    os.makedirs(txt_path, exist_ok=True)

    # NOTE(review): the number of items is taken from the page that was shown
    # when this call started and is reused for every later page; a page with
    # fewer results makes the click below fail and triggers the retry path.
    item_count = len(searchResultItems)

    print("从",page_index1,"页开始")
    for pang_index in range(page_index1,1100):
        next_page = "https://patents.glgoo.top/?q="+i1+"&language=CHINESE&page="+str(pang_index+1)
        try:
            # Scroll down so that all result entries are rendered/clickable.
            browser.execute_script("var q=document.documentElement.scrollTop=10000")
            for i in range(1, item_count + 1):
                browser.find_element_by_xpath('//*[@id="resultsContainer"]//search-result-item['+str(i)+']//a').click()
                time.sleep(time_temp)
                # Fallback label: guarantees the skip message below never hits
                # an unbound (first item) or stale (later items) file name.
                txt_name = 'search-result-item[' + str(i) + ']'
                try:
                    page_name = browser.find_element_by_xpath('//*[@id="pubnum"]').text
                    txt_name = txt_path + '\\' + str(page_name) + '.txt'
                    # "with" closes the file even when the abstract lookup raises.
                    with open(txt_name, mode='w', encoding='utf-8') as f:
                        txt_data = browser.find_element_by_xpath('//*[@id="text"]/abstract/div').text
                        f.write(txt_data)
                    print(page_name,"爬取成功!!!")
                except Exception:
                    # Best effort: a result without a readable abstract is skipped.
                    print(txt_name,"没有摘要,必须跳过")
                browser.back()
                time.sleep(time_temp-5)
            page_path_new = next_page
            print(page_path_new)
            browser.get(page_path_new)
            time.sleep(time_temp)
            print(pang_index,"页爬取成功!")
            page_index1 = pang_index
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the crawl.
            print(time_temp,":时间都这么久了,又来一次!!!!!!")
            page_path_new = next_page
            print(page_path_new)
            # Wait longer from now on; the failure is assumed to be a slow page.
            time_temp += 2
            browser.get(page_path_new)
            time.sleep(time_temp)
            # NOTE(review): the retry restarts its loop at page_index1 (last
            # finished page) while the browser is already on pang_index + 1,
            # so some pages get visited twice -- kept as in the original.
            pang_page(page_path,i1)
            # The recursive call has taken over the rest of the crawl; do not
            # resume this loop with an out-of-date page index afterwards.
            return


# Start a headless Chrome session through the locally installed chromedriver.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_driver = r"C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chromedriver.exe"
browser = webdriver.Chrome(options=chrome_options, executable_path=chrome_driver)

try:
    # Crawl every query term in turn, always starting from the first results page.
    for i1 in i_index:
        # Keep the URL itself: browser.get() returns None, so the value that
        # used to be handed to pang_page was always None.
        page_path = "https://patents.glgoo.top/?q="+i1+"&language=CHINESE"
        try:
            browser.get(page_path)
            time.sleep(10)
            page_index1=0
            pang_page(page_path,i1)
        except Exception as exc:
            # pang_page ends a crawl by raising once a page has no results, so
            # an exception is the normal way out; report why it stopped so
            # real failures are distinguishable from "no more results".
            print(i1,'停止原因:',exc)
        print(i1,'搞定了小伙子!!!!')
        page_index1=0
finally:
    # Always shut down Chrome/chromedriver, even on Ctrl-C or a crash.
    browser.quit()


  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值