selenium分类爬取谷粉专利数据

最新推荐文章于 2024-03-26 11:12:47 发布

鼠小米

最新推荐文章于 2024-03-26 11:12:47 发布

阅读量321

点赞数 1

分类专栏：网络爬虫　文章标签：python selenium chrome 机器学习 数据挖掘

本文链接：https://blog.csdn.net/sinat_39616953/article/details/108833553

版权

网络爬虫专栏收录该内容

8 篇文章 3 订阅

订阅专栏

方法：利用 Selenium 模拟人工点击页面，爬取所需的数据。

源代码

from selenium import webdriver
import os
import time
# Result-page number the crawl loop starts from; pang_page() also stores the
# last successfully crawled page number here.
page_index1 = 1

# URL of the most recently requested result page (written by pang_page()).
page_path_new = ""

# Seconds to sleep after a navigation; pang_page() adds 2 after every failure.
time_temp = 10

# Query terms sent as the ``q=`` parameter, one crawl per entry
# (presumably patent classification codes -- confirm against the site).
i_index = [
    'H01',
    'H02',
    'H04',
    'G06',
]
# XPath matching every search-result entry on a result page.
_RESULT_ITEM_XPATH = '//*[@id="resultsContainer"]//search-result-item'


def _find_result_items():
    """Return the search-result elements of the page currently loaded in ``browser``."""
    return browser.find_elements_by_xpath(_RESULT_ITEM_XPATH)


def _save_abstract(txt_path):
    """Save the abstract of the detail page currently open into ``txt_path``.

    The file is named after the text of the ``pubnum`` element. This is
    best-effort: any failure (missing element, unwritable file name, ...)
    is reported and skipped so the remaining results still get processed.
    """
    txt_name = None  # stays None when the publication number cannot be read
    try:
        page_name = browser.find_element_by_xpath('//*[@id="pubnum"]').text
        txt_name = txt_path + '\\' + str(page_name) + '.txt'
        # Read the abstract BEFORE opening the file so a missing abstract
        # neither leaks a file handle nor leaves an empty file behind.
        txt_data = browser.find_element_by_xpath('//*[@id="text"]/abstract/div').text
        with open(txt_name, mode='w', encoding='utf-8') as f:
            f.write(txt_data)
        print(page_name, "爬取成功！！！")
    except Exception:
        print(txt_name, "没有摘要，必须跳过")


def _scrape_result_page(item_count, txt_path):
    """Visit each of the ``item_count`` results on the current page and save its abstract.

    Raises whatever Selenium raises when a result link cannot be found or
    clicked; the caller treats that as a failed page.
    """
    # Scroll down so the result entries further down the page are reachable.
    browser.execute_script("var q=document.documentElement.scrollTop=10000")
    os.makedirs(txt_path, exist_ok=True)
    for i in range(1, item_count + 1):
        # Re-locate the link by position each time: elements found earlier
        # are no longer valid after navigating away and back.
        browser.find_element_by_xpath(_RESULT_ITEM_XPATH + '[' + str(i) + ']//a').click()
        time.sleep(time_temp)
        _save_abstract(txt_path)
        browser.back()
        # time_temp starts at 10 and only ever grows, so this stays positive.
        time.sleep(time_temp - 5)


def pang_page(page_path, i1):
    """Crawl the result pages for query ``i1`` and save one abstract per patent.

    Starts on the page already loaded in the module-level ``browser`` and
    walks result pages ``page_index1`` .. 1099. After each page it navigates
    to the next one. When a page fails (including a page that shows no
    results), the wait time ``time_temp`` is increased by 2 seconds and the
    crawl moves on to the next page; if that page has no results either,
    the crawl is considered finished.

    Unlike a recursive retry, the loop counter always matches the page the
    browser is on, so pages are not crawled twice and the call stack does
    not grow with every failure.

    Args:
        page_path: Unused; kept so existing callers keep working.
        i1: Query string placed in the ``q=`` URL parameter and used as the
            name of the output folder under ``F:\\pangZL\\``.

    Raises:
        Exception: ``"集合为空，爬完了"`` when no search results are found,
            either on entry or right after a failed page.

    Side effects:
        Updates the globals ``page_index1`` (last successfully crawled page),
        ``page_path_new`` (URL of the next result page) and ``time_temp``.
    """
    global page_index1, page_path_new, time_temp

    searchResultItems = _find_result_items()
    print(searchResultItems)
    if not searchResultItems:
        raise Exception("集合为空，爬完了")
    item_count = len(searchResultItems)
    txt_path = 'F:\\pangZL\\' + str(i1)

    print("从", page_index1, "页开始")
    for pang_index in range(page_index1, 1100):
        page_path_new = "https://patents.glgoo.top/?q=" + i1 + "&language=CHINESE&page=" + str(pang_index + 1)
        try:
            if not item_count:
                # An empty page is handled like any other failed page: wait
                # longer, move on, and stop if the next page is empty too.
                raise LookupError("no search results on page " + str(pang_index))
            _scrape_result_page(item_count, txt_path)
            print(page_path_new)
            browser.get(page_path_new)
            time.sleep(time_temp)
            # Pages may hold different numbers of results, so recount.
            item_count = len(_find_result_items())
            print(pang_index, "页爬取成功！")
            page_index1 = pang_index
        except Exception:
            print(time_temp, ":时间都这么久了，又来一次!!!!!!")
            print(page_path_new)
            time_temp += 2
            browser.get(page_path_new)
            time.sleep(time_temp)
            searchResultItems = _find_result_items()
            print(searchResultItems)
            if not searchResultItems:
                raise Exception("集合为空，爬完了")
            item_count = len(searchResultItems)


# Location of the chromedriver executable on the machine running the crawl.
chrome_driver = r"C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chromedriver.exe"

# Chrome is started with the --headless and --disable-gpu switches.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Single shared driver instance used by every function in this script.
browser = webdriver.Chrome(
    executable_path=chrome_driver,
    options=chrome_options,
)

# Crawl every query term in turn. pang_page() signals "no more results" by
# raising, so an exception here is the usual way a category ends; the reason
# is printed so that a genuine failure can be told apart from a normal finish.
try:
    for i1 in i_index:
        try:
            # NOTE: WebDriver.get() returns None, so page_path is always None;
            # it is passed on only because pang_page() takes that parameter.
            page_path = browser.get("https://patents.glgoo.top/?q=" + i1 + "&language=CHINESE")
            time.sleep(10)
            page_index1 = 0  # every category starts again from page 0
            pang_page(page_path, i1)
        except Exception as exc:
            # ``except Exception`` (not a bare except) so Ctrl+C still stops
            # the whole script instead of jumping to the next category.
            print(i1, '搞定了小伙子！！！！', exc)
        else:
            print(i1, '搞定了小伙子！！！！')
            page_index1 = 0
finally:
    # Always shut the driver down so no headless Chrome / chromedriver
    # processes are left running after the script ends.
    browser.quit()