方法:利用selenium模仿人点击页面,爬取需要的数据
源代码
from selenium import webdriver
import os
import time
# Module-level state; the scraping functions rebind the scalar values through
# `global` statements, so these names must stay at module scope.
i_index = ["H01", "H02", "H04", "G06"]  # search queries, crawled one after another
time_temp = 10  # wait in seconds passed to time.sleep(); raised by 2 after every failed page
page_path_new = ""  # URL of the result page requested most recently
page_index1 = 1  # page counter the crawl loop starts from; set to 0 before each crawl
def pang_page(page_path, i1):
    """Scrape the abstracts of the search results for one query, page by page.

    Looks up the ``search-result-item`` elements of the page currently loaded
    in the module-level ``browser`` and then walks the result pages: every
    result link is clicked, the abstract text is written to
    ``<pubnum>.txt`` inside the ``<i1>`` sub-folder of the ``pangZL`` folder
    on drive F, and the next result page is requested.

    Args:
        page_path: Not used for navigation here; it is only forwarded
            unchanged to the recursive retry call.
        i1: Query string (e.g. ``'H01'``). Used as the ``q=`` URL parameter
            and as the name of the output sub-folder.

    Raises:
        Exception: If the loaded page has no ``search-result-item`` elements.
            The calling loop relies on this to stop crawling a query.
    """
    searchResultItems = browser.find_elements_by_xpath(
        '//*[@id="resultsContainer"]//search-result-item')
    print(searchResultItems)
    if not searchResultItems:
        raise Exception("集合为空,爬完了")

    def page_try(page_path, i1, searchResultItems):
        """Walk result pages starting at ``page_index1``, saving each abstract.

        Only the *length* of ``searchResultItems`` is used: results are
        re-located by position on every page, so the elements themselves are
        never touched after navigation.
        """
        global page_index1, page_path_new, time_temp
        print("从", page_index1, "页开始")
        txt_path = 'F:\\pangZL\\' + str(i1)
        for pang_index in range(page_index1, 1100):
            try:
                # Scroll down so that all results of the page are in the DOM.
                browser.execute_script("var q=document.documentElement.scrollTop=10000")
                os.makedirs(txt_path, exist_ok=True)
                # NOTE(review): the item count comes from the page that was
                # loaded when pang_page() was called; a later page with fewer
                # results ends up in the retry branch below.
                for i in range(1, len(searchResultItems) + 1):
                    browser.find_element_by_xpath(
                        '//*[@id="resultsContainer"]//search-result-item[' + str(i) + ']//a'
                    ).click()
                    time.sleep(time_temp)
                    # Pre-bind so the skip message below never hits an unbound
                    # (or stale, previous-item) name when the lookup fails.
                    txt_name = None
                    try:
                        page_name = browser.find_element_by_xpath('//*[@id="pubnum"]').text
                        txt_name = 'F:\\pangZL\\' + str(i1) + '\\' + str(page_name) + '.txt'
                        # Fetch the abstract BEFORE opening the file: a result
                        # without abstract must not leave an empty file behind.
                        txt_data = browser.find_element_by_xpath(
                            '//*[@id="text"]/abstract/div').text
                        with open(txt_name, mode='w', encoding='utf-8') as f:
                            f.write(txt_data)
                        print(page_name, "爬取成功!!!")
                    except Exception:
                        # Best effort: a result that cannot be saved is skipped.
                        print(txt_name, "没有摘要,必须跳过")
                    browser.back()
                    time.sleep(time_temp - 5)
                page_path_new = ("https://patents.glgoo.top/?q=" + i1
                                 + "&language=CHINESE&page=" + str(pang_index + 1))
                print(page_path_new)
                browser.get(page_path_new)
                time.sleep(time_temp)
                print(pang_index, "页爬取成功!")
                page_index1 = pang_index
            except Exception:
                # `Exception` rather than a bare except so that Ctrl-C and
                # SystemExit still stop the crawl.
                print(time_temp, ":时间都这么久了,又来一次!!!!!!")
                page_path_new = ("https://patents.glgoo.top/?q=" + i1
                                 + "&language=CHINESE&page=" + str(pang_index + 1))
                print(page_path_new)
                # Wait longer on every retry.
                time_temp += 2
                browser.get(page_path_new)
                time.sleep(time_temp)
                # NOTE(review): the retry loads page pang_index + 1, but the
                # recursive call restarts its counter at page_index1 (the last
                # page that completed), so printed page numbers can differ
                # from the page actually shown afterwards - confirm intent.
                # When the reloaded page is empty this call raises, which is
                # how the crawl of a query ends.
                pang_page(page_path, i1)

    page_try(page_path, i1, searchResultItems)
# Start a headless Chrome session driven by a locally installed chromedriver.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_driver = r"C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chromedriver.exe"
browser = webdriver.Chrome(options=chrome_options, executable_path=chrome_driver)
try:
    # Crawl every query in turn; one failing query must not stop the others.
    for i1 in i_index:
        # Keep the URL itself: browser.get() returns None, so its result is
        # useless as the `page_path` argument.
        page_path = "https://patents.glgoo.top/?q=" + i1 + "&language=CHINESE"
        try:
            browser.get(page_path)
            time.sleep(10)
            page_index1 = 0
            pang_page(page_path, i1)
            print(i1, '搞定了小伙子!!!!')
            page_index1 = 0
        except Exception as err:
            # pang_page() raises once a page has no results, so an exception is
            # the normal way a query finishes; the reason is printed as well so
            # that real failures (network, driver) are not mistaken for that.
            print(i1, '搞定了小伙子!!!!', err)
finally:
    # Always shut the driver down, otherwise the headless Chrome process
    # stays alive after the script ends.
    browser.quit()