from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time,random
from lxml import etree
def next_click(button):
    """Move the mouse to *button* and click it using an action chain.

    Relies on the module-level ``driver`` (global WebDriver instance).
    """
    global driver
    ActionChains(driver).move_to_element(button).click().perform()
def parse_url(driver):
    """Collect all job-posting links from the current results page.

    Parses ``driver.page_source`` with lxml and returns the ``href``
    values of every ``<a class="position_link">`` (possibly empty).
    """
    tree = etree.HTML(driver.page_source)
    links = tree.xpath('//a[@class="position_link"]/@href')
    print(links)
    return links
def parse_page(url):
    """Open a job-detail *url* in the current tab and print its title.

    Uses the module-level ``driver``. Returns the list of text nodes
    matched by ``<h1 class="name">`` (may be empty) — the original
    computed this list but discarded it; returning it makes the
    function usable programmatically while staying backward-compatible
    (callers that ignored the old ``None`` result are unaffected).
    """
    driver.get(url)
    tree = etree.HTML(driver.page_source)
    name = tree.xpath('//h1[@class="name"]/text()')
    print(name)
    return name
def open_url(urls):
    """Visit each job-detail URL in the second browser tab, then switch
    back to the first (listing) tab.

    Uses the module-level ``driver``; assumes a second window handle
    already exists (created earlier via ``window.open()``).
    """
    # FIX: switch_to_window() is deprecated and removed in Selenium 4;
    # switch_to.window() is the supported spelling (available since 2.x).
    driver.switch_to.window(driver.window_handles[1])
    for url in urls:
        time.sleep(1)  # throttle detail-page requests
        parse_page(url)
    driver.switch_to.window(driver.window_handles[0])
# --- Script entry: crawl "python" job listings from lagou.com ---
url = "https://www.lagou.com/"
driver_path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)
driver.get(url)
# print(driver.page_source)

# Dismiss the site-entry confirmation popup.
q = driver.find_element_by_xpath("//p[@class='checkTips']/a")
next_click(q)
time.sleep(5)

# Search for "python" positions.
search = driver.find_element_by_id("search_input")
button = driver.find_element_by_id("search_button")
search.send_keys("python")
next_click(button)
time.sleep(5)

# Create a new browser tab for opening detail pages; keep the
# listing in the first tab.
driver.execute_script("window.open()")
# FIX: switch_to_window() is deprecated and removed in Selenium 4;
# switch_to.window() is the supported spelling (available since 2.x).
driver.switch_to.window(driver.window_handles[0])

while True:
    # Random delay between listing pages to reduce the chance of
    # being rate-limited or blocked.
    time.sleep(random.randint(4, 7))
    urls = parse_url(driver)
    open_url(urls)
    nextbutton = driver.find_element_by_xpath('//span[@action="next"]')
    classname = nextbutton.get_attribute("class")
    # The "next" control carries this class on the last results page.
    if "pager_next_disabled" in classname:
        print("=" * 40)
        print("爬虫over")
        break
    next_click(nextbutton)
# Selenium: Lagou.com job-listing spider
# (original article last published 2021-07-30 23:49:35)