from lxml import etree
from selenium import webdriver
import time
import re
from selenium.webdriver.support.ui import Select,WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class LagouSpider(object):
    """Crawl python job listings from lagou.com using Selenium + Chrome.

    Walks the paginated listing, opens every job's detail page in a new
    tab, scrapes a few fields, and accumulates them in ``self.info``.
    """

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.url = 'https://www.lagou.com/jobs/list_python/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput='
        # Accumulated results: one dict per job with 'name'/'salary'/'city'.
        self.info = []

    def run(self):
        """Open the listing page and crawl page after page until the
        'next' button is greyed out (no more pages)."""
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            # FIX: original declared methods with a bogus extra first
            # parameter ('delf') and passed `self` twice; call normally.
            self.parse_list_page(source)
            # Explicitly wait for the pager's last span (the next button)
            # to be present in the DOM before touching it.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]")
                )
            )
            # FIX: find_element_by_xpath was removed in Selenium 4.
            next_btn = self.driver.find_element(
                By.XPATH, "//div[@class='pager_container']/span[last()]"
            )
            # A greyed-out button means there is no next page.
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                # FIX: original did `pass` here, which left the `while True`
                # loop re-scraping the last page forever; stop instead.
                break
            # Click the next-page button via JS to dodge intercepted clicks.
            self.driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(1)

    def parse_list_page(self, source):
        """Extract every job-detail URL from one listing page and visit it."""
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)  # throttle: be polite to the site

    def request_detail_page(self, url):
        """Open *url* in a new tab, scrape it, then return to the listing tab."""
        print(url)
        # Open a new window so the listing page is not navigated away.
        self.driver.execute_script("window.open('%s')" % url)
        # Switch to the freshly opened detail tab.
        self.driver.switch_to.window(self.driver.window_handles[1])
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab and switch back to the listing tab.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Scrape position name, salary and city from a detail page and
        append the record to ``self.info``."""
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath(".//text()")[0].strip()
        # Drop whitespace and the '/' separators lagou puts around the city.
        city = re.sub(r"[\s/]", "", city)
        detail_info = {
            'name': position_name,
            'salary': salary,
            'city': city
        }
        self.info.append(detail_info)
        print(self.info)
        print("=" * 40)  # visual separator between records
if __name__ == '__main__':
    # Entry point: build the spider and start crawling.
    crawler = LagouSpider()
    crawler.run()
# Source: blog post "python爬虫之爬取拉勾职位列表以及职位详情 (selenium+chrome)"
# (copied-in page metadata, commented out so the file stays valid Python)
# 最新推荐文章于 2024-04-04 01:24:48 发布