from selenium import webdriver
from lxml import etree
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import re
# Request headers for lagou.com; User-Agent and Cookie are placeholders
# ("替换成自己的即可" = "replace with your own").
# BUG FIX: the original used typographic quotes (“ ”), which are a
# SyntaxError in Python — replaced with straight ASCII quotes.
headers = {
    "User-Agent": "替换成自己的即可",
    "Cookie": "替换成自己的即可",
    "Referer": "https://www.lagou.com/",
    "Origin": "https://www.lagou.com",
}
class LagouSpider(object):
    """Scrape Python job postings from lagou.com with a Selenium-driven Chrome.

    Walks every listing page, opens each job detail page in a new tab,
    extracts title / salary / city / experience / education, and prints it.
    """

    # Path to the local chromedriver binary.
    driver_path = r"D:\Chrome Driver\chromedriver.exe"

    def __init__(self):
        # BUG FIX: the original defined ``init`` (no dunders), so the
        # constructor never ran and ``self.driver`` was never created.
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        # BUG FIX: the original wrapped the URL in typographic quotes
        # (“ ”), which is a SyntaxError.
        self.url = "https://www.lagou.com/jobs/list_Python/p-city_3-gm_6?px=default#filterBox"

    def run(self):
        """Open the listing page and walk every result page until the
        'next' button is disabled."""
        self.driver.get(self.url)
        while True:
            try:
                # BUG FIX: wait for the pager *before* reading page_source;
                # the original grabbed the source first and could parse a
                # half-rendered listing.
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_all_elements_located(
                        (By.XPATH, '//div[@class="pager_container"]/span[last()]')))
                source = self.driver.page_source
                self.parse_list_page(source)
                # Selenium 4 removed find_element_by_xpath; find_element
                # with a By locator works on both Selenium 3 and 4.
                btn = self.driver.find_element(
                    By.XPATH, '//div[@class="pager_container"]/span[last()]')
                if "pager_next_disabled" in btn.get_attribute("class"):
                    break  # last page reached
                btn.click()
                time.sleep(2)  # small pause between listing pages
            except Exception:
                # Likely throttled or slow to render; back off and retry.
                # (Narrowed from a bare ``except:`` which also swallowed
                # KeyboardInterrupt/SystemExit.)
                time.sleep(20)

    def parse_list_page(self, source):
        """Extract every job-detail link from a listing page and visit it.

        :param source: HTML of one listing page.
        """
        try:
            html = etree.HTML(source)
            links = html.xpath("//a[@class='position_link']/@href")
            for link in links:
                self.request_detail_page(link)
                time.sleep(1)  # be polite between detail requests
        except Exception:
            time.sleep(20)

    def request_detail_page(self, url):
        """Open *url* in a new tab, parse it, then return to the listing tab.

        :param url: absolute URL of one job-detail page.
        """
        try:
            self.driver.execute_script("window.open('%s')" % url)
            # BUG FIX: use the public ``switch_to`` property, not the
            # private ``_switch_to`` attribute.
            self.driver.switch_to.window(self.driver.window_handles[1])
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_all_elements_located((By.XPATH, "//h1[@class='name']")))
            source = self.driver.page_source
            self.parse_source_page(source)
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])
        except Exception:
            time.sleep(20)

    def parse_source_page(self, source):
        """Parse one job-detail page and print the extracted fields.

        :param source: HTML of one job-detail page.
        """
        html = etree.HTML(source)
        position_name = html.xpath("//h1[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        job_salary = job_request_spans[0].xpath("./text()")
        citypre = job_request_spans[1].xpath("./text()")
        # The span text looks like "/Beijing /": keep what is between the
        # slashes. Raw strings for every regex (the originals were not raw).
        city = re.search(r'/(.+)/', str(citypre)).group(1)
        job_jingyan_pre = job_request_spans[2].xpath("./text()")
        job_jingyan = re.search(r"\['(.+)/", str(job_jingyan_pre)).group(1)
        job_xueli_pre = job_request_spans[3].xpath("./text()")
        job_xueli = re.search(r"\['(.+)/", str(job_xueli_pre)).group(1)
        position = {
            '职业名称': position_name,
            '工资': job_salary,
            '城市': city,
            '工作经验': job_jingyan,
            '学历': job_xueli
        }
        print(position)
# Script entry point.
# BUG FIX: the original tested ``name == ‘main’`` — missing the dunders and
# using typographic quotes (a SyntaxError). Use the standard guard so the
# spider runs only when the file is executed directly, not on import.
if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()