# selenium爬取lagou
#
# 最新推荐文章于 2021-03-17 10:27:24 发布
#
# aisigan0481
#
# 阅读量180
#
# 点赞数
#
# 文章标签： python
#
# 原文链接：http://www.cnblogs.com/ForT/p/11152098.html
#
# 版权

from selenium import webdriver
import time
from lxml import etree
import re


class LagouSpider(object):
    """Crawl the lagou.com job list with a Chrome WebDriver.

    ``run`` walks the paginated list page by page. For every job link found
    on a list page, the detail page is opened in a second browser window,
    parsed with lxml, and the extracted fields are printed as a dict.
    """

    # Characters removed from scraped text: "/" separators and all
    # whitespace.  "\xa0" (non-breaking space) is listed explicitly to make
    # the intent obvious, even though "\s" already covers it for str patterns.
    _NOISE_RE = re.compile(r'[/\s\xa0]')

    def __init__(self):
        """Start a Chrome session and remember the list-page URL."""
        self.driver = webdriver.Chrome()
        self.url = "https://www.lagou.com/jobs/list_python?px=default&city=%E5%85%A8%E5%9B%BD#filterBox"

    @classmethod
    def _strip_noise(cls, text):
        """Return *text* with every "/" and whitespace character removed."""
        return cls._NOISE_RE.sub('', text)

    @staticmethod
    def _first_text(texts):
        """Return the first item of *texts* stripped, or "" if it is empty."""
        return texts[0].strip() if texts else ""

    def run(self):
        """Crawl every list page until the "next" button is disabled.

        The browser is always shut down when the crawl ends, whether it
        finished normally or an exception escaped.
        """
        try:
            self.driver.get(self.url)
            while True:
                source = self.driver.page_source
                self.parse_page_list(source)
                # find_element("xpath", ...) is available in both Selenium 3
                # and 4; the find_element_by_* helpers were removed in 4.3.
                next_btn = self.driver.find_element(
                    "xpath", "//div[@class='pager_container']/span[last()]")
                # get_attribute returns None when the attribute is missing.
                if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                    break
                next_btn.click()
                # NOTE(review): presumably the pager refreshes the list
                # asynchronously; pause so the next page_source read does not
                # see the previous page again -- confirm / replace with an
                # explicit wait if needed.
                time.sleep(1)
        finally:
            self.driver.quit()

    def parse_page_list(self, source):
        """Visit every job detail link found in the list-page HTML *source*."""
        html = etree.HTML(source)
        detail_urls = html.xpath("//div/a[@class='position_link']/@href")
        for detail_url in detail_urls:
            self.get_detail_page(detail_url)
            # Throttle between detail pages.
            time.sleep(1)

    def get_detail_page(self, detail_url):
        """Open *detail_url* in a new window, parse it, then close the window.

        Focus is returned to the list window even if parsing fails.
        """
        list_handle = self.driver.current_window_handle
        # Pass the URL as a script argument instead of formatting it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", detail_url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        try:
            self.parse_datail_page(self.driver.page_source)
        finally:
            # Close the detail window and go back to the job list page.
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    def parse_datail_page(self, source):
        """Extract the job fields from detail-page HTML *source*.

        Prints the resulting dict and also returns it. A field whose element
        is not found in the page is reported as an empty string instead of
        raising ``IndexError``.
        """
        html = etree.HTML(source)
        job_request_spans = html.xpath("//dd[@class='job_request']//span")

        def span_text(index):
            # Text of the index-th <span> in the job_request block, or "".
            if index >= len(job_request_spans):
                return ""
            return self._first_text(job_request_spans[index].xpath("./text()"))

        job_name = self._first_text(html.xpath("//div[@class='job-name']/h2/text()"))
        job_salary = span_text(0)
        city = self._strip_noise(span_text(1))
        work_year = self._strip_noise(span_text(2))
        education = self._strip_noise(span_text(3))
        company_name = self._first_text(html.xpath("//h3[@class='fl']//text()"))
        desc = "".join(html.xpath("//dl[@id='job_detail']/dd[@class='job_bt']//text()")).strip()
        desc = self._strip_noise(desc)
        position = {
            "name": job_name,
            "job_salary": job_salary,
            "city": city,
            "work_year": work_year,
            "education": education,
            "company_name": company_name,
            "desc": desc
        }
        print(position)
        return position


if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script, so that
    # importing the module does not open a browser window.
    lagou = LagouSpider()
    lagou.run()