使用 Selenium 爬取拉勾网职位信息并保存到 MongoDB 数据库

最新推荐文章于 2022-05-19 13:43:17 发布

ShirleyQueen321

最新推荐文章于 2022-05-19 13:43:17 发布

阅读量504

点赞数

分类专栏： python 爬虫 文章标签： selenium

本文链接：https://blog.csdn.net/weixin_40569991/article/details/81948916

版权

python 同时被 2 个专栏收录

30 篇文章 0 订阅

订阅专栏

爬虫

18 篇文章 0 订阅

订阅专栏

import pymongo

from selenium import webdriver
from lxml import etree
import re
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class LagouSpider(object):
    """Crawl Lagou job postings with Selenium and store them in MongoDB.

    The spider walks the paginated job list, opens every posting in a
    separate browser tab, extracts a fixed set of fields with lxml and
    inserts one document per posting into a MongoDB collection.  Every
    parsed posting is also kept in ``self.positions``.
    """

    # The last <span> of the pager is the "next page" button.
    NEXT_BUTTON_XPATH = "//div[@class='pager_container']/span[last()]"
    # Element whose presence signals that a detail page has been rendered.
    JOB_NAME_XPATH = "//div[@class='job-name']/span[@class='name']"

    def __init__(self, driver=None, host='127.0.0.1', port=27017,
                 db_name='test', collection_name='lagou'):
        """
        Create the spider.

        :param driver: an existing WebDriver to reuse; when omitted a new
            Chrome driver is started and is quit again by :meth:`close`
        :param host: MongoDB host
        :param port: MongoDB port
        :param db_name: name of the database the postings are written to
        :param collection_name: name of the collection inside ``db_name``
        """
        # Only quit a browser this instance started itself; an injected
        # driver stays under the caller's control.
        self._owns_driver = driver is None
        self.driver = webdriver.Chrome() if driver is None else driver
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        self.positions = []
        self._mongo_host = host
        self._mongo_port = port
        self._db_name = db_name
        self._collection_name = collection_name
        # Created lazily on the first insert and then reused.
        self._client = None

    def connectDB(self, host, port):
        """
        Open a connection to MongoDB.

        :param host: host name or IP address of the server
        :param port: port number of the server
        :return: a new ``pymongo.MongoClient``
        """
        return pymongo.MongoClient(host=host, port=port)

    def _get_collection(self):
        """Return the target collection, connecting on first use only."""
        if self._client is None:
            self._client = self.connectDB(self._mongo_host, self._mongo_port)
        return self._client[self._db_name][self._collection_name]

    def close(self):
        """Release the MongoDB client and, if this spider started it, the browser."""
        if self._client is not None:
            self._client.close()
            self._client = None
        if self._owns_driver:
            # Clear the flag first so a repeated close() never quits twice.
            self._owns_driver = False
            self.driver.quit()

    def run(self):
        """
        Crawl every page of the job list, then release all resources.

        The loop ends when the "next page" button carries the
        ``pager_next_disabled`` class.  :meth:`close` is always called,
        including when the crawl is aborted by an exception.
        """
        try:
            self.driver.get(self.url)
            while True:
                # Wait for the pager before reading the page, so the source
                # that gets parsed belongs to a fully rendered list.
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_element_located((By.XPATH, self.NEXT_BUTTON_XPATH))
                )
                self.parse_list_page(self.driver.page_source)
                time.sleep(2)
                # find_element(By.XPATH, ...) replaces find_element_by_xpath,
                # which was removed in Selenium 4.3.
                next_btn = self.driver.find_element(By.XPATH, self.NEXT_BUTTON_XPATH)
                # A missing class attribute is treated as "not disabled".
                if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                    break
                next_btn.click()
                time.sleep(1)
        finally:
            self.close()

    def parse_list_page(self, source):
        """
        Visit every job posting linked from one list page.

        :param source: HTML source of the list page
        :return:
        """
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            # Pause between detail pages.
            time.sleep(1)

    def request_detail_page(self, url):
        """
        Open a posting in a new tab, parse it and return to the list tab.

        :param url: URL of the job detail page
        :return:
        :raises RuntimeError: if the browser did not open a new window
        """
        list_handle = self.driver.current_window_handle
        known_handles = set(self.driver.window_handles)
        # Pass the URL as a script argument instead of formatting it into
        # the JavaScript source, so quotes in the URL cannot break the call.
        self.driver.execute_script("window.open(arguments[0])", url)
        new_handles = [h for h in self.driver.window_handles if h not in known_handles]
        if not new_handles:
            raise RuntimeError("no new window was opened for %s" % url)
        self.driver.switch_to.window(new_handles[0])
        try:
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, self.JOB_NAME_XPATH))
            )
            self.parse_detail_page(self.driver.page_source)
        finally:
            # Always close the detail tab and go back to the list tab, even
            # when waiting or parsing failed; otherwise tabs pile up and the
            # driver is left pointing at the wrong window.
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    @staticmethod
    def _first(values, default=""):
        """Return the first item of ``values``, or ``default`` when it is empty."""
        return values[0] if values else default

    @staticmethod
    def _clean(text):
        """Remove all whitespace and ``/`` separator characters from ``text``."""
        return re.sub(r"[\s/]", "", text)

    def parse_detail_page(self, source):
        """
        Extract the posting fields from a detail page and store them.

        The resulting dict is appended to ``self.positions`` and inserted
        into MongoDB.  A field whose element is missing from the page is
        stored as an empty string instead of aborting the crawl.

        :param source: HTML source of the detail page
        :return:
        """
        html = etree.HTML(source)
        position_name = self._first(html.xpath("//span[@class='name']/text()"))

        # The first four spans hold salary, city, experience and education.
        spans = html.xpath("//dd[@class='job_request']//span")[:4]
        span_texts = [self._first(span.xpath("./text()")) for span in spans]
        span_texts += [""] * (4 - len(span_texts))
        salary, city, work_year, education = span_texts

        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = self._first(html.xpath("//h2[@class='fl']/text()")).strip()

        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary.strip(),
            'city': self._clean(city),
            'work_year': self._clean(work_year),
            'education': self._clean(education),
            'desc': desc,
        }
        self.positions.append(position)
        # insert_one replaces Collection.insert, which was removed in
        # PyMongo 4; the client behind the collection is shared by all calls.
        self._get_collection().insert_one(position)

        print("*" * 40)


if __name__ == '__main__':
    # Script entry point: build the crawler and start the crawl right away.
    spider = LagouSpider()
    spider.run()