拉勾网Python岗位爬取

最新推荐文章于 2021-11-22 22:01:03 发布

一许流星

最新推荐文章于 2021-11-22 22:01:03 发布

阅读量257

点赞数

分类专栏：爬虫系列

本文链接：https://blog.csdn.net/ywk_hax/article/details/82633815

版权

爬虫系列专栏收录该内容

11 篇文章 1 订阅

订阅专栏

具体流程都在我写的注释中,完整代码如下:

import time
from selenium import webdriver


class LagouSpider(object):
    """Selenium-driven crawler for the Python job listings on lagou.com.

    The spider walks the search-result pages one after another, opens every
    job posting it finds, reads a fixed set of fields from the posting and
    appends them to ``lagou.txt`` in the current working directory.
    """

    # (item key, XPath) pairs read from a job posting page, in output order.
    _FIELD_XPATHS = (
        ('职位', '//span[@class="name"]'),  # job title
        ('公司名', '//div[@class="company"]'),  # company name
        ('工资', '//span[@class="salary"]'),  # salary
        ('工资经验', '//dd[@class="job_request"]/p[1]/span[3]'),  # experience
        ('学历要求', '//dd[@class="job_request"]/p[1]/span[4]'),  # education
        ('发布时间', '//p[@class="publish_time"]'),  # publish time
    )

    def __init__(self):
        """Start Firefox and load the first page of search results."""
        # Search-result URL the crawl starts from.
        self.start_url = "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput="
        self.drive = webdriver.Firefox()

        # Send the request; the browser window now shows the listing page.
        self.drive.get(self.start_url)

    def get_position_link(self):
        """Return the link elements of every job on the current listing page.

        Returns:
            list: elements matching ``//a[@class="position_link"]``; empty
            when the page contains none.
        """
        # Fixed pause so the listing has time to render before it is queried.
        time.sleep(3)
        return self.drive.find_elements_by_xpath('//a[@class="position_link"]')

    def get_position_info(self, position_links):
        """Scrape and save every job posting behind ``position_links``.

        Each posting is opened in its own browser window, so the listing
        window keeps showing the current result page. Navigating the listing
        window to the posting and reloading ``start_url`` afterwards would
        put the crawl back on whatever page ``start_url`` renders, and
        clicking "next" from there could never get past the following page.

        Args:
            position_links: link elements as returned by
                ``get_position_link()``.

        Returns:
            The "next page" element of the listing page, or ``None`` when no
            such element is present (treated by ``run()`` as the last page).
        """
        # Read all hrefs up front, before the browser is asked to do anything
        # else, and drop elements that carry no href at all.
        hrefs = [element.get_attribute('href') for element in position_links]
        position_links_list = [href for href in hrefs if href]

        for link in position_links_list:
            try:
                item = self._scrape_position(link)
            except Exception as exc:
                # Best effort: on a slow connection the fixed wait may not be
                # enough and a field lookup fails. Skip that posting, but say
                # so, and let KeyboardInterrupt/SystemExit propagate.
                print('skipped {}: {!r}'.format(link, exc))
                continue

            # Save each posting as soon as it has been scraped.
            self.save_content_list(item)

        # Look for the "next page" button on the still-loaded listing page.
        next_buttons = self.drive.find_elements_by_xpath('//span[@class="pager_next "]')

        # No match means there is nothing left to click.
        return next_buttons[0] if next_buttons else None

    def _scrape_position(self, link):
        """Open ``link`` in a new window, read its fields, close the window.

        Returns:
            dict: the scraped fields plus the ``'分割线'`` separator entry.

        Raises:
            RuntimeError: if the browser did not open a new window.
            Exception: whatever the driver raises when a field is missing.
        """
        listing_window = self.drive.current_window_handle
        known_windows = set(self.drive.window_handles)

        self.drive.execute_script('window.open(arguments[0]);', link)
        opened = [handle for handle in self.drive.window_handles
                  if handle not in known_windows]
        if not opened:
            # Never fall through to close(): that would close the listing.
            raise RuntimeError('browser did not open a new window')

        self.drive.switch_to.window(opened[0])
        try:
            # Fixed pause so the posting has time to render.
            time.sleep(5)
            return self._read_position_item()
        finally:
            # Always return to the listing window, even when scraping failed.
            self.drive.close()
            self.drive.switch_to.window(listing_window)

    def _read_position_item(self):
        """Read the fields in ``_FIELD_XPATHS`` from the page currently shown."""
        find = self.drive.find_element_by_xpath
        item = {}
        for key, xpath in self._FIELD_XPATHS:
            item[key] = find(xpath).text

        # Separator line written after each posting in the output file.
        item['分割线'] = '-' * 30
        return item

    def save_content_list(self, item):
        """Append every value of ``item`` to ``lagou.txt``, one per line.

        Args:
            item: dict of scraped fields. Values are written in the dict's
                iteration order and converted with ``str()``, so non-string
                values no longer raise ``TypeError``.
        """
        lines = ['{}\n'.format(value) for value in item.values()]
        # Open the file once per posting instead of once per field.
        with open('lagou.txt', 'a', encoding='utf8') as f:
            f.writelines(lines)

    def run(self):
        """Crawl every result page, then shut the browser down.

        Repeats "collect links, scrape postings, click next" until
        ``get_position_info()`` reports that there is no next-page button.
        The browser is quit when the crawl ends, whether normally or through
        an exception.
        """
        try:
            while True:
                # Links of all postings on the current listing page.
                position_links = self.get_position_link()

                # Scrape them and get the button leading to the next page.
                next_url = self.get_position_info(position_links)
                if next_url is None:
                    break

                next_url.click()

                # Give the listing time to refresh after the click.
                time.sleep(3)
        finally:
            # Do not leave the browser and its driver process running.
            self.drive.quit()


# Script entry point: build the spider (which opens the browser) and crawl.
if __name__ == "__main__":
    spider = LagouSpider()
    spider.run()

爬取结果:

如果你和我有共同爱好,我们可以加个好友一起交流!

一许流星

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
拉勾网Python岗位爬取

具体流程都在我写的注释中,完整代码如下: import time from selenium import webdriver class LagouSpider(object): def __init__(self): # 准备 start_url self.start_url = "https://www.lagou.com/jobs/list_...
复制链接

扫一扫

专栏目录