Tutorial Walkthrough
This tutorial demonstrates crawling the 51job (前程无忧) website. The site has almost no anti-scraping measures, which makes it a friendly target for crawler practice.
Create the Scrapy project
scrapy startproject qianchengwuyou
cd qianchengwuyou
scrapy genspider wuyou jobs.51job.com
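After these commands, the project directory should look roughly like the standard layout generated by scrapy startproject and scrapy genspider:
qianchengwuyou/
    scrapy.cfg
    qianchengwuyou/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            wuyou.py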
Define the fields to scrape (items.py)
import scrapy


class QianchengwuyouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # job title
    Job_title = scrapy.Field()
    # salary level
    Pay_level = scrapy.Field()
    # recruiting company
    Recruitment_unit = scrapy.Field()
    # work location
    Workplace = scrapy.Field()
    # work experience
    hands_background = scrapy.Field()
    # education requirements
    Education_requirements = scrapy.Field()
    # job description (duties + requirements + experience)
    Career_information = scrapy.Field()
    # keywords
    keyword = scrapy.Field()
    # posting date
    day = scrapy.Field()
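For reference, a scrapy.Item behaves like a dict, which is why the pipeline later can simply call dict(item). A minimal sketch (the value here is just a hypothetical example):
item = QianchengwuyouItem()
item["Job_title"] = "大数据开发工程师"   # hypothetical example value
print(dict(item))                        # {'Job_title': '大数据开发工程师'}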
Write the spider (wuyou.py)
import scrapy
import re
from ..items import QianchengwuyouItem


class WuyouSpider(scrapy.Spider):
    name = 'wuyou'
    allowed_domains = ['search.51job.com', 'jobs.51job.com']
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,3,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare=']
    # Alternative: build the listing URLs for pages 1-800 up front instead of
    # following the "next page" link in parse():
    # for a in range(1, 800):
    #     print('crawling listing page ' + str(a))
    #     url = ('https://search.51job.com/list/000000,000000,0000,00,3,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,' + str(a) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare=')
    #     start_urls.append(url)
    def parse(self, response):
        # collect the links of all job postings on the current listing page
        all_list = response.xpath('//*[@id="resultList"]//div[@class="el"]')
        for b in all_list:
            all_url = b.xpath('./p/span/a/@href').extract_first()
            yield scrapy.Request(
                all_url,
                callback=self.parse_details
            )
            print(all_url)
        # follow the "next page" (下一页) link until there is none left
        next_url = response.xpath("//*[@id='resultList']//div[@class='p_in']//li/a[text()='下一页']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )
        else:
            print("crawl finished!")
        # selectors = response.xpath('//div[@class="el"]')
        # for selector in selectors:
        #     # job title and posting date taken from the listing page (unused)
        #     post = selector.xpath('./p/span/a/text()').get(default='000 ').replace('\r\n', '')
        #     day = selector.xpath('./span[4]/text()').get(default='000 ')
        #     print(post, day)
    def parse_details(self, response):
        # parse the job detail page
        print("=" * 100)
        print(response.url)
        print("crawling job details ...")
        item = QianchengwuyouItem()
        # job title
        item["Job_title"] = response.xpath("//div[@class='cn']/h1/text()").extract()
        item["Job_title"] = [i.strip() for i in item["Job_title"]]
        item["Job_title"] = [i for i in item["Job_title"] if len(i) > 0]
        item["Job_title"] = "".join(item["Job_title"]).replace(",", ",")
        # salary level
        item["Pay_level"] = response.xpath("//div[@class='cn']/strong/text()").extract_first()
        # recruiting company
        item["Recruitment_unit"] = response.xpath("//div[@class='cn']//a[1]/text()").extract_first()
        # work location + experience + education all live in //div[@class='cn']/p[2]
        item["Workplace"] = response.xpath("//div[@class='cn']/p[2]/text()[1]").get().replace('\xa0', '')
        # work experience
        item["hands_background"] = response.xpath("//div[@class='cn']/p[2]/text()").extract()
        item["hands_background"] = [i.strip() for i in item["hands_background"]]
        item["hands_background"] = [i for i in item["hands_background"] if "经验" in i]
        item["hands_background"] = " ".join(item["hands_background"]).replace("\xa0", "")
        if len(item["hands_background"]) == 0:
            item["hands_background"] = "无"
        # the second text node is either the experience requirement or the education requirement
        all = response.xpath("//div[@class='cn']/p[2]/text()[2]").get().replace('\xa0', '')
        if len(all) == 2:
            # the node itself is the education requirement (e.g. "本科")
            item["Education_requirements"] = all
        elif len(all) >= 4:
            # the node is the experience requirement, so education is the third text node
            item["Education_requirements"] = response.xpath("//div[@class='cn']/p[2]/text()[3]").get().replace('\xa0', '')
            if len(item["Education_requirements"]) != 2:
                item["Education_requirements"] = "无"
        else:
            item["Education_requirements"] = "无"
        # job description (duties + requirements + experience)
        item["Career_information"] = response.xpath("//div[@class='bmsg job_msg inbox']//text()").extract()
        item["Career_information"] = [i.strip() for i in item["Career_information"]]
        item["Career_information"] = [i for i in item["Career_information"] if len(i) > 0]
        item["Career_information"] = " ".join(item["Career_information"]).replace("\xa0", "").replace(",", ",")
        if item["Pay_level"] is None:
            item["Pay_level"] = "无"
        # keywords
        item["keyword"] = response.xpath("//div[@class='mt10']//p//a/text()").extract()
        # item["keyword"] = [i.strip() for i in item["keyword"]]
        item["keyword"] = [i for i in item["keyword"] if len(i) > 0]
        item["keyword"] = " ".join(item["keyword"]).replace("\xa0", "").replace(",", ",")
        # posting date: prefer the "MM-DD发布" pattern in the page source,
        # fall back to the title attribute of p[2]
        riqi = re.findall(r"(\d+-\d+)发布", response.text)
        if riqi:
            item["day"] = riqi[0]
        else:
            item["day"] = response.xpath("//div[@class='cn']/p[2]/@title").get().replace("\xa0", "")
        yield item
        print("item scraped successfully!")
Write the database pipeline (pipelines.py)
from pymongo import MongoClient


class QianchengwuyouPipeline:
    def open_spider(self, spider):
        self.client = MongoClient("localhost", 27017)
        self.db = self.client.qiancheng      # database "qiancheng"
        self.collection = self.db.liuli05    # collection "liuli05"

    def process_item(self, item, spider):
        # insert one document per scraped item
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
Configure anti-blocking measures (settings.py)
BOT_NAME = 'qianchengwuyou'
SPIDER_MODULES = ['qianchengwuyou.spiders']
NEWSPIDER_MODULE = 'qianchengwuyou.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 ' \
'Safari/537.36 Edg/83.0.478.58 '
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
FEED_EXPORT_ENCODING = 'utf-8'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Enable the item pipeline
ITEM_PIPELINES = {
    'qianchengwuyou.pipelines.QianchengwuyouPipeline': 300,
}
If the spider runs without errors, the MongoDB database should now contain the scraped data.
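A quick way to verify, assuming the database and collection names used in the pipeline above (qiancheng / liuli05):
from pymongo import MongoClient

client = MongoClient("localhost", 27017)
collection = client.qiancheng.liuli05
print(collection.count_documents({}))   # number of scraped postings
print(collection.find_one())            # peek at one document
client.close()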
Installing MongoDB
Software download
MongoDB: link: https://pan.baidu.com/s/1Yf_7g4kSY_wvWde78wlzpQ extraction code: f6se
Installation tutorial
Test data
I have already scraped part of the data and share it here for testing.
Link: https://pan.baidu.com/s/1ymL917_etE-5Fn2HBuWHFQ extraction code: 5x5k
Setting up the Linux environment
All of the resources I used are published here.
Bundle: link: https://pan.baidu.com/s/1KeFcPVKdlWeJHJIT6VemAg extraction code: pk3v
To make testing easier, I also share the fully configured virtual machine.
P.S.: My setup is a fully distributed cluster of three machines. What I share is the master node; for the other two worker nodes, simply clone the master twice and change the hostname and IP address.
Link: https://pan.baidu.com/s/1sIPW_D0JuvxfXLXJP7hNqQ extraction code: 9hdg
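For example, on each clone (a minimal sketch assuming a systemd-based Linux such as CentOS 7; the hostnames follow the cluster below, the IP addresses are hypothetical and must match your own network):
hostnamectl set-hostname hadoop02      # run on the first clone
hostnamectl set-hostname hadoop03      # run on the second clone
# /etc/hosts on all three machines (hypothetical IPs)
192.168.1.101 hadoop01
192.168.1.102 hadoop02
192.168.1.103 hadoop03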
Start the Hadoop cluster
hadoop01: start-dfs.sh
hadoop02: start-yarn.sh
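The running daemons can be checked with jps on each node; roughly the following should appear, though the exact distribution depends on how the cluster is configured:
jps
# on hadoop01 (after start-dfs.sh): NameNode, DataNode, SecondaryNameNode
# on hadoop02 (after start-yarn.sh): ResourceManager, NodeManager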
Data storage
Requirement: store the scraped data on HDFS, using Flume to collect the files.
1. Export the scraped data from the MongoDB database and save it as a CSV or txt file.
For a CSV file, this can be done with Python:
import pymongo
import csv

mongo_url = "localhost:27017"
DATABASE = "xxxxx"  # database name
TABLE = "xxxx"      # collection name (table)

client = pymongo.MongoClient(mongo_url)
db_des = client[DATABASE]
db_des_table = db_des[TABLE]

# newline='' prevents blank lines in the output file (Python 3 only)
with open(f"{DATABASE}_{TABLE}.csv", "w", newline='', encoding="utf-8") as csvfileWriter:
    writer = csv.writer(csvfileWriter)
    fieldList = [
        "Job_title",
        "Pay_level",
        "Recruitment_unit",
        "Workplace",
        "hands_background",
        "Education_requirements",
        "Career_information",
        "keyword",
        "day"
    ]
    writer.writerow(fieldList)
    allRecordRes = db_des_table.find()
    # write one row per document
    for record in allRecordRes:
        # print(f"record = {record}")
        recordValueLst = []
        for field in fieldList:
            if field not in record:
                recordValueLst.append("None")
            else:
                recordValueLst.append(record[field])
        try:
            writer.writerow(recordValueLst)
        except Exception as e:
            print(f"write csv exception. e = {e}")
print("data export finished!")
However, since Flume will be used to collect the data later, a txt file is more convenient.
A txt file can be exported with Navicat Premium 15.
Navicat Premium 15 (cracked version)
Link: https://pan.baidu.com/s/1_SA8xEemn0a5Vvp7CQ870g extraction code: 5fb9
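If you prefer not to use Navicat, a similar pymongo sketch can dump the collection to a tab-separated txt file (same hypothetical database/collection placeholders as the CSV script above):
import pymongo

DATABASE = "xxxxx"  # database name
TABLE = "xxxx"      # collection name
fieldList = ["Job_title", "Pay_level", "Recruitment_unit", "Workplace",
             "hands_background", "Education_requirements",
             "Career_information", "keyword", "day"]

client = pymongo.MongoClient("localhost:27017")
with open(f"{DATABASE}_{TABLE}.txt", "w", encoding="utf-8") as f:
    for record in client[DATABASE][TABLE].find():
        # one line per document, fields separated by tabs
        values = [str(record.get(field, "None")) for field in fieldList]
        f.write("\t".join(values) + "\n")
client.close()
print("txt export finished!")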
Flume configuration file
# The configuration file needs to define the sources,
# the channels and the sinks.
# Sources, channels and sinks are defined per agent,
# in this case called 'a1'

# name the components of the agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# describe/configure the source
a1.sources.r1.type = spooldir
# directory to monitor; ingested files are renamed with a suffix
a1.sources.r1.spoolDir = /usr/qianchengwuyou
# suffix appended to files after they have been uploaded
a1.sources.r1.fileSuffix = .COMPLETED
# always add the absolute file path as a header (default false)
a1.sources.r1.fileHeader = true
# ignore (do not upload) files ending in .tmp
a1.sources.r1.ignorePattern = ([^ ]*\.tmp)
# enable truncation of overly long events (default true)
a1.sources.r1.interceptors.i2.cutFlag = true
# maximum length to keep, in KB; keep it under 2 MB (1 MB = 1024 KB)
a1.sources.r1.interceptors.i2.cutMax = 2048

# describe the sink: write to HDFS
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /Hadoop/flume/%y-%m-%d/%H-%M/
# prefix for the uploaded files
a1.sinks.k1.hdfs.filePrefix = pachong
# roll directories based on time
a1.sinks.k1.hdfs.round = true
# how many time units per new directory
a1.sinks.k1.hdfs.roundValue = 1
# the time unit used for rolling
a1.sinks.k1.hdfs.roundUnit = hour
# use the local timestamp
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# number of events to buffer before flushing to HDFS
a1.sinks.k1.hdfs.batchSize = 10
# file type (compression is supported)
a1.sinks.k1.hdfs.fileType = DataStream
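Assuming the configuration above is saved as, say, qianchengwuyou.conf (a hypothetical file name), the agent can be started and the result checked on HDFS like this:
# start the agent; the --name value must match the prefix used in the config (a1)
flume-ng agent --conf conf --conf-file qianchengwuyou.conf --name a1 -Dflume.root.logger=INFO,console
# after copying the exported txt file into /usr/qianchengwuyou, verify the upload on HDFS
hdfs dfs -ls /Hadoop/flume/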