Big Data Project Development Practical Training

Training Requirements

Write a crawler in Python to scrape data from a recruitment website and store it in MongoDB. After some cleaning, analyze the stored data: use Flume to collect the log files into HDFS, analyze them with Hive, store the Hive results in MySQL with Sqoop and display them, and finally visualize the analysis results.

Building the Crawler

The site chosen is 51job (前程无忧) and the framework is Scrapy. On to the code!

Wuyou.py

Fields to scrape: job title, salary, employer, work location, work experience, education requirement, job description (responsibilities), and job requirements (skills).

# -*- coding: utf-8 -*-
import scrapy
from wuyou.items import WuyouItem
import re
import urllib.parse


class WuyouSpider(scrapy.Spider):
    name = 'Wuyou'
    allowed_domains = ['51job.com']
    # location code 000000 = nationwide
    # web
    start_urls = [
    'https://search.51job.com/list/000000,000000,0000,00,9,99,web,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']
    # python
    # start_urls = [
    # 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']
    # keyword: 数据采集 (data collection)
    # start_urls = [
    #     'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E9%2587%2587%25E9%259B%2586,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']
    # keyword: 数据分析 (data analysis)
    # start_urls = [
    #     'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=']
    # keyword: 大数据开发工程师 (big data development engineer)
    # start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']

    def parse(self, response):

        # iterate over every job posting row on the current results page
        for line in response.xpath('//div[@class="dw_table"]/div[@class="el"]'):
            # create an item for this row
            item = WuyouItem()
            # extract the fields; extract() returns a list, so take element [0] below
            # job title
            jobtitle = line.xpath('p/span/a/text()').extract()
            # detail page link
            link = line.xpath('p/span/a/@href').extract()
            # company name
            company = line.xpath('span[@class="t2"]/a/@title').extract()
            # work location
            location = line.xpath('span[@class="t3"]/text()').extract()
            # salary
            money = line.xpath('span[@class="t4"]/text()').extract()
            # some salaries are negotiable, so the list may come back empty
            if len(money) == 0:
                money = ["null"]
            # publish date
            update_time = line.xpath('span[@class="t5"]/text()').extract()

            item['jobname'] = jobtitle[0].strip()
            item['link'] = link[0]
            item['comany_name'] = company[0]
            item['address'] = location[0]
            item['salary'] = money[0]
            item['release_date'] = update_time[0]

            # follow the detail page link to scrape job and company details
            yield scrapy.Request(item['link'], meta={'position': item},
                                 callback=self.parse_position)
        # current page number (note: the next-page URL below hard-codes the 'web' keyword)
        now_pagenumber = response.xpath('//div[@class="dw_page"]/div[@class="p_box"]/div[@class="p_wp"]/div[@class="p_in"]/ul/li[@class="on"]/text()').extract()[0]

        url = "https://search.51job.com/list/000000,000000,0000,00,9,99,web,2," + str(
        int(now_pagenumber) + 1) + ".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="

        # only continue if a next-page link exists
        if response.xpath("//li[@class='bk'][last()]/a/@href"):
            # request the next page
            yield scrapy.Request(url=url, callback=self.parse)

    # parse the job detail page for responsibilities, experience, education and head count

    def parse_position(self, response):
        item = response.meta['position']
        zhize_list = response.xpath(
            "//div[@class='tBorderTop_box']/div[1]/text()").extract()
        zhize = "".join(zhize_list)
        # strip carriage returns, newlines and tabs from the job description
        zhize_rnt = re.compile(r'[\r\n\t]')
        result_rnt = zhize_rnt.sub("", zhize)
        item['job_require'] = result_rnt.strip()

        # experience / education / head count are optional on the page,
        # so fall back to a default when the element is missing
        try:
            item['experience'] = response.xpath("//div[@class='cn']/p[2]/text()").extract()[1].strip()
        except IndexError:
            item['experience'] = '经验不限'
        try:
            item['education_require'] = response.xpath("//div[@class='cn']/p[2]/text()").extract()[2].strip()
        except IndexError:
            item['education_require'] = '不限'
        try:
            item['head_count'] = response.xpath("//div[@class='cn']/p[2]/text()").extract()[3].strip()
        except IndexError:
            item['head_count'] = '若干'

        # hand the completed item to the pipeline
        yield item

items.py

import scrapy


class WuyouItem(scrapy.Item):
    _id = scrapy.Field()
    # job title
    jobname = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # detail page link
    link = scrapy.Field()
    # work experience
    experience = scrapy.Field()
    # work location
    address = scrapy.Field()

    # employer (company name)
    comany_name = scrapy.Field()
    # head count
    head_count = scrapy.Field()
    # education requirement
    education_require = scrapy.Field()
    # company size
    comany_size = scrapy.Field()
    # job requirements / responsibilities
    job_require = scrapy.Field()
    # release date
    release_date = scrapy.Field()
    # room for an extra feedback-rate field if needed

pipelines.py

import pymongo


class WuyouPipeline(object):
    def open_spider(self, spider):
        # connect to the local MongoDB instance when the spider starts
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # convert the Item to a plain dict and insert it into the lx.new collection
        self.client.lx.new.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

settings.py

from fake_useragent import UserAgent

BOT_NAME = 'wuyou'
SPIDER_MODULES = ['wuyou.spiders']
NEWSPIDER_MODULE = 'wuyou.spiders'
USER_AGENT = UserAgent(use_cache_server=False).random
ROBOTSTXT_OBEY = False
# CONCURRENT_REQUESTS = 8
DOWNLOAD_DELAY = 1
# CONCURRENT_REQUESTS_PER_DOMAIN = 8
# CONCURRENT_REQUESTS_PER_IP = 8
COOKIES_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                ' AppleWebKit/537.36 (KHTML, like Gecko)'
                ' Chrome/81.0.4044.113 Safari/537.36'
}
ITEM_PIPELINES = {
   'wuyou.pipelines.WuyouPipeline': 300,
}
# HTTPCACHE_ENABLED = True

start.py

from scrapy import cmdline

cmdline.execute('scrapy crawl Wuyou'.split())

Export the data stored in MongoDB to Windows.

In the MongoDB bin directory, run the command below (adjust -d and -c to the database and collection your pipeline actually wrote to; the pipeline above used the lx database and the new collection):

mongoexport -h localhost:27017 -d lx -c wuyou -o D:\wuyou.csv
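
Note that mongoexport writes JSON by default; to get a genuinely comma-separated file you would also need --type=csv and an explicit field list. A sketch, with the field names taken from items.py:

mongoexport -h localhost:27017 -d lx -c wuyou --type=csv -f _id,jobname,link,comany_name,address,salary,release_date,job_require,experience,education_require,head_count -o D:\wuyou.csv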

Because the exported file is very large, at first only a plain text editor could open it, so I downloaded a dedicated large-file editor that opens files of this size in a few seconds.

Because job_require contains many commas and the exported file is comma-delimited, creating the Hive table would break later, so every comma inside that field has to be replaced.
Replace all commas inside job_require with nothing.
Then convert the salary and date fields into formats SQL can recognize (int and date).
Finally, open the file in the editor and replace the field-separating commas with ] (just a delimiter I am used to; any character that never occurs in the data would do).
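
The same cleanup can also be scripted instead of done by hand in an editor. Below is a minimal sketch assuming a JSON-lines export from mongoexport (the file names are placeholders); the salary and date normalization is left as a stub, because the exact rule for collapsing ranges such as '1-1.5万/月' into a single int is a judgment call.

import json

# field order must match the 11 columns of the Hive table created later
FIELDS = ["_id", "jobname", "link", "comany_name", "address", "salary",
          "release_date", "job_require", "experience", "education_require", "head_count"]

def normalize(rec):
    # mongoexport wraps ObjectIds as {"$oid": "..."}; unwrap them to plain strings
    if isinstance(rec.get("_id"), dict):
        rec["_id"] = rec["_id"].get("$oid", "")
    # drop ASCII and full-width commas from the free-text field
    rec["job_require"] = str(rec.get("job_require", "")).replace(",", "").replace("，", "")
    # TODO: convert salary (e.g. '1-1.5万/月') to an int and release_date to yyyy-MM-dd
    return rec

# write GBK output because the Flume source below is configured with inputCharset = GBK
with open("wuyou.json", encoding="utf-8") as src, \
        open("qcwy.txt", "w", encoding="gbk", errors="ignore") as dst:
    for line in src:
        rec = normalize(json.loads(line))
        # join the 11 fields with ']' to match the RegexSerDe pattern used later
        dst.write("]".join(str(rec.get(f, "")) for f in FIELDS) + "\n")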

Hadoop Setup and Configuration

Operating system: CentOS 7
Machines: 3 virtual machines (master 192.168.1.201, slave1 192.168.1.202, slave2 192.168.1.203)
JDK: 1.8.0_121 (jdk-8u121-linux-x64.tar.gz)
Hadoop: 2.9.2 (http://www.apache.org/dyn/closer.cgi/hadoop/common/hadoop-2.9.2/hadoop-2.9.2.tar.gz)

Getting Started

After installing the Linux VMs in VMware, reboot each one.
As root, edit the network script to give the machine a static IP:
vi /etc/sysconfig/network-scripts/ifcfg-ens33
Set BOOTPROTO="static" and add the IP address, gateway and DNS:
IPADDR="the IP address to assign"
GATEWAY="192.168.1.2"
DNS1="8.8.8.8"

Save and quit with :wq, then run: service network restart

If this reports an error, run reboot to restart the VM.
Change the hostname:
vi /etc/sysconfig/network
(On CentOS 7 you can also set it with hostnamectl set-hostname <name> or by editing /etc/hostname.)
Add the host name mappings:
vi /etc/hosts
Reboot the machine; after it comes back up, verify that the hostname has changed.
Edit the Windows 10 hosts file:
(1) Go to C:\Windows\System32\drivers\etc
(2) Open the hosts file and add the following entries:
192.168.1.201 hadoop201
192.168.1.202 hadoop202
192.168.1.203 hadoop203
Turn off the firewall, then ping the VMs from the Windows command line.
Basic firewall commands:
firewall-cmd --state (check the firewall status)
service firewalld restart (restart)
service firewalld start (start)
service firewalld stop (stop)
Permanently disable it:
systemctl stop firewalld.service (stop it now)
systemctl disable firewalld.service (do not start it at boot)
ping -c 3 slave1 (the 3 means send 3 packets)
ping baidu.com

Open SecureCRT 8.5 and create a connection for each machine with its IP, username and password.

Install & Configure the JDK on Every Machine

1. Remove any existing JDK.
Check whether Java is already installed: rpm -qa | grep java
If the installed version is lower than 1.7, remove it: rpm -e <package name>
2. Use FileZilla or WinSCP to upload the JDK and Hadoop archives into /opt/software.
Create two directories under /opt:
mkdir software
mkdir module
In /opt/software, extract the JDK into /opt/module:
tar -zxvf jdk-8u121-linux-x64.tar.gz -C /opt/module/
Extract the other archives the same way, e.g. the MySQL package:
tar -xvf <mysql archive name>

Note the resulting installation paths:
/opt/module/jdk1.8.0_121
/opt/module/hadoop-2.9.2

Configure the global environment variables:
vi /etc/profile

#JAVA_HOME
export JAVA_HOME=/opt/module/jdk1.8.0_121
export PATH=$PATH:$JAVA_HOME/bin

##HADOOP_HOME
export HADOOP_HOME=/opt/module/hadoop-2.9.2
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin

Apply the changes: source /etc/profile

Edit the Hadoop configuration files under $HADOOP_HOME/etc/hadoop:
core-site.xml
hadoop-env.sh
hdfs-site.xml
slaves (lists which hosts run DataNodes)
yarn-env.sh
yarn-site.xml
mapred-env.sh
mapred-site.xml
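
The exact contents of these files were only shown as screenshots in the original post. As a reference, a minimal set of values consistent with this cluster (HDFS at hdfs://hadoop201:9000, which the Flume sink below writes to, and the ResourceManager apparently on hadoop202, whose 8088 web UI is opened later) might look like the sketch below; treat it as an assumption, not the exact configuration used.

<!-- core-site.xml -->
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://hadoop201:9000</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/opt/module/hadoop-2.9.2/data/tmp</value>
  </property>
</configuration>

<!-- hdfs-site.xml -->
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
</configuration>

<!-- yarn-site.xml -->
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>hadoop202</value>
  </property>
</configuration>

<!-- mapred-site.xml -->
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>

# slaves: one DataNode hostname per line
hadoop201
hadoop202
hadoop203

# hadoop-env.sh / yarn-env.sh / mapred-env.sh: point JAVA_HOME at the JDK
export JAVA_HOME=/opt/module/jdk1.8.0_121
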
Clone the configured VM in VMware to create nodes 002 and 003 (remember to change each clone's hostname and IP).
Set up passwordless SSH.
On every machine run:
ssh-keygen -t rsa
On hadoop201, generate the authorized_keys file and add its own key:

ssh-copy-id -i /root/.ssh/id_rsa.pub hadoop201

Then copy authorized_keys to the hadoop202 and hadoop203 nodes:

scp /root/.ssh/authorized_keys root@hadoop202:/root/.ssh/
scp /root/.ssh/authorized_keys root@hadoop203:/root/.ssh/

From hadoop201, test passwordless login to hadoop202 and hadoop203:
ssh <hostname>
Start the Hadoop Cluster
1. Format the NameNode (only needed on the master): hdfs namenode -format
2. Start the cluster: on the master, run start-all.sh

On startup the ResourceManager had not come up; after turning off the firewall, start it manually:
sbin/yarn-daemon.sh start resourcemanager
Open 192.168.1.201:50070 in a browser to reach the NameNode web UI.
Open 192.168.1.202:8088 in a browser to reach the YARN ResourceManager web UI.

The installation packages and installation steps for Hive, Flume, MySQL and Sqoop are in the link below.

链接:https://pan.baidu.com/s/1C3e4FpeX-RQ-9GVak6rekA
Extraction code: sed6

Once all of the above are installed:

Flume Configuration

Create the flie-hdfs.conf configuration file:

a3.sources = r3
a3.sinks = k3
a3.channels = c3

# Describe/configure the source
a3.sources.r3.type = spooldir
a3.sources.r3.spoolDir = /opt/qiancheng/
a3.sources.r3.fileSuffix = .log   
a3.sources.r3.fileHeader = true
a3.sources.r3.inputCharset = GBK

# Describe the sink
a3.sinks.k3.type = hdfs
a3.sinks.k3.hdfs.path = hdfs://hadoop201:9000/flume/%Y%m%d/%H
# prefix for files uploaded to HDFS
a3.sinks.k3.hdfs.filePrefix = wuyou-
# do not roll files based on size or event count (0 disables both)
a3.sinks.k3.hdfs.rollSize = 0
a3.sinks.k3.hdfs.rollCount = 0
a3.sinks.k3.hdfs.useLocalTimeStamp = true
a3.sinks.k3.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a3.channels.c3.type = memory
a3.channels.c3.capacity =30000
a3.channels.c3.transactionCapacity = 30000

# Bind the source and sink to the channel
a3.sources.r3.channels = c3
a3.sinks.k3.channel = c3


Run the Flume agent
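
The launch command was only shown in a screenshot; run from the Flume installation directory it would look roughly like this (adjust the path to wherever flie-hdfs.conf was saved):

bin/flume-ng agent --conf conf --name a3 --conf-file flie-hdfs.conf -Dflume.root.logger=INFO,console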

The target directory ends up with many small files, so they need to be merged.
Merge the files:

hadoop fs -cat /flume/20200712/20/* | hadoop fs -put - /flume/20200712/20

The pipe writes the merged stream into a file that HDFS names '-' (the stdin placeholder), so rename it:
hadoop dfs -mv /flume/20200712/20/- /flume/20200712/20/qcwy

Hive Setup

Create a table for the raw data. The RegexSerDe pattern below splits each line on the ']' delimiter that replaced the commas during cleaning:

create table wuyouwai(
id varchar(30),
jobname varchar(30),
link varchar(30),
comany_name varchar(30),
address varchar(30),
salary int,
release_date DATE,
job_require varchar(2000),
experience varchar(30),
education_require varchar(30),
head_count varchar(30))
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)](.*)](.*)](.*)](.*)](.*)](.*)](.*)](.*)](.*)](.*)',
'output.format.string'='%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s')
stored as textfile;

Load the file from HDFS into the table:

load data inpath "/flume/20200712/21/qcwy" into table wuyouwai;
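
A quick sanity check such as the following confirms that the regex splits the fields correctly:

select jobname, salary, release_date from wuyouwai limit 5;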

Create a smaller table holding just the fields we need:

create table xwuyou as 
select wuyouwai.jobname as jobname,wuyouwai.salary as salary ,wuyouwai.address as address,wuyouwai.release_date as release_date
from wuyouwai 
where jobname LIKE '%数据采集%';

Insert the corresponding rows for the other two job titles:

insert into table xwuyou
select jobname,salary,address,release_date
from wuyouwai
where jobname ='大数据开发工程师';

insert into table xwuyou
select jobname,salary,address,release_date
from wuyouwai
where jobname = '数据分析';


Analyzing the Data with Hive

Analysis plan. Fields needed:
  job title: wuyouwai.jobname
  salary: wuyouwai.salary, aggregated with min(), max(), avg()
  location: wuyouwai.address
  date: wuyouwai.release_date
(1) For the '数据分析' (data analysis), '大数据开发工程师' (big data development engineer) and '数据采集' (data collection) positions, compute the average, maximum and minimum salary, and present the results as a bar chart.

Tables for the big-data-related positions:

create table bigdata(
jobname varchar(30),
avg int,
min int,
max int);
create table caiji as 
select xwuyou.jobname as jobname,xwuyou.salary as salary ,xwuyou.address as address,xwuyou.release_date as release_date
from xwuyou 
where salary is not null;

Round the averages up (ceiling) and insert them into bigdata; this answers question (1):

insert into table bigdata
select caiji.jobname as jobname,ceiling(avg(salary)),min(salary),max(salary) from caiji where jobname like '数据分析' group by jobname;

insert into table bigdata
select caiji.jobname as jobname,ceiling(avg(salary)),min(salary),max(salary) from caiji where jobname like '大数据开发工程师' group by jobname;

insert into table bigdata
select '数据采集',ceiling(avg(salary)),min(salary),max(salary) from caiji where jobname like '%数据采集%';
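
As a side note, the first two inserts could also be collapsed into one grouped query; a sketch:

insert into table bigdata
select jobname, ceiling(avg(salary)), min(salary), max(salary)
from caiji
where jobname in ('数据分析', '大数据开发工程师')
group by jobname;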

(2) Count the number of '数据分析', '大数据开发工程师' and '数据采集' positions in Chengdu, Beijing, Shanghai, Guangzhou and Shenzhen, and present the results as pie charts.

Create three tables, one per job title.
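
Their create statements only appear in the original screenshot; based on how they are filled below (a city name plus a count), they would look roughly like this (the column names are my guesses):

create table fenxi(address varchar(30), num int);
create table big(address varchar(30), num int);
create table cj(address varchar(30), num int);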

insert into table fenxi
select '成都',count(address) from xwuyou where jobname like '数据分析' and address like '%成都%';

insert into table fenxi
select '北京',count(address) from xwuyou where jobname like '数据分析' and address like '%北京%';

insert into table fenxi
select '上海',count(address) from xwuyou where jobname like '数据分析' and address like '%上海%';

insert into table fenxi
select '广州',count(address) from xwuyou where jobname like '数据分析' and address like '%广州%';

insert into table fenxi
select '深圳',count(address) from xwuyou where jobname like '数据分析' and address like '%深圳%';

The block above fills fenxi for 数据分析 (data analysis).
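
The five single-city inserts (here and for the other two job titles below) could each be collapsed into a single grouped query; a sketch for fenxi:

insert into table fenxi
select city, count(*)
from (
  select case
           when address like '%成都%' then '成都'
           when address like '%北京%' then '北京'
           when address like '%上海%' then '上海'
           when address like '%广州%' then '广州'
           when address like '%深圳%' then '深圳'
         end as city
  from xwuyou
  where jobname like '数据分析'
) t
where city is not null
group by city;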

insert into table big
select '成都',count(address) from xwuyou where jobname like '大数据开发工程师' and address like '%成都%';

insert into table big
select '北京',count(address) from xwuyou where jobname like '大数据开发工程师' and address like '%北京%';

insert into table big
select '上海',count(address) from xwuyou where jobname like '大数据开发工程师' and address like '%上海%';

insert into table big
select '广州',count(address) from xwuyou where jobname like '大数据开发工程师' and address like '%广州%';

insert into table big
select '深圳',count(address) from xwuyou where jobname like '大数据开发工程师' and address like '%深圳%';

The block above fills big for 大数据开发工程师 (big data development engineer).

insert into table cj
select '成都',count(address) from xwuyou where jobname like '%数据采集%' and address like '%成都%';

insert into table cj
select '北京',count(address) from xwuyou where jobname like '%数据采集%' and address like '%北京%';

insert into table cj
select '上海',count(address) from xwuyou where jobname like '%数据采集%' and address like '%上海%';

insert into table cj
select '广州',count(address) from xwuyou where jobname like '%数据采集%' and address like '%广州%';

insert into table cj
select '深圳',count(address) from xwuyou where jobname like '%数据采集%' and address like '%深圳%';

The block above fills cj for 数据采集 (data collection).
(3) Analyze the salary level (average, maximum and minimum) of big-data-related positions requiring 1-3 years of experience, and present it as a bar chart.

create table jingyan as 
select wuyouwai.jobname as jobname,wuyouwai.salary as salary ,wuyouwai.experience as experience
from wuyouwai 
where salary is not null and experience like '%经验%' and jobname like '%大数据%';
create table oneth(
jobname varchar(30),
avg int,
min int,
max int);
insert into table oneth
select '大数据相关',ceiling(avg(salary)),min(salary),max(salary) from jingyan
where experience in ('1年经验','2年经验','3-4年经验');

(4) Analyze how the demand for big-data-related positions trends over time, and present it as a line chart.
Build a table of job counts per date (the Hive results are then exported to HDFS in the next step):

create table fourbigdata(
    release_date date,
    gangweishu int
);


insert into table fourbigdata
select release_date,count(jobname) from caiji group by release_date;

Export the Hive results to HDFS. The general form is:

insert overwrite [local] directory '/root'  --> export path; drop 'local' to write to HDFS instead of the local filesystem
row format delimited fields terminated by '\t' --> field delimiter for the export
select * from hive_db; --> the query whose results are exported
-- fenxi: data-analysis counts by city (pie chart)
insert overwrite directory '/flume/20200712/24'
row format delimited fields terminated by '\t'
select * from fenxi;
-- big: big-data-development-engineer counts by city (pie chart)
insert overwrite directory '/flume/20200712/22'
row format delimited fields terminated by '\t'
select * from big;
-- oneth: 1-3 years of experience
insert overwrite directory '/flume/20200712/21'
row format delimited fields terminated by '\t'
select * from oneth;
-- cj: data-collection counts by city (pie chart)
insert overwrite directory '/flume/20200712/23'
row format delimited fields terminated by '\t'
select * from cj;
-- bigdata: salary levels for the three job titles
insert overwrite directory '/flume/20200712/25'
row format delimited fields terminated by '\t'
select * from bigdata;
-- caiji: the table with dates
insert overwrite directory '/flume/20200712/26'
row format delimited fields terminated by '\t'
select * from caiji;

Open the HDFS directory to confirm that the exports succeeded.

Exporting to MySQL with Sqoop

Use Sqoop to store the Hive analysis results in MySQL, and finally display them.

Create the target tables in MySQL.
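
The MySQL DDL only appears in the original screenshot; schemas mirroring the Hive tables would look roughly like this (the column names of fenxi/big/cj are assumptions):

create database wuyou;
use wuyou;
create table caiji(jobname varchar(30), salary int, address varchar(30), release_date date);
create table bigdata(jobname varchar(30), avg int, min int, max int);
create table oneth(jobname varchar(30), avg int, min int, max int);
create table fenxi(address varchar(30), num int);
create table big(address varchar(30), num int);
create table cj(address varchar(30), num int);
create table fourbigdata(release_date date, gangweishu int);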

Export the data with Sqoop. In Sqoop's bin directory, run:

sqoop export --connect jdbc:mysql://127.0.0.1:3306/wuyou --username root --password 111111 --table caiji --export-dir '/flume/20200712/26' --fields-terminated-by '\t' -m 1

The other tables are exported the same way; see the example below.
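
For example, the bigdata results written to /flume/20200712/25 earlier would presumably be exported with:

sqoop export --connect jdbc:mysql://127.0.0.1:3306/wuyou --username root --password 111111 --table bigdata --export-dir '/flume/20200712/25' --fields-terminated-by '\t' -m 1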

Data Visualization

Salary bar chart by position

import pymysql
from pyecharts.charts import Bar
from pyecharts import options as opts

db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from bigdata"
cursor.execute(sql)
data = cursor.fetchall()
print(data)
zhiwei = [data[0][0], data[1][0], data[2][0]]
print(zhiwei)
min_list = [data[0][2], data[1][2], data[2][2]]
max_list = [data[0][3], data[1][3], data[2][3]]
average_list = [data[0][1], data[1][1], data[2][1]]
bar = Bar()
bar.add_xaxis(xaxis_data=zhiwei)
# the first argument is the series (legend) name, the second the y-axis data
bar.add_yaxis("最低工资", min_list)
bar.add_yaxis("最高工资", max_list)
bar.add_yaxis("平均工资", average_list)
# set the chart title and subtitle; the toolbox adds save/zoom controls
bar.set_global_opts(title_opts=opts.TitleOpts(title='职位工资分析', subtitle='工资单位:万/月'),
                    toolbox_opts=opts.ToolboxOpts())
bar.render("岗位薪资图.html")

Job counts by city: data analysis

import pymysql
from pyecharts.charts import Pie
from pyecharts import options as opts
db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from fenxi"
cursor.execute(sql)
data = cursor.fetchall()
print(data)
addr = ["成都","北京","上海","广州","深圳"]
num = [data[0][1],data[1][1],data[2][1],data[3][1],data[4][1]]
data_pair = [list(z) for z in zip(addr, num)]
data_pair.sort(key=lambda x: x[1])
# draw the pie chart from the sorted (city, count) pairs
c = (
        Pie()
        .add("", data_pair)
        .set_global_opts(title_opts=opts.TitleOpts(title="数据分析工程师地区岗位数",subtitle='单位:个数'),toolbox_opts=opts.ToolboxOpts())
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    ).render("数据分析工程师地区岗位数.html")

Job counts by city: data collection

import pymysql
from pyecharts.charts import Pie
from pyecharts import options as opts
db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from cj"
cursor.execute(sql)
data = cursor.fetchall()
print(data)
addr = ["成都","北京","上海","广州","深圳"]
num = [data[0][1],data[1][1],data[2][1],data[3][1],data[4][1]]
data_pair = [list(z) for z in zip(addr, num)]
data_pair.sort(key=lambda x: x[1])
# draw the pie chart from the sorted (city, count) pairs
c = (
        Pie()
        .add("", data_pair)
        .set_global_opts(title_opts=opts.TitleOpts(title="数据采集工程师地区岗位数",subtitle='单位:个数'),toolbox_opts=opts.ToolboxOpts())
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    ).render("数据采集工程师地区岗位数.html")

Job counts by city: big data development engineer

import pymysql
from pyecharts.charts import Pie
from pyecharts import options as opts
db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from big"
cursor.execute(sql)
data = cursor.fetchall()
print(data)
addr = ["成都","北京","上海","广州","深圳"]
num = [data[0][1],data[1][1],data[2][1],data[3][1],data[4][1]]
data_pair = [list(z) for z in zip(addr, num)]
data_pair.sort(key=lambda x: x[1])
# draw the pie chart from the sorted (city, count) pairs
c = (
        Pie()
        .add("", data_pair)
        .set_global_opts(title_opts=opts.TitleOpts(title="大数据开发工程师各地区岗位数",subtitle='单位:个数'),toolbox_opts=opts.ToolboxOpts())
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    ).render("大数据开发工程师地区岗位数.html")

Salaries for 1-3 years of experience

import pymysql
from pyecharts.charts import Bar
from pyecharts import options as opts

db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from oneth"
cursor.execute(sql)
data = cursor.fetchall()
print(data)
zhiwei = [data[0][0]]
print(zhiwei)
min_list = [data[0][2]]
max_list = [data[0][3]]
average_list = [data[0][1]]
bar = Bar()
bar.add_xaxis(xaxis_data=zhiwei)
# the first argument is the series (legend) name, the second the y-axis data
bar.add_yaxis("最低工资", min_list)
bar.add_yaxis("最高工资", max_list)
bar.add_yaxis("平均工资", average_list)
# set the chart title and subtitle
bar.set_global_opts(title_opts=opts.TitleOpts(title='1-3年经验', subtitle='工资单位:万/月'),
                    toolbox_opts=opts.ToolboxOpts())
bar.render("1-3年经验.html")

Big data demand trend over time

import pymysql
from pyecharts.charts import Line
from pyecharts import options as opts
db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from fourbigdata"
cursor.execute(sql)
data = cursor.fetchall()
time_list = []
renshu = []
for i in data:
    time_list.append(str(i[0]))
    renshu.append(str(i[1]))
print(time_list)
print(renshu)
data_pair = [list(z) for z in zip(time_list, renshu)]
data_pair.sort(key=lambda x: x[1])

(
    Line(init_opts=opts.InitOpts(width="6000px", height="800px"))
    .set_global_opts(
        tooltip_opts=opts.TooltipOpts(is_show=False),
        xaxis_opts=opts.AxisOpts(type_="category"),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
    )
    .add_xaxis(xaxis_data=time_list)
    .add_yaxis(
        series_name="大数据岗位需求变化趋势",
        y_axis=renshu,
        symbol="emptyCircle",
        is_symbol_show=True,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .render("需求变化趋势.html")
)
