Project requirements
Write a crawler in Python to scrape job postings from a recruitment website and store them in MongoDB. Clean the stored data, then use Flume to collect the cleaned files into HDFS and analyze them with Hive. Export the Hive results to a MySQL database with Sqoop, display the analysis results, and finally visualize them.
Building the crawler
The site chosen is 51job (前程无忧) and the framework is Scrapy. Fields to crawl: job title, salary, hiring company, work location, work experience, education requirement, job description (responsibilities), and job requirements (skills). The spider code follows.
Wuyou.py
# -*- coding: utf-8 -*-
import re

import scrapy

from wuyou.items import WuyouItem


class WuyouSpider(scrapy.Spider):
    name = 'Wuyou'
    allowed_domains = ['51job.com']
    # Search keyword "web"; area code 000000 = nationwide
    start_urls = [
        'https://search.51job.com/list/000000,000000,0000,00,9,99,web,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']
    # python
    # start_urls = [
    #     'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']
    # 数据采集 (data collection)
    # start_urls = [
    #     'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E9%2587%2587%25E9%259B%2586,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']
    # 数据分析 (data analysis)
    # start_urls = [
    #     'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=']
    # 大数据开发工程师 (big data development engineer)
    # start_urls = [
    #     'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']

    def parse(self, response):
        # One row per job posting on the list page
        for line in response.xpath('//div[@class="dw_table"]/div[@class="el"]'):
            item = WuyouItem()
            # extract() returns a list of matched strings
            jobtitle = line.xpath('p/span/a/text()').extract()             # job title
            link = line.xpath('p/span/a/@href').extract()                  # detail-page link
            company = line.xpath('span[@class="t2"]/a/@title').extract()   # company name
            location = line.xpath('span[@class="t3"]/text()').extract()    # work location
            money = line.xpath('span[@class="t4"]/text()').extract()       # salary
            # Some salaries are "negotiable" and come back empty
            if len(money) == 0:
                money = ["null"]
            update_time = line.xpath('span[@class="t5"]/text()').extract() # release date
            item['jobname'] = jobtitle[0].strip()
            item['link'] = link[0]
            item['comany_name'] = company[0]
            item['address'] = location[0]
            item['salary'] = money[0]
            item['release_date'] = update_time[0]
            # Follow the detail page for the job description and requirements
            yield scrapy.Request(item['link'], meta={'position': item},
                                 callback=self.parse_position)
        # Build the next list-page URL from the current page number
        now_pagenumber = response.xpath('//div[@class="dw_page"]/div[@class="p_box"]/div[@class="p_wp"]/div[@class="p_in"]/ul/li[@class="on"]/text()').extract()[0]
        url = "https://search.51job.com/list/000000,000000,0000,00,9,99,web,2," + str(
            int(now_pagenumber) + 1) + ".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        # Only request the next page if we have not reached the last one
        if response.xpath("//li[@class='bk'][last()]/a/@href"):
            yield scrapy.Request(url=url, callback=self.parse)

    # Parse the detail page for description, experience, education and head count
    def parse_position(self, response):
        item = response.meta['position']
        zhize_list = response.xpath(
            "//div[@class='tBorderTop_box']/div[1]/text()").extract()
        zhize = "".join(zhize_list)
        # Strip carriage returns, newlines and tabs from the description
        result_rnt = re.sub(r'[\r\n\t]', "", zhize)
        item['job_require'] = result_rnt.strip()
        try:
            item['experience'] = response.xpath("//div[@class='cn']/p[2]/text()").extract()[1].strip()
        except IndexError:
            item['experience'] = '经验不限'
        try:
            item['education_require'] = response.xpath("//div[@class='cn']/p[2]/text()").extract()[2].strip()
        except IndexError:
            item['education_require'] = '不限'
        try:
            item['head_count'] = response.xpath("//div[@class='cn']/p[2]/text()").extract()[3].strip()
        except IndexError:
            item['head_count'] = '若干'
        # Hand the finished item to the pipeline
        yield item
items.py
import scrapy


class WuyouItem(scrapy.Item):
    _id = scrapy.Field()
    # job title
    jobname = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # detail-page link
    link = scrapy.Field()
    # work experience
    experience = scrapy.Field()
    # work location
    address = scrapy.Field()
    # hiring company
    comany_name = scrapy.Field()
    # head count
    head_count = scrapy.Field()
    # education requirement
    education_require = scrapy.Field()
    # company size
    comany_size = scrapy.Field()
    # job requirements
    job_require = scrapy.Field()
    # release date
    release_date = scrapy.Field()
    # a response-rate field could be added here later
pipelines.py
import pymongo


class WuyouPipeline(object):
    def open_spider(self, spider):
        # Connect to the local MongoDB instance
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # Write each item into database "lx", collection "new"
        self.client.lx.new.insert_one(item)
        return item

    def close_spider(self, spider):
        self.client.close()
settings.py
from fake_useragent import UserAgent
BOT_NAME = 'wuyou'
SPIDER_MODULES = ['wuyou.spiders']
NEWSPIDER_MODULE = 'wuyou.spiders'
USER_AGENT = UserAgent(use_cache_server=False).random
ROBOTSTXT_OBEY = False
# CONCURRENT_REQUESTS = 8
DOWNLOAD_DELAY = 1
# CONCURRENT_REQUESTS_PER_DOMAIN = 8
# CONCURRENT_REQUESTS_PER_IP = 8
COOKIES_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                  ' AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/81.0.4044.113 Safari/537.36'
}
ITEM_PIPELINES = {
    'wuyou.pipelines.WuyouPipeline': 300,
}
# HTTPCACHE_ENABLED = True
start.py
from scrapy import cmdline
cmdline.execute('scrapy crawl Wuyou'.split())
Export the data from MongoDB to Windows
From the MongoDB bin directory, run:
mongoexport -h localhost:27017 -d lx -c new -o D:\wuyou.csv
(The -d/-c options must match the database and collection the pipeline writes to, here lx.new. Note that mongoexport only produces true comma-separated output when --type=csv and a -f/--fields list are supplied; without them it writes JSON lines regardless of the .csv extension.)
Because the exported file is very large, Notepad was initially the only program that would open it, so a text editor designed for large files was used instead; it opens the file in a few seconds.
The job_require field contains many commas, and the exported file is comma-delimited, which would make it hard to create the Hive table later, so all commas inside job_require are removed (replaced with nothing).
The salary and release-date fields are then converted into formats SQL can recognize (int and date).
Finally, the remaining field-separating commas are replaced with ']' (a personal habit; any delimiter works as long as the Hive table below uses the same one). A scripted version of this cleaning step is sketched below.
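As an alternative to editing the export by hand, the same cleaning can be done with a short script. This is a minimal sketch, not the original workflow: it assumes MongoDB on localhost with the lx.new collection, takes the midpoint of salary ranges such as "1-1.5万/月" rounded up to an integer (unit: 万/月), prepends the crawl year 2020 to the "MM-DD" release dates, and writes a ']'-delimited file ready for Flume/Hive.
import math
import re

import pymongo

# Column order must match the Hive table created later
FIELDS = ["_id", "jobname", "link", "comany_name", "address", "salary",
          "release_date", "job_require", "experience", "education_require",
          "head_count"]

def clean_salary(raw):
    """Turn strings like '1-1.5万/月' into a rounded-up int (unit: 万/月)."""
    m = re.match(r"([\d.]+)-([\d.]+)万/月", raw or "")
    if not m:
        return ""                                   # '面议' / null -> leave empty
    low, high = float(m.group(1)), float(m.group(2))
    return str(math.ceil((low + high) / 2))

def clean_date(raw, year="2020"):
    """Turn '07-12' into '2020-07-12' so Hive can read it as a DATE."""
    return "%s-%s" % (year, raw) if raw else ""

client = pymongo.MongoClient()
with open("wuyou_clean.txt", "w", encoding="utf-8") as out:
    for doc in client.lx.new.find():
        doc["salary"] = clean_salary(doc.get("salary", ""))
        doc["release_date"] = clean_date(doc.get("release_date", ""))
        # remove commas and stray ']' from every field, then join with ']'
        row = [str(doc.get(f, "")).replace(",", " ").replace("]", " ")
               for f in FIELDS]
        out.write("]".join(row) + "\n")
client.close()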
Hadoop setup and configuration
Operating system: CentOS 7
Machines: 3 virtual machines (master 192.168.1.201, slave1 192.168.1.202, slave2 192.168.1.203)
JDK: 1.8.0_121 (jdk-8u121-linux-x64.tar.gz)
Hadoop: 2.9.2 (http://www.apache.org/dyn/closer.cgi/hadoop/common/hadoop-2.9.2/hadoop-2.9.2.tar.gz)
Getting started
After installing the Linux virtual machine in VMware, reboot it and, working as root, set a static IP address:
vi /etc/sysconfig/network-scripts/ifcfg-ens33
Change BOOTPROTO="static" and add the IP address, gateway and DNS:
IPADDR="the IP address for this machine"
GATEWAY="192.168.1.2"
DNS1="8.8.8.8"
Save with :wq, then run:
service network restart
If that reports an error, run reboot to restart the virtual machine.
Change the hostname
On CentOS 7, set the hostname with hostnamectl set-hostname <name> (or by editing /etc/hostname); the /etc/sysconfig/network approach applies to older CentOS releases.
Then add all cluster nodes to the hosts file:
vi /etc/hosts
Reboot the machine and check that the hostname has changed.
5. Modify the Windows 10 hosts file
(1) Go to C:\Windows\System32\drivers\etc
(2) Open the hosts file and add the following entries:
192.168.1.201 hadoop201
192.168.1.202 hadoop202
192.168.1.203 hadoop203
6. Turn off the firewall, then ping the virtual machines from the Windows command line
Basic firewall commands:
firewall-cmd --state                  (check the firewall status)
systemctl restart firewalld           (restart)
systemctl start firewalld             (start)
systemctl stop firewalld              (stop)
Permanently disable it:
systemctl stop firewalld.service      (stop the service)
systemctl disable firewalld.service   (disable it at boot)
Test connectivity:
ping -c 3 hadoop202    (the -c 3 sends three packets; use whichever hostname you added to /etc/hosts)
ping baidu.com
Open SecureCRT 8.5 and create a connection to each machine's IP with its username and password.
Install & configure the JDK on every machine
1. Remove any pre-installed JDK
Check whether Java is already installed: rpm -qa | grep java
If the installed version is lower than 1.7, remove it: rpm -e <package name>
2. Use FileZilla or WinSCP to upload the JDK and Hadoop archives into the software folder under /opt.
Create two directories under /opt:
mkdir software
mkdir module
In /opt/software, extract the JDK:
tar -zxvf jdk-8u121-linux-x64.tar.gz -C /opt/module/
Extract the other archives in the same way (the MySQL package is a plain tar: tar -xvf <mysql archive>).
Note the resulting paths:
/opt/module/jdk1.8.0_121
/opt/module/hadoop-2.9.2
Configure the global environment variables:
vi /etc/profile
#JAVA_HOME
export JAVA_HOME=/opt/module/jdk1.8.0_121
export PATH=$PATH:$JAVA_HOME/bin
##HADOOP_HOME
export HADOOP_HOME=/opt/module/hadoop-2.9.2
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
Apply the changes: source /etc/profile
Edit the Hadoop configuration files (under $HADOOP_HOME/etc/hadoop):
(1) core-site.xml
(2) hadoop-env.sh
(3) hdfs-site.xml
(4) slaves (lists which machines run as DataNodes)
(5) yarn-env.sh
(6) yarn-site.xml
(7) mapred-env.sh
(8) mapred-site.xml
Clone the configured virtual machine in VMware to create nodes 002 and 003 (remember to change the hostname and IP on each clone).
Passwordless SSH login
Run on every machine:
ssh-keygen -t rsa
Then push the authorized_keys file from hadoop201 to the other nodes.
On hadoop201, generate the authorized_keys file:
ssh-copy-id -i /root/.ssh/id_rsa.pub hadoop201
Copy authorized_keys to the hadoop202 and hadoop203 nodes:
scp /root/.ssh/authorized_keys root@hadoop202:/root/.ssh/
scp /root/.ssh/authorized_keys root@hadoop203:/root/.ssh/
From hadoop201, test passwordless login to hadoop202 and hadoop203:
ssh <hostname>
Start the Hadoop cluster
1. Format the NameNode (only on the master): hdfs namenode -format
2. Start the cluster: run start-all.sh on the master.
If the ResourceManager does not come up during startup, check that the firewall is off and start it manually:
sbin/yarn-daemon.sh start resourcemanager
Open 192.168.1.201:50070 in a browser to check the HDFS (NameNode) web UI.
Open 192.168.1.202:8088 to check the YARN (ResourceManager) web UI.
The Hive, Flume, MySQL and Sqoop packages, together with their installation steps, are available at the link below.
Link: https://pan.baidu.com/s/1C3e4FpeX-RQ-9GVak6rekA
Extraction code: sed6
Once they are installed:
Flume configuration
Create the file-hdfs.conf file:
a3.sources = r3
a3.sinks = k3
a3.channels = c3
# Describe/configure the source
a3.sources.r3.type = spooldir
a3.sources.r3.spoolDir = /opt/qiancheng/
a3.sources.r3.fileSuffix = .log
a3.sources.r3.fileHeader = true
a3.sources.r3.inputCharset = GBK
# Describe the sink
a3.sinks.k3.type = hdfs
a3.sinks.k3.hdfs.path = hdfs://hadoop201:9000/flume/%Y%m%d/%H
# prefix for files uploaded to HDFS
a3.sinks.k3.hdfs.filePrefix = wuyou-
# do not roll files based on size or event count
a3.sinks.k3.hdfs.rollSize = 0
a3.sinks.k3.hdfs.rollCount = 0
a3.sinks.k3.hdfs.useLocalTimeStamp = true
a3.sinks.k3.hdfs.fileType = DataStream
# Use a channel which buffers events in memory
a3.channels.c3.type = memory
a3.channels.c3.capacity =30000
a3.channels.c3.transactionCapacity = 30000
# Bind the source and sink to the channel
a3.sources.r3.channels = c3
a3.sinks.k3.channel = c3
Start the Flume agent to ingest the cleaned data file (place it in /opt/qiancheng/ first), e.g. from the Flume home directory: bin/flume-ng agent --conf conf --name a3 --conf-file <path to file-hdfs.conf>
Flume writes many small files under the target HDFS directory, so they need to be merged.
Merge the file data:
hadoop fs -cat /flume/20200712/20/* | hadoop fs -put - /flume/20200712/20
Rename the merged file:
hadoop dfs -mv /flume/20200712/20/- /flume/20200712/20/qcwy
Hive setup: create a table for the imported data
create table wuyouwai(
id varchar(30),
jobname varchar(30),
link varchar(30),
comany_name varchar(30),
address varchar(30),
salary int,
release_date DATE,
job_require varchar(2000),
experience varchar(30),
education_require varchar(30),
head_count varchar(30))
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)](.*)](.*)](.*)](.*)](.*)](.*)](.*)](.*)](.*)](.*)',
'output.format.string'='%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s')
stored as textfile;
Load the merged HDFS file into the table:
load data inpath "/flume/20200712/21/qcwy" into table wuyouwai;
Create a smaller table holding only the fields we need, starting with the 数据采集 (data collection) jobs:
create table xwuyou as
select wuyouwai.jobname as jobname,wuyouwai.salary as salary ,wuyouwai.address as address,wuyouwai.release_date as release_date
from wuyouwai
where jobname LIKE '%数据采集%';
Then append the rows for the other two job families:
insert into table xwuyou
select jobname,salary,address,release_date
from wuyouwai
where jobname ='大数据开发工程师';
insert into table xwuyou
select jobname,salary,address,release_date
from wuyouwai
where jobname = '数据分析';
Analyzing the data with Hive
Approach:
1. Fields needed: job name (wuyouwai.jobname); salary (wuyouwai.salary, aggregated with min(), max(), avg()); location (wuyouwai.address); date (wuyouwai.release_date).
(1) Compute the average, highest and lowest salary for the "数据分析", "大数据开发工程师" and "数据采集" positions, to be shown later as a bar chart.
Result table for the big-data-related salaries:
create table bigdata(
jobname varchar(30),
avg int,
min int,
max int);
create table caiji as
select xwuyou.jobname as jobname,xwuyou.salary as salary ,xwuyou.address as address,xwuyou.release_date as release_date
from xwuyou
where salary is not null;
Round the averages up (ceiling) and insert the results into bigdata; this answers question (1):
insert into table bigdata
select caiji.jobname as jobname,ceiling(avg(salary)),min(salary),max(salary) from caiji where jobname like '数据分析' group by jobname;
insert into table bigdata
select caiji.jobname as jobname,ceiling(avg(salary)),min(salary),max(salary) from caiji where jobname like '大数据开发工程师' group by jobname;
insert into table bigdata
select '数据采集',ceiling(avg(salary)),min(salary),max(salary) from caiji where jobname like '%数据采集%';
(2) Count the number of "数据分析", "大数据开发工程师" and "数据采集" positions in Chengdu, Beijing, Shanghai, Guangzhou and Shenzhen, to be shown later as pie charts.
Create three tables (fenxi, big and cj), one per job family, each with a city column and a count column, and fill them as follows:
insert into table fenxi
select '成都',count(address) from xwuyou where jobname like '数据分析' and address like '%成都%';
insert into table fenxi
select '北京',count(address) from xwuyou where jobname like '数据分析' and address like '%北京%';
insert into table fenxi
select '上海',count(address) from xwuyou where jobname like '数据分析' and address like '%上海%';
insert into table fenxi
select '广州',count(address) from xwuyou where jobname like '数据分析' and address like '%广州%';
insert into table fenxi
select '深圳',count(address) from xwuyou where jobname like '数据分析' and address like '%深圳%';
(the inserts above fill fenxi with the 数据分析 counts)
insert into table big
select '成都',count(address) from xwuyou where jobname like '大数据开发工程师' and address like '%成都%';
insert into table big
select '北京',count(address) from xwuyou where jobname like '大数据开发工程师' and address like '%北京%';
insert into table big
select '上海',count(address) from xwuyou where jobname like '大数据开发工程师' and address like '%上海%';
insert into table big
select '广州',count(address) from xwuyou where jobname like '大数据开发工程师' and address like '%广州%';
insert into table big
select '深圳',count(address) from xwuyou where jobname like '大数据开发工程师' and address like '%深圳%';
(the inserts above fill big with the 大数据开发工程师 counts)
insert into table cj
select '成都',count(address) from xwuyou where jobname like '%数据采集%' and address like '%成都%';
insert into table cj
select '北京',count(address) from xwuyou where jobname like '%数据采集%' and address like '%北京%';
insert into table cj
select '上海',count(address) from xwuyou where jobname like '%数据采集%' and address like '%上海%';
insert into table cj
select '广州',count(address) from xwuyou where jobname like '%数据采集%' and address like '%广州%';
insert into table cj
select '深圳',count(address) from xwuyou where jobname like '%数据采集%' and address like '%深圳%';
(the inserts above fill cj with the 数据采集 counts)
(3) Compute the salary level (average, highest, lowest) of big-data-related positions requiring 1-3 years of experience, to be shown later as a bar chart.
create table jingyan as
select wuyouwai.jobname as jobname,wuyouwai.salary as salary ,wuyouwai.experience as experience
from wuyouwai
where salary is not null and experience like '%经验%' and jobname like '%大数据%';
create table oneth(
jobname varchar(30),
avg int,
min int,
max int);
insert into table oneth
select '大数据相关',ceiling(avg(salary)),min(salary),max(salary) from jingyan
where experience in ('1年经验','2年经验','3-4年经验');
(4) Analyze how demand for big-data-related positions changes over time, to be shown later as a line chart. Build a table with the number of postings per date:
create table fourbigdata(
release_date date,
gangweishu int
);
insert into table fourbigdata
select release_date,count(jobname) from caiji group by release_date;
Export the Hive result tables to HDFS. The general syntax is:
insert overwrite [local] directory '/root'       --> target path; drop "local" to export to HDFS
row format delimited fields terminated by '\t'   --> field delimiter for the export
select * from hive_db;                           --> the rows to export
# fenxi table (数据分析 pie chart)
insert overwrite directory '/flume/20200712/24'
row format delimited fields terminated by '\t'
select * from fenxi;
# big table (大数据开发工程师 pie chart)
insert overwrite directory '/flume/20200712/22'
row format delimited fields terminated by '\t'
select * from big;
# oneth table (1-3 years of experience)
insert overwrite directory '/flume/20200712/21'
row format delimited fields terminated by '\t'
select * from oneth;
# cj table (数据采集 pie chart)
insert overwrite directory '/flume/20200712/23'
row format delimited fields terminated by '\t'
select * from cj;
# bigdata table (salary levels of the three job families)
insert overwrite directory '/flume/20200712/25'
row format delimited fields terminated by '\t'
select * from bigdata;
# caiji table (rows with dates)
insert overwrite directory '/flume/20200712/26'
row format delimited fields terminated by '\t'
select * from caiji;
Open the HDFS file browser to confirm that the exports succeeded.
Import into MySQL with Sqoop
Use Sqoop to load the Hive analysis results into MySQL, where they can then be queried and visualized.
First create the matching tables in MySQL, then import the data with Sqoop.
From Sqoop's bin directory, run:
sqoop export --connect jdbc:mysql://127.0.0.1:3306/wuyou --username root --password 111111 --table caiji --export-dir '/flume/20200712/26' --fields-terminated-by '\t' -m 1
Export the other result directories into their MySQL tables in the same way; a quick way to check the imports is sketched below.
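Before plotting, it is worth confirming that the Sqoop exports actually landed in MySQL. A minimal sketch (assuming the same MySQL host and credentials used by the visualization scripts below, and that all of the listed result tables were created):
import pymysql

db = pymysql.connect(host="192.168.1.201", port=3306, database="wuyou",
                     user="root", password="111111")
with db.cursor() as cursor:
    # one result table per analysis step; adjust the list to whatever you created
    for table in ("bigdata", "fenxi", "big", "cj", "oneth", "caiji", "fourbigdata"):
        cursor.execute("select count(*) from " + table)
        print(table, cursor.fetchone()[0])
db.close()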
Data visualization
Bar chart: salary by position
import pymysql
from pyecharts.charts import Bar
from pyecharts import options as opts
db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from bigdata"
cursor.execute(sql)
data = cursor.fetchall()
print(data)
zhiwei = [data[0][0], data[1][0], data[2][0]]
print(zhiwei)
min_list = [data[0][2], data[1][2], data[2][2]]
max_list = [data[0][3], data[1][3], data[2][3]]
average_list = [data[0][1], data[1][1], data[2][1]]
bar = Bar()
bar.add_xaxis(xaxis_data=zhiwei)
# series name first, then the y-axis values (passed positionally)
bar.add_yaxis("最低工资", min_list)
bar.add_yaxis("最高工资", max_list)
bar.add_yaxis("平均工资", average_list)
# chart title and toolbox
bar.set_global_opts(title_opts=opts.TitleOpts(title='职位工资分析', subtitle='工资单位:万/月'),
                    toolbox_opts=opts.ToolboxOpts())
bar.render("岗位薪资图.html")
Pie chart: data analysis positions by city
import pymysql
from pyecharts.charts import Pie
from pyecharts import options as opts
db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from fenxi"
cursor.execute(sql)
data = cursor.fetchall()
print(data)
addr = ["成都","北京","上海","广州","深圳"]
num = [data[0][1],data[1][1],data[2][1],data[3][1],data[4][1]]
data_pair = [list(z) for z in zip(addr, num)]
data_pair.sort(key=lambda x: x[1])
# draw the pie chart
c = (
    Pie()
    .add("", data_pair)
    .set_global_opts(title_opts=opts.TitleOpts(title="数据分析工程师地区岗位数", subtitle='单位:个数'),
                     toolbox_opts=opts.ToolboxOpts())
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
).render("数据分析工程师地区岗位数.html")
Pie chart: data collection positions by city
import pymysql
from pyecharts.charts import Pie
from pyecharts import options as opts
db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from cj"
cursor.execute(sql)
data = cursor.fetchall()
print(data)
addr = ["成都","北京","上海","广州","深圳"]
num = [data[0][1],data[1][1],data[2][1],data[3][1],data[4][1]]
data_pair = [list(z) for z in zip(addr, num)]
data_pair.sort(key=lambda x: x[1])
# draw the pie chart
c = (
    Pie()
    .add("", data_pair)
    .set_global_opts(title_opts=opts.TitleOpts(title="数据采集工程师地区岗位数", subtitle='单位:个数'),
                     toolbox_opts=opts.ToolboxOpts())
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
).render("数据采集工程师地区岗位数.html")
Pie chart: big data development engineer positions by city
import pymysql
from pyecharts.charts import Pie
from pyecharts import options as opts
db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from big"
cursor.execute(sql)
data = cursor.fetchall()
print(data)
addr = ["成都","北京","上海","广州","深圳"]
num = [data[0][1],data[1][1],data[2][1],data[3][1],data[4][1]]
data_pair = [list(z) for z in zip(addr, num)]
data_pair.sort(key=lambda x: x[1])
# draw the pie chart
c = (
    Pie()
    .add("", data_pair)
    .set_global_opts(title_opts=opts.TitleOpts(title="大数据开发工程师各地区岗位数", subtitle='单位:个数'),
                     toolbox_opts=opts.ToolboxOpts())
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
).render("大数据开发工程师地区岗位数.html")
Bar chart: positions requiring 1-3 years of experience
import pymysql
from pyecharts.charts import Bar
from pyecharts import options as opts
db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from oneth"
cursor.execute(sql)
data = cursor.fetchall()
print(data)
zhiwei = [data[0][0]]
print(zhiwei)
min_list = [data[0][2]]
max_list = [data[0][3]]
average_list = [data[0][1]]
bar = Bar()
bar.add_xaxis(xaxis_data=zhiwei)
# series name first, then the y-axis values (passed positionally)
bar.add_yaxis("最低工资", min_list)
bar.add_yaxis("最高工资", max_list)
bar.add_yaxis("平均工资", average_list)
# chart title and toolbox
bar.set_global_opts(title_opts=opts.TitleOpts(title='1-3年经验', subtitle='工资单位:万/月'),
                    toolbox_opts=opts.ToolboxOpts())
bar.render("1-3年经验.html")
Line chart: trend in demand for big data positions
import pymysql
from pyecharts.charts import Line
from pyecharts import options as opts
db = pymysql.connect(host="192.168.1.201",port=3306,database="wuyou",user='root',password='111111')
cursor = db.cursor()
sql = "select * from fourbigdata"
cursor.execute(sql)
data = cursor.fetchall()
# sort the rows by date so the line is drawn in chronological order
rows = sorted(data, key=lambda r: r[0])
time_list = [str(r[0]) for r in rows]
renshu = [int(r[1]) for r in rows]
print(time_list)
print(renshu)
(
    Line(init_opts=opts.InitOpts(width="6000px", height="800px"))
    .set_global_opts(
        tooltip_opts=opts.TooltipOpts(is_show=False),
        xaxis_opts=opts.AxisOpts(type_="category"),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
    )
    .add_xaxis(xaxis_data=time_list)
    .add_yaxis(
        series_name="大数据岗位需求变化趋势",
        y_axis=renshu,
        symbol="emptyCircle",
        is_symbol_show=True,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .render("需求变化趋势.html")
)