一、导出数据
导出为zh_all2.txt文件
二、上传数据
三、使用Flume传入HDFS
(1)编写conf文件
在flume的conf目录下新建文件
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=exec
a1.sources.r1.command=tail -F /opt/module/flume-1.9.0/conf/data/zh_all2.txt
a1.sources.r1.bind=0.0.0.0
a1.sources.r1.port=44444
a1.sinks.s1.type=HDFS
a1.sinks.s1.type=hdfs://hadoop129:90000/user/flume/qcwy_txt
a1.sinks.s1.hdfs.rollCount=0
a1.sinks.s1.hdfs.fileType=Datastream
# 配置a1的channel组件c1的属性
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
# 为source和sink组件绑定channel
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
文件传入成功
四、数据分析
1、利用hive进行分析,2、将hive分析结果利用sqoop技术存储到mysql数据库中,并最后显示分析结果
1、启动Hive导入zh_all2.txt数据
2、查看table表qcwy2
3、岗位薪资分析
分析“数据分析”、“大数据开发工程师”、“数据采集”等岗位的平均工资、最高工资、最低工资,并作条形图将结果展示出来
A. 数据分析岗位
(1)模糊匹配提取
以模糊匹配提取出数据分析岗位的记录,存入表f_x_1(只存Jobtitle和wages字段)
(2)切分薪资字段存储
create table f_x_2 as select Jobtitle, regexp_extract(wages,'([0-9]+)-',1) as a_min, regexp_extract(wages,'-([0-9]+)',1) as a_max, (regexp_extract(wages,'([0-9]+)-',1) + regexp_extract(wages,'-([0-9]+)',1))/2 as a_avg from f_x_1;
数据分析
数据采集
大数据
(3)计算最大 、最小、平均
create table f_x_3 as select "数据分析" as Jobtitle, min(int(a_min)*0.1) as s_min, max(int(a_max)*0.1) as s_max, regexp_extract(avg(a_avg),'([0-9]+.[0-9]?[0-9]?)',1)*0.1 as s_avg from f_x_2;
汇总
(4)、下面查询大数据、数据采集方法类似、然后汇总为一张总表
四、使用sqoop存到mysql
(1)在mysql创建数据库数据表
进入数据库:mysql -u root -p
创建qcwy_db数据库
使用qcwy_db数据库创建表
(1)创建表:create table tab1(t_name varchar(20), t_min int, t_max int, t_avg varchar(10)) charset utf8 collate utf8_general_ci;
(2)导入数据
bin/sqoop export --connect jdbc:mysql://hadoop129:3306/qcwy_db --username root --password 1 --table tab1 --export-dir /user/hive/warehouse/qcwy_db.db/tab1 --input-null-string "\\\\N" --input-null-non-string "\\\\N" --input-fields-terminated-by "\001" --input-lines-terminated-by "\\n" -m 1
查询导入的数据
查询城市岗位数
可视化分析
创建远程访问mysql数据库用户
GRANT ALL PRIVILEGES ON *.* TO 'admin'@'%' IDENTIFIED BY '1' WITH GRANT OPTION;
1、岗位薪资分析
import pymysql
from pyecharts.charts import Bar
from pyecharts import options as opts
class MysqlTool:
def __init__(self,host,user,password,database,port = 3306,charset = 'utf8'):
self.host = host
self.user = user
self.password = password
self.database = database
self.port = port
self.charset = charset
def connect(self):#连接数据库
self.conn = pymysql.connect(
host = self.host,
user = self.user,
password = self.password,
database = self.database,
port = self.port,
charset = self.charset
)
self.cursor = self.conn.cursor() #cursor获取游标
#增删改
#sql:要执行的sql语句
#args:带参sql的值
#返回受影响的行数
def __cud(self,sql,args = None):#私有
row_count = 0
try:
self.connect()
row_count = self.cursor.execute(sql,args)#execute执行
self.conn.commit()#commit提交
self.close()
except Exception as e:
print(e)
return row_count
#插入
def insert(self,sql,args):
return self.__cud(sql,args)
#修改
def updata(self,sql,args):
return self.__cud(sql,args)
#删除
def delete