hive-flume爬虫

最新推荐文章于 2023-12-05 15:30:00 发布

一休祺

最新推荐文章于 2023-12-05 15:30:00 发布

阅读量503

点赞数

文章标签： hive sqoop flume

本文链接：https://blog.csdn.net/heshiqi9927/article/details/107207557

版权

本文详细介绍了使用Flume监听数据，Hive进行数据处理和分析的过程。包括创建数据库和表，处理不同工资单位的数据以获取最大、最小和平均工资，以及地区分布情况。进一步分析了大数据相关岗位1-3年工作经验的薪资水平，展示了数据需求的走向趋势。

摘要由CSDN通过智能技术生成

flume监听

[root@hadoop104 conf]# pwd
/opt/module/apache-flume-1.6.0-bin/conf
[root@hadoop104 conf]# vi qc.conf

配置内容如下

name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -f //home/temdata/qianchen.txt
a1.sources.r1.channels = c1

# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path =hdfs://192.168.10.104:9000/flume/tailout/%y-%m-%d/%H-%M/ 
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
#生成的文件类型，默认是Sequencefile，可用DataStream，则为普通文本
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

保存，并运行

[root@hadoop104 conf]# ../bin/flume-ng agent -c conf -f qc.conf -name a1 -Dflume.root.logger=DEBUG,console

在这里插入图片描述

进入hive

1.创建数据库

hive> create database if not exists  qianchen;
hive> use qianchen;

2.创建表

hive>  create table t_qianchen(id string,position string,salary string,company string,place string,job_require string,eduction string,experience string)row format delimited fields terminated by ',' stored as textfile ;
hive>  load data local inpath "/home/temdata/qianchen.txt" into table t_qianchen;
hive> select * from t_qianchen;

在这里插入图片描述重命名数据名：

hive> select positiion as position from t_qianchen ;

新建一个表t_qianchen1，拆分数据：

hive> create table t_qianchen1 as select position,salary from t_qianchen;

去除字段里面的双引号：

hive> create table t_qianchen2 as select regexp_replace(position, '"','')as position,regexp_replace(salary, '"','')as salary from t_qianchen1;

去掉中括号的命令：

hive> create table t_qianchen12 as select regexp_replace(salary, '\\[|\\]','')as salary from t_qianchen1;

查字段名：

hive> desc 表名;

修改表名：

hive> alter table t_qianchen2 rename to t_qianchen1;

去重：

hive> create table t_qianchen2 as select  position ,salary  from t_qianchen1 group by position,salary;

一、最低最高平均工资柱状图：

1.数据分析：

hive> create table t_qianchen3 as select position,salary from t_qianchen2 where position like '%数据分析%';

在这里插入图片描述取数据分析工资字段shujufenxi_salary

  create table shujufenxi_salary  as select regexp_replace(salary, '\\[|\\]','')as salary from t_qianchen3;

（1）取出工资单位为年的shujufenxi_salary_year

 create table shujufenxi_salary_year as select * from shujufenxi_salary where salary like '%年';

去除/年shujufenxi_salary_year1

hive> create table shujufenxi_salary_year1 as select regexp_replace(salary, '/年', '')as pay from  shujufenxi_salary_year;

转换得到万的工资shujufenxi_salary_year1_1

hive> create table shujufenxi_salary_year1_1 as select regexp_replace(pay, '万','')as pay from  shujufenxi_salary_year1 where pay like '%万';

切分shujufenxi_salary_year1_1_1

hive>  create table shujufenxi_salary_year1_1_1 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from shujufenxi_salary_year1_1;

分别的到最大最小SJFX1

hive> create table SJFX1 as select regexp_replace(max, 'salary:','')as max,regexp_replace(min, 'salary:','')as min from shujufenxi_salary_year1_1_1;

将万写成10000，求出每个月的工资，并求平均值SJFX

create table SJFX as select min*10000/12 as min,max*10000/12 as max,(min+max)/2*10000/12 as avg from SJFX1;

（2）取出工资单位为月的shujufenxi_salary_month

 create table shujufenxi_salary_month as select * from shujufenxi_salary where salary like '%月';

去除/月shujufenxi_salary_month1

hive> create table shujufenxi_salary_month1 as select regexp_replace(salary, '/月', '')as pay from  shujufenxi_salary_month;

2.1）转换得到万的工资shujufenxi_salary_month1_1

hive> create table shujufenxi_salary_month1_1 as select regexp_replace(pay, '万','')as pay from  shujufenxi_salary_month1 where pay like '%万';

切分shujufenxi_salary_month1_1_1

 create table shujufenxi_salary_month1_1_1 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from shujufenxi_salary_month1_1;

分别的到最大最小SJFX2

hive> create table SJFX2 as select regexp_replace(max, 'salary:','')as max,regexp_replace(min, 'salary:','')as min from shujufenxi_salary_month1_1_1;

将万写成10000，并求平均值SJFX21

hive> create table SJFX21  as select min*10000 as min,max*10000 as max,(min+max)/2*10000 as avg from SJFX2 ;

将SJFX21插入数据到SJFX

hive> insert into SJFX select * from SJFX21;

2.2）转换得到千的工资shujufenxi_salary_month1_2

hive> create table shujufenxi_salary_month1_2 as select regexp_replace(pay, '千','')as pay from  shujufenxi_salary_month1 where pay like '%千';

切分shujufenxi_salary_month1_2_1

hive>  create table shujufenxi_salary_month1_2_1 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from shujufenxi_salary_month1_2;

分别的到最大最小SJFX3

hive> create table SJFX3 as select regexp_replace(max, 'salary:','')as max,regexp_replace(min, 'salary:','')as min from shujufenxi_salary_month1_2_1;

将千写成1000，并求平均值SJFX31

create table SJFX31  as select min*1000 as min,max*1000 as max,(min+max)/2*1000 as avg from SJFX3 ;

将SJFX31插入数据到SJFX

hive> insert into SJFX select * from SJFX31;

（3）终：三表合一SJFX，求最大最小平均SJFX_all

hive> create table SJFX_all as select min(min) as min,max(m

最低0.47元/天解锁文章

一休祺

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫