hive-flume爬虫

本文详细介绍了使用Flume监听数据,Hive进行数据处理和分析的过程。包括创建数据库和表,处理不同工资单位的数据以获取最大、最小和平均工资,以及地区分布情况。进一步分析了大数据相关岗位1-3年工作经验的薪资水平,展示了数据需求的走向趋势。
摘要由CSDN通过智能技术生成

flume监听

[root@hadoop104 conf]# pwd
/opt/module/apache-flume-1.6.0-bin/conf
[root@hadoop104 conf]# vi qc.conf

配置内容如下

name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -f //home/temdata/qianchen.txt
a1.sources.r1.channels = c1

# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path =hdfs://192.168.10.104:9000/flume/tailout/%y-%m-%d/%H-%M/ 
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
#生成的文件类型,默认是Sequencefile,可用DataStream,则为普通文本
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

保存,并运行

[root@hadoop104 conf]# ../bin/flume-ng agent -c conf -f qc.conf -name a1 -Dflume.root.logger=DEBUG,console

在这里插入图片描述
在这里插入图片描述

进入hive

1.创建数据库

hive> create database if not exists  qianchen;
hive> use qianchen;

2.创建表

hive>  create table t_qianchen(id string,position string,salary string,company string,place string,job_require string,eduction string,experience string)row format delimited fields terminated by ',' stored as textfile ;
hive>  load data local inpath "/home/temdata/qianchen.txt" into table t_qianchen;
hive> select * from t_qianchen;

在这里插入图片描述重命名数据名:

hive> select positiion as position from t_qianchen ;

新建一个表t_qianchen1,拆分数据:

hive> create table t_qianchen1 as select position,salary from t_qianchen;

去除字段里面的双引号:

hive> create table t_qianchen2 as select regexp_replace(position, '"','')as position,regexp_replace(salary, '"','')as salary from t_qianchen1;

去掉中括号的命令:

hive> create table t_qianchen12 as select regexp_replace(salary, '\\[|\\]','')as salary from t_qianchen1;

查字段名:

hive> desc 表名;

修改表名:

hive> alter table t_qianchen2 rename to t_qianchen1;

去重:

hive> create table t_qianchen2 as select  position ,salary  from t_qianchen1 group by position,salary;

一、 最低最高平均工资柱状图:

1.数据分析:

hive> create table t_qianchen3 as select position,salary from t_qianchen2 where position like '%数据分析%';

在这里插入图片描述取数据分析工资字段shujufenxi_salary

  create table shujufenxi_salary  as select regexp_replace(salary, '\\[|\\]','')as salary from t_qianchen3;

(1)取出工资单位为年的shujufenxi_salary_year

 create table shujufenxi_salary_year as select * from shujufenxi_salary where salary like '%年';

去除/年shujufenxi_salary_year1

hive> create table shujufenxi_salary_year1 as select regexp_replace(salary, '/年', '')as pay from  shujufenxi_salary_year;

转换得到万的工资shujufenxi_salary_year1_1

hive> create table shujufenxi_salary_year1_1 as select regexp_replace(pay, '万','')as pay from  shujufenxi_salary_year1 where pay like '%万';

切分shujufenxi_salary_year1_1_1

hive>  create table shujufenxi_salary_year1_1_1 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from shujufenxi_salary_year1_1;

分别的到最大最小SJFX1

hive> create table SJFX1 as select regexp_replace(max, 'salary:','')as max,regexp_replace(min, 'salary:','')as min from shujufenxi_salary_year1_1_1;

将万写成10000,求出每个月的工资,并求平均值SJFX

create table SJFX as select min*10000/12 as min,max*10000/12 as max,(min+max)/2*10000/12 as avg from SJFX1;

(2)取出工资单位为月的shujufenxi_salary_month

 create table shujufenxi_salary_month as select * from shujufenxi_salary where salary like '%月';

去除/月shujufenxi_salary_month1

hive> create table shujufenxi_salary_month1 as select regexp_replace(salary, '/月', '')as pay from  shujufenxi_salary_month;

2.1)转换得到万的工资shujufenxi_salary_month1_1

hive> create table shujufenxi_salary_month1_1 as select regexp_replace(pay, '万','')as pay from  shujufenxi_salary_month1 where pay like '%万';

切分shujufenxi_salary_month1_1_1

 create table shujufenxi_salary_month1_1_1 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from shujufenxi_salary_month1_1;

分别的到最大最小SJFX2

hive> create table SJFX2 as select regexp_replace(max, 'salary:','')as max,regexp_replace(min, 'salary:','')as min from shujufenxi_salary_month1_1_1;

将万写成10000,并求平均值SJFX21

hive> create table SJFX21  as select min*10000 as min,max*10000 as max,(min+max)/2*10000 as avg from SJFX2 ;

将SJFX21插入数据到SJFX

hive> insert into SJFX select * from SJFX21;

2.2)转换得到千的工资shujufenxi_salary_month1_2

hive> create table shujufenxi_salary_month1_2 as select regexp_replace(pay, '千','')as pay from  shujufenxi_salary_month1 where pay like '%千';

切分shujufenxi_salary_month1_2_1

hive>  create table shujufenxi_salary_month1_2_1 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from shujufenxi_salary_month1_2;

分别的到最大最小SJFX3

hive> create table SJFX3 as select regexp_replace(max, 'salary:','')as max,regexp_replace(min, 'salary:','')as min from shujufenxi_salary_month1_2_1;

将千写成1000,并求平均值SJFX31

create table SJFX31  as select min*1000 as min,max*1000 as max,(min+max)/2*1000 as avg from SJFX3 ;

将SJFX31插入数据到SJFX

hive> insert into SJFX select * from SJFX31;

(3)终:三表合一SJFX,求最大最小平均SJFX_all

hive> create table SJFX_all as select min(min) as min,max(m
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值