目录
1.将数据插入hive中
-
在Termius客户端中将文件传到hive中
-
在Termius客户端中将本地文件上传到HDFS中
hadoop fs -put chat_data-10W.csv /
-
可以查看是否成功上传
[root@node1 data]# hadoop fs -ls /
Found 6 items
-rw-r--r-- 1 root supergroup 6 2024-04-23 08:10 /a.txt
-rw-r--r-- 1 root supergroup 38148732 2024-04-25 09:28 /chat_data-10W.csv
drwxr-xr-x - root supergroup 0 2024-04-24 17:22 /dir1
-rw-r--r-- 1 root supergroup 12 2021-07-07 16:21 /hello.txt
drwxrwx--- - root supergroup 0 2021-07-07 13:26 /tmp
drwxr-xr-x - root supergroup 0 2024-04-24 11:08 /user
您在 /var/spool/mail/root 中有新邮件
-
先在idea中先创建一个表
create table if not exists csv(
msg_time string comment "消息发送时间",
sender_name string comment "发送人昵称",
sender_account string comment "发送人账号",
sender_sex string comment "发送人性别",
sender_ip string comment "发送人ip地址",
sender_os string comment "发送人操作系统",
sender_phonetype string comment "发送人手机型号",
sender_network string comment "发送人网络类型",
sender_gps string comment "发送人的GPS定位",
receiver_name string comment "接收人昵称",
receiver_ip string comment "接收人IP",
receiver_account string comment "接收人账号",
receiver_os string comment "接收人操作系统",
receiver_phonetype string comment "接收人手机型号",
receiver_network string comment "接收人网络类型",
receiver_gps string comment "接收人的GPS定位",
receiver_sex string comment "接收人性别",
msg_type string comment "消息类型",
distance string comment "双方距离",
message string comment "消息内容" )
row format delimited fields terminated by '' stored as textfile ;
-
然后将HDFS上的数据文件加载到Hive表中
load data inpath '/chat_data-10W.csv' into table csv;
2.根据需求筛选信息
-
统计今日总消息量
create table if not exists TodayTotalMessageCount as
select
substr(msg_time,1,10)as message_date,
count(*) as daily_message_count
from csv
group by
substr(msg_time,1,10);
-
统计今日发送用户量
create table if not exists TodaySentUserCount comment '统计今日发送数据量'as
select count(distinct sender_account) as today_senders
from csv
where date(substr(msg_time,1,10)) ='2023-01-01';
-
统计今日接收用户量
create table if not exists TodayReceivedUserCount comment '统计今日接收数据量' as
select count(distinct receiver_account) as today_receivers
from csv
where date(substr(msg_time,1,10)) = '2023-01-01';
-
统计每小时消息量
create table if not exists HourlyMessageVolume comment '统计每小时消息量' as
select date_format(msg_time,'yyyy-MM-dd HH') as hour,
count(*) as hourly_message_count
from csv
where substr(msg_time,1,10) = '2023-01-01'
group by date_format(msg_time,'yyyy-MM-dd HH');
-
统计每日小时发送用户量
create table if not exists DailyHourlySendingUsers comment '统计每日小时发送用户量' as
select
date_format(msg_time, 'yyyy-MM-dd HH') as hour,
count(distinct sender_account) as hourly_senders
from
csv
where
substr(msg_time, 1, 10) = '2023-01-01'
group by
date_format(msg_time, 'yyyy-MM-dd HH');
-
统计每小时接受用户量
create table if not exists HourlyReceivedUsersCount comment '统计每小时接收用户量' as
select
date_format(msg_time,'yyyy-MM-dd HH') as hour,
count(distinct receiver_account) as hourly_receivers
from
csv
where
substr(msg_time,1,10) = '2023-01-01'
group by
date_format(msg_time,'yyyy-MM-dd HH');
-
统计今日发送消息最多的Top10用户
create table if not exists Top10UsersByMessagesSentToday comment '统计今日发送消息最多的Top10用户' as
select
sender_account,
count(*) as message_count
from
csv
where
substr(msg_time,1,10) = '2023-01-01'
group by
sender_account
order by
message_count desc
limit 10;
-
统计发送人的设备操作系统分布情况
create table if not exists DeviceOSDistributionAmongSenders comment '统计发送人的设备操作系统分布情况' as
select
sender_os as operating_system,
count(*) as count
from
csv
group by
sender_os
order by
count desc ;
-
统计发送人的手机型号分布情况
create table if not exists SmartphoneModelDistributionAmongSenders comment '统计发送人的手机型号分布情况' as
select
sender_phonetype as phone_model,
count(*) as model_count
from csv
group by sender_phonetype
order by model_count desc ;
-
统计发送人的网络类型分布情况
create table if not exists NetworkTypeDistributionAmongSenders comment '统计发送人的网络类型分布情况' as
select
sender_network as network_type,
count(*) as network_count
from csv
group by sender_network
order by network_count desc ;
-
统计发送信息各地区数据量
create table if not exists RegionalDataVolumeOfSentMessages comment '统计发送信息各地数据量' as
select
sender_gps as region,
count(*) as message_count
from
csv
group by
sender_gps
order by
message_count desc ;
可以看到已经将统计的数据建成了对应的表