Day4 认知实习——项目实战

目录

1.将数据插入hive中

在Termius中将文件传到hive中

在Termius客户端中将本地文件上传到HDFS中

可以查看是否成功上传

先在idea中先创建一个表

然后将HDFS上的数据文件加载到Hive表中

2.根据需求筛选信息

统计今日总消息量

统计今日发送用户量

统计今日接收用户量

统计每小时消息量

统计每日小时发送用户量

统计每小时接受用户量

统计今日发送消息最多的Top10用户

统计发送人的设备操作系统分布情况

统计发送人的手机型号分布情况

统计发送人的网络类型分布情况

统计发送信息各地区数据量


1.将数据插入hive中

  • 在Termius客户端中将文件传到hive中

  • 在Termius客户端中将本地文件上传到HDFS中

hadoop fs -put chat_data-10W.csv /
  • 可以查看是否成功上传

[root@node1 data]# hadoop fs -ls /
Found 6 items
-rw-r--r--   1 root supergroup          6 2024-04-23 08:10 /a.txt
-rw-r--r--   1 root supergroup   38148732 2024-04-25 09:28 /chat_data-10W.csv
drwxr-xr-x   - root supergroup          0 2024-04-24 17:22 /dir1
-rw-r--r--   1 root supergroup         12 2021-07-07 16:21 /hello.txt
drwxrwx---   - root supergroup          0 2021-07-07 13:26 /tmp
drwxr-xr-x   - root supergroup          0 2024-04-24 11:08 /user
您在 /var/spool/mail/root 中有新邮件
  • 先在idea中先创建一个表

create table if not exists csv(
    msg_time string comment "消息发送时间",
    sender_name string comment "发送人昵称",
    sender_account string comment "发送人账号",
    sender_sex string comment "发送人性别",
    sender_ip string comment "发送人ip地址",
    sender_os string comment "发送人操作系统",
    sender_phonetype string comment "发送人手机型号",
    sender_network string comment "发送人网络类型",
    sender_gps string comment "发送人的GPS定位",
    receiver_name string comment "接收人昵称",
    receiver_ip string comment "接收人IP",
    receiver_account string comment "接收人账号",
    receiver_os string comment "接收人操作系统",
    receiver_phonetype string comment "接收人手机型号",
    receiver_network string comment "接收人网络类型",
    receiver_gps string comment "接收人的GPS定位",
    receiver_sex string comment "接收人性别",
    msg_type string comment "消息类型",
    distance string comment "双方距离",
    message string comment "消息内容" )

    row format delimited fields terminated by '' stored as textfile ;
  • 然后将HDFS上的数据文件加载到Hive表中

load data inpath '/chat_data-10W.csv' into table csv;

2.根据需求筛选信息

  • ​​​​​​​统计今日总消息量

create table if not exists TodayTotalMessageCount as
select
    substr(msg_time,1,10)as message_date,
    count(*) as daily_message_count
from csv
group by
    substr(msg_time,1,10);
  • 统计今日发送用户量

create table if not exists TodaySentUserCount comment '统计今日发送数据量'as
select count(distinct sender_account) as today_senders
from csv
where date(substr(msg_time,1,10)) ='2023-01-01';
  • 统计今日接收用户量

create table if not exists TodayReceivedUserCount comment '统计今日接收数据量' as
select count(distinct receiver_account) as today_receivers
from csv
where date(substr(msg_time,1,10)) = '2023-01-01';
  • 统计每小时消息量

create table if not exists HourlyMessageVolume comment '统计每小时消息量' as
select date_format(msg_time,'yyyy-MM-dd HH') as hour,
       count(*) as hourly_message_count
from csv
where substr(msg_time,1,10) = '2023-01-01'
group by date_format(msg_time,'yyyy-MM-dd HH');

  • 统计每日小时发送用户量

create table if not exists DailyHourlySendingUsers comment '统计每日小时发送用户量' as
select
    date_format(msg_time, 'yyyy-MM-dd HH') as hour,
    count(distinct sender_account) as hourly_senders
from
    csv
where
        substr(msg_time, 1, 10) = '2023-01-01'
group by
    date_format(msg_time, 'yyyy-MM-dd HH');
  • 统计每小时接受用户量

create table if not exists HourlyReceivedUsersCount comment '统计每小时接收用户量' as
select
    date_format(msg_time,'yyyy-MM-dd HH') as hour,
    count(distinct receiver_account) as hourly_receivers
from
    csv
where
        substr(msg_time,1,10) = '2023-01-01'
group by
    date_format(msg_time,'yyyy-MM-dd HH');
  • 统计今日发送消息最多的Top10用户

create table if not exists Top10UsersByMessagesSentToday comment '统计今日发送消息最多的Top10用户' as
select
    sender_account,
    count(*) as message_count
from
    csv
where
        substr(msg_time,1,10) = '2023-01-01'
group by
    sender_account
order by
    message_count desc
limit 10;
  • 统计发送人的设备操作系统分布情况

create table if not exists DeviceOSDistributionAmongSenders comment '统计发送人的设备操作系统分布情况' as
select
    sender_os as operating_system,
    count(*) as count
from
    csv
group by
    sender_os
order by
    count desc ;
  • 统计发送人的手机型号分布情况

create table if not exists SmartphoneModelDistributionAmongSenders comment '统计发送人的手机型号分布情况' as
select
    sender_phonetype as phone_model,
    count(*) as model_count
from csv
group by sender_phonetype
order by model_count desc ;
  • 统计发送人的网络类型分布情况

create table if not exists NetworkTypeDistributionAmongSenders comment '统计发送人的网络类型分布情况' as
select
    sender_network as network_type,
    count(*) as network_count
from csv
group by sender_network
order by network_count desc ;
  • 统计发送信息各地区数据量

create table if not exists RegionalDataVolumeOfSentMessages comment '统计发送信息各地数据量' as
select
    sender_gps as region,
    count(*) as message_count
from
    csv
group by
    sender_gps
order by
    message_count desc ;

可以看到已经将统计的数据建成了对应的表

评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值