将csv文件导入到hive上
先将csv文件存进虚拟机里面
再将csv文件存入集群里面
在pycharm中连接hive中,并进行建数据库与建表
create external table if not exists 10w (
msg_time string comment "消息发送时间"
, sender_name string comment "发送人昵称"
, sender_account string comment "发送人账号"
, sender_sex string comment "发送人性别"
, sender_ip string comment "发送人ip地址"
, sender_os string comment "发送人操作系统"
, sender_phonetype string comment "发送人手机型号"
, sender_network string comment "发送人网络类型"
, sender_gps string comment "发送人的GPS定位"
, receiver_name string comment "接收人昵称"
, receiver_ip string comment "接收人IP"
, receiver_account string comment "接收人账号"
, receiver_os string comment "接收人操作系统"
, receiver_phonetype string comment "接收人手机型号"
, receiver_network string comment "接收人网络类型"
, receiver_gps string comment "接收人的GPS定位"
, receiver_sex string comment "接收人性别"
, msg_type string comment "消息类型"
, distance string comment "双方距离"
, message string comment "消息内容"
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '' STORED AS TEXTFILE;
load data inpath '/chat_data-10W.csv' into table 10w;
load data inpath 'csv文件在集群中的位置' into table 表名;
最后成果如下图
数据查询
统计今日信息总量
select
substr(msg_time,1,10)as message_date,
count(*) as daily_message_count
from 10w
group by
substr(msg_time,1,10);
统计每小时消息量,发送和接收用户数
create table if not exists 10w2 comment '每小时消息量趋势'
as select dayinfo, hourinfo,
count(*) as total_msg_cnt,
count(distinct sender_account) as sender_usr_cnt,
count(distinct receiver_account) as receiver_usr_cnt
from 10w
group by dayinfo, hourinfo;
统计今日发送消息最多的Top10用户
select
sender_account,
count(*) as message_count
from
10w
where
substr(msg_time,1,10) = '2023-01-01'
group by
sender_account
order by
message_count desc
limit 10
统计发送人的设备操作系统分布情况
select
sender_os as operating_system,
count(*) as count
from
10w
group by
sender_os
order by
count desc
统计发送人的手机型号分布情况
select
sender_phonetype as phone_model,
count(*) as model_count
from 10w
group by sender_phonetype
order by model_count desc