创建数据库:
create database if not exists myhive2;
连接数据库:
use myhive2;
创建一个收发人基本信息详情表:
create table myhive2.news(
send_date STRING comment '日期',
id STRING comment '发件人姓名' ,
send_num STRING comment '发件人手机号',
send_sex STRING comment '发件人性别',
send_ip STRING comment '发件人ip地址',
send_from STRING comment '发件人手机系统',
send_phone STRING comment '发件人手机型号',
send_network STRING comment '发件人手机网络',
send_site STRING comment '发件人地址',
re_id STRING comment '收件人姓名' ,
re_ip STRING comment '收件人ip地址',
re_num STRING comment '收件人手机号',
re_from STRING comment '收件人手机系统',
re_phone STRING comment '收件人手机型号',
re_network STRING comment '收件人手机网络',
re_site STRING comment '收件人地址',
re_sex STRING comment '收件人性别',
type STRING comment '文件类型',
distance STRING comment '距离',
content STRING comment '发送内容'
)
COMMENT '信息表';
插入用户信息:
load data local inpath '/chat_data-10W.csv' into table myhive2.news;
(自备的数据chat_data-10W.csv)
实战数据处理:
1.统计今日总消息量
SELECT COUNT(*)
FROM myhive2.news;
2.统计今日发送和接收用户数
SELECT
COUNT(DISTINCT id) AS total_unique_senders,
COUNT(DISTINCT re_id) AS total_unique_receivers
FROM myhive2.news;
3.统计每小时消息量,发送和接收用户量
SELECT
DATE_FORMAT(send_date, 'yyyy-MM-dd HH') AS hour,
COUNT(*) AS message_count,
COUNT(DISTINCT id) AS unique_senders,
COUNT(DISTINCT re_id) AS unique_receivers
FROM myhive2.news
GROUP BY DATE_FORMAT(send_date, 'yyyy-MM-dd HH')
ORDER BY hour;
4.查询每个人发了多少条信息
select id,count(*) from myhive2.news group by id;
SELECT
id AS person_name,
COUNT(*) AS message_count
FROM myhive2.news
GROUP BY id
ORDER BY message_count DESC;
5.统计发送人的设备操作系统分布情况
SELECT
send_from AS operating_system,
COUNT(*) AS count
FROM myhive2.news
GROUP BY send_from;
6.统计发送人的手机型号分布情况
SELECT send_phone, COUNT(*) AS count
FROM myhive2.news
GROUP BY send_phone;
7.统计发送人的网络类型分布情况
SELECT send_network, COUNT(*) AS count
FROM myhive2.news
GROUP BY send_network;
8.统计发送消息各地区数据量
SELECT
send_site AS region,
COUNT(*) AS message_count
FROM myhive2.news
GROUP BY send_site;