1、技术架构
业务数据存储在: mysql
中
注意
:
数据导出存在在
: csv
文件中
java
环境
: jdk1.8
海量数据存储
: Hadoop - HDFS
海量数据计算
: Hive / Hadoop - MapReduce
集群资源调度
: Hadoop - YARN
可视化报表工具
: FineBI
2、项目报表需求
1
、统计今日总消息量
2
、统计今日发送和接收用户数
3
、统计每小时消息量、发送和接收用户数
4
、统计今日发送消息最多的
Top10
用户
5
、统计发送人的设备操作系统分布情况
6
、统计发送人的手机型号分布情况
7
、统计发送人的网络类型分布情况
8
、统计发送消息各地区数据量
3、项目业务数据
4、代码展示
--如果数据库已存在就删除
drop database if exists lj cascade;
--创建数据库
create database lj;
--切换数据库
use lj;
--列举数据库
show databases ;
--如果表已存在就删除
drop table if exists lj.source ;
--建表
create table lj.source(
msg_time string comment "消息时间"
, sender_name string comment "发件人昵称"
, sender_account string comment "发件人账号"
, sender_sex string comment "发件人性别"
, sender_ip string comment "发件人IP"
, sender_os string comment "发件人系统"
, sender_phonetype string comment "发件人手机型号"
, sender_network string comment "发件人网络制式"
, sender_gps string comment "发件人GPS"
, receiver_name string comment "收件人昵称"
, receiver_ip string comment "收件人IP"
, receiver_account string comment "收件人账号"
, receiver_os string comment "收件人系统"
, receiver_phonetype string comment "收件人手机型号"
, receiver_network string comment "收件人网络制式"
, receiver_gps string comment "收件人GPS"
, receiver_sex string comment "收件人性别"
, msg_type string comment "消息类型"
, distance string comment "双方距离"
, message string comment "消息" )
load data inpath '/chat_data-10W.csv' into table lj.source;
-- 若已存在转换后的表,则将其删除
drop table if exists lj.table1;
-- 基于SELECT查询结果创建转换后的表
create table lj.table1 as
select *,
substr(msg_time, 1, 10) as dayinfo,
substr(msg_time, 12, 2) as hourinfo, -- 获取天和小时
split(sender_gps, ",")[0] as sender_lng,
split(sender_gps, ",")[1] as sender_lat -- 提取经度纬度
from lj.source
where length(sender_gps) > 0;
-- 预览转换后表的前10行
select msg_time, dayinfo, hourinfo, sender_gps, sender_lng, sender_lat
from lj.table1
limit 10;
5、数据查询
-- 指标1:统计今日信息总量
create table if not exists lj.tb_1 comment '今日总消息量'
as select dayinfo, count(*) as total_msg_cnt
from lj.table1
group by dayinfo;
-- 指标2:统计每小时消息量,发送和接收用户数
create table if not exists lj.tb_2 comment '每小时消息量趋势'
as select dayinfo, hourinfo,
count(*) as total_msg_cnt,
count(distinct sender_account) as sender_usr_cnt,
count(distinct receiver_account) as receiver_usr_cnt
from lj.table1
group by dayinfo, hourinfo;
-- 指标3:统计今日发送和接收用户人数
create table if not exists lj.tb_3
comment '今日发送消息人数、接收消息人数'
as select dayinfo,
count(distinct sender_account) as sender_usr_cnt,
count(distinct receiver_account) as receiver_usr_cnt
from lj.table1
group by dayinfo;
-- 指标4:统计发送消息条数最多的Top10用户
create table if not exists lj.tb_4
comment '发送消息条数最多的Top10用户'
as select dayinfo, sender_name as username,
count(*) as sender_msg_cnt
from lj.table1
group by dayinfo, sender_name
order by sender_msg_cnt desc
limit 10;
-- 指标5:统计发送人手机型号分布
create table if not exists lj.tb_5
comment '发送人的手机型号分布'
as select dayinfo, sender_phonetype,
count(distinct sender_account) as cnt
from lj.table1
group by dayinfo, sender_phonetype;
-- 指标6:统计发送人的操作系统分布
create table if not exists lj.tb_7
comment '发送人的OS分布'
as select dayinfo, sender_os,
count(distinct sender_account) as cnt
from lj.table1
group by dayinfo, sender_os;
-- 指标7:统计接收消息条数最多的Top10用户
create table if not exists lj.tb_8
comment '接收消息条数最多的Top10用户'
as select dayinfo, receiver_name as username,
count(*) as receiver_msg_cnt
from lj.table1
group by dayinfo, receiver_name
order by receiver_msg_cnt desc
limit 10;
-- 指标8:统计今日各地区发送消息总量
create table if not exists lj.tb_6
comment '今日各地区发送消息总量'
as select dayinfo, sender_gps,
cast(sender_lng as double) as longitude,
cast(sender_lat as double) as latitude,
count(*) as total_msg_cnt
from lj.table1
group by dayinfo, sender_gps, sender_lng, sender_lat;