相关项目文档:
链接:https://pan.baidu.com/s/1Wk9EfxkDTHdzshGBi6AGJQ
提取码:1124
安装部署文档的书写标准:
1.安装前的资料准备(软件、硬件、其他):
开发环境 生产环境
2.安装中的步骤:
很明确很明细,路径全部都是绝对路径 可以写变量:SUBJECT_HOME 如:
傻瓜式操作,写脚本一键安装
3.安装过程中可能出现的异常处理(回滚)
4.完成安装之后的测试:
5.完成整个部署后的环境清理:
删除脚本、生成的测试文件、给实施人员看的文档不应该留在生产环境上
数据来源技术:
网页埋点/日志:logstash/flume等日志收集工具
数据库信息:sqoop等数据抽取工具
各种形式文件(csv、js):可以放置hdfs或直接load进hive表
将原始数据导入MySQL数据库:
3.新零售数仓项目\02-项目数据\01-原始数据\snbap_ods.sql
source /root/day02/snbap_ods.sql
通过sqoop将MySQL里的表导入到hive中:
bin/sqoop-import --connect jdbc:mysql://hadoop100:3306/snbap_ods --username root --password ok --table user --target-dir /data/snbap_ods/user --m 10
bin/sqoop-import --connect jdbc:mysql://hadoop100:3306/snbap_ods --username root --password ok --table user --hive-import --hive-table snbap_ods.user --m 3
bin/sqoop-import --connect jdbc:mysql://hadoop100:3306/snbap_ods --username root --password ok --table user_extend --hive-import --hive-table snbap_ods.user_extend --m 3
bin/sqoop-import --connect jdbc:mysql://hadoop100:3306/snbap_ods --username root --password ok --table user_pc_click_log --hive-import --hive-table snbap_ods.user_pc_click_log --m 3
bin/sqoop-import --connect jdbc:mysql://hadoop100:3306/snbap_ods --username root --password ok --table user_app_click_log --hive-import --hive-table snbap_ods.user_app_click_log --m 3
bin/sqoop-import --connect jdbc:mysql://hadoop100:3306/snbap_ods --username root --password ok --table us_order --hive-import --hive-table snbap_ods.us_order --m 3
bin/sqoop-import --connect jdbc:mysql://hadoop100:3306/snbap_ods --username root --password ok --table order_item --hive-import --hive-table snbap_ods.order_item --m 3
bin/sqoop-import --connect jdbc:mysql://hadoop100:3306/snbap_ods --username root --password ok --table order_delivery --hive-import --hive-table snbap_ods.order_delivery --m 3
补充:导入user_addr
./sqoop import \
--connect \
jdbc:mysql://hadoop100:3306/snbap_ods \
--table user_addr \
--username root \
--password ok \
--hive-import \
--create-hive-table \
--hive-database snbap_ods \
--hive-table user_addr \
--m 3
重点表字段罗列如下:
create database snbap_dw;
create table snbap_dw.user_basic as
select u.user_id,user_name,u.user_gender,user_birthday,user_age,
constellation,province,city,city_level,e_mail,op_mail,mobile,
num_seg_mobile,op_Mobile,register_time,login_ip,login_source,
request_user,total_score,used_score,is_blacklist,is_married,
education,monthly_income,profession,is_pregnant_woman,is_have_children,
is_have_car,phone_brand,phone_brand_level,phone_cnt,change_phone_cnt,
is_maja,majia_account_cnt,loyal_model,shopping_type_model,weight,height
from user u join user_extend e on u.user_id=e.user_id;
如果表位置错误可以采用如下方法:
create table snbap_dw.user_basic as select * from snbap_ods.user_basic;
drop table snbap_ods.user_basic;
最近一次的pc端用户访问情况
--最近一次的pc端用户访问情况
create table snbap_dw.user_last_pc_click as
select distinct * from
(select user_id,session_id,cookie_id,visit_os,browser_name,visit_time,
rank() over(partition by user_id order by visit_time desc) rn,
count(page_id) over(partition by user_id order by visit_time) cnt
from snbap_ods.user_pc_click_log) a where rn=1;
第一次的pc端用户访问情况
--第一次的pc端用户访问情况
create table snbap_dw.user_first_pc_click as
select distinct * from
(select user_id,session_id,cookie_id,visit_os,browser_name,visit_time,
rank() over(partition by user_id order by visit_time) rn,
count(page_id) over(partition by user_id order by visit_time) cnt
from snbap_ods.user_pc_click_log) a where rn=1;
pc连续7天访问次数(跑任务的日期的前7天)
unix_timestamp() 得到当前时间戳
unix_timestamp(string date)
指定日期参数调用UNIX_TIMESTAMP(),它返回参数值’1970-01-01 00:00:00′到指定日期的秒数。
unix_timestamp(string date, string pattern)
指定时间输入格式,返回到1970年秒数:unix_timestamp(’2009-03-20′, ‘yyyy-MM-dd’) = 1237532400
--pc连续7天访问次数(跑任务的日期的前7天) unix_timestamp()
select count(page_id) from snbap_ods.user_pc_click_log
where visit_time between date_sub(from_unixtime(unix_timestamp('2020-02-11'),'yyyy-MM-dd'),7) and '2020-02-11'
group by user_id
或者:
select count(page_id) from snbap_ods.user_pc_click_log
where visit_time between date_sub(current_date(),7) and current_date()
group by user_id
用户访问次数季度统计表
--用户访问次数季度统计表
create table user_pc_visit_total as
with
r1 as (select count(visit_time) cnt1 from snbap_dw.user_pc_click_partition
where visit_time between date_sub(current_date(),7) and current_date()),
r2 as (select count(visit_time) cnt2 from snbap_dw.user_pc_click_partition
where visit_time between date_sub(current_date(),15) and current_date()),
r3 as (select count(visit_time) cnt3 from snbap_dw.user_pc_click_partition
where visit_time between date_sub(current_date(),30) and current_date()),
r4 as (select count(visit_time) cnt4 from snbap_dw.user_pc_click_partition
where visit_time between date_sub(current_date(),60) and current_date()),
r5 as (select count(visit_time) cnt5 from snbap_dw.user_pc_click_partition
where visit_time between date_sub(current_date(),90) and current_date())
select cnt1,cnt2,cnt3,cnt4,cnt5 from r1,r2,r3,r4,r5;
创建分区表user_pc_click_partition
create table snbap_dw.user_pc_click_partition(
`log_id` bigint,
`user_id` bigint,
`session_id` string,
`cookie_id` string,
`visit_time` string,
`visit_url` string,
`visit_os` string,
`browser_name` string,
`visit_ip` string,
`province` string,
`city` string,
`page_id` int,
`goods_id` bigint,
`shop_id` bigint
)
partitioned by (visit_date string);
alter table snbap_dw.user_pc_click_partition change page_id pv int;
开启动态分区,非严格模式,设置分区参数上限:
set hive.ecec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nostrict;
set hive.exec.max.dynamic.partitions.pernode=10000;
set hive.exec.max.dynamic.partitions=50000;
插入数据:
insert into snbap_dw.user_pc_click_partition partition (visit_date)
select
`log_id`,
`user_id`,
`session_id`,
`cookie_id`,
`visit_time`,
`visit_url`,
`visit_os`,
`browser_name`,
`visit_ip`,
`province`,
`city`,
count(`page_id`) over(partition by user_id,visit_time) pv,
`goods_id`,
`shop_id`,
date(visit_time)
from snbap_ods.user_pc_click_log
查看数据pv,但是这里的pv值时没有聚合的,pv=n代表有n条记录。
select user_id,pv from snbap_dw.user_pc_click_partition where pv=3;