Hive (4): Comprehensive Hands-On Exercise

1 Requirement

Find the Top N most popular products in each area.

2 Preparing the Data

MySQL

mysql> show tables;
+------------------+
| Tables_in_rzdata |
+------------------+
| city_info        |
| product_info     |
+------------------+


mysql> select * from city_info;
+---------+-----------+------+
| city_id | city_name | area |
+---------+-----------+------+
|       1 | BEIJING   | NC   |
|       2 | SHANGHAI  | EC   |
|       3 | NANJING   | EC   |
|       4 | GUANGZHOU | SC   |
|       5 | SANYA     | SC   |
|       6 | WUHAN     | CC   |
|       7 | CHANGSHA  | CC   |
|       8 | XIAN      | NW   |
|       9 | CHENGDU   | SW   |
|      10 | HAERBIN   | NE   |
+---------+-----------+------+


mysql> select * from product_info limit 5;
+------------+--------------+----------------------+
| product_id | product_name | extend_info          |
+------------+--------------+----------------------+
|          1 | product1     | {"product_status":1} |
|          2 | product2     | {"product_status":1} |
|          3 | product3     | {"product_status":1} |
|          4 | product4     | {"product_status":1} |
|          5 | product5     | {"product_status":1} |
+------------+--------------+----------------------+

User behavior log (to be loaded into Hive):

# user_id, session_id, action_time, city_id, product_id
95,2bf501a7637549c89cf55342331b15db,2016-05-05 21:01:56,1,72
95,2bf501a7637549c89cf55342331b15db,2016-05-05 21:52:26,1,68
...
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:49:43,9,37
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:55:46,9,96
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:24:13,9,67
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:06:52,9,29
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:02:52,9,92
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:19:27,9,12
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:10:07,9,18



create table user_click(
user_id int,
session_id string,
action_time string,
city_id int,
product_id int
) partitioned by (date string)
row format delimited fields terminated by ',';

load data local inpath '/home/hadoop/data/topn/init/user_click.txt' overwrite into table user_click partition(date='2016-05-05');
	
select count(1) from user_click where date='2016-05-05';
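
To double-check that the partition actually landed, standard Hive commands suffice:

show partitions user_click;
select * from user_click where date='2016-05-05' limit 3;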

3 Analysis

Requirement:
Find the Top N popular products in each area. The Hive table has no area information; that lives in MySQL. How can we compute this with Hive?

1) Bring the city_info and product_info tables into Hive
2) Join user_click with city_info and product_info inside Hive
3) Use a window function to compute the Top N within each group

Can a Hive table be JOINed directly against a MySQL table? No; we need the tool Sqoop to move the MySQL tables into Hive first.

4 Sqoop Overview

Sqoop: a tool for importing and exporting data between an RDBMS (relational database) and Hadoop (HDFS/Hive/HBase).
Take Hadoop as the reference point: import means bringing external data ==> into HDFS; export means sending data out <== of HDFS.

Download and extract

wget http://archive.cloudera.com/cdh5/cdh/5/sqoop-1.4.6-cdh5.7.0.tar.gz
tar -zxvf ./sqoop-1.4.6-cdh5.7.0.tar.gz
# set ownership and permissions
sudo chown -R hadoop:hadoop ./sqoop-1.4.6-cdh5.7.0
sudo chmod -R 777 ./sqoop-1.4.6-cdh5.7.0
sudo mv sqoop-1.4.6-cdh5.7.0 /opt
sudo ln -s /opt/sqoop-1.4.6-cdh5.7.0 /sqoop
# configure environment variables
vi /etc/profile
	export SQOOP_HOME=/sqoop
	export PATH=$PATH:$SQOOP_HOME/bin
source /etc/profile
# add the MySQL JDBC driver to Sqoop's lib
sudo cp /hive/lib/mysql-connector-java-5.1.37.jar /sqoop/lib
# Sqoop has a packaging bug and ships without a few JARs it needs; copy them in
sudo cp /java-json.jar /sqoop/lib
sudo cp /hive/lib/hive-common-1.1.0-cdh5.7.0.jar /sqoop/lib/
sudo cp /hive/lib/hive-shims-*.jar /sqoop/lib/
# configure Sqoop
cp /sqoop/conf/sqoop-env-template.sh /sqoop/conf/sqoop-env.sh
vi /sqoop/conf/sqoop-env.sh
	export HADOOP_COMMON_HOME=/hadoop
	export HADOOP_MAPRED_HOME=/hadoop
	export HIVE_HOME=/hive
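
A quick sanity check after the install (using the same connection string and credentials as the imports below):

sqoop version
sqoop list-databases \
--connect jdbc:mysql://hadoop001:3306 \
--username root \
--password root@1234%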

Command help

./sqoop --help
./sqoop list-databases --help
./sqoop help import

Delimiters:
--input-fields-terminated-by
--fields-terminated-by
Filter condition: --where "TBL_ID>20"
Free-form SQL: --query/-e (note: --query and --table cannot be used together)
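
A hypothetical import combining these options (the filter and target path are placeholders). With --query the statement must contain the literal token $CONDITIONS, --target-dir becomes mandatory, and you need --split-by or -m 1:

sqoop import \
--connect jdbc:mysql://hadoop001:3306/test \
--username root \
--password root@1234% \
--query 'select city_id, city_name, area from city_info where city_id > 5 and $CONDITIONS' \
--target-dir /user/hadoop/city_info_query \
--delete-target-dir \
-m 1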

# Import city_info; staging path: /user/hadoop/city_info
# --delete-target-dir: delete the target dir first if it exists, then import
# -m 1: use a single map task
# --columns: import only the specified columns
# --target-dir: target path

sqoop import \
--connect jdbc:mysql://hadoop001:3306/test \
--password root@1234% \
--username root \
--table city_info \
--delete-target-dir \
-m 1 \
--hive-database default \
--hive-table city_info \
--hive-import \
--hive-overwrite \
--fields-terminated-by '\t';

# Import product_info

sqoop import \
--connect jdbc:mysql://hadoop001:3306/test \
--password root@1234% \
--username root \
--table product_info \
--delete-target-dir \
-m 1 \
--hive-database default \
--hive-table product_info \
--hive-import \
--hive-overwrite \
--fields-terminated-by '\t';
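
A quick check that both dimension tables arrived in Hive:

hive -e "select * from city_info limit 3; select * from product_info limit 3;"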

# First join user_click with city_info and product_info

create table area_product_tmp as
select c.area,p.product_name  from 
user_click uc
left join city_info c on c.city_id=uc.city_id
left join product_info p on p.product_id=uc.product_id;

# Count the clicks per product within each area
create table area_product_group_tmp as
select area,product_name ,count(*) total_click
from area_product_tmp
group by area,product_name;

# Use a window function to take the Top 3 per area, and store the result in a
# Hive table so Sqoop can export it (tab-delimited to match the export below)

create table area_product_top3
row format delimited fields terminated by '\t' as
select '2016-05-05' as date,tmp.area,tmp.product_name,tmp.total_click from(
select area,product_name,total_click,row_number() over(partition by area order by total_click desc) r
from area_product_group_tmp
) tmp
where tmp.r<=3;
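
Note that row_number() breaks ties arbitrarily; if products with the same click count should share a rank, dense_rank() is a drop-in replacement:

select area,product_name,total_click,dense_rank() over(partition by area order by total_click desc) r
from area_product_group_tmp;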

# In MySQL, create the table that will receive the data exported from Hive

create table area_product_top3 (
date varchar(50),
area varchar(50),
product_name varchar(50),
total_click int
); 

# Export with Sqoop

sqoop export \
--connect jdbc:mysql://hadoop001:3306/test \
--password root@1234% \
--username root \
--columns "date,area,product_name,total_click" \
--input-fields-terminated-by '\t' \
--export-dir /user/hive/warehouse/area_product_top3 \
--table area_product_top3 \
-m 2;

# Check the export result in MySQL

select * from area_product_top3;


# A standalone example: a plain MySQL-to-HDFS import (no --hive-import) with a custom MapReduce job name
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata_d7 \
--password root \
--username root \
--table TBLS \
--mapreduce-job-name MySQL2HDFS_EMP \
--delete-target-dir \
-m 2
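
The result can be inspected directly on HDFS; Sqoop's default target is /user/<user>/<table>, so assuming the hadoop user:

hdfs dfs -ls /user/hadoop/TBLS
hdfs dfs -cat /user/hadoop/TBLS/part-m-00000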

Wrapping the whole pipeline in a shell script

  1. The offline job runs once a day: early each morning it processes the previous day's data.
    Get the current date with a Linux command in the 2016-05-05 format, then go back one day for yesterday's date (note %Y, not %y, for a four-digit year):
    Today:     date +%Y-%m-%d
    Yesterday: date --date='1 day ago' +%Y-%m-%d

vi ./t.sh

#!/bin/sh
# current run date, e.g. 2016-05-06
current_date=$(date +%Y-%m-%d)
echo $current_date

# the partition we actually process: yesterday
PROCESS_DATE=$(date --date='1 day ago' +%Y-%m-%d)
echo $PROCESS_DATE

echo "hive step1 start..."
hive -e "use d7_hive; select * from city_info"
echo "hive step1 stop..."

chmod +x t.sh

  1. Import city_info and product_info via separate scripts
  2. Wrap all the SQL in the shell script as hive -e "sql statements"
    Notes:
    a) Drop each temporary table before every run (drop table if exists), or guard creation with create table if not exists
    b) Log every key execution step
    c) Make the final result output idempotent, so re-running the same day does not duplicate rows (see the sketch after this list)
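
One common way to make step c idempotent (a sketch; assumes the date column identifies a run): delete that day's rows in MySQL before re-exporting.

# inside t.sh, before the sqoop export step
mysql -uroot -proot@1234% -e "delete from test.area_product_top3 where date='${PROCESS_DATE}'"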

All the business logic is now wrapped in the shell script, but how do we trigger it?
Scheduling tools such as oozie and azkaban are built for this; here we simply use crontab, kicking the job off at 2 a.m. every day, as shown below.
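
A minimal crontab entry (edit with crontab -e; the script path and log location are assumptions):

0 2 * * * /home/hadoop/t.sh >> /home/hadoop/logs/t.log 2>&1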
