1 需求
求各个区域下最受欢迎的产品的TOP
2 准备数据
MySQL
mysql> show tables;
+------------------+
| Tables_in_rzdata |
+------------------+
| city_info |
| product_info |
+------------------+
mysql> select * from city_info;
+---------+-----------+------+
| city_id | city_name | area |
+---------+-----------+------+
| 1 | BEIJING | NC |
| 2 | SHANGHAI | EC |
| 3 | NANJING | EC |
| 4 | GUANGZHOU | SC |
| 5 | SANYA | SC |
| 6 | WUHAN | CC |
| 7 | CHANGSHA | CC |
| 8 | XIAN | NW |
| 9 | CHENGDU | SW |
| 10 | HAERBIN | NE |
+---------+-----------+------+
mysql> select * from product_info limit 5;
+------------+--------------+----------------------+
| product_id | product_name | extend_info |
+------------+--------------+----------------------+
| 1 | product1 | {"product_status":1} |
| 2 | product2 | {"product_status":1} |
| 3 | product3 | {"product_status":1} |
| 4 | product4 | {"product_status":1} |
| 5 | product5 | {"product_status":1} |
+------------+--------------+----------------------+
用户行为日志:Hive
#用户id,session id,访问时间,城市id,产品id
95,2bf501a7637549c89cf55342331b15db,2016-05-05 21:01:56,1,72
95,2bf501a7637549c89cf55342331b15db,2016-05-05 21:52:26,1,68
...
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:49:43,9,37
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:55:46,9,96
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:24:13,9,67
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:06:52,9,29
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:02:52,9,92
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:19:27,9,12
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:10:07,9,18
-- Hive table holding the raw user click log, partitioned by day.
-- Log line format: user_id,session_id,action_time,city_id,product_id
create table user_click(
user_id int,
session_id string,
action_time string,
city_id int,
product_id int
) partitioned by (date string)
row format delimited fields terminated by ',';
-- Load one day of logs; OVERWRITE keeps a re-run of the same day idempotent.
load data local inpath '/home/hadoop/data/topn/init/user_click.txt' overwrite into table user_click partition(date='2016-05-05');
-- Sanity check: row count of the freshly loaded partition.
-- FIX: original read "where='2016-05-05'" — the partition column name was missing.
select count(1) from user_click where `date`='2016-05-05';
3分析
需求:
区域下受欢迎的产品的TOPN,但是hive表里现在并没有区域信息,我们的区域信息是存放在MySQL里面的,如何才能使用hive来统计呢?
1)city表和product_info表 放到Hive里面
2)通过user_click关联Hive里面的city_info和product_info
3)再使用窗口函数求分组内的TOPN
有办法完成Hive表 JOIN MySQL表吗? 需要借助工具sqoop.
4 sqoop简介
Sqoop: RDBMS(关系型数据库) 与Hadoop(HDFS/Hive/HBase) 导入和导出的一个工具。
以Hadoop为基准点进行理解,导入:外面的东西 导到==> HDFS;导出: 导出去<== HDFS
下载和解压
wget http://archive.cloudera.com/cdh5/cdh/5/sqoop-1.4.6-cdh5.7.0.tar.gz
tar -zxvf ./sqoop-1.4.6-cdh5.7.0.tar.gz
权限
sudo chown -R hadoop:hadoop ./sqoop-1.4.6-cdh5.7.0
sudo chmod -R 777 ./sqoop-1.4.6-cdh5.7.0
sudo mv sqoop-1.4.6-cdh5.7.0 /opt
sudo ln -s /opt/sqoop-1.4.6-cdh5.7.0 /sqoop
#配置环境变量
vi /etc/profile
export SQOOP_HOME=/sqoop
export PATH=$PATH:$SQOOP_HOME/bin
source /etc/profile
添加mysql驱动
sudo cp /hive/lib/mysql-connector-java-5.1.37.jar /sqoop/lib
sqoop有个bug,少几个jar包
sudo cp /java-json.jar /sqoop/lib
sudo cp /hive/lib/hive-common-1.1.0-cdh5.7.0.jar /sqoop/lib/
sudo cp /hive/lib/hive-shims-*.jar /sqoop/lib/
配置sqoop
cp /sqoop/conf/sqoop-env-template.sh /sqoop/conf/sqoop-env.sh
vi /sqoop/conf/sqoop-env.sh
export HADOOP_COMMON_HOME=/hadoop
export HADOOP_MAPRED_HOME=/hadoop
export HIVE_HOME=/hive
命令帮助
./sqoop --help
./sqoop list-databases --help
./sqoop help import
分隔符:
--input-fields-terminated-by
--fields-terminated-by
条件: --where "TBL_ID>20"
SQL: --query/-e: query和table不能同时使用
#导入city_info,路径/user/hadoop/city_info
#delete-target-dir 先删除再导入
#-m 1 使用一个map
#columns导入指定column
#target-dir 目标路径
# Import the MySQL dimension table city_info into Hive (default.city_info).
# --delete-target-dir clears any stale staging directory from a failed run,
# --hive-overwrite keeps daily re-runs idempotent, and one mapper (-m 1)
# is plenty for such a small table.
sqoop import \
--connect jdbc:mysql://hadoop001:3306/test \
--username root \
--password root@1234% \
--table city_info \
--delete-target-dir \
--hive-import \
--hive-overwrite \
--hive-database default \
--hive-table city_info \
-m 1 \
--fields-terminated-by '\t';
#导入product_info
# Import the MySQL dimension table product_info into Hive (default.product_info),
# mirroring the city_info import above: staging dir wiped first, Hive table
# overwritten on each run, single mapper for a small table.
sqoop import \
--connect jdbc:mysql://hadoop001:3306/test \
--username root \
--password root@1234% \
--table product_info \
--delete-target-dir \
--hive-import \
--hive-overwrite \
--hive-database default \
--hive-table product_info \
-m 1 \
--fields-terminated-by '\t';
#先把user_click和city_info,product_info进行关联
-- Enrich each click with its region (from city_info) and product name
-- (from product_info). LEFT JOIN keeps clicks whose city/product has no
-- dimension row rather than silently dropping them.
-- FIX: drop the temp table first so a daily re-run does not fail with
-- "table already exists".
drop table if exists area_product_tmp;
create table area_product_tmp as
select c.area, p.product_name
from user_click uc
left join city_info c on c.city_id = uc.city_id
left join product_info p on p.product_id = uc.product_id;
#统计每个区域下的每个产品的点击数
-- Click count per (area, product) pair.
-- FIX: drop the temp table first so a daily re-run does not fail with
-- "table already exists".
drop table if exists area_product_group_tmp;
create table area_product_group_tmp as
select area, product_name, count(*) as total_click
from area_product_tmp
group by area, product_name;
#窗口函数进行top n
-- Top 3 products per area via ROW_NUMBER (unique rank, ties broken arbitrarily).
-- FIX: materialize the result into a Hive table — the sqoop export below reads
-- /user/hive/warehouse/area_product_top3, which otherwise never exists — and
-- select exactly the 4 columns the export maps (the original bare SELECT also
-- leaked the rank column r as a 5th field, which would break export parsing).
-- INSERT OVERWRITE keeps the daily job idempotent.
create table if not exists area_product_top3 (
`date` string,
area string,
product_name string,
total_click bigint
) row format delimited fields terminated by '\t';
insert overwrite table area_product_top3
select '2016-05-05' as `date`, tmp.area, tmp.product_name, tmp.total_click
from (
select area, product_name, total_click,
row_number() over (partition by area order by total_click desc) r
from area_product_group_tmp
) tmp
where tmp.r <= 3;
#mysql创建接收hive导出数据的表
-- MySQL table that receives the Hive export.
-- FIX: IF NOT EXISTS makes the daily job re-runnable.
-- NOTE(review): for true idempotence the job should also delete the rows for
-- the date being reloaded before each export — confirm against the scheduler.
create table if not exists area_product_top3 (
date varchar(50),
area varchar(50),
product_name varchar(50),
total_click int
);
#sqoop导出
# Export the Hive result files back into MySQL table area_product_top3.
# FIX: for "sqoop export" the delimiter of the files being READ must be
# given with --input-fields-terminated-by; --fields-terminated-by only
# describes generated output, so the '\t'-delimited files could be
# mis-parsed without this change.
sqoop export \
--connect jdbc:mysql://hadoop001:3306/test \
--username root \
--password root@1234% \
--table area_product_top3 \
--columns "date,area,product_name,total_click" \
--export-dir /user/hive/warehouse/area_product_top3 \
--input-fields-terminated-by '\t' \
-m 2;
#mysql查看导出情况
select * from area_product_top3;
# Standalone example: pull the Hive metastore table TBLS from MySQL into HDFS
# with two mappers, giving the MapReduce job a recognizable name and clearing
# any leftover target directory first.
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata_d7 \
--username root \
--password root \
--table TBLS \
--delete-target-dir \
--mapreduce-job-name MySQL2HDFS_EMP \
-m 2
使用shell脚本封装这个业务线的所有代码的思路
- 我们离线是一天一次,是今天凌晨去运行昨天的数据
通过linux命令获取当前天,格式是2016-05-05;再减1天即得到昨天的日期(注意要用%Y取4位年份)
当前天:date +%Y-%m-%d
昨天:date --date='1 day ago' +%Y-%m-%d
vi ./t.sh
#!/bin/sh
# Daily driver script: runs shortly after midnight and processes
# yesterday's partition.
# FIXES vs. original: typographic quotes replaced with $() command
# substitution (the shell treated them as literal text); "echo$var" had no
# space so the shell tried to run a command named echo2016-...; %y gives a
# 2-digit year while the partitions use 2016-05-05, so %Y is required;
# "FROCESS_DATE" typo corrected.
current_date=$(date +%Y-%m-%d)
echo "current date: ${current_date}"
# Partition to process: yesterday.
process_date=$(date --date='1 day ago' +%Y-%m-%d)
echo "process date: ${process_date}"
echo "hive step1 start..."
hive -e "use d7_hive;select * from city_info"
echo "hive step1 stop..."
chmod +x t.sh
- city_info product_info通过其他的脚本导入
- 所有的sql封装到shell hive -e sql语句
注意点:
a) 每次创建的临时表,在执行之前一定要先删除,要使用 drop table if exists
b) 关键的执行要有日志输出
c) 统计结果输出,如何解决幂等性问题
所有的业务逻辑都封装在shell里面了,但是如何触发呢?
oozie、azkaban等调度工具,那么使用crontab触发,每天凌晨2点开始执行