Hive - Basics 4: A Comprehensive Exercise
1. Requirement
Requirement: the Top N most popular products in each region
2. Prepare the Data
-- MySQL database
-- City table
CREATE TABLE `city_info` (
`city_id` int(11) DEFAULT NULL,
`city_name` varchar(255) DEFAULT NULL,
`area` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO `city_info` (`city_id`, `city_name`, `area`)
VALUES (1, 'BEIJING', 'NC'),
(2, 'SHANGHAI', 'EC'),
(3, 'NANJING', 'EC'),
(4, 'GUANGZHOU', 'SC'),
(5, 'SANYA', 'SC'),
(6, 'WUHAN', 'CC'),
(7, 'CHANGSHA', 'CC'),
(8, 'XIAN', 'NW'),
(9, 'CHENGDU', 'SW'),
(10, 'HAERBIN', 'NE');
-- Product table
CREATE TABLE product_info (
product_id INT PRIMARY KEY AUTO_INCREMENT,
product_name VARCHAR(50),
extend_info VARCHAR(50)
);
INSERT INTO product_info (product_id, product_name, extend_info)
VALUES (1, 'product1', '{"product_status":1}'),
(2, 'product2', '{"product_status":1}'),
(3, 'product3', '{"product_status":1}'),
(4, 'product4', '{"product_status":1}'),
(5, 'product5', '{"product_status":1}'),
(6, 'product6', '{"product_status":1}'),
(7, 'product7', '{"product_status":1}'),
(8, 'product8', '{"product_status":1}'),
(9, 'product9', '{"product_status":0}'),
(10, 'product10', '{"product_status":1}'),
(11, 'product11', '{"product_status":0}'),
(12, 'product12', '{"product_status":0}'),
...
(94, 'product94', '{"product_status":1}'),
(95, 'product95', '{"product_status":0}'),
(96, 'product96', '{"product_status":0}'),
(97, 'product97', '{"product_status":1}'),
(98, 'product98', '{"product_status":1}'),
(99, 'product99', '{"product_status":0}'),
(100, 'product100', '{"product_status":1}');
# user_click.txt
# User behavior log, to be stored in Hive
# Fields: user id, session id, visit time, city id, product id
95,2bf501a7637549c89cf55342331b15db,2016-05-05 21:01:56,1,72
95,2bf501a7637549c89cf55342331b15db,2016-05-05 21:52:26,1,68
...
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:49:43,9,37
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:55:46,9,96
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:24:13,9,67
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:06:52,9,29
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:02:52,9,92
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:19:27,9,12
53,561e94c2fa99467d956a26d55aab3cb6,2016-05-05 3:10:07,9,18
CREATE TABLE user_click (
  user_id int,
  session_id string,
  visit_time string,
  city_id int,
  product_id int
)
PARTITIONED BY (`date` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/hive/user_click.txt' INTO TABLE user_click PARTITION(`date`='2016-05-05');
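After loading, it is worth a quick sanity check that the partition exists and the rows parse correctly (run in the same Hive session):
show partitions user_click;
select * from user_click limit 5;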
3. Analysis
The requirement is the Top N most popular products in each region, but the region-related tables are stored in MySQL. How can we do the aggregation with Hive?
- Import the city_info and product_info tables into Hive
- Join the tables together
- Use a window function to take the Top N
How do we get the MySQL data into Hive? This is where Sqoop comes in.
4. Sqoop Overview
Apache Sqoop™ is a tool designed for efficiently transferring bulk data between Apache Hadoop and structured datastores such as relational databases.
Sqoop efficiently connects Hadoop (in the broad sense, not just HDFS) with structured data stores, handling both import and export; "import" and "export" here are relative to Hadoop.
Sqoop currently has two release lines:
1.x: versions starting with 1.4
2.x: versions starting with 1.99, generally not used in production
Download and Configuration
# Download and extract
wget http://archive.cloudera.com/cdh5/cdh/5/sqoop-1.4.6-cdh5.7.0.tar.gz
tar -zxvf ./sqoop-1.4.6-cdh5.7.0.tar.gz
# Permissions
sudo chown -R hadoop:hadoop ./sqoop-1.4.6-cdh5.7.0
sudo chmod -R 777 ./sqoop-1.4.6-cdh5.7.0
sudo mv sqoop-1.4.6-cdh5.7.0 /opt
sudo ln -s /opt/sqoop-1.4.6-cdh5.7.0 /sqoop
# Configure environment variables
vi /etc/profile
export SQOOP_HOME=/sqoop
export PATH=$PATH:$SQOOP_HOME/bin
source /etc/profile
# Add the MySQL driver
sudo cp /hive/lib/mysql-connector-java-5.1.37.jar /sqoop/lib
# Sqoop is missing a few jars; copy them in
sudo cp /java-json.jar /sqoop/lib
sudo cp /hive/lib/hive-common-1.1.0-cdh5.7.0.jar /sqoop/lib/
sudo cp /hive/lib/hive-shims-*.jar /sqoop/lib/
# Configure Sqoop
cp /sqoop/conf/sqoop-env-template.sh /sqoop/conf/sqoop-env.sh
vi /sqoop/conf/sqoop-env.sh
export HADOOP_COMMON_HOME=/hadoop
export HADOOP_MAPRED_HOME=/hadoop
export HIVE_HOME=/hive
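To confirm the installation and environment variables took effect, a quick check:
sqoop version
sqoop help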
Commands
sqoop help
usage: sqoop COMMAND [ARGS]
Available commands:
codegen Generate code to interact with database records
create-hive-table Import a table definition into Hive
eval Evaluate a SQL statement and display the results
export Export an HDFS directory to a database table
help List available commands
import Import a table from a database to HDFS (import a single table into HDFS)
import-all-tables Import tables from a database to HDFS (import all tables into HDFS)
import-mainframe Import datasets from a mainframe server to HDFS
job Work with saved jobs
list-databases List available databases on a server (list the databases)
list-tables List available tables in a database (list the tables in a database)
merge Merge results of incremental imports
metastore Run a standalone Sqoop metastore
version Display version information
# List databases
sqoop list-databases --help
usage: sqoop list-databases [GENERIC-ARGS] [TOOL-ARGS]
Common arguments:
   --connect <jdbc-uri>                         Specify JDBC connect string
   --connection-manager <class-name>            Specify connection manager class name
   --connection-param-file <properties-file>    Specify connection parameters file
   --driver <class-name>                        Manually specify JDBC driver class to use
   --hadoop-home <hdir>                         Override $HADOOP_MAPRED_HOME_ARG
   --hadoop-mapred-home <dir>                   Override $HADOOP_MAPRED_HOME_ARG
   --help                                       Print usage instructions
   -P                                           Read password from console
   --password <password>                        Set authentication password
   --password-alias <password-alias>            Credential provider password alias
   --password-file <password-file>              Set authentication password file path
   --relaxed-isolation                          Use read-uncommitted isolation for imports
   --skip-dist-cache                            Skip copying jars to distributed cache
   --username <username>                        Set authentication username
   --verbose                                    Print more information while working
Generic Hadoop command-line arguments:
(must preceed any tool-specific arguments)
Generic options supported are
-conf <configuration file> specify an application configuration file
-D <property=value> use value for given property
-fs <local|namenode:port> specify a namenode
-jt <local|resourcemanager:port> specify a ResourceManager
-files <comma separated list of files> specify comma separated files to be copied to the map reduce cluster
-libjars <comma separated list of jars> specify comma separated jar files to include in the classpath.
-archives <comma separated list of archives> specify comma separated archives to be unarchived on the compute machines.
sqoop help import
usage: sqoop import [GENERIC-ARGS] [TOOL-ARGS]
Common arguments:
   --connect <jdbc-uri>                         Specify JDBC connect string
   --connection-manager <class-name>            Specify connection manager class name
   --connection-param-file <properties-file>    Specify connection parameters file
   ...
Import control arguments:
   --append                        Imports data in append mode
   --as-avrodatafile               Imports data to Avro data files
   --as-parquetfile                Imports data to Parquet files
   --as-sequencefile               Imports data to SequenceFiles
   --as-textfile                   Imports data as plain text (default)
   --autoreset-to-one-mapper       Reset the number of mappers to one mapper if no split key available
   --boundary-query <statement>    Set boundary query for retrieving max and min value of the primary key
   --columns <col,col,col...>      Columns to import from table
   --compression-codec <codec>     Compression codec to use for import
   --delete-target-dir             Imports data in delete mode
   --direct                        Use direct import fast path
   --direct-split-size <n>         Split the input stream every 'n' bytes when importing in direct mode
   -e,--query <statement>          Import results of SQL 'statement'
   --fetch-size <n>                Set number 'n' of rows to fetch from the database when more rows are needed
   --inline-lob-limit <n>          Set the maximum size for an inline LOB
   -m,--num-mappers <n>            Use 'n' map tasks to import in parallel
   --mapreduce-job-name <name>     Set name for generated mapreduce job
   --merge-key <column>            Key column to use to join results
   --split-by <column-name>        Column of the table used to split work units
   --split-limit <size>            Upper limit of rows per split for split columns of Date/Time/Timestamp and integer types; for date or timestamp fields it is calculated in seconds; split-limit should be greater than 0
   --table <table-name>            Table to read
   --target-dir <dir>              HDFS plain table destination
   ...
   --warehouse-dir <dir>           HDFS parent for table destination
   --where <where clause>          WHERE clause to use during import
   ...
Using Sqoop
# List databases
sqoop list-databases \
--connect jdbc:mysql://hadoop001:3306 \
--password root@1234% \
--username root;
# List tables
sqoop list-tables \
--connect jdbc:mysql://hadoop001:3306/test \
--password root@1234% \
--username root;
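The eval command listed in the help output above can also run a quick statement against MySQL to confirm connectivity and row counts before importing; a minimal sketch reusing the same connection settings:
sqoop eval \
--connect jdbc:mysql://hadoop001:3306/test \
--password root@1234% \
--username root \
--query "select count(*) from city_info";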
# Import city_info into Hive (the data is first staged on HDFS under /user/hadoop/city_info)
# --delete-target-dir: delete the target directory first, then import
# -m 1: use a single map task
# --columns: import only the specified columns
# --target-dir: target directory on HDFS
sqoop import \
--connect jdbc:mysql://hadoop001:3306/test \
--password root@1234% \
--username root \
--table city_info \
--delete-target-dir \
-m 1 \
--hive-database default \
--hive-table city_info \
--hive-import \
--hive-overwrite \
--fields-terminated-by '\t';
# Import product_info
sqoop import \
--connect jdbc:mysql://hadoop001:3306/test \
--password root@1234% \
--username root \
--table product_info \
--delete-target-dir \
-m 1 \
--hive-database default \
--hive-table product_info \
--hive-import \
--hive-overwrite \
--fields-terminated-by '\t';
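Once both imports finish, the tables should be visible in Hive's default database; a quick check before joining:
show tables;
select * from city_info limit 5;
select * from product_info limit 5;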
-- First, join user_click with city_info and product_info
create table area_product_tmp as
select c.area,p.product_name from
user_click uc
left join city_info c on c.city_id=uc.city_id
left join product_info p on p.product_id=uc.product_id;
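Because these are left joins, any click whose city_id or product_id has no match will carry a NULL area or product_name; a quick sanity check on the joined table:
select count(*) from area_product_tmp where area is null or product_name is null;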
-- Count the clicks of each product within each area
create table area_product_group_tmp as
select area,product_name ,count(*) total_click
from area_product_tmp
group by area,product_name;
-- Use a window function to take the Top 3 per area, and store the result in a Hive table for export
-- (stored as '\t'-delimited text so Sqoop can parse it, with columns matching the MySQL target table)
create table area_product_top3
row format delimited fields terminated by '\t' as
select '2016-05-05' `date`, tmp.area, tmp.product_name, tmp.total_click
from (
  select area, product_name, total_click,
         row_number() over(partition by area order by total_click desc) r
  from area_product_group_tmp
) tmp
where tmp.r <= 3;
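Before exporting, confirm the result and that the table's HDFS location matches the --export-dir used below (this assumes the default database and the default warehouse directory /user/hive/warehouse):
select * from area_product_top3;
desc formatted area_product_top3;   -- the Location field should be /user/hive/warehouse/area_product_top3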
-- In MySQL, create the table that will receive the data exported from Hive
create table area_product_top3 (
date varchar(50),
area varchar(50),
product_name varchar(50),
total_click int
);
# Sqoop export from Hive (HDFS) back to MySQL
sqoop export \
--connect jdbc:mysql://hadoop001:3306/test \
--password root@1234% \
--username root \
--columns "date,area,product_name,total_click" \
--fields-terminated-by '\t' \
--export-dir /user/hive/warehouse/area_product_top3 \
--table area_product_top3 \
-m 1;
-- Check the exported result in MySQL
select * from area_product_top3;