Sqoop框架
介绍
1、Sqoop的导入导出
导入(import): 数据从RDBMS(如MySQL)流向Hadoop平台(HDFS/Hive/HBase)
导出(export): 数据从Hadoop平台流向RDBMS
2、Sqoop的重点是写导入导出的命令
3、Sqoop的底层是没有Reduce的MR
Sqoop只是机械性地将数据源的数据一条条进行搬移,不需要对数据做额外的聚合,所以不需要Reduce
测试
sqoop list-databases \
应用1-导入
数据导入到HDFS-全量
# Full import of the MySQL table emp into HDFS.
# With more than one mapper Sqoop computes per-mapper split ranges on the
# --split-by column by first running:
#   SELECT MIN(`id`), MAX(`id`) FROM `emp`
# If the split column were a text column, Sqoop refuses to split unless the
# following generic property is added IMMEDIATELY after "import" (generic
# -D options must precede all tool-specific options):
#   -Dorg.apache.sqoop.splitter.allow_text_splitter=true
sqoop import \
--connect jdbc:mysql://192.168.88.80:3306/userdb \
--table emp \
--username root \
--password 123456 \
--target-dir /sqoop/result3 \
--delete-target-dir \
--fields-terminated-by '\t' \
--split-by id \
--num-mappers 2
数据导入到HDFS-增量
# Incremental-style import into HDFS: a free-form --query selects only the
# delta (rows with id > 1203).  The literal token $CONDITIONS is mandatory
# inside --query — Sqoop substitutes each mapper's split predicate for it.
# Single quotes keep the shell from expanding it.  With a free-form query
# and -m 1, no --split-by is needed.
sqoop import \
--connect jdbc:mysql://192.168.88.80:3306/userdb \
--username root \
--password 123456 \
--target-dir /sqoop/result5 \
--query 'select id,name,deg from emp WHERE id>1203 and $CONDITIONS' \
--delete-target-dir \
--fields-terminated-by '\t' \
--m 1
(重点)数据导入-全量-到Hive-全量
-- Target Hive table for the full import (ORC storage, written via HCatalog).
-- NOTE: a ROW FORMAT DELIMITED ... clause is ignored by the ORC serde
-- (ORC defines its own binary layout), so it is intentionally omitted here;
-- the '\t' delimiter on the sqoop side only shapes the transfer records.
create table if not exists test.emp_hive
(
    id     int,
    name   string,
    deg    string,
    salary int,
    dept   string
)
stored as orc;
# Full import of MySQL emp directly into the Hive table test.emp_hive via
# HCatalog (HCatalog writes in the table's declared storage format — ORC —
# so no intermediate text files or LOAD DATA step are needed).
# (The original command body was truncated to just "sqoop import \" + "-m 1";
# reconstructed from the surrounding notes — verify table/credentials.)
sqoop import \
--connect jdbc:mysql://192.168.88.80:3306/userdb \
--username root \
--password 123456 \
--table emp \
--fields-terminated-by '\t' \
--hcatalog-database test \
--hcatalog-table emp_hive \
-m 1
(重点)数据导入-全量-到Hive-增量
#!/bin/bash
# Daily incremental load: pull yesterday's changed rows from MySQL
# (userdb.customertest, filtered on last_mod) into the Hive table
# test.customertest via HCatalog.

yes_day=$(date -d "yesterday" +%Y-%m-%d)

# Prepare the Hive side (presumably creates the target table/partition —
# confirm against a.sql).  hive runs in the foreground, so no `wait` is
# needed afterwards (the original `wait` was a no-op: it only waits for
# backgrounded jobs).
/usr/bin/hive -f a.sql

# NOTE(review): BETWEEN '... 23:59:59' misses sub-second timestamps; a
# half-open range (last_mod >= day AND last_mod < next_day) would be safer
# if last_mod carries fractional seconds.
# $CONDITIONS must reach Sqoop literally, hence the backslash escape inside
# the double-quoted --query (the original file had the token broken across
# two lines as "$CONDITION" + "S", which made the command fail).
/usr/bin/sqoop import \
--connect jdbc:mysql://192.168.88.80:3306/userdb \
--username root \
--password 123456 \
--query "select * from userdb.customertest where last_mod between '${yes_day} 00:00:00' and '${yes_day} 23:59:59' and \$CONDITIONS" \
--fields-terminated-by '\t' \
--hcatalog-database test \
--hcatalog-table customertest \
-m 1