一,sqoop同步数据的几种方案
1,全量同步
对于数据量小的表,采取全量同步的方式,不会对数据源造成压力,写入到目标时也不用太多处理。
2,首次全量,后增量
对于数据量大,历史数据不会更新的数据且每天有新增的数据,可以采用增量同步的方式。
3,首次全量,后增量+更新
对于数据量大,历史数据会更新的数据且每天有新增的数据,可以采用同步增量+更新的方式,这种方式需要数据有一个识别修改的字段,如modify_time。
二,sqoop同步的原理
sqoop底层是mapreduce,使用sqoop时需要指定数据源、取数sql和条件、map任务数等信息,sqoop将这些指令解析后创建mapreduce任务,执行从数据源拉取数据后写入目标数据源的任务。
三,下载安装测试
1,下载
2,解压
tar -zxvf
3,修改配置
解压后,操作conf目录下的配置文件
mv sqoop-env-template.sh sqoop-env.sh
vim sqoop-env.sh
export HADOOP_COMMON_HOME=/export/soft/hadoop-3.1.3
export HADOOP_MAPRED_HOME=/export/soft/hadoop-3.1.3
export HIVE_HOME=/export/soft/hadoop-3.1.3
export ZOOKEEPER_HOME=/export/zookeeper-3.4.6/
export ZOOCFGDIR=/export/zookeeper-3.4.6/conf
4,上传mysql驱动包到sqoop的lib目录下
cp /export/server/hive-2.1.0/lib/mysql-connector-java-5.1.41.jar ./lib/
5,测试
bin/sqoop list-databases --connect jdbc:mysql://node3:3306/ --username root --password 123456
6,数据同步测试
bin/sqoop import \
--connect jdbc:mysql://node3:3306/db \
--username root \
--password 123456 \
--table user_info \
--columns id,login_name \
--where "id>=10 and id<=30" \
--target-dir /test \
--delete-target-dir \
--fields-terminated-by '\t' \
--num-mappers 2 \
--split-by id
sqoop底层是生成mapreduce任务去同步数据,所以同步一张表时可以指定map数,默认是1,如果指定的map数大于1,则要指定数据划分的依据,如上例所示,--num-mappers 2
, 则 --split-by id
,表示按照id将数据一分为二,由两个map任务同步。
四,通用的全量同步脚本编写
#! /bin/bash
APP=db
sqoop=/export/sqoop/bin/sqoop
if [ -n "$2" ] ;then
do_date=$2
else
echo "请传入日期参数"
exit
fi
# 下面抽取是所有全量同步的表的脚本的公共代码,抽象为一个函数,使用时传入参数,如表名和where条件
import_data(){
$sqoop import \
--connect jdbc:mysql://node3:3306/$APP \
--username root \
--password 000000 \
--target-dir /origin_data/$APP/db/$1/$do_date \
--delete-target-dir \
--query "$2 where \$CONDITIONS" \
--num-mappers 1 \
--fields-terminated-by '\t' \
--compress \
--compression-codec lzop \
--null-string '\\N' \
--null-non-string '\\N'
# 对lzop压缩后的文件创建索引
hadoop jar /export/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /origin_data/$APP/db/$1/$do_date
}
–null-string ‘\N’
–null-non-string ‘\N’
表示将空值转换为 \N,这样才能被Hive识别,毕竟我们将要创建的是Hive数据仓库
全量同步所有表的脚本:
#! /bin/bash
APP=db
sqoop=/export/soft/sqoop-1.4.6.bin__hadoop-2.0.4-alpha/
if [ -n "$2" ] ;then
do_date=$2
else
echo "请传入日期参数"
exit
fi
import_data(){
$sqoop import \
--connect jdbc:mysql://node3:3306/$APP \
--username root \
--password 123456 \
--target-dir /origin_data/$APP/db/$1/$do_date \
--delete-target-dir \
--query "$2 where \$CONDITIONS" \
--num-mappers 1 \
--fields-terminated-by '\t' \
--compress \
--compression-codec lzop \
--null-string '\\N' \
--null-non-string '\\N'
hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /origin_data/$APP/db/$1/$do_date
}
import_order_info(){
import_data order_info "select
id,
total_amount,
order_status,
user_id,
payment_way,
delivery_address,
out_trade_no,
create_time,
operate_time,
expire_time,
tracking_no,
province_id,
activity_reduce_amount,
coupon_reduce_amount,
original_total_amount,
feight_fee,
feight_fee_reduce
from order_info"
}
import_coupon_use(){
import_data coupon_use "select
id,
coupon_id,
user_id,
order_id,
coupon_status,
get_time,
using_time,
used_time,
expire_time
from coupon_use"
}
import_order_status_log(){
import_data order_status_log "select
id,
order_id,
order_status,
operate_time
from order_status_log"
}
import_user_info(){
import_data "user_info" "select
id,
login_name,
nick_name,
name,
phone_num,
email,
user_level,
birthday,
gender,
create_time,
operate_time
from user_info"
}
import_order_detail(){
import_data order_detail "select
id,
order_id,
sku_id,
sku_name,
order_price,
sku_num,
create_time,
source_type,
source_id,
split_total_amount,
split_activity_amount,
split_coupon_amount
from order_detail"
}
import_payment_info(){
import_data "payment_info" "select
id,
out_trade_no,
order_id,
user_id,
payment_type,
trade_no,
total_amount,
subject,
payment_status,
create_time,
callback_time
from payment_info"
}
import_comment_info(){
import_data comment_info "select
id,
user_id,
sku_id,
spu_id,
order_id,
appraise,
create_time
from comment_info"
}
import_order_refund_info(){
import_data order_refund_info "select
id,
user_id,
order_id,
sku_id,
refund_type,
refund_num,
refund_amount,
refund_reason_type,
refund_status,
create_time
from order_refund_info"
}
import_sku_info(){
import_data sku_info "select
id,
spu_id,
price,
sku_name,
sku_desc,
weight,
tm_id,
category3_id,
is_sale,
create_time
from sku_info"
}
import_base_category1(){
import_data "base_category1" "select
id,
name
from base_category1"
}
import_base_category2(){
import_data "base_category2" "select
id,
name,
category1_id
from base_category2"
}
import_base_category3(){
import_data "base_category3" "select
id,
name,
category2_id
from base_category3"
}
import_base_province(){
import_data base_province "select
id,
name,
region_id,
area_code,
iso_code,
iso_3166_2
from base_province"
}
import_base_region(){
import_data base_region "select
id,
region_name
from base_region"
}
import_base_trademark(){
import_data base_trademark "select
id,
tm_name
from base_trademark"
}
import_spu_info(){
import_data spu_info "select
id,
spu_name,
category3_id,
tm_id
from spu_info"
}
import_favor_info(){
import_data favor_info "select
id,
user_id,
sku_id,
spu_id,
is_cancel,
create_time,
cancel_time
from favor_info"
}
import_cart_info(){
import_data cart_info "select
id,
user_id,
sku_id,
cart_price,
sku_num,
sku_name,
create_time,
operate_time,
is_ordered,
order_time,
source_type,
source_id
from cart_info"
}
import_coupon_info(){
import_data coupon_info "select
id,
coupon_name,
coupon_type,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
create_time,
range_type,
limit_num,
taken_count,
start_time,
end_time,
operate_time,
expire_time
from coupon_info"
}
import_activity_info(){
import_data activity_info "select
id,
activity_name,
activity_type,
start_time,
end_time,
create_time
from activity_info"
}
import_activity_rule(){
import_data activity_rule "select
id,
activity_id,
activity_type,
condition_amount,
condition_num,
benefit_amount,
benefit_discount,
benefit_level
from activity_rule"
}
import_base_dic(){
import_data base_dic "select
dic_code,
dic_name,
parent_code,
create_time,
operate_time
from base_dic"
}
import_order_detail_activity(){
import_data order_detail_activity "select
id,
order_id,
order_detail_id,
activity_id,
activity_rule_id,
sku_id,
create_time
from order_detail_activity"
}
import_order_detail_coupon(){
import_data order_detail_coupon "select
id,
order_id,
order_detail_id,
coupon_id,
coupon_use_id,
sku_id,
create_time
from order_detail_coupon"
}
import_refund_payment(){
import_data refund_payment "select
id,
out_trade_no,
order_id,
sku_id,
payment_type,
trade_no,
total_amount,
subject,
refund_status,
create_time,
callback_time
from refund_payment"
}
import_sku_attr_value(){
import_data sku_attr_value "select
id,
attr_id,
value_id,
sku_id,
attr_name,
value_name
from sku_attr_value"
}
import_sku_sale_attr_value(){
import_data sku_sale_attr_value "select
id,
sku_id,
spu_id,
sale_attr_value_id,
sale_attr_id,
sale_attr_name,
sale_attr_value_name
from sku_sale_attr_value"
}
case $1 in
"order_info")
import_order_info
;;
"base_category1")
import_base_category1
;;
"base_category2")
import_base_category2
;;
"base_category3")
import_base_category3
;;
"order_detail")
import_order_detail
;;
"sku_info")
import_sku_info
;;
"user_info")
import_user_info
;;
"payment_info")
import_payment_info
;;
"base_province")
import_base_province
;;
"base_region")
import_base_region
;;
"base_trademark")
import_base_trademark
;;
"activity_info")
import_activity_info
;;
"cart_info")
import_cart_info
;;
"comment_info")
import_comment_info
;;
"coupon_info")
import_coupon_info
;;
"coupon_use")
import_coupon_use
;;
"favor_info")
import_favor_info
;;
"order_refund_info")
import_order_refund_info
;;
"order_status_log")
import_order_status_log
;;
"spu_info")
import_spu_info
;;
"activity_rule")
import_activity_rule
;;
"base_dic")
import_base_dic
;;
"order_detail_activity")
import_order_detail_activity
;;
"order_detail_coupon")
import_order_detail_coupon
;;
"refund_payment")
import_refund_payment
;;
"sku_attr_value")
import_sku_attr_value
;;
"sku_sale_attr_value")
import_sku_sale_attr_value
;;
"all")
import_base_category1
import_base_category2
import_base_category3
import_order_info
import_order_detail
import_sku_info
import_user_info
import_payment_info
import_base_region
import_base_province
import_base_trademark
import_activity_info
import_cart_info
import_comment_info
import_coupon_use
import_coupon_info
import_favor_info
import_order_refund_info
import_order_status_log
import_spu_info
import_activity_rule
import_base_dic
import_order_detail_activity
import_order_detail_coupon
import_refund_payment
import_sku_attr_value
import_sku_sale_attr_value
;;
esac