方式一(推荐):
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test 'test' /user/hive/warehouse/s66_postgres.db/test_miaodazhuang1/dt=20210107 "SELECT CAST(EXTRACT(EPOCH FROM COALESCE(update_time,create_time)::TIMESTAMP WITH TIME ZONE)* 1000 AS int8)||'|9' AS event_timestamp,CAST('INSERT' AS VARCHAR) AS event_type,tt.* FROM test_miaodazhuang1 AS tt WHERE ((create_time >= '2021-01-07 00:00:00' and create_time<= '2021-01-08') OR (update_time >= '2021-01-07 00:00:00' and update_time<= '2021-01-08'))" create_time test_miaodazhuang1 root.etl
etl.sh 详细代码如下:
#!/bin/bash
export LNAG=zh_CN.UTF-8
# 补数逻辑,pg => hdfs
# -- --schema <name>
connect=$1
username=$2
password=$3
hdfs_path=$4
query_condition=$5
alias_time=$6
tablename=$7
queue=$8
hdfs_path_tmp=${hdfs_path}_tmp
query_condition="${query_condition}"
echo "query_condition==> ${query_condition}"
sqoop_sql="sqoop import -D mapred.job.queue.name=${queue}\
--connect ${connect}?connectTimeout=300 \
--username ${username} \
--password ${password} \
--target-dir ${hdfs_path_tmp} \
--fields-terminated-by '\t' \
--hive-delims-replacement ' ' \
--null-string '\\\\N' \
--null-non-string '\\\\N' \
--query \"${query_condition} AND \\\$CONDITIONS\" \
--split-by ${alias_time} \
"
echo "执行的sqoop脚本==> ${sqoop_sql}"
eval $sqoop_sql
if [ $? -eq 0 ] ;then
echo "`date "+%Y-%m-%d %H:%M:%S"` ==> 提交抽数任务到 hdfs ${hdfs_path_tmp} 成功"
echo "==>准备删除_SUCCESS文件"
rm_sucess_file=" hdfs dfs -rm ${hdfs_path_tmp}/_SUCCESS "
echo "rm_sucess_file == ${rm_sucess_file}"
eval $rm_sucess_file
if [ $? -eq 0 ] ;then
echo "删除${rm_sucess_file}文件成功"
else
echo "删除${rm_sucess_file}文件失败"
fi
echo "==>判断${hdfs_path}路径是否存在"
hdfs dfs -test -e ${hdfs_path}
if [ $? -eq 0 ] ;then
echo "${hdfs_path} exist"
else
echo "${hdfs_path} is not exist! create it"
hdfs dfs -mkdir -p ${hdfs_path}
fi
echo "==>数据从临时路径${hdfs_path_tmp} 移动到${hdfs_path}"
files=`hadoop fs -ls ${hdfs_path_tmp} | awk -F " " '{print $8}'`
for file in ${files}
do
time_stamp=`date +%s`
filename=`echo ${file} | awk -F "part-m-" '{print $2}'`
mv_sql=" hdfs dfs -mv ${file} ${hdfs_path}/${tablename}-etl-${time_stamp}"
echo "mv_sql == ${mv_sql}"
eval $mv_sql
if [ $? -eq 0 ] ;then
echo "移动${file}到${hdfs_path}成功"
else
echo "移动${file}到${hdfs_path}失败"
fi
done
echo "==>删除临时文件夹${hdfs_path_tmp}"
rm_tmp_dir="hdfs dfs -rm -r ${hdfs_path_tmp}"
echo "rm_tmp_dir == ${rm_tmp_dir}"
eval $rm_tmp_dir
if [ $? -eq 0 ] ;then
echo "删除临时文件夹${hdfs_path_tmp}成功"
else
echo "删除临时文件夹${hdfs_path_tmp}失败"
fi
echo "`date "+%Y-%m-%d %H:%M:%S"` ==> 抽数数据到 hdfs ${hdfs_path} 结束"
else
echo "`date "+%Y-%m-%d %H:%M:%S"` ==> 抽数库中数据到 hdfs ${hdfs_path_tmp} 失败"
exit 1
fi
生成的sqoop抽数的命令如下所示:
sqoop import -D mapred.job.queue.name=root.etl --connect jdbc:postgresql://192.168.0.36:5432/postgres?connectTimeout=300 --username test --password test --target-dir /user/hive/warehouse/s66_postgres.db/test_miaodazhuang1/dt=20210107_tmp --fields-terminated-by '\t' --hive-delims-replacement ' ' --null-string '\\N' --null-non-string '\\N' --query "SELECT CAST(EXTRACT(EPOCH FROM COALESCE(update_time,create_time)::TIMESTAMP WITH TIME ZONE)* 1000 AS int8)||'|9' AS event_timestamp,CAST('INSERT' AS VARCHAR) AS event_type,tt.* FROM test_miaodazhuang1 AS tt WHERE ((create_time >= '2021-01-07 00:00:00' and create_time<= '2021-01-08') OR (update_time >= '2021-01-07 00:00:00' and update_time<= '2021-01-08')) AND \$CONDITIONS" --split-by create_time
方式二:
params_tmp.txt
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
sh /data/dfp/etl/etl.sh jdbc:postgresql://192.168.0.36:5432/postgres test test /user/hive/warehouse/s66_postgres.db/test_part/dt=20201209 "SELECT CASE WHEN update_time !=null THEN update_time ELSE create_time END as event_timestamp,'INSERT' AS event_type,tt.* FROM test_part AS tt WHERE ((create_time >= '2020-12-09 00:00:00' and create_time<= '2020-12-10') OR (update_time >= '2020-12-09 00:00:00' and update_time<= '2020-12-10')) AND \\\\\\$CONDITIONS" create_time test_part root.etl
#!/bin/bash
export LNAG=zh_CN.UTF-8
# 补数逻辑,读取文件 array.txt=> line
cat params_tmp.txt|while read line
do
echo "sqoop_sql==>${line}"
eval $line
if [ $? -eq 0 ] ;then
echo "`date "+%Y-%m-%d %H:%M:%S"` ==> 提交抽数任务到 hdfs ${hdfs_path_tmp} 成功"
else
echo "`date "+%Y-%m-%d %H:%M:%S"` ==> 抽数库中数据到 hdfs ${hdfs_path_tmp} 失败"
exit 1
fi
sleep 1
done
echo "==>准备删除_参数文件"
rm_sucess_file="rm -f /data/dfp/etl/params_tmp.txt"
echo "rm_sucess_file == ${rm_sucess_file}"
eval $rm_sucess_file
echo "==>删除_参数文件成功!"