Sqoop is currently used mostly for building data warehouses, and in many cases synchronizing a table means hand-writing a Sqoop script for it. Whether a run succeeded, how many rows it moved, and whether the source table was updated are often unknown, and the script files themselves are hard to manage.
The data platform introduced here adds proper management capabilities for all of this.
Below is the table design.
DROP TABLE IF EXISTS task_info;
CREATE TABLE task_info (
    task_id            INT,
    database_id        INT,
    db_table           VARCHAR(100),
    hive_table         VARCHAR(100),
    db_columns         VARCHAR(5000),
    where_express      VARCHAR(200),
    is_hive_import     VARCHAR(2),
    hive_partition_key VARCHAR(10),
    mappers            INT,
    split_by           VARCHAR(10),
    is_use             VARCHAR(2),
    priority           INT,
    cron               VARCHAR(100),
    alter_date         LONGTEXT,
    meta_data          LONGTEXT,
    PRIMARY KEY (task_id)
);

DROP TABLE IF EXISTS db_info;
CREATE TABLE db_info (
    id       INT,
    uname    VARCHAR(100),
    password VARCHAR(500),
    url      VARCHAR(200),
    PRIMARY KEY (id)
);

DROP TABLE IF EXISTS task_log;
CREATE TABLE task_log (
    name       VARCHAR(100),
    begin_time VARCHAR(100),
    end_time   VARCHAR(100),
    result     VARCHAR(100),
    data_num   VARCHAR(50),
    log        LONGTEXT
);
All the parameters Sqoop needs, including the source table, are driven by these configuration tables. To change a task you only need to execute an SQL statement against them; the scheduler, which runs on a timer, picks up the change on its next pass.
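For example, registering a new sync task is just an insert into task_info. The sketch below is a minimal illustration, reusing the DBConnection helper that appears in the extractor code further down; all the column values are invented for the example:

import java.sql.Connection;
import java.sql.PreparedStatement;

// Hypothetical one-off registration of a sync task; values are examples only.
public class RegisterTaskExample {
    public static void main(String[] args) throws Exception {
        Connection con = DBConnection.getInstance(); // same helper the extractor uses
        PreparedStatement ps = con.prepareStatement(
                "INSERT INTO task_info (task_id, database_id, db_table, hive_table, "
                + "db_columns, where_express, is_hive_import, hive_partition_key, "
                + "mappers, split_by, is_use, priority, cron) "
                + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
        ps.setInt(1, 1001);                           // task_id
        ps.setInt(2, 1);                              // database_id, points at db_info.id
        ps.setString(3, "ORDERS");                    // source table
        ps.setString(4, "dw.orders");                 // target hive table
        ps.setString(5, "ORDER_ID,USER_ID,AMOUNT");   // columns to pull
        ps.setString(6, "CREATE_DATE = $TXDATE");     // $TXDATE is resolved at run time
        ps.setString(7, "Y");                         // import into hive
        ps.setString(8, "ds");                        // hive partition key
        ps.setInt(9, 4);                              // mappers
        ps.setString(10, "ORDER_ID");                 // split-by column
        ps.setString(11, "Y");                        // task enabled
        ps.setInt(12, 1);                             // priority
        ps.setString(13, "0 0 2 * * ?");              // cron expression for the scheduler
        ps.executeUpdate();
        ps.close();
    }
}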
Next comes the core code.
public class SecureEncrypter implements Encrypter {
    private final int seed;

    public SecureEncrypter(int seed) {
        this.seed = seed;
    }

    @Override
    public String encode(String input) {
        // XOR every character with the seed and join the results with 'u',
        // e.g. "abc" with seed 7 becomes "102u101u100".
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < input.length(); i++) {
            int k = input.charAt(i);
            builder.append('u').append(k ^ seed);
        }
        // Drop the leading separator.
        return builder.substring(1);
    }

    @Override
    public String decode(String input) {
        // Reverse of encode: split on the separator, XOR each value back.
        String[] arr = input.split("u");
        StringBuilder builder = new StringBuilder();
        for (String str : arr) {
            int t = Integer.parseInt(str) ^ seed;
            builder.append((char) t);
        }
        return builder.toString();
    }
}
The class above handles encrypting and decrypting passwords. Database passwords must not be exposed in the clear, so they are stored encoded in db_info and only decoded at run time to build Sqoop's --password argument.
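A minimal round trip looks like this (the seed 17 is arbitrary; it only has to match on both sides). Note that XOR with a fixed seed is lightweight obfuscation rather than real cryptography:

public class EncrypterDemo {
    public static void main(String[] args) {
        Encrypter enc = new SecureEncrypter(17);
        String stored = enc.encode("myDbPassword"); // e.g. "124u104u85u...", safe to store in db_info.password
        System.out.println(enc.decode(stored));     // prints "myDbPassword" again
    }
}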
public class HDFSExtract implements Extract {
    /**
     * Manages the extraction of a configured source table into HDFS/Hive via sqoop.
     */
    public static Logger log = Logger.getLogger(HDFSExtract.class);
    private Connection con = null;
    private Statement st = null;
    private ResultSet rs = null;
    private int task_id;
    private String db_user = null;
    private String db_password = null;
    private String db_table = null;
    private String db_columns = null;
    private String url = null;
    private String where = null;
    private String is_hive_import = null;
    private String hive_table = null;
    private String hive_partition_key = null;
    private String split_by = null;
    private String count_date = null;
    private String mappers = null;

    public void extract(TaskInfo tInfo)
    {
        // 1. Read the task configuration (one row of task_info).
        // count_date defaults to yesterday; alter_date overrides it.
        count_date = ManaUtils.getData(-1);
        try {
            if (tInfo.getAlter_date() != null && !"".equals(tInfo.getAlter_date())) {
                count_date = tInfo.getAlter_date();
            }
            // 2. Look up the source database credentials.
            String sql = "select d.uname, d.password, d.url from db_info d where d.id = ";
            sql += tInfo.getDatabase_id();
            log.info("sql: " + sql);
            con = DBConnection.getInstance();
            st = con.createStatement();
            rs = st.executeQuery(sql);
            Runtime run = null;
            if (rs.next())
            {
                ETL_vo vo = new ETL_vo();
                setFromResultSet(rs, tInfo);
                vo.setTableName(hive_table);
                run = Runtime.getRuntime();
                try {
                    // 3. Remove any leftover target directory from a previous run;
                    // waitFor() ensures the deletion finishes before the import starts.
                    run.exec("hadoop fs -rmr " + db_table.trim()).waitFor();
                    run.exec("hadoop fs -rmr " + db_table.trim().toUpperCase()).waitFor();
                    run.exec("hadoop fs -rmr " + db_table.trim().toLowerCase()).waitFor();
                    log.info("Begin executing task " + task_id + " " + hive_table + " ......" + ManaUtils.sdf.format(new Date()));
                    vo.setBeginTime(ManaUtils.sdf.format(new Date()));
                    vo.setResult("SUCCESS");
                    // 4. Build and launch the sqoop command.
                    log.info(toString(createOptions()));
                    Process p = run.exec(createOptions());
                    ManaUtils.exeOption(p, tInfo, st, null);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            log.info("Finish Extracting ! " + ManaUtils.sdf.format(new Date()));
        } catch (Exception e) {
            log.info("Error: extract fail !");
            e.printStackTrace();
            return;
        }
        finally {
            try {
                if (rs != null) rs.close();
                if (st != null) st.close();
                // con is shared via DBConnection.getInstance(), so it is not closed here
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    public void setFromResultSet(ResultSet rs, TaskInfo tInfo) throws SQLException
    {
        task_id = tInfo.getTask_id();
        db_user = rs.getString("uname").trim();
        db_password = rs.getString("password").trim();
        url = rs.getString("url");
        db_table = tInfo.getDb_table();
        db_columns = tInfo.getDb_columns();
        where = tInfo.getWhere_express();
        is_hive_import = tInfo.getIs_hive_import();
        hive_table = tInfo.getHive_table();
        hive_partition_key = tInfo.getHive_partition_key();
        mappers = tInfo.getMappers().toString();
        split_by = tInfo.getSplit_by();
    }

    /** Joins the option array into one human-readable command line for logging. */
    public String toString(String[] args) {
        StringBuilder builder = new StringBuilder();
        for (String a : args) {
            builder.append(a).append(" ");
        }
        return builder.toString();
    }

    public String[] createOptions() throws Exception
    {
        List<String> optSb = new ArrayList<String>();
        optSb.add("sqoop");
        optSb.add("import");
        optSb.add("-D");
        optSb.add("mapred.job.queue.name=pms");
        optSb.add("--connect");
        optSb.add(url);
        optSb.add("--username");
        optSb.add(db_user);
        optSb.add("--password");
        // Passwords are stored encoded; decode just in time for the command line.
        optSb.add(EncrypterFactory.getEncryperInstance(1).decode(db_password));
        if (mappers != null && !"".equals(mappers))
        {
            optSb.add("-m");
            optSb.add(mappers);
        }
        if (split_by != null && !"".equals(split_by)) {
            optSb.add("--split-by");
            optSb.add(split_by.toUpperCase());
        }
        optSb.add("--table");
        optSb.add(db_table);
        // The following task_info columns are optional, so check each before adding.
        if (db_columns != null && !"".equals(db_columns)) {
            optSb.add("--columns");
            optSb.add("\"" + db_columns.toUpperCase() + "\"");
        }
        if (where != null && !"".equals(where)) {
            optSb.add("--where");
            // $TXDATE is a placeholder resolved to the quoted run date (see below).
            where = where.replaceAll("\\$TXDATE", "'" + count_date + "'");
            optSb.add("\"" + where.trim() + "\"");
        }
        if (is_hive_import != null && !"".equals(is_hive_import)
                && "Y".equalsIgnoreCase(is_hive_import))
        {
            optSb.add("--hive-overwrite"); // always overwrite; the hive table must already exist
            optSb.add("--hive-import");
            optSb.add("--hive-drop-import-delims");
            if (hive_table == null || "".equals(hive_table)) {
                log.info("Error: hive_table must be set when --hive-import is used!");
            } else {
                optSb.add("--hive-table");
                optSb.add(hive_table.trim());
            }
            if (hive_partition_key != null && !"".equals(hive_partition_key)) {
                optSb.add("--hive-partition-key");
                optSb.add(hive_partition_key.trim());
                optSb.add("--hive-partition-value");
                optSb.add("\"" + count_date.trim() + "\"");
            }
        }
        optSb.add("--null-string");
        optSb.add("'\\\\N'");
        optSb.add("--null-non-string");
        optSb.add("'\\\\N'");
        return optSb.toArray(new String[0]);
    }
}
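The listing does not show how the scheduler invokes the extractor. A minimal driver sketch, assuming TaskInfo exposes setters mirroring the getters used above, might look like this:

// Hypothetical driver: configure one task in memory and run it.
public class ExtractDriver {
    public static void main(String[] args) {
        TaskInfo tInfo = new TaskInfo();
        tInfo.setTask_id(1001);
        tInfo.setDatabase_id(1);                      // foreign key into db_info
        tInfo.setDb_table("ORDERS");
        tInfo.setHive_table("dw.orders");
        tInfo.setDb_columns("ORDER_ID,USER_ID,AMOUNT");
        tInfo.setWhere_express("CREATE_DATE = $TXDATE");
        tInfo.setIs_hive_import("Y");
        tInfo.setHive_partition_key("ds");
        tInfo.setMappers(4);
        tInfo.setSplit_by("ORDER_ID");

        new HDFSExtract().extract(tInfo);             // builds and launches the sqoop import
    }
}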
In many cases a table has to be synchronized incrementally every day, so the WHERE condition needs a date predicate.
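That is what the $TXDATE placeholder in where_express is for: createOptions() replaces it with count_date, which defaults to yesterday and can be overridden by alter_date. A quick illustration of the substitution:

// Shows how the $TXDATE placeholder becomes a concrete, quoted run date.
public class TxDateDemo {
    public static void main(String[] args) {
        String where = "CREATE_DATE = $TXDATE";
        String countDate = "2015-06-01";  // example value of count_date (yesterday)
        String resolved = where.replaceAll("\\$TXDATE", "'" + countDate + "'");
        System.out.println(resolved);     // CREATE_DATE = '2015-06-01'
    }
}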