Purpose: pull data from MySQL or PostgreSQL and store it as Hive ORC files.
1. Preparation
The main function
In the main function, the CliOptionParser singleton is created first, and its parseArgs method is then called to parse the command-line arguments:
private static CliOptionParser opt = CliOptionParser.getInstance();
public static void main(String[] args) throws Throwable {
opt.parseArgs(args);
if (opt.command == null) {
opt.showHelp();
exit(0);
}
startTask(args);
}
parseArgs
parseArgs creates a JCommander instance and a Reflections scanner.
It first collects the classes annotated with @TaskOption; each of these holds a task's parameter configuration, and the annotation's task attribute names the task class it belongs to. This yields a map whose key is the task class and whose value is the option class.
It then collects the classes annotated with @TaskName; these are the task classes. The annotation's value is the task name, which becomes the key of a tasks map, so that from a task-name string we can find both the task class to instantiate and its option instance.
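The annotation definitions themselves are not quoted in this write-up; here is a minimal sketch of what they plausibly look like, inferred from how parseArgs uses them (taskOption.task() returns a task class, taskname.value() returns the command name):

import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

// hypothetical definitions, reconstructed from usage
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
@interface TaskName {
    String value(); // the command name typed on the CLI
}

@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
@interface TaskOption {
    Class<? extends Task> task(); // the task class these options configure
}

With those in mind, parseArgs itself: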
String parseArgs(String[] args) {
commander = new JCommander();
logger.info("Start scanning tasks in package");
Reflections reflections = new Reflections("com.qunar.hotel.qhstats.orc.toolkit");
// scan for options class
Map<Class<? extends Task>, Class<?>> optionMap = new HashMap<>();
for (Class<?> clazz : reflections.getTypesAnnotatedWith(TaskOption.class)) {
TaskOption taskOption = clazz.getAnnotation(TaskOption.class);
if (taskOption == null) continue; // this shouldn't happen
optionMap.put(taskOption.task(), clazz);
}
for (Class<?> clazz : reflections.getTypesAnnotatedWith(TaskName.class)) {
try {
// get the task name
TaskName taskname = clazz.getAnnotation(TaskName.class);
if (taskname == null) {
logger.warn("class {} is not @TaskName annotated, ignore", clazz.getName());
continue;
}
if (!Task.class.isAssignableFrom(clazz)) {
logger.warn("class {} is not derived from Task, ignore", clazz.getName());
continue;
}
// cast to Class<? extends Task> and look up its option class
Class<?> optionClass = optionMap.get(uncheckedCastToTask(clazz));
if (optionClass == null || !Task.Options.class.isAssignableFrom(optionClass)) {
logger.warn("cannot find option class for {}, ignored", clazz.getName());
continue;
}
String keyString = taskname.value();
// instantiate the option class
Task.Options taskOption = (Task.Options) optionClass.newInstance();
// register as a JCommander command: name = task name, object = the option instance
commander.addCommand(keyString, taskOption);
logger.info("add command [{}] with option {} and task {}", keyString, optionClass.getSimpleName(), clazz.getSimpleName());
// store into tasks: task name -> Tuple2<option instance, task class>
tasks.put(keyString, new Tuple2<>(taskOption, uncheckedCastToTask(clazz)));
} catch (IllegalAccessException | InstantiationException e) {
logger.warn("fail to register task class " + clazz.getName(), e);
}
}
try {
// parse the command line; the option objects have all been created and registered by now
commander.parse(args);
} catch (ParameterException e) {
System.err.println(e.getMessage());
commander.usage();
exit(126);
}
// record the parsed command, which is the task name passed on the command line
command = commander.getParsedCommand();
return command;
}
By the time commander.parse(args) runs, all option objects have been created and registered.
Once the arguments are parsed into the matching option object, the task has to be created with that option and executed.
startTask
Recall the main function:
public static void main(String[] args) throws Throwable {
opt.parseArgs(args);
if (opt.command == null) {
opt.showHelp();
exit(0);
}
startTask(args);
}
startTask calls CliOptionParser's createTask method to create the task:
private static void startTask(String[] args) throws Throwable {
Task task = opt.createTask();
task.run();
}
createTask
createTask takes the command parsed from the command line, fetches that command's options, validates the database connection, and then instantiates the task class:
Task createTask() throws IllegalArgumentException {
try {
// look up the options registered for this command in the tasks map
Task.Options options = getOpts();
if (options.help) {
showHelp();
exit(0);
}
// validate the DB connection parameters
options.sanity();
// look up the task class
Class<? extends Task> taskClazz = tasks.get(command).snd();
logger.info("Creating task for command [{}]", command);
// instantiate the task class, passing in the options
return taskClazz.getDeclaredConstructor(Task.Options.class).newInstance(options);
} catch (IllegalAccessException | InvocationTargetException | NoSuchMethodException | InstantiationException e) {
logger.error("Error occur when create task", e);
throw new RuntimeException(e);
} catch (NullPointerException e) {
logger.error("Error finding command", e);
throw new RuntimeException("Cannot find command " + command);
}
}
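Tuple2 is a simple pair container whose definition is not quoted; judging from new Tuple2<>(taskOption, ...) and the .snd() call above, it is presumably something like:

// hypothetical pair type, reconstructed from usage
public class Tuple2<A, B> {
    private final A fst;
    private final B snd;

    public Tuple2(A fst, B snd) {
        this.fst = fst;
        this.snd = snd;
    }

    public A fst() { return fst; }
    public B snd() { return snd; }
}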
sanity validates the connection parameters:
public boolean sanity() throws IllegalArgumentException {
if (dbType == null)
throw new IllegalArgumentException("mission -d|--dbType");
OptionChecks.checkDBConnect(dbType, connectionDescription);
return true;
}
Both dbType and connectionDescription come from the command line.
connectionDescription has the form host=?,port=?,dbname=?,user=?,password=? and is parsed into a map before being passed in.
dbType is mysql, pg, or qmha.
checkDBConnect
private static final List<String> requiredForConnection = ImmutableList.of("dbname", "user", "password", "host", "port");
private static final List<String> requiredForPxcConnection = ImmutableList.of("dbname", "user", "password", "namespace");

public static boolean checkDBConnect(ConnectionPool.DBType dbType, Map<String, String> connectionDescription) {
    if (connectionDescription == null) throw new IllegalArgumentException("missing -c|--connection");
    switch (dbType) {
        case MYSQL:
        case POSTGRES:
            if (!requiredForConnection.stream().allMatch(connectionDescription::containsKey))
                throw new IllegalArgumentException("need " + requiredForConnection + " in -c|--connection");
            break;
        case QMHA:
            if (!requiredForPxcConnection.stream().allMatch(connectionDescription::containsKey))
                throw new IllegalArgumentException("need " + requiredForPxcConnection + " in -c|--connection");
    }
    return true;
}
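For example, a MySQL connection description missing the port fails the check (hypothetical values):

Map<String, String> conn = new HashMap<>();
conn.put("dbname", "test");
conn.put("user", "reader");
conn.put("password", "secret");
conn.put("host", "127.0.0.1");
// no "port" entry
OptionChecks.checkDBConnect(ConnectionPool.DBType.MYSQL, conn);
// throws IllegalArgumentException: need [dbname, user, password, host, port] in -c|--connection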
2. The table-creation task
public void run() throws Throwable {
StringBuilder builder = new StringBuilder();
builder.append("create external table ").append(opts.hiveTableName).append(" (");
ConnectionPool connectionPool = new ConnectionPool(opts.connectionDescription, opts.dbType);
TableComments comments = new TableComments(opts.dbType, opts.connectionDescription.get("dbname"), opts.schemaAndTable);
comments.getComments(connectionPool);
try (
final Connection connection = connectionPool.getConnection();
final PreparedStatement statement = connection.prepareStatement(pretendQuery())
) {
statement.execute();
final ResultSet rs = statement.getResultSet();
final ResultSetMetaData meta = rs.getMetaData();
for (int i = 1; i <= meta.getColumnCount(); i++) {
if (i != 1) builder.append(",");
builder.append("\n");
HiveMappedType hiveType = toHiveType(meta.getColumnType(i), meta.getColumnClassName(i));
String typeString;
if (hiveType.isPrimitive()) {
typeString = hiveType.hiveType();
} else if (hiveType == MAP) {
typeString = "map<string,string>"; // generic parameters restored; string keys/values assumed
} else if (hiveType == DECIMAL) {
typeString = String.format("decimal(%2$d, %1$d)", meta.getScale(i), meta.getPrecision(i));
} else {
logger.warn("cannot map type {}:{}:{}.", meta.getColumnName(i), meta.getColumnType(i), meta.getColumnClassName(i));
typeString = "-=≡[ FILL TYPE HERE ]≡=-";
}
builder.append(String.format(" %-25s %-10s comment '%s'",
meta.getColumnName(i),
typeString,
Optional.ofNullable(comments.columnComments.get(meta.getColumnName(i))).orElse(FILL_COMMENT_HERE)));
}
rs.close();
} catch (SQLException e) {
Throwables.propagateIfPossible(e);
}
ConnectionPool
A ConnectionPool is created here; depending on the DB type it is backed by the MySQL JDBC data source or PGSimpleDataSource, with a HikariDataSource built on top:
public ConnectionPool(Map<String, String> connection, DBType dbType) {
Properties prop = new Properties();
// for QMHA, run the PXC logic: fetch a readable node's host/port from an HTTP API
if (dbType == ConnectionPool.DBType.QMHA) {
dbType = ConnectionPool.DBType.MYSQL;
// connection is the map of DB connection info
rewriteDBConnectionForPxc(connection);
}
// configure the Hikari properties
switch (dbType) {
case MYSQL:
prop.setProperty("dataSourceClassName", MYSQL_DATA_SOURCE_CLASS);
prop.setProperty("readOnly", "true");
prop.setProperty("dataSource.useServerPrepStmts", "true");
prop.setProperty("dataSource.useUnicode", "true");
prop.setProperty("dataSource.port", connection.get("port"));
prop.setProperty("dataSource.zeroDateTimeBehavior", "convertToNull");
break;
case POSTGRES:
prop.setProperty("dataSourceClassName", POSTGRESQL_DATA_SOURCE_CLASS);
prop.setProperty("dataSource.readOnly", "true");
prop.setProperty("dataSource.portNumber", connection.get("port"));
break;
}
prop.setProperty("dataSource.user", connection.get("user"));
prop.setProperty("dataSource.password", connection.get("password"));
prop.setProperty("dataSource.serverName", connection.get("host"));
prop.setProperty("dataSource.databaseName", connection.get("dbname"));
prop.put("dataSource.logWriter", new PrintWriter(System.err));
config = new HikariConfig(prop);
datasource = new HikariDataSource(config);
}
rewriteDBConnectionForPxc
This sends requests to the PXC API and parses the returned JSON; if no URI yields a usable node, it throws a RuntimeException. This logic runs only for PXC (QMHA):
private static void rewriteDBConnectionForPxc(Map<String, String> connection) {
String host = null;
int port = -1;
//pxcUris is a list of PXC URI patterns, e.g. http://dba.corp.qunar.com:6500/prod/pxc/%s/nodes
for (String pxcPattern : pxcUris) {
try {
//fill the %s in the pattern with the namespace
String pxcUri = String.format(pxcPattern, connection.get("namespace"));
HttpResponse<JsonNode> resp = Unirest.get(pxcUri).header("accept", "application/json").asJson();
JSONObject respData = resp.getBody().getObject();
if (respData.getInt("ret") != 0) continue;
for (Object o : respData.getJSONArray("data")) {
JSONObject node = (JSONObject) o;
if (node.getString("online_status").equals("online") && node.getString("role_flag").contains("read")) {
host = node.getString("ip");
port = node.getInt("port");
logger.info("using {}:{} for read", host, port);
break;
}
}
if (port == -1 || host == null) continue;
connection.put("host", host);
connection.put("port", Integer.toString(port));
return;
} catch (UnirestException e) {
logger.error("Error getting pxc read node", e);
}
}
throw new RuntimeException("Cannot find pxc read node");
}
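The API response format is not documented here, but judging from the fields the parser reads (ret, data[], online_status, role_flag, ip, port), a successful response presumably looks something like this (hypothetical values):

{
  "ret": 0,
  "data": [
    { "ip": "10.0.0.1", "port": 3306, "online_status": "online", "role_flag": "read" },
    { "ip": "10.0.0.2", "port": 3306, "online_status": "online", "role_flag": "write" }
  ]
}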
TableComments
TableComments fetches the table and column comments.
public TableComments(ConnectionPool.DBType type, String dbName, String tableName) {
tableComments = "";
columnComments = new HashMap<>();
this.type = type;
String[] tableAndSchema = getTableAndSchema(tableName, dbName);
this.tableName = tableAndSchema[1];
this.dbName = tableAndSchema[0];
}
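getTableAndSchema is not quoted; based on how its result is unpacked above ([0] = db/schema, [1] = table) and on how schemaAndTable is split elsewhere in the code, it presumably behaves like this sketch:

// hypothetical reconstruction: split "schema.table", defaulting the schema to dbName
private static String[] getTableAndSchema(String tableName, String dbName) {
    String[] parts = tableName.split("\\.", 2);
    if (parts.length == 2) {
        return new String[]{parts[0], parts[1]};
    }
    return new String[]{dbName, tableName};
}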
comments.getComments(connectionPool) is then called with the pool built above. It picks the comment queries for the DB type, then extracts the column names and comments into columnComments:
public void getComments(ConnectionPool pool) {
String tableCommentSql = null;
String columnCommentSql = null;
switch (type) {
case MYSQL:
case QMHA:
tableCommentSql = " select `TABLE_COMMENT`" +
" from information_schema.`TABLES`" +
" where `TABLE_SCHEMA` = '%1$s' and `TABLE_NAME` = '%2$s'" ;
columnCommentSql = " select `COLUMN_NAME`, `COLUMN_COMMENT`" +
" from information_schema.`COLUMNS`" +
" where `TABLE_SCHEMA` = '%1$s' and `TABLE_NAME` = '%2$s'";
break;
case POSTGRES:
tableCommentSql = " select obj_description(('%1$s' || '.' || '%2$s')::regclass, 'pg_class')";
columnCommentSql = " select column_name, col_description((table_schema || '.' || table_name)::regclass, ordinal_position)" +
" from information_schema.columns" +
" where table_schema = '%1$s' and table_name = '%2$s'";
break;
}
//fetch the table comment
pool.transformData(String.format(tableCommentSql, dbName, tableName), new DataTransformer() {
@Override
public void transform(ResultSet rs) throws Throwable {
tableComments = rs.getString(1);
}
});
//fetch the columns and their comments, storing them into columnComments keyed by column name
pool.transformData(String.format(columnCommentSql, dbName, tableName), new DataTransformer() {
@Override
public void transform(ResultSet rs) throws Throwable {
columnComments.put(rs.getString(1), rs.getString(2));
}
});
}
transformData, used above, is shown below; after executing the query it invokes the transformer on each row.
Here the "transformation" simply collects the fetched values into tableComments and columnComments.
public void transformData(String sql, DataTransformer transformer) {
try (
//get a connection, prepare and execute the query
Connection connection = getConnection();
PreparedStatement statement = connection.prepareStatement(sql)
) {
statement.execute();
ResultSet rs = statement.getResultSet();
while (rs.next()) {
try {
//invoke transform on each row of the result
transformer.transform(rs);
} catch (Throwable throwable) {
logger.warn("got Exception: ", throwable);
transformer.exceptionHandler(rs, throwable);
}
}
rs.close();
} catch (SQLException e) {
logger.error("Error while executing sql: " + sql, e);
Throwables.propagateIfPossible(e);
}
}
Back in the run code, the following statements execute next:
try (
final Connection connection = connectionPool.getConnection();
final PreparedStatement statement = connection.prepareStatement(pretendQuery())
) {
statement.execute();
final ResultSet rs = statement.getResultSet();
//the column metadata is available here
final ResultSetMetaData meta = rs.getMetaData();
for (int i = 1; i <= meta.getColumnCount(); i++) {
if (i != 1) builder.append(",");
builder.append("\n");
//toHiveType maps to the corresponding Hive type
HiveMappedType hiveType = toHiveType(meta.getColumnType(i), meta.getColumnClassName(i));
String typeString;
if (hiveType.isPrimitive()) {
typeString = hiveType.hiveType();
} else if (hiveType == MAP) {
typeString = "map<string,string>"; // generic parameters restored; string keys/values assumed
} else if (hiveType == DECIMAL) {
typeString = String.format("decimal(%2$d, %1$d)", meta.getScale(i), meta.getPrecision(i));
} else {
logger.warn("cannot map type {}:{}:{}.", meta.getColumnName(i), meta.getColumnType(i), meta.getColumnClassName(i));
typeString = "-=≡[ FILL TYPE HERE ]≡=-";
}
builder.append(String.format(" %-25s %-10s comment '%s'",
meta.getColumnName(i),
typeString,
Optional.ofNullable(comments.columnComments.get(meta.getColumnName(i))).orElse(FILL_COMMENT_HERE)));
}
rs.close();
} catch (SQLException e) {
Throwables.propagateIfPossible(e);
}
toHiveType takes two parameters: the java.sql.Types code of the column and its Java class name. It maps them to the corresponding Hive type:
public static HiveMappedType toHiveType(int sqlType, final String javaType) {
if (javaType.equals(String.class.getName()) || AS_STRING_TYPES.contains(sqlType)) {
return STRING;
} else if (javaType.equals(Byte[].class.getName()) || AS_BINARY_TYPES.contains(sqlType)) {
return BINARY;
} else if (javaType.equals(BigInteger.class.getName()) || javaType.equals(Long.class.getName())) {
return BIGINT;
} else if (javaType.equals(Integer.class.getName())) {
return INT;
} else if (javaType.equals(Short.class.getName())) {
return SMALLINT;
} else if (javaType.equals(Byte.class.getName())) {
return TINYINT;
} else if (javaType.equals(Float.class.getName()) || sqlType == Types.FLOAT) {
return FLOAT;
} else if (javaType.equals(Double.class.getName()) || sqlType == Types.DOUBLE) {
return DOUBLE;
} else if (javaType.equals(Timestamp.class.getName()) || sqlType == Types.TIMESTAMP || sqlType == Types.TIMESTAMP_WITH_TIMEZONE) {
return TIMESTAMP;
} else if (javaType.equals(Date.class.getName()) || sqlType == Types.DATE) {
return DATE;
} else if (javaType.equals(Boolean.class.getName()) || sqlType == Types.BOOLEAN) {
return BOOLEAN;
} else if (sqlType == Types.DECIMAL || javaType.equals(BigDecimal.class.getName())) {
return DECIMAL;
} else if (sqlType == Types.STRUCT) {
return STRUCT;
} else if (sqlType == Types.ARRAY) {
return ARRAY;
} else if (sqlType == Types.OTHER) {
if (Map.class.getName().equals(javaType)) {
return MAP;
}
}
return NULL;
}
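A few illustrative calls, with class names as a JDBC driver typically reports them:

// illustrative only
toHiveType(Types.VARCHAR, "java.lang.String");      // -> STRING
toHiveType(Types.BIGINT,  "java.lang.Long");        // -> BIGINT
toHiveType(Types.DECIMAL, "java.math.BigDecimal");  // -> DECIMAL
toHiveType(Types.OTHER,   "java.util.Map");         // -> MAP (e.g. a PG hstore column)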
Primitive types here are the directly mappable scalars such as INT, STRING, DOUBLE, and FLOAT.
The create-table DDL is assembled in this way and then printed out.
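For illustration, the printed DDL for a small MySQL table would look roughly like this (hypothetical table; whatever follows the column list, such as a stored-as-ORC clause, is outside the quoted code):

create external table dwd.user_info (
  id                        bigint     comment 'primary key',
  name                      string     comment 'user name'
)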
3. The export task
run
First it checks that the target directory exists and is empty.
Then it builds an OrcTransformer and configures it:
public void run() throws Throwable {
if (!HDFSUtils.checkDirExistence(opts.exportLocation)) {
logger.error("Dir [{}] does not exist.", opts.exportLocation);
throw new RuntimeException("destination dir does not exist");
}
if (!HDFSUtils.checkDirEmpty(opts.exportLocation)) {
logger.error("Dir [{}] is not empty.", opts.exportLocation);
throw new RuntimeException("destination dir not empty");
}
//build the connection pool
connectionPool = new ConnectionPool(opts.connectionDescription, opts.dbType);
if (opts.dbType == ConnectionPool.DBType.POSTGRES) {
String[] tableSchema = opts.schemaAndTable.split("\\.", 2); // split takes a regex, so the dot must be escaped
if (tableSchema.length == 2) {
getPgTypeMap(tableSchema[0], tableSchema[1]);
} else {
getPgTypeMap("public", tableSchema[0]);
}
}
OrcTransformer orcTransformer = new OrcTransformer();
//set path and split: path is the target directory, e.g. /user/qhstats/temp/test-pg-types; split is the number of rows per ORC file
orcTransformer.setPath(opts.exportLocation);
orcTransformer.setSplit(opts.split);
//build TableComments, initializing the db name and table name
TableComments tableComment = new TableComments(opts.dbType, opts.connectionDescription.get("dbname"), opts.schemaAndTable);
tableComment.getComments(connectionPool);
//attach the table comments to the transformer
orcTransformer.setTableComment(tableComment);
//record the start time (batchStartRef)
orcTransformer.setStartTimeRef(currentTimeMillis());
//iterate over the paging SQLs
for (String sql: new PagingSql()) {
connectionPool.transformData(sql, orcTransformer);
}
long elapsed = System.currentTimeMillis() - orcTransformer.startTimeRef;
logger.info("All done, {} rows / {} ms = {} rows/s", orcTransformer.rows, elapsed, orcTransformer.rows * 1000.0 / elapsed);
orcTransformer.close();
}
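HDFSUtils is not quoted; here is a minimal sketch of what the two checks plausibly do with the Hadoop FileSystem API:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HDFSUtils {
    // hypothetical reconstruction of the two checks used in run()
    public static boolean checkDirExistence(String dir) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        Path p = new Path(dir);
        return fs.exists(p) && fs.getFileStatus(p).isDirectory();
    }

    public static boolean checkDirEmpty(String dir) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        return !fs.listFiles(new Path(dir), false).hasNext();
    }
}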
OrcTransformer extends DataTransformer, whose definition is as follows:
abstract public class DataTransformer {
public abstract void transform(ResultSet rs) throws Throwable;
void exceptionHandler(ResultSet rs, Throwable throwable) {
// just wrap it with RuntimeException
Throwables.propagateIfPossible(throwable);
}
// init transformer
public void init() {
}
// finish transformer
public void close() {
}
}
OrcTransformer holds a TableComments.
PagingSql is an Iterable wrapping an iterator; it is constructed as follows:
PagingSql() {
if (opts.partitionKey != null) getIdRange();
baseSql = buildBasicSql();
}
getIdRange is called to fetch maxId and minId:
private void getIdRange() {
connectionPool.transformData(
String.format("select max(%1$s) as maxId, min(%1$s) as minId from %2$s", opts.partitionKey, opts.schemaAndTable),
new DataTransformer() {
@Override
public void transform(ResultSet rs) throws Throwable {
maxId = rs.getLong("maxId");
minId = rs.getLong("minId");
}
});
logger.info("table {}.{} range from {} to {}", opts.schemaAndTable, opts.partitionKey, minId, maxId);
}
partitionKey is usually set to id. buildBasicSql is shown below; note that an extra where condition can be appended:
private String buildBasicSql() {
StringBuilder builder = new StringBuilder();
builder.append("select ")
.append(Joiner.on(", ").join(opts.fields))
.append(" from ").append(opts.schemaAndTable)
.append(" where (1 = 1)");
if (opts.where != null) builder.append(" and (").append(opts.where).append(")");
return builder.toString();
}
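For example (hypothetical values), with fields = [id, name], schemaAndTable = public.users, and where = "dt >= '2020-01-01'", this yields: select id, name from public.users where (1 = 1) and (dt >= '2020-01-01').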
Because PagingSql is an Iterable, the for-each loop above keeps pulling paged SQL from it:
public Iterator<String> iterator() {
if (opts.partitionKey == null) {
logger.info("no paging: {}", baseSql);
return Collections.singletonList(baseSql).iterator();
}
return new PagingSqlIterator();
}
It returns a PagingSqlIterator, which records how many pages there are to export.
The next method computes the lower and upper bounds of the current page and returns the string pagingRange, the paged query:
PagingSqlIterator() {
currentPage = 0;
totalPages = (maxId - minId + 1) / opts.batchSize + 1;
}
@Override
public boolean hasNext() {
return currentPage <= totalPages;
}
@Override
public String next() {
long lower = minId + currentPage * opts.batchSize;
long upper = lower + opts.batchSize - 1;
String pagingRange = String.format("%s and (%s between %d and %d)", baseSql, opts.partitionKey, lower, upper);
logger.info("paging {} / {}", currentPage, totalPages);
currentPage++;
return pagingRange;
}
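As a worked example: with minId = 1, maxId = 10, and batchSize = 4, totalPages = (10 - 1 + 1) / 4 + 1 = 3, and because hasNext uses <=, the iterator emits four ranges, [1, 4], [5, 8], [9, 12], and [13, 16]; the trailing ranges may extend past maxId and simply match fewer or no rows.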
Each paging SQL is then handed to transformData (the same method shown earlier), which executes the query and feeds every row to the transformer.
For each row of the query result, the DataTransformer's transform method, here OrcTransformer's, is invoked:
public void transform(ResultSet rs) throws Throwable {
this.rs = rs;
this.meta = rs.getMetaData();
//lazily obtain the OrcWriter
OrcWriter orcMeta = lazyOrcWriter.get();
//every split rows, close the current file and start a new one
if (rows % split == 0) orcMeta.rotateFile();
//create a new row (struct) object via the SettableStructObjectInspector
Object line = orcMeta.rootInspector.create();
for (int i = 0; i < orcMeta.fields.size(); i++) {
Object data = rs.getObject(i + 1);
if (data instanceof BigDecimal) {
data = HiveDecimal.create((BigDecimal) data);
} else if (data instanceof BigInteger) {
data = ((BigInteger) data).longValue();
}
//set the field value on the row via setStructFieldData
orcMeta.rootInspector.setStructFieldData(line, orcMeta.fields.get(i), data);
}
//append the row to the ORC writer
orcMeta.orcWriter.addRow(line);
rows++;
if (rows % scale == 0) {
long current = System.currentTimeMillis();
long elapsed = current - batchStartRef;
batchStartRef = current;
logger.info("{} rows exported, [{} rows / {} ms = {} rows/s] ",
rows, scale, elapsed, scale * 1000.0 / elapsed);
if (scale * 10 == rows && scale <= 100 * 1000) scale *= 10;
}
}
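lazyOrcWriter is not quoted; from the .get() call it is presumably a memoized supplier that builds the OrcWriter helper (inspector, fields, writer options) once the first row's metadata is available, for instance with Guava (the OrcWriter constructor arguments here are assumptions):

import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;

// hypothetical sketch; built on first access, once this.meta has been set
private final Supplier<OrcWriter> lazyOrcWriter = Suppliers.memoize(new Supplier<OrcWriter>() {
    @Override
    public OrcWriter get() {
        return new OrcWriter(meta, tableComment, path);
    }
});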
rotateFile closes the current writer, if any, and opens a new ORC output file; as seen above, it is triggered every split rows, including before the very first row, which creates the first file:
void rotateFile() {
if (orcWriter != null) {
try {
orcWriter.close();
} catch (IOException e) {
logger.warn("Cannot close last file", e);
}
}
//compute the full path of the next ORC file
Path currentPath = new Path(getCurrentFileName());
try {
//create a new ORC writer for that path
orcWriter = OrcFile.createWriter(currentPath, options);
} catch (IOException e) {
logger.warn("Cannot open file for writing", e);
}
fileSequence++;
}
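getCurrentFileName is not quoted; given the path and fileSequence fields, it presumably produces sequentially numbered files under the export directory, along the lines of this sketch (the naming pattern is an assumption):

// hypothetical reconstruction
private String getCurrentFileName() {
    return String.format("%s/part-%05d.orc", path, fileSequence);
}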