Sqoop data synchronization flow for decimal
Specifying the input class
- ImportJobContext.class
public ImportJobContext(String table, String jar, SqoopOptions opts, Path destination) {
  this.tableName = table;
  this.jarFile = jar;
  if (this.jarFile == null) {
    // Fall back to the jar that contains Hadoop's Configuration class
    this.jarFile = Jars.getJarPathForClass(Configuration.class);
  }
  this.options = opts;
  // The input format defaults to DataDrivenDBInputFormat
  this.inputFormatClass = DataDrivenDBInputFormat.class;
  this.destination = destination;
}
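A minimal sketch of exercising this constructor, assuming Sqoop's org.apache.sqoop.manager.ImportJobContext and the usual getInputFormat() accessor that simply returns the field set above:

import org.apache.hadoop.fs.Path;
import org.apache.sqoop.SqoopOptions;
import org.apache.sqoop.manager.ImportJobContext;

public class ImportContextSketch {
  public static void main(String[] args) {
    SqoopOptions opts = new SqoopOptions();
    // Passing a null jar makes the constructor fall back to the jar
    // containing Hadoop's Configuration class.
    ImportJobContext ctx = new ImportJobContext("orders", null, opts, new Path("/tmp/orders"));
    // Expected to print DataDrivenDBInputFormat, the default set above.
    System.out.println(ctx.getInputFormat());
  }
}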
Specifying the ORM class
- DataDrivenImportJobOverride.java
// Specify the generated ORM class and the split query for DataDrivenDBInputFormat
DataDrivenDBInputFormat.setInput(job, DBWritable.class, mgr.escapeTableName(tableName), whereClause, mgr.escapeColName(splitByCol), sqlColNames);
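For reference, the same Hadoop API can be wired up outside of Sqoop as follows; the JDBC settings, table and column names below are placeholders, and in Sqoop the record class is effectively the generated ORM class rather than the bare DBWritable interface:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.db.DataDrivenDBInputFormat;

public class DbInputSketch {
  public static void configure(Job job) {
    Configuration conf = job.getConfiguration();
    // JDBC connection settings (placeholders)
    DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
        "jdbc:mysql://localhost/shop", "user", "password");
    // table, optional WHERE condition, split-by column, projected columns
    DataDrivenDBInputFormat.setInput(job, DBWritable.class,
        "orders", null, "id", "id", "amount", "updated_at");
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
  }
}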
Specifying the input class
- DataDrivenImportJobOverride.java
// Set the input format class on the job
job.setInputFormatClass(this.inputFormatClass);
Specifying the output class
- DataDrivenImportJobOverride.java
protected Class<? extends OutputFormat> getOutputFormatClass() throws ClassNotFoundException {
  if (this.isHCatJob) {
    LOG.debug("Returning HCatOutputFormat for output format");
    return SqoopHCatUtilities.getOutputFormatClass();
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
    return RawKeyTextOutputFormat.class;
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
    return SequenceFileOutputFormat.class;
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
    return AvroOutputFormat.class;
  } else {
    // DatasetKeyOutputFormat has been replaced with AvroParquetOutputFormat
    return this.options.getFileLayout() == SqoopOptions.FileLayout.ParquetFile ? AvroParquetOutputFormat.class : null;
  }
}
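When the layout is ParquetFile, the job therefore writes through parquet-avro's AvroParquetOutputFormat. A minimal sketch of that wiring, where the schema would be the Avro schema Sqoop generates for the table:

import org.apache.avro.Schema;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.avro.AvroParquetOutputFormat;

public class ParquetOutputSketch {
  public static void configure(Job job, Schema tableSchema) {
    // AvroParquetOutputFormat writes GenericRecord values as Parquet,
    // using the Avro schema supplied here.
    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    AvroParquetOutputFormat.setSchema(job, tableSchema);
  }
}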
Specifying the output path
- ImportJobBase.class
Path outputPath = this.context.getDestination();
FileOutputFormat.setOutputPath(job, outputPath);
Specifying the mapper class
- DataDrivenImportJobOverride.java
protected Class<? extends Mapper> getMapperClass() {
  if (this.options.getHCatTableName() != null) {
    return SqoopHCatUtilities.getImportMapperClass();
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
    return TextImportMapper.class;
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
    return SequenceFileImportMapper.class;
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
    return AvroImportMapper.class;
  } else {
    return this.options.getFileLayout() == SqoopOptions.FileLayout.ParquetFile ? HadoopParquetImportMapperOverride.class : null;
  }
}
The map method
- ParquetImportMapperOverride.java
protected void map(LongWritable key, SqoopRecord val, Mapper<LongWritable, SqoopRecord, KEYOUT, VALOUT>.Context context) throws IOException, InterruptedException {
  try {
    val.loadLargeObjects(this.lobLoader);
  } catch (SQLException var5) {
    throw new IOException(var5);
  }
  // Convert the SqoopRecord's field map into an Avro GenericRecord, honoring the decimal options
  GenericRecord record = AvroUtilOverride.toGenericRecord(val.getFieldMap(), this.schema, this.bigDecimalFormatString, this.bigDecimalPadding);
  this.write(context, record);
}
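Conceptually, toGenericRecord copies each column of the SqoopRecord's field map into an Avro record after converting the value (see the toAvro method below). A simplified, hypothetical version of that copy loop:

import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

public class FieldMapToAvro {
  // Simplified stand-in for toGenericRecord: the real code also converts each
  // value (dates, BLOBs, BigDecimal, ...) and normalizes column names into
  // valid Avro field names before calling put().
  public static GenericRecord toRecord(Map<String, Object> fieldMap, Schema schema) {
    GenericRecord record = new GenericData.Record(schema);
    for (Map.Entry<String, Object> entry : fieldMap.entrySet()) {
      record.put(entry.getKey(), entry.getValue());
    }
    return record;
  }
}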
The toAvro method
- AvroUtilOverride.java
public static Object toAvro(Object o, Schema.Field field, boolean bigDecimalFormatString, boolean bigDecimalPaddingEnabled) {
  if (o instanceof BigDecimal) {
    if (bigDecimalPaddingEnabled) {
      o = padBigDecimal((BigDecimal)o, field.schema());
    }
    if (!isDecimal(field)) {
      if (bigDecimalFormatString) {
        return ((BigDecimal)o).toPlainString();
      }
      return o.toString();
    }
  } else {
    if (o instanceof Date) {
      return ((Date)o).getTime();
    }
    if (o instanceof Time) {
      return ((Time)o).getTime();
    }
    if (o instanceof Timestamp) {
      return ((Timestamp)o).getTime();
    }
    if (o instanceof BytesWritable) {
      BytesWritable bw = (BytesWritable)o;
      return ByteBuffer.wrap(bw.getBytes(), 0, bw.getLength());
    }
    if (o instanceof BlobRef) {
      BlobRef br = (BlobRef)o;
      byte[] bytes = br.isExternal() ? br.toString().getBytes() : (byte[])br.getData();
      return ByteBuffer.wrap(bytes);
    }
    if (o instanceof ClobRef) {
      throw new UnsupportedOperationException("ClobRef not supported");
    }
  }
  return o;
}
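Note that bigDecimalFormatString only matters when the field is not a decimal logical type; it chooses between toPlainString() and toString(), which differ once scientific notation shows up:

import java.math.BigDecimal;

public class BigDecimalFormatDemo {
  public static void main(String[] args) {
    BigDecimal d = new BigDecimal("1E+3");
    System.out.println(d.toString());       // 1E+3  (may use scientific notation)
    System.out.println(d.toPlainString());  // 1000  (never uses scientific notation)
  }
}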
GenericData.Record's put method
- GenericData.java
@Override
public void put(String key, Object value) {
  Schema.Field field = schema.getField(key);
  if (field == null)
    throw new AvroRuntimeException("Not a valid schema field: " + key);
  values[field.pos()] = value;
}
Step 2: the merge job entry point
- ImportDecimalTool.java
if (options.isAppendMode()) {
  AppendUtils app = new AppendUtils(context);
  app.append();
} else if (options.getIncrementalMode() == SqoopOptions.IncrementalMode.DateLastModified) {
  this.lastModifiedMerge(options, context);
}
Merge directories
- ImportDecimalTool.java
protected void lastModifiedMerge(SqoopOptions options, ImportJobContext context) throws IOException {
  if (context.getDestination() != null) {
    // Final destination: the directory given by --target-dir
    Path userDestDir = this.getOutputPath(options, context.getTableName(), false);
    FileSystem fs = userDestDir.getFileSystem(options.getConf());
    if (fs.exists(context.getDestination())) {
      LOG.info("Final destination exists, will run merge job.");
      if (fs.exists(userDestDir)) {
        String tableClassName = null;
        if (!context.getConnManager().isORMFacilitySelfManaged()) {
          tableClassName = (new TableClassName(options)).getClassForTable(context.getTableName());
        }
        // Temporary directory that receives the merge output
        Path destDir = this.getOutputPath(options, context.getTableName());
        options.setExistingJarName(context.getJarFile());
        options.setClassName(tableClassName);
        options.setMergeOldPath(userDestDir.toString());
        options.setMergeNewPath(context.getDestination().toString());
        options.setTargetDir(destDir.toString());
        this.loadJars(options.getConf(), context.getJarFile(), ClassWriter.toJavaIdentifier("codegen_" + context.getTableName()));
        HadoopParquetMergeJobConfiguratorOverride parquetMergeJobConfigurator = new HadoopParquetMergeJobConfiguratorOverride();
        MergeJobOverride mergeJob = new MergeJobOverride(options, parquetMergeJobConfigurator);
        if (mergeJob.runMergeJob()) {
          // Rename the old destination to a temporary directory, promote the merge
          // output to the destination, then delete the renamed-away old data
          Path tmpDir = this.getOutputPath(options, context.getTableName());
          fs.rename(userDestDir, tmpDir);
          fs.rename(destDir, userDestDir);
          fs.delete(tmpDir, true);
        } else {
          LOG.error("Merge MapReduce job failed!");
        }
        this.unloadJars();
      } else {
        if (!fs.exists(userDestDir.getParent())) {
          fs.mkdirs(userDestDir.getParent());
        }
        LOG.info("Moving data from temporary directory " + context.getDestination() + " to final destination " + userDestDir);
        if (!fs.rename(context.getDestination(), userDestDir)) {
          throw new RuntimeException("Couldn't move data from temporary directory " + context.getDestination() + " to final destination " + userDestDir);
        }
      }
    }
  }
}
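The three filesystem calls after a successful merge amount to a directory swap. As a standalone sketch with illustrative paths:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SwapDirsSketch {
  // Park the old destination under a temporary name, promote the merged
  // output into its place, then remove the parked copy.
  public static void swap(Configuration conf, Path userDestDir, Path mergedDir, Path tmpDir) throws IOException {
    FileSystem fs = userDestDir.getFileSystem(conf);
    fs.rename(userDestDir, tmpDir);
    fs.rename(mergedDir, userDestDir);
    fs.delete(tmpDir, true);
  }
}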
The runMergeJob method
- MergeJobOverride.java
public boolean runMergeJob() throws IOException {
  Configuration conf = this.options.getConf();
  Job job = this.createJob(conf);
  String userClassName = this.options.getClassName();
  if (null == userClassName) {
    throw new IOException("Record class name not specified with --class-name.");
  } else {
    String existingJar = this.options.getExistingJarName();
    if (existingJar != null) {
      LOG.debug("Setting job jar to user-specified jar: " + existingJar);
      job.getConfiguration().set("mapred.jar", existingJar);
    } else {
      try {
        Class<? extends Object> userClass = conf.getClassByName(userClassName);
        if (null != userClass) {
          String userJar = Jars.getJarPathForClass(userClass);
          LOG.debug("Setting job jar based on user class " + userClassName + ": " + userJar);
          job.getConfiguration().set("mapred.jar", userJar);
        } else {
          LOG.warn("Specified class " + userClassName + " is not in a jar. MapReduce may not find the class");
        }
      } catch (ClassNotFoundException var12) {
        throw new IOException(var12);
      }
    }
    try {
      Path oldPath = new Path(this.options.getMergeOldPath());
      Path newPath = new Path(this.options.getMergeNewPath());
      Configuration jobConf = job.getConfiguration();
      oldPath = FileSystemUtil.makeQualified(oldPath, jobConf);
      newPath = FileSystemUtil.makeQualified(newPath, jobConf);
      this.propagateOptionsToJob(job);
      FileInputFormat.addInputPath(job, oldPath);
      FileInputFormat.addInputPath(job, newPath);
      jobConf.set("sqoop.merge.old.path", oldPath.toString());
      jobConf.set("sqoop.merge.new.path", newPath.toString());
      jobConf.set("sqoop.merge.key.col", this.options.getMergeKeyCol());
      jobConf.set("sqoop.merge.class", userClassName);
      FileOutputFormat.setOutputPath(job, new Path(this.options.getTargetDir()));
      ExportJobBase.FileType fileType = ExportJobBase.getFileType(jobConf, oldPath);
      switch (fileType) {
        case PARQUET_FILE:
          Path finalPath = new Path(this.options.getTargetDir());
          finalPath = FileSystemUtil.makeQualified(finalPath, jobConf);
          this.parquetMergeJobConfigurator.configureParquetMergeJob(jobConf, job, oldPath, newPath, finalPath);
          break;
        case AVRO_DATA_FILE:
          this.configueAvroMergeJob(conf, job, oldPath, newPath);
          break;
        case SEQUENCE_FILE:
          job.setInputFormatClass(SequenceFileInputFormat.class);
          job.setOutputFormatClass(SequenceFileOutputFormat.class);
          job.setMapperClass(MergeRecordMapper.class);
          job.setReducerClass(MergeReducer.class);
          break;
        default:
          job.setMapperClass(MergeTextMapper.class);
          job.setOutputFormatClass(RawKeyTextOutputFormat.class);
          job.setReducerClass(MergeReducer.class);
      }
      jobConf.set("mapred.output.key.class", userClassName);
      job.setOutputValueClass(NullWritable.class);
      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(MergeRecord.class);
      this.cacheJars(job, (ConnManager)null);
      this.setJob(job);
      return this.runJob(job);
    } catch (InterruptedException var10) {
      throw new IOException(var10);
    } catch (ClassNotFoundException var11) {
      throw new IOException(var11);
    }
  }
}
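The sqoop.merge.old.path / sqoop.merge.new.path keys let the merge mappers tell which data set a split came from, which is what ultimately drives MergeRecord.isNewRecord() in the reducer. A rough sketch of that idea (a guess at the mechanism, not Sqoop's actual MergeMapperBase code):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Sketch only: decide whether the current split belongs to the new import
// or the old data set by comparing its path against the job configuration.
public abstract class OldVsNewSketch<KI, VI, KO, VO> extends Mapper<KI, VI, KO, VO> {
  protected boolean isNew;

  @Override
  protected void setup(Context context) {
    Path splitPath = ((FileSplit) context.getInputSplit()).getPath();
    String newPath = context.getConfiguration().get("sqoop.merge.new.path");
    isNew = newPath != null && splitPath.toString().startsWith(newPath);
  }
}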
Configuring the output
- MergeJobOverride.java
/**
 * Configure the map and reduce output key/value classes
 */
jobConf.set("mapred.output.key.class", userClassName);
job.setOutputValueClass(NullWritable.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(MergeRecord.class);
Configuring the mapper and reducer
- HadoopParquetMergeJobConfiguratorOverride.java
public void configureParquetMergeJob(Configuration conf, Job job, Path oldPath, Path newPath, Path finalPath) throws IOException {
  try {
    LOG.info("Trying to merge parquet files");
    job.setOutputKeyClass(Void.class);
    // Configure the mapper and reducer
    job.setMapperClass(MergeParquetMapperOverride.class);
    job.setReducerClass(MergeParquetReducerOverride.class);
    job.setOutputValueClass(GenericRecord.class);
    Schema avroSchema = this.loadAvroSchema(conf, oldPath);
    this.validateNewPathAvroSchema(AvroUtilOverride.getAvroSchemaFromParquetFile(newPath, conf), avroSchema);
    job.setInputFormatClass(this.exportJobConfigurator.getInputFormatClass());
    AvroParquetInputFormat.setAvroReadSchema(job, avroSchema);
    conf.set("parquetjob.avro.schema", avroSchema.toString());
    this.importJobConfigurator.configureAvroSchema(job, avroSchema);
    this.importJobConfigurator.configureOutputCodec(job);
    job.setOutputFormatClass(this.importJobConfigurator.getOutputFormatClass());
  } catch (Exception var7) {
    throw new IOException(var7);
  }
}
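loadAvroSchema / getAvroSchemaFromParquetFile recover the Avro schema from the existing Parquet data. One plausible way to do that with the parquet-mr API is to read the file footer and convert its Parquet schema back to Avro (the override may instead read the Avro schema stored in the file's key/value metadata):

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

public class ParquetSchemaSketch {
  public static Schema readAvroSchema(Path file, Configuration conf) throws Exception {
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
      MessageType parquetSchema = reader.getFooter().getFileMetaData().getSchema();
      return new AvroSchemaConverter(conf).convert(parquetSchema);
    }
  }
}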
The merge job's map method
- MergeParquetMapperOverride.java
@Override
protected void map(GenericRecord key, GenericRecord val, Mapper.Context context)
    throws IOException, InterruptedException {
  // Convert the GenericRecord into a SqoopRecord and emit it as a (mergeKey, SqoopRecord) pair
  processRecord(toSqoopRecord(val), context);
}
The merge job's reduce method
- MergeParquetReducer.java
public void reduce(Text key, Iterable<MergeRecord> vals, Reducer<Text, MergeRecord, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
  SqoopRecord bestRecord = null;
  try {
    Iterator var5 = vals.iterator();
    // Prefer the MergeRecord flagged as new; fall back to the old record if no new one exists
    while (var5.hasNext()) {
      MergeRecord mergeRecord = (MergeRecord)var5.next();
      if (null == bestRecord && !mergeRecord.isNewRecord()) {
        bestRecord = (SqoopRecord)mergeRecord.getSqoopRecord().clone();
      } else if (mergeRecord.isNewRecord()) {
        bestRecord = (SqoopRecord)mergeRecord.getSqoopRecord().clone();
      }
    }
  } catch (CloneNotSupportedException var7) {
    throw new IOException(var7);
  }
  if (null != bestRecord) {
    // Convert the chosen record back into a GenericRecord
    GenericRecord record = AvroUtil.toGenericRecord(bestRecord.getFieldMap(), this.schema, this.bigDecimalFormatString);
    this.write(context, record);
  }
}
Original source
/**
 * Will create union, because each type is assumed to be nullable.
 *
 * @param sqlType Original SQL type (might be overridden by user)
 * @param columnName Column name from the query
 * @param precision Fixed point precision
 * @param scale Fixed point scale
 * @return Schema
 */
public Schema toAvroSchema(int sqlType, String columnName, Integer precision, Integer scale) {
  List<Schema> childSchemas = new ArrayList<Schema>();
  childSchemas.add(Schema.create(Schema.Type.NULL));
  if (isLogicalTypeConversionEnabled() && isLogicalType(sqlType)) {
    childSchemas.add(
        toAvroLogicalType(columnName, sqlType, precision, scale)
            .addToSchema(Schema.create(Type.BYTES)));
  } else {
    childSchemas.add(Schema.create(toAvroType(columnName, sqlType)));
  }
  return Schema.createUnion(childSchemas);
}
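With logical types enabled, the stock code therefore produces a ["null", decimal-over-bytes] union. The same schema can be built directly with the Avro API, e.g. for a DECIMAL(10,2) column:

import java.util.Arrays;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;

public class DecimalUnionSketch {
  public static void main(String[] args) {
    Schema decimalBytes = LogicalTypes.decimal(10, 2)
        .addToSchema(Schema.create(Schema.Type.BYTES));
    Schema nullableDecimal = Schema.createUnion(
        Arrays.asList(Schema.create(Schema.Type.NULL), decimalBytes));
    System.out.println(nullableDecimal.toString(true));
  }
}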
Modified source
public Schema toAvroSchema(int sqlType, String columnName, Integer precision, Integer scale) {
  List<Schema> childSchemas = new ArrayList();
  childSchemas.add(Schema.create(Schema.Type.NULL));
  if (this.options.getConf().getBoolean("sqoop.avro.logical_types.decimal.enable", false) || isLogicalType(sqlType)) {
    if (precision > 18) {
      // Large decimals keep the bytes-backed representation
      childSchemas.add(this.toAvroLogicalType(columnName, sqlType, precision, scale)
          .addToSchema(Schema.create(Schema.Type.BYTES)));
    } else {
      // Small decimals use a fixed schema whose size is looked up from the precision
      childSchemas.add(this.toAvroLogicalType(columnName, sqlType, precision, scale)
          .addToSchema(Schema.createFixed(columnName, (String)null, (String)null, AvroSchemaGeneratorOverride.PRECISION_TO_BYTE_COUNT[precision - 1])));
    }
  } else {
    childSchemas.add(Schema.create(this.toAvroType(columnName, sqlType)));
  }
  return Schema.createUnion(childSchemas);
}
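For precisions of 18 or less, the modified version backs the decimal with a fixed schema sized from the precision. A sketch of that construction; the bytesForPrecision formula below is the usual "smallest two's-complement size" calculation and is assumed to match the PRECISION_TO_BYTE_COUNT table:

import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;

public class FixedDecimalSketch {
  // Smallest number of bytes able to hold a signed integer with the given
  // number of decimal digits.
  static int bytesForPrecision(int precision) {
    return (int) Math.ceil((Math.log(Math.pow(10, precision)) / Math.log(2) + 1) / 8);
  }

  public static void main(String[] args) {
    int precision = 10, scale = 2;
    Schema fixed = Schema.createFixed("amount", null, null, bytesForPrecision(precision));
    Schema decimalFixed = LogicalTypes.decimal(precision, scale).addToSchema(fixed);
    System.out.println(decimalFixed);  // a 5-byte fixed carrying decimal(10,2)
  }
}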
The difference between fixed and bytes
fixed: allocates a fixed number of bytes, derived from the precision, to represent a decimal value.
bytes: a length prefix (an Avro long) first says how many bytes the decimal needs, and the bytes that follow hold the decimal value itself.
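A small demo of that difference using Avro's built-in decimal conversion; the value, precision and fixed size are just illustrative:

import java.math.BigDecimal;
import java.nio.ByteBuffer;
import org.apache.avro.Conversions;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericFixed;

public class FixedVsBytesDemo {
  public static void main(String[] args) {
    Conversions.DecimalConversion conv = new Conversions.DecimalConversion();
    LogicalTypes.Decimal logical = LogicalTypes.decimal(10, 2);
    Schema bytesSchema = logical.addToSchema(Schema.create(Schema.Type.BYTES));
    Schema fixedSchema = logical.addToSchema(Schema.createFixed("amount", null, null, 5));

    BigDecimal value = new BigDecimal("12345.67");
    ByteBuffer asBytes = conv.toBytes(value, bytesSchema, logical);
    GenericFixed asFixed = conv.toFixed(value, fixedSchema, logical);

    // bytes: only as many bytes as the unscaled value needs (plus a length prefix on disk)
    System.out.println("bytes payload length = " + asBytes.remaining());  // 3
    // fixed: always exactly the declared size, 5 bytes here
    System.out.println("fixed payload length = " + asFixed.bytes().length);  // 5
  }
}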