Sqoop data synchronization flow for decimal
Specifying the input class
- ImportJobContext.class
public ImportJobContext(String table, String jar, SqoopOptions opts, Path destination) {
  this.tableName = table;
  this.jarFile = jar;
  if (this.jarFile == null) {
    // Fall back to the jar that contains Hadoop's Configuration class
    this.jarFile = Jars.getJarPathForClass(Configuration.class);
  }
  this.options = opts;
  // The input format defaults to DataDrivenDBInputFormat
  this.inputFormatClass = DataDrivenDBInputFormat.class;
  this.destination = destination;
}
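A minimal sketch of exercising this constructor, assuming Sqoop's org.apache.sqoop.manager.ImportJobContext and the usual getInputFormat() accessor that simply returns the field set above:

import org.apache.hadoop.fs.Path;
import org.apache.sqoop.SqoopOptions;
import org.apache.sqoop.manager.ImportJobContext;

public class ImportContextSketch {
  public static void main(String[] args) {
    SqoopOptions opts = new SqoopOptions();
    // Passing a null jar makes the constructor fall back to the jar
    // containing Hadoop's Configuration class.
    ImportJobContext ctx = new ImportJobContext("orders", null, opts, new Path("/tmp/orders"));
    // Expected to print DataDrivenDBInputFormat, the default set above.
    System.out.println(ctx.getInputFormat());
  }
}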
Specifying the ORM class
- DataDrivenImportJobOverride.java
// Specify the generated ORM class and the split query for DataDrivenDBInputFormat
DataDrivenDBInputFormat.setInput(job, DBWritable.class, mgr.escapeTableName(tableName), whereClause, mgr.escapeColName(splitByCol), sqlColNames);
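For reference, the same Hadoop API can be wired up outside of Sqoop as follows; the JDBC settings, table and column names below are placeholders, and in Sqoop the record class is effectively the generated ORM class rather than the bare DBWritable interface:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.db.DataDrivenDBInputFormat;

public class DbInputSketch {
  public static void configure(Job job) {
    Configuration conf = job.getConfiguration();
    // JDBC connection settings (placeholders)
    DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
        "jdbc:mysql://localhost/shop", "user", "password");
    // table, optional WHERE condition, split-by column, projected columns
    DataDrivenDBInputFormat.setInput(job, DBWritable.class,
        "orders", null, "id", "id", "amount", "updated_at");
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
  }
}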
Specifying the input class
- DataDrivenImportJobOverride.java
// Set the input format class on the job
job.setInputFormatClass(this.inputFormatClass);
Specifying the output class
- DataDrivenImportJobOverride.java
protected Class<? extends OutputFormat> getOutputFormatClass() throws ClassNotFoundException {
  if (this.isHCatJob) {
    LOG.debug("Returning HCatOutputFormat for output format");
    return SqoopHCatUtilities.getOutputFormatClass();
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
    return RawKeyTextOutputFormat.class;
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
    return SequenceFileOutputFormat.class;
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
    return AvroOutputFormat.class;
  } else {
    // DatasetKeyOutputFormat has been replaced with AvroParquetOutputFormat
    return this.options.getFileLayout() == SqoopOptions.FileLayout.ParquetFile ? AvroParquetOutputFormat.class : null;
  }
}
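When the layout is ParquetFile, the job therefore writes through parquet-avro's AvroParquetOutputFormat. A minimal sketch of that wiring, where the schema would be the Avro schema Sqoop generates for the table:

import org.apache.avro.Schema;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.avro.AvroParquetOutputFormat;

public class ParquetOutputSketch {
  public static void configure(Job job, Schema tableSchema) {
    // AvroParquetOutputFormat writes GenericRecord values as Parquet,
    // using the Avro schema supplied here.
    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    AvroParquetOutputFormat.setSchema(job, tableSchema);
  }
}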
Specifying the output path
- ImportJobBase.class
Path outputPath = this.context.getDestination();
FileOutputFormat.setOutputPath(job, outputPath);
Specifying the mapper class
- DataDrivenImportJobOverride.java
protected Class<? extends Mapper> getMapperClass() {
  if (this.options.getHCatTableName() != null) {
    return SqoopHCatUtilities.getImportMapperClass();
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
    return TextImportMapper.class;
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
    return SequenceFileImportMapper.class;
  } else if (this.options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
    return AvroImportMapper.class;
  } else {
    return this.options.getFileLayout() == SqoopOptions.FileLayout.ParquetFile ? HadoopParquetImportMapperOverride.class : null;
  }
}
The map method
- ParquetImportMapperOverride.java
protected void map(LongWritable key, SqoopRecord val, Mapper<LongWritable, SqoopRecord, KEYOUT, VALOUT>.Context context) throws IOException, InterruptedException {
  try {
    val.loadLargeObjects(this.lobLoader);
  } catch (SQLException var5) {
    throw new IOException(var5);
  }
  // Convert the SqoopRecord's field map into an Avro GenericRecord, honoring the decimal options
  GenericRecord record = AvroUtilOverride.toGenericRecord(val.getFieldMap(), this.schema, this.bigDecimalFormatString, this.bigDecimalPadding);
  this.write(context, record);
}
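Conceptually, toGenericRecord copies each column of the SqoopRecord's field map into an Avro record after converting the value (see the toAvro method below). A simplified, hypothetical version of that copy loop:

import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

public class FieldMapToAvro {
  // Simplified stand-in for toGenericRecord: the real code also converts each
  // value (dates, BLOBs, BigDecimal, ...) and normalizes column names into
  // valid Avro field names before calling put().
  public static GenericRecord toRecord(Map<String, Object> fieldMap, Schema schema) {
    GenericRecord record = new GenericData.Record(schema);
    for (Map.Entry<String, Object> entry : fieldMap.entrySet()) {
      record.put(entry.getKey(), entry.getValue());
    }
    return record;
  }
}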
The toAvro method
- AvroUtilOverride.java
public static Object toAvro(Object o, Schema.Field field, boolean bigDecimalFormatString, boolean bigDecimalPaddingEnabled) {
  if (o instanceof BigDecimal) {
    if (bigDecimalPaddingEnabled) {
      o = padBigDecimal((BigDecimal)o, field.schema());
    }
    if (!isDecimal(field)) {
      if (bigDecimalFormatString) {
        return ((BigDecimal)o).toPlainString();
      }
      return o.toString();
    }
  } else {
    if (o instanceof Date) {
      return ((Date)o).getTime();
    }
    if (o instanceof Time) {
      return ((Time)o).getTime();
    }
    if (o instanceof Timestamp) {
      return ((Timestamp)o).getTime();
    }
    if (o instanceof BytesWritable) {
      BytesWritable bw = (BytesWritable)o;
      return ByteBuffer.wrap(bw.getBytes(), 0, bw.getLength());
    }
    if (o instanceof BlobRef) {
      BlobRef br = (BlobRef)o;
      byte[] bytes = br.isExternal() ? br.toString().getBytes() : (byte[])br.getData();
      return ByteBuffer.wrap(bytes);
    }
    if (o instanceof ClobRef) {
      throw new UnsupportedOperationException("ClobRef not supported");
    }
  }
  return o;
}
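Note that bigDecimalFormatString only matters when the field is not a decimal logical type; it chooses between toPlainString() and toString(), which differ once scientific notation shows up:

import java.math.BigDecimal;

public class BigDecimalFormatDemo {
  public static void main(String[] args) {
    BigDecimal d = new BigDecimal("1E+3");
    System.out.println(d.toString());       // 1E+3  (may use scientific notation)
    System.out.println(d.toPlainString());  // 1000  (never uses scientific notation)
  }
}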
GenericData.Record's put method
- GenericData.java
@Override
public void put(String key, Object value) {
  Schema.Field field = schema.getField(key);
  if (field == null)
    throw new AvroRuntimeException("Not a valid schema field: " + key);
  values[field.pos()] = value;
}
Step 2: the merge job entry point
- ImportDecimalTool.java
if (options.isAppendMode()) {
  AppendUtils app = new AppendUtils(context);
  app.append();
} else if (options.getIncrementalMode() == SqoopOptions.IncrementalMode.DateLastModified) {
  this.lastModifiedMerge(options, context);
}
Merge directories
- ImportDecimalTool.java
protected void lastModifiedMerge(SqoopOptions options, ImportJobContext context) throws IOException {
  if (context.getDestination() != null) {
    // Final destination: the directory given by --target-dir
    Path userDestDir = this.getOutputPath(options, context.getTableName(), false);
    FileSystem fs = userDestDir.getFileSystem(options.getConf());
    if (fs.exists(context.getDestination())) {
      LOG.info("Final destination exists, will run merge job.");
      if (fs.exists(userDestDir)) {
        String tableClassName = null;
        if (!context.getConnManager().isORMFacilitySelfManaged()) {
          tableClassName = (new TableClassName(options)).getClassForTable(context.getTableName());
        }
        // Temporary directory that receives the merge output
        Path destDir = this.getOutputPath(options, context.getTableName());
        options.setExistingJarName(context.getJarFile());
        options.setClassName(tableClassName);
        options.setMergeOldPath(userDestDir.toString());
        options.setMergeNewPath(context.getDestination().toString());
        options.setTargetDir(destDir.toString());
        this.loadJars(options.getConf(), context.getJarFile(), ClassWriter.toJavaIdentifier("codegen_" + context.getTableName()));
        HadoopParquetMergeJobConfiguratorOverride parquetMergeJobConfigurator = new HadoopParquetMergeJobConfiguratorOverride();
        MergeJobOverride mergeJob = new MergeJobOverride(options, parquetMergeJobConfigurator);
        if (mergeJob.runMergeJob()) {
          // Rename the old destination to a temporary directory, promote the merge
          // output to the destination, then delete the renamed-away old data
          Path tmpDir = this.getOutputPath(options, context.getTableName());
          fs.rename(userDestDir, tmpDir);
          fs.rename(destDir, userDestDir);
          fs.delete(tmpDir, true);
        } else {
          LOG.error("Merge MapReduce job failed!");
        }
        this.unloadJars();
      } else {
        if (!fs.exists(userDestDir.getParent())) {
          fs.mkdirs(userDestDir.getParent());
        }
        LOG.info("Moving data from temporary directory " + context.getDestination() + " to final destination " + userDestDir);
        if (!fs.rename(context.getDestination(), userDestDir)) {
          throw new RuntimeException("Couldn't move data from temporary directory " + context.getDestination() + " to final destination " + userDestDir);
        }
      }
    }
  }
}
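The three filesystem calls after a successful merge amount to a directory swap. As a standalone sketch with illustrative paths:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SwapDirsSketch {
  // Park the old destination under a temporary name, promote the merged
  // output into its place, then remove the parked copy.
  public static void swap(Configuration conf, Path userDestDir, Path mergedDir, Path tmpDir) throws IOException {
    FileSystem fs = userDestDir.getFileSystem(conf);
    fs.rename(userDestDir, tmpDir);
    fs.rename(mergedDir, userDestDir);
    fs.delete(tmpDir, true);
  }
}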
The runMergeJob method
- MergeJobOverride.java
public boolean runMergeJob() throws IOException {
  Configuration conf = this.options.getConf();
  Job job = this.createJob(conf);
  String userClassName = this.options.getClassName();
  if (null == userClassName) {
    throw new IOException("Record class name not specified with --class-name.");
  } else {
    String existingJar = this.options.getExistingJarName();
    if (existingJar != null) {
      LOG.debug("Setting job jar to user-specified jar: " + existingJar);
      job.getConfiguration().set("mapred.jar", existingJar);
    } else {
      try {
        Class<? extends Object> userClass = conf.getClassByName(userClassName);
        if (null != userClass) {
          String userJar = Jars.getJarPathForClass(userClass);
          LOG.debug("Setting job jar based on user class " + userClassName + ": " + userJar);
          job.getConfiguration().set("mapred.jar", userJar);
        } else {
          LOG.warn("Specified class " + userClassName + " is not in a jar. MapReduce may not find the class");
        }
      } catch (ClassNotFoundException var12) {
        throw new IOException(var12);
      }
    }
    try {
      Path oldPath = new Path(this.options.getMergeOldPath());
      Path newPath = new Path(this.options.getMergeNewPath());
      Configuration jobConf = job.getConfiguration();
      oldPath = FileSystemUtil.makeQualified(oldPath, jobConf);
      newPath = FileSystemUtil.makeQualified(newPath, jobConf);
      this.propagateOptionsToJob(job);
      FileInputFormat.addInputPath(job, oldPath);
      FileInputFormat.addInputPath(job, newPath);
      jobConf.set("sqoop.merge.old.path", oldPath.toString());
      jobConf.set("sqoop.merge.new.path", newPath.toString());
      jobConf.set("sqoop.merge.key.col", this.options.getMergeKeyCol());
      jobConf.set("sqoop.merge.class", userClassName);
      FileOutputFormat.setOutputPath(job, new Path(this.options.getTargetDir()));
      ExportJobBase.FileType fileType = ExportJobBase.getFileType(jobConf, oldPath);
      switch (fileType) {
        case PARQUET_FILE:
          Path finalPath = new Path(this.options.getTargetDir());
          finalPath = FileSystemUtil.makeQualified(finalPath, jobConf);
          this.parquetMergeJobConfigurator.configureParquetMergeJob(jobConf, job, oldPath, newPath, finalPath);
          break;
        case AVRO_DATA_FILE:
          this.configueAvroMergeJob(conf, job, oldPath, newPath);
          break;
        case SEQUENCE_FILE:
          job.setInputFormatClass(SequenceFileInputFormat.class);
          job.setOutputFormatClass(SequenceFileOutputFormat.class);
          job.setMapperClass(MergeRecordMapper.class);
          job.setReducerClass(MergeReducer.class);
          break;
        default:
          job.setMapperClass(MergeTextMapper.class);
          job.setOutputFormatClass(RawKeyTextOutputFormat.class);
          job.setReducerClass(MergeReducer.class);
      }
      jobConf.set("mapred.output.key.class", userClassName);
      job.setOutputValueClass(NullWritable.class);
      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(MergeRecord.class);
      this.cacheJars(job, (ConnManager)null);
      this.setJob(job);
      return this.runJob(job);
    } catch (InterruptedException var10) {
      throw new IOException(var10);
    } catch (ClassNotFoundException var11) {
      throw new IOException(var11);
    }
  }
}
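The sqoop.merge.old.path / sqoop.merge.new.path keys let the merge mappers tell which data set a split came from, which is what ultimately drives MergeRecord.isNewRecord() in the reducer. A rough sketch of that idea (a guess at the mechanism, not Sqoop's actual MergeMapperBase code):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Sketch only: decide whether the current split belongs to the new import
// or the old data set by comparing its path against the job configuration.
public abstract class OldVsNewSketch<KI, VI, KO, VO> extends Mapper<KI, VI, KO, VO> {
  protected boolean isNew;

  @Override
  protected void setup(Context context) {
    Path splitPath = ((FileSplit) context.getInputSplit()).getPath();
    String newPath = context.getConfiguration().get("sqoop.merge.new.path");
    isNew = newPath != null && splitPath.toString().startsWith(newPath);
  }
}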
Configuring the output
- MergeJobOverride.java
/**
 * Configure the map and reduce output key/value classes
 */
jobConf.set("mapred.output.key.class", userClassName);
job.setOutputValueClass(NullWritable.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(MergeRecord.class);
Configuring the mapper and reducer
- HadoopParquetMergeJobConfiguratorOverride.java
public void configureParquetMergeJob(Configuration conf, Job job, Path oldPath, Path newPath, Path finalPath) throws IOException {
  try {
    LOG.info("Trying to merge parquet files");
    job.setOutputKeyClass(Void.class);
    // Configure the mapper and reducer
    job.setMapperClass(MergeParquetMapperOverride.class);
    job.setReducerClass(MergeParquetReducerOverride.class);
    job.setOutputValueClass(GenericRecord.class);
    Schema avroSchema = this.loadAvroSchema(conf, oldPath);
    this.validateNewPathAvroSchema(AvroUtilOverride.getAvroSchemaFromParquetFile(newPath, conf), avroSchema);
    job.setInputFormatClass(this.exportJobConfigurator.getInputFormatClass());
    AvroParquetInputFormat.setAvroReadSchema(job, avroSchema);
    conf.set("parquetjob.avro.schema", avroSchema.toString());
    this.importJobConfigurator.configureAvroSchema(job, avroSchema);
    this.importJobConfigurator.configureOutputCodec(job);
    job.setOutputFormatClass(this.importJobConfigurator.getOutputFormatClass());
  } catch (Exception var7) {
    throw new IOException(var7);
  }
}
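loadAvroSchema / getAvroSchemaFromParquetFile recover the Avro schema from the existing Parquet data. One plausible way to do that with the parquet-mr API is to read the file footer and convert its Parquet schema back to Avro (the override may instead read the Avro schema stored in the file's key/value metadata):

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

public class ParquetSchemaSketch {
  public static Schema readAvroSchema(Path file, Configuration conf) throws Exception {
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
      MessageType parquetSchema = reader.getFooter().getFileMetaData().getSchema();
      return new AvroSchemaConverter(conf).convert(parquetSchema);
    }
  }
}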
The merge job's map method
- MergeParquetMapperOverride.java
@Override
protected void map(GenericRecord key, GenericRecord val, Mapper.Context context)
    throws IOException, InterruptedException {
  // Convert the GenericRecord into a SqoopRecord and emit it as a (mergeKey, SqoopRecord) pair
  processRecord(toSqoopRecord(val), context);
}
The merge job's reduce method
- MergeParquetReducer.java
public void reduce(Text key, Iterable<MergeRecord> vals, Reducer<Text, MergeRecord, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
  SqoopRecord bestRecord = null;
  try {
    Iterator var5 = vals.iterator();
    // Prefer the MergeRecord flagged as new; fall back to the old record if no new one exists
    while (var5.hasNext()) {
      MergeRecord mergeRecord = (MergeRecord)var5.next();
      if (null == bestRecord && !mergeRecord.isNewRecord()) {
        bestRecord = (SqoopRecord)mergeRecord.getSqoopRecord().clone();
      } else if (mergeRecord.isNewRecord()) {
        bestRecord = (SqoopRecord)mergeRecord.getSqoopRecord().clone();
      }
    }
  } catch (CloneNotSupportedException var7) {
    throw new IOException(var7);
  }
  if (null != bestRecord) {
    // Convert the chosen record back into a GenericRecord
    GenericRecord record = AvroUtil.toGenericRecord(bestRecord.getFieldMap(), this.schema, this.bigDecimalFormatString);
    this.write(context, record);
  }
}
Original source
/**
 * Will create union, because each type is assumed to be nullable.
 *
 * @param sqlType Original SQL type (might be overridden by user)
 * @param columnName Column name from the query
 * @param precision Fixed point precision
 * @param scale Fixed point scale
 * @return Schema
 */
public Schema toAvroSchema(int sqlType, String columnName, Integer precision, Integer scale) {
  List<Schema> childSchemas = new ArrayList<Schema>();
  childSchemas.add(Schema.create(Schema.Type.NULL));
  if (isLogicalTypeConversionEnabled() && isLogicalType(sqlType)) {
    childSchemas.add(
        toAvroLogicalType(columnName, sqlType, precision, scale)
            .addToSchema(Schema.create(Type.BYTES)));
  } else {
    childSchemas.add(Schema.create(toAvroType(columnName, sqlType)));
  }
  return Schema.createUnion(childSchemas);
}
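With logical types enabled, the stock code therefore produces a ["null", decimal-over-bytes] union. The same schema can be built directly with the Avro API, e.g. for a DECIMAL(10,2) column:

import java.util.Arrays;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;

public class DecimalUnionSketch {
  public static void main(String[] args) {
    Schema decimalBytes = LogicalTypes.decimal(10, 2)
        .addToSchema(Schema.create(Schema.Type.BYTES));
    Schema nullableDecimal = Schema.createUnion(
        Arrays.asList(Schema.create(Schema.Type.NULL), decimalBytes));
    System.out.println(nullableDecimal.toString(true));
  }
}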
Modified source
public Schema toAvroSchema(int sqlType, String columnName, Integer precision, Integer scale) {
  List<Schema> childSchemas = new ArrayList();
  childSchemas.add(Schema.create(Schema.Type.NULL));
  if (this.options.getConf().getBoolean("sqoop.avro.logical_types.decimal.enable", false) || isLogicalType(sqlType)) {
    if (precision > 18) {
      // Large decimals keep the bytes-backed representation
      childSchemas.add(this.toAvroLogicalType(columnName, sqlType, precision, scale)
          .addToSchema(Schema.create(Schema.Type.BYTES)));
    } else {
      // Small decimals use a fixed schema whose size is looked up from the precision
      childSchemas.add(this.toAvroLogicalType(columnName, sqlType, precision, scale)
          .addToSchema(Schema.createFixed(columnName, (String)null, (String)null, AvroSchemaGeneratorOverride.PRECISION_TO_BYTE_COUNT[precision - 1])));
    }
  } else {
    childSchemas.add(Schema.create(this.toAvroType(columnName, sqlType)));
  }
  return Schema.createUnion(childSchemas);
}
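For precisions of 18 or less, the modified version backs the decimal with a fixed schema sized from the precision. A sketch of that construction; the bytesForPrecision formula below is the usual "smallest two's-complement size" calculation and is assumed to match the PRECISION_TO_BYTE_COUNT table:

import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;

public class FixedDecimalSketch {
  // Smallest number of bytes able to hold a signed integer with the given
  // number of decimal digits.
  static int bytesForPrecision(int precision) {
    return (int) Math.ceil((Math.log(Math.pow(10, precision)) / Math.log(2) + 1) / 8);
  }

  public static void main(String[] args) {
    int precision = 10, scale = 2;
    Schema fixed = Schema.createFixed("amount", null, null, bytesForPrecision(precision));
    Schema decimalFixed = LogicalTypes.decimal(precision, scale).addToSchema(fixed);
    System.out.println(decimalFixed);  // a 5-byte fixed carrying decimal(10,2)
  }
}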
The difference between fixed and bytes
fixed: allocates a fixed number of bytes, derived from the precision, to represent a decimal value.
bytes: a length prefix (an Avro long) first says how many bytes the decimal needs, and the bytes that follow hold the decimal value itself.
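A small demo of that difference using Avro's built-in decimal conversion; the value, precision and fixed size are just illustrative:

import java.math.BigDecimal;
import java.nio.ByteBuffer;
import org.apache.avro.Conversions;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericFixed;

public class FixedVsBytesDemo {
  public static void main(String[] args) {
    Conversions.DecimalConversion conv = new Conversions.DecimalConversion();
    LogicalTypes.Decimal logical = LogicalTypes.decimal(10, 2);
    Schema bytesSchema = logical.addToSchema(Schema.create(Schema.Type.BYTES));
    Schema fixedSchema = logical.addToSchema(Schema.createFixed("amount", null, null, 5));

    BigDecimal value = new BigDecimal("12345.67");
    ByteBuffer asBytes = conv.toBytes(value, bytesSchema, logical);
    GenericFixed asFixed = conv.toFixed(value, fixedSchema, logical);

    // bytes: only as many bytes as the unscaled value needs (plus a length prefix on disk)
    System.out.println("bytes payload length = " + asBytes.remaining());  // 3
    // fixed: always exactly the declared size, 5 bytes here
    System.out.println("fixed payload length = " + asFixed.bytes().length);  // 5
  }
}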