sqoop2 - Data Export

Introduction

sqoop2 reads data from a source and writes it to a destination. Because sqoop2's data transfer runs on the MapReduce framework, sqoop2 implements its own OutputFormat class so that results can be exported to Hive, Kafka, relational databases, and other destination types.

OutputFormat

public class SqoopNullOutputFormat extends OutputFormat<SqoopWritable, NullWritable> {

  @Override
  public RecordWriter<SqoopWritable, NullWritable> getRecordWriter(TaskAttemptContext context) {
    SqoopOutputFormatLoadExecutor executor = new SqoopOutputFormatLoadExecutor(context);
    return executor.getRecordWriter();
  }
}

SqoopNullOutputFormat extends OutputFormat; its getRecordWriter method delegates to SqoopOutputFormatLoadExecutor's getRecordWriter, which returns the actual RecordWriter.
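
For orientation, this is how any custom OutputFormat is attached to a MapReduce job. The snippet below is a generic sketch using the standard Hadoop Job API, not sqoop2's actual job-setup code (sqoop2 performs the equivalent wiring inside its MR execution engine):

// generic Hadoop job wiring, shown for illustration only
Job job = Job.getInstance(new Configuration());
// route all task output through sqoop2's custom OutputFormat
job.setOutputFormatClass(SqoopNullOutputFormat.class);
// the key carries the record; the value side is unused
job.setOutputKeyClass(SqoopWritable.class);
job.setOutputValueClass(NullWritable.class);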

SqoopOutputFormatLoadExecutor

toDataFormat holds the record currently being handed between threads. Since a writer thread and a reader thread compete for it, access is coordinated with two semaphores, filled and free (a standalone sketch of this pattern appears after the reader/writer code below).

public class SqoopOutputFormatLoadExecutor {

  private volatile boolean readerFinished = false;
  private volatile boolean writerFinished = false;
  private volatile IntermediateDataFormat<? extends Object> toDataFormat;
  // additional fields referenced by getRecordWriter() below
  private JobContext context;
  private SqoopRecordWriter writer;
  private Future<?> consumerFuture;
  private Semaphore filled = new Semaphore(0, true);    // starts at 0: the writer must release() before the reader's acquire() can succeed
  private Semaphore free = new Semaphore(1, true);      // starts at 1: the writer acquire()s it first


  public RecordWriter<SqoopWritable, NullWritable> getRecordWriter() {
    // start a single-threaded executor running ConsumerThread, which drains toDataFormat
    consumerFuture = Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().setNameFormat
        ("OutputFormatLoader-consumer").build()).submit(
            new ConsumerThread(context));
    return writer;
  }

Reading and writing toDataFormat

SqoopRecordWriter is responsible for writing toDataFormat; SqoopOutputFormatDataReader is responsible for reading it.

  private class SqoopRecordWriter extends RecordWriter<SqoopWritable, NullWritable> {

    @Override
    public void write(SqoopWritable key, NullWritable value) throws InterruptedException {
      // acquire free: wait until the reader has finished with the previous record
      free.acquire();
      checkIfConsumerThrew();
      toDataFormat.setCSVTextData(key.toString());
      // release filled: signal that a new record is ready
      filled.release();
    }
  }

  private class SqoopOutputFormatDataReader extends DataReader {

    @Override
    public Object[] readArrayRecord() throws InterruptedException {
      // acquire filled: wait for the writer to publish a record
      acquireSema();
      // if the writer has finished, no more data is coming; return null
      if (writerFinished) {
        return null;
      }
      try {
        return toDataFormat.getObjectData();
      } finally {
        // release free: signal that the record has been consumed
        releaseSema();
      }
    }

    private void acquireSema() throws InterruptedException {
      try {
        filled.acquire();
      } catch (InterruptedException ex) {
        // nothing to clean up here; log and propagate so the caller can stop
        LOG.error("Interrupted while waiting for data to be available from mapper", ex);
        throw ex;
      }
    }

    private void releaseSema(){
      free.release();
    }
  }

The filled and free semaphores force the write and read operations to alternate in strict turns. Because filled starts at 0 and free starts at 1, a write always happens before the first read.
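
To see the handoff in isolation, here is a self-contained sketch of the same two-semaphore pattern. This is illustrative code, not sqoop2 source; PingPongDemo and slot are invented names. slot plays the role of toDataFormat, and the shutdown sequence mirrors SqoopRecordWriter.close():

import java.util.concurrent.Semaphore;

public class PingPongDemo {
  private static volatile String slot;            // plays the role of toDataFormat
  private static volatile boolean writerFinished = false;
  private static final Semaphore filled = new Semaphore(0, true);
  private static final Semaphore free = new Semaphore(1, true);

  public static void main(String[] args) throws InterruptedException {
    Thread consumer = new Thread(() -> {
      try {
        while (true) {
          filled.acquire();                       // wait for the producer to publish
          if (writerFinished) break;              // mirrors readArrayRecord() returning null
          System.out.println("consumed: " + slot);
          free.release();                         // let the producer write the next value
        }
      } catch (InterruptedException ignored) { }
    });
    consumer.start();

    for (int i = 0; i < 3; i++) {
      free.acquire();                             // wait until the slot is empty
      slot = "record-" + i;
      filled.release();                           // signal the consumer
    }

    free.acquire();                               // mirrors SqoopRecordWriter.close()
    writerFinished = true;
    filled.release();                             // wake the consumer so it can exit
    consumer.join();
  }
}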

ConsumerThread

ConsumerThread is responsible for reading records out of toDataFormat (through the DataReader) and delivering them to the destination.

  private class ConsumerThread implements Runnable {

    private final JobContext jobctx;

    public ConsumerThread(final JobContext context) {
      jobctx = context;
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public void run() {
      LOG.info("SqoopOutputFormatLoadExecutor consumer thread is starting");
      try {
        // instantiate the DataReader used to pull records out of toDataFormat
        DataReader reader = new SqoopOutputFormatDataReader();
        Configuration conf = context.getConfiguration();
        // instantiate the Loader, which writes the records to the destination
        Loader loader = (Loader) ClassUtils.instantiate(loaderName);

        PrefixContext subContext = new PrefixContext(conf,
            MRJobConstants.PREFIX_CONNECTOR_TO_CONTEXT);
        Object connectorLinkConfig = MRConfigurationUtils
            .getConnectorLinkConfig(Direction.TO, conf);
        Object connectorToJobConfig = MRConfigurationUtils
            .getConnectorJobConfig(Direction.TO, conf);

        LoaderContext loaderContext = new LoaderContext(subContext, reader, matcher.getToSchema());

        LOG.info("Running loader class " + loaderName);
        // hand the records over to the destination
        loader.load(loaderContext, connectorLinkConfig, connectorToJobConfig);
        LOG.info("Loader has finished");
        ((TaskAttemptContext) jobctx).getCounter(SqoopCounters.ROWS_WRITTEN).increment(
            loader.getRowsWritten());

      } catch (Throwable t) {
        // if loading fails, mark the reader as finished and release free
        // so the writer thread is not left blocked forever
        readerFinished = true;
        LOG.error("Error while loading data out of MR job.", t);
        free.release();
        throw new SqoopException(MRExecutionError.MAPRED_EXEC_0018, t);
      }

      if (!writerFinished) {
        // the reader stopped before the writer did: some records were never processed
        readerFinished = true;
        LOG.error("Reader terminated, but writer is still running!");
        free.release();
        throw new SqoopException(MRExecutionError.MAPRED_EXEC_0019);
      }

      readerFinished = true;
    }
  }

GenericJdbcLoader

Loader has several subclasses supporting different destinations: GenericJdbcLoader exports to SQL databases, KafkaLoader exports to Kafka, and so on. GenericJdbcLoader is examined below; a minimal custom Loader sketch follows it.

public class GenericJdbcLoader extends Loader<LinkConfiguration, ToJobConfiguration> {

  public static final int DEFAULT_ROWS_PER_BATCH = 100;
  public static final int DEFAULT_BATCHES_PER_TRANSACTION = 100;
  private int rowsPerBatch = DEFAULT_ROWS_PER_BATCH;
  private int batchesPerTransaction = DEFAULT_BATCHES_PER_TRANSACTION;
  private long rowsWritten = 0;

  @Override
  public void load(LoaderContext context, LinkConfiguration linkConfig, ToJobConfiguration toJobConfig) throws Exception{
    String driver = linkConfig.linkConfig.jdbcDriver;
    String url = linkConfig.linkConfig.connectionString;
    String username = linkConfig.linkConfig.username;
    String password = linkConfig.linkConfig.password;
    // instantiate the GenericJdbcExecutor used to run SQL statements
    GenericJdbcExecutor executor = new GenericJdbcExecutor(driver, url, username, password);
    executor.setAutoCommit(false);
    String sql = context.getString(GenericJdbcConnectorConstants.CONNECTOR_JDBC_TO_DATA_SQL);
    // start batching the insert statement
    executor.beginBatch(sql);
    try {
      int numberOfRowsPerBatch = 0;
      int numberOfBatchesPerTransaction = 0;
      Object[] array;
      
      // keep pulling records from the DataReader until the writer side finishes
      // (readArrayRecord() returns null)
      while ((array = context.getDataReader().readArrayRecord()) != null) {
        numberOfRowsPerBatch++;
        executor.addBatch(array, context.getSchema());

        // each batch holds rowsPerBatch (100) rows; every batchesPerTransaction (100)
        // batches, i.e. every 10,000 rows with the defaults, the transaction is committed
        if (numberOfRowsPerBatch == rowsPerBatch) {
          numberOfBatchesPerTransaction++;
          if (numberOfBatchesPerTransaction == batchesPerTransaction) {
            executor.executeBatch(true);
            numberOfBatchesPerTransaction = 0;
          } else {
            executor.executeBatch(false);
          }
          numberOfRowsPerBatch = 0;
        }
        rowsWritten ++;
      }

      if (numberOfRowsPerBatch != 0 || numberOfBatchesPerTransaction != 0) {
        // flush and commit whatever remains in the final partial batch
        executor.executeBatch(true);
      }

      executor.endBatch();

    } finally {
      executor.close();
    }
  }

  @Override
  public long getRowsWritten() {
    return rowsWritten;
  }

}
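
Every destination connector implements the same two-method contract: load() consumes records from the DataReader, and getRowsWritten() reports the count. As a minimal sketch, a hypothetical loader that merely prints each record could look like this (LoggingLoader is invented for illustration and reuses the generic-jdbc configuration classes only to keep the sketch concrete; it is not part of sqoop2):

public class LoggingLoader extends Loader<LinkConfiguration, ToJobConfiguration> {

  private long rowsWritten = 0;

  @Override
  public void load(LoaderContext context, LinkConfiguration linkConfig,
      ToJobConfiguration toJobConfig) throws Exception {
    Object[] record;
    // same consumption loop as GenericJdbcLoader: readArrayRecord() blocks on the
    // filled semaphore and returns null once the writer side has closed
    while ((record = context.getDataReader().readArrayRecord()) != null) {
      System.out.println(java.util.Arrays.toString(record));
      rowsWritten++;
    }
  }

  @Override
  public long getRowsWritten() {
    return rowsWritten;
  }
}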

Closing the writer and reader

  private class SqoopRecordWriter extends RecordWriter<SqoopWritable, NullWritable> {

    @Override
    public void close(TaskAttemptContext context)
            throws InterruptedException, IOException {
      // acquire free: wait for the reader to finish consuming the last record
      free.acquire();
      // mark the writer as finished
      writerFinished = true;
      // release filled so the blocked reader wakes up, sees writerFinished, and returns null
      filled.release();
      // wait for the consumer thread to finish
      waitForConsumer();
      LOG.info("SqoopOutputFormatLoadExecutor::SqoopRecordWriter is closed");
    }
  }

  private void waitForConsumer() {
    try {
      // block until the consumer thread completes; unwrap and rethrow any failure
      consumerFuture.get();
    } catch (ExecutionException ex) {
      Throwable t = ex.getCause();
      if (t instanceof SqoopException) {
        throw (SqoopException) t;
      }
      Throwables.propagate(t);
    } catch (Exception ex) {
      throw new SqoopException(MRExecutionError.MAPRED_EXEC_0019, ex);
    }
  }

Summary

sqoop2 implements its own OutputFormat to support custom MapReduce output.

The OutputFormat uses a two-thread design: one thread writes the MapReduce output into toDataFormat, while the other reads those records back out, processes them, and stores them at the destination.

Reposted from: https://my.oschina.net/u/569730/blog/1537807
