sqoop2 - Data Export

Introduction

sqoop2 reads data from a source and writes it to a destination. Because sqoop2's data transfer runs on the MapReduce framework, sqoop2 implements its own OutputFormat class so that results can be exported to Hive, Kafka, relational databases, and other destination types.

OutputFormat

public class SqoopNullOutputFormat extends OutputFormat<SqoopWritable, NullWritable> {

  @Override
  public RecordWriter<SqoopWritable, NullWritable> getRecordWriter(TaskAttemptContext context) {
    SqoopOutputFormatLoadExecutor executor = new SqoopOutputFormatLoadExecutor(context);
    return executor.getRecordWriter();
  }
}

SqoopNullOutputFormat extends OutputFormat; its getRecordWriter method delegates to SqoopOutputFormatLoadExecutor's getRecordWriter, which returns the actual RecordWriter.
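
For orientation, this is how any custom OutputFormat is attached to a MapReduce job. The snippet below is a generic sketch using the standard Hadoop Job API, not sqoop2's actual job-setup code (sqoop2 performs the equivalent wiring inside its MR execution engine):

// generic Hadoop job wiring, shown for illustration only
Job job = Job.getInstance(new Configuration());
// route all task output through sqoop2's custom OutputFormat
job.setOutputFormatClass(SqoopNullOutputFormat.class);
// the key carries the record; the value side is unused
job.setOutputKeyClass(SqoopWritable.class);
job.setOutputValueClass(NullWritable.class);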

SqoopOutputFormatLoadExecutor

toDataFormat holds the record currently being handed between threads. Since a writer thread and a reader thread compete for it, access is coordinated with two semaphores, filled and free (a standalone sketch of this pattern appears after the reader/writer code below).

public class SqoopOutputFormatLoadExecutor {

  private volatile boolean readerFinished = false;
  private volatile boolean writerFinished = false;
  private volatile IntermediateDataFormat<? extends Object> toDataFormat;
  // additional fields referenced by getRecordWriter() below
  private JobContext context;
  private SqoopRecordWriter writer;
  private Future<?> consumerFuture;
  private Semaphore filled = new Semaphore(0, true);    // starts at 0: the writer must release() before the reader's acquire() can succeed
  private Semaphore free = new Semaphore(1, true);      // starts at 1: the writer acquire()s it first


  public RecordWriter<SqoopWritable, NullWritable> getRecordWriter() {
    // start a single-threaded executor running ConsumerThread, which drains toDataFormat
    consumerFuture = Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().setNameFormat
        ("OutputFormatLoader-consumer").build()).submit(
            new ConsumerThread(context));
    return writer;
  }

Reading and writing toDataFormat

SqoopRecordWriter is responsible for writing toDataFormat; SqoopOutputFormatDataReader is responsible for reading it.

  private class SqoopRecordWriter extends RecordWriter<SqoopWritable, NullWritable> {

    @Override
    public void write(SqoopWritable key, NullWritable value) throws InterruptedException {
      // acquire free: wait until the reader has finished with the previous record
      free.acquire();
      checkIfConsumerThrew();
      toDataFormat.setCSVTextData(key.toString());
      // release filled: signal that a new record is ready
      filled.release();
    }
  }

  private class SqoopOutputFormatDataReader extends DataReader {

    @Override
    public Object[] readArrayRecord() throws InterruptedException {
      // acquire filled: wait for the writer to publish a record
      acquireSema();
      // if the writer has finished, no more data is coming; return null
      if (writerFinished) {
        return null;
      }
      try {
        return toDataFormat.getObjectData();
      } finally {
        // release free: signal that the record has been consumed
        releaseSema();
      }
    }

    private void acquireSema() throws InterruptedException {
      try {
        filled.acquire();
      } catch (InterruptedException ex) {
        // nothing to clean up here; log and propagate so the caller can stop
        LOG.error("Interrupted while waiting for data to be available from mapper", ex);
        throw ex;
      }
    }

    private void releaseSema(){
      free.release();
    }
  }

The filled and free semaphores force the write and read operations to alternate in strict turns. Because filled starts at 0 and free starts at 1, a write always happens before the first read.
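
To see the handoff in isolation, here is a self-contained sketch of the same two-semaphore pattern. This is illustrative code, not sqoop2 source; PingPongDemo and slot are invented names. slot plays the role of toDataFormat, and the shutdown sequence mirrors SqoopRecordWriter.close():

import java.util.concurrent.Semaphore;

public class PingPongDemo {
  private static volatile String slot;            // plays the role of toDataFormat
  private static volatile boolean writerFinished = false;
  private static final Semaphore filled = new Semaphore(0, true);
  private static final Semaphore free = new Semaphore(1, true);

  public static void main(String[] args) throws InterruptedException {
    Thread consumer = new Thread(() -> {
      try {
        while (true) {
          filled.acquire();                       // wait for the producer to publish
          if (writerFinished) break;              // mirrors readArrayRecord() returning null
          System.out.println("consumed: " + slot);
          free.release();                         // let the producer write the next value
        }
      } catch (InterruptedException ignored) { }
    });
    consumer.start();

    for (int i = 0; i < 3; i++) {
      free.acquire();                             // wait until the slot is empty
      slot = "record-" + i;
      filled.release();                           // signal the consumer
    }

    free.acquire();                               // mirrors SqoopRecordWriter.close()
    writerFinished = true;
    filled.release();                             // wake the consumer so it can exit
    consumer.join();
  }
}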

ConsumerThread

ConsumerThread is responsible for reading records out of toDataFormat (through the DataReader) and delivering them to the destination.

  private class ConsumerThread implements Runnable {

    private final JobContext jobctx;

    public ConsumerThread(final JobContext context) {
      jobctx = context;
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public void run() {
      LOG.info("SqoopOutputFormatLoadExecutor consumer thread is starting");
      try {
        // instantiate the DataReader used to pull records out of toDataFormat
        DataReader reader = new SqoopOutputFormatDataReader();
        Configuration conf = context.getConfiguration();
        // instantiate the Loader, which writes the records to the destination
        Loader loader = (Loader) ClassUtils.instantiate(loaderName);

        PrefixContext subContext = new PrefixContext(conf,
            MRJobConstants.PREFIX_CONNECTOR_TO_CONTEXT);
        Object connectorLinkConfig = MRConfigurationUtils
            .getConnectorLinkConfig(Direction.TO, conf);
        Object connectorToJobConfig = MRConfigurationUtils
            .getConnectorJobConfig(Direction.TO, conf);

        LoaderContext loaderContext = new LoaderContext(subContext, reader, matcher.getToSchema());

        LOG.info("Running loader class " + loaderName);
        // hand the records over to the destination
        loader.load(loaderContext, connectorLinkConfig, connectorToJobConfig);
        LOG.info("Loader has finished");
        ((TaskAttemptContext) jobctx).getCounter(SqoopCounters.ROWS_WRITTEN).increment(
            loader.getRowsWritten());

      } catch (Throwable t) {
        // if loading fails, mark the reader as finished and release free
        // so the writer thread is not left blocked forever
        readerFinished = true;
        LOG.error("Error while loading data out of MR job.", t);
        free.release();
        throw new SqoopException(MRExecutionError.MAPRED_EXEC_0018, t);
      }

      if (!writerFinished) {
        // the reader stopped before the writer did: some records were never processed
        readerFinished = true;
        LOG.error("Reader terminated, but writer is still running!");
        free.release();
        throw new SqoopException(MRExecutionError.MAPRED_EXEC_0019);
      }

      readerFinished = true;
    }
  }

GenericJdbcLoader

Loader has several subclasses supporting different destinations: GenericJdbcLoader exports to SQL databases, KafkaLoader exports to Kafka, and so on. GenericJdbcLoader is examined below; a minimal custom Loader sketch follows it.

public class GenericJdbcLoader extends Loader<LinkConfiguration, ToJobConfiguration> {

  public static final int DEFAULT_ROWS_PER_BATCH = 100;
  public static final int DEFAULT_BATCHES_PER_TRANSACTION = 100;
  private int rowsPerBatch = DEFAULT_ROWS_PER_BATCH;
  private int batchesPerTransaction = DEFAULT_BATCHES_PER_TRANSACTION;
  private long rowsWritten = 0;

  @Override
  public void load(LoaderContext context, LinkConfiguration linkConfig, ToJobConfiguration toJobConfig) throws Exception{
    String driver = linkConfig.linkConfig.jdbcDriver;
    String url = linkConfig.linkConfig.connectionString;
    String username = linkConfig.linkConfig.username;
    String password = linkConfig.linkConfig.password;
    // instantiate the GenericJdbcExecutor used to run SQL statements
    GenericJdbcExecutor executor = new GenericJdbcExecutor(driver, url, username, password);
    executor.setAutoCommit(false);
    String sql = context.getString(GenericJdbcConnectorConstants.CONNECTOR_JDBC_TO_DATA_SQL);
    // start batching the insert statement
    executor.beginBatch(sql);
    try {
      int numberOfRowsPerBatch = 0;
      int numberOfBatchesPerTransaction = 0;
      Object[] array;
      
      // keep pulling records from the DataReader until the writer side finishes
      // (readArrayRecord() returns null)
      while ((array = context.getDataReader().readArrayRecord()) != null) {
        numberOfRowsPerBatch++;
        executor.addBatch(array, context.getSchema());

        // each batch holds rowsPerBatch (100) rows; every batchesPerTransaction (100)
        // batches, i.e. every 10,000 rows with the defaults, the transaction is committed
        if (numberOfRowsPerBatch == rowsPerBatch) {
          numberOfBatchesPerTransaction++;
          if (numberOfBatchesPerTransaction == batchesPerTransaction) {
            executor.executeBatch(true);
            numberOfBatchesPerTransaction = 0;
          } else {
            executor.executeBatch(false);
          }
          numberOfRowsPerBatch = 0;
        }
        rowsWritten ++;
      }

      if (numberOfRowsPerBatch != 0 || numberOfBatchesPerTransaction != 0) {
        // flush and commit whatever remains in the final partial batch
        executor.executeBatch(true);
      }

      executor.endBatch();

    } finally {
      executor.close();
    }
  }

  @Override
  public long getRowsWritten() {
    return rowsWritten;
  }

}
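
Every destination connector implements the same two-method contract: load() consumes records from the DataReader, and getRowsWritten() reports the count. As a minimal sketch, a hypothetical loader that merely prints each record could look like this (LoggingLoader is invented for illustration and reuses the generic-jdbc configuration classes only to keep the sketch concrete; it is not part of sqoop2):

public class LoggingLoader extends Loader<LinkConfiguration, ToJobConfiguration> {

  private long rowsWritten = 0;

  @Override
  public void load(LoaderContext context, LinkConfiguration linkConfig,
      ToJobConfiguration toJobConfig) throws Exception {
    Object[] record;
    // same consumption loop as GenericJdbcLoader: readArrayRecord() blocks on the
    // filled semaphore and returns null once the writer side has closed
    while ((record = context.getDataReader().readArrayRecord()) != null) {
      System.out.println(java.util.Arrays.toString(record));
      rowsWritten++;
    }
  }

  @Override
  public long getRowsWritten() {
    return rowsWritten;
  }
}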

Closing the writer and reader

  private class SqoopRecordWriter extends RecordWriter<SqoopWritable, NullWritable> {

    @Override
    public void close(TaskAttemptContext context)
            throws InterruptedException, IOException {
      // acquire free: wait for the reader to finish consuming the last record
      free.acquire();
      // mark the writer as finished
      writerFinished = true;
      // release filled so the blocked reader wakes up, sees writerFinished, and returns null
      filled.release();
      // wait for the consumer thread to finish
      waitForConsumer();
      LOG.info("SqoopOutputFormatLoadExecutor::SqoopRecordWriter is closed");
    }
  }

  private void waitForConsumer() {
    try {
      // block until the consumer thread completes; unwrap and rethrow any failure
      consumerFuture.get();
    } catch (ExecutionException ex) {
      Throwable t = ex.getCause();
      if (t instanceof SqoopException) {
        throw (SqoopException) t;
      }
      Throwables.propagate(t);
    } catch (Exception ex) {
      throw new SqoopException(MRExecutionError.MAPRED_EXEC_0019, ex);
    }
  }

Summary

sqoop2 implements its own OutputFormat to support custom MapReduce output.

The OutputFormat uses a two-thread design: one thread writes the MapReduce output into toDataFormat, while the other reads those records back out, processes them, and stores them at the destination.

Reposted from: https://my.oschina.net/u/569730/blog/1537807
