Normally you would use Sqoop to import data from a relational database into HDFS, but Sqoop can be very inefficient when the data is unevenly distributed: you might allocate 10 map tasks to a Sqoop job and find that only one of them actually moves any data. In that situation it is worth writing your own MapReduce job to do the import, and that is where DBInputFormat comes in.
DBInputFormat is used to import data from a relational database (e.g. Oracle or MySQL) into HDFS. Usage is as follows.
First, define a custom writable class that implements both the Writable and DBWritable interfaces, with fields matching the database columns you want to fetch:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

public class StudentWritable implements Writable, DBWritable {

  private String name;
  private String time;

  // DBWritable: write this record's fields to a PreparedStatement.
  // Parameter indices must match the column order of the query.
  @Override
  public void write(PreparedStatement statement) throws SQLException {
    statement.setString(1, name);
    statement.setString(2, time);
  }

  // DBWritable: populate the fields from one row of the ResultSet.
  @Override
  public void readFields(ResultSet resultSet) throws SQLException {
    this.name = resultSet.getString(1);
    this.time = resultSet.getString(2);
  }

  // Writable: Hadoop's own serialization of this record.
  @Override
  public void write(DataOutput out) throws IOException {
    Text.writeString(out, name);
    Text.writeString(out, time);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    this.name = Text.readString(in);
    this.time = Text.readString(in);
  }

  @Override
  public String toString() {
    return name + "\t" + time;
  }
}
Next, write a Mapper. It simply turns each database record into a line of text:
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ImportMapper extends Mapper<LongWritable, StudentWritable, Text, NullWritable> {

  private NullWritable outVal = NullWritable.get();
  private Text outKey = new Text();

  // Each input value is one database row; emit it as a tab-separated line.
  @Override
  protected void map(LongWritable key, StudentWritable value, Context context)
      throws IOException, InterruptedException {
    outKey.set(value.toString());
    context.write(outKey, outVal);
  }
}
Finally, wire everything together and run it from a main method:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
// DBConfiguration/DBInputFormat here are Sqoop's bundled copies, which
// provide the six-argument configureDB with a fetch size; adjust the
// package if you use the stock org.apache.hadoop.mapreduce.lib.db classes.
import org.apache.sqoop.mapreduce.db.DBConfiguration;
import org.apache.sqoop.mapreduce.db.DBInputFormat;

/**
 * Imports data from Oracle into HDFS.
 */
public class ImportJob {

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setInt("mapred.map.tasks", 10); // ask for 10 map tasks

    // Configure the JDBC connection before constructing the Job, since the
    // Job constructor copies the Configuration. The Oracle JDBC driver jar
    // must be on the job's classpath.
    DBConfiguration.configureDB(conf, "oracle.jdbc.driver.OracleDriver",
        "jdbc:oracle:thin:@localhost:1521:orcl", "user", "user", 1000);

    Job job = new Job(conf, "student import");
    job.setJarByClass(ImportJob.class);

    // Delete the output path if it already exists, then set it.
    Path outputDir = new Path(args[0]);
    outputDir.getFileSystem(conf).delete(outputDir, true);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setMapperClass(ImportMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(0); // map-only job

    job.setInputFormatClass(DBInputFormat.class);
    DBInputFormat.setInput(job, StudentWritable.class,
        "select name,to_char(time,'yyyy-mm-dd hh24:mi:ss') time from student",
        "select count(*) from student");

    job.waitForCompletion(true);
  }
}
Having written all that, I ran it expecting everything to be fine. Instead, the MapReduce UI showed the job at 100% while one map task was still running; clicking into it, that task's progress already read 150%. What?! How can a task go past 100%? I kept watching as it climbed, and only somewhere past 900%, close to 1000%, did the task finally finish. Then I looked at the files the maps had written to HDFS: nine of them were roughly the same size, while the remaining one was huge, about ten times the size of the others.
The only thing left was to read the DBInputFormat source. Its getSplits() looks like this:
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  ResultSet results = null;
  Statement statement = null;
  try {
    statement = connection.createStatement();
    results = statement.executeQuery(getCountQuery());
    results.next();

    long count = results.getLong(1);
    int chunks = ConfigurationHelper.getJobNumMaps(job);
    long chunkSize = (count / chunks);

    results.close();
    statement.close();

    List<InputSplit> splits = new ArrayList<InputSplit>();

    // Split the rows into n-number of chunks and adjust the last chunk
    // accordingly
    for (int i = 0; i < chunks; i++) {
      DBInputSplit split;
      // note: when i == 0 this yields split = new DBInputSplit(0, chunkSize)
      if ((i + 1) == chunks) {
        split = new DBInputSplit(i * chunkSize, count);
      } else {
        split = new DBInputSplit(i * chunkSize, (i * chunkSize)
            + chunkSize);
      }
      splits.add(split);
    }

    connection.commit();
    return splits;
  } catch (SQLException e) {
    throw new IOException("Got SQLException", e);
  } finally {
    try {
      if (results != null) { results.close(); }
    } catch (SQLException e1) { /* ignored */ }
    try {
      if (statement != null) { statement.close(); }
    } catch (SQLException e1) { /* ignored */ }
    closeConnection();
  }
}
From this we can see that when i = 0, the split is new DBInputSplit(0, chunkSize), so for the first split split.getStart() == 0.
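To make this concrete, here is a tiny standalone simulation of the loop above, with made-up numbers (count = 1000 rows, 10 maps):

public class SplitDemo {
  public static void main(String[] args) {
    long count = 1000;  // assumed row count
    int chunks = 10;    // assumed number of map tasks
    long chunkSize = count / chunks;
    for (int i = 0; i < chunks; i++) {
      long start = i * chunkSize;
      long end = ((i + 1) == chunks) ? count : start + chunkSize;
      System.out.println("split " + i + ": start=" + start + ", end=" + end);
    }
    // Prints (0,100), (100,200), ..., (900,1000). The first split's start
    // is 0, which is exactly what trips up OracleDBRecordReader below.
  }
}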
Now look at OracleDBRecordReader's getSelectQuery() method:
/** Returns the query for selecting the records from an Oracle DB. */
protected String getSelectQuery() {
  StringBuilder query = new StringBuilder();
  DBConfiguration dbConf = getDBConf();
  String conditions = getConditions();
  String tableName = getTableName();
  String [] fieldNames = getFieldNames();

  // Oracle-specific codepath to use rownum instead of LIMIT/OFFSET.
  if (dbConf.getInputQuery() == null) {
    query.append("SELECT ");
    for (int i = 0; i < fieldNames.length; i++) {
      query.append(fieldNames[i]);
      if (i != fieldNames.length -1) {
        query.append(", ");
      }
    }
    query.append(" FROM ").append(tableName);
    if (conditions != null && conditions.length() > 0) {
      query.append(" WHERE ").append(conditions);
    }
    String orderBy = dbConf.getInputOrderBy();
    if (orderBy != null && orderBy.length() > 0) {
      query.append(" ORDER BY ").append(orderBy);
    }
  } else {
    //PREBUILT QUERY
    query.append(dbConf.getInputQuery());
  }

  try {
    DBInputFormat.DBInputSplit split = getSplit();
    // note: when split.getStart() == 0 this branch is skipped, so the
    // returned query selects ALL rows rather than one split's range
    if (split.getLength() > 0 && split.getStart() > 0) {
      String querystring = query.toString();
      query = new StringBuilder();
      query.append("SELECT * FROM (SELECT a.*,ROWNUM dbif_rno FROM ( ");
      query.append(querystring);
      query.append(" ) a WHERE rownum <= ").append(split.getStart());
      query.append(" + ").append(split.getLength());
      query.append(" ) WHERE dbif_rno >= ").append(split.getStart());
    }
  } catch (IOException ex) {
    // ignore, will not throw.
  }
  return query.toString();
}
From this we can see that when split.getStart() == 0, execution never enters the if (split.getLength() > 0 && split.getStart() > 0) branch, so the query returns the entire table. And as shown in DBInputFormat above, split.getStart() can indeed be 0: the map task for that first split imports every row into HDFS. This also explains the absurd progress figure: the first split's declared length is only a tenth of the table, yet its map reads the whole table, so its reported progress climbs toward 1000%.
Sure enough, the record count of the largest output file matched the row count returned by SQL in Oracle, confirming the guess. The fix is either to change getSplits() so that the first split's start is 1, or to change getSelectQuery() so that split.getStart() > 0 becomes split.getStart() >= 0.
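Below I take the first route and patch getSplits(). If you would rather take the second route without editing library source, one way is to subclass OracleDBRecordReader and wrap the superclass query yourself whenever start == 0. Here is a rough, untested sketch: the class name FirstSplitFixedReader is made up, the packages assume Sqoop's bundled copies of these classes, and you would still have to override createDBRecordReader() in the input format so this reader actually gets used. It also does not touch the duplicate-boundary problem described next.

import java.io.IOException;
import java.sql.Connection;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.sqoop.mapreduce.db.DBConfiguration;
import org.apache.sqoop.mapreduce.db.DBInputFormat;
import org.apache.sqoop.mapreduce.db.OracleDBRecordReader;

public class FirstSplitFixedReader<T extends DBWritable> extends OracleDBRecordReader<T> {

  // Mirrors the superclass constructor.
  public FirstSplitFixedReader(DBInputFormat.DBInputSplit split, Class<T> inputClass,
      Configuration conf, Connection conn, DBConfiguration dbConfig,
      String cond, String[] fields, String table) throws SQLException {
    super(split, inputClass, conf, conn, dbConfig, cond, fields, table);
  }

  @Override
  protected String getSelectQuery() {
    try {
      DBInputFormat.DBInputSplit split = getSplit();
      // For start == 0 the superclass returns the unbounded query,
      // so add the ROWNUM window here instead.
      if (split.getStart() == 0 && split.getLength() > 0) {
        return "SELECT * FROM (SELECT a.*,ROWNUM dbif_rno FROM ( "
            + super.getSelectQuery()
            + " ) a WHERE rownum <= " + split.getLength()
            + " ) WHERE dbif_rno >= 1";
      }
    } catch (IOException e) {
      // getLength() declares IOException but will not throw here
    }
    return super.getSelectQuery();
  }
}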
There is a second problem in this code. Take two adjacent splits, new DBInputSplit(10, 20) and new DBInputSplit(20, 30). getSelectQuery() produces these two queries for them:
SELECT * FROM (SELECT a.*,ROWNUM dbif_rno FROM (select name,to_char(time,'yyyy-mm-dd hh24:mi:ss') time from student) a WHERE rownum<=20) WHERE dbif_rno>=10;
SELECT * FROM (SELECT a.*,ROWNUM dbif_rno FROM (select name,to_char(time,'yyyy-mm-dd hh24:mi:ss') time from student) a WHERE rownum<=30) WHERE dbif_rno>=20;
The row with rownum = 20 is returned by both queries, i.e. it gets imported twice! Every pair of consecutive splits duplicates one row this way, because the previous split's end equals the next split's start. With 10 maps, the imported data therefore contains 10 - 1 = 9 more rows than the source table.
So let's write our own DBInputFormat subclass whose getSplits() fixes both problems:
import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
// ConfigurationHelper and this DBInputFormat come from Sqoop's jars.
import org.apache.sqoop.config.ConfigurationHelper;
import org.apache.sqoop.mapreduce.db.DBInputFormat;

public class CorrectDBInputFormat extends DBInputFormat<DBWritable> {

  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    ResultSet results = null;
    Statement statement = null;
    try {
      statement = getConnection().createStatement();
      results = statement.executeQuery(getCountQuery());
      results.next();

      long count = results.getLong(1);
      int chunks = ConfigurationHelper.getJobNumMaps(job);
      long chunkSize = (count / chunks);

      results.close();
      statement.close();

      List<InputSplit> splits = new ArrayList<InputSplit>();

      // Split the rows into n chunks, adjusting the last chunk accordingly.
      for (int i = 0; i < chunks; i++) {
        DBInputSplit split;
        // When i == 0, start the split at 1 instead of 0, so the first map
        // no longer imports the whole table. For every split except the
        // last, end one row early to prevent duplicate imports.
        if (i == 0) {
          split = new DBInputSplit(1, (i * chunkSize) + chunkSize - 1);
        } else if ((i + 1) == chunks) {
          split = new DBInputSplit(i * chunkSize, count);
        } else {
          split = new DBInputSplit(i * chunkSize, (i * chunkSize)
              + chunkSize - 1);
        }
        splits.add(split);
      }

      getConnection().commit();
      return splits;
    } catch (SQLException e) {
      throw new IOException("Got SQLException", e);
    } finally {
      try {
        if (results != null) {
          results.close();
        }
      } catch (SQLException e1) { /* ignored */ }
      try {
        if (statement != null) {
          statement.close();
        }
      } catch (SQLException e1) { /* ignored */ }
      closeConnection();
    }
  }
}
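As a quick sanity check, the same kind of simulation as before (again with made-up values count = 1000 and 10 maps) shows that the corrected splits now yield disjoint ROWNUM windows that cover every row exactly once:

public class CorrectedSplitDemo {
  public static void main(String[] args) {
    long count = 1000;  // assumed row count
    int chunks = 10;    // assumed number of map tasks
    long chunkSize = count / chunks;
    long total = 0;
    for (int i = 0; i < chunks; i++) {
      long start, end;
      if (i == 0) {
        start = 1;
        end = chunkSize - 1;
      } else if ((i + 1) == chunks) {
        start = i * chunkSize;
        end = count;
      } else {
        start = i * chunkSize;
        end = start + chunkSize - 1;
      }
      // The reader fetches rows with start <= ROWNUM <= start + (end - start),
      // i.e. rows start..end inclusive.
      long rows = end - start + 1;
      total += rows;
      System.out.println("split " + i + ": rows " + start + ".." + end + " (" + rows + ")");
    }
    System.out.println("total = " + total); // 1000: nothing missing, nothing doubled
  }
}

The windows come out as [1,99], [100,199], ..., [800,899], [900,1000], so adjacent splits no longer share a boundary row and the first split no longer starts at 0.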
Then use CorrectDBInputFormat in the main method:
// Imports are the same as in the ImportJob above, plus CorrectDBInputFormat.
public class ImportJob {

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setInt("mapred.map.tasks", 10);

    // Configure the JDBC connection before constructing the Job, since the
    // Job constructor copies the Configuration.
    DBConfiguration.configureDB(conf, "oracle.jdbc.driver.OracleDriver",
        "jdbc:oracle:thin:@localhost:1521:orcl", "user", "user", 1000);

    Job job = new Job(conf, "student import");
    job.setJarByClass(ImportJob.class);

    // Delete the output path if it already exists, then set it.
    Path outputDir = new Path(args[0]);
    outputDir.getFileSystem(conf).delete(outputDir, true);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setMapperClass(ImportMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(0);

    // setInputFormatClass must come after setInput, which would otherwise
    // reset the input format to the stock DBInputFormat.
    CorrectDBInputFormat.setInput(job, StudentWritable.class,
        "select name,to_char(time,'yyyy-mm-dd hh24:mi:ss') time from student",
        "select count(*) from student");
    job.setInputFormatClass(CorrectDBInputFormat.class);

    job.waitForCompletion(true);
  }
}