Using MapReduce to load data from a traditional relational database such as Oracle or MySQL into a distributed store such as HDFS or Hive is a very common business requirement. Besides writing your own custom InputFormat, Hadoop provides the DBInputFormat class. The idea behind it is simple: its getSplits method determines how the source rows in Oracle or MySQL are partitioned and spread across N map tasks.
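Before walking through the source, here is a minimal driver sketch of how the class is typically wired up. The employees table, its id/name columns, and the EmployeeWritable record class are hypothetical placeholders for your own schema; DBConfiguration.configureDB and DBInputFormat.setInput are the real Hadoop API calls examined below.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DbToHdfsDriver {

  // Record class for the hypothetical "employees" table; it must implement
  // Writable (Hadoop serialization) and DBWritable (JDBC marshalling).
  public static class EmployeeWritable implements Writable, DBWritable {
    private long id;
    private String name;

    public void readFields(ResultSet rs) throws SQLException { // one row -> fields
      id = rs.getLong("id");
      name = rs.getString("name");
    }
    public void write(PreparedStatement ps) throws SQLException { // fields -> SQL params
      ps.setLong(1, id);
      ps.setString(2, name);
    }
    public void readFields(DataInput in) throws IOException {
      id = in.readLong();
      name = in.readUTF();
    }
    public void write(DataOutput out) throws IOException {
      out.writeLong(id);
      out.writeUTF(name);
    }
    @Override public String toString() { return id + "\t" + name; }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // JDBC driver class, connection URL, user, password.
    DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
        "jdbc:mysql://dbhost:3306/mydb", "user", "pass");

    Job job = Job.getInstance(conf, "db-to-hdfs");
    job.setJarByClass(DbToHdfsDriver.class);

    // setInput() also registers DBInputFormat as the job's input format.
    DBInputFormat.setInput(job, EmployeeWritable.class,
        "employees", null /* conditions */, "id" /* orderBy */,
        "id", "name");

    // Map-only identity job: TextOutputFormat writes "<offset>\t<record>" lines.
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(EmployeeWritable.class);
    FileOutputFormat.setOutputPath(job, new Path("/data/employees"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

With that context, here is the source of DBInputFormat itself: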
public class DBInputFormat<T extends DBWritable> extends InputFormat<LongWritable, T>
implements Configurable{
public static class DBInputSplit extends InputSplit
implements Writable
{
public String[] getLocations()
throws IOException
{
return new String[0];
}
public long getStart()
{
return start;
}
public long getEnd()
{
return end;
}
public long getLength()
throws IOException
{
return end - start;
}
public void readFields(DataInput input)
throws IOException
{
start = input.readLong();
end = input.readLong();
}
public void write(DataOutput output)
throws IOException
{
output.writeLong(start);
output.writeLong(end);
}
private long end;
private long start;
public DBInputSplit()
{
end = 0L;
start = 0L;
}
public DBInputSplit(long start, long end)
{
this.start = start;
this.end = end;
}
}
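// Note: a DBInputSplit carries only a [start, end) row range -- never the rows
// themselves -- and getLocations() returns an empty array because a database
// table has no HDFS-style data locality. The record readers later translate
// the range into pagination SQL (LIMIT ... OFFSET ... on MySQL, ROWNUM
// subqueries on Oracle).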
public static class NullDBWritable
implements DBWritable, Writable
{
public void readFields(DataInput datainput)
throws IOException
{
}
public void readFields(ResultSet resultset)
throws SQLException
{
}
public void write(DataOutput dataoutput)
throws IOException
{
}
public void write(PreparedStatement preparedstatement)
throws SQLException
{
}
public NullDBWritable()
{
}
}
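// NullDBWritable is a do-nothing placeholder that satisfies both interfaces;
// it is the default input class when no real DBWritable has been configured.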
public DBInputFormat()
{
dbProductName = "DEFAULT";
}
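// setConf() is invoked by the framework right after the input format is
// instantiated (DBInputFormat implements Configurable). It opens a connection
// just to sniff the database product name, which createDBRecordReader() below
// uses to pick a vendor-specific reader.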
public void setConf(Configuration conf)
{
dbConf = new DBConfiguration(conf);
try
{
getConnection();
DatabaseMetaData dbMeta = connection.getMetaData();
dbProductName = dbMeta.getDatabaseProductName().toUpperCase();
}
catch(Exception ex)
{
throw new RuntimeException(ex);
}
tableName = dbConf.getInputTableName();
fieldNames = dbConf.getInputFieldNames();
conditions = dbConf.getInputConditions();
}
public Configuration getConf()
{
return dbConf.getConf();
}
public DBConfiguration getDBConf()
{
return dbConf;
}
public Connection getConnection()
{
try
{
if(null == connection)
{
connection = dbConf.getConnection();
connection.setAutoCommit(false);
// Connection.TRANSACTION_SERIALIZABLE (the JDBC constant whose value is 8):
// the count query and subsequent reads run in one serializable,
// non-auto-commit transaction, so they are consistent within this connection.
connection.setTransactionIsolation(Connection.TRANSACTION_SERIALIZABLE);
}
}
catch(Exception e)
{
throw new RuntimeException(e);
}
return connection;
}
public String getDBProductName()
{
return dbProductName;
}
protected RecordReader createDBRecordReader(DBInputSplit split, Configuration conf)
throws IOException
{
Class inputClass = dbConf.getInputClass();
try
{
// Pick a vendor-specific reader when the product name matches;
// otherwise fall back to the generic JDBC reader.
if(dbProductName.startsWith("ORACLE"))
return new OracleDBRecordReader(split, inputClass, conf, getConnection(), getDBConf(), conditions, fieldNames, tableName);
if(dbProductName.startsWith("MYSQL"))
return new MySQLDBRecordReader(split, inputClass, conf, getConnection(), getDBConf(), conditions, fieldNames, tableName);
return new DBRecordReader(split, inputClass, conf, getConnection(), getDBConf(), conditions, fieldNames, tableName);
}
catch(SQLException ex)
{
throw new IOException(ex.getMessage());
}
}
public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException
{
return createDBRecordReader((DBInputSplit)split, context.getConfiguration());
}
/**
getSplits() derives the split size from the total row count of the source
table and the user-configured number of map tasks. Note that DBInputFormat
never hands the actual rows to the map tasks here -- each split is just a
row range; if the real data were materialized at planning time, the
ApplicationMaster would immediately fail with an OOM error. (A worked
example of the chunking math follows the listing.)
**/
public List getSplits(JobContext job) throws IOException
{
ResultSet results = null;
Statement statement = null;
try
{
// Count the source rows with getCountQuery(), SELECT COUNT(*) by default.
statement = connection.createStatement();
results = statement.executeQuery(getCountQuery());
results.next();
long count = results.getLong(1);
// One chunk per requested map task.
int chunks = job.getConfiguration().getInt("mapreduce.job.maps", 1);
long chunkSize = count / (long)chunks;
results.close();
statement.close();
List splits = new ArrayList();
// Split the rows into n chunks; the last chunk absorbs the remainder
// left over by the integer division above.
for(int i = 0; i < chunks; i++)
{
DBInputSplit split;
if(i + 1 == chunks)
split = new DBInputSplit((long)i * chunkSize, count);
else
split = new DBInputSplit((long)i * chunkSize, (long)i * chunkSize + chunkSize);
splits.add(split);
}
connection.commit();
return splits;
}
catch(SQLException e)
{
throw new IOException("Got SQLException", e);
}
finally
{
try
{
if(results != null)
results.close();
}
catch(SQLException e1) { }
try
{
if(statement != null)
statement.close();
}
catch(SQLException e1) { }
closeConnection();
}
}
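// getCountQuery() builds the row-count SQL used by getSplits() above:
// SELECT COUNT(*) FROM <table> [WHERE <conditions>]. For large tables where
// COUNT(*) is expensive, a cheaper query can be supplied via
// DBConfiguration.setInputCountQuery().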
protected String getCountQuery()
{
if(dbConf.getInputCountQuery() != null)
return dbConf.getInputCountQuery();
StringBuilder query = new StringBuilder();
query.append((new StringBuilder()).append("SELECT COUNT(*) FROM ").append(tableName).toString());
if(conditions != null && conditions.length() > 0)
query.append((new StringBuilder()).append(" WHERE ").append(conditions).toString());
return query.toString();
}
public static void setInput(Job job, Class inputClass, String tableName, String conditions, String orderBy, String... fieldNames)
{
job.setInputFormatClass(DBInputFormat.class);
DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
dbConf.setInputClass(inputClass);
dbConf.setInputTableName(tableName);
dbConf.setInputFieldNames(fieldNames);
dbConf.setInputConditions(conditions);
dbConf.setInputOrderBy(orderBy);
}
public static void setInput(Job job, Class inputClass, String inputQuery, String inputCountQuery)
{
job.setInputFormatClass(DBInputFormat.class);
DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
dbConf.setInputClass(inputClass);
dbConf.setInputQuery(inputQuery);
dbConf.setInputCountQuery(inputCountQuery);
}
protected void closeConnection()
{
try
{
if(null != connection)
{
connection.close();
connection = null;
}
}
catch(SQLException sqlE)
{
LOG.debug("Exception on close", sqlE);
}
}
private static final Log LOG = LogFactory.getLog(DBInputFormat.class);
protected String dbProductName;
protected String conditions;
protected Connection connection;
protected String tableName;
protected String fieldNames[];
protected DBConfiguration dbConf;
}
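To make the chunking arithmetic in getSplits concrete, the standalone snippet below re-states the same loop (an illustration of the logic above, not Hadoop code). With count = 100 rows and mapreduce.job.maps = 3, chunkSize = 100 / 3 = 33, and the splits come out as [0, 33), [33, 66) and [66, 100): the last split runs to count, so the rows lost to integer division are not dropped.

// Standalone re-statement of the chunking loop in getSplits().
public class SplitMathDemo {
  public static void main(String[] args) {
    long count = 100; // total rows reported by getCountQuery()
    int chunks = 3;   // mapreduce.job.maps
    long chunkSize = count / chunks; // 33
    for (int i = 0; i < chunks; i++) {
      long start = i * chunkSize;
      // The last chunk extends to count, absorbing the division remainder.
      long end = (i + 1 == chunks) ? count : start + chunkSize;
      System.out.printf("split %d: [%d, %d) length=%d%n", i, start, end, end - start);
    }
  }
}

Each range is later translated into pagination SQL by the record reader chosen in createDBRecordReader, e.g. LIMIT 34 OFFSET 66 on MySQL for the last split above.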