Using MapReduce to load data from a traditional relational database such as Oracle or MySQL into a distributed store such as HDFS or Hive is a very common business requirement. Besides writing your own custom InputFormat, Hadoop provides the DBInputFormat class. The idea behind it is simple: its getSplits method determines how the source rows in Oracle or MySQL are partitioned and spread across N map tasks.
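Before walking through the source, here is a minimal driver sketch of how the class is typically wired up. The employees table, its id/name columns, and the EmployeeWritable record class are hypothetical placeholders for your own schema; DBConfiguration.configureDB and DBInputFormat.setInput are the real Hadoop API calls examined below.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DbToHdfsDriver {

  // Record class for the hypothetical "employees" table; it must implement
  // Writable (Hadoop serialization) and DBWritable (JDBC marshalling).
  public static class EmployeeWritable implements Writable, DBWritable {
    private long id;
    private String name;

    public void readFields(ResultSet rs) throws SQLException { // one row -> fields
      id = rs.getLong("id");
      name = rs.getString("name");
    }
    public void write(PreparedStatement ps) throws SQLException { // fields -> SQL params
      ps.setLong(1, id);
      ps.setString(2, name);
    }
    public void readFields(DataInput in) throws IOException {
      id = in.readLong();
      name = in.readUTF();
    }
    public void write(DataOutput out) throws IOException {
      out.writeLong(id);
      out.writeUTF(name);
    }
    @Override public String toString() { return id + "\t" + name; }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // JDBC driver class, connection URL, user, password.
    DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
        "jdbc:mysql://dbhost:3306/mydb", "user", "pass");

    Job job = Job.getInstance(conf, "db-to-hdfs");
    job.setJarByClass(DbToHdfsDriver.class);

    // setInput() also registers DBInputFormat as the job's input format.
    DBInputFormat.setInput(job, EmployeeWritable.class,
        "employees", null /* conditions */, "id" /* orderBy */,
        "id", "name");

    // Map-only identity job: TextOutputFormat writes "<offset>\t<record>" lines.
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(EmployeeWritable.class);
    FileOutputFormat.setOutputPath(job, new Path("/data/employees"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

With that context, here is the source of DBInputFormat itself: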
public class DBInputFormat<T extends DBWritable> extends InputFormat<LongWritable, T>
implements Configurable{
public static class DBInputSplit extends InputSplit
implements Writable
{
public String[] getLocations()
throws IOException
{
return new String[0];
}
public long getStart()
{
return start;
}
public long getEnd()
{
return end;
}
public long getLength()
throws IOException
{
return end - start;
}
public void readFields(DataInput input)
throws IOException
{
start = input.readLong();
end = input.readLong();
}
public void write(DataOutput output)
throws IOException
{
output.writeLong(start);
output.writeLong(end);
}
private long end;
private long start;
public DBInputSplit()
{
end = 0L;
start = 0L;
}
public DBInputSplit(long start, long end)
{
this.start = start;
this.end = end;
}
}
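// Note: a DBInputSplit carries only a [start, end) row range -- never the rows
// themselves -- and getLocations() returns an empty array because a database
// table has no HDFS-style data locality. The record readers later translate
// the range into pagination SQL (LIMIT ... OFFSET ... on MySQL, ROWNUM
// subqueries on Oracle).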
public static class NullDBWritable
implements DBWritable, Writable
{
public void readFields(DataInput datainput)
throws IOException
{
}
public void readFields(ResultSet resultset)
throws SQLException
{
}
public void write(DataOutput dataoutput)
throws IOException
{
}
public void write(PreparedStatement preparedstatement)
throws SQLException
{
}
public NullDBWritable()
{
}
}
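// NullDBWritable is a do-nothing placeholder that satisfies both interfaces;
// it is the default input class when no real DBWritable has been configured.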
public DBInputFormat()
{
dbProductName = "DEFAULT";
}
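// setConf() is invoked by the framework right after the input format is
// instantiated (DBInputFormat implements Configurable). It opens a connection
// just to sniff the database product name, which createDBRecordReader() below
// uses to pick a vendor-specific reader.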
public void setConf(Configuration conf)
{
dbConf = new DBConfiguration(conf);
try
{
getConnection();
DatabaseMetaData dbMeta = connection.getMetaData();
dbProductName = dbMeta.getDatabaseProductName().toUpperCase();
}
catch(Exception ex)
{
throw new RuntimeException(ex);
}
tableName = dbConf.getInputTableName();
fieldNames = dbConf.getInputFieldNames();
conditions = dbConf.getInputConditions();
}
public Configuration getConf()
{
return dbConf.getConf();
}
public DBConfiguration getDBConf()
{
return dbConf;
}
public Connection getConnection()
{
try
{
if(null == connection)
{
connection = dbConf.getConnection();
connection.setAutoCommit(false);
// Connection.TRANSACTION_SERIALIZABLE (the JDBC constant whose value is 8):
// the count query and subsequent reads run in one serializable,
// non-auto-commit transaction, so they are consistent within this connection.
connection.setTransactionIsolation(Connection.TRANSACTION_SERIALIZABLE);
}
}
catch(Exception e)
{
throw new RuntimeException(e);
}
return connection;
}
public String getDBProductName()
{
return dbProductName;
}
protected RecordReader createDBRecordReader(DBInputSplit split, Configuration conf)
throws IOException
{
Class inputClass = dbConf.getInputClass();
try
{
// Pick a vendor-specific reader when the product name matches;
// otherwise fall back to the generic JDBC reader.
if(dbProductName.startsWith("ORACLE"))
return new OracleDBRecordReader(split, inputClass, conf, getConnection(), getDBConf(), conditions, fieldNames, tableName);
if(dbProductName.startsWith("MYSQL"))
return new MySQLDBRecordReader(split, inputClass, conf, getConnection(), getDBConf(), conditions, fieldNames, tableName);
return new DBRecordReader(split, inputClass, conf, getConnection(), getDBConf(), conditions, fieldNames, tableName);
}
catch(SQLException ex)
{
throw new IOException(ex.getMessage());
}
}
public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException
{
return createDBRecordReader((DBInputSplit)split, context.getConfiguration());
}
/**
getSplits() derives the split size from the total row count of the source
table and the user-configured number of map tasks. Note that DBInputFormat
never hands the actual rows to the map tasks here -- each split is just a
row range; if the real data were materialized at planning time, the
ApplicationMaster would immediately fail with an OOM error. (A worked
example of the chunking math follows the listing.)
**/
public List getSplits(JobContext job) throws IOException
{
ResultSet results = null;
Statement statement = null;
try
{
// Count the source rows with getCountQuery(), SELECT COUNT(*) by default.
statement = connection.createStatement();
results = statement.executeQuery(getCountQuery());
results.next();
long count = results.getLong(1);
// One chunk per requested map task.
int chunks = job.getConfiguration().getInt("mapreduce.job.maps", 1);
long chunkSize = count / (long)chunks;
results.close();
statement.close();
List splits = new ArrayList();
// Split the rows into n chunks; the last chunk absorbs the remainder
// left over by the integer division above.
for(int i = 0; i < chunks; i++)
{
DBInputSplit split;
if(i + 1 == chunks)
split = new DBInputSplit((long)i * chunkSize, count);
else
split = new DBInputSplit((long)i * chunkSize, (long)i * chunkSize + chunkSize);
splits.add(split);
}
connection.commit();
return splits;
}
catch(SQLException e)
{
throw new IOException("Got SQLException", e);
}
finally
{
try
{
if(results != null)
results.close();
}
catch(SQLException e1) { }
try
{
if(statement != null)
statement.close();
}
catch(SQLException e1) { }
closeConnection();
}
}
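// getCountQuery() builds the row-count SQL used by getSplits() above:
// SELECT COUNT(*) FROM <table> [WHERE <conditions>]. For large tables where
// COUNT(*) is expensive, a cheaper query can be supplied via
// DBConfiguration.setInputCountQuery().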
protected String getCountQuery()
{
if(dbConf.getInputCountQuery() != null)
return dbConf.getInputCountQuery();
StringBuilder query = new StringBuilder();
query.append((new StringBuilder()).append("SELECT COUNT(*) FROM ").append(tableName).toString());
if(conditions != null && conditions.length() > 0)
query.append((new StringBuilder()).append(" WHERE ").append(conditions).toString());
return query.toString();
}
public static void setInput(Job job, Class inputClass, String tableName, String conditions, String orderBy, String... fieldNames)
{
job.setInputFormatClass(DBInputFormat.class);
DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
dbConf.setInputClass(inputClass);
dbConf.setInputTableName(tableName);
dbConf.setInputFieldNames(fieldNames);
dbConf.setInputConditions(conditions);
dbConf.setInputOrderBy(orderBy);
}
public static void setInput(Job job, Class inputClass, String inputQuery, String inputCountQuery)
{
job.setInputFormatClass(DBInputFormat.class);
DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
dbConf.setInputClass(inputClass);
dbConf.setInputQuery(inputQuery);
dbConf.setInputCountQuery(inputCountQuery);
}
protected void closeConnection()
{
try
{
if(null != connection)
{
connection.close();
connection = null;
}
}
catch(SQLException sqlE)
{
LOG.debug("Exception on close", sqlE);
}
}
private static final Log LOG = LogFactory.getLog(DBInputFormat.class);
protected String dbProductName;
protected String conditions;
protected Connection connection;
protected String tableName;
protected String fieldNames[];
protected DBConfiguration dbConf;
}
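To make the chunking arithmetic in getSplits concrete, the standalone snippet below re-states the same loop (an illustration of the logic above, not Hadoop code). With count = 100 rows and mapreduce.job.maps = 3, chunkSize = 100 / 3 = 33, and the splits come out as [0, 33), [33, 66) and [66, 100): the last split runs to count, so the rows lost to integer division are not dropped.

// Standalone re-statement of the chunking loop in getSplits().
public class SplitMathDemo {
  public static void main(String[] args) {
    long count = 100; // total rows reported by getCountQuery()
    int chunks = 3;   // mapreduce.job.maps
    long chunkSize = count / chunks; // 33
    for (int i = 0; i < chunks; i++) {
      long start = i * chunkSize;
      // The last chunk extends to count, absorbing the division remainder.
      long end = (i + 1 == chunks) ? count : start + chunkSize;
      System.out.printf("split %d: [%d, %d) length=%d%n", i, start, end, end - start);
    }
  }
}

Each range is later translated into pagination SQL by the record reader chosen in createDBRecordReader, e.g. LIMIT 34 OFFSET 66 on MySQL for the last split above.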