Background:
Our core business data currently lives in a MySQL database. A MapReduce job reads it out of the database (each record involves joins across dozens of tables, and the volume is on the order of several hundred thousand rows) and imports it into SolrCloud to power search. MapReduce is used here as the distributed task scheduler.
Topics covered:
1. How MapReduce adapts to the source data
2. How to read from MySQL, process, and import into SolrCloud in batches
3. Performance testing
How MapReduce adapts to the source data
This article does not cover MapReduce basics; for those, see
http://hadoop.apache.org/docs/r2.7.3/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html
First, the definition of the abstract class InputFormat:
public abstract class InputFormat<K, V> {
/**
* Logically split the set of input files for the job.
*
* <p>Each {@link InputSplit} is then assigned to an individual {@link Mapper}
* for processing.</p>
*
* <p><i>Note</i>: The split is a <i>logical</i> split of the inputs and the
* input files are not physically split into chunks. For e.g. a split could
* be <i><input-file-path, start, offset></i> tuple. The InputFormat
* also creates the {@link RecordReader} to read the {@link InputSplit}.
*
* @param context job configuration.
* @return an array of {@link InputSplit}s for the job.
*/
// How the source data is split and assigned to the individual mappers
public abstract
List<InputSplit> getSplits(JobContext context
) throws IOException, InterruptedException;
/**
* Create a record reader for a given split. The framework will call
* {@link RecordReader#initialize(InputSplit, TaskAttemptContext)} before
* the split is used.
* @param split the split to be read
* @param context the information about the task
* @return a new record reader
* @throws IOException
* @throws InterruptedException
*/
// How the source data is adapted: create a RecordReader that iterates over one split
public abstract
RecordReader<K,V> createRecordReader(InputSplit split,
TaskAttemptContext context
) throws IOException,
InterruptedException;
}
Next, the concrete implementation DBInputFormat, which uses a JDBC database connection driver to adapt data stored in a relational database:
/**
* A InputFormat that reads input data from an SQL table.
* <p>
* DBInputFormat emits LongWritables containing the record number as
* key and DBWritables as value.
*
* The SQL query, and input class can be using one of the two
* setInput methods.
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
public class DBInputFormat<T extends DBWritable>
extends InputFormat<LongWritable, T> implements Configurable {
private static final Log LOG = LogFactory.getLog(DBInputFormat.class);
protected String dbProductName = "DEFAULT";
protected String conditions;
protected Connection connection;
protected String tableName;
protected String[] fieldNames;
protected DBConfiguration dbConf;
/** {@inheritDoc} */
// Pull what this class needs from the configuration, e.g. database connection and query settings
public void setConf(Configuration conf) {
dbConf = new DBConfiguration(conf);
try {
this.connection = createConnection();
// Used to obtain the database product name, e.g. ORACLE or MYSQL
DatabaseMetaData dbMeta = connection.getMetaData();
this.dbProductName =
StringUtils.toUpperCase(dbMeta.getDatabaseProductName());
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
tableName = dbConf.getInputTableName();
fieldNames = dbConf.getInputFieldNames();
conditions = dbConf.getInputConditions();
}
public Connection getConnection() {
// TODO Remove this code that handles backward compatibility.
if (this.connection == null) {
this.connection = createConnection();
}
return this.connection;
}
public Connection createConnection() {
try {
// dbConf.getConnection() may hide some subtleties; we will look into it shortly
Connection newConnection = dbConf.getConnection();
// Everything executed before commit runs inside a single transaction
newConnection.setAutoCommit(false);
// Place a range lock on the data set so that other users cannot update it or insert rows into it until the transaction completes
newConnection.setTransactionIsolation(
Connection.TRANSACTION_SERIALIZABLE);
return newConnection;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
// Pick a record reader suited to the specific database product
protected RecordReader<LongWritable, T> createDBRecordReader(DBInputSplit split,
Configuration conf) throws IOException {
@SuppressWarnings("unchecked")
Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
try {
// use database product name to determine appropriate record reader.
if (dbProductName.startsWith("ORACLE")) {
// use Oracle-specific db reader.
return new OracleDBRecordReader<T>(split, inputClass,
conf, createConnection(), getDBConf(), conditions, fieldNames,
tableName);
} else if (dbProductName.startsWith("MYSQL")) {
// use MySQL-specific db reader.
// We will come back to MySQLDBRecordReader later
return new MySQLDBRecordReader<T>(split, inputClass,
conf, createConnection(), getDBConf(), conditions, fieldNames,
tableName);
} else {
// Generic reader.
return new DBRecordReader<T>(split, inputClass,
conf, createConnection(), getDBConf(), conditions, fieldNames,
tableName);
}
} catch (SQLException ex) {
throw new IOException(ex.getMessage());
}
}
/** {@inheritDoc} */
public RecordReader<LongWritable, T> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException, InterruptedException {
return createDBRecordReader((DBInputSplit) split, context.getConfiguration());
}
/** {@inheritDoc} */
// How the data is split
public List<InputSplit> getSplits(JobContext job) throws IOException {
ResultSet results = null;
Statement statement = null;
try {
statement = connection.createStatement();
results = statement.executeQuery(getCountQuery());
results.next();
long count = results.getLong(1);
// Note: the number of chunks is derived from the MRJobConfig.NUM_MAPS property
int chunks = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
long chunkSize = (count / chunks);
results.close();
statement.close();
List<InputSplit> splits = new ArrayList<InputSplit>();
// Split the rows into n-number of chunks and adjust the last chunk
// accordingly
// Split the data evenly across the chunks
for (int i = 0; i < chunks; i++) {
// DBInputSplit is examined later
DBInputSplit split;
if ((i + 1) == chunks)
split = new DBInputSplit(i * chunkSize, count);
else
split = new DBInputSplit(i * chunkSize, (i * chunkSize)
+ chunkSize);
splits.add(split);
}
// A transaction was opened above, so commit here
connection.commit();
return splits;
} catch (SQLException e) {
throw new IOException("Got SQLException", e);
} finally {
try {
if (results != null) { results.close(); }
} catch (SQLException e1) {}
try {
if (statement != null) { statement.close(); }
} catch (SQLException e1) {}
closeConnection();
}
}
/** Returns the query for getting the total number of rows,
* subclasses can override this for custom behaviour.*/
// Recommendation: do not set countQuery in dbConf (explained in the notes below)
protected String getCountQuery() {
if(dbConf.getInputCountQuery() != null) {
return dbConf.getInputCountQuery();
}
StringBuilder query = new StringBuilder();
query.append("SELECT COUNT(*) FROM " + tableName);
if (conditions != null && conditions.length() > 0)
query.append(" WHERE " + conditions);
return query.toString();
}
/**
* Initializes the map-part of the job with the appropriate input settings.
*
* @param job The map-reduce job
* @param inputClass the class object implementing DBWritable, which is the
* Java object holding tuple fields.
* @param tableName The table to read data from
* @param conditions The condition which to select data with,
* eg. '(updated > 20070101 AND length > 0)'
* @param orderBy the fieldNames in the orderBy clause.
* @param fieldNames The field names in the table
* @see #setInput(Job, Class, String, String)
*/
public static void setInput(Job job,
Class<? extends DBWritable> inputClass,
String tableName,String conditions,
String orderBy, String... fieldNames) {
job.setInputFormatClass(DBInputFormat.class);
DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
dbConf.setInputClass(inputClass);
dbConf.setInputTableName(tableName);
dbConf.setInputFieldNames(fieldNames);
dbConf.setInputConditions(conditions);
dbConf.setInputOrderBy(orderBy);
}
/**
* Initializes the map-part of the job with the appropriate input settings.
*
* @param job The map-reduce job
* @param inputClass the class object implementing DBWritable, which is the
* Java object holding tuple fields.
* @param inputQuery the input query to select fields. Example :
* "SELECT f1, f2, f3 FROM Mytable ORDER BY f1"
* @param inputCountQuery the input query that returns
* the number of records in the table.
* Example : "SELECT COUNT(f1) FROM Mytable"
* @see #setInput(Job, Class, String, String, String, String...)
*/
public static void setInput(Job job,
Class<? extends DBWritable> inputClass,
String inputQuery, String inputCountQuery) {
job.setInputFormatClass(DBInputFormat.class);
DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
dbConf.setInputClass(inputClass);
dbConf.setInputQuery(inputQuery);
dbConf.setInputCountQuery(inputCountQuery);
}
protected void closeConnection() {
try {
if (null != this.connection) {
this.connection.close();
this.connection = null;
}
} catch (SQLException sqlE) {
LOG.debug("Exception on close", sqlE);
}
}
}
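Before moving on, here is a minimal sketch of how a job might wire DBInputFormat up using the column-based setInput() overload. Everything except the Hadoop classes is hypothetical: ItemWritable, the "item" table, its fields, the condition, and the JDBC driver/URL/credentials are placeholders, and the mapper is just the identity Mapper.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
public class DBInputJobSketch {
  // One row of the hypothetical "item" table, materialized by the record reader.
  public static class ItemWritable implements Writable, DBWritable {
    long id;
    String name;
    public void readFields(ResultSet rs) throws SQLException {    // called once per row
      id = rs.getLong("id");
      name = rs.getString("name");
    }
    public void write(PreparedStatement ps) throws SQLException { // unused for input-only jobs
      ps.setLong(1, id);
      ps.setString(2, name);
    }
    public void readFields(DataInput in) throws IOException {
      id = in.readLong();
      name = in.readUTF();
    }
    public void write(DataOutput out) throws IOException {
      out.writeLong(id);
      out.writeUTF(name);
    }
  }
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // JDBC connection properties, read back later by dbConf.getConnection()
    DBConfiguration.configureDB(conf,
        "com.mysql.jdbc.Driver",                 // placeholder driver class
        "jdbc:mysql://db-host:3306/biz",         // placeholder URL
        "reader", "secret");                     // placeholder credentials
    conf.setInt(MRJobConfig.NUM_MAPS, 4);        // getSplits() divides the row count by this
    Job job = Job.getInstance(conf, "mysql-read-sketch");
    job.setJarByClass(DBInputJobSketch.class);
    DBInputFormat.setInput(job, ItemWritable.class,
        "item",                                  // tableName
        "status = 1",                            // conditions
        "id",                                    // orderBy
        "id", "name");                           // fieldNames
    job.setMapperClass(Mapper.class);            // identity mapper, just to show the wiring
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class); // this sketch only reads, writes nothing
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
Note that mapreduce.job.maps (MRJobConfig.NUM_MAPS) directly determines how many DBInputSplits getSplits() produces, and therefore how many connections will be opened against MySQL.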
Having read the code above, a few things deserve attention:
1. The database operations are transactional and a range lock is placed on the query's result set, so we need not worry about the data being modified while we read it.
2. Constraining the range of rows to read is awkward: it can only be done indirectly through countQuery or the conditions of the select query. Avoid setting countQuery, because after changing conditions it is easy to forget to update countQuery to match.
3. dbConf.getConnection()
4. DBInputSplit
5. MySQLDBRecordReader
Let's look at the code behind dbConf.getConnection() first; it is quite simple.
/** Returns a connection object o the DB
* @throws ClassNotFoundException
* @throws SQLException */
public Connection getConnection()
throws ClassNotFoundException, SQLException {
// The JDBC driver class is loaded here
Class.forName(conf.get(DBConfiguration.DRIVER_CLASS_PROPERTY));
if(conf.get(DBConfiguration.USERNAME_PROPERTY) == null) {
// The username and password may be embedded in the URL
return DriverManager.getConnection(
conf.get(DBConfiguration.URL_PROPERTY));
} else {
// Or they can be supplied separately
return DriverManager.getConnection(
conf.get(DBConfiguration.URL_PROPERTY),
conf.get(DBConfiguration.USERNAME_PROPERTY),
conf.get(DBConfiguration.PASSWORD_PROPERTY));
}
}
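These properties normally get into the configuration through DBConfiguration.configureDB(). A small sketch of both branches that getConnection() distinguishes (the driver class, URL, and credentials are placeholders):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
public class DBConfSketch {
  // Variant 1: credentials passed separately, so getConnection() takes the else-branch above.
  // configureDB writes DRIVER_CLASS_PROPERTY, URL_PROPERTY, USERNAME_PROPERTY and
  // PASSWORD_PROPERTY -- exactly the keys getConnection() reads back.
  public static Configuration withSeparateCredentials() {
    Configuration conf = new Configuration();
    DBConfiguration.configureDB(conf,
        "com.mysql.jdbc.Driver",                                  // placeholder driver class
        "jdbc:mysql://db-host:3306/biz",                          // placeholder URL
        "reader", "secret");                                      // placeholder credentials
    return conf;
  }
  // Variant 2: credentials embedded in the URL; USERNAME_PROPERTY stays unset,
  // so getConnection() takes the first branch above.
  public static Configuration withCredentialsInUrl() {
    Configuration conf = new Configuration();
    DBConfiguration.configureDB(conf,
        "com.mysql.jdbc.Driver",
        "jdbc:mysql://db-host:3306/biz?user=reader&password=secret");
    return conf;
  }
}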
Next, the DBInputSplit code:
public static class DBInputSplit extends InputSplit implements Writable {
private long end = 0;
private long start = 0;
/**
* Default Constructor
*/
public DBInputSplit() {
}
/**
* Convenience Constructor
* @param start the index of the first row to select
* @param end the index of the last row to select
*/
public DBInputSplit(long start, long end) {
this.start = start;
this.end = end;
}
/** {@inheritDoc} */
public String[] getLocations() throws IOException {
// TODO Add a layer to enable SQL "sharding" and support locality
return new String[] {};
}
/**
* @return The index of the first row to select
*/
public long getStart() {
return start;
}
/**
* @return The index of the last row to select
*/
public long getEnd() {
return end;
}
/**
* @return The total row count in this split
*/
public long getLength() throws IOException {
return end - start;
}
/** {@inheritDoc} */
public void readFields(DataInput input) throws IOException {
start = input.readLong();
end = input.readLong();
}
/** {@inheritDoc} */
public void write(DataOutput output) throws IOException {
output.writeLong(start);
output.writeLong(end);
}
}
Very simple: DBInputSplit records the start and end positions of the data segment to be processed, and since it implements the Writable interface it can be serialized and shipped between Hadoop nodes.
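As a quick, throwaway illustration (not part of DBInputFormat), this round trip is essentially what the framework does with each split before handing it to the node that will run the mapper:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit;
public class SplitRoundTrip {
  public static void main(String[] args) throws IOException {
    DBInputSplit original = new DBInputSplit(0, 50000);
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    original.write(new DataOutputStream(bytes));       // serializes two longs (start, end)
    DBInputSplit copy = new DBInputSplit();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    System.out.println(copy.getStart() + " .. " + copy.getEnd());  // 0 .. 50000
  }
}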
Finally, let's look at DBRecordReader.
public class DBRecordReader<T extends DBWritable> extends
RecordReader<LongWritable, T> {
private static final Log LOG = LogFactory.getLog(DBRecordReader.class);
private ResultSet results = null;
// Each database row is materialized as an instance of inputClass
private Class<T> inputClass;
private Configuration conf;
// Describes the data segment this reader handles, i.e. its start row and end row
private DBInputFormat.DBInputSplit split;
// Progress bookkeeping
private long pos = 0;
private LongWritable key = null;
private T value = null;
private Connection connection;
protected PreparedStatement statement;
private DBConfiguration dbConf;
private String conditions;
private String [] fieldNames;
private String tableName;
/**
* @param split The InputSplit to read data for
* @throws SQLException
*/
public DBRecordReader(DBInputFormat.DBInputSplit split,
Class<T> inputClass, Configuration conf, Connection conn, DBConfiguration dbConfig,
String cond, String [] fields, String table)
throws SQLException {
this.inputClass = inputClass;
this.split = split;
this.conf = conf;
this.connection = conn;
this.dbConf = dbConfig;
this.conditions = cond;
this.fieldNames = fields;
this.tableName = table;
}
protected ResultSet executeQuery(String query) throws SQLException {
// These settings are meant to stream the results: the database executes the query once,
// and the client keeps the connection open and reads rows one by one. This avoids blowing
// up memory by fetching too much at once, and is simpler and faster than paging with LIMIT.
this.statement = connection.prepareStatement(query,
ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
return statement.executeQuery();
}
/** Returns the query for selecting the records,
* subclasses can override this for custom behaviour.*/
// Assembles the SQL from the table, conditions, etc. configured in conf
protected String getSelectQuery() {
StringBuilder query = new StringBuilder();
// Default codepath for MySQL, HSQLDB, etc. Relies on LIMIT/OFFSET for splits.
if(dbConf.getInputQuery() == null) {
query.append("SELECT ");
for (int i = 0; i < fieldNames.length; i++) {
query.append(fieldNames[i]);
if (i != fieldNames.length -1) {
query.append(", ");
}
}
query.append(" FROM ").append(tableName);
query.append(" AS ").append(tableName); //in hsqldb this is necessary
if (conditions != null && conditions.length() > 0) {
query.append(" WHERE (").append(conditions).append(")");
}
String orderBy = dbConf.getInputOrderBy();
if (orderBy != null && orderBy.length() > 0) {
query.append(" ORDER BY ").append(orderBy);
}
} else {
//PREBUILT QUERY
query.append(dbConf.getInputQuery());
}
try {
// Note that the InputSplit supplies the LIMIT and OFFSET
query.append(" LIMIT ").append(split.getLength());
query.append(" OFFSET ").append(split.getStart());
} catch (IOException ex) {
// Ignore, will not throw.
}
return query.toString();
}
/** {@inheritDoc} */
public void close() throws IOException {
try {
if (null != results) {
results.close();
}
if (null != statement) {
statement.close();
}
if (null != connection) {
// Commit before closing, ending this query's transaction
connection.commit();
connection.close();
}
} catch (SQLException e) {
throw new IOException(e.getMessage());
}
}
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
//do nothing
}
/** {@inheritDoc} */
// Part of the iteration machinery
public LongWritable getCurrentKey() {
return key;
}
/** {@inheritDoc} */
// Part of the iteration machinery
public T getCurrentValue() {
return value;
}
/**
* @deprecated
*/
@Deprecated
public long getPos() throws IOException {
return pos;
}
/**
* @deprecated Use {@link #nextKeyValue()}
*/
@Deprecated
// Part of the iteration machinery
public boolean next(LongWritable key, T value) throws IOException {
this.key = key;
this.value = value;
return nextKeyValue();
}
/** {@inheritDoc} */
public float getProgress() throws IOException {
return pos / (float)split.getLength();
}
/** {@inheritDoc} */
// Part of the iteration machinery
public boolean nextKeyValue() throws IOException {
try {
if (key == null) {
key = new LongWritable();
}
if (value == null) {
value = createValue();
}
if (null == this.results) {
// First time into this method, run the query.
this.results = executeQuery(getSelectQuery());
}
if (!results.next())
return false;
// Set the key field value as the output key value
key.set(pos + split.getStart());
value.readFields(results);
pos ++;
} catch (SQLException e) {
throw new IOException("SQLException in nextKeyValue", e);
}
return true;
}
}
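The post has twice pointed at MySQLDBRecordReader without showing it. Its only real difference from the generic DBRecordReader is in executeQuery(): for MySQL, TYPE_FORWARD_ONLY and CONCUR_READ_ONLY alone do not give you streaming, because Connector/J buffers the whole result set by default; the fetch size must also be set to Integer.MIN_VALUE. Paraphrased from memory of the Hadoop 2.7.x source (getConnection() is a protected accessor on DBRecordReader that is not shown in the excerpt above), it looks roughly like this:
package org.apache.hadoop.mapreduce.lib.db;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
public class MySQLDBRecordReader<T extends DBWritable> extends DBRecordReader<T> {
  public MySQLDBRecordReader(DBInputFormat.DBInputSplit split,
      Class<T> inputClass, Configuration conf, Connection conn, DBConfiguration dbConfig,
      String cond, String [] fields, String table) throws SQLException {
    super(split, inputClass, conf, conn, dbConfig, cond, fields, table);
  }
  // Execute statements for MySQL in unbuffered (streaming) mode.
  protected ResultSet executeQuery(String query) throws SQLException {
    statement = getConnection().prepareStatement(query,
        ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
    statement.setFetchSize(Integer.MIN_VALUE); // tell Connector/J to stream row by row
    return statement.executeQuery();
  }
}
So the streaming behaviour described in the executeQuery() comment above holds for MySQL only because createDBRecordReader() selects this subclass when the product name starts with "MYSQL".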
Things to note:
1. Each mapper opens exactly one database connection and streams rows over it. The upside is performance; the downside is that if the connection misbehaves halfway through the read (a network blip, for example), that read is lost. Exception handling here is not fine-grained: a dropped connection fails the task and Hadoop presumably re-runs the whole mapper, so fault tolerance is coarse, and in practice the number of mappers should be chosen according to the data volume and network conditions.
2. The same issue as before: control over the data range is weak, and there is no convenient way to set a precise LIMIT and OFFSET.
After walking through DBInputFormat we can be sure of two things. First, by overriding InputFormat, InputSplit, and RecordReader we can freely control how source data is adapted into MapReduce. Second, DBInputFormat reads the database transactionally and in a streaming fashion, holding one connection and submitting one query per mapper.
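To make the first point concrete, here is a rough, hypothetical sketch of taking back some control over the data range by subclassing DBInputFormat and trimming the splits it produces. The property name bounded.db.max.rows is invented for this example; this is a sketch, not a production-ready implementation.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
// Hypothetical: caps the total number of rows read without touching
// conditions or countQuery, by shrinking the stock splits after the fact.
public class BoundedDBInputFormat<T extends DBWritable> extends DBInputFormat<T> {
  public static final String MAX_ROWS = "bounded.db.max.rows"; // invented property
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    long maxRows = job.getConfiguration().getLong(MAX_ROWS, Long.MAX_VALUE);
    List<InputSplit> bounded = new ArrayList<InputSplit>();
    for (InputSplit s : super.getSplits(job)) {
      DBInputSplit d = (DBInputSplit) s;
      if (d.getStart() >= maxRows) {
        continue;                                             // entirely past the cap: drop it
      }
      // keep the split, shrinking the one that crosses the cap
      bounded.add(new DBInputSplit(d.getStart(), Math.min(d.getEnd(), maxRows)));
    }
    return bounded;
  }
}
To use it, call DBInputFormat.setInput(...) as usual and then job.setInputFormatClass(BoundedDBInputFormat.class) afterwards, since setInput() registers the stock DBInputFormat.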
The next post covers how to handle the multi-table join queries and how to import the data into SolrCloud in batches.