MapReduce works best when all the input data is staged up front: the mapper processes records one by one, and the reducer aggregates the results. As the saying goes, an army marches on its stomach; only with the data fully prepared can MapReduce compute its result smoothly. The problem here is that processing a single record in the mapper requires several database queries plus a write to SolrCloud. Handled strictly one record at a time, each record becomes very expensive to produce. One solution is to buffer records inside the mapper and process them in batches.
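The shape of the idea fits in a few lines: buffer records in map(), flush whenever the buffer fills, and flush once more in cleanup() so the tail of the input is not lost. A minimal sketch of the pattern (the class name and the abstract batchProcess() are illustrative only; the real implementation follows below):

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Sketch only: buffer in map(), flush on a size threshold, flush again in cleanup().
public abstract class BatchingMapper extends Mapper<LongWritable, ServiceRecord, Text, IntWritable> {
    private static final int MAX_CAPACITY = 100;
    private final Map<Integer, ServiceRecord> buffer = new HashMap<Integer, ServiceRecord>(MAX_CAPACITY);

    @Override
    protected void map(LongWritable key, ServiceRecord value, Context context) {
        // clone() because Hadoop reuses the value instance between calls
        buffer.put(value.saleId, value.clone());
        if (buffer.size() >= MAX_CAPACITY) {
            flush();
        }
    }

    @Override
    protected void cleanup(Context context) {
        if (!buffer.isEmpty()) {
            flush(); // don't lose the final partial batch
        }
    }

    private void flush() {
        batchProcess(buffer); // query related tables + index to Solr
        buffer.clear();
    }

    protected abstract void batchProcess(Map<Integer, ServiceRecord> batch);
}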
With that shape in mind, here is the full code.
The ServiceRecord class:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.UUID;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

public class ServiceRecord implements Writable, DBWritable {
    // Base fields, read from table xx_service
    int saleId;
    String g_sid;
    String title;
    String content;
    // Location fields, read from table xx_service_info
    String provinceName;
    String cityName;
    String townName;

    // clone() is required because the record reader recycles the same
    // ServiceRecord instance while the mapper iterates over the input.
    public ServiceRecord clone() {
        ServiceRecord serviceRecord = new ServiceRecord();
        serviceRecord.saleId = saleId;
        serviceRecord.g_sid = g_sid;
        serviceRecord.title = title;
        serviceRecord.content = content;
        serviceRecord.provinceName = provinceName;
        serviceRecord.cityName = cityName;
        serviceRecord.townName = townName;
        return serviceRecord;
    }

    // Unused: this record is never shuffled, so (de)serialization is a no-op.
    public void readFields(DataInput dataInput) throws IOException {
    }

    public void write(DataOutput dataOutput) throws IOException {
    }

    // Unused: the job only reads from the database, it never writes back.
    public void write(PreparedStatement statement) throws SQLException {
    }

    public void readFields(ResultSet resultSet) throws SQLException {
        saleId = resultSet.getInt("saleId");
        title = resultSet.getString("title");
        content = resultSet.getString("content");
        g_sid = UUID.nameUUIDFromBytes((saleId + "").getBytes()).toString();
    }

    public void readPositionFields(ResultSet resultSet) throws SQLException {
        provinceName = resultSet.getString("provinceName");
        cityName = resultSet.getString("cityName");
        townName = resultSet.getString("townName");
    }
}
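A side note on g_sid: UUID.nameUUIDFromBytes produces a name-based (type 3) UUID, so the same saleId always yields the same document id. Assuming g_sid is the collection's uniqueKey (a schema detail not shown in this post), re-running the job then overwrites existing Solr documents instead of duplicating them:

import java.util.UUID;

public class UuidDemo {
    public static void main(String[] args) {
        // nameUUIDFromBytes is deterministic: same bytes, same UUID
        UUID first = UUID.nameUUIDFromBytes("12345".getBytes());
        UUID second = UUID.nameUUIDFromBytes("12345".getBytes());
        System.out.println(first.equals(second)); // always true
    }
}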
The ServiceMapper class:
import java.io.IOException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ServiceMapper extends Mapper<LongWritable, ServiceRecord, Text, IntWritable> {
    // After every MAX_CAPACITY buffered records, batch-query the related
    // table and submit the batch to Solr.
    private static final int MAX_CAPACITY = 100;
    // Total number of records submitted to Solr so far
    private int commitCount;
    // Buffered ServiceRecords, keyed by saleId
    private Map<Integer, ServiceRecord> serviceRecordMap;

    @Override
    protected void setup(Mapper<LongWritable, ServiceRecord, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        super.setup(context);
        System.out.println("setup");
        commitCount = 0;
        serviceRecordMap = new HashMap<Integer, ServiceRecord>(MAX_CAPACITY);
        Configuration conf = context.getConfiguration();
        // init SQLUtil
        String sqlUrl = conf.get("mysql.url");
        String sqlUser = conf.get("mysql.user");
        String sqlPassword = conf.get("mysql.password");
        SQLUtil.instance().setDbUrl(sqlUrl);
        SQLUtil.instance().setDbUser(sqlUser);
        SQLUtil.instance().setDbPass(sqlPassword);
        SQLUtil.instance().init();
        // init SolrUtil
        String solrUrl = conf.get("solr.url");
        SolrUtil.instance().setUrl(solrUrl);
        SolrUtil.instance().init();
    }

    @Override
    protected void map(LongWritable key, ServiceRecord value,
            Mapper<LongWritable, ServiceRecord, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Note the clone() of value here: the framework recycles the value instance.
        serviceRecordMap.put(value.saleId, value.clone());
        if (serviceRecordMap.size() >= MAX_CAPACITY) {
            // Process the buffered batch
            batchProcess();
        }
    }

    @Override
    protected void cleanup(Mapper<LongWritable, ServiceRecord, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Flush the remaining buffered records
        if (!serviceRecordMap.isEmpty()) {
            batchProcess();
        }
        // Solr hard commit
        SolrUtil.instance().commit(false);
        super.cleanup(context);
    }

    private boolean batchProcess() {
        // 1. Query the related table
        getPositionInfo();
        // 2. Push the batch to Solr
        updateSolr();
        // 3. Solr soft commit
        SolrUtil.instance().commit(true);
        // 4. Increase commitCount
        commitCount += serviceRecordMap.size();
        // 5. Clear the buffer
        serviceRecordMap.clear();
        System.out.println("commit: " + commitCount);
        return true;
    }

    private boolean getPositionInfo() {
        if (serviceRecordMap.isEmpty()) {
            return true;
        }
        // Fetch a pooled connection
        Connection connection = SQLUtil.instance().getConnection();
        if (connection == null) {
            return false;
        }
        String sql = "SELECT service_id as saleId, province_name as provinceName, city_name as cityName, "
                + "town_name as townName from xx_service_info where service_id in ("
                + StringUtils.join(serviceRecordMap.keySet(), ',') + ")";
        Statement statement = null;
        ResultSet resultSet = null;
        try {
            statement = connection.createStatement();
            resultSet = statement.executeQuery(sql);
            while (resultSet.next()) {
                int saleId = resultSet.getInt("saleId");
                ServiceRecord serviceRecord = serviceRecordMap.get(saleId);
                if (serviceRecord != null) {
                    serviceRecord.readPositionFields(resultSet);
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        } finally {
            SQLUtil.instance().releaseResourceNoThrow(resultSet, statement, connection);
        }
        return true;
    }

    private boolean updateSolr() {
        if (serviceRecordMap.isEmpty()) {
            return true;
        }
        List<ServiceRecord> serviceRecords = new ArrayList<ServiceRecord>(serviceRecordMap.values());
        return SolrUtil.instance().indexAsJson(serviceRecords);
    }
}
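The IN clause above is built by string concatenation, which is acceptable here only because the keys are ints the job itself produced. A slightly more defensive variant (a sketch, not part of the original code; the helper class name is hypothetical) builds a placeholder list and binds each id through a PreparedStatement:

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Map;

public final class PositionLookup {
    // Sketch: the same batch lookup as getPositionInfo(), but with bound
    // parameters instead of string concatenation.
    public static void fill(Connection connection,
            Map<Integer, ServiceRecord> batch) throws SQLException {
        // Build "?, ?, ..." with one placeholder per buffered id
        StringBuilder placeholders = new StringBuilder();
        for (int i = 0; i < batch.size(); i++) {
            placeholders.append(i == 0 ? "?" : ", ?");
        }
        String sql = "SELECT service_id as saleId, province_name as provinceName, "
                + "city_name as cityName, town_name as townName "
                + "FROM xx_service_info WHERE service_id IN (" + placeholders + ")";
        PreparedStatement statement = connection.prepareStatement(sql);
        try {
            int index = 1;
            for (Integer saleId : batch.keySet()) {
                statement.setInt(index++, saleId);
            }
            ResultSet resultSet = statement.executeQuery();
            while (resultSet.next()) {
                ServiceRecord record = batch.get(resultSet.getInt("saleId"));
                if (record != null) {
                    record.readPositionFields(resultSet);
                }
            }
        } finally {
            statement.close();
        }
    }
}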
The SQLUtil class:
import java.beans.PropertyVetoException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Wrapper;
import java.util.Objects;
import javax.sql.DataSource;
import com.mchange.v2.c3p0.ComboPooledDataSource;

public class SQLUtil {
    // volatile is required for double-checked locking to be safe
    private static volatile SQLUtil instance = null;
    // Connection pool
    private DataSource dataSource = null;
    private String dbUrl = null;
    private String dbUser = null;
    private String dbPass = null;

    private SQLUtil() {
    }

    public static SQLUtil instance() {
        if (instance == null) {
            synchronized (SQLUtil.class) {
                if (instance == null) {
                    instance = new SQLUtil();
                }
            }
        }
        return instance;
    }

    public void init() {
        Objects.requireNonNull(dbUrl);
        Objects.requireNonNull(dbUser);
        Objects.requireNonNull(dbPass);
        create();
    }

    private void create() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        dataSource.setJdbcUrl(dbUrl);
        dataSource.setUser(dbUser);
        dataSource.setPassword(dbPass);
        try {
            dataSource.setDriverClass("com.mysql.jdbc.Driver");
        } catch (PropertyVetoException e) {
            e.printStackTrace();
        }
        dataSource.setDataSourceName("SQLUtil");
        dataSource.setAutoCommitOnClose(false);
        dataSource.setPreferredTestQuery("select 1");
        dataSource.setMaxConnectionAge(3600);
        dataSource.setTestConnectionOnCheckout(true);
        dataSource.setIdleConnectionTestPeriod(300);
        dataSource.setDebugUnreturnedConnectionStackTraces(true);
        dataSource.setUnreturnedConnectionTimeout(120);
        this.dataSource = dataSource;
    }

    public Connection getConnection() {
        try {
            return dataSource.getConnection();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return null;
    }

    // Close in dependency order: ResultSets first, then Statements, then Connections.
    // PreparedStatement is a Statement, so one instanceof check covers both.
    public void releaseResourceNoThrow(Wrapper... wrapperSet) {
        // 1. close ResultSet
        for (Wrapper wrapper : wrapperSet) {
            if (wrapper instanceof ResultSet) {
                try {
                    ((ResultSet) wrapper).close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        // 2. close Statement or PreparedStatement
        for (Wrapper wrapper : wrapperSet) {
            if (wrapper instanceof Statement) {
                try {
                    ((Statement) wrapper).close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        // 3. close Connection
        for (Wrapper wrapper : wrapperSet) {
            if (wrapper instanceof Connection) {
                try {
                    ((Connection) wrapper).close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public void setDbUrl(String dbUrl) {
        this.dbUrl = dbUrl;
    }

    public void setDbUser(String dbUser) {
        this.dbUser = dbUser;
    }

    public void setDbPass(String dbPass) {
        this.dbPass = dbPass;
    }
}
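Typical usage, as in getPositionInfo() above: acquire a pooled connection, use it, and hand everything back through releaseResourceNoThrow in a finally block, which closes resources in the right order and swallows close-time exceptions (a sketch; assumes setDbUrl/setDbUser/setDbPass and init() were called first):

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class SqlUtilExample {
    public static void main(String[] args) {
        Connection connection = SQLUtil.instance().getConnection();
        Statement statement = null;
        ResultSet resultSet = null;
        try {
            statement = connection.createStatement();
            resultSet = statement.executeQuery("select 1");
            while (resultSet.next()) {
                System.out.println(resultSet.getInt(1));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Safe even if some arguments are null: the instanceof checks skip them.
            SQLUtil.instance().releaseResourceNoThrow(resultSet, statement, connection);
        }
    }
}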
The SolrUtil class:
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.InputStreamEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

public class SolrUtil {
    private Gson gson = null;
    private List<NameValuePair> softCommitPairs = null;
    private List<NameValuePair> hardCommitPairs = null;
    private String url = null;
    // volatile is required for double-checked locking to be safe
    private static volatile SolrUtil instance = null;

    private SolrUtil() {
        gson = new GsonBuilder().create();
        NameValuePair softCommitPair = new BasicNameValuePair("softCommit", "true");
        softCommitPairs = new ArrayList<NameValuePair>();
        softCommitPairs.add(softCommitPair);
        NameValuePair hardCommitPair = new BasicNameValuePair("commit", "true");
        hardCommitPairs = new ArrayList<NameValuePair>();
        hardCommitPairs.add(hardCommitPair);
    }

    public static SolrUtil instance() {
        if (instance == null) {
            synchronized (SolrUtil.class) {
                if (instance == null) {
                    instance = new SolrUtil();
                }
            }
        }
        return instance;
    }

    public void init() {
        Objects.requireNonNull(url);
    }

    public boolean indexAsJson(ServiceRecord serviceRecord) {
        List<ServiceRecord> serviceRecords = new ArrayList<ServiceRecord>(1);
        serviceRecords.add(serviceRecord);
        return indexAsJson(serviceRecords);
    }

    // POST the records as a JSON array to the Solr update handler
    public boolean indexAsJson(List<ServiceRecord> serviceRecords) {
        HttpClient httpclient = new DefaultHttpClient();
        try {
            HttpPost httppost = new HttpPost(url);
            InputStreamEntity reqEntity = new InputStreamEntity(
                    new ByteArrayInputStream(gson.toJson(serviceRecords).getBytes("UTF-8")), -1);
            reqEntity.setContentType("application/json");
            httppost.setEntity(reqEntity);
            HttpResponse response = httpclient.execute(httppost);
            if (response.getStatusLine().getStatusCode() != 200) {
                System.out.println(String.format("commit:%s with error:%s-%s",
                        httppost.getRequestLine(),
                        response.getStatusLine().toString(),
                        EntityUtils.toString(response.getEntity())));
                return false;
            }
        } catch (IOException e) {
            // IOException covers ClientProtocolException and
            // UnsupportedEncodingException; report the failure to the caller.
            e.printStackTrace();
            return false;
        } finally {
            // When the HttpClient instance is no longer needed, shut down the
            // connection manager to release all system resources immediately.
            httpclient.getConnectionManager().shutdown();
        }
        return true;
    }

    // Ask Solr to make buffered updates visible (soft) or durable (hard)
    public boolean commit(boolean isSoftCommit) {
        HttpClient httpclient = new DefaultHttpClient();
        try {
            HttpPost httppost = new HttpPost(url);
            UrlEncodedFormEntity entity;
            if (isSoftCommit) {
                entity = new UrlEncodedFormEntity(softCommitPairs, HTTP.UTF_8);
            } else {
                entity = new UrlEncodedFormEntity(hardCommitPairs, HTTP.UTF_8);
            }
            httppost.setEntity(entity);
            HttpResponse response = httpclient.execute(httppost);
            if (response.getStatusLine().getStatusCode() != 200) {
                System.out.println(String.format("commit:%s with error:%s-%s",
                        httppost.getRequestLine(),
                        response.getStatusLine().toString(),
                        EntityUtils.toString(response.getEntity())));
                return false;
            }
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            // When the HttpClient instance is no longer needed, shut down the
            // connection manager to release all system resources immediately.
            httpclient.getConnectionManager().shutdown();
        }
        return true;
    }

    public void setUrl(String url) {
        this.url = url;
    }
}
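How the pieces combine: indexAsJson posts documents to the update handler, a soft commit makes them searchable cheaply, and a final hard commit flushes them to stable storage. A sketch of the calling sequence (the URL is a placeholder, and whether updates replace or duplicate documents depends on the collection's uniqueKey):

import java.util.ArrayList;
import java.util.List;

public class SolrUtilExample {
    public static void main(String[] args) {
        // Placeholder URL: point at the collection's /update handler
        SolrUtil.instance().setUrl("http://localhost:8983/solr/service/update");
        SolrUtil.instance().init();

        List<ServiceRecord> records = new ArrayList<ServiceRecord>();
        // ... fill records ...

        // The POST body is a JSON array of documents, e.g.
        // [{"saleId":1,"g_sid":"...","title":"...","content":"..."}, ...]
        SolrUtil.instance().indexAsJson(records);

        SolrUtil.instance().commit(true);  // soft commit: searchable, cheap
        SolrUtil.instance().commit(false); // hard commit: flushed to disk, durable
    }
}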
Finally, IndexDriver creates and submits the job:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
// Note: the new-API classes are required here; the old
// org.apache.hadoop.mapred.lib.db versions don't accept a Job.
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class IndexDriver {
    public static void main(String[] args) throws Exception {
        String driverClass = "com.mysql.jdbc.Driver";
        String dbUrl = "jdbc:mysql://<ip>:<port>/<database>";
        String userName = "<user>";
        String password = "<password>";
        String solrUrl = "http://<ip>:<port>/solr/<collection>/update";
        long timeStart, timeEnd;
        Configuration conf = new Configuration();
        conf.set("mysql.url", dbUrl);
        conf.set("mysql.user", userName);
        conf.set("mysql.password", password);
        conf.set("solr.url", solrUrl);
        // When run locally this setting seems to have no effect: only one
        // mapper instance was created.
        conf.setInt(MRJobConfig.NUM_MAPS, 5);
        DBConfiguration.configureDB(conf, driverClass, dbUrl, userName, password);
        Job job = Job.getInstance(conf, IndexDriver.class.getName());
        job.setJarByClass(IndexDriver.class);
        DBInputFormat.setInput(job, ServiceRecord.class, "xx_service", null, null,
                "service_id as saleId", "subject as title", "cont as content");
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/user/tanghuan/output"));
        job.setMapperClass(ServiceMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        timeStart = System.currentTimeMillis();
        job.waitForCompletion(true);
        timeEnd = System.currentTimeMillis();
        System.out.println("finish job, take time: " + (timeEnd - timeStart) + " ms");
    }
}
Single-machine test result: commit: 572026. finish job, take time: 661384 ms
That is 572026 records in about 661 seconds, or roughly 865 records/s (572026 / 661.384 s ≈ 865).
Each record involves two MySQL reads (the base table via DBInputFormat plus the batched location lookup) and one Solr update; Solr performs a soft commit after every 100 records and a single hard commit at the very end.
Summary
- ServiceRecord must implement clone(), because the framework recycles the value object while the mapper formats records one by one.
- Mapper exposes no batch-processing hook, so batching means buffering records yourself and flushing in map()/cleanup(); the scheme also benefits from DBInputFormat reading query results from the database as a stream, one page per split (see the query sketch below).
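For reference, this is roughly the paged per-split SELECT the MySQL record reader builds for this job (a sketch from memory, not output captured from the job; splitStart/splitLength come from the InputSplit and the exact SQL varies across Hadoop versions):

// Sketch: the kind of per-split query DBInputFormat issues against MySQL,
// which is what lets each mapper stream only its slice of xx_service.
String selectQueryForSplit(long splitStart, long splitLength) {
    return "SELECT service_id as saleId, subject as title, cont as content"
            + " FROM xx_service"
            + " LIMIT " + splitLength + " OFFSET " + splitStart;
}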
In the next post I'll use Docker to build a virtual Hadoop cluster on a single machine and benchmark the performance again.