Solving the out-of-memory problem when Solr imports data from MySQL

Reposted from: http://blog.csdn.net/yuwenruli/article/details/8426940

Recently, while indexing MySQL data, the data volume was so large (millions of rows) that the import kept running out of memory. I tried quite a few fixes found online and none of them worked; batchSize had no effect, presumably because MySQL does not support batched fetching here. It was very frustrating, and it seemed odd that Solr had no answer to such a common problem. If nothing else worked, the only option left was to write SQL by hand and import the data in batches myself.

Holding on to a last bit of hope, I asked a senior colleague, and it turned out his team had run into the same problem before. He had written a class of his own: essentially a rewritten SqlEntityProcessor that uses MySQL's LIMIT clause to achieve batched imports. I tried it and it really did run successfully. The lesson: when you hit a problem, ask around; someone else has very likely hit it before, and that saves a lot of detours.

The actual code is attached below:
package org.apache.solr.handler.dataimport;

import static org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow;

import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Basically this is a hacked version of SqlEntityProcessor.
 * It only targets MySQL; maybe someone can modify it to fit other DBMSs.
 *
 * The goal of this hacked version is to make batch processing of a huge number of rows possible.
 * The processor automatically appends a "LIMIT" clause to the SQL query until the end of the data is reached,
 * so each SQL query fetches at most "batchSize" rows and the import no longer runs out of memory.
 */
public class MySqlBatchEntityProcessor extends EntityProcessorBase {

    private static final Logger LOG = LoggerFactory.getLogger(MySqlBatchEntityProcessor.class);

    protected DataSource<Iterator<Map<String, Object>>> dataSource;

    /** Number of rows to fetch per query (the "batchSize" entity attribute). */
    private int batchSize;
    /** Number of rows consumed so far; used as the offset of the next LIMIT query. */
    private int offset = 0;

    @SuppressWarnings("unchecked")
    public void init(Context context) {
        super.init(context);
        dataSource = context.getDataSource();
        batchSize = Integer.parseInt(context.getEntityAttribute("batchSize"));
    }

    protected void initQuery(String q) {
        try {
            DataImporter.QUERY_COUNT.get().incrementAndGet();
            // Fetch only one batch at a time instead of the whole result set.
            rowIterator = dataSource.getData(q + " LIMIT " + offset + "," + batchSize);
            this.query = q;
        } catch (DataImportHandlerException e) {
            throw e;
        } catch (Exception e) {
            LOG.error("The query failed '" + q + "'", e);
            throw new DataImportHandlerException(DataImportHandlerException.SEVERE, e);
        }
    }

    public Map<String, Object> nextRow() {
        if (rowIterator == null) {
            String q = getQuery();
            initQuery(context.replaceTokens(q));
        }
        return getNext();
    }

    public Map<String, Object> nextModifiedRowKey() {
        if (rowIterator == null) {
            String deltaQuery = context.getEntityAttribute(DELTA_QUERY);
            if (deltaQuery == null)
                return null;
            initQuery(context.replaceTokens(deltaQuery));
        }
        return getNext();
    }

    public Map<String, Object> nextDeletedRowKey() {
        if (rowIterator == null) {
            String deletedPkQuery = context.getEntityAttribute(DEL_PK_QUERY);
            if (deletedPkQuery == null)
                return null;
            initQuery(context.replaceTokens(deletedPkQuery));
        }
        return getNext();
    }

    public Map<String, Object> nextModifiedParentRowKey() {
        if (rowIterator == null) {
            String parentDeltaQuery = context.getEntityAttribute(PARENT_DELTA_QUERY);
            if (parentDeltaQuery == null)
                return null;
            LOG.info("Running parentDeltaQuery for Entity: " + context.getEntityAttribute("name"));
            initQuery(context.replaceTokens(parentDeltaQuery));
        }
        return getNext();
    }

    public String getQuery() {
        String queryString = context.getEntityAttribute(QUERY);
        if (Context.FULL_DUMP.equals(context.currentProcess())) {
            return queryString;
        }
        if (Context.DELTA_DUMP.equals(context.currentProcess())) {
            String deltaImportQuery = context.getEntityAttribute(DELTA_IMPORT_QUERY);
            if (deltaImportQuery != null)
                return deltaImportQuery;
        }
        return getDeltaImportQuery(queryString);
    }

    public String getDeltaImportQuery(String queryString) {
        StringBuilder sb = new StringBuilder(queryString);
        if (SELECT_WHERE_PATTERN.matcher(queryString).find()) {
            sb.append(" and ");
        } else {
            sb.append(" where ");
        }
        boolean first = true;
        String[] primaryKeys = context.getEntityAttribute("pk").split(",");
        for (String primaryKey : primaryKeys) {
            if (!first) {
                sb.append(" and ");
            }
            first = false;
            Object val = context.resolve("dataimporter.delta." + primaryKey);
            if (val == null) {
                Matcher m = DOT_PATTERN.matcher(primaryKey);
                if (m.find()) {
                    val = context.resolve("dataimporter.delta." + m.group(1));
                }
            }
            sb.append(primaryKey).append(" = ");
            if (val instanceof Number) {
                sb.append(val.toString());
            } else {
                sb.append("'").append(val.toString()).append("'");
            }
        }
        return sb.toString();
    }

    protected Map<String, Object> getNext() {
        try {
            if (rowIterator == null)
                return null;
            if (rowIterator.hasNext()) {
                offset++;
                return rowIterator.next();
            } else if (offset >= batchSize) {
                // Current batch is exhausted; issue the next LIMIT query starting at the new offset.
                DataImporter.QUERY_COUNT.get().incrementAndGet();
                rowIterator = dataSource.getData(query + " LIMIT " + offset + "," + batchSize);
                if (rowIterator.hasNext()) {
                    offset++;
                    return rowIterator.next();
                }
            }
            // No more data: reset state for the next entity run.
            offset = 0;
            query = null;
            rowIterator = null;
            return null;
        } catch (Exception e) {
            LOG.error("getNext() failed for query '" + query + "' LIMIT(" + offset + "," + batchSize + ")", e);
            query = null;
            rowIterator = null;
            wrapAndThrow(DataImportHandlerException.WARN, e);
            return null;
        }
    }

    private static final Pattern SELECT_WHERE_PATTERN = Pattern.compile(
            "^\\s*(select\\b.*?\\b)(where).*", Pattern.CASE_INSENSITIVE);

    public static final String QUERY = "query";

    public static final String DELTA_QUERY = "deltaQuery";

    public static final String DELTA_IMPORT_QUERY = "deltaImportQuery";

    public static final String PARENT_DELTA_QUERY = "parentDeltaQuery";

    public static final String DEL_PK_QUERY = "deletedPkQuery";

    public static final Pattern DOT_PATTERN = Pattern.compile(".*?\\.(.*)$");
}
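
For reference, here is a minimal data-config.xml sketch of how this processor could be wired up once the compiled class is on Solr's classpath. The connection settings, entity name, table, and fields below are placeholders for illustration only; the parts that matter are the processor and batchSize attributes on the entity.

<dataConfig>
  <!-- placeholder MySQL connection settings; replace with your own -->
  <dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://localhost:3306/mydb" user="root" password="***"/>
  <document>
    <!-- processor names the hacked class above; batchSize is the LIMIT page size it reads per query -->
    <entity name="item" processor="MySqlBatchEntityProcessor" batchSize="10000"
            query="select id, title from item">
      <field column="id" name="id"/>
      <field column="title" name="title"/>
    </entity>
  </document>
</dataConfig>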



I recently ran into the same problem and found this post. I then went through the official documentation and actually found the answer: for MySQL, batchSize has to be set like this: batchSize="-1". Problem solved.
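
For anyone wondering why that works: batchSize here is an attribute of the JdbcDataSource in data-config.xml, and DIH passes it to JDBC as the fetch size. With the MySQL Connector/J driver, a value of -1 is treated as Integer.MIN_VALUE, which turns on streaming result sets so rows are read one at a time instead of the whole result set being buffered in memory. A minimal sketch, with placeholder connection settings:

<dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver"
            url="jdbc:mysql://localhost:3306/mydb" user="root" password="***"
            batchSize="-1"/>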
