马上2017就要结束了,今年研究了不少solr相关的问题,年末的时候还是做一个总结吧。明年说不定就开始搞ES了。
一、数据流传递过程
- 数据添加的API是以post的形式发送,此处以solrj数据导入为例,接收数据就用到了JavabinLoader。前面的SolrDispatcher流程暂时跳过,直接看handleRequestBody的处理过程。
ContentStreamHandlerBase.java
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
SolrParams params = req.getParams();
UpdateRequestProcessorChain processorChain =
req.getCore().getUpdateProcessorChain(params);
UpdateRequestProcessor processor = processorChain.createProcessor(req, rsp);
try {
ContentStreamLoader documentLoader = newLoader(req, processor); // 此处的loader就是JavaBinLoader
Iterable<ContentStream> streams = req.getContentStreams();
if (streams == null) {
if (!RequestHandlerUtils.handleCommit(req, processor, params, false) && !RequestHandlerUtils.handleRollback(req, processor, params, false)) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "missing content stream");
}
} else {
for (ContentStream stream : streams) {
documentLoader.load(req, rsp, stream, processor); // 具体的数据流处理流程
}
// Perhaps commit from the parameters
RequestHandlerUtils.handleCommit(req, processor, params, false);
RequestHandlerUtils.handleRollback(req, processor, params, false);
}
} finally {
// finish the request
processor.finish();
}
}
这些操作只算是数据的预处理。
public class JavabinLoader extends ContentStreamLoader {
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
InputStream is = null;
try {
is = stream.getStream();
parseAndLoadDocs(req, rsp, is, processor);
} finally {
if(is != null) {
is.close();
}
}
}
private void parseAndLoadDocs(final SolrQueryRequest req, SolrQueryResponse rsp, InputStream stream,
final UpdateRequestProcessor processor) throws IOException {
UpdateRequest update = null;
JavaBinUpdateRequestCodec.StreamingUpdateHandler handler = new JavaBinUpdateRequestCodec.StreamingUpdateHandler() { // 匿名类,定义doc的处理逻辑,关注update方法
private AddUpdateCommand addCmd = null;
@Override
public void update(SolrInputDocument document, UpdateRequest updateRequest, Integer commitWithin, Boolean overwrite) { // 需要注意的是,solrJ可以提交多个doc,但在这里进行处理的时候,只会一各一个的串行处理。
if (document == null) {
// Perhaps commit from the parameters
try {
RequestHandlerUtils.handleCommit(req, processor, updateRequest.getParams(), false);
RequestHandlerUtils.handleRollback(req, processor, updateRequest.getParams(), false);
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "ERROR handling commit/rollback");
}
return;
}
if (addCmd == null) {
addCmd = getAddCommand(req, updateRequest.getParams());
}
addCmd.solrDoc = document;
if (commitWithin != null) {
addCmd.commitWithin = commitWithin;
}
if (overwrite != null) {
addCmd.overwrite = overwrite;
}
if (updateRequest.isLastDocInBatch()) {
// this is a hint to downstream code that indicates we've sent the last doc in a batch
addCmd.isLastDocInBatch = true;
}
try {
processor.processAdd(addCmd); // 使用构造的addCmd传入processor进行添加请求的处理。
addCmd.clear();
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "ERROR adding document " + document, e);
}
}
};
FastInputStream in = FastInputStream.wrap(stream);
for (; ; ) {
try {
update = new JavaBinUpdateRequestCodec().unmarshal(in, handler); // 最终会调用上面匿名类的update方法,进行doc添加操作
} catch (EOFException e) {
break; // this is expected
}
if (update.getDeleteByIdMap() != null || update.getDeleteQuery() != null) {
delete(req, update, processor);
}
}
}
......
......
}
具体看一下JavaBinUpdateRequestCodec的处理逻辑,主要就是定义数据流的处理方式,然后调用上面定义的handler。
public UpdateRequest unmarshal(InputStream is, final StreamingUpdateHandler handler) throws IOException {
final UpdateRequest updateRequest = new UpdateRequest();
List<List<NamedList>> doclist;
List<Entry<SolrInputDocument,Map<Object,Object>>> docMap;
List<String> delById;
Map<String,Map<String,Object>> delByIdMap;
List<String> delByQ;
final NamedList[] namedList = new NamedList[1];
JavaBinCodec codec = new JavaBinCodec() { // 定义url请求传入的stream的读取方式,并遍历stream中的每一个doc.
// NOTE: this only works because this is an anonymous inner class
// which will only ever be used on a single stream -- if this class
// is ever refactored, this will not work.
private boolean seenOuterMostDocIterator = false;
@Override
public NamedList readNamedList(DataInputInputStream dis) throws IOException {
int sz = readSize(dis);
NamedList nl = new NamedList();
if (namedList[0] == null) {
namedList[0] = nl;
}
for (int i = 0; i < sz; i++) {
String name = (String) readVal(dis);
Object val = readVal(dis);
nl.add(name, val);
}
return nl;
}
@Override
public List readIterator(DataInputInputStream fis) throws IOException {
// default behavior for reading any regular Iterator in the stream
if (seenOuterMostDocIterator) return super.readIterator(fis);
// special treatment for first outermost Iterator
// (the list of documents)
seenOuterMostDocIterator = true;
return readOuterMostDocIterator(fis);
}
private List readOuterMostDocIterator(DataInputInputStream fis) throws IOException {
NamedList params = (NamedList) namedList[0].get("params");
updateRequest.setParams(new ModifiableSolrParams(SolrParams.toSolrParams(params)));
if (handler == null) return super.readIterator(fis);
Integer commitWithin = null;