Solr获得高亮词(Highlighter/Term)的position及offset信息

1 篇文章 0 订阅
1 篇文章 0 订阅
package org.scbit.lsbi.solr.highlighting;

import org.apache.lucene.search.Query;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FieldTermStack;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.PluginInfo;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.plugin.PluginInfoInitialized;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;

/**
 * A {@link SolrHighlighter} that, instead of returning highlighted snippets, reports the
 * position and character offsets of every matched term for each requested highlight field.
 *
 * <p>The response has the shape
 * {@code printableUniqueKey -> field -> "terms" -> termText -> {"position", "offsets"}}.
 * One entry is added per matched term occurrence, so the same term text may appear as a
 * name more than once inside {@code "terms"}.
 *
 * <p>NOTE(review): the matches are read through {@link FieldTermStack}; this presumably
 * requires the highlighted fields to be indexed with term vectors including positions and
 * offsets — confirm against the schema.
 */
public class PositionsSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized {

    /**
     * Plugin initialisation hook; this highlighter reads nothing from {@code info}.
     *
     * @param info the plugin configuration (ignored)
     */
    @Override
    public void init(PluginInfo info) {
    }

    /**
     * Collects position/offset information of the matched terms for every document in
     * {@code docs} and every highlight field of the request.
     *
     * @param docs          the documents to inspect
     * @param query         the query whose terms are located in the documents
     * @param req           the current request; supplies the parameters and the searcher
     * @param defaultFields fields to use when the request names no highlight fields
     * @return a list keyed by printable unique key, or {@code null} when highlighting is
     *         not enabled for the request
     * @throws IOException if reading from the index fails
     */
    @Override
    public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
        SolrParams params = req.getParams();

        // if highlighting isn't enabled, there is nothing to report
        if (!isHighlightingEnabled(params)) {
            return null;
        }

        FastVectorHighlighter fvh = new FastVectorHighlighter(
                // FVH cannot process hl.usePhraseHighlighter parameter per-field basis
                params.getBool(HighlightParams.USE_PHRASE_HIGHLIGHTER, true),
                // FVH cannot process hl.requireFieldMatch parameter per-field basis
                params.getBool(HighlightParams.FIELD_MATCH, false));
        fvh.setPhraseLimit(params.getInt(HighlightParams.PHRASE_LIMIT, Integer.MAX_VALUE));

        SolrIndexSearcher searcher = req.getSearcher();
        IndexSchema schema = searcher.getSchema();
        int[] docIDs = toDocIDs(docs);

        // query-time parameters
        String[] fieldNames = getHighlightFields(query, req, defaultFields);

        // stored fields to load when resolving each document's printable unique key
        Set<String> fset = new HashSet<String>();
        for (String f : fieldNames) {
            fset.add(f);
        }
        // fetch unique key if one exists.
        SchemaField keyField = schema.getUniqueKeyField();
        if (null != keyField) {
            fset.add(keyField.getName());
        }

        NamedList<Object> list = new SimpleOrderedMap<>();

        // The FieldQuery depends only on the query and the reader, not on the document or
        // the field, so it is built once (lazily, on first use) and shared by all iterations.
        FieldQuery fq = null;

        for (int docID : docIDs) {
            NamedList<Object> summary = new SimpleOrderedMap<>();
            for (String field : fieldNames) {
                if (fq == null) {
                    fq = fvh.getFieldQuery(query, searcher.getIndexReader());
                }
                FieldTermStack stack = new FieldTermStack(searcher.getIndexReader(), docID, field, fq);
                FieldPhraseList fpl = new FieldPhraseList(stack, fq);

                NamedList<Object> info = new SimpleOrderedMap<>();
                info.add("terms", collectTerms(fpl));
                summary.add(field, info);
            }

            String printId = schema.printableUniqueKey(searcher.doc(docID, fset));
            list.add(printId, summary);
        }

        return list;
    }

    /**
     * Flattens a phrase list into one entry per matched term occurrence, each carrying the
     * term's {@code "position"} and its {@code "offsets"} as {@code [start, end]}.
     */
    private static NamedList<NamedList<Object>> collectTerms(FieldPhraseList fpl) {
        NamedList<NamedList<Object>> terms = new SimpleOrderedMap<>();
        for (FieldPhraseList.WeightedPhraseInfo wpi : fpl.getPhraseList()) {
            for (FieldTermStack.TermInfo ti : wpi.getTermsInfos()) {
                NamedList<Object> term = new SimpleOrderedMap<>();

                term.add("position", ti.getPosition());

                ArrayList<Integer> ofst = new ArrayList<>(2);
                ofst.add(ti.getStartOffset());
                ofst.add(ti.getEndOffset());
                term.add("offsets", ofst);

                terms.add(ti.getText(), term);
            }
        }
        return terms;
    }

    /**
     * Copies the internal document ids of {@code docs} into an array, in iteration order.
     *
     * @param docs the document list to read
     * @return an array of length {@code docs.size()} holding the document ids
     * @throws AssertionError if the iterator yields fewer or more ids than {@code docs.size()}
     */
    protected int[] toDocIDs(DocList docs) {
        int[] docIDs = new int[docs.size()];
        DocIterator iterator = docs.iterator();
        for (int i = 0; i < docIDs.length; i++) {
            if (!iterator.hasNext()) {
                throw new AssertionError();
            }
            docIDs[i] = iterator.nextDoc();
        }
        if (iterator.hasNext()) {
            throw new AssertionError();
        }
        return docIDs;
    }

}

代码来源参考:https://issues.apache.org/jira/browse/SOLR-4722

查询结果如下:

{
        "pmid":28705234,
        "pmcid":5513360,
        "title":"In silico characterization of cell-cell interactions using a cellular automata model of cell culture.",
        "author":"Takanori Kihara; Kosuke Kashitani; Jun Miyake; ",
        "articleAbstract":"<label>BACKGROUND</label>Cell proliferation is a key characteristic of eukaryotic cells. During cell proliferation, cells interact with each other. In this study, we developed a cellular automata model to estimate cell-cell interactions using experimentally obtained images of cultured cells.\n<label>RESULTS</label>We used four types of cells; HeLa cells, human osteosarcoma (HOS) cells, rat mesenchymal stem cells (MSCs), and rat smooth muscle A7r5 cells. These cells were cultured and stained daily. The obtained cell images were binarized and clipped into squares containing about 10(4) cells. These cells showed characteristic cell proliferation patterns. The growth curves of these cells were generated from the cell proliferation images and we determined the doubling time of these cells from the growth curves. We developed a simple cellular automata system with an easily accessible graphical user interface. This system has five variable parameters, namely, initial cell number, doubling time, motility, cell-cell adhesion, and cell-cell contact inhibition (of proliferation). Within these parameters, we obtained initial cell numbers and doubling times experimentally. We set the motility at a constant value because the effect of the parameter for our simulation was restricted. Therefore, we simulated cell proliferation behavior with cell-cell adhesion and cell-cell contact inhibition as variables. By comparing growth curves and proliferation cell images, we succeeded in determining the cell-cell interaction properties of each cell. Simulated HeLa and HOS cells exhibited low cell-cell adhesion and weak cell-cell contact inhibition. Simulated MSCs exhibited high cell-cell adhesion and positive cell-cell contact inhibition. Simulated A7r5 cells exhibited low cell-cell adhesion and strong cell-cell contact inhibition. 
These simulated results correlated with the experimental growth curves and proliferation images.\n<label>CONCLUSIONS</label>Our simulation approach is an easy method for evaluating the cell-cell interaction properties of cells.\n",
        "keyword":"Cell assay system; Cell proliferation; Cellular automata; Cell–cell adhesion; Cell–cell contact inhibition; ",
        "publishedYear":2017,
        "publishedMonth":7,
        "publishedDay":14,
        "elecPublishedDate":20170714,
        "year":2017,
        "volume":"10",
        "issue":"1",
        "page":"283",
        "hitNum":0,
        "journalTitle":"BMC Research Notes",
        "journalIsoAbbr":"BMC Res Notes",
        "journalMedlineTA":"BMC Res Notes",
        "journalIssnElec":"1756-0500",
        "publisherName":"BioMed Central",
        "doi":"10.1186/s13104-017-2613-x",
        "indexTime":1500865734000,
        "_version_":1609023004723904512}

得到的高亮部分(注意:同一个词在字段中多次命中时,每次命中都会以该词为键单独输出一项,因此 "terms" 下会出现重复的键,例如下面的三个 "cell"):

"highlighting":{
    "28705234":{
      "title":{
        "terms":{
          "cell":{
            "position":4,
            "offsets":[30,
              34]},
          "cell":{
            "position":6,
            "offsets":[35,
              39]},
          "cell":{
            "position":14,
            "offsets":[88,
              92]}}}},
……

 

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值