Lucene源代码之高亮显示《三》

最新推荐文章于 2008-12-07 14:58:00 发布
caoxu1987728
最新推荐文章于 2008-12-07 14:58:00 发布
阅读量1.6k
点赞数
分类专栏： Lucene 文章标签： lucene token null string
本文链接：https://blog.csdn.net/caoxu1987728/article/details/3112409
版权
Lucene 专栏收录该内容
52 篇文章 0 订阅
订阅专栏
本篇为高亮显示源代码中最具重量级别的一个类。我现在只是把代码看了一遍，小的地方理解了一点，但仍然还未从整体上把握全局。
 
 package org.apache.lucene.search.highlight;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;
/**
 * Class used to markup highlighted terms found in the best sections of a
 * text, using configurable（配置好的）tokenizers.
 */
public class Highlighter
{
    public static final  int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024;
    private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
    private Formatter formatter;
    private Encoder encoder;
    private Fragmenter textFragmenter=new SimpleFragmenter();
    private Scorer fragmentScorer=null;
    public Highlighter(Scorer fragmentScorer)
    {
        this(new SimpleHTMLFormatter(),fragmentScorer);
    }
    public Highlighter(Formatter formatter, Scorer fragmentScorer)
    {
        this(formatter,new DefaultEncoder(),fragmentScorer);
    }
    public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
    {
        this.formatter = formatter;
        this.encoder = encoder;
        this.fragmentScorer = fragmentScorer;
    }
    /**
     * Highlights chosen terms in a text, extracting the most relevant 
     * section（提取最相关的部分）.
     *
     * @param analyzer   the analyzer that will be used to split (分割）
     * <code>text</code> into chunks（大块）
     * @param text text to highlight terms in
     * @param fieldName Name of field used to influence analyzer's 
     * tokenization policy
     *
     * @return highlighted text fragment or null if no terms found
     */
    public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
    throws IOException
    {
        TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
        return getBestFragment(tokenStream, text);
    }
    /**
     * Highlights chosen terms in a text, extracting the most relevant section.
     * The document text is analysed in chunks to record hit statistics
     * across the document. After accumulating stats（统计资料）, the fragment with the highest score
     * is returned
     *
     * @param tokenStream  a stream of tokens identified in the text parameter, including offset information.
     * This is typically produced by an analyzer re-parsing a document's
     * text. Some work may be done on retrieving（检索）TokenStreams more efficently
     * by adding support for storing original text position data in the Lucene
     * index but this support is not currently available (as of Lucene 1.4 rc2).
     * @param text text to highlight terms in
     *
     * @return highlighted text fragment or null if no terms found
     */
    public final String getBestFragment(TokenStream tokenStream, String text)
    throws IOException
    {
        String[] results = getBestFragments(tokenStream,text, 1);
        if (results.length > 0)
        {
            return results[0];
        }
        return null;
    }
    /**
     * Highlights chosen terms in a text, extracting the most relevant sections.
     * This is a convenience method that calls
     * {@link #getBestFragments(TokenStream, String, int)}
     *
     * @param analyzer   the analyzer that will be used to split <code>text</code>
     * into chunks
     * @param text text to highlight terms in
     * @param maxNumFragments  the maximum number of fragments.
     * 
     * @deprecated This method incorrectly hardcodes the choice of fieldname. Use the
     * method of the same name that takes a fieldname.（基本上不用了）
     * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
     */
    public final String[] getBestFragments(
            Analyzer analyzer,
            String text,
            int maxNumFragments)
    throws IOException
    {
        TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
        return getBestFragments(tokenStream, text, maxNumFragments);
    }
    /**
     * Highlights chosen terms in a text, extracting the most relevant sections.
     * This is a convenience method that calls
     * {@link #getBestFragments(TokenStream, String, int)}
     *
     * @param analyzer   the analyzer that will be used to split <code>text</code>
     * into chunks
     * @param fieldName     the name of the field being highlighted (used by analyzer)
     * @param text          text to highlight terms in
     * @param maxNumFragments  the maximum number of fragments.
     *
     * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
     */
    public final String[] getBestFragments(
            Analyzer analyzer,
            String fieldName,
            String text,
            int maxNumFragments)
    throws IOException
    {
        TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
        return getBestFragments(tokenStream, text, maxNumFragments);
    }
    /**
     * Highlights chosen terms in a text, extracting the most relevant sections.
     * The document text is analysed in chunks to record hit statistics
     * across the document. 
     * After accumulating stats, the fragments with the highest scores
     * are returned as an array of strings in order of score (contiguous（连续的） fragments are merged into
     * one in their original order to improve readability)
     *
     * @param text text to highlight terms in
     * @param maxNumFragments  the maximum number of fragments.
     *
     * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
     */
    public final String[] getBestFragments(
            TokenStream tokenStream,
            String text,
            int maxNumFragments)
    throws IOException
    {
        maxNumFragments = Math.max(1, maxNumFragments); 
        TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
        //Get text 
        ArrayList fragTexts = new ArrayList();
        for (int i = 0; i < frag.length; i++)
        {
            if ((frag[i] != null) && (frag[i].getScore() > 0))
            {
                fragTexts.add(frag[i].toString());
            }
        }
        return (String[]) fragTexts.toArray(new String[0]);
    }
    /**
     * Low level api to get the most relevant (formatted) sections of the document.
     * This method has been made public to allow visibility of score information held in TextFragment objects.
     * @param mergeContiguousFragments(合并临近的片段）,
     * @throws IOException
     */
    public final TextFragment[] getBestTextFragments(
            TokenStream tokenStream,
            String text,
            boolean mergeContiguousFragments,
            int maxNumFragments)
    throws IOException
    {
        ArrayList docFrags = new ArrayList();
        StringBuffer newText=new StringBuffer();
        TextFragment currentFrag =  new TextFragment(newText,newText.length(), docFrags.size());
        fragmentScorer.startFragment(currentFrag);
        docFrags.add(currentFrag);
        //实例化一个Fragment队列。 
        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
        try
        {
            org.apache.lucene.analysis.Token token;
            String tokenText;
            int startOffset;
            int endOffset;
            int lastEndOffset = 0;
            textFragmenter.start(text);
            TokenGroup tokenGroup=new TokenGroup();
            //tokenStream 是实例中已经对text剖析过的。 
            token = tokenStream.next();
            while ((token!= null)&&(token.startOffset()<maxDocBytesToAnalyze))
            {
                if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
                {
                    startOffset = tokenGroup.matchStartOffset;
                    endOffset = tokenGroup.matchEndOffset;
                    tokenText = text.substring(startOffset, endOffset);
                    String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                    //把空格及前面沒有标记的字符串先加到newText中。 
                    if (startOffset > lastEndOffset)
                        newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                    newText.append(markedUpText);
                    lastEndOffset=Math.max(endOffset, lastEndOffset);
                    tokenGroup.clear();
                    //check是否已经超过设定的字符数量，有新的片段生成。 
                    if(textFragmenter.isNewFragment(token))
                    {
                        currentFrag.setScore(fragmentScorer.getFragmentScore());
                        //record stats for a new fragment 
                        currentFrag.textEndPos = newText.length();
                        currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
                        fragmentScorer.startFragment(currentFrag);
                        docFrags.add(currentFrag);
                    }
                }
                tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
                token = tokenStream.next();
            }
            currentFrag.setScore(fragmentScorer.getFragmentScore());
            if(tokenGroup.numTokens>0)
            {
                /**
                 * flush the accumulated text (same code as in above loop)
                 * 闪存余下的的字段。
                 */
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset=Math.max(lastEndOffset,endOffset);
            }
            /**
             * Test what remains of the original text beyond the point where we stopped analyzing 
             * 检测一下源文件中我们没有分析到的文本。
               if there is text beyond the last token considered and that text is not too large
             */
            if ((lastEndOffset < text.length())&&(text.length()<maxDocBytesToAnalyze))              
            {
                //加到newText的后面。 
                newText.append(encoder.encodeText(text.substring(lastEndOffset)));
            }
            currentFrag.textEndPos = newText.length();
            /**
             * sort the most relevant sections of the text
             * 排序最关联部分。fragQueue难道有此功能？
             */
            for (Iterator i = docFrags.iterator(); i.hasNext();)
            {
                currentFrag = (TextFragment) i.next();
                fragQueue.insert(currentFrag);
            }
            /**
             * 出队列，保存到frag数组里。
             */
            TextFragment frag[] = new TextFragment[fragQueue.size()];
            for (int i = frag.length - 1; i >= 0; i--)
            {
                frag[i] = (TextFragment) fragQueue.pop();
            }
            /**
             * merge any contiguous fragments to improve readability.
             * 合并任何连续的碎片以提高可读性
             */
            if(mergeContiguousFragments)
            {
                mergeContiguousFragments(frag);
                ArrayList fragTexts = new ArrayList();
                for (int i = 0; i < frag.length; i++)
                {
                    if ((frag[i] != null) && (frag[i].getScore() > 0))
                    {
                        fragTexts.add(frag[i]);
                    }
                }
                frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]);
            }
            return frag;
        }
        finally
        {
            if (tokenStream != null)
            {
                try
                {
                    tokenStream.close();
                }
                catch (Exception e)
                {
                }
            }
        }
    }
    /** 
     * Improves readability of a score-sorted（评分分类） list of TextFragments by merging any fragments
     * that were contiguous in the original text into one larger fragment with the correct order.
     * This will leave a "null" in the array entry for the lesser scored fragment. 
     * 
     * @param frag An array of document fragments in descending score
     */
    private void mergeContiguousFragments(TextFragment[] frag)
    {
        boolean mergingStillBeingDone;
        if (frag.length > 1)
            do
            {
                mergingStillBeingDone = false; 
                //for each fragment, scan other frags looking for contiguous blocks 
                for (int i = 0; i < frag.length; i++)
                {
                    if (frag[i] == null)
                    {
                        continue;
                    }
                    //合并所有连续的块。 
                    for (int x = 0; x < frag.length; x++)
                    {
                        if (frag[x] == null)
                        {
                            continue;
                        }
                        if (frag[i] == null)
                        {
                            break;
                        }
                        TextFragment frag1 = null;
                        TextFragment frag2 = null;
                        int frag1Num = 0;
                        int frag2Num = 0;
                        int bestScoringFragNum;
                        int worstScoringFragNum;
                        //if blocks are contiguous.... 
                        if (frag[i].follows(frag[x]))
                        {
                            frag1 = frag[x];
                            frag1Num = x;
                            frag2 = frag[i];
                            frag2Num = i;
                        }
                        else
                            if (frag[x].follows(frag[i]))
                            {
                                frag1 = frag[i];
                                frag1Num = i;
                                frag2 = frag[x];
                                frag2Num = x;
                            }
                        //merging required.. 
                        if (frag1 != null)
                        {
                            if (frag1.getScore() > frag2.getScore())
                            {
                                bestScoringFragNum = frag1Num;
                                worstScoringFragNum = frag2Num;
                            }
                            else
                            {
                                bestScoringFragNum = frag2Num;
                                worstScoringFragNum = frag1Num;
                            }
                            frag1.merge(frag2);
                            frag[worstScoringFragNum] = null;
                            mergingStillBeingDone = true;
                            frag[bestScoringFragNum] = frag1;
                        }
                    }
                }
            }
            while (mergingStillBeingDone);
    }
    /**
     * Highlights terms in the  text , extracting the most relevant sections
     * and concatenating the chosen fragments with a separator (typically "...").
     * The document text is analysed in chunks to record hit statistics
     * across the document. After accumulating stats, the fragments with the highest scores
     * are returned in order as "separator" delimited strings.
     *
     * @param text        text to highlight terms in
     * @param maxNumFragments  the maximum number of fragments.
     * @param separator  the separator used to intersperse the document fragments (typically "...")
     *
     * @return highlighted text
     */
    public final String getBestFragments(
            TokenStream tokenStream,    
            String text,
            int maxNumFragments,
            String separator)
    throws IOException
    {
        String sections[] = getBestFragments(tokenStream,text, maxNumFragments);
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < sections.length; i++)
        {
            if (i > 0)
            {
                result.append(separator);
            }
            result.append(sections[i]);
        }
        return result.toString();
    }
    /**
     * @return the maximum number of bytes to be tokenized per doc 
     */
    public int getMaxDocBytesToAnalyze()
    {
        return maxDocBytesToAnalyze;
    }
    /**
     * @param byteCount the maximum number of bytes to be tokenized per doc
     * (This can improve performance with large documents)
     */
    public void setMaxDocBytesToAnalyze(int byteCount)
    {
        maxDocBytesToAnalyze = byteCount;
    }
    /**
     */
    public Fragmenter getTextFragmenter()
    {
        return textFragmenter;
    }
    /**
     * @param fragmenter
     */
    public void setTextFragmenter(Fragmenter fragmenter)
    {
        textFragmenter = fragmenter;
    }
    /**
     * @return Object used to score each text fragment 
     */
    public Scorer getFragmentScorer()
    {
        return fragmentScorer;
    }
    /**
     * @param scorer
     */
    public void setFragmentScorer(Scorer scorer)
    {
        fragmentScorer = scorer;
    }
    public Encoder getEncoder()
    {
        return encoder;
    }
    public void setEncoder(Encoder encoder)
    {
        this.encoder = encoder;
    }
}
/**
 * FragmentQueue队列类的定义
 */
class FragmentQueue extends PriorityQueue
{
    public FragmentQueue(int size)
    {
        initialize(size);
    }
    public final boolean lessThan(Object a, Object b)
    {
        TextFragment fragA = (TextFragment) a;
        TextFragment fragB = (TextFragment) b;
        if (fragA.getScore() == fragB.getScore())
            return fragA.fragNum > fragB.fragNum;
            else
                return fragA.getScore() < fragB.getScore();
    }
}
 
还需慢慢推敲。
caoxu1987728
关注
0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Lucene源代码之高亮显示《三》

本篇为高亮显示源代码中最具重量级别的一个类。我现在只是把代码看了一遍，小的地方理解了一点，但仍然还未从整体上把握全局。package org.apache.lucene.search.highlight;import java.io.IOException;import java.io.StringReader;import java.util.ArrayList;
复制链接

扫一扫
专栏目录