CLucene 也提供了用于高亮的 highlighter 包。
其 cppunit 测试样例中包含各种搜索的高亮示例,包括短语、模糊、通配符、范围等查询类型:
//testSimpleHighlighter();
//testGetBestFragmentsSimpleQuery();
//testGetFuzzyFragments();
//testGetWildCardFragments();
//testGetMidWildCardFragments();
//testGetRangeFragments();
//testGetBestFragmentsPhrase();
//testGetBestFragmentsMultiTerm();
//testGetBestFragmentsWithOr();
// Shared state for the highlighter tests. Every pointer starts out NULL and is
// filled in later; only the field name has a fixed value.

// Directory object backing the test index; allocated in setupHighlighter().
RAMDirectory* hl_ramDir = NULL;
// Analyzer used when indexing and again when re-tokenizing hit text for highlighting.
StandardAnalyzer* hl_analyzer = NULL;
// Name of the single field that is indexed, read back and highlighted.
// NOTE(review): a string literal is bound to a non-const TCHAR*; consider
// const TCHAR* once it is confirmed that no user needs a mutable pointer.
TCHAR* hl_FIELD_NAME = _T("contents");
// Query objects. None of them is assigned in the code visible here —
// presumably doSearching() sets them (hl_query is read by testSimpleHighlighter()).
Query* hl_originalquery = NULL;
Query* hl_query = NULL;
Query* hl_rewrittenquery = NULL;
// Reader opened on hl_ramDir at the end of setupHighlighter().
IndexReader* hl_reader=NULL;
// Not touched by the code visible here — presumably created by doSearching(); verify.
Searcher* hl_searcher=NULL;
// Search results iterated by testSimpleHighlighter(); presumably produced by doSearching().
Hits* hl_hits = NULL;
void setupHighlighter()
{
hl_ramDir = _CLNEW RAMDirectory();
hl_analyzer = _CLNEW StandardAnalyzer;
IndexWriter writer(hl_ramDir, hl_analyzer, true);
for (int i = 0; hl_texts[i] != NULL; i++)
{
Document d;
d.add( *_CLNEW Field(hl_FIELD_NAME, hl_texts[i], true, true, true) );
writer.addDocument(&d);
}
writer.optimize();
writer.close();
hl_reader = IndexReader::open(hl_ramDir);
}
/**
 * Searches for "Kennedy" and prints the best highlighted fragments of every hit.
 *
 * For each hit the text of hl_FIELD_NAME is read back, re-tokenized with
 * hl_analyzer, and passed to the highlighter, which is asked for at most two
 * fragments of size 40 joined by "...". Each result is printed on its own line.
 *
 * Relies on doSearching() having filled hl_query and hl_hits, and on
 * setupHighlighter() having created hl_analyzer.
 * There are no assertions: the test only checks that nothing throws.
 */
void testSimpleHighlighter(){
    doSearching(_T("Kennedy") );

    QueryScorer scorer(hl_query);
    Highlighter highlighter(&scorer);
    SimpleFragmenter fragmenter(40);
    highlighter.setTextFragmenter(&fragmenter);

    const int maxNumFragmentsRequired = 2;

    for (int i = 0; i < hl_hits->length(); i++)
    {
        const TCHAR* text = hl_hits->doc(i).get(hl_FIELD_NAME);
        StringReader reader( text );
        TokenStream* tokenStream = hl_analyzer->tokenStream(hl_FIELD_NAME, &reader);

        TCHAR* result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, _T("...") );

        // NOTE(review): "chs" is a Windows locale name; the call is
        // loop-invariant but is kept in place so the locale still changes only
        // after the first hit has been tokenized, exactly as before.
        setlocale(LC_ALL, "chs");

        // Fixed: the format used to end in "/n", which printed a literal
        // slash-n instead of a line break. "%ls" is used so the argument is
        // read as a wide string under both MSVC and standard C rules.
        // NOTE(review): this assumes TCHAR is wchar_t, as the original
        // wprintf call already did — confirm for non-Unicode builds.
        wprintf(L"%ls\n", result);

        // The fragment string and the token stream are released here, once per hit.
        _CLDELETE_CARRAY(result);
        _CLDELETE(tokenStream);
    }
    //Not sure we can assert anything here - just running to check we dont throw any exceptions
}
/**
 * Entry point for the highlighter demo: builds the index, then runs the
 * enabled highlighter tests against it.
 */
void TestHighLighter()
{
    // Must run first: it creates hl_analyzer, which the test below dereferences.
    setupHighlighter();

    // Currently the only enabled case.
    testSimpleHighlighter();

    // Disabled cases, kept for reference:
    //   testGetBestFragmentsSimpleQuery();
    //   testGetFuzzyFragments();
    //   testGetWildCardFragments();
    //   testGetMidWildCardFragments();
    //   testGetRangeFragments();
    //   testGetBestFragmentsPhrase();
    //   testGetBestFragmentsMultiTerm();
    //   testGetBestFragmentsWithOr();
}
被索引的文本如下:
// Sample texts for the highlighter tests: five entries followed by a NULL
// sentinel. setupHighlighter() indexes each non-NULL entry as one document
// and stops at the sentinel, so the NULL must stay last.
// The entries spell the name in several ways ("kennedy", "Kennedy", "JFK",
// and the deliberately misspelled "Keneddy").
const TCHAR* hl_texts[6] =
{
_T("Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot"),
_T("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy"),
_T("JFK has been shot"),
_T("John Kennedy has been shot"),
_T("This text has a typo in referring to Keneddy"),
NULL
};
查询词为"Kennedy",
简单的高亮结果如下:
John <B>Kennedy</B> has been shot
to <B>Kennedy</B>...This piece of text refers to <B>Kennedy</B> at
<B>kennedy</B> has been shot