TermVector 是 Lucene 1.4 新增的特性,它提供一种向量机制来进行模糊查询。TermVector 可以保存 Token.getPositionIncrement()、Token.startOffset() 以及 Token.endOffset() 信息。
Field.TermVector.NO:不保存term vectors
Field.TermVector.YES:保存term vectors
Field.TermVector.WITH_POSITIONS:保存term vectors.(保存值和token位置信息)
Field.TermVector.WITH_OFFSETS:保存term vectors.(保存值和Token的offset)
Field.TermVector.WITH_POSITIONS_OFFSETS:保存term vectors.(保存值和token位置信息和Token的offset)
下面是个简单的例子:
// Analyzer handed to the IndexWriter in index() to tokenize field text.
Analyzer analyzer = new StandardAnalyzer();
// Directory shared by index(), searcher() and docsLike(); a RAMDirectory, so nothing is written to disk.
RAMDirectory directory = new RAMDirectory();
/**
 * Builds the index in {@code directory} from three sample book documents.
 * <p>
 * Each document has a {@code title}, an {@code author} and a {@code subject}
 * field; only {@code subject} is indexed with term vectors (positions and
 * offsets), which is what {@code docsLike} later reads.
 *
 * @throws IOException if adding a document, optimizing or closing the writer fails
 */
public void index() throws IOException {
    // Third argument 'true': create the index, replacing whatever the directory already holds.
    IndexWriter indexWriter = new IndexWriter(directory, analyzer, true);
    try {
        indexWriter.addDocument(newBookDocument("java", "callan", "java一门编程语言"));
        indexWriter.addDocument(newBookDocument("english", "wcq", "英语用的人很多"));
        indexWriter.addDocument(newBookDocument("asp", "ca", "asp很多人用"));
        indexWriter.optimize();
    } finally {
        // Always release the writer, even when indexing fails half-way.
        indexWriter.close();
    }
}

/**
 * Creates one book document: stored, tokenized title/author/subject fields,
 * with term vectors (positions and offsets) kept for the subject only.
 */
private Document newBookDocument(String title, String author, String subject) {
    Document doc = new Document();
    doc.add(new Field("title", title, Store.YES, Index.TOKENIZED));
    doc.add(new Field("author", author, Store.YES, Index.TOKENIZED));
    doc.add(new Field("subject", subject,
            Store.YES, Index.TOKENIZED, TermVector.WITH_POSITIONS_OFFSETS));
    return doc;
}
/**
 * Searches the index for documents whose {@code title} contains the term
 * "java", prints each hit and then prints the documents related to it
 * (via {@code docsLike}).
 *
 * @throws IOException if the index cannot be read
 */
public void searcher() throws IOException {
    IndexSearcher searcher = new IndexSearcher(directory);
    try {
        // Look up the book whose title is "java".
        TermQuery query = new TermQuery(new Term("title", "java"));
        Hits hits = searcher.search(query);
        // With the sample data exactly one document matches.
        for (int i = 0; i < hits.length(); i++) {
            Document doc = hits.doc(i);
            System.out.println("书名:" + doc.get("title") + " " + "作者:" + doc.get("author") + "简介:" + doc.get("subject"));
            System.out.println("相关的书:");
            docsLike(hits.id(i));
        }
    } finally {
        // Hits loads documents lazily through the searcher, so close only after the loop.
        searcher.close();
    }
}
/**
 * Finds and prints documents whose {@code subject} shares at least one term
 * with the {@code subject} of the given document.
 * <p>
 * The terms are read from the document's stored term vector and OR-ed
 * together (every clause is {@code SHOULD}). The source document is not
 * excluded from the query, so it is expected to show up in its own results.
 *
 * @param id internal Lucene document number of the source document
 * @throws IOException if the index cannot be read
 */
public void docsLike(int id) throws IOException {
    IndexReader reader = IndexReader.open(directory);
    try {
        TermFreqVector vector = reader.getTermFreqVector(id, "subject");
        if (vector == null) {
            // No term vector was stored for this document/field: nothing to compare against.
            return;
        }
        // Fetch the term array once instead of on every loop iteration.
        String[] terms = vector.getTerms();
        BooleanQuery query = new BooleanQuery();
        for (int j = 0; j < terms.length; j++) {
            TermQuery tq = new TermQuery(new Term("subject", terms[j]));
            query.add(tq, BooleanClause.Occur.SHOULD);
        }
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            Hits hits = searcher.search(query);
            // Print while the searcher is still open; Hits fetches documents on demand.
            printResult(hits);
        } finally {
            searcher.close();
        }
    } finally {
        reader.close();
    }
}
/**
 * Prints one line per hit with the document's title, author and subject.
 *
 * @param hits search results to print
 * @throws IOException if a hit's document cannot be loaded
 */
public void printResult(Hits hits) throws IOException {
    int index = 0;
    while (index < hits.length()) {
        Document book = hits.doc(index);
        String title = book.get("title");
        String author = book.get("author");
        String subject = book.get("subject");
        System.out.println("书名:" + title + " " + "作者:" + author + " " + "简介:" + subject);
        index++;
    }
}
/**
 * Entry point of the example: builds the index, then runs the search.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if indexing or searching fails
 */
public static void main(String[] args) throws IOException {
    TermFreqVectorTest3 demo = new TermFreqVectorTest3();
    demo.index();
    demo.searcher();
}
搜索结果:
书名:java 作者:callan简介:java一门编程语言
相关的书:
书名:java 作者:callan 简介:java一门编程语言
书名:english 作者:wcq 简介:英语用的人很多
搜索书名为java 的索引,并且搜索与java的简介相关的索引.
将书<<java>>的subject分词为java/一/门/编/程/语/言/
在subject中搜索包含java/一/门/编/程/语/言/的索引
<<english>>的subject"英语用的人很多"中包含"语"字,因此被作为相关的书检索出来;<<asp>>的subject"asp很多人用"与上述任何一个词都不匹配,所以没有出现在结果中.