package cn.itcast.lucene.helloword;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.junit.Test;
import cn.itcast.lucene.utils.FileAndDocumentUtils;
/**
 * Lucene "hello world" walkthrough written as three JUnit tests: build an index from one text
 * file, search it (with highlighted summaries), and delete matching documents from it.
 *
 * <p>The same analyzer instance is used for indexing and for query parsing so that text is
 * tokenized identically on both sides.
 */
public class HelloWorld {
    /** Index fields that the multi-field query parser searches. */
    private static final String[] SEARCH_FIELDS = { "name", "content" };

    /** Upper bound on the number of hits fetched from the index in one search call. */
    private static final int MAX_RESULTS = 10000;

    /** Size, in characters, of the highlighted fragment and of the fallback summary. */
    private static final int SUMMARY_LENGTH = 50;

    // File to be indexed.
    // String filePath = "D:\\javacode\\lucenesDemo\\datasource\\IndexWriter addDocument's a javadoc .txt";
    String filePath = "D:\\javacode\\lucenesDemo\\datasource\\小笑话_总统的房间 Room .txt";

    // Directory that holds the Lucene index.
    String indexPath = "D:\\javacode\\lucenesDemo\\luceneIndex";

    // Analyzer (tokenizer) shared by index creation and querying.
    // Analyzer analyzer = new StandardAnalyzer();
    Analyzer analyzer = new MMAnalyzer();

    /**
     * Converts the file at {@link #filePath} into a Lucene document and writes it into a freshly
     * created index at {@link #indexPath}.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if the file or the index directory cannot be read or written
     */
    @Test
    public void createIndex() throws CorruptIndexException, IOException {
        // file ---> document: turn the file into the Document form stored in the index.
        Document doc = FileAndDocumentUtils.file2Document(new File(filePath));

        // IndexWriter performs add/delete/update on the index (querying is done elsewhere).
        // - analyzer: the tokenizer this index is built with.
        // - create = true: build a brand-new index at indexPath, replacing any index that is
        //   already there; false would append to an existing index and fail if none exists.
        // - MaxFieldLength.LIMITED: only the first 10000 terms of each field are indexed.
        IndexWriter indexWriter = new IndexWriter(indexPath, analyzer, true, MaxFieldLength.LIMITED);
        try {
            // Add the document to the index.
            indexWriter.addDocument(doc);
            // indexWriter.deleteDocuments(query)
            // indexWriter.updateDocument(term, doc)
            // indexWriter.optimize();
        } finally {
            // The writer holds file handles and the index write lock; always release them,
            // even when adding the document fails.
            indexWriter.close();
        }
    }

    /**
     * Searches the "name" and "content" fields for a fixed term, then prints the total hit count
     * and every matching document with its "content" replaced by a highlighted summary.
     *
     * <p>When the highlighter finds no match inside the content, the first
     * {@value #SUMMARY_LENGTH} characters of the content are used as the summary instead.
     * Documents without a stored "content" value are printed unchanged.
     *
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    @Test
    public void search() throws Exception {
        String searchString = "房间";

        // Parse the search text into a Query object spanning several fields.
        // To weight one field higher, pass a Map<String, Float> of per-field boosts
        // (e.g. "name" -> 3f; fields not listed default to 1.0f) as a third argument:
        // QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer, boosts);
        QueryParser queryParser = new MultiFieldQueryParser(SEARCH_FIELDS, analyzer);
        Query query = queryParser.parse(searchString);
        Filter filter = null;

        // IndexSearcher runs queries against the index stored at indexPath.
        IndexSearcher indexSearcher = new IndexSearcher(indexPath);
        try {
            // Alternative ways to build `query`, kept for reference.
            //
            // ------------------------- TermQuery -------------------------
            // Term term = new Term("name", "房间");
            // Query query = new TermQuery(term);
            //
            // ------------------------- RangeQuery ------------------------
            // (note: a leading 0 makes 0200 an octal literal, i.e. 128)
            // Term lowerTerm = new Term("size", NumberTools.longToString(0200));
            // Term upperTerm = new Term("size", NumberTools.longToString(1000));
            // Term lowerTerm = new Term("size", "050");
            // Term upperTerm = new Term("size", "500");
            // Query query = new RangeQuery(lowerTerm, upperTerm, false); // 3rd arg: include the bounds?
            //
            // ------------------------- WildcardQuery ---------------------
            // Term term = new Term("name", "roo?");
            // // Term term = new Term("name", "ro*"); // prefix search, cf. PrefixQuery
            // // Term term = new Term("name", "*o*");
            // // Term term = new Term("name", "房*");
            // Query query = new WildcardQuery(term);
            //
            // ------------------------- PhraseQuery -----------------------
            // PhraseQuery query = new PhraseQuery();
            // // query.add(new Term("content", "绅士"), 1); // 1 is the position of the term
            // // query.add(new Term("content", "饭店"), 4);
            // query.add(new Term("content", "绅士"));
            // query.add(new Term("content", "饭店"));
            // query.setSlop(2); // how many words may sit between the given terms
            //
            // ------------------------- BooleanQuery ----------------------
            // PhraseQuery query1 = new PhraseQuery();
            // query1.add(new Term("content", "绅士"));
            // query1.add(new Term("content", "饭店"));
            // query1.setSlop(2);
            // // second condition
            // Term lowerTerm = new Term("size", NumberTools.longToString(500));
            // Term upperTerm = new Term("size", NumberTools.longToString(1000));
            // Query query2 = new RangeQuery(lowerTerm, upperTerm, true);
            // // combination
            // BooleanQuery query = new BooleanQuery();
            // query.add(query1, Occur.MUST);
            // query.add(query2, Occur.SHOULD);
            //
            // ------------------------- Sorting ---------------------------
            // Sort sort = new Sort();
            // sort.setSort(new SortField("size", true));
            // TopDocs topDocs = indexSearcher.search(query, filter, 10000, sort);

            // The last argument is how many results to fetch from the index in one go.
            TopDocs topDocs = indexSearcher.search(query, filter, MAX_RESULTS);

            // Prepare the highlighter: matched terms are wrapped in a red <font> tag and the
            // summary fragment is limited to SUMMARY_LENGTH characters.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
            Scorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            Fragmenter fragmenter = new SimpleFragmenter(SUMMARY_LENGTH);
            highlighter.setTextFragmenter(fragmenter);

            // Print the total number of matching records.
            System.out.println("一工有的记录数是:" + topDocs.totalHits);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                int docSn = scoreDoc.doc; // internal document number
                Document doc = indexSearcher.doc(docSn); // load the document by its number

                String content = doc.get("content");
                // Guard: a document without stored content has nothing to highlight or cut.
                if (content != null) {
                    String summary = highlighter.getBestFragment(analyzer, "content", content);
                    if (summary == null) {
                        // No match inside the content: fall back to its leading characters.
                        // Open question: the hit may be in the title, which could be
                        // highlighted instead.
                        summary = content.substring(0, Math.min(SUMMARY_LENGTH, content.length()));
                    }
                    doc.getField("content").setValue(summary);
                }
                FileAndDocumentUtils.printDocumentInfo(doc);
            }
        } finally {
            // Release the index files held by the searcher, also when the search fails.
            indexSearcher.close();
        }
    }

    /**
     * Deletes every document whose "name" or "content" field matches a fixed search term.
     *
     * <p>The writer opens the existing index with {@code create = false}. Opening it with
     * {@code create = true} would replace the whole index with an empty one before the delete
     * ran, removing all documents rather than only the matching ones. As a consequence this
     * test requires the index to exist already.
     *
     * @throws Exception if the query cannot be parsed or the index cannot be opened or written
     */
    @Test
    public void delete() throws Exception {
        String searchString = "document";

        // Parse the search text into a Query object spanning several fields.
        QueryParser queryParser = new MultiFieldQueryParser(SEARCH_FIELDS, analyzer);
        Query query = queryParser.parse(searchString);

        IndexWriter indexWriter = new IndexWriter(indexPath, analyzer, false, MaxFieldLength.LIMITED);
        try {
            indexWriter.deleteDocuments(query);
        } finally {
            // Always release the write lock and file handles.
            indexWriter.close();
        }
    }
}