Lucene 的基本操作详解

package Lucene;
import groovy.sql.Sql;
import java.io.IOException;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.util.Date;
import java.util.List;
import java.util.Map;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeFilter;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;


public class LuceneDemo {

// Filesystem location of the index directory; every method in this class
// opens it through FSDirectory.open(Paths.get(path)).
public static final String path = "d://lucene";


//创建索引
public  void createIndex(List<Map<String, String>> maps){
try{
//使用nio,存储索引文件
Directory directory = FSDirectory.open(Paths.get(path));
//自带的分词器
// Analyzer analyzer = new StandardAnalyzer();//一元分词

Analyzer analyzer = new IKAnalyzer(true);
//配置信息indexWriterConfig
IndexWriterConfig config = new IndexWriterConfig(analyzer);
//创建IndexWriter
IndexWriter indexWriter = new IndexWriter(directory, config);


for(Map<String, String> map : maps){
Document doc = new Document();
String date = map.get("dateStr");
String tydm = map.get("TYDM");
String jgmc = map.get("JGMC");
long time = 0L;
/**
* TextField:既索引又分词
* StringField:只索引不分词
* Field.Store.YES:存储,使用document可以取得值展示在搜索页面中
* Field.Store.NO:不存储,取不到值,只做筛选项
*/
if(tydm != null)
doc.add(new StringField("tydm", tydm, Field.Store.YES));
if(jgmc != null)
doc.add(new TextField("jgmc", jgmc, Field.Store.YES));
if(date != null) 
time = DateUtils.parseDate(date,"yyyy/MM/dd HH:mm:ss").getTime();
doc.add(new LongField("date", time, Field.Store.YES));
///要排序必须加同名的field,且类型为NumericDocValuesField  
doc.add(new NumericDocValuesField("date",time));
indexWriter.addDocument(doc);
}
indexWriter.close();
}catch (Exception e) {
e.getStackTrace();
System.out.println(e.getMessage());
}
}

//搜索
public void Search(String keyWord){
DirectoryReader directoryReader = null;
try{
//1.创建目录
Directory directory = FSDirectory.open(Paths.get(path));
//2.创建indexReader
directoryReader = DirectoryReader.open(directory);
//3.根据indexReader 创建IndexSearch
IndexSearcher indexSearcher  = new IndexSearcher(directoryReader);
//4.创建搜索的Query
Analyzer analyzer = new IKAnalyzer(true);//使用IK分词
//简单的查询,创建Query表示搜索域为content包含keyword的文档
String []fields = {"jgmc","jgmc"};//要搜索的字段,一般搜索时都不会只搜索一个字段
String fieldName = "jgmc";
long beginTime = System.currentTimeMillis();
// 字段之间的与或非关系,MUST表示and,MUST_NOT表示not,SHOULD表示or,有几个fields就必须有几个clauses
       BooleanClause.Occur[] clauses = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
       // MultiFieldQueryParser表示多个域解析, 同时可以解析含空格的字符串,如果我们搜索"上海 中国" 
       Query multiFieldQuery = MultiFieldQueryParser.parse(keyWord, fields, clauses, analyzer);
       //单个解析
//        QueryParser queryParser = new QueryParser(fieldName, analyzer);
//        Query multiFieldQuery = queryParser.parse(keyWord);
       /**
        * 对数据排序 
        */
       Sort sort = new Sort();
       SortField sortField = new SortField("date", SortField.Type.LONG,true);
       sort.setSort(sortField);
       
       //日期范围搜索
//        Filter filter = NumericRangeFilter.newLongRange("date", 1496250403001L, 1517519139010L, true, true);
       
       //根据search搜索并且返回TopDocs
//        TopDocs topDocs = indexSearcher.search(multiFieldQuery, 1000); //搜索前100条
       TopDocs topDocs = indexSearcher.search(multiFieldQuery, 1000, sort);
//        TopFieldDocs topDocs = indexSearcher.search(multiFieldQuery, filter, 1000, sort);
       
       /**
        * 复杂的搜索
        */
      /* BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
       //1.词语搜索,完全匹配,搜索具体的域
       Term term = new Term("jgmc","张家港");
       TermQuery termQuery = new TermQuery(term);
//        queryBuilder.add(multiFieldQuery, BooleanClause.Occur.SHOULD);
//        queryBuilder.add(termQuery,BooleanClause.Occur.SHOULD);
       //2.通配符查询   ?表示0或1个  *表示0或多个
       Term term2 = new Term("jgmc","*张家港市阳光*");
       WildcardQuery wildcardQuery = new WildcardQuery(term2);
       queryBuilder.add(wildcardQuery, BooleanClause.Occur.SHOULD);
//        TermRangeQuery rangeQuery = new TermRangeQuery("date", "2011-03-09", "2013-01-07", true, true);
       //3.短语查询
       PhraseQuery phraseQuery = new PhraseQuery();
       phraseQuery.add(new Term("jgmc","无锡"));
       phraseQuery.add(new Term("jgmc","公司"));
       phraseQuery.setSlop(10);//之间的距离不超过5个单词
//        queryBuilder.add(phraseQuery,BooleanClause.Occur.MUST);
       BooleanQuery query = queryBuilder.build();
       TopDocs topDocs = indexSearcher.search(query, 1000,sort);*/
       
       
       
       
       long endTime = System.currentTimeMillis();
       
      
       //6.根据TopDocs获取ScoreDoc对象
       ScoreDoc[] scoreDocs  = topDocs.scoreDocs;
       
       QueryScorer scorer = new QueryScorer(multiFieldQuery,"jgmc");
       
       // 自定义高亮代码
       SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"backgroud:red\">", "</span>");
       Highlighter highlighter = new Highlighter(htmlFormatter, scorer);
       highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
       int ii  = 1;
       for (ScoreDoc scoreDoc : scoreDocs)
       {
           // 7、根据searcher和ScoreDoc对象获取具体的Document对象
           Document document = indexSearcher.doc(scoreDoc.doc);// 根据文档打分得到文档的内容  
           //TokenStream tokenStream = new SimpleAnalyzer().tokenStream("content", new StringReader(content));
           //TokenSources.getTokenStream("content", tvFields, content, analyzer, 100);
           //TokenStream tokenStream = TokenSources.getAnyTokenStream(indexSearcher.getIndexReader(), scoreDoc.doc, "content", document, analyzer);
           //System.out.println(highlighter.getBestFragment(tokenStream, content));
           System.out.println("------------------第"+ii+"行:-----------------------");
           Date date = new Date();
           String string = document.get("date");
           date.setTime(Long.parseLong(string));
           System.out.println(document.get("jgmc") + ":" + document.get("tydm")+":"+DateUtils.format(date, "yyyy/MM/dd HH:mm:ss")+":long--》"+string);
           System.out.println(highlighter.getBestFragment(analyzer, "jgmc", document.get("jgmc")));
           System.out.println("");
           ii++;
       }
       System.out.println("共找到匹配处:"+topDocs.totalHits);
       System.out.println("共找到匹配文档数:"+scoreDocs.length);
       System.out.println("搜索的时长:"+(endTime-beginTime));
    // 总共的索引文档  
}catch (Exception e) {
System.out.println(e.getMessage());
e.getStackTrace();
}
}
/**
* 删除索引的两种方法
* @return
* @throws IOException 
* @throws ClassNotFoundException
* @throws SQLException
*/
public  void deleteDocument(String tydm,String jgmc) {
try {
Date date1 = new Date();
//使用nio,存储索引文件
Directory directory = FSDirectory.open(Paths.get(path));
//自带的分词器
// Analyzer analyzer = new StandardAnalyzer();//一元分词
Analyzer analyzer = new IKAnalyzer(true);
//配置信息indexWriterConfig
IndexWriterConfig config = new IndexWriterConfig(analyzer);
//创建IndexWriter
IndexWriter indexWriter = new IndexWriter(directory, config);
//第一种根据Term 词删除
/**
* 删除不成功的原因是,删除的该字段被分词了,将其设置为不分词仅仅索引即可
*/
Term arg0 = new Term("tydm", tydm);
//第二种根据Query查询来删除
QueryParser parser = new QueryParser("jgmc", analyzer);
Query arg1 = parser.parse(jgmc);
//调用删除,然后关闭
indexWriter.deleteDocuments(arg1);
indexWriter.close();
Date date2 = new Date();
System.out.println("删除索引耗时:" + (date2.getTime() - date1.getTime()) + "ms\n");
} catch (Exception e) {
// TODO: handle exception
e.printStackTrace();
}
}

/**
 * Adds one hard-coded sample document to the index and prints the elapsed
 * time in milliseconds.
 *
 * <p>The document carries a stored, un-tokenized {@code tydm}, a stored,
 * tokenized {@code jgmc}, and a {@code date} in epoch milliseconds that is
 * written both as a stored field and as a doc-values field.
 * Any failure is printed and not propagated to the caller.
 */
public void insertDoc(){
    try {
        long startMillis = System.currentTimeMillis();
        Analyzer analyzer = new IKAnalyzer(true);
        // try-with-resources replaces the old finally block, which called
        // close() on a null writer whenever opening the index had failed.
        try (Directory directory = FSDirectory.open(Paths.get(path));
             IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
            String tydm = "91320200MA1MKKPE6R";
            String jgmc = "无锡昆瑞化工有限公司";
            String dateStr = "2017/05/24 05:05:39";
            long time = DateUtils.parseDate(dateStr, "yyyy/MM/dd HH:mm:ss").getTime();

            Document doc = new Document();
            doc.add(new StringField("tydm", tydm, Store.YES));
            doc.add(new TextField("jgmc", jgmc, Store.YES));
            doc.add(new LongField("date", time, Store.YES));
            // Same-named doc-values field, required for sorting on "date".
            doc.add(new NumericDocValuesField("date", time));
            indexWriter.addDocument(doc);

            long elapsedMillis = System.currentTimeMillis() - startMillis;
            System.out.println("新增索引耗时:" + elapsedMillis + "ms\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/**
 * Replaces the document identified by a hard-coded {@code tydm} value with a
 * new version and prints the elapsed time in milliseconds.
 *
 * <p>{@code IndexWriter.updateDocument} is a delete-then-add: documents
 * matching the term are removed and the new document is added.
 * Any failure is printed and not propagated to the caller.
 */
public void updateDocument(){
    try {
        long startMillis = System.currentTimeMillis();
        Analyzer analyzer = new IKAnalyzer(true);
        // try-with-resources replaces the old finally block, which called
        // close() on a null writer whenever opening the index had failed.
        try (Directory directory = FSDirectory.open(Paths.get(path));
             IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
            String tydm = "91320200MA1MKKPE6R";
            String jgmc = "无锡崔春驰昆瑞化工有限公司";
            String dateStr = "2018/02/02 05:05:39";
            long time = DateUtils.parseDate(dateStr, "yyyy/MM/dd HH:mm:ss").getTime();

            Document doc = new Document();
            doc.add(new StringField("tydm", tydm, Store.YES));
            doc.add(new TextField("jgmc", jgmc, Store.YES));
            doc.add(new LongField("date", time, Store.YES));
            // Same-named doc-values field, required for sorting on "date".
            doc.add(new NumericDocValuesField("date", time));

            // The un-tokenized "tydm" term identifies the document(s) to replace.
            indexWriter.updateDocument(new Term("tydm", tydm), doc);

            long elapsedMillis = System.currentTimeMillis() - startMillis;
            System.out.println("更新索引耗时:" + elapsedMillis + "ms\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

//数据的分页
/**
* 两种:1这就是先查询所有的数据,然后去分页数据 
    * 注意 这种方式处理海量数据的时候,容易内存溢出
    * 2:使用searchAfter
* @return
* @throws ClassNotFoundException
* @throws SQLException
*/
public void queryPage1(int pageSize,int pageIndex,String keyword){
DirectoryReader directoryReader = null;
try{
//1.创建目录
Directory directory = FSDirectory.open(Paths.get(path));
//2.创建indexReader
directoryReader = DirectoryReader.open(directory);
//3.根据indexReader 创建IndexSearch
IndexSearcher indexSearcher  = new IndexSearcher(directoryReader);
//4.创建搜索的Query
Analyzer analyzer = new IKAnalyzer(true);//使用IK分词
//简单的查询,创建Query表示搜索域为content包含keyword的文档
String fieldName = "jgmc";
       //单个解析
       QueryParser queryParser = new QueryParser(fieldName, analyzer);
       Query multiFieldQuery = queryParser.parse(keyword);
       //根据search搜索并且返回TopDocs
       TopDocs topDocs = indexSearcher.search(multiFieldQuery, 10000);
       //6.根据TopDocs获取ScoreDoc对象
       ScoreDoc[] scoreDocs  = topDocs.scoreDocs;
       //进行分页
       int start = (pageIndex -1)*pageSize;
       int end = pageIndex * pageSize;
       int ii  = 1;
       for (int i =start;i<end ;i++)
       {
           // 7、根据searcher和ScoreDoc对象获取具体的Document对象
           Document document = indexSearcher.doc(scoreDocs[i].doc);// 根据文档打分得到文档的内容  
           System.out.println("------------------第"+ii+"行:-----------------------");
           Date date = new Date();
           String string = document.get("date");
           date.setTime(Long.parseLong(string));
           System.out.println(document.get("jgmc") + ":" + document.get("tydm")+":"+DateUtils.format(date, "yyyy/MM/dd HH:mm:ss")+":long--》"+string);
           System.out.println("");
           ii++;
       }
       System.out.println("共找到匹配处:"+topDocs.totalHits);
       System.out.println("共找到匹配文档数:"+scoreDocs.length);
    // 总共的索引文档  
}catch (Exception e) {
System.out.println(e.getMessage());
e.getStackTrace();
}
}
//2:使用searchAfter分页
public void queryPage2(int pageSize,int pageIndex,String keyword){
DirectoryReader directoryReader = null;
try{
Date date1 = new Date();
//1.创建目录
Directory directory = FSDirectory.open(Paths.get(path));
//2.创建indexReader
directoryReader = DirectoryReader.open(directory);
//3.根据indexReader 创建IndexSearch
IndexSearcher indexSearcher  = new IndexSearcher(directoryReader);
//4.创建搜索的Query
Analyzer analyzer = new IKAnalyzer(true);//使用IK分词
//简单的查询,创建Query表示搜索域为content包含keyword的文档
String fieldName = "jgmc";
       //单个解析
       QueryParser queryParser = new QueryParser(fieldName, analyzer);
       Query multiFieldQuery = queryParser.parse(keyword);
       //进行searchAfter分页
       int num = (pageIndex -1)*pageSize;
      //根据search搜索并且返回TopDocs
    TopDocs topDocs = null;
      //获取上一页的最后数量
    ScoreDoc beforeDoc = null;
       if(pageIndex == 1){
        num = pageSize;
        topDocs =  indexSearcher.search(multiFieldQuery, num);
       }else{
        topDocs =  indexSearcher.search(multiFieldQuery, num);
        beforeDoc = topDocs.scoreDocs[num-1];
       }
    //通过最后一个元素去搜索下一页元素
    TopDocs searchAfter = indexSearcher.searchAfter(beforeDoc, multiFieldQuery, pageSize);
    ScoreDoc[] scoreDocs2 = searchAfter.scoreDocs;
       int ii  = 1;
       for (ScoreDoc scoreDocs : scoreDocs2)
       {
           // 7、根据searcher和ScoreDoc对象获取具体的Document对象
           Document document = indexSearcher.doc(scoreDocs.doc);// 根据文档打分得到文档的内容  
           //TokenStream tokenStream = new SimpleAnalyzer().tokenStream("content", new StringReader(content));
           //TokenSources.getTokenStream("content", tvFields, content, analyzer, 100);
           //TokenStream tokenStream = TokenSources.getAnyTokenStream(indexSearcher.getIndexReader(), scoreDoc.doc, "content", document, analyzer);
           //System.out.println(highlighter.getBestFragment(tokenStream, content));
           System.out.println("------------------第"+ii+"行:-----------------------");
           Date date = new Date();
           String string = document.get("date");
           date.setTime(Long.parseLong(string));
           System.out.println(document.get("jgmc") + ":" + document.get("tydm")+":"+DateUtils.format(date, "yyyy/MM/dd HH:mm:ss")+":long--》"+string);
           System.out.println("");
           ii++;
       }
       System.out.println("共找到匹配处:"+topDocs.totalHits);
       // 总共的索引文档  
       System.out.println("共找到匹配文档数:"+scoreDocs2.length);
       Date date2 = new Date();
System.out.println("分页索引耗时:" + (date2.getTime() - date1.getTime()) + "ms\n");
}catch (Exception e) {
System.out.println(e.getMessage());
e.getStackTrace();
}
}


/**
 * Loads up to 100 rows from {@code tydm_tydm} (those with a non-null
 * {@code BZJGDM}) over an Oracle JDBC connection, prints the row count and
 * returns the rows.
 *
 * <p>Each row also carries {@code CHECK_END_DATE} rendered as text under the
 * alias {@code dateStr}, in a layout matching the Java pattern
 * {@code yyyy/MM/dd HH:mm:ss}.
 *
 * @return the fetched rows, one map per row
 * @throws ClassNotFoundException if the JDBC driver class cannot be loaded
 * @throws SQLException           if connecting or querying fails
 */
public List<Map<String, String>> connectionOBDC() throws ClassNotFoundException, SQLException{
    // NOTE(review): connection URL and credentials are hard-coded; consider
    // moving them to configuration.
    Sql con = Sql.newInstance("jdbc:oracle:thin:@192.168.12.250:1521/jsdm", "jsdm_gbk", "jsdm_gbk","oracle.jdbc.driver.OracleDriver");
    try {
        // Fix: in Oracle format masks MM is the month and HH is a 12-hour
        // clock; minutes are MI and the 24-hour clock is HH24. The old mask
        // 'HH:MM:ss' therefore produced hour:month:second.
        String selectSql = "select to_char(t.CHECK_END_DATE,'YYYY/MM/DD HH24:MI:SS') as dateStr,t.* from tydm_tydm t where  rownum <= 100 and BZJGDM is not null";
        List<Map<String, String>> rows = con.rows(selectSql);
        System.out.println(rows.size());
        return rows;
    } finally {
        // Release the database connection; the original never closed it.
        con.close();
    }
}

public static void main(String[] args) throws ClassNotFoundException, SQLException {
LuceneDemo luceneDemo = new LuceneDemo();
// List<Map<String, String>> rows = luceneDemo.connectionOBDC();
创建索引
// luceneDemo.createIndex(rows);
//删除索引文档的中的信息
// luceneDemo.deleteDocument("91320200MA1MKKPE6R","无锡");
// luceneDemo.insertDoc();
// luceneDemo.updateDocument();
// luceneDemo.queryPage1(10, 1, "无锡分公司");
luceneDemo.queryPage2(20, 1, "无锡分公司");
// luceneDemo.Search("无锡分公司");
}
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值