全文检索Lucene的使用

全文检索流程:
1. 建立索引:创建索引文件,从各个文件中解析出文本(不同文件类型的解析方法不同),再把文本加入到索引文件;
   所支持的文件格式:txt、pdf、ppt、xls、doc、html、htm;
2. 检索文件:在索引文件中查找与关键字匹配的内容;
实例(已经测试通过) 
TxtFileIndexer.java
 

package net.lucene.edu;

/**
 * 根据不同的文件类型,使用相对应的工具进行解析,创建索引文件
 * @author lvx
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStream;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;

import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.TextNode;

import org.htmlparser.util.NodeList;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;


public class TxtFileIndexer {


  public static void main(String[] args) throws Exception{          
      
        File indexDir = new File("D://tmp//luceneIndex");  //索引文件存放路径     
        File dataDir  = new File("D://tmp//luceneData");   //文件所存路径         
      
        Analyzer luceneAnalyzer = new CJKAnalyzer();         
        File[] dataFiles  = dataDir.listFiles();
        IndexWriter indexWriter = new IndexWriter(indexDir,luceneAnalyzer,false); //创建索引文件的时候创建indexWriter时候指定true,往指定索引文件中加入新的索引指定false
        Document document = null;
       
        for(int i = 0; i < dataFiles.length; i++){        
         if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")){          
          document = new Document();
          FileReader reader = new FileReader(dataFiles[i]);        
          document.add(new Field("path",dataFiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.NO));        
          document.add(new Field("contents",reader));          
          indexWriter.addDocument(document);    
               
         }else if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".pdf")){
          document = new Document();
          document = LucenePDFDocument.getDocument(dataFiles[i]);        
          indexWriter.addDocument(document); 
          
         }else if(dataFiles[i].getName().endsWith(".htm") || dataFiles[i].getName().endsWith(".html")){
          document = new Document();
             NodeFilter filter = new NodeClassFilter(TextNode.class);
             Parser parser = new Parser();
             parser.setURL(dataFiles[i].getPath());
             parser.setEncoding("GBK");
             NodeList list = parser.extractAllNodesThatMatch(filter);            
             document.add(new Field("path", dataFiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.NO));        
             document.add(new Field("contents", list.asString(),Field.Store.YES,Field.Index.TOKENIZED));
             indexWriter.addDocument(document);    
          
         }else if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".doc")){
             document = new Document();
             FileInputStream stream = new FileInputStream (dataFiles[i]);
             WordExtractor extractor = new WordExtractor(stream);
             String str = extractor.getText();       
             document.add(new Field("path", dataFiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.NO));
             document.add(new Field("contents", str,Field.Store.YES,Field.Index.TOKENIZED));
             indexWriter.addDocument(document);   
            
         }else if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".xls")){           
             document = new Document();
             StringBuffer str = new StringBuffer();
             InputStream fileInputStream = new FileInputStream(dataFiles[i]);                        
             Workbook rwb = Workbook.getWorkbook(fileInputStream);        //获得总 Sheets
             Sheet[] sheets = rwb.getSheets();
             int sheetLen = sheets.length;
             for(int j=0;j<sheetLen;j++){                                  //获得单个Sheets 含有的行数
              Sheet rs = rwb.getSheet(j);
              Cell[] cell_domain = null;
              for(int ii=0;ii<rs.getRows();ii++){
               cell_domain = rs.getRow(ii);
               for(int jj=0;jj<cell_domain.length;jj++){
                str.append(cell_domain[jj].getContents());
               }             
              }     
             }                                   
             document.add(new Field("path", dataFiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.NO));
             document.add(new Field("contents", str.toString(),Field.Store.YES,Field.Index.TOKENIZED));
             indexWriter.addDocument(document); 
            
         }else if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".ppt")){           
             document = new Document();
             FileInputStream fileInputStream = new FileInputStream (dataFiles[i]);
             PowerPointExtractor extractor = new PowerPointExtractor(fileInputStream);
             String str = extractor.getText();       
             document.add(new Field("path", dataFiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.NO));
             document.add(new Field("contents", str,Field.Store.YES,Field.Index.TOKENIZED));
             indexWriter.addDocument(document);
         }  
        }      
        indexWriter.optimize();      
        indexWriter.close();      
        System.out.println("索引建立完成!");   
  }
 
}

package net.lucene.edu;

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;

public class TxtFileSearcher {
 
 public static void main(String[] args) throws Exception{ 
       
  System.out.println("开始检索关键字:双鸭山");
  String queryStr = "吕兴";    //检索关键字
  String indexDirect = "D://tmp//luceneIndex";    //索引文件存放路径
  Hits hits = null;
  Query query = null;
        Analyzer analyzer = new CJKAnalyzer();
        IndexSearcher searcher = null;
        searcher = new IndexSearcher(indexDirect);
        QueryParser parser = new QueryParser("contents", analyzer);
        query = parser.parse(queryStr); 
        hits = searcher.search(query);
        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(50));
  for(int i = 0; i < hits.length(); i++){       
     Document document = hits.doc(i);       
     System.out.println("检索文件: " + document.get("path"));
     String content = document.get("contents");    
     if (content!= null){
      TokenStream tokenStream =analyzer.tokenStream("content", new StringReader(content));
      System.out.println(highlighter.getBestFragment(tokenStream,hits.doc(i).get("contents")));
     }
  }
 }
}

完成。编译和运行时需要把相应的 jar 包加入 classpath:PDFBox、POI、Lucene(包括 CJK 分析器和 highlighter 扩展包)、jxl、htmlparser。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值