lucene索引word/pdf/html/txt文件及检索(搜索引擎)

最新推荐文章于 2024-09-19 15:10:23 发布

dengjianqiang001

最新推荐文章于 2024-09-19 15:10:23 发布

阅读量902

点赞数

文章标签： lucene 搜索引擎 path 文档 string null

因为lucene索引的时候是将String型的信息建立索引的，所以这里必须先将word/pdf/html等文件的内容转化为字符型。
lucene的jar包自己去下载。
首先是建立索引的代码：

public class TextFileIndexer {
     public static void main(String[] args) throws Exception {
         /* 指明要索引文件夹的位置,这里是d盘的s文件夹下 */
        File fileDir = new File( " d://s " );

         /* 这里放索引文件的位置 */
        File indexDir = new File( " d://index " );
        Analyzer luceneAnalyzer = new StandardAnalyzer();
        IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer,
                 true );
        File[] textFiles = fileDir.listFiles();
         long startTime = new Date().getTime();

         // 增加document到索引去
                System.out.println( " File正在被索引

. " );

                 /*
                 * 注意要变的就是这里，路径和读取文件的方法
                 * */
                String path = " d://s//2.doc " ;
                String temp = ReadFile.readWord(path);
//                 String path ="d://s//index.htm";
//                 String temp = ReadFile.readHtml(path);
                Document document = new Document();
                Field FieldPath = new Field( " path " ,path,
                        Field.Store.YES, Field.Index.NO);
                Field FieldBody = new Field( " body " , temp, Field.Store.YES,
                        Field.Index.TOKENIZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS);
                document.add(FieldPath);
                document.add(FieldBody);
                indexWriter.addDocument(document);


         // optimize()方法是对索引进行优化
        indexWriter.optimize();
        indexWriter.close();

         // 测试一下索引的时间
         long endTime = new Date().getTime();
        System.out
                .println( " 这花费了 "
                         + (endTime - startTime)
                         + " 毫秒来把文档增加到索引里面去! "
                         + fileDir.getPath());
    }
}

上面已经注释了要换的地方，我们要做的就是换文件的路径和读取文件的方法。

下面来具体看下读取文件的方法

1.首先来看WORD文档：
我这里用的是poi，相关jar包自己去下载，然后加到工程中（以下所要用的jar包也是，不再重复说）。

来看相关代码：

     /**
      * Extracts the text of a Word document by concatenating the text of all
      * of its paragraphs.
      *
      * <p>Failures are reported on standard error rather than thrown; in that
      * case whatever text was collected before the failure is returned.
      *
      * @param path path of the Word file to read
      * @return the concatenated paragraph text with surrounding whitespace
      *         trimmed; an empty string if nothing could be read
      */
     public static String readWord(String path) {
        StringBuilder content = new StringBuilder(); // document text
        FileInputStream in = null;
        try {
            in = new FileInputStream(path);
            HWPFDocument doc = new HWPFDocument(in);
            Range range = doc.getRange();
            int paragraphCount = range.numParagraphs();
            for (int i = 0; i < paragraphCount; i++) { // collect every paragraph
                Paragraph paragraph = range.getParagraph(i);
                content.append(paragraph.text());
            }
        } catch (Exception e) {
            // Keep the best-effort contract but make the failure visible.
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (java.io.IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return content.toString().trim();
     }

2.PDF文件用的是PDFbox：

/**
 * Extracts the text of a PDF file using PDFBox.
 *
 * @param path path of the PDF file to read
 * @return the extracted text with surrounding whitespace trimmed
 * @throws Exception if the file cannot be opened, parsed or stripped
 */
public static String readPdf(String path) throws Exception {
    FileInputStream fis = new FileInputStream(path);
    try {
        PDFParser parser = new PDFParser(fis);
        parser.parse();
        PDFTextStripper stripper = new PDFTextStripper();
        // NOTE(review): the document returned by getPDDocument() is never
        // closed here - confirm whether the PDFBox version in use needs it.
        String text = stripper.getText(parser.getPDDocument());
        return text.trim();
    } finally {
        // Release the file handle even when parsing or extraction fails.
        fis.close();
    }
}

3.html文件：

/**
 * Reads an HTML file as UTF-8 text, line by line.
 *
 * <p>Failures are reported on standard error rather than thrown; in that
 * case whatever text was read before the failure is returned.
 *
 * @param urlString path of the HTML file on disk
 * @return the file content with each line terminated by {@code "\n"};
 *         an empty string if the file could not be read
 */
public static String readHtml(String urlString) {
    StringBuilder content = new StringBuilder();
    FileInputStream fis = null;
    try {
        fis = new FileInputStream(new File(urlString));
        // The charset name must be exact, and it has to match the encoding
        // declared by the HTML file, otherwise the text comes out garbled.
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(fis, "utf-8"));

        String line;
        while ((line = reader.readLine()) != null) {
            // Terminate lines with a real newline, not literal "/n" text.
            content.append(line).append('\n');
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (fis != null) {
            try {
                fis.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
    return content.toString();
}

4.txt文件：

/**
 * Reads a plain-text file line by line using the platform default charset.
 *
 * <p>I/O failures are reported on standard error rather than thrown; in that
 * case whatever text was read before the failure is returned.
 *
 * @param path path of the text file to read
 * @return the file content with lines joined by {@code "\n"} and surrounding
 *         whitespace trimmed; an empty string if the file could not be read
 */
public static String readTxt(String path) {
    StringBuilder content = new StringBuilder(); // document text
    BufferedReader br = null;
    try {
        br = new BufferedReader(new FileReader(path));
        String line;
        while ((line = br.readLine()) != null) {
            // Separate lines with a real line break, not literal "/r" text
            // that would end up inside the indexed content.
            content.append(line).append('\n');
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (br != null) {
            try {
                // Closing the BufferedReader also closes the FileReader.
                br.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
    return content.toString().trim();
}

接下来是搜索代码：

public class TestQuery {
     /**
      * Searches the index for a hard-coded query string and prints the stored
      * body of every matching document.
      *
      * @param args unused
      * @throws IOException if the index cannot be opened or searched
      * @throws ParseException declared for compatibility; parse failures are
      *         reported on standard output and end the run
      */
     public static void main(String[] args) throws IOException, ParseException {
        // Text to search for; replace as needed.
        String queryString = " 根据国务院的决定 " ;

        // Must be the directory the index was written to.
        IndexSearcher searcher = new IndexSearcher( " d://index " );
        try {
            Analyzer analyzer = new StandardAnalyzer();
            // The field name given here must be identical to the name of the
            // body field created when the index was built, because the query
            // is evaluated against that field.
            QueryParser qp = new QueryParser( " body " , analyzer);

            Query query;
            try {
                query = qp.parse(queryString);
            } catch (ParseException e) {
                System.out.println( " 异常 " );
                // There is no query to run, so stop instead of searching with null.
                return;
            }

            Hits hits = searcher.search(query);
            int total = hits.length();
            if (total > 0) {
                System.out.println( " 找到: " + total + " 个结果! " );
                for (int i = 0; i < total; i++) { // print every hit
                    Document document = hits.doc(i);
                    // document.get on the body field returns the full stored text;
                    // use the path field instead to print the file path.
                    System.out.println( " contents： " + document.get( " body " ));
                }
            } else {
                System.out.println( " 0个结果! " );
            }
        } finally {
            // Release the index files whether or not the search succeeded.
            searcher.close();
        }
     }