Lucene3.1 之读取txt文本文档内容

最新推荐文章于 2021-04-12 09:16:57 发布

阳光宅男

最新推荐文章于 2021-04-12 09:16:57 发布

阅读量1.8k

点赞数

分类专栏： Lucene 文章标签： lucene 文档 string exception query file

本文链接：https://blog.csdn.net/yysjch/article/details/6451937

版权

Lucene 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

package luch;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Small Lucene 3.x demo: indexes every {@code .txt} file found directly under
 * {@link #CONTENT_PATH} into an index stored at {@link #INDEX_PATH}, then runs
 * a single keyword query against the {@code content} field and prints the hits.
 */
public class DoSearch {

    /** Location of the Lucene index on disk (rebuilt from scratch on every run). */
    private static final String INDEX_PATH = "d://index";

    /** Folder whose {@code .txt} files are indexed. */
    private static final String CONTENT_PATH = "d://content";

    /** Charset used to decode the text files; presumably they are GBK-encoded - adjust if not. */
    private static final String CONTENT_CHARSET = "GBK";

    /** Name of the field holding the analyzed file content. */
    private static final String CONTENT_FIELD = "content";

    /** Upper bound on the number of hits fetched for a query. */
    private static final int MAX_HITS = 100;

    /**
     * Rebuilds the index and then searches it for the keyword {@code difficult}.
     *
     * @param args unused
     * @throws Exception if indexing fails or the index cannot be read
     */
    public static void main(String[] args) throws Exception {
        createIndex();
        // Open the folder that holds the index.
        Directory dir = new SimpleFSDirectory(new File(INDEX_PATH));
        try {
            IndexSearcher searcher = new IndexSearcher(dir);
            try {
                search(searcher, CONTENT_FIELD, "difficult");
            } finally {
                // Close the searcher even when the search itself fails.
                searcher.close();
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Parses {@code queryString} against {@code field}, runs it and prints the
     * name, content and path stored with every matching document.
     * A query that cannot be parsed is reported on stderr and produces no results.
     *
     * @param searcher    open searcher over the index
     * @param field       default field the query is evaluated against
     * @param queryString keyword(s) in Lucene query syntax
     * @throws IOException if the index cannot be read
     */
    private static void search(IndexSearcher searcher, String field, String queryString)
            throws IOException {
        try {
            // Arguments: Lucene compatibility version, default search field, analyzer.
            QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                    field, new StandardAnalyzer(Version.LUCENE_30));
            // Turn the keyword into a Query object.
            Query query = queryParser.parse(queryString);
            // TopDocs.scoreDocs holds the ids of the matching documents.
            TopDocs hits = searcher.search(query, MAX_HITS);
            // hits.totalHits is the total number of matches.
            System.out.println("索引个数:" + searcher.maxDoc() + "个     包含关键字的有:" + hits.totalHits + "个");
            // Load each matching Document back from the index and print its stored fields.
            for (ScoreDoc hit : hits.scoreDocs) {
                Document doc = searcher.doc(hit.doc);
                System.out.println("文件名为:" + doc.get("name"));
                System.out.println("内容为:" + doc.get("content"));
                System.out.println("路径为:" + doc.get("path"));
            }
        } catch (ParseException e) {
            System.err.println("Failed to parse query: " + queryString);
            e.printStackTrace();
        }
    }

    /**
     * Creates the index: one document per {@code .txt} file directly under
     * {@link #CONTENT_PATH}. Any existing index at {@link #INDEX_PATH} is overwritten.
     * Prints the elapsed time in milliseconds when done.
     *
     * @throws Exception if the content folder cannot be listed, a file cannot
     *                   be read, or the index cannot be written
     */
    private static void createIndex() throws Exception {
        long startMillis = System.currentTimeMillis();
        // Where the index files are stored.
        File indexDir = new File(INDEX_PATH);
        // Where the documents to be indexed live.
        File docDir = new File(CONTENT_PATH);

        // listFiles() returns null when the path is missing, not a directory or
        // unreadable. Check before opening the writer so that a bad content
        // folder does not wipe the existing index.
        File[] textFiles = docDir.listFiles();
        if (textFiles == null) {
            throw new IOException("Cannot list document directory: " + docDir.getAbsolutePath());
        }

        Directory indexDirectory = FSDirectory.open(indexDir);
        try {
            // create == true: start a fresh index, replacing any existing one.
            IndexWriter writer = new IndexWriter(indexDirectory,
                    new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.LIMITED);
            try {
                for (File textFile : textFiles) {
                    if (textFile.isFile() && textFile.getName().endsWith(".txt")) {
                        // Append one index record per text file.
                        writer.addDocument(toDocument(textFile));
                    }
                }
                writer.optimize();
            } finally {
                // Always close the writer so the index write lock is released.
                writer.close();
            }
        } finally {
            indexDirectory.close();
        }

        // Report how long indexing took.
        System.out.println("建立索引用时:" + (System.currentTimeMillis() - startMillis) + "毫秒");
    }

    /**
     * Builds the Lucene document for one text file with three stored fields:
     * {@code name} and {@code path} (indexed as single terms) and
     * {@code content} (analyzed, with term vectors).
     *
     * @param textFile file to index
     * @return the populated document
     * @throws IOException if the file cannot be read
     */
    private static Document toDocument(File textFile) throws IOException {
        Document doc = new Document();
        // File name: searchable, not tokenized.
        doc.add(new Field("name", textFile.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        // File path: searchable, not tokenized.
        doc.add(new Field("path", textFile.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        // The content must be indexed with Field.Index.ANALYZED. With
        // NOT_ANALYZED the whole text becomes a single term and keyword
        // queries never match it.
        doc.add(new Field(CONTENT_FIELD, readContent(textFile), Field.Store.YES,
                Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
        return doc;
    }

    /**
     * Reads a whole text file, decoding it with {@link #CONTENT_CHARSET}.
     * Lines are joined with {@code '\n'} so that the last word of one line and
     * the first word of the next stay separate tokens.
     *
     * @param textFile file to read
     * @return the file content, each line terminated by {@code '\n'}
     * @throws IOException if the file cannot be opened or read
     */
    private static String readContent(File textFile) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(textFile), CONTENT_CHARSET));
        try {
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
            return content.toString();
        } finally {
            // Release the file handle even if reading fails part-way.
            reader.close();
        }
    }
}