lucene全文检索
- 适用于 word、ppt、excel、pdf
### 代码块Indexer
lucene索引代码:
package com.fzky.diams.web.luncene;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Builds a Lucene full-text index from files on disk.
 *
 * <p>Each file becomes one document with three stored text fields:
 * {@code contents} (text extracted through the project-local {@code ReaderFile}
 * helper), {@code fileName} and {@code fullPath}.
 *
 * <p>Typical usage:
 * <pre>
 * Indexer.index("D:\\lucene", "D:\\lucene\\data");
 * </pre>
 */
public class Indexer {

    /** Directory that holds the index; closed together with the writer. */
    private final Directory directory;

    /** Writer used to add documents to the index. */
    private IndexWriter writer;

    /**
     * Legacy package-visible list. The file traversal no longer accumulates into
     * shared static state, so this list is never modified by this class; it is kept
     * only so that other code in the package that references it still compiles.
     */
    static List<String> list = new ArrayList<String>();

    /**
     * Opens (and overwrites) the index stored in {@code indexDir}.
     *
     * @param indexDir path of the directory where the index is written
     * @throws Exception if the directory or the index writer cannot be opened
     */
    public Indexer(String indexDir) throws Exception {
        directory = FSDirectory.open(Paths.get(indexDir));
        try {
            // IK-based analyzer; StandardAnalyzer or SmartChineseAnalyzer could be
            // substituted here, but the searcher must then use the same analyzer.
            Analyzer analyzer = new MyIkAnalyzer();
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            // CREATE discards any index that already exists at this location.
            config.setOpenMode(OpenMode.CREATE);
            writer = new IndexWriter(directory, config);
        } catch (Exception e) {
            // Do not leak the directory handle when the writer cannot be created.
            directory.close();
            throw e;
        }
    }

    /**
     * Closes the index writer and the underlying directory.
     *
     * @throws Exception if closing the writer or the directory fails
     */
    public void close() throws Exception {
        try {
            writer.close();
        } finally {
            directory.close();
        }
    }

    /**
     * Indexes every regular file located directly inside {@code dataDir}
     * (sub-directories are skipped, not descended into).
     *
     * @param dataDir directory whose files are indexed
     * @return number of documents in the index after the operation
     * @throws Exception if reading or indexing a file fails
     */
    public int indexAll(String dataDir) throws Exception {
        File[] files = new File(dataDir).listFiles();
        // listFiles() returns null when dataDir is not a directory or cannot be read.
        if (files != null) {
            for (File file : files) {
                if (file.isFile()) {
                    indexFile(file);
                }
            }
        }
        return writer.numDocs();
    }

    /**
     * Adds a single file to the index and logs progress to standard output.
     *
     * @param file file to index
     * @throws Exception if the file cannot be read or added
     */
    private void indexFile(File file) throws Exception {
        System.out.println("索引文件的路径:" + file.getCanonicalPath());
        Document doc = getDocument(file);
        writer.addDocument(doc);
        System.out.println("索引数:::" + writer.numDocs());
    }

    /**
     * Builds the Lucene document for a file; a document is comparable to one row
     * of a database table, with one column per field.
     *
     * @param file file to convert
     * @return document holding the {@code contents}, {@code fileName} and
     *         {@code fullPath} fields
     * @throws Exception if text extraction or path resolution fails
     */
    private Document getDocument(File file) throws Exception {
        Document doc = new Document();
        doc.add(new Field("contents", extractText(file), TextField.TYPE_STORED));
        doc.add(new Field("fileName", file.getName(), TextField.TYPE_STORED));
        doc.add(new Field("fullPath", file.getCanonicalPath(), TextField.TYPE_STORED));
        return doc;
    }

    /**
     * Extracts the text of a file by dispatching on its extension.
     *
     * @param file file to read
     * @return extracted text, or an empty string for unsupported extensions
     * @throws Exception if the reader helper fails
     */
    private String extractText(File file) throws Exception {
        String filepath = file.getAbsolutePath();
        ReaderFile readerfile = new ReaderFile();
        String text;
        switch (extensionOf(file)) {
        case "txt":
            text = readerfile.readFile(filepath);
            break;
        case "doc":
            text = readerfile.readWord(filepath);
            break;
        case "docx":
            text = readerfile.readWordDocx(filepath);
            break;
        case "xlsx":
            text = readerfile.getTextFromExcel2007(filepath);
            break;
        case "xls":
            text = readerfile.getTextFromExcel(filepath);
            break;
        case "pdf":
            text = readerfile.readPdf(filepath);
            break;
        case "ppt":
            text = readerfile.getTextFromPPT(filepath);
            break;
        case "pptx":
            text = readerfile.getTextFromPPT2007(filepath);
            break;
        default:
            text = "";
            break;
        }
        // Defensive: a Field cannot hold a null value, and the ReaderFile contract
        // is not visible from this class.
        return text == null ? "" : text;
    }

    /**
     * Returns the lower-cased extension of a file name, without the dot.
     *
     * @param file file whose name is inspected
     * @return the extension, or an empty string when the name contains no dot
     */
    private static String extensionOf(File file) {
        String name = file.getName();
        int dot = name.lastIndexOf('.');
        if (dot < 0) {
            return "";
        }
        // Locale.ROOT keeps the comparison independent of the default locale.
        return name.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Lists the absolute paths of all readable files below {@code datadir}.
     *
     * @param datadir root file or directory of the traversal
     * @return absolute paths of every readable non-directory entry found
     */
    private static List<String> collectFiles(String datadir) {
        List<String> found = new ArrayList<String>();
        collectFiles(new File(datadir), found);
        return found;
    }

    /**
     * Depth-first traversal that appends every readable non-directory entry to
     * {@code found}.
     *
     * @param node  file or directory currently visited
     * @param found accumulator receiving absolute file paths
     */
    private static void collectFiles(File node, List<String> found) {
        if (!node.canRead()) {
            return;
        }
        if (!node.isDirectory()) {
            // Leaf: a plain file rather than a directory.
            found.add(node.getAbsolutePath());
            return;
        }
        String[] children = node.list();
        if (children == null) {
            return;
        }
        for (String child : children) {
            collectFiles(new File(node, child), found);
        }
    }

    /**
     * Indexes every readable file below {@code datadir} into the index stored at
     * {@code indexdir}, replacing any existing index, and prints the elapsed time.
     * A file that fails to index is reported on standard error and skipped.
     *
     * @param indexdir directory where the index is stored
     * @param datadir  root directory of the files to index
     * @throws IOException declared for compatibility with existing callers
     */
    public static void index(String indexdir, String datadir) throws IOException {
        List<String> dataFiles = collectFiles(datadir);
        Indexer indexer = null;
        long startTime = System.currentTimeMillis();
        try {
            indexer = new Indexer(indexdir);
            for (String path : dataFiles) {
                try {
                    indexer.indexFile(new File(path));
                } catch (Exception e) {
                    // One unreadable file must not abort the rest of the batch.
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // indexer is still null when the constructor itself failed.
            if (indexer != null) {
                try {
                    indexer.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("索引耗时" + (endTime - startTime) + "毫秒");
    }
}
### 代码块Searcher
lucene搜索代码 :
``` java
package com.fzky.diams.web.luncene;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Runs full-text queries against a Lucene index and returns the paths of the
 * matching files.
 *
 * <p>Typical usage:
 * <pre>
 * List&lt;String&gt; paths = Searcher.search("D:\\lucene", "关系");
 * </pre>
 */
public class Searcher {

    /**
     * Searches the {@code contents} field of the index in {@code indexDir} and
     * returns the {@code fullPath} value of up to ten best-matching documents.
     * Timing and hit information is printed to standard output.
     *
     * @param indexDir directory that contains the index
     * @param q        query string in Lucene classic query-parser syntax
     * @return {@code fullPath} values of the matching documents, best match first
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static List<String> search(String indexDir, String q) throws Exception {
        List<String> list = new ArrayList<String>();
        // try-with-resources releases the reader and the directory even when
        // parsing or searching throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
                IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Same analyzer as used when the index was built.
            Analyzer analyzer = new MyIkAnalyzer();
            QueryParser parser = new QueryParser("contents", analyzer);
            Query query = parser.parse(q);

            long startTime = System.currentTimeMillis();
            TopDocs docs = searcher.search(query, 10); // keep the ten best hits
            long endTime = System.currentTimeMillis();
            System.out.println("匹配" + q + "共耗时" + (endTime - startTime) + "毫秒");
            System.out.println("查询到" + docs.totalHits + "条记录");

            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                // scoreDoc.doc is the internal document id used to load the document.
                Document doc = searcher.doc(scoreDoc.doc);
                String fullPath = doc.get("fullPath");
                System.out.println(fullPath);
                list.add(fullPath);
            }
        }
        return list;
    }
}