一:第一个类文件,该类的方法作用为采集document文档,将指定路径下(包括所有子文件夹)的 .doc/.docx 文件加载到document中
package ut.tst;
import java.io.File;import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
/**
 * Collects the files to be indexed and converts each one into a Lucene
 * {@link Document}. A document is made of several fields (name, path,
 * content, size), each playing a role similar to a column in a table.
 *
 * @author Administrator
 */
public class indexUtils {

    /**
     * Builds one {@link Document} for every Word file found under
     * {@code souDir}, descending into sub-directories.
     *
     * <p>Each document carries four fields:
     * <ul>
     *   <li>{@code fieldName}    - file name, {@code StringField}, stored</li>
     *   <li>{@code fieldPath}    - absolute path, {@code StoredField}</li>
     *   <li>{@code fieldContext} - file content, {@code TextField}, NOT stored</li>
     *   <li>{@code fieldSize}    - size in bytes, {@code LongField}, stored</li>
     * </ul>
     *
     * @param souDir root directory to scan
     * @return the generated documents; empty when the directory is missing,
     *         is not a directory, or contains no matching file
     */
    public static List<Document> file2document(String souDir) {
        List<Document> list = new ArrayList<Document>();
        List<File> filelist = getFileList(new ArrayList<File>(), souDir);
        for (File file : filelist) {
            String filename = file.getName();
            String filePath = file.getAbsolutePath();

            String fileContext = "";
            try {
                // NOTE(review): this overload decodes the raw bytes with the
                // platform default charset. .doc/.docx are binary formats, so
                // the result is presumably not clean text - a dedicated text
                // extractor would be needed for meaningful tokens. TODO confirm.
                fileContext = FileUtils.readFileToString(file);
            } catch (IOException e) {
                // Best effort: keep indexing the file's name/path/size with an
                // empty content field, but say which file could not be read.
                System.err.println("Could not read " + filePath + "; indexing it without content");
                e.printStackTrace();
            }
            long fileSize = FileUtils.sizeOf(file);

            // Create the document and its fields.
            Document document = new Document();
            Field fieldName = new StringField("fieldName", filename, Store.YES);
            Field fieldPath = new StoredField("fieldPath", filePath);
            Field fieldContext = new org.apache.lucene.document.TextField("fieldContext", fileContext, Store.NO);
            Field fieldSize = new LongField("fieldSize", fileSize, Store.YES);
            document.add(fieldSize);
            document.add(fieldPath);
            document.add(fieldName);
            document.add(fieldContext);
            list.add(document);
        }
        return list;
    }

    /**
     * Recursively appends every Word file under {@code filePath} to
     * {@code list}.
     *
     * <p>The directory is only read, never created. When {@code filePath}
     * does not denote a readable directory nothing is added.
     *
     * @param list     accumulator that receives the matching files
     * @param filePath directory to scan
     * @return {@code list}, for call chaining
     */
    private static List<File> getFileList(List<File> list, String filePath) {
        File[] children = new File(filePath).listFiles();
        if (children == null) {
            // listFiles() yields null for a missing path, a non-directory,
            // or an I/O error; treat all of them as "nothing to index".
            return list;
        }
        for (File child : children) {
            if (child.isDirectory()) {
                getFileList(list, child.getPath());
            } else if (isWordFile(child.getName())) {
                list.add(child);
            }
        }
        return list;
    }

    /**
     * Tells whether a file name carries a Word extension.
     *
     * <p>Uses a suffix test so that names merely containing ".doc" somewhere
     * (for example {@code report.doc.bak}) are not picked up.
     *
     * @param name bare file name
     * @return {@code true} when the name ends with ".doc" or ".docx"
     */
    private static boolean isWordFile(String name) {
        return name.endsWith(".doc") || name.endsWith(".docx");
    }
}
二:通过采集document,生成索引文件
package ut.tst;
import java.io.File;
import java.io.IOException;
import java.util.List;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class testIndex {

    /**
     * Indexes every document produced by
     * {@link indexUtils#file2document(String)} for {@code souDir} into the
     * Lucene index located at {@code indexDir}.
     *
     * <p>An {@link IOException} raised while opening or writing the index is
     * printed and swallowed, as before. The analyzer, directory and writer
     * are now released on every path, so a failed run no longer leaves the
     * index write lock held by a leaked writer.
     *
     * <p>NOTE(review): the writer is opened with the default open mode, which
     * appends to an existing index; running this repeatedly against the same
     * {@code indexDir} therefore adds the same documents again. Consider
     * {@code IndexWriterConfig.OpenMode.CREATE} if a rebuild is intended.
     *
     * @param indexDir directory in which the index is stored
     * @param souDir   directory holding the source files
     */
    public static void createIndex(String indexDir, String souDir) {
        // Collect the documents first; this step does not touch the index.
        List<Document> docList = indexUtils.file2document(souDir);

        // Resources are closed in reverse order: writer, directory, analyzer.
        try (Analyzer standardAnalyzer = new MMAnalyzer();
             Directory directory = FSDirectory.open(new File(indexDir).toPath());
             IndexWriter indexwriter =
                     new IndexWriter(directory, new IndexWriterConfig(standardAnalyzer))) {
            for (Document document : docList) {
                indexwriter.addDocument(document);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
三:进行查询,输出查询结果
package ut.tst;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class testSearch {
    // Directory holding the Lucene index.
    private static final String indexDir = "D://luceneDir";
    // Directory holding the source files to index.
    private static final String souDir = "D://data//临时第三标段";

    /**
     * Runs a term query against the content field of the index and prints
     * the name, size and content field of up to 100 hits.
     *
     * <p>The field name must be {@code "fieldContext"}: that is the name
     * {@code indexUtils.file2document} indexes the file content under. The
     * previous {@code "fieldContent"} matched no field, so nothing was found.
     *
     * @throws IOException if the index cannot be opened or read
     */
    private static void testTermQuery() throws IOException {
        // Query the content field for the exact term.
        Query query = new TermQuery(new Term("fieldContext", "西安"));

        // Directory and reader are closed when the block exits, even on error.
        try (Directory directory = FSDirectory.open(new File(indexDir).toPath());
             IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = new IndexSearcher(reader);

            // Run the search, keeping at most the 100 best hits.
            TopDocs topDocs = indexSearcher.search(query, 100);
            System.out.println("共搜索到总记录数:" + topDocs.totalHits);

            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                // Load the stored fields of the matching document.
                Document doc = indexSearcher.doc(scoreDoc.doc);
                System.out.println("------------------------------");
                System.out.println("文件名称 =" + doc.get("fieldName"));
                System.out.println("文件大小 =" + doc.get("fieldSize"));
                // The content field is indexed with Store.NO by indexUtils,
                // so this prints null unless the field is changed to be stored.
                System.out.println("文件内容 =" + doc.get("fieldContext"));
            }
        }
    }

    /**
     * Rebuilds the index from {@code souDir}, then runs the sample query.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        testIndex.createIndex(indexDir, souDir);
        try {
            testTermQuery();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}