目录
简介:
Lucene是一套用于全文检索和搜寻的开源程式库,由Apache软件基金会支持和提供。Lucene提供了一个简单却强大的应用程式接口,能够做全文索引和搜寻。在Java开发环境里Lucene是一个成熟的免费开源工具。就其本身而言,Lucene是当前以及最近几年最受欢迎的免费Java资讯检索程式库。
人们经常提到资讯检索程式库,虽然与搜索引擎有关,但不应该将资讯检索程式库与搜索引擎相混淆。
全文检索服务主要由两大部分构成:索引器及搜索器。索引器主要负责对文件名称及文件内容进行分词,并创建索引表。搜索器负责检索索引表,获取相关内容信息,并显示。
索引器:
package com.lm.IndexTxt;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds a Lucene index over the files found beneath a data directory.
 *
 * <p>Each matching file becomes one document holding a single stored, analyzed
 * field whose value is the file's canonical path. File contents are not indexed
 * (the content field is disabled in {@link #indexFile}).
 *
 * @author LM
 */
public class Indexer {
    /** Name of the index field that stores the file path. */
    private final String fieldName;
    /** File-name suffix (e.g. ".txt") a file must end with to be indexed. */
    private final String endStr;

    /**
     * @param fieldName name of the field written into every document
     * @param endStr    file-name suffix used to select files
     */
    public Indexer(String fieldName, String endStr) {
        this.fieldName = fieldName;
        this.endStr = endStr;
    }

    /**
     * Rebuilds the index from scratch: every existing document is deleted and
     * all matching files under {@code dataDir} are indexed again.
     *
     * <p>If indexing fails, the writer is rolled back so the uncommitted
     * {@code deleteAll()} and partial additions are discarded, and the write
     * lock is released.
     *
     * @param indexDir directory in which the index is stored
     * @param dataDir  root directory of the files to index
     * @return number of documents in the index after indexing
     * @throws IOException if {@code dataDir} does not exist or is not a
     *                     directory, or if reading/writing the index fails
     * @author lm
     */
    public int index(File indexDir, File dataDir) throws IOException {
        // The data path must exist and be a directory.
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir + " does not exist or is not a directory");
        }
        // Analyzer (Chinese word segmentation, default stop words enabled).
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_46, true);
        // Writer configuration.
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_46, analyzer);
        // Index storage directory.
        Directory directory = FSDirectory.open(indexDir);
        try {
            // NOTE(review): forcibly clearing the lock is only safe when no other
            // process is writing to this index -- confirm for your deployment.
            if (IndexWriter.isLocked(directory)) {
                IndexWriter.unlock(directory);
            }
            IndexWriter writer = new IndexWriter(directory, indexWriterConfig);
            boolean success = false;
            try {
                writer.deleteAll(); // start from an empty index
                indexDirectory(writer, dataDir);
                int numIndexed = writer.numDocs();
                writer.close(); // commits the changes
                success = true;
                return numIndexed;
            } finally {
                if (!success) {
                    // Discard uncommitted changes and release the write lock.
                    writer.rollback();
                }
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Recursively walks {@code dir} and indexes every file whose name ends with
     * the configured suffix.
     *
     * <p>A directory whose contents cannot be listed is skipped with a warning
     * on standard error.
     *
     * @param writer index writer receiving the documents
     * @param dir    directory to scan
     * @throws IOException if indexing a file fails
     * @author lm
     */
    public void indexDirectory(IndexWriter writer, File dir) throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null on I/O error or when dir is not a directory.
            System.err.println("Skipping unreadable directory: " + dir);
            return;
        }
        for (File file : files) {
            if (file.isDirectory()) {
                indexDirectory(writer, file);
            } else if (file.getName().endsWith(this.endStr)) {
                indexFile(writer, file);
            }
        }
    }

    /**
     * Adds one document for the given file. The document contains the file's
     * canonical path in the configured field (stored and analyzed).
     *
     * <p>Hidden, missing or unreadable files are silently skipped.
     *
     * @param writer index writer receiving the document
     * @param f      file to index
     * @throws IOException if the canonical path cannot be resolved or the
     *                     document cannot be added
     * @author lm
     */
    @SuppressWarnings("deprecation")
    public void indexFile(IndexWriter writer, File f) throws IOException {
        // Skip hidden, non-existent and unreadable files.
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }
        String canonicalPath = f.getCanonicalPath();
        System.out.println("Indexing: " + canonicalPath);
        Document doc = new Document();
        // Content indexing is disabled; to enable it, read the file and add a field:
        // String content = getTXT(f,"GBK");
        // doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(this.fieldName, canonicalPath, Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
    }

    /**
     * Reads a whole text file into a string.
     *
     * <p>Lines are concatenated without any separator, i.e. line terminators
     * are dropped from the result.
     *
     * @param file    file to read
     * @param charset name of the character set used to decode the file
     * @return the file's text with line terminators removed
     * @throws IOException if the file cannot be opened or read, or the charset
     *                     is not supported
     * @author lm
     */
    public String getTXT(File file, String charset) throws IOException {
        FileInputStream fileInputStream = new FileInputStream(file);
        try {
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(fileInputStream, charset));
            StringBuilder result = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                result.append(line);
            }
            return result.toString();
        } finally {
            // Closing the underlying stream also covers the case where the
            // reader could not be constructed (e.g. unsupported charset).
            fileInputStream.close();
        }
    }

    /** Entry point: indexes the .txt files under d:\lucene\file into d:\lucene\index. */
    public static void main(String[] args) {
        Indexer indexer = new Indexer("filepath", ".txt");
        try {
            File indexDir = new File("d:\\lucene\\index");
            File dataDir = new File("d:\\lucene\\file");
            int result = indexer.index(indexDir, dataDir);
            System.out.println("indexing " + result + " files.");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
运行结果:
Indexing: D:\lucene\file\2013年新员工报到通知.txt
Indexing: D:\lucene\file\create a index.txt
Indexing: D:\lucene\file\数据分析挖掘.txt
Indexing: D:\lucene\file\数据汇总统计.txt
Indexing: D:\lucene\file\数据预测与优化.txt
Indexing: D:\lucene\file\运行 Indexer.txt
indexing 6 files.
搜索器:
package com.lm.IndexTxt;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Runs keyword queries against a Lucene index and prints the value of the
 * configured field for each matching document.
 *
 * @author LM
 */
public class Searcher {
    /** Directory that holds the index files. */
    private final File indexDir;
    /** Name of the field to query and to print. */
    private final String fieldName;

    /**
     * @param IndexDir  path of the directory that holds the index
     * @param fieldName name of the field to search (the name used at index time)
     */
    public Searcher(String IndexDir, String fieldName) {
        this.indexDir = new File(IndexDir);
        this.fieldName = fieldName;
    }

    /**
     * Parses {@code keywords} as a query on the configured field, retrieves at
     * most six hits and prints the stored field value of each one to standard
     * output. The total hit count is reported on standard error.
     *
     * <p>The index reader and directory are closed even if parsing or searching
     * fails.
     *
     * @param keywords query text in Lucene classic query-parser syntax
     * @throws Exception if the index cannot be opened or read, or the query
     *                   cannot be parsed
     * @author lm
     */
    @SuppressWarnings("deprecation")
    public void search(String keywords) throws Exception {
        // 1. Open the index.
        Directory fsDir = FSDirectory.open(indexDir);
        try {
            IndexReader reader = IndexReader.open(fsDir);
            try {
                IndexSearcher is = new IndexSearcher(reader);
                // 2. Parse the query against fieldName, the field created at index time.
                Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_46, true);
                QueryParser queryParser = new QueryParser(Version.LUCENE_46, fieldName, analyzer);
                Query query = queryParser.parse(keywords);
                // 3. Search the index for the top six hits.
                TopDocs docs = is.search(query, 6);
                // 4. Fetch and print the matching documents.
                ScoreDoc[] scoreDoc = docs.scoreDocs;
                System.err.println("Found " + docs.totalHits + " document(s) that matched query '" + keywords + "'.");
                for (int i = 0; i < scoreDoc.length; i++) {
                    Document midDoc = reader.document(scoreDoc[i].doc);
                    System.out.println(midDoc.get(fieldName));
                }
            } finally {
                reader.close();
            }
        } finally {
            fsDir.close();
        }
    }

    /** Entry point: searches the index at d:\lucene\index for a sample keyword. */
    public static void main(String[] args) {
        Searcher searcher = new Searcher("d:\\lucene\\index", "filepath");
        try {
            searcher.search("预测");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
结果:
Found 1 document(s) that matched query '预测'.
D:\lucene\file\数据预测与优化.txt