Lucene 是 Apache 软件基金会的一个开放源代码的全文检索引擎工具包。它是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,以及部分文本分析引擎。Lucene 的目的是为软件开发人员提供一个简单易用的工具包,以方便地在目标系统中实现全文检索功能,或者以此为基础建立起完整的全文检索引擎。
下载地址:
http://www.apache.org/dyn/closer.cgi/lucene/java/4.10.1
1.建立 Lucene 索引模块
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds a Lucene index from a directory of delimited text files.
 *
 * <p>Every line of every regular file directly inside the input directory becomes one
 * Lucene document. A line is split on the Ctrl-A character (U+0001); the j-th piece is
 * stored in an analyzed field named {@code "c" + j}. The source file's name and absolute
 * path are stored alongside in the fields {@code file_name} and {@code path}.
 */
public class Index {

    /** Column separator used inside each input line (Ctrl-A, U+0001). */
    private static final String FIELD_SEPARATOR = "\001";

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the index output directory, {@code args[1]} is the
     *             directory containing the input text files
     * @throws Exception if indexing fails with a non-I/O error
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
            System.err.println("Usage: java Index <indexDir> <inputDir>");
            System.exit(1);
        }
        new Index().index(args[0], args[1]);
    }

    /**
     * Reads every regular file in the input directory and writes one document per line
     * into the index.
     *
     * <p>I/O failures while indexing are printed to standard error and not rethrown,
     * so a caller sees a normal return even if the index is incomplete.
     *
     * @param str1 path of the directory where the index is written
     * @param str2 path of the directory whose files are indexed (not recursive)
     * @throws IllegalArgumentException if {@code str2} is not a listable directory
     * @throws Exception on any other non-I/O failure
     */
    public void index(String str1, String str2) throws Exception {
        // listFiles() returns null when the path is not a directory or cannot be read;
        // check before touching the index so a bad argument does not create an empty index.
        File[] inputFiles = new File(str2).listFiles();
        if (inputFiles == null) {
            throw new IllegalArgumentException("Input path is not a readable directory: " + str2);
        }

        IndexWriterConfig iwc =
                new IndexWriterConfig(Version.LUCENE_4_9, new StandardAnalyzer(Version.LUCENE_4_9));
        iwc.setUseCompoundFile(false);

        // try-with-resources closes the writer first and then the directory, on every path
        // (the directory was previously never closed).
        try (Directory directory = FSDirectory.open(new File(str1));
             IndexWriter writer = new IndexWriter(directory, iwc)) {
            long startTime = System.currentTimeMillis();
            for (File file : inputFiles) {
                if (!file.isFile()) {
                    // Sub-directories cannot be opened as a stream; skip them instead of
                    // aborting the whole run with an IOException.
                    continue;
                }
                System.out.println("开始建立索引...");
                List<String> lines = getContent(file);
                System.out.println("文件有" + lines.size() + "条数据");
                for (String line : lines) {
                    writer.addDocument(toDocument(line, file));
                }
            }
            long endTime = System.currentTimeMillis();
            // Report the elapsed indexing time on the console.
            System.out.println("共花了" + (endTime - startTime) + "毫秒将文档增加到索引中" + str1);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Converts one input line into a document carrying its columns and source-file metadata. */
    private static Document toDocument(String line, File source) {
        Document document = new Document();
        String[] columns = line.split(FIELD_SEPARATOR);
        for (int j = 0; j < columns.length; j++) {
            // TextField: the value is run through the analyzer (tokenized) and also stored.
            document.add(new TextField("c" + j, columns[j], Field.Store.YES));
        }
        // StringField: the value is indexed verbatim as a single token (no analysis) and stored.
        document.add(new StringField("file_name", source.getName(), Field.Store.YES));
        document.add(new StringField("path", source.getAbsolutePath(), Field.Store.YES));
        return document;
    }

    /**
     * Reads a UTF-8 text file into a list of lines.
     *
     * @param file the file to read
     * @return the file's lines in order, without line terminators; empty for an empty file
     * @throws Exception if the file cannot be opened or read
     */
    public List<String> getContent(File file) throws Exception {
        List<String> contents = new ArrayList<String>();
        // try-with-resources guarantees the reader is closed even when readLine() throws.
        try (BufferedReader br =
                     new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                contents.add(line);
            }
        }
        return contents;
    }
}
2.检索模块
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.queryparser.classic.QueryParser;
/**
 * Runs a single query against a Lucene index and prints the hits, timing information and
 * several orderings of the matching document ids.
 */
public class Search {

    /**
     * Command-line entry point. All arguments are optional and fall back to the built-in
     * sample values when omitted.
     *
     * @param args {@code [indexDir] [queryText] [fieldName] [maxHits]}
     * @throws Exception if {@code maxHits} is not a valid integer or the search fails
     */
    public static void main(String[] args) throws Exception {
        String str1 = args.length > 0 ? args[0] : "e:\\luc2";      // index directory
        String str2 = args.length > 1 ? args[1] : "mokdis旗舰店";   // query text
        String str3 = args.length > 2 ? args[2] : "c2";            // field to search
        int in4 = args.length > 3 ? Integer.parseInt(args[3]) : 10; // maximum number of hits
        new Search().search(str1, str2, str3, in4);
    }

    /**
     * Searches one field of the index and prints the results to standard output.
     *
     * <p>Query-syntax errors and I/O errors are printed to standard error and not
     * rethrown.
     *
     * @param str1 path of the index directory
     * @param str2 query text, parsed with the classic query parser
     * @param str3 name of the field to search; its stored value is printed for each hit
     * @param in4  maximum number of hits to retrieve
     */
    @SuppressWarnings("deprecation")
    public void search(String str1, String str2, String str3, int in4) {
        long startTime = System.currentTimeMillis();
        // try-with-resources closes the reader and then the directory on every path
        // (the directory was previously never closed).
        try (Directory directory = FSDirectory.open(new File(str1));
             IndexReader reader = IndexReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // The parser uses the same analyzer as indexing so query terms are tokenized
            // the same way as the indexed text.
            QueryParser parser = new QueryParser(Version.LUCENE_4_9, str3,
                    new StandardAnalyzer(Version.LUCENE_4_9));
            Query query = parser.parse(str2);

            TopDocs tds = searcher.search(query, in4);
            System.out.println("共为您查找到" + tds.totalHits + "条结果");
            ScoreDoc[] sds = tds.scoreDocs;
            long midTime = System.currentTimeMillis();
            System.out.println("访问索引时间Tindex-1 = " + (midTime - startTime)
                    + "毫秒");

            // Collect exactly the returned doc ids. Sizing by the requested count instead
            // would pad the list with bogus id 0 whenever fewer hits come back.
            List<Integer> docIds = new ArrayList<Integer>(sds.length);
            for (ScoreDoc sd : sds) {
                docIds.add(sd.doc);
                System.out.println("sd.doc " + sd.doc);
                // Load the stored document and print the searched field's stored value.
                Document document = searcher.doc(sd.doc);
                System.out.println(document.get(str3));
            }

            long endTime = System.currentTimeMillis();
            System.out.println("访问Lucene数据时间Tlocal = " + (endTime - midTime) + "毫秒");
            System.out.println("总检索时间TLucene = " + (endTime - startTime) + "毫秒");

            printOrderings(docIds);
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the doc ids in original (rank), ascending, descending and random order.
     * Each ordering is built on its own copy so the input list is left untouched.
     */
    private static void printOrderings(List<Integer> docIds) {
        System.out.println("原序列为:" + docIds);

        List<Integer> ascending = new ArrayList<Integer>(docIds);
        Collections.sort(ascending);
        System.out.println("顺序序列为:" + ascending);

        List<Integer> descending = new ArrayList<Integer>(ascending);
        Collections.reverse(descending);
        System.out.println("倒序序列为:" + descending);

        List<Integer> shuffled = new ArrayList<Integer>(docIds);
        Collections.shuffle(shuffled);
        System.out.println("随机序列为:" + shuffled);
    }
}