1.建立索引
package org.senssic.lucene;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Scanner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
/**
* jdk7+
*
* @ClassName: IndexFiles
* @Description: 索引建立
* @author senssic
* @date 2014年7月8日 上午9:39:30
*
*/
public class IndexFiles {
    /** File extensions eligible for indexing (compared case-insensitively). */
    private static String[] name = { ".txt", ".html" };

    /** Utility class: no instances. */
    private IndexFiles() {
    }

    /**
     * Entry point: recursively indexes the .txt/.html files under
     * {@code docsPath} into a Lucene index at {@code indexPath}, using the
     * MMSeg analyzer for Chinese word segmentation.
     */
    public static void main(String[] args) {
        String indexPath = "D:\\Index";      // directory where the index is written
        String docsPath = "D:\\LuceneIndex"; // directory whose files get indexed
        boolean create = true;               // true = delete and rebuild the index
        final File docDir = new File(docsPath);
        Date start = new Date();
        try {
            System.out.println("索引目录中 '" + indexPath + "'...");
            Directory dir = FSDirectory.open(new File(indexPath));
            // MMSeg analyzer (Chinese segmentation)
            Analyzer analyzer = new MMSegAnalyzer();
            // Index writer configuration
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48,
                    analyzer);
            if (create) {
                // wipe any existing index and rebuild from scratch
                iwc.setOpenMode(OpenMode.CREATE);
            } else {
                // add/update documents in an existing index
                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            }
            // RAM buffer size before segments are flushed to disk
            iwc.setRAMBufferSizeMB(100);
            // try-with-resources (jdk7+): the original only closed the writer
            // on the happy path, leaking it if indexDocs threw
            try (IndexWriter writer = new IndexWriter(dir, iwc)) {
                // recursively index the directory tree
                indexDocs(writer, docDir);
            }
            Date end = new Date();
            System.out.println("总耗时\t" + (end.getTime() - start.getTime())
                    + "\t毫秒");
        } catch (IOException e) {
            System.out.println(" 异常: " + e.getClass() + "\n异常信息: "
                    + e.getMessage());
        }
    }

    /**
     * Recursively indexes {@code file}: directories are walked, regular files
     * are read line-by-line and stored as one Lucene {@link Document} with
     * path / contents / metadata fields.
     *
     * @param writer open IndexWriter to add or update documents on
     * @param file   file or directory to index
     * @throws IOException if reading the file or writing the index fails
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (!file.canRead()) {
            return; // silently skip unreadable entries, as the original did
        }
        if (file.isDirectory()) {
            // BUG FIX: the original filter returned true as soon as the name
            // failed to match ONE extension, so with two extensions EVERY
            // file was accepted. Accept sub-directories (so recursion works)
            // and files whose name ends with an allowed extension.
            String[] files = file.list(new FilenameFilter() {
                @Override
                public boolean accept(File dir, String fname) {
                    if (new File(dir, fname).isDirectory()) {
                        return true;
                    }
                    String lower = fname.toLowerCase();
                    for (String ext : name) {
                        if (lower.endsWith(ext)) {
                            return true;
                        }
                    }
                    return false;
                }
            });
            if (files != null) {
                for (String child : files) {
                    indexDocs(writer, new File(file, child));
                }
            }
        } else {
            StringBuilder sb = new StringBuilder();
            // BUG FIX: the original opened and read the Scanner BEFORE the
            // try/finally that closed it, leaking the handle on read errors.
            // try-with-resources closes it on every path.
            try (Scanner scanner = new Scanner(file)) {
                scanner.useDelimiter("\n");
                while (scanner.hasNext()) {
                    sb.append(scanner.next()).append("\n");
                }
                Document doc = new Document();
                // StringField = indexed as-is (not tokenized); Store.YES keeps
                // the raw value retrievable from the Document
                Field pathField = new StringField("path", file.getPath(),
                        Field.Store.YES);
                doc.add(pathField);
                // TextField = analyzed/tokenized full-text content
                doc.add(new TextField("contents", sb.toString(),
                        Field.Store.YES));
                doc.add(new StringField("lastmodified",
                        new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
                                .format(new Date(file.lastModified())),
                        Field.Store.YES));
                doc.add(new StringField("filename", file.getName(),
                        Field.Store.YES));
                float length = (float) file.length() / (float) 1024;
                doc.add(new StringField("filelength", String.format("%.3f",
                        length) + "kB", Field.Store.YES));
                doc.add(new StringField("absolutepath", file
                        .getAbsolutePath(), Field.Store.YES));
                // Branch on the OpenMode chosen in main()
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // fresh index: plain add
                    System.out.println("添加中 " + file);
                    writer.addDocument(doc);
                } else {
                    // existing index: replace any document with the same path
                    System.out.println("更新中 " + file);
                    writer.updateDocument(new Term("path", file.getPath()),
                            doc);
                }
            }
        }
    }
}
2.查询
package org.senssic.lucene;
import java.io.File;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
public class SearchFiles {
    /** Utility class: no instances. */
    private SearchFiles() {
    }

    /**
     * Entry point: parses a query against the "contents" field with the
     * MMSeg analyzer, runs it over the index at D:\Index and prints the
     * stored metadata of up to 100 hits.
     */
    public static void main(String[] args) throws Exception {
        String index = "D:\\Index";
        String queryString = "我爱你";
        // BUG FIX: the original never closed the reader when parse/search
        // threw (main just propagates). IndexReader is Closeable, so
        // try-with-resources (jdk7+) releases it on every path.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(
                index)))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // same analyzer that was used at index time
            Analyzer analyzer = new MMSegAnalyzer();
            // parse the query against the analyzed "contents" field
            QueryParser parser = new QueryParser(Version.LUCENE_48, "contents",
                    analyzer);
            Query query = parser.parse(queryString);
            System.out.println("查询内容: " + query.toString("contents"));
            Date start = new Date();
            // top-100 hits, no extra filter
            TopDocs results = searcher.search(query, null, 100);
            ScoreDoc sDoc[] = results.scoreDocs;
            int i = 0;
            for (ScoreDoc scoreDoc : sDoc) {
                // fetch the stored fields of each hit
                Document document = searcher.doc(scoreDoc.doc);
                System.out.println("\n\n\n第" + ++i + "个文件:");
                System.out.println("文件名称:" + document.get("filename") + "\n路径:"
                        + document.get("path") + "\n绝对路径:"
                        + document.get("absolutepath") + "\n内容:"
                        // document.get("contents") + "\n最后修改时间:"
                        + document.get("lastmodified") + "\n文件大小:"
                        + document.get("filelength"));
            }
            Date end = new Date();
            System.out.println("\n\n\n耗时: " + (end.getTime() - start.getTime())
                    + "ms");
            System.out.println(results.totalHits);
        }
    }
}
3.使用mmseg4j分词的例子
package org.senssic.lucene.util;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
public class AnalyzerUtils {
    /**
     * Prints every token the given analyzer produces for {@code str}:
     * the term text, its position increment, character offsets and token type.
     *
     * @param str text to analyze
     * @param a   analyzer that produces the token stream
     */
    public static void displayAllTokenInfo(String str, Analyzer a) {
        TokenStream stream = null;
        try {
            stream = a.tokenStream("content", new StringReader(str));
            // position increment: distance between consecutive tokens
            PositionIncrementAttribute pia = stream
                    .addAttribute(PositionIncrementAttribute.class);
            // start/end character offsets of each token
            OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
            // the token's term text
            CharTermAttribute cta = stream
                    .addAttribute(CharTermAttribute.class);
            // the token's type label
            TypeAttribute ta = stream.addAttribute(TypeAttribute.class);
            // BUG FIX: since Lucene 4.x a TokenStream MUST be reset() before
            // the first incrementToken() — the original skipped it, which
            // throws IllegalStateException on most analyzers.
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
                System.out.print(pia.getPositionIncrement() + ":");
                System.out.print(cta + "[" + oa.startOffset() + "-"
                        + oa.endOffset() + "]-->" + ta.type() + "\n");
            }
            // signal end-of-stream so final offsets are recorded correctly
            stream.end();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // BUG FIX: the original never closed the stream, leaking analyzer
            // resources. Close is best-effort here.
            if (stream != null) {
                try {
                    stream.close();
                } catch (Exception ignored) {
                    // nothing sensible to do on close failure in a demo tool
                }
            }
        }
    }

    public static void main(String[] args) {
        AnalyzerUtils.displayAllTokenInfo("我爱你中国", new MMSegAnalyzer());
    }
}
需要的jar包