Java Lucene 使用案例:Lucene 入门例子 | 学步园

1.建立索引

package org.senssic.lucene;

import java.io.File;

import java.io.FilenameFilter;

import java.io.IOException;

import java.text.SimpleDateFormat;

import java.util.Date;

import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.StringField;

import org.apache.lucene.document.TextField;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.index.Term;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Command-line example that builds a Lucene index over the {@code .txt} and
 * {@code .html} files found (recursively) under a documents directory.
 *
 * <p>Requires JDK 7+ (try-with-resources is used to release the index writer,
 * the index directory and the file scanner).
 *
 * <p>Each indexed file becomes one {@link Document} with the stored fields
 * {@code path}, {@code contents}, {@code lastmodified}, {@code filename},
 * {@code filelength} and {@code absolutepath}.
 *
 * @author senssic
 */
public class IndexFiles {

    /** Lower-case file-name suffixes that are eligible for indexing. */
    private static final String[] INDEXED_EXTENSIONS = { ".txt", ".html" };

    private IndexFiles() {
    }

    /**
     * Builds the index using the hard-coded index and documents paths.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String indexPath = "D:\\Index"; // directory the index is written to
        String docsPath = "D:\\LuceneIndex"; // directory whose files are indexed
        boolean create = true; // true: rebuild from scratch; false: add/update
        final File docDir = new File(docsPath);

        // Bail out before touching the index: opening it in CREATE mode with
        // nothing to index would silently replace an existing index with an
        // empty one.
        if (!docDir.exists() || !docDir.canRead()) {
            System.out.println("文档目录 '" + docDir.getAbsolutePath()
                    + "' 不存在或不可读,请检查路径");
            return;
        }

        Date start = new Date();
        try {
            System.out.println("索引目录中 '" + indexPath + "'...");

            // mmseg4j analyzer (Chinese word segmentation), not Lucene's
            // StandardAnalyzer.
            Analyzer analyzer = new MMSegAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48,
                    analyzer);
            // CREATE discards any existing index; CREATE_OR_APPEND keeps it
            // and lets indexDocs() update documents in place.
            iwc.setOpenMode(create ? OpenMode.CREATE
                    : OpenMode.CREATE_OR_APPEND);
            // RAM buffer (in MB) used before documents are flushed to disk.
            iwc.setRAMBufferSizeMB(100);

            // try-with-resources: the writer (and then the directory) is
            // closed even when indexDocs() throws.
            try (Directory dir = FSDirectory.open(new File(indexPath));
                    IndexWriter writer = new IndexWriter(dir, iwc)) {
                indexDocs(writer, docDir);
            }

            Date end = new Date();
            System.out.println("总耗时\t" + (end.getTime() - start.getTime())
                    + "\t毫秒");
        } catch (IOException e) {
            System.out.println(" 异常: " + e.getClass() + "\n异常信息: "
                    + e.getMessage());
        }
    }

    /**
     * Indexes {@code file}; when it is a directory, recurses into its
     * sub-directories and indexes the contained files whose names end with one
     * of {@link #INDEXED_EXTENSIONS}. Unreadable files and directories are
     * skipped silently.
     *
     * <p>A regular file passed directly is indexed regardless of its
     * extension; the extension filter only applies to directory listings.
     *
     * @param writer writer the documents are added to (or updated in, when the
     *               writer was not opened in {@code CREATE} mode)
     * @param file   file or directory to index
     * @throws IOException if reading a file or writing to the index fails
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (!file.canRead()) {
            return;
        }

        if (file.isDirectory()) {
            String[] children = file.list(new FilenameFilter() {
                @Override
                public boolean accept(File dir, String fileName) {
                    // Keep sub-directories so the recursion can descend.
                    if (new File(dir, fileName).isDirectory()) {
                        return true;
                    }
                    String lowerName = fileName.toLowerCase();
                    for (String extension : INDEXED_EXTENSIONS) {
                        if (lowerName.endsWith(extension)) {
                            return true;
                        }
                    }
                    return false;
                }
            });
            // list() returns null on I/O error.
            if (children != null) {
                for (String child : children) {
                    indexDocs(writer, new File(file, child));
                }
            }
            return;
        }

        Document doc = buildDocument(file, readContents(file));
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // Fresh index: no older copy of this document can exist.
            System.out.println("添加中 " + file);
            writer.addDocument(doc);
        } else {
            // Existing index: replace any document with the same path.
            System.out.println("更新中 " + file);
            writer.updateDocument(new Term("path", file.getPath()), doc);
        }
    }

    /**
     * Reads the whole file as text, terminating every line with {@code \n}.
     * The platform default charset is used.
     */
    private static String readContents(File file) throws IOException {
        StringBuilder contents = new StringBuilder();
        try (Scanner scanner = new Scanner(file)) {
            scanner.useDelimiter("\n");
            while (scanner.hasNext()) {
                contents.append(scanner.next()).append('\n');
            }
        }
        return contents.toString();
    }

    /** Creates the Lucene document (all fields stored) describing one file. */
    private static Document buildDocument(File file, String contents) {
        Document doc = new Document();
        // StringField: indexed as a single token, not analyzed.
        doc.add(new StringField("path", file.getPath(), Field.Store.YES));
        // TextField: analyzed, i.e. the field full-text queries run against.
        doc.add(new TextField("contents", contents, Field.Store.YES));
        doc.add(new StringField("lastmodified",
                new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
                        .format(new Date(file.lastModified())),
                Field.Store.YES));
        doc.add(new StringField("filename", file.getName(), Field.Store.YES));
        float length = (float) file.length() / (float) 1024;
        doc.add(new StringField("filelength",
                String.format("%.3f", length) + "kB", Field.Store.YES));
        doc.add(new StringField("absolutepath", file.getAbsolutePath(),
                Field.Store.YES));
        return doc;
    }
}

2.查询

package org.senssic.lucene;

import java.io.File;

import java.util.Date;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Command-line example that runs one hard-coded query against the index under
 * {@code D:\Index} and prints the stored fields of the top 100 hits.
 */
public class SearchFiles {

    private SearchFiles() {
    }

    /**
     * Searches the {@code contents} field and prints each hit, followed by the
     * elapsed time and the total hit count.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or read, or the query
     *                   string cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        String index = "D:\\Index";
        String queryString = "我爱你";

        // try-with-resources: reader and directory are released even when
        // parsing or searching throws.
        try (FSDirectory directory = FSDirectory.open(new File(index));
                IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);

            // mmseg4j analyzer (Chinese word segmentation), not Lucene's
            // StandardAnalyzer.
            Analyzer analyzer = new MMSegAnalyzer();
            // Default field for terms without an explicit "field:" prefix.
            QueryParser parser = new QueryParser(Version.LUCENE_48,
                    "contents", analyzer);
            Query query = parser.parse(queryString);
            System.out.println("查询内容: " + query.toString("contents"));

            Date start = new Date();
            TopDocs results = searcher.search(query, 100);
            ScoreDoc[] hits = results.scoreDocs;

            int i = 0;
            for (ScoreDoc hit : hits) {
                Document document = searcher.doc(hit.doc);
                System.out.println("\n\n\n第" + ++i + "个文件:");
                // The stored "contents" field is intentionally not printed.
                // Each remaining value gets its own label so the
                // last-modified time is no longer shown under "内容:".
                System.out.println("文件名称:" + document.get("filename")
                        + "\n路径:" + document.get("path")
                        + "\n绝对路径:" + document.get("absolutepath")
                        + "\n最后修改时间:" + document.get("lastmodified")
                        + "\n文件大小:" + document.get("filelength"));
            }
            Date end = new Date();

            System.out.println("\n\n\n耗时: "
                    + (end.getTime() - start.getTime()) + "ms");
            System.out.println(results.totalHits);
        }
    }
}

3.使用mmseg4j分词的例子

package org.senssic.lucene.util;

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Helper for inspecting what an {@link Analyzer} produces for a piece of text.
 */
public class AnalyzerUtils {

    /**
     * Tokenizes {@code str} with {@code a} and prints, one line per token, the
     * term text, position increment, start/end offsets and token type.
     *
     * <p>Any exception raised while analyzing is caught and its stack trace is
     * printed; nothing is rethrown.
     *
     * @param str text to analyze
     * @param a   analyzer used to tokenize the text
     */
    public static void displayAllTokenInfo(String str, Analyzer a) {
        // try-with-resources closes the stream, which is the last step of the
        // TokenStream consumer workflow and lets the analyzer reuse it.
        try (TokenStream stream = a.tokenStream("content",
                new StringReader(str))) {
            // Distance, in positions, between this token and the previous one.
            PositionIncrementAttribute pia = stream
                    .addAttribute(PositionIncrementAttribute.class);
            // Start/end character offsets of the token in the input text.
            OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
            // The token text itself.
            CharTermAttribute cta = stream
                    .addAttribute(CharTermAttribute.class);
            // Token type as reported by the tokenizer.
            TypeAttribute ta = stream.addAttribute(TypeAttribute.class);

            // reset() is mandatory before the first incrementToken().
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
                System.out.print(pia.getPositionIncrement() + ":");
                System.out.print(cta + "[" + oa.startOffset() + "-"
                        + oa.endOffset() + "]-->" + ta.type() + "\n");
            }
            // Signals end-of-stream so final offset/position state is set.
            stream.end();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Demonstrates the mmseg4j analyzer on a short Chinese sentence.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        AnalyzerUtils.displayAllTokenInfo("我爱你中国", new MMSegAnalyzer());
    }
}

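需要的jar包

(原文此处为一张截图:e94a2e36126adea251dc6bc0f05a124e.png)

根据上面代码的 import 推断(请以实际工程为准),至少需要:Lucene 4.8 的 lucene-core、lucene-queryparser 及其依赖,以及 mmseg4j 的 mmseg4j-core 和 mmseg4j-analysis。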

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值