I simplified one of the examples that ships with the Lucene 2.4 distribution; it is for reference only. While running it in Eclipse I ran into garbled Chinese output (an encoding problem), and the code below shows how I dealt with it.
After finishing this example I also tried qieqie's Paoding (庖丁解牛) Chinese analyzer. It works very well, and the distribution includes a short documentation file.
package tutorial;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class IndexFiles {

    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        try {
            IndexWriter writer = new IndexWriter("index",
                    new StandardAnalyzer(), true,
                    IndexWriter.MaxFieldLength.LIMITED);
            indexDocs(writer, new File("data"));
            writer.optimize();
            writer.close();
            System.out.println("Elapsed: " + (System.currentTimeMillis() - start)
                    + " ms");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (file.canRead()) {
            if (file.isDirectory()) {
                String[] files = file.list();
                if (files != null) {
                    for (int i = 0; i < files.length; i++) {
                        indexDocs(writer, new File(file, files[i]));
                    }
                }
            } else {
                System.out.println("adding " + file);
                try {
                    // Build an index document for this file.
                    Document doc = new Document();
                    // Field.Index.NOT_ANALYZED: index the file name as a
                    // single term, without tokenizing it.
                    doc.add(new Field("filename", file.getCanonicalPath(),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));
                    // Read the contents through an explicit UTF-8 decoder so
                    // Chinese text is indexed correctly.
                    doc.add(new Field("contents",
                            new InputStreamReader(new FileInputStream(file), "UTF-8")));
                    // Add the document to the writer.
                    writer.addDocument(doc);
                    // I dropped the original demo's approach below, because it
                    // gives no way to set the character encoding; when Chinese
                    // text could not be found because of encoding problems,
                    // there was nothing to be done about it.
                    // writer.addDocument(FileDocument.Document(file));
                } catch (FileNotFoundException fnfe) {
                    // Skip files that disappear between listing and reading.
                }
            }
        }
    }
}
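Since the whole reason for dropping FileDocument was to control the encoding, the charset can also be made a parameter instead of hard-coding UTF-8. A minimal sketch, meant to drop into the IndexFiles class above (the helper name and the GBK example are my own assumptions, not part of the Lucene demo):

    // Hypothetical helper: build a Document for a file, decoding its
    // contents with a caller-supplied charset (e.g. "UTF-8" or "GBK").
    static Document fileDocument(File file, String charset)
            throws IOException {
        Document doc = new Document();
        doc.add(new Field("filename", file.getCanonicalPath(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("contents",
                new InputStreamReader(new FileInputStream(file), charset)));
        return doc;
    }

With that in place, the else branch of indexDocs reduces to writer.addDocument(fileDocument(file, "UTF-8")), and switching to GBK-encoded source files becomes a one-argument change.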
package tutorial;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocCollector;

public class SearchFiles {

    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open("index");
        Searcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();

        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
        String field = "contents";
        QueryParser parser = new QueryParser(field, analyzer);

        while (true) {
            System.out.print("Query (quit or exit to leave): ");
            String line = in.readLine();
            if (line == null) {
                break;   // end of input
            }
            line = line.trim();
            if (line.length() == 0) {
                continue;
            }
            if (line.equals("quit") || line.equals("exit")) {
                break;
            }
            Query query = parser.parse(line);
            System.out.println("Searching for: " + query.toString(field));
            long start = System.currentTimeMillis();
            doPagingSearch(searcher, query, 5);
            System.out.println("Elapsed: " + (System.currentTimeMillis() - start)
                    + " ms");
        }
        reader.close();
    }

    public static void doPagingSearch(Searcher searcher, Query query,
            int hitsPerPage) throws IOException {
        // Collect enough hits for five pages.
        TopDocCollector collector = new TopDocCollector(5 * hitsPerPage);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        int numTotalHits = collector.getTotalHits();
        System.out.println(numTotalHits + " matching documents");

        // Only the first page is printed here.
        int start = 0;
        int end = Math.min(numTotalHits, hitsPerPage);
        for (int i = start; i < end; i++) {
            Document doc = searcher.doc(hits[i].doc);
            String path = doc.get("filename");
            if (path != null) {
                System.out.println((i + 1) + ". " + path);
            } else {
                System.out.println((i + 1) + ". (no filename stored for this document)");
            }
        }
    }
}
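doPagingSearch collects enough hits for five pages but only ever prints the first one. If you actually want to step through the results, one way (a sketch under my own assumptions; the page parameter is mine, not part of the demo) is to collect to a deeper cutoff and print the requested slice. This drops into the SearchFiles class above:

    // Hypothetical variant: print page `page` (0-based) of the results.
    public static void printPage(Searcher searcher, Query query,
            int hitsPerPage, int page) throws IOException {
        // Collect just enough hits to cover the requested page.
        TopDocCollector collector = new TopDocCollector((page + 1) * hitsPerPage);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        int start = page * hitsPerPage;
        int end = Math.min(hits.length, start + hitsPerPage);
        for (int i = start; i < end; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.println((i + 1) + ". " + doc.get("filename"));
        }
    }

Calling printPage(searcher, query, 5, 1) would then print hits 6 through 10, recollecting on each call rather than caching the earlier pages.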
Maybe my progress has been a bit slow; I have had things on my mind these past two days and could not quite settle down. I went to the railway station today to buy a ticket, and it felt no different from buying one at any other time: I got a ticket easily and even got a seat. Maybe that is because home is close by, or because the date I picked is not a peak one.
2009-01-12
The code below was pasted from the Paoding documentation, with some changes to make it work with Lucene 2.4:
package tutorial;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenGroup;
import org.apache.lucene.search.highlight.TokenSources;

public class PaodingExample {

    public static void main(String[] args) throws Exception {
        String INDEX_PATH = "index";

        // Get the Paoding Chinese analyzer.
        Analyzer analyzer = new PaodingAnalyzer();

        // Build the index: one document, storing term vectors with positions
        // and offsets so the highlighter can re-tokenize from the index.
        IndexWriter writer = new IndexWriter(INDEX_PATH, analyzer, true,
                IndexWriter.MaxFieldLength.LIMITED);
        Document doc = new Document();
        Field field = new Field("content", "你好,世界!", Field.Store.YES,
                Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
        doc.add(field);
        writer.addDocument(doc);
        writer.close();
        System.out.println("Indexed success!");

        // Search.
        IndexReader reader = IndexReader.open(INDEX_PATH);
        QueryParser parser = new QueryParser("content", analyzer);
        Query query = parser.parse("你好");
        Searcher searcher = new IndexSearcher(reader);
        TopDocCollector collector = new TopDocCollector(5);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        if (collector.getTotalHits() == 0) {
            System.out.println("hits.length=0");
            System.exit(0);
        }
        Document doc2 = searcher.doc(hits[0].doc);

        // Highlight the matched terms in the stored content.
        String text = doc2.get("content");
        TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(
                0, "content");
        TokenStream ts = TokenSources.getTokenStream(tpv);
        Formatter formatter = new Formatter() {
            public String highlightTerm(String srcText, TokenGroup g) {
                if (g.getTotalScore() <= 0) {
                    return srcText;
                }
                return "<b>" + srcText + "</b>";
            }
        };
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(
                query));
        String result = highlighter.getBestFragments(ts, text, 5, "…");
        System.out.println("result:\n\t" + result);
        reader.close();
    }
}
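One thing worth noting from the Paoding documentation: the analyzer has to be able to locate its dictionary files before a PaodingAnalyzer is constructed, otherwise the example above fails at startup. As far as I can tell this can be done through the PAODING_DIC_HOME environment variable, a paoding-dic-home.properties file on the classpath, or a system property; the property name and path in the sketch below are assumptions from my setup, so check the README that ships with the distribution:

    // Assumption: property name taken from my Paoding setup; verify it
    // against your version's README. This must run before the first
    // PaodingAnalyzer is constructed.
    System.setProperty("paoding.dic.home", "/path/to/paoding/dic");
    Analyzer analyzer = new PaodingAnalyzer();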