建立索引:
package paoding;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
public class IndexFiles {

    /**
     * Builds a Lucene index over the files under a hard-coded directory using
     * the Paoding Chinese analyzer, then reports the elapsed time in ms.
     */
    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        IndexWriter writer = null;
        try {
            // Paoding Chinese word-segmentation analyzer.
            Analyzer analyzer = new PaodingAnalyzer();
            // Analyzer analyzer = new StandardAnalyzer();
            // true = create a fresh index, overwriting any existing one.
            writer = new IndexWriter("f:\\indexpaoding", analyzer, true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            indexDocs(writer, new File("F:\\徐剛:28tel(繁firfox)"));
            writer.optimize();
            System.out.println("用时:" + (System.currentTimeMillis() - start)
                    + " 毫秒");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always close the writer so the index lock is released even if
            // indexing failed part-way through.
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Recursively walks {@code file} and adds one indexed Document for every
     * readable file whose name ends with .htm/.html/.jsp/.php/.txt.
     *
     * @param writer open IndexWriter to add documents to
     * @param file   file or directory to index
     * @throws IOException on index write or path resolution errors
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            String[] children = file.list();
            if (children != null) {
                for (int i = 0; i < children.length; i++) {
                    indexDocs(writer, new File(file, children[i]));
                }
            }
            return;
        }
        String name = file.getName();
        if (name.endsWith(".htm") || name.endsWith(".html")
                || name.endsWith(".jsp") || name.endsWith(".php")
                || name.endsWith(".txt")) {
            System.out.println("添加 " + file);
            try {
                // One Document per file — the analogue of one "record".
                Document doc = new Document();
                // Both fields are stored, analyzed (tokenized), and carry
                // positions/offsets so search results can be highlighted.
                doc.add(new Field("filename", file.getCanonicalPath(),
                        Field.Store.YES, Field.Index.ANALYZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS));
                doc.add(new Field("contents", ReadFile(file),
                        Field.Store.YES, Field.Index.ANALYZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS));
                writer.addDocument(doc);
            } catch (FileNotFoundException fnfe) {
                // File disappeared between listing and reading; skip it but
                // say so instead of swallowing the error silently.
                System.err.println("skipped missing file: " + file);
            }
        }
    }

    /**
     * Reads the whole file as UTF-8 and returns its lines concatenated.
     * Line separators are dropped — this matches the original indexing
     * behavior. Returns what was read so far (possibly "") on I/O error.
     */
    public static String ReadFile(File f) {
        StringBuilder content = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(f), "utf-8"));
            String line;
            while ((line = br.readLine()) != null) {
                content.append(line);
            }
        } catch (IOException e) { // FileNotFoundException is an IOException
            e.printStackTrace();
        } finally {
            // Fix: the original leaked this reader (never closed).
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return content.toString();
    }
}
用来搜索:带简单分页效果
package paoding;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenGroup;
import org.apache.lucene.search.highlight.TokenSources;
public class SearchFiles {

    /** Number of result pages collected (cached) per search pass. */
    int CACHE_PAGE = 3;

    /**
     * Searches the "filename" and "contents" fields for {@code key} and
     * prints one page of hits, highlighting matched terms with &lt;b&gt; tags.
     *
     * @param key     the query string
     * @param perPage how many hits per page
     * @param begin   1-based page number to display
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           on index access errors
     * @throws ParseException        if the query string cannot be parsed
     */
    public void search(String key, int perPage, int begin)
            throws CorruptIndexException, IOException, ParseException {
        String indexPath = "f:\\indexpaoding"; // index directory (typo IDNEX fixed)
        int totalPages = 0;
        // Paoding Chinese analyzer — must match the analyzer used at index time.
        Analyzer analyzer = new PaodingAnalyzer();
        // Analyzer analyzer = new StandardAnalyzer();
        IndexReader reader = IndexReader.open(indexPath);
        try {
            Searcher searcher = new IndexSearcher(reader);
            // Search both fields; a match in either is enough (boolean OR).
            BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD,
                    BooleanClause.Occur.SHOULD };
            Query query = MultiFieldQueryParser.parse(key, new String[] {
                    "filename", "contents" }, clauses, analyzer);
            // QueryParser parser = new QueryParser("contents", analyzer);
            // Query query = parser.parse(key);
            // Collect at most perPage * CACHE_PAGE hits up front.
            TopDocCollector collector = new TopDocCollector(perPage * CACHE_PAGE);
            searcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            int numTotalHits = collector.getTotalHits();
            System.out.println("符合查询词的文件数:" + numTotalHits);
            // Total number of pages, rounding up.
            if (numTotalHits % perPage != 0) {
                totalPages = numTotalHits / perPage + 1;
            } else {
                totalPages = numTotalHits / perPage;
            }
            if (begin > totalPages) {
                System.err.println("超出范围");
            } else if (begin > CACHE_PAGE) {
                // Requested page lies beyond the cached window: widen the cache
                // to cover every page and search again recursively.
                // NOTE: this permanently grows CACHE_PAGE for later calls on
                // this instance.
                CACHE_PAGE = totalPages;
                search(key, perPage, begin);
            } else {
                // Hit indices [from, to) for the requested page.
                int from = (begin - 1) * perPage;
                int to = Math.min(from + perPage, numTotalHits);
                for (int i = from; i < to; i++) {
                    System.out.println(i);
                    int docId = hits[i].doc;
                    Document doc = searcher.doc(docId);
                    String filename = doc.get("filename");
                    System.out.println("filename=" + filename);
                    // Highlight matched terms in the stored contents, using the
                    // term vector (positions/offsets) saved at index time.
                    String text = doc.get("contents");
                    TermPositionVector tpv = (TermPositionVector) reader
                            .getTermFreqVector(docId, "contents");
                    TokenStream ts = TokenSources.getTokenStream(tpv);
                    Formatter formatter = new Formatter() {
                        public String highlightTerm(String srcText, TokenGroup g) {
                            if (g.getTotalScore() <= 0) {
                                return srcText;
                            }
                            return "<b>" + srcText + "</b>";
                        }
                    };
                    Highlighter highlighter = new Highlighter(formatter,
                            new QueryScorer(query));
                    String result = highlighter.getBestFragments(ts, text, 5,
                            "…");
                    System.out.println("result:\n\t" + result);
                }
                System.out.println("循环结束");
            }
        } finally {
            // Fix: the original skipped this close if anything above threw.
            reader.close();
            System.out.println("关闭reader");
        }
    }

    public static void main(String[] args) throws Exception {
        SearchFiles sf = new SearchFiles();
        sf.search("vvczvxcxz", 5, 1);
    }
}