走进lucene - 创建索引、检索

最新推荐文章于 2024-09-30 10:36:58 发布

Erli11

最新推荐文章于 2024-09-30 10:36:58 发布

阅读量965

点赞数

分类专栏： JAVA 搜索引擎文章标签： lucene 搜索引擎源码

本文链接：https://blog.csdn.net/Erli11/article/details/26482327

版权

JAVA 同时被 2 个专栏收录

6 篇文章 0 订阅

订阅专栏

搜索引擎

1 篇文章 0 订阅

订阅专栏

看过王老师的信息检索导论，一直想学lucene，看看从实现的角度，搜索引擎是如何工作的。

正好工作中用到一点lucene，尝试使用，为后续分析做好准备。

截止到现在，lucene已经更新到4.8了，适配Java8，并做了很多封装。

自己会用3.6来进行学习。因为阅读lucene源码还是4版本以下的更为合适，而且对于一些简单查询，3.6也足够用了。

1 构建索引

/**
 * Builds a fresh Lucene index at {@code indexPath} from a UTF-8 text file.
 *
 * Each input line is expected to look like {@code <hid>\t<tags>}. Every
 * well-formed line becomes one document with two stored, un-analyzed
 * fields: "hid" and "tags". Lines without a tab are reported and skipped.
 * Any index that already exists in the directory is replaced
 * (OpenMode.CREATE).
 *
 * @param indexPath directory in which the index is created
 * @param inputFile path of the tab-separated input file
 * @return true if the whole file was indexed and the writer closed
 *         cleanly; false if an I/O error occurred
 */
public boolean indexBuild(String indexPath, String inputFile) {
		boolean suc = true;
		IndexWriter writer = null;
		BufferedReader in = null;
		try {
			Directory dir = FSDirectory.open(new File(indexPath));

			Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
			IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36,
					analyzer);
			// Always start from an empty index. To append instead, switch to
			// OpenMode.CREATE_OR_APPEND and replace addDocument below with
			// writer.updateDocument(new Term("hid", hid), doc).
			iwc.setOpenMode(OpenMode.CREATE);

			// Optional: for better indexing performance, if you
			// are indexing many documents, increase the RAM
			// buffer. But if you do this, increase the max heap
			// size to the JVM (eg add -Xmx512m or -Xmx1g):
			// iwc.setRAMBufferSizeMB(256.0);

			writer = new IndexWriter(dir, iwc);
			in = new BufferedReader(new InputStreamReader(
					new FileInputStream(inputFile), "UTF-8"));

			String line;
			while ((line = in.readLine()) != null) {
				int tabPos = line.indexOf('\t');
				if (tabPos < 0) {
					// Without a separator there is no hid/tags pair to index.
					System.err.println("skipping line without tab: " + line);
					continue;
				}
				String hid = line.substring(0, tabPos);
				String tags = line.substring(tabPos + 1);

				// One new document per input line; reusing a single Document
				// would pile up the fields of all previous lines in it.
				Document doc = new Document();

				// Indexed as a single verbatim term (searchable by exact
				// match), without norms, term frequencies or positions.
				Field hidField = new Field("hid", hid, Field.Store.YES,
						Field.Index.NOT_ANALYZED_NO_NORMS);
				hidField.setIndexOptions(IndexOptions.DOCS_ONLY);
				doc.add(hidField);

				Field tagsField = new Field("tags", tags, Field.Store.YES,
						Field.Index.NOT_ANALYZED_NO_NORMS);
				tagsField.setIndexOptions(IndexOptions.DOCS_ONLY);
				doc.add(tagsField);

				System.out.println("adding " + inputFile);
				writer.addDocument(doc);
			}
		} catch (IOException e) {
			e.printStackTrace();
			suc = false;
		} finally {
			// Release both resources even when indexing failed; an unclosed
			// IndexWriter keeps the index write lock.
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (writer != null) {
				try {
					writer.close();
				} catch (IOException e) {
					// close() commits pending changes, so a failure here
					// means the index may be incomplete.
					e.printStackTrace();
					suc = false;
				}
			}
		}
		return suc;
	}

2 检索

最开始的时候使用QueryParser生成Query，发现不需要分析的字段，检索不出结果来（QueryParser会先用Analyzer对查询串做分词、转小写等处理，得到的Term与NOT_ANALYZED字段中原样保存的Term可能对不上）。改为由Term生成Query就OK了。

	// Reader over the on-disk index; opened in init() and released in close().
	private IndexReader reader = null;
	// Created in init(); only referenced by the commented-out QueryParser
	// line in queryTags().
	private Analyzer analyzer = null;
	// Searcher built on top of 'reader' in init(); used by queryTags().
	private IndexSearcher searcher = null;

	/**
	 * Opens the index stored at {@code indexPath} and prepares the reader,
	 * searcher and analyzer used by the query methods.
	 *
	 * @param indexPath directory containing an existing Lucene index
	 * @return true if the index was opened; false if it is missing,
	 *         unreadable or corrupt (the stack trace is printed)
	 */
	public boolean init(String indexPath) {
		boolean suc = true;
		try {
			reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
			searcher = new IndexSearcher(reader);
			analyzer = new StandardAnalyzer(Version.LUCENE_36);
		} catch (IOException e) {
			// Also covers CorruptIndexException, which is an IOException.
			e.printStackTrace();
			// Report the failure instead of unconditionally returning true.
			suc = false;
		}

		return suc;
	}

	/**
	 * Looks up the stored "tags" value of the best-matching document whose
	 * "hid" field equals {@code hid} exactly.
	 *
	 * The query is built directly from a Term rather than through a
	 * QueryParser, so the value is matched verbatim without analysis.
	 *
	 * @param hid exact value of the "hid" field to search for
	 * @return the "tags" value of the top hit, or null if nothing matched,
	 *         the hit has no "tags" field, or an I/O error occurred
	 * @throws IllegalStateException if the searcher has not been initialised
	 */
	public String queryTags(String hid) {
		if (searcher == null) {
			throw new IllegalStateException(
					"searcher is not initialised; call init() first");
		}

		String result = null;
		try {
			Query query = new TermQuery(new Term("hid", hid));

			// Only the top hit is ever inspected, so only ask for that many.
			int hitsPerPage = 1;
			TopDocs results = searcher.search(query, hitsPerPage);
			ScoreDoc[] hits = results.scoreDocs;

			System.out.println(results.totalHits + " total matching documents");

			// Bound the loop by what was actually returned, not by totalHits.
			int end = Math.min(hits.length, hitsPerPage);
			for (int i = 0; i < end; i++) {
				Document doc = searcher.doc(hits[i].doc);
				String tags = doc.get("tags");
				if (tags != null) {
					result = tags;
					break;
				}
				System.out.println((i + 1) + " No tags for " + hid);
			}
		} catch (IOException e) {
			// Also covers CorruptIndexException, which is an IOException.
			e.printStackTrace();
		}
		return result;
	}

	/**
	 * Releases the searcher and the underlying index reader.
	 *
	 * Safe to call when init() failed or was never called. Each resource is
	 * closed independently, so a failure closing the searcher does not leave
	 * the reader open. Errors are printed, not propagated.
	 */
	public void close() {
		if (searcher != null) {
			try {
				searcher.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
			searcher = null;
		}
		if (reader != null) {
			try {
				reader.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
			reader = null;
		}
	}