Lucene检索WORD等文件

最新推荐文章于 2022-08-29 19:56:57 发布

JJC001

最新推荐文章于 2022-08-29 19:56:57 发布

阅读量1.8k

点赞数

分类专栏： Elasticsearch+Lucene 文章标签： lucene 搜索引擎

本文链接：https://blog.csdn.net/edc0228/article/details/51791741

版权

Elasticsearch+Lucene 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

### Lucene是什么？
#### Lucene是Apache开源的全文检索框架（类库），它不是像百度那样拿来就能用的搜索引擎，需要自己编写代码来建立索引并实现搜索。
### Lucene实现检索的过程？
#### Lucene实际上是先将文本写入索引，然后再从索引中把它搜索出来。
###写入：
####涉及的类: Document 、Field 、IndexWriter
####Document相当于数据库表的一行，Field相当于数据库表的一个字段，Document可以包含多个Field，用IndexWriter对象将Document对象写在磁盘上或内存里，这就实现了字符串的写入！
###搜索：
####对写入的文本进行搜索！

###如何检索WORD等microsoft文件？
####要实现Lucene检索WORD等文件，首先需要读取出WORD文件中的内容，再使用Lucene将内容写入。

###如何读取microsoft文件？
####可以使用apache的POI开源项目进行读取。

##### Java代码（以下每个类需分别保存为独立的 .java 文件）：

package org.fazlan.lucene.demo;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Writes {@link IndexItem}s into the Lucene index stored under {@link #INDEX_DIR}.
 *
 * <p>Call {@link #close()} once all items have been written.
 */
public class Indexer {

	// location where the index will be stored.
	public static final String INDEX_DIR = "src/main/resources/index";

	private final IndexWriter writer;

	/**
	 * Opens an {@link IndexWriter} on {@link #INDEX_DIR} using a {@link StandardAnalyzer}.
	 *
	 * @throws IllegalStateException
	 *             if the index directory or writer cannot be opened; the
	 *             original I/O failure is kept as the cause
	 */
	public Indexer() {
		try {
			writer = new IndexWriter(FSDirectory.open(new File(INDEX_DIR)),
					new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
		} catch (IOException e) {
			// Fail fast: continuing with a null writer would only surface later
			// as an unrelated NullPointerException in writeIndex()/close().
			throw new IllegalStateException("Unable to open Lucene index at " + INDEX_DIR, e);
		}
	}

	/**
	 * Adds the item to the index, replacing any previously indexed document
	 * that has the same id.
	 *
	 * @param indexItem
	 *            the item to index; its id must not be null
	 * @throws IOException
	 *             if the index cannot be written
	 */
	public void writeIndex(IndexItem indexItem) throws IOException {
		String id = indexItem.getId().toString();

		Document doc = new Document();
		addField(doc, IndexItem.ID, id, Field.Index.NOT_ANALYZED);
		addField(doc, IndexItem.TITLE, indexItem.getTitle(), Field.Index.ANALYZED);
		addField(doc, IndexItem.FILENAME, indexItem.getFilename(), Field.Index.NOT_ANALYZED);
		addField(doc, IndexItem.CONTENT, indexItem.getContent(), Field.Index.ANALYZED);
		addField(doc, IndexItem.DATE, indexItem.getDate(), Field.Index.NOT_ANALYZED);
		addField(doc, IndexItem.USER_NAME, indexItem.getUserName(), Field.Index.NOT_ANALYZED);
		addField(doc, IndexItem.URL, indexItem.getUrl(), Field.Index.NOT_ANALYZED);

		// updateDocument removes documents matching the id term and adds the
		// new one in a single call, instead of a separate delete + add.
		writer.updateDocument(new Term(IndexItem.ID, id), doc);
	}

	/**
	 * Adds a stored field to the document, skipping null values because
	 * {@link Field} rejects a null value.
	 */
	private static void addField(Document doc, String name, String value, Field.Index index) {
		if (value != null) {
			doc.add(new Field(name, value, Field.Store.YES, index));
		}
	}

	/**
	 * Closing the writer
	 *
	 * @throws IOException
	 *             if closing the underlying writer fails
	 */
	public void close() throws IOException {
		writer.close();
	}
}

package org.fazlan.lucene.demo;

/**
 * Immutable value object describing one indexed document. Adjust the fields
 * to match business requirements.
 *
 * <p>The {@code public static final} constants are the field names under
 * which each property is stored in the index.
 *
 * @author JiaJiCheng
 *
 */
public class IndexItem {

	private final Long id;

	private final String title;

	private final String filename;

	private final String content;

	private final String date;

	private final String userName;

	private final String url;

	public static final String ID = "id";
	public static final String TITLE = "title";
	public static final String CONTENT = "content";
	public static final String DATE = "date";
	public static final String USER_NAME = "userName";
	public static final String FILENAME = "filename";
	public static final String URL = "url";

	/**
	 * Creates an item; every argument is stored as given, without validation.
	 *
	 * @param id
	 *            identifier of the item
	 * @param title
	 *            title of the item
	 * @param filename
	 *            file name of the item
	 * @param content
	 *            text content of the item
	 * @param date
	 *            date, kept as an opaque string
	 * @param userName
	 *            user name associated with the item
	 * @param url
	 *            URL associated with the item
	 */
	public IndexItem(Long id, String title, String filename, String content, String date, String userName, String url) {
		this.id = id;
		this.title = title;
		this.filename = filename;
		this.content = content;
		this.date = date;
		this.userName = userName;
		this.url = url;
	}

	/** @return the file name */
	public String getFilename() {
		return filename;
	}

	/** @return the URL */
	public String getUrl() {
		return url;
	}

	/** @return the date string */
	public String getDate() {
		return date;
	}

	/** @return the user name */
	public String getUserName() {
		return userName;
	}

	/** @return the identifier */
	public Long getId() {
		return id;
	}

	/** @return the title */
	public String getTitle() {
		return title;
	}

	/** @return the text content */
	public String getContent() {
		return content;
	}

	/**
	 * Returns a debug representation listing every property, each string
	 * value wrapped in single quotes and separated by {@code ", "}.
	 */
	@Override
	public String toString() {
		return "IndexItem{" + "id=" + id
				+ ", title='" + title + "'"
				+ ", filename='" + filename + "'"
				+ ", content='" + content + "'"
				+ ", date='" + date + "'"
				+ ", userName='" + userName + "'"
				+ ", url='" + url + "'"
				+ "}";
	}
}

package org.fazlan.lucene.demo;

import org.apache.poi.extractor.ExtractorFactory;

import java.io.File;
import java.io.IOException;

/**
 * File converter: extracts the text of a document with Apache POI's
 * {@link ExtractorFactory} and wraps it in an {@link IndexItem}.
 * 
 * @author JiaJiCheng
 *
 */
public class MSDocumentParser {
	/**
	 * Strips the extension (the part from the last '.') from a file name.
	 * Names without a '.' and names whose only '.' is the first character
	 * are returned unchanged rather than failing or yielding an empty title.
	 */
	private static String getFilename(String filename) {
		int dot = filename.lastIndexOf('.');
		return dot > 0 ? filename.substring(0, dot) : filename;
	}

	/**
	 * Reads the text of {@code file} and builds the item to index.
	 *
	 * @param file
	 *            the document to read
	 * @param date
	 *            date string stored with the item
	 * @param userName
	 *            user name stored with the item
	 * @param url
	 *            URL stored with the item
	 * @return an item whose title is the file name without extension and
	 *         whose content is the extracted text
	 * @throws IOException
	 *             if the file cannot be read or its text cannot be extracted;
	 *             non-I/O extraction failures are wrapped with the original
	 *             exception as the cause
	 */
	public static IndexItem parser(File file, String date, String userName, String url) throws IOException {
		String content;
		try {
			content = ExtractorFactory.createExtractor(file).getText();
		} catch (IOException e) {
			throw e;
		} catch (Exception e) {
			// Report the failure instead of continuing with null content,
			// which cannot be indexed.
			throw new IOException("Failed to extract text from " + file.getPath(), e);
		}

		// NOTE(review): the id comes from File.hashCode(), so two different
		// paths may produce the same id — confirm this is acceptable.
		return new IndexItem((long) file.hashCode(), getFilename(file.getName()), file.getName(), content, date,
				userName, url);
	}
}

package org.fazlan.lucene.demo;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Searches the index written by {@link Indexer}, by title or by content.
 *
 * <p>Call {@link #close()} when finished to release the searcher, the index
 * reader and the directory.
 */
public class Searcher {
	private final FSDirectory directory;
	private final IndexReader reader;
	private final IndexSearcher searcher;
	private final QueryParser titleQueryParser;
	private final QueryParser contentQueryParser;
	private static final StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
	// default find result size.
	private static final int DEFAULT_RESULT_SIZE = 100;

	/**
	 * Opens the index at {@link Indexer#INDEX_DIR} for searching.
	 *
	 * @throws IOException
	 *             if the index cannot be opened
	 */
	public Searcher() throws IOException {
		// open the index directory to search; the directory and reader are
		// kept so close() can release them.
		directory = FSDirectory.open(new File(Indexer.INDEX_DIR));
		try {
			reader = IndexReader.open(directory);
		} catch (IOException e) {
			directory.close();
			throw e;
		}
		searcher = new IndexSearcher(reader);

		// defining the query parser to search items by title field.
		titleQueryParser = new QueryParser(Version.LUCENE_36, IndexItem.TITLE, analyzer);

		// defining the query parser to search items by content field.
		contentQueryParser = new QueryParser(Version.LUCENE_36, IndexItem.CONTENT, analyzer);
	}

	/**
	 * This method is used to find the indexed items by the title.
	 * 
	 * @param queryString
	 *            - the query string to search for
	 * @return at most {@code DEFAULT_RESULT_SIZE} matching items
	 * @throws ParseException
	 *             if the query string cannot be parsed
	 * @throws IOException
	 *             if the index cannot be read
	 */
	public List<IndexItem> findByTitle(String queryString) throws ParseException, IOException {
		return find(titleQueryParser, queryString);
	}

	/**
	 * This method is used to find the indexed items by the content.
	 * 
	 * @param queryString
	 *            - the query string to search for
	 * @return at most {@code DEFAULT_RESULT_SIZE} matching items
	 * @throws ParseException
	 *             if the query string cannot be parsed
	 * @throws IOException
	 *             if the index cannot be read
	 */
	public List<IndexItem> findByContent(String queryString) throws ParseException, IOException {
		return find(contentQueryParser, queryString);
	}

	/**
	 * Parses the query string with the given parser, runs the query and
	 * converts each hit into an {@link IndexItem}.
	 */
	private List<IndexItem> find(QueryParser parser, String queryString) throws ParseException, IOException {
		// create query from the incoming query string.
		Query query = parser.parse(queryString);
		// execute the query and get the results
		ScoreDoc[] hits = searcher.search(query, DEFAULT_RESULT_SIZE).scoreDocs;

		List<IndexItem> results = new ArrayList<IndexItem>(hits.length);
		for (ScoreDoc hit : hits) {
			results.add(toIndexItem(searcher.doc(hit.doc)));
		}
		return results;
	}

	/** Rebuilds an {@link IndexItem} from the stored fields of a document. */
	private static IndexItem toIndexItem(Document doc) {
		return new IndexItem(Long.parseLong(doc.get(IndexItem.ID)), doc.get(IndexItem.TITLE),
				doc.get(IndexItem.FILENAME), doc.get(IndexItem.CONTENT), doc.get(IndexItem.DATE),
				doc.get(IndexItem.USER_NAME), doc.get(IndexItem.URL));
	}

	/**
	 * Releases the searcher, the index reader and the directory.
	 *
	 * @throws IOException
	 *             if any of the resources fails to close
	 */
	public void close() throws IOException {
		try {
			searcher.close();
		} finally {
			// IndexSearcher.close() leaves a reader that was passed to its
			// constructor open, so the reader and directory are closed here.
			try {
				reader.close();
			} finally {
				directory.close();
			}
		}
	}
}

package org.fazlan.lucene.demo;

import org.apache.lucene.queryParser.ParseException;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Example application: indexes three sample Office files and then searches
 * the index by title.
 * 
 * @author JiaJiCheng
 *
 */
public class FileIndexApplication {
	/**
	 * Indexes the sample files, runs one title search and prints the hits.
	 *
	 * @param args
	 *            unused
	 * @throws IOException
	 *             if a file cannot be read or the index cannot be accessed
	 * @throws ParseException
	 *             if the search query cannot be parsed
	 */
	public static void main(String[] args) throws IOException, ParseException {

		File[] sampleFiles = {
				new File("src/main/resources/files/MSWord.doc"),
				new File("src/main/resources/files/MSWord.docx"),
				new File("src/main/resources/files/招商局系统.xls") };

		// creating the indexer and indexing the items
		Indexer indexer = new Indexer();
		try {
			for (File sampleFile : sampleFiles) {
				indexer.writeIndex(MSDocumentParser.parser(sampleFile, "1990-0-0", "zhangsan", "www.baidu.com"));
			}
		} finally {
			// close the writer even if indexing fails, and always before the
			// searcher is opened on the same index location
			indexer.close();
		}

		// creating the Searcher to the same index location as the Indexer
		Searcher searcher = new Searcher();
		try {
			List<IndexItem> result = searcher.findByTitle("招");
			print(result);
		} finally {
			searcher.close();
		}
	}

	/**
	 * print the results.
	 */
	private static void print(List<IndexItem> result) {
		System.out.println("Result Size: " + result.size());

		for (IndexItem item : result) {
			System.out.println(item);
		}
	}
}

 <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>3.6.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.8</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.8</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>

##完整的项目代码见附件