12 - 使用 Tika 创建索引并进行搜索

最新推荐文章于 2022-05-23 15:31:15 发布

会编程的阿强

最新推荐文章于 2022-05-23 15:31:15 发布

阅读量800

点赞数

分类专栏： lucene

本文链接：https://blog.csdn.net/shuangrenyu1234/article/details/45173183

版权

lucene 专栏收录该内容

15 篇文章 0 订阅

订阅专栏

TestIndex.java

package org.lucene.test;

import java.io.File;

import org.junit.Test;
import org.lucene.util.FileIndexUtil;
import org.lucene.util.IndexUtil;
import org.lucene.util.SearcherUtil;

/**
 * JUnit entry points that exercise the indexing, Tika parsing and search
 * helpers of this example. Each test simply drives one helper method; the
 * results are printed to the console rather than asserted.
 *
 * @author 半仙儿
 */
public class TestIndex {
	/**
	 * Builds the plain (non-Tika) index through {@link IndexUtil#index()}.
	 * Written 2015-4-21.
	 */
	@Test
	public void testIndex() {
		new IndexUtil().index();
	}

	/**
	 * Parses the sample .doc file with {@link IndexUtil#fileToTxt(File)} and
	 * prints the returned text to the console. Written 2015-4-21.
	 */
	@Test
	public void testTika01() {
		IndexUtil util = new IndexUtil();
		File sample = new File("D:/lucene/example2/职位JD.doc");
		String text = util.fileToTxt(sample);
		System.out.println(text);
	}

	/**
	 * Parses the sample .doc file the second way, through
	 * {@link IndexUtil#tikaTool(File)}, and prints the returned text.
	 * Written 2015-4-21.
	 */
	@Test
	public void testTika02() {
		IndexUtil util = new IndexUtil();
		File sample = new File("D:/lucene/example2/职位JD.doc");
		String text = util.tikaTool(sample);
		System.out.println(text);
	}

	/**
	 * Creates the Tika-backed index, asking {@link FileIndexUtil} to rebuild
	 * it from scratch. Written 2015-4-21.
	 */
	@Test
	public void testIndex03() {
		boolean rebuild = true;
		FileIndexUtil.index(rebuild);
	}

	/**
	 * Runs the sample search against the index that was built from the
	 * Tika-parsed documents. Written 2015-4-21.
	 */
	@Test
	public void testSearcher01() {
		new SearcherUtil().searcher01();
	}
}

FileIndexUtil.java

package org.lucene.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import org.apache.commons.io.FilenameUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Builds a Lucene index over the files of a fixed example folder, using Tika
 * to extract each file's text content.
 *
 * @author 半仙儿
 */
public class FileIndexUtil {
	/**
	 * Index location shared by indexing and searching. Remains {@code null}
	 * when opening the directory failed in the static initializer.
	 */
	private static Directory directory = null;
	static {
		try {
			directory = FSDirectory.open(new File("d:/lucene/files"));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Returns the shared index directory.
	 *
	 * @return the directory opened at class-load time, or {@code null} if
	 *         opening it failed
	 */
	public static Directory getDirectory() {
		return directory;
	}

	/**
	 * Builds the Lucene document for one file: its Tika-extracted content
	 * plus title, file name, type (extension), path, page count,
	 * last-modified date and size in KB.
	 *
	 * @param f the file to describe; must be a readable regular file
	 * @return a document holding all of the fields listed above
	 * @throws IOException if the file cannot be opened or Tika cannot start
	 *         parsing it
	 */
	public static Document generatorDocument(File f) throws IOException {
		Document doc = new Document();
		Metadata metadata = new Metadata();

		FileInputStream in = new FileInputStream(f);
		try {
			// The reader returned by Tika takes over the stream from here on.
			doc.add(new Field("content", new Tika().parse(in, metadata)));
		} catch (IOException e) {
			// Tika never took ownership, so release the handle ourselves.
			in.close();
			throw e;
		}

		String fileName = f.getName();
		doc.add(new Field("title", FilenameUtils.getBaseName(fileName),
				Field.Store.YES, Field.Index.ANALYZED));
		doc.add(new Field("filename", fileName, Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		// File type, taken from the extension.
		doc.add(new Field("type", FilenameUtils.getExtension(fileName),
				Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
		doc.add(new Field("path", f.getAbsolutePath(), Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		// Page count as reported by Tika, 0 when unavailable.
		doc.add(new NumericField("page", Field.Store.YES, true)
				.setIntValue(readPageCount(metadata)));
		doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(f
				.lastModified()));
		doc.add(new NumericField("size", Field.Store.YES, true)
				.setIntValue(sizeInKb(f)));
		return doc;
	}

	/**
	 * Reads the page count from the Tika metadata.
	 *
	 * NOTE(review): Tika.parse returns a Reader, so the metadata may not be
	 * populated yet when this is called - confirm against the Tika version
	 * in use. A missing or non-numeric value yields 0.
	 */
	private static int readPageCount(Metadata metadata) {
		String raw = metadata.get("xmpTPg:NPage");
		if (raw == null) {
			return 0;
		}
		try {
			return Integer.parseInt(raw);
		} catch (NumberFormatException e) {
			// Deliberate best effort: an unparsable page count is stored as 0.
			return 0;
		}
	}

	/** Returns the file size in whole KB; divides before narrowing to int. */
	private static int sizeInKb(File f) {
		return (int) (f.length() / 1024);
	}

	/**
	 * Indexes every regular file directly inside {@code d:/lucene/example2}.
	 * Sub-directories are skipped. Failures are printed, not rethrown.
	 *
	 * @param hasNew when {@code true}, all existing documents are deleted
	 *        before indexing so the index is rebuilt from scratch
	 */
	public static void index(boolean hasNew) {
		IndexWriter writer = null;
		try {
			writer = new IndexWriter(directory, new IndexWriterConfig(
					Version.LUCENE_35, new MMSegAnalyzer()));
			if (hasNew) {
				writer.deleteAll();
			}
			File folder = new File("d:/lucene/example2");
			File[] files = folder.listFiles();
			if (files == null) {
				// listFiles() returns null for a missing or unreadable folder.
				System.err.println("Cannot list files in "
						+ folder.getAbsolutePath());
				return;
			}
			for (File f : files) {
				if (!f.isFile()) {
					continue;
				}
				// generatorDocument already adds every field exactly once.
				writer.addDocument(generatorDocument(f));
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			try {
				if (writer != null) {
					writer.close();
				}
			} catch (CorruptIndexException e) {
				e.printStackTrace();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
}

IndexUtil.java

package org.lucene.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Small helpers that index a sample document and extract text from files
 * with Tika.
 *
 * @author 半仙儿
 */
public class IndexUtil {
	/**
	 * Rebuilds the index in {@code d:/lucene/file2} with a single document
	 * whose "content" field is read straight from the sample .doc file
	 * through a {@link FileReader} (no Tika parsing). Failures are printed,
	 * not rethrown. The writer, directory and reader are always released,
	 * so a failure does not leave the index locked.
	 */
	public void index() {
		Directory dir = null;
		IndexWriter writer = null;
		FileReader reader = null;
		try {
			File f = new File("D:/lucene/example2/职位JD.doc");
			dir = FSDirectory.open(new File("d:/lucene/file2"));
			writer = new IndexWriter(dir, new IndexWriterConfig(
					Version.LUCENE_35, new MMSegAnalyzer()));
			writer.deleteAll();
			Document doc = new Document();
			reader = new FileReader(f);
			doc.add(new Field("content", reader));
			writer.addDocument(doc);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (writer != null) {
				try {
					writer.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (reader != null) {
				try {
					// Covers the case where indexing failed before the
					// reader was consumed.
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (dir != null) {
				try {
					dir.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Parses a file with Tika's {@link AutoDetectParser}, prints every
	 * metadata entry to the console and returns the extracted body text.
	 *
	 * @param f the file to parse
	 * @return the extracted text, or {@code null} if parsing failed (the
	 *         failure is printed)
	 */
	public String fileToTxt(File f) {
		Parser parser = new AutoDetectParser();
		InputStream is = null;
		try {
			Metadata metadata = new Metadata();
			metadata.set(Metadata.AUTHOR, "空号");
			is = new FileInputStream(f);
			ContentHandler handler = new BodyContentHandler();
			ParseContext context = new ParseContext();
			context.set(Parser.class, parser);
			parser.parse(is, handler, metadata, context);
			for (String name : metadata.names()) {
				System.out.println(name + ":" + metadata.get(name));
			}
			return handler.toString();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (is != null) {
				try {
					is.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return null;
	}

	/**
	 * Extracts a file's text through the {@link Tika} facade.
	 *
	 * @param f the file to parse
	 * @return the extracted text, or {@code null} if reading or parsing
	 *         failed (the failure is printed)
	 */
	public String tikaTool(File f) {
		Tika tika = new Tika();
		try {
			return tika.parseToString(f);
		} catch (IOException e) {
			e.printStackTrace();
		} catch (TikaException e) {
			e.printStackTrace();
		}
		return null;
	}
}

SearcherUtil.java

package org.lucene.util;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

/**
 * Runs a sample term search against the index built by {@link FileIndexUtil}.
 */
public class SearcherUtil {
	/**
	 * Searches the "content" field for the term "强", takes up to 20 hits
	 * and prints the stored "title" of each one. Failures are printed, not
	 * rethrown. The searcher and its reader are closed afterwards.
	 */
	public void searcher01() {
		IndexReader reader = null;
		IndexSearcher searcher = null;
		try {
			reader = IndexReader.open(FileIndexUtil.getDirectory());
			searcher = new IndexSearcher(reader);
			TermQuery query = new TermQuery(new Term("content", "强"));
			TopDocs tds = searcher.search(query, 20);
			for (ScoreDoc sd : tds.scoreDocs) {
				Document doc = searcher.doc(sd.doc);
				System.out.println(doc.get("title"));
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (searcher != null) {
				try {
					searcher.close();
				} catch (java.io.IOException e) {
					e.printStackTrace();
				}
			}
			// A searcher built from an existing reader does not close that
			// reader, so it is released explicitly.
			if (reader != null) {
				try {
					reader.close();
				} catch (java.io.IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}