14 - Using custom highlight tags and searching the title and content fields for the search keyword

TestIndex.java

package org.lucene.test;

import java.io.File;

import org.junit.Test;
import org.lucene.util.FileIndexUtil;
import org.lucene.util.IndexUtil;
import org.lucene.util.SearcherUtil;

public class TestIndex {
	/**
	 *@MethodName:testIndex
	 *@Description:Create the index
	 *@author:半仙儿
	 *@return void
	 *@date:2015-4-21 11:50:58 AM
	 */
	@Test
	public void testIndex() {
		IndexUtil iu = new IndexUtil();
		iu.index();
	}

	/**
	 *@MethodName:testTika01
	 *@Description:Parse a doc file with the Tika plugin and print it to the console
	 *@author:半仙儿
	 *@return void
	 *@date:2015-4-21 12:03:31 PM
	 */
	@Test
	public void testTika01() {
		IndexUtil iu = new IndexUtil();
		System.out.println(iu
				.fileToTxt(new File("D:/lucene/example2/职位JD.doc")));
	}

	/**
	 *@MethodName:testTika02
	 *@Description:Parse a doc file with Tika (second approach)
	 *@author:半仙儿
	 *@return void
	 *@date:2015-4-21 1:13:05 PM
	 */
	@Test
	public void testTika02() {
		IndexUtil iu = new IndexUtil();
		System.out
				.println(iu.tikaTool(new File("D:/lucene/example2/职位JD.doc")));
	}

	/**
	 *@MethodName:testIndex03
	 *@Description:Create the index with Tika
	 *@author:半仙儿
	 *@return void
	 *@date:2015-4-21 2:14:00 PM
	 */
	@Test
	public void testIndex03() {
		FileIndexUtil.index(true);
	}

	/**
	 *@MethodName:testSearcher01
	 *@Description:Search the index after the documents have been parsed with Tika
	 *@author:半仙儿
	 *@return void
	 *@date:2015-4-21 3:38:43 PM
	 */
	@Test
	public void testSearcher01() {
		SearcherUtil su = new SearcherUtil();
		su.searcher01();
	}

	/**
	 *@MethodName:testLighter01
	 *@Description:Test basic highlighting
	 *@author:半仙儿
	 *@return void
	 *@date:2015-4-21 4:49:44 PM
	 */
	@Test
	public void testLighter01() {
		SearcherUtil su = new SearcherUtil();
		su.lighter01();
	}
	
	/**
	 *@MethodName:testLighter02
	 *@Description:Search with highlighting through searcherByHighlighter (custom tags, title and content fields)
	 *@author:半仙儿
	 *@return void
	 */
	@Test
	public void testLighter02() {
		SearcherUtil su = new SearcherUtil();
		su.searcherByHighlighter("content:职位");
	}
}

FileIndexUtil.java

package org.lucene.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import org.apache.commons.io.FilenameUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

public class FileIndexUtil {
	private static Directory directory = null;
	static {
		try {
			directory = FSDirectory.open(new File("d:/lucene/files"));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	public static Directory getDirectory() {
		return directory;
	}

	/**
	 *@MethodName:generatorDocument
	 *@Description:Build a Lucene Document for a file (content, title, filename, type, path, page count, date and size)
	 *@param f
	 *@return
	 *@author:半仙儿
	 *@return Document
	 * @throws IOException
	 *@date:2015-4-21 2:05:48 PM
	 */
	public static Document generatorDocument(File f) throws IOException {
		Document doc = new Document();
		Metadata metadata = new Metadata();
		doc.add(new Field("content", new Tika().parse(new FileInputStream(f),
				metadata)));
		doc.add(new Field("title", FilenameUtils.getBaseName(f.getName()),
				Field.Store.YES, Field.Index.ANALYZED));
		doc.add(new Field("filename", f.getName(), Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		// file type (extension)
		doc.add(new Field("type", FilenameUtils.getExtension(f.getName()),
				Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
		doc.add(new Field("path", f.getAbsolutePath(), Field.Store.YES,
				Field.Index.NOT_ANALYZED));

		// page count: Tika reports it under the XMP key "xmpTPg:NPages"
		int page = 0;
		try {
			page = Integer.parseInt(metadata.get("xmpTPg:NPages"));
		} catch (Exception e) {
			// not every format provides a page count; fall back to 0
		}
		// store the page count
		doc.add(new NumericField("page", Field.Store.YES, true)
				.setIntValue(page));
		doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(f
				.lastModified()));
		doc.add(new NumericField("size", Field.Store.YES, true)
				.setIntValue((int) f.length() / 1024));
		return doc;
	}

	/**
	 *@MethodName:index
	 *@Description:Create the index
	 *@param hasNew whether to rebuild the index from scratch
	 *@author:半仙儿
	 *@return void
	 *@date:2015-4-15 4:05:04 PM
	 */
	public static void index(boolean hasNew) {
		IndexWriter writer = null;
		try {
			writer = new IndexWriter(directory, new IndexWriterConfig(
					Version.LUCENE_35, new MMSegAnalyzer()));
			if (hasNew) {
				writer.deleteAll();
			}
			File file = new File("d:/lucene/example2");
			Document doc = null;
			for (File f : file.listFiles()) {
				doc = generatorDocument(f);
				// generatorDocument has already parsed the file with Tika and added
				// the content, title, filename, type, path, page, date and size
				// fields, so the document can be written out directly
				writer.addDocument(doc);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			try {
				if (writer != null)
					writer.close();
			} catch (CorruptIndexException e) {
				e.printStackTrace();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
}
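
To verify what FileIndexUtil.index(true) actually wrote, the stored fields (including the NumericField values for page, date and size) can be read back from the index. The snippet below is a minimal sketch that is not part of the original project; the class name CheckIndex01 is made up, and it only relies on the same Lucene 3.5 API already used above.

package org.lucene.test;

import java.util.Date;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.lucene.util.FileIndexUtil;

public class CheckIndex01 {
	public static void main(String[] args) throws Exception {
		// open the index created by FileIndexUtil.index(true)
		IndexSearcher searcher = new IndexSearcher(IndexReader
				.open(FileIndexUtil.getDirectory()));
		// list every indexed document together with its stored fields
		TopDocs tds = searcher.search(new MatchAllDocsQuery(), 20);
		for (ScoreDoc sd : tds.scoreDocs) {
			Document doc = searcher.doc(sd.doc);
			// stored NumericField values come back as their string form
			System.out.println(doc.get("filename") + " | type=" + doc.get("type")
					+ " | pages=" + doc.get("page")
					+ " | size(KB)=" + doc.get("size")
					+ " | modified=" + new Date(Long.parseLong(doc.get("date"))));
		}
		searcher.close();
	}
}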

IndexUtil.java

package org.lucene.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

public class IndexUtil {
	/**
	 * 
	 *@MethodName:index
	 *@Description:Create the index
	 *@author:半仙儿
	 *@return void
	 *@date:2015-4-21 11:36:54 AM
	 */
	public void index() {
		try {
			File f = new File("D:/lucene/example2/职位JD.doc");
			Directory dir = FSDirectory.open(new File("d:/lucene/file2"));
			IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
					Version.LUCENE_35, new MMSegAnalyzer()));
			writer.deleteAll();
			Document doc = new Document();
			// index the raw file content with a plain FileReader (indexed, not stored);
			// binary formats such as .doc are not extracted properly this way, which is
			// what the Tika-based methods below address
			doc.add(new Field("content", new FileReader(f)));
			writer.addDocument(doc);
			writer.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 *@MethodName:fileToTxt
	 *@Description:Parse a doc file with Tika (AutoDetectParser)
	 *@param f
	 *@return
	 *@author:半仙儿
	 *@return String
	 *@date:2015-4-21 1:08:32 PM
	 */
	public String fileToTxt(File f) {
		Parser parser = new AutoDetectParser();
		InputStream is = null;
		try {
			Metadata metadata = new Metadata();
			metadata.set(Metadata.AUTHOR, "空号");
			is = new FileInputStream(f);
			ContentHandler handler = new BodyContentHandler();
			ParseContext context = new ParseContext();
			context.set(Parser.class, parser);
			parser.parse(is, handler, metadata, context);
			for (String name : metadata.names()) {
				System.out.println(name + ":" + metadata.get(name));
			}
			return handler.toString();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (is != null)
				try {
					is.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
		}
		return null;
	}

	/**
	 *@MethodName:tikaTool
	 *@Description:Parse a file with the Tika facade utility class
	 *@param f
	 *@return
	 *@author:半仙儿
	 *@return String
	 *@date:2015-4-21 1:09:27 PM
	 */
	public String tikaTool(File f) {
		Tika tika = new Tika();
		try {
			return tika.parseToString(f);
		} catch (IOException e) {
			e.printStackTrace();
		} catch (TikaException e) {
			e.printStackTrace();
		}
		return null;
	}
}
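
Besides fileToTxt (AutoDetectParser) and tikaTool (the facade's parseToString), the facade's parse(InputStream, Metadata) overload, which FileIndexUtil.generatorDocument already uses, returns the extracted text and fills a Metadata object in one pass. The standalone sketch below is not part of the original post; the class name TikaFacadeDemo is made up.

package org.lucene.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.Reader;

import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;

public class TikaFacadeDemo {
	public static void main(String[] args) throws Exception {
		Tika tika = new Tika();
		Metadata metadata = new Metadata();
		// parse(InputStream, Metadata) returns a Reader over the extracted text
		// and populates the Metadata object as the text is read
		Reader reader = tika.parse(new FileInputStream(
				new File("D:/lucene/example2/职位JD.doc")), metadata);
		StringBuilder text = new StringBuilder();
		BufferedReader br = new BufferedReader(reader);
		String line;
		while ((line = br.readLine()) != null) {
			text.append(line).append('\n');
		}
		br.close();
		// after the text has been fully read, the metadata is available
		for (String name : metadata.names()) {
			System.out.println(name + ":" + metadata.get(name));
		}
		System.out.println(text);
	}
}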

SearcherUtil.java

package org.lucene.util;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

public class SearcherUtil {
	public void searcher01() {
		try {
			IndexSearcher searcher = new IndexSearcher(IndexReader
					.open(FileIndexUtil.getDirectory()));
			TermQuery query = new TermQuery(new Term("content", "强"));
			TopDocs tds = searcher.search(query, 20);
			for (ScoreDoc sd : tds.scoreDocs) {
				Document doc = searcher.doc(sd.doc);
				System.out.println(doc.get("title"));
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 *@MethodName:searcherByHighlighter
	 *@Description:Search the title and content fields with highlighting (custom tags)
	 *@param name the query string
	 *@author:半仙儿
	 *@return void
	 *@date:2015-4-22 9:57:53 AM
	 */
	public void searcherByHighlighter(String name) {
		try {
			Analyzer a = new MMSegAnalyzer();
			IndexSearcher searcher = new IndexSearcher(IndexReader
					.open(FileIndexUtil.getDirectory()));
			// QueryParser parser = new QueryParser(Version.LUCENE_35, "title",
			// a);
			MultiFieldQueryParser parser = new MultiFieldQueryParser(
					Version.LUCENE_35, new String[] { "title", "content" }, a);
			Query query = parser.parse(name);
			TopDocs tds = searcher.search(query, 20);
			for (ScoreDoc sd : tds.scoreDocs) {
				Document doc = searcher.doc(sd.doc);
				String title = doc.get("title");
				title = ligterStr(a, query, title, "title");
				System.out.println("标题--->"+title);
				System.out
						.println("**************************************************************");
				String content = new Tika().parseToString(new File(doc
						.get("path")));
				content = ligterStr(a, query, content, "content");
				System.out.println("内容--->"+content);
				System.out
						.println("--------------------------------------------------------------");

			}
			searcher.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 *@MethodName:ligterStr
	 *@Description:Return the best highlighted fragment of txt for the given query,
	 *             wrapping matches in <b></b>; falls back to the original text when
	 *             nothing matches
	 *@param a
	 *@param query
	 *@param txt
	 *@param fieldname
	 *@author:半仙儿
	 *@return String
	 */
	private String ligterStr(Analyzer a, Query query, String txt,
			String fieldname) throws IOException, InvalidTokenOffsetsException {
		String str = null;
		QueryScorer scorer = new QueryScorer(query);
		Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
		Formatter fmt = new SimpleHTMLFormatter("<b>", "</b>");
		Highlighter lighter = new Highlighter(fmt, scorer);
		lighter.setTextFragmenter(fragmenter);
		str = lighter.getBestFragment(a, fieldname, txt);
		if (str == null)
			return txt;
		return str;
	}

	/**
	 *@MethodName:lighter01
	 *@Description:Highlighting basics
	 *@author:半仙儿
	 *@return void
	 *@date:2015-4-21 4:48:49 PM
	 */
	public void lighter01() {
		try {
			// sample Chinese text: "I love Beijing Tiananmen; flags fly over Tiananmen;
			// great leader Chairman Mao guides us forward"
			String txt = "我爱北京天安门,天安门上彩旗飞,伟大领袖毛主席,指引我们向前进";

			// highlight only "北京" (single-term version)
			// TermQuery query = new TermQuery(new Term("f", "北京"));
			// highlight every occurrence of "北京" and "伟大"
			Query query = new QueryParser(Version.LUCENE_35, "f",
					new MMSegAnalyzer()).parse("北京 伟大");
			QueryScorer scorer = new QueryScorer(query);
			Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
			// use custom highlight tags instead of the default <B></B>
			Formatter formatter = new SimpleHTMLFormatter(
					"<span style='color:red'>", "</span>");
			Highlighter highlighter = new Highlighter(formatter, scorer);
			highlighter.setTextFragmenter(fragmenter);
			String str = highlighter.getBestFragment(new MMSegAnalyzer(), "f",
					txt);
			System.out.println(str);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}
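
ligterStr returns only the single best fragment, which is why long documents print just one snippet of their content. If several highlighted snippets are wanted, Highlighter also offers getBestFragments(Analyzer, String, String, int). The helper below is a sketch meant to be dropped into SearcherUtil (it reuses the imports already present there); the method name ligterFragments and the " ... " separator are choices made here, not part of the original code.

	// a variant of ligterStr that returns up to maxFragments highlighted snippets,
	// joined with " ... "; falls back to the original text when nothing matches
	private String ligterFragments(Analyzer a, Query query, String txt,
			String fieldname, int maxFragments) throws IOException,
			InvalidTokenOffsetsException {
		QueryScorer scorer = new QueryScorer(query);
		Highlighter lighter = new Highlighter(
				new SimpleHTMLFormatter("<b>", "</b>"), scorer);
		lighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
		String[] frags = lighter.getBestFragments(a, fieldname, txt, maxFragments);
		if (frags == null || frags.length == 0)
			return txt;
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < frags.length; i++) {
			if (i > 0)
				sb.append(" ... ");
			sb.append(frags[i]);
		}
		return sb.toString();
	}

In searcherByHighlighter, the line content = ligterStr(a, query, content, "content") could then be swapped for ligterFragments(a, query, content, "content", 3) to show up to three highlighted passages per document.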



