开源全文搜索工具包Lucene3.0.1的使用。
项目环境:Struts2 (2.1.8) + Hibernate (3.0) + Spring (2.5),JDK:1.6,IDE:MyEclipse 8.5
项目需求:对站内发布的新闻进行全文检索
新闻实体News
/**
 * News entity: the record that is indexed and returned by the full-text search.
 */
public class News {

    /** Identifier of the news item. */
    private int id;

    /** Title. */
    private String title;

    /** Contents. */
    private String contents;

    /** @return the identifier of the news item */
    public int getId() {
        return id;
    }

    /** @param id the identifier of the news item */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the title, possibly {@code null} if never set */
    public String getTitle() {
        return title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the contents, possibly {@code null} if never set */
    public String getContents() {
        return contents;
    }

    /** @param contents the contents */
    public void setContents(String contents) {
        this.contents = contents;
    }
}
- package com.hkrt.dao;
- import com.hkrt.domain.LuceneSearchResult;
- import com.hkrt.domain.News;
- public interface NewsLuceneDao {
- public static final String FIELD_ID="id";
- public static final String FIELD_TITLE = "title";
- public static final String FIELD_CONTENTS = "contents";
- // 索引存放目录
- public static final String INDEX_DIR = Thread.currentThread().getContextClassLoader().getResource("").getPath()+"index_dir";
- /**
- * 对所有文件进行重新索引
- */
- public void rebuildAllIndex();
- /**
- * 对指定上传文件对象进行索引并追加到已有的索引文件中
- * @param news
- */
- public void doIndexSingle(News news);
- /**
- * 根据关键字搜索,返回符合条件的分页数据
- * @param keyword 关键字
- * @param pageNo 起始页
- * @param pageSize 每页要显示的记录数
- * @return LuceneSearchResult对象
- */
- public LuceneSearchResult<News> doSeacher(String keyword, int pageNo,int pageSize);
- /**
- * 更新文件的索引
- * @param news
- */
- public void updateIndex(News news);
- /**
- * 根据文件id删除索引
- * @param id
- */
- public void deleteIndex(Integer id);
- }
- package com.hkrt.dao.impl;
- import java.io.File;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.queryParser.MultiFieldQueryParser;
- import org.apache.lucene.queryParser.ParseException;
- import org.apache.lucene.search.BooleanClause;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.search.highlight.Highlighter;
- import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
- import org.apache.lucene.search.highlight.QueryScorer;
- import org.apache.lucene.search.highlight.SimpleFragmenter;
- import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.apache.lucene.util.Version;
- import com.hkrt.dao.NewsDao;
- import com.hkrt.dao.NewsLuceneDao;
- import com.hkrt.domain.LuceneSearchResult;
- import com.hkrt.domain.News;
- public class NewsLuceneDaoImpl implements NewsLuceneDao {
- private NewsDao newsDao;
- /** 获取语法解析器 */
- public Analyzer getAnalyzer() {
- return new StandardAnalyzer(Version.LUCENE_30);
- }
- /** 打开索引的存放目录 */
- public Directory openDirectory() {
- try {
- System.out.println(new File(INDEX_DIR) + "-------打开索引--------------");
- return FSDirectory.open(new File(INDEX_DIR));
- } catch (IOException e) {
- e.printStackTrace();
- }
- return null;
- }
- /** 对文件的指定属性映射成域,返回文件文档对象 */
- public Document createForumuploadDocument(News news) {
- Document doc = new Document(); // 创建一个文档对象
- //id 域
- Field field = new Field(FIELD_ID,String.valueOf(news.getId()),Field.Store.YES, Field.Index.NOT_ANALYZED);
- doc.add(field);
- // title域
- Field field1 = new Field(FIELD_TITLE, String.valueOf(news.getTitle()),Field.Store.YES, Field.Index.ANALYZED);
- doc.add(field1);
- // content域
- Field field2 = new Field(FIELD_CONTENTS, String.valueOf(news.getContents()), Field.Store.YES, Field.Index.ANALYZED);
- doc.add(field2);
- return doc;
- }
- public void deleteIndex(Integer id) {
- IndexReader ir = null;
- try {
- ir = IndexReader.open(this.openDirectory(), false); //打开指定目录下索引文件的索引读取器
- ir.deleteDocuments(new Term(FIELD_ID,String.valueOf(id))); //删除符合条件的Document
- } catch (IOException e) {
- e.printStackTrace();
- }finally{
- if(ir != null){
- try {
- ir.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- @Override
- public void doIndexSingle(News news) {
- //创建索引写入器
- IndexWriter indexWriter = null;
- try {
- indexWriter = new IndexWriter(openDirectory(), getAnalyzer(),false, IndexWriter.MaxFieldLength.UNLIMITED);
- Document doc = this.createForumuploadDocument(news);
- indexWriter.addDocument(doc);
- indexWriter.optimize(); // 对索引进行优化
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if (indexWriter != null) {
- indexWriter.close(); // 关闭IndexWriter,把内存中的数据写到文件
- }
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- @Override
- public LuceneSearchResult<News> doSeacher(String keyword, int pageNo,int pageSize) {
- LuceneSearchResult<News> lsr = new LuceneSearchResult<News>();
- lsr.setPageNo(pageNo);
- lsr.setPageSize(pageSize);
- lsr.setKeyword(keyword);
- IndexSearcher searcher = null;
- try {
- // 创建一个索引搜索器
- searcher = new IndexSearcher(this.openDirectory(), true);
- // 用多域查询解析器来创建一个查询器,
- Query query = MultiFieldQueryParser.parse(Version.LUCENE_30,keyword, new String[] { FIELD_TITLE, FIELD_CONTENTS },
- new BooleanClause.Occur[] {BooleanClause.Occur.SHOULD,BooleanClause.Occur.SHOULD }, this.getAnalyzer());
- long begin = System.currentTimeMillis();
- // 查询结集信息类
- TopDocs ts = searcher.search(query, null, 100000);
- // 获取命中的数量
- lsr.setRecordCount(ts.totalHits);
- // 用这个进行高亮显示,默认是<b>..</b>
- SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style=color:red>", "</span>");
- // 构造高亮:指定高亮的格式,指定查询评分
- Highlighter highlighter = new Highlighter(simpleHTMLFormatter,new QueryScorer(query));highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
- // 获取匹配到的结果集
- ScoreDoc[] hits = ts.scoreDocs;
- List<News> ais = new ArrayList<News>();
- int pageCount = (lsr.getRecordCount() + pageSize - 1) / pageSize; // 总页数
- int start = 0; // 要开始返回的文档编号
- int end = 0; // 要结束返回的文档编号
- if (pageCount > 0) {
- start = (pageNo - 1) * pageSize;
- end = start + pageSize;
- if (pageNo == pageCount) { // 处理最后一页的结束文档的编号
- end = start + (lsr.getRecordCount() % pageSize);
- }
- }
- if (start < end) {
- lsr.setStratNo(start + 1);
- lsr.setEndNo(end);
- }
- for (int i = start; i < end; i++) { // 循环获取分页数据
- // 通过内部编号从搜索器中得到对应的文档
- Document doc = searcher.doc(hits[i].doc);
- News news = new News();
- news.setTitle(doc.getField(FIELD_TITLE).stringValue());
- news.setContents(doc.getField(FIELD_CONTENTS).stringValue());
- // 处理文件名称的高亮显示问题
- String title = doc.getField(FIELD_TITLE).stringValue();
- String title2 = highlighter.getBestFragment(this.getAnalyzer(),FIELD_TITLE, title);
- if (title2 == null) {
- news.setTitle(title);
- } else {
- news.setTitle(title2);
- }
- // 文件描述高亮显示
- String contents1 = doc.getField(FIELD_CONTENTS).stringValue();
- String contents2 = highlighter.getBestFragment(this.getAnalyzer(), FIELD_CONTENTS, contents1);
- if (contents2 == null) {
- news.setContents(contents1);
- } else {
- if (contents2.length() > 512) {
- news.setContents(contents2.substring(0, 512) + "...");
- } else {
- news.setContents(contents2);
- }
- }
- ais.add(news); // 把符合条件的数据添加到List
- }
- lsr.setTime((System.currentTimeMillis() - begin) / 1000.0); // 计算搜索耗时秒数
- lsr.setDatas(ais); // 把查询到的数据添加到LuceneSearchResult
- } catch (IOException e) {
- e.printStackTrace();
- } catch (ParseException e) {
- e.printStackTrace();
- } catch (InvalidTokenOffsetsException e) {
- e.printStackTrace();
- } finally {
- if (searcher != null) {
- try {
- searcher.close(); // 关闭搜索器
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- return lsr;
- }
- @Override
- public void rebuildAllIndex() {
- File file = new File(INDEX_DIR);
- if (file.exists()) {
- for (File subFile : file.listFiles()) {
- subFile.delete();
- }
- } else {
- file.mkdirs();
- }
- List<News> data = this.newsDao.findAll();
- IndexWriter indexWriter = null;
- try {
- indexWriter = new IndexWriter(this.openDirectory(), getAnalyzer(),true, IndexWriter.MaxFieldLength.UNLIMITED);
- // 设置打开使用复合文件
- // indexWriter.setUseCompoundFile(true);
- int size = data == null ? 0 : data.size();
- for (int i = 0; i < size; i++) {
- News news = data.get(i);
- Document doc = createForumuploadDocument(news);
- indexWriter.addDocument(doc);
- if (i % 20 == 0) {
- indexWriter.commit();
- }
- }
- indexWriter.optimize(); // 对索引进行优化
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if (indexWriter != null) {
- indexWriter.close();// 关闭IndexWriter,把内存中的数据写到文件
- }
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- @Override
- public void updateIndex(News news) {
- this.deleteIndex(news.getId());
- this.doIndexSingle(news);
- }
- public NewsDao getNewsDao() {
- return newsDao;
- }
- public void setNewsDao(NewsDao newsDao) {
- this.newsDao = newsDao;
- }
- }
对查询结果进行分页处理
- package com.hkrt.domain;
- import java.util.List;
/**
 * One page of search results together with the paging and timing figures
 * shown on the result page.
 *
 * @param <T> type of the records on the page
 */
public class LuceneSearchResult<T> {

    /** Keyword that was searched for. */
    private String keyword;

    /** Current page; starts at 1. */
    private int pageNo = 1;

    /** Number of records shown per page; 5 unless changed. */
    private int pageSize = 5;

    /** Total number of records. */
    private int recordCount;

    /** Number of the first record on this page. */
    private int stratNo;

    /** Number of the last record on this page. */
    private int endNo;

    /** Time taken by the search. */
    private double time;

    /** Records of the current page. */
    private List<T> datas;

    // ----- keyword -----

    public String getKeyword() {
        return this.keyword;
    }

    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    // ----- paging -----

    public int getPageNo() {
        return this.pageNo;
    }

    public void setPageNo(int pageNo) {
        this.pageNo = pageNo;
    }

    public int getPageSize() {
        return this.pageSize;
    }

    public void setPageSize(int pageSize) {
        this.pageSize = pageSize;
    }

    public int getRecordCount() {
        return this.recordCount;
    }

    public void setRecordCount(int recordCount) {
        this.recordCount = recordCount;
    }

    public int getStratNo() {
        return this.stratNo;
    }

    public void setStratNo(int stratNo) {
        this.stratNo = stratNo;
    }

    public int getEndNo() {
        return this.endNo;
    }

    public void setEndNo(int endNo) {
        this.endNo = endNo;
    }

    // ----- timing and data -----

    public double getTime() {
        return this.time;
    }

    public void setTime(double time) {
        this.time = time;
    }

    public List<T> getDatas() {
        return this.datas;
    }

    public void setDatas(List<T> datas) {
        this.datas = datas;
    }
}
以上代码已经实现了对 News 建立索引,以及按关键字进行搜索。
lucene3.0.1 中需要的jar 包
建立索引:
搜索页面数据展示
<%-- Search result view: a summary line followed by one table row per news item on the page. --%>
<%-- The keyword comes from the user, so it is written through c:out to escape HTML. --%>
<div class="title">搜索结果:搜索关键字【<c:out value="${lsr.keyword}"/>】,共搜索到【${lsr.recordCount }】个文件,耗时:${lsr.time}秒,当前显示${lsr.stratNo}—${lsr.endNo}记录</div>
<table width="100%" height="92" border="0" cellpadding="0" cellspacing="1">
  <c:forEach items="${lsr.datas}" var="news">
    <tr>
      <td height="30" colspan="6" align="left" bgcolor="#f2f2f2" class="left_txt">
        ${news.id}
      </td>
      <%-- title and contents are written unescaped: they carry the highlight <span> markup. --%>
      <td height="30" colspan="6" align="left" bgcolor="#f2f2f2" class="left_txt">
        ${news.title}
      </td>
      <td height="30" colspan="6" align="left" bgcolor="#f2f2f2" class="left_txt">${news.contents}</td>
    </tr>
  </c:forEach>
</table>