目录结构
一,lucene的索引工具类
- package com.hwt.lucene.index;
- import java.io.File;
- import java.io.IOException;
- import java.util.List;
- import net.paoding.analysis.analyzer.PaodingAnalyzer;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.apache.lucene.util.Version;
- /**
- * lucene的索引工具类
- *
- * @author 黄文韬
- *
- */
- public class IndexUtils {
- // 庖丁解牛分词器(单例)
- private static Analyzer ANALYZER = new PaodingAnalyzer();
- // 索引的路径
- private static final String indexPath = "WebRoot/lucene/index";
- /**
- * 得到庖丁解牛分词器
- *
- * @return
- */
- public static Analyzer getAnalyzer() {
- return ANALYZER;
- }
- /**
- * 得到路径对象
- *
- * @param path 相对路径
- * @return
- */
- public static Directory getDirectory(String path) {
- Directory directory = null;
- try {
- directory = FSDirectory.open(new File(path));
- } catch (IOException e) {
- e.printStackTrace();
- }
- return directory;
- }
- /**
- * 得到读索引类
- * @return
- */
- public static IndexReader getIndexReader() {
- IndexReader reader = null;
- try {
- reader = IndexReader.open(getDirectory(indexPath));
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return reader;
- }
- /**
- * 得到些索引类
- * @return
- */
- public static IndexWriter getIndexWriter() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(getDirectory(indexPath),
- new IndexWriterConfig(Version.LUCENE_36, ANALYZER));
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return writer;
- }
- /**
- * 得到索引搜索类
- * @return
- */
- public static IndexSearcher getIndexSearcher() {
- IndexSearcher searcher = null;
- try {
- searcher = new IndexSearcher(getIndexReader());
- } catch (Exception e) {
- e.printStackTrace();
- }
- return searcher;
- }
- /**
- * 创建索引
- *
- * @param result
- */
- public static void createIndex(List<IndexField> result) {
- // 得到输出索引类
- IndexWriter indexWriter = null;
- // 索引类
- try {
- indexWriter = getIndexWriter();
- Document doc = new Document();
- for (IndexField findx : result) {
- // 是否存储:Store.YES/Store.NO
- // 是否分词:
- // Index.ANALYZED/Index.NOT_ANALYZED/Index.NO/Index.ANALYZED_NO_NORMS
- doc.add(new Field(findx.getFieldName(), findx.getFieldValue(),
- findx.getFieldStore(), findx.getFieldAnalyzed()));
- }
- indexWriter.addDocument(doc);
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- // 关闭writer
- indexWriter.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * 优化索引
- */
- public static void mergeIndex() {
- IndexWriter indexWriter = null;
- // 强制优化索引
- try {
- indexWriter = getIndexWriter();
- indexWriter.forceMerge(1);
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- indexWriter.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * 更新所有
- * @param fields 新的document字段信息
- * @param term 需要替换的查找条件
- */
- public static void updateIndex(List<IndexField> fields, Term term) {
- // 得到输出索引类
- IndexWriter indexWriter = null;
- // 索引类
- try {
- indexWriter = getIndexWriter();
- Document doc = new Document();
- // 是否存储:Store.YES/Store.NO
- // 是否分词:
- // Index.ANALYZED/Index.NOT_ANALYZED/Index.NO/Index.ANALYZED_NO_NORMS
- for (IndexField field : fields) {
- doc.add(new Field(field.getFieldName(), field.getFieldValue(),
- field.getFieldStore(), field.getFieldAnalyzed()));
- }
- indexWriter.updateDocument(term, doc, ANALYZER);
- indexWriter.forceMerge(1);
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- // 关闭writer
- indexWriter.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * 删除全部索引文件
- */
- public static void deleteAll() {
- IndexWriter writer = null;
- try {
- writer = getIndexWriter();
- writer.deleteAll();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * 根据条件删除索引
- * @param term 条件
- */
- public static void delete(Term term) {
- IndexWriter writer = null;
- IndexReader reader = getIndexReader();
- try {
- writer = getIndexWriter();
- writer.deleteDocuments(term);
- writer.forceMerge(1);
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
二,文件类型的搜索
- package com.hwt.lucene.index;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.Field.Index;
- import org.apache.lucene.document.Field.Store;
- /**
- * 文件类型的搜索
- * @author 黄文韬
- *
- */
- public class FileDocument {
- /**
- * 将文件转换为一个document对象
- * @param file 文件
- * @return
- */
- public Document fileToDocument(File file){
- Document document=new Document();
- document.add(new Field("name", file.getName(), Store.YES, Index.ANALYZED));
- document.add(new Field("content", this.readFileRetStr(file), Store.YES, Index.ANALYZED));
- return document;
- }
- /**
- * 将名字、内容字段转为document
- * @param content 内容
- * @param name 文件名字
- * @return
- */
- public Document stringToDocumet(String name,String content){
- Document document=new Document();
- document.add(new Field("name",name, Store.YES, Index.ANALYZED));
- document.add(new Field("content", content, Store.YES, Index.ANALYZED));
- return document;
- }
- /**
- * 将文件内容转为string类型
- * @param file 文件
- * @return
- */
- public String readFileRetStr(File file){
- FileInputStream fStream = null;
- String tempStr = "";
- StringBuffer sBuffer = new StringBuffer();
- try {
- fStream = new FileInputStream(file);
- BufferedReader bReader=new BufferedReader(new InputStreamReader(fStream,"UTF-8"));
- while((tempStr=bReader.readLine())!=null){
- sBuffer.append(tempStr);
- }
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- fStream.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- return sBuffer.toString();
- }
- }
三,封装索引字段类
- package com.hwt.lucene.index;
- import java.io.Serializable;
- import org.apache.lucene.document.Field.Index;
- import org.apache.lucene.document.Field.Store;
- /**
- * 封装索引字段类
- * @author hwt
- *
- */
- public class IndexField implements Serializable{
- private String fieldName;
- private String fieldValue;
- private Store fieldStore;//是否存储:Store.YES/Store.NO
- private Index fieldAnalyzed;//是否分词: Index.ANALYZED/Index.NOT_ANALYZED/Index.NO/Index.ANALYZED_NO_NORMS
- public String getFieldName() {
- return fieldName;
- }
- public void setFieldName(String fieldName) {
- this.fieldName = fieldName;
- }
- public String getFieldValue() {
- return fieldValue;
- }
- public void setFieldValue(String fieldValue) {
- this.fieldValue = fieldValue;
- }
- public Store getFieldStore() {
- return fieldStore;
- }
- public void setFieldStore(Store fieldStore) {
- this.fieldStore = fieldStore;
- }
- public Index getFieldAnalyzed() {
- return fieldAnalyzed;
- }
- public void setFieldAnalyzed(Index fieldAnalyzed) {
- this.fieldAnalyzed = fieldAnalyzed;
- }
- }
四,分页缓存类
- package com.hwt.lucene.index;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.List;
- import java.util.Map;
- import org.apache.log4j.Logger;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.Sort;
- import org.apache.lucene.search.TopDocs;
- public class CachePage {
- private static final Logger LOGGER = Logger.getLogger(CachePage.class);
- private int pageStart = 1; // 页码
- private int pageSize = 15; // 每页显示的大小
- private int pageNum = 0; //总页数
- private int totalNum = 0; //总记录条数
- private int cacheSize = 100; // 缓存大小
- private List<Document> cacheList = new ArrayList<Document>(); // 缓存列表
- /**
- * 构造方法
- * @param pageSize 每页大小
- * @param cacheSize 缓存大小
- */
- public CachePage(Integer pageSize, Integer cacheSize) {
- this.pageSize = pageSize;
- if (cacheSize != null) {
- this.cacheSize = cacheSize;
- }
- }
- /**
- * 判断是否存在缓存中
- *
- * @param page
- * 页码
- * @return
- */
- public boolean inCache(int page) {
- // 当前缓存对象的个数
- int cacheNum = cacheList.size();
- if (cacheNum > 0) {
- if (page <= 0) {
- page = 1;
- }
- // 判断当前页是不是在缓存中
- if (page >= pageStart && (page - pageStart) * pageSize <= cacheNum) {
- return true;
- } else {
- return false;
- }
- }else {
- return false;
- }
- }
- /**
- * 清空缓存
- * @param pageNum 起始页
- */
- public void refleshCache() {
- // this.isFirst = true;
- for (int i = cacheList.size() -1 ; i >= 0; i--) {
- cacheList.remove(i);
- }
- }
- /**
- * 新增缓存
- *
- * @param doc
- */
- public void addCache(Document doc) {
- if (this.cacheList.size() < cacheSize) {
- this.cacheList.add(doc);
- } else {
- LOGGER.info("缓存池已满");
- }
- }
- /**
- * 读缓存中的数据
- * @param page
- * @return
- */
- public Map readCache(int page) {
- // 判断是否存在于缓存池中
- int start = (page - pageStart) * pageSize;
- int end = start + pageSize > cacheList.size() ? cacheList.size()
- : start + pageSize;
- //缓存中的结果集
- List<Document> cacheRs = new ArrayList<Document>();
- for (int i = start; i < end; i++) {
- cacheRs.add(cacheList.get(i));
- }
- //缓存结果集
- Map resultMap = new HashMap();
- resultMap.put("currentPage", page); //当前页
- resultMap.put("totalNum", totalNum); //总记录条数
- resultMap.put("pageNum", pageNum); //总页数
- resultMap.put("list", cacheRs);
- return resultMap;
- }
- /**
- * 搜索
- * @param query query对象
- * @param sort 排序对象
- * @param page 页码
- * @return
- */
- public Map search(Query query,Sort sort,int page){
- if (page < 0) {
- page = 1;
- }
- //如果存在缓存中
- if (inCache(page)) {
- return readCache(page);
- }else {//如果不在缓存中
- IndexSearcher searcher = IndexUtils.getIndexSearcher();
- try {
- //显示条数
- int querySize = (page*pageSize / cacheSize + 1 )*100;
- //设置查询、查询显示的条数、排序对象
- TopDocs topDocs = searcher.search(query, querySize , sort);
- //总共记录条数
- int totalNum = topDocs.totalHits;
- int pageNum = totalNum % pageSize == 0 ? totalNum / pageSize : totalNum / pageSize + 1;
- if (page > pageNum) {
- page = pageNum;
- }
- //得到记录集
- ScoreDoc[] docs = topDocs.scoreDocs;
- //保存当前页的前后两页放入缓存中
- int startPage = 1;
- int endPage = 1;
- if (page < 3) { //前五页
- startPage = 1;
- endPage = startPage + 4 > pageNum ? pageNum : startPage + 4;
- }else if(page > pageNum - 2){ //后五页
- endPage = pageNum ;
- startPage = endPage - 4 < 0 ? 1 : endPage - 4;
- } else { //中间页
- startPage = page - 2 <= 0 ? 1 : page - 2;
- endPage = page + 2 > pageNum ? pageNum : page + 2;
- }
- //清空缓存
- refleshCache();
- int startSize = (startPage - 1)*pageSize ;
- int endSize = startSize + cacheSize > totalNum ? totalNum : startSize + cacheSize ;
- //将对象加入缓存中
- for (int i = startSize ; i < endSize; i++) {
- Document doc = searcher.doc(docs[i].doc);
- addCache(doc);
- }
- //替换缓存集合
- this.pageNum = pageNum;
- this.totalNum = totalNum;
- this.pageStart = startPage;
- return readCache(page);
- } catch (IOException e) {
- e.printStackTrace();
- return null;
- }
- }
- }
- public Integer getPageSize() {
- return pageSize;
- }
- public void setPageSize(Integer pageSize) {
- this.pageSize = pageSize;
- }
- public Integer getPageStart() {
- return pageStart;
- }
- public void setPageStart(Integer pageStart) {
- this.pageStart = pageStart;
- }
- public Integer getCacheSize() {
- return cacheSize;
- }
- public void setCacheSize(Integer cacheSize) {
- this.cacheSize = cacheSize;
- }
- public List<Document> getCacheList() {
- return cacheList;
- }
- public void setCacheList(List<Document> cacheList) {
- this.cacheList = cacheList;
- }
- // public boolean isFirst() {
- // return isFirst;
- // }
- //
- // public void setFirst(boolean isFirst) {
- // this.isFirst = isFirst;
- // }
- }
测试类:
- package test;
- import java.io.File;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.Map;
- import javax.print.Doc;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.cjk.CJKAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field.Index;
- import org.apache.lucene.document.Field.Store;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.queryParser.ParseException;
- import org.apache.lucene.queryParser.QueryParser;
- import org.apache.lucene.search.BooleanClause.Occur;
- import org.apache.lucene.search.BooleanQuery;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.Searcher;
- import org.apache.lucene.search.Sort;
- import org.apache.lucene.search.SortField;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.util.Version;
- import org.springframework.context.ApplicationContext;
- import org.springframework.context.support.ClassPathXmlApplicationContext;
- import com.hwt.lucene.index.CachePage;
- import com.hwt.lucene.index.IndexField;
- import com.hwt.lucene.index.IndexUtils;
- public class Test {
- public static void main(String[] args) throws IOException, ParseException {
- List<IndexField> fieldIndexs2 = new ArrayList<IndexField>();
- IndexField ind3 = new IndexField();
- ind3.setFieldName("title");
- ind3.setFieldValue("美国攻打伊朗");
- ind3.setFieldStore(Store.YES);
- ind3.setFieldAnalyzed(Index.ANALYZED);
- fieldIndexs2.add(ind3);
- IndexField ind = new IndexField();
- ind.setFieldName("content");
- ind.setFieldValue("美国派兵3333,航母出发了,中国航公出发");
- ind.setFieldStore(Store.YES);
- ind.setFieldAnalyzed(Index.ANALYZED);
- fieldIndexs2.add(ind);
- IndexField ind2 = new IndexField();
- ind2.setFieldName("Id");
- ind2.setFieldValue("12");
- ind2.setFieldStore(Store.YES);
- ind2.setFieldAnalyzed(Index.NOT_ANALYZED);
- fieldIndexs2.add(ind2);
- //创建索引
- // IndexUtils.createIndex(fieldIndexs2);
- //删除索引
- // IndexUtils.delete(new Term("Id","2"));
- //修改索引
- // IndexUtils.updateIndex(fieldIndexs2, new Term("Id","2"));
- Analyzer analyzer = IndexUtils.getAnalyzer();
- QueryParser titleParser = new QueryParser(Version.LUCENE_36,"title",analyzer);
- QueryParser contentParser = new QueryParser(Version.LUCENE_36,"content",analyzer);
- // Query contentQuery = new TermQuery(new Term("title","美国"));
- Query titleQuery = titleParser.parse("美国");
- Query contentQuery = contentParser.parse("美国");
- BooleanQuery query = new BooleanQuery();
- query.add(titleQuery, Occur.MUST);
- query.add(contentQuery,Occur.SHOULD);
- IndexSearcher searcher = IndexUtils.getIndexSearcher();
- //排序对象:排序字段,排序字段类型,是否降序(默认false升序)
- Sort sort = new Sort(new SortField("Id",SortField.INT, true));
- //对多个字段进行排序
- // Sort sort = new Sort(new SortField[]{new SortField("Id",SortField.INT, true),
- // new SortField("title",SortField.INT, true)});
- CachePage cachePage = new CachePage(1, 100);
- Map map = cachePage.search(query, sort, 1);
- System.out.println("起始页:"+ cachePage.getPageStart());
- System.out.println("总页数:"+map.get("pageNum"));
- System.out.println("总条数:"+map.get("totalNum"));
- List<Document> docs = (List<Document>) map.get("list");
- for (Document document : docs) {
- System.out.println(document.get("Id"));
- System.out.println(document.get("title"));
- System.out.println(document.get("content"));
- }
- System.out.println("+++++++++++++++++++");
- cachePage.refleshCache();
- //
- //查询缓存的
- Map map2 = cachePage.search(query, sort, 4);
- System.out.println("起始页:"+ cachePage.getPageStart());
- System.out.println("总页数:"+map2.get("pageNum"));
- System.out.println("总条数:"+map2.get("totalNum"));
- List<Document> docs2 = (List<Document>) map2.get("list");
- for (Document document : docs2) {
- System.out.println(document.get("Id"));
- System.out.println(document.get("title"));
- System.out.println(document.get("content"));
- }
- //
- // System.out.println("+++++++++++++++++++");
- // Map map3 = cachePage.search(query, sort, 5);
- // System.out.println("总页数:"+map3.get("pageNum"));
- // System.out.println("总条数:"+map3.get("totalNum"));
- // List<Document> docs4 = (List<Document>) map3.get("list");
- // for (Document document : docs4) {
- // System.out.println(document.get("Id"));
- // System.out.println(document.get("title"));
- // System.out.println(document.get("content"));
- // }
- //对对个字段进行排序
- // Sort sort = new Sort(new SortField[]{new SortField("Id",SortField.STRING, true),
- // new SortField("title",SortField.STRING, true)});
- // TopDocs docs = searcher.search(query,100,sort); //返回前100条记录
- // docs.totalHits是所有的记录条数,与上面设置的100无关
- // System.out.println("共找到"+docs.totalHits+"条记录");
- //
- // ScoreDoc[] scoreDocs = docs.scoreDocs;
- //
- // for (int i = 0,len = scoreDocs.length ; i < len; i++) {
- // System.out.println(scoreDocs[i].doc);
- // }
- // for (ScoreDoc scoreDoc : scoreDocs) {
- // int docid = scoreDoc.doc;
- // Document document = searcher.doc(docid);
- // System.out.println(document.get("Id"));
- // System.out.println(document.get("title"));
- // System.out.println(document.get("content"));
- // System.out.println("===============================");
- // }
- // IndexSearcher indexSearcher = IndexUtils.getIndexSearcher();
- //
- // TopDocs topDocs = indexSearcher.search(query, 10);
- //
- // ScoreDoc[] docs = topDocs.scoreDocs;
- // System.out.println("共找到:"+docs.length);
- // for (ScoreDoc scoreDoc : docs) {
- // int docid = scoreDoc.doc;
- // Document document = indexSearcher.doc(docid);
- // System.out.println(document.get("Id"));
- // System.out.println(document.get("title"));
- // System.out.println(document.get("content"));
- // System.out.println("===============================");
- // }
- //
- // }
- // IndexUtils.deleteAll();
- }
- }