Lucene 4.0 Study Notes

This article walks through adding and deleting index entries in Lucene 4.0, including configuring a sensible merge policy, querying the index, and tracking task status in a database. It also shows how records are wrapped into index documents, how the index is queried with paging, and a set of general query methods, illustrating Lucene's use in information retrieval.
[b][size=large]Methods for Adding and Deleting Index Entries:[/size][/b]

/**
 * Build the merge policy used when writing the index.
 *
 * @return a configured LogMergePolicy
 */
private static LogMergePolicy optimizeIndex()
{
    LogMergePolicy mergePolicy = new LogByteSizeMergePolicy();

    // Merge factor: how many segments accumulate before they are merged.
    // A small value slows indexing down but keeps the index compact;
    // a large value (> 10) speeds indexing up and suits batch builds.
    // Here segments are merged once 50 of them have accumulated.
    mergePolicy.setMergeFactor(50);

    // Maximum number of documents a merged segment may hold.
    // A small value favors incremental (append) indexing speed;
    // a large value suits batch indexing and faster searches.
    mergePolicy.setMaxMergeDocs(5000);

    // Use the compound file format, packing each segment into a single file.
    mergePolicy.setUseCompoundFile(true);
    return mergePolicy;
}
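
// Note (my addition, not part of the original code): Lucene 4.0 removed
// IndexWriter.optimize(); the closest equivalent is forceMerge(int).
// A minimal sketch of merging a mostly-static index down to one segment
// after a large batch of updates:
//
//     writer.forceMerge(1);  // expensive; avoid on frequently updated indexes
//     writer.commit();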

private void Index()
{
    Directory directory = null;
    IndexWriter writer = null;
    File indexDir = new File(index_path);
    try
    {
        directory = FSDirectory.open(indexDir);
        IKAnalyzer analyzer = new IKAnalyzer();
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        indexWriterConfig.setMergePolicy(optimizeIndex());
        indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        writer = new IndexWriter(directory, indexWriterConfig);

        // Fetch the pending index tasks from the database.
        SWResultList indexList = new SWResultList(0, 1);
        SWInfoFactory.getInfo().listIndex(" order by idx_id", " and cz_status in ('0') ", indexList);
        int delCount = 0;
        int createCount = 0;
        for (Object idx : indexList.getMydata())
        {
            // Each row comes back as a map.
            Map<?, ?> idxMap = (Map<?, ?>) idx;
            if (idxMap.get("cz_flag").toString().equals("1"))
            {
                // Add to the index.
                Document document = InfoDocument.document("", idxMap.get("cz_id").toString());
                if (document == null)
                {
                    setStatus("9", idxMap.get("idx_id").toString());
                    logger.error("Record to be indexed does not exist, info_code=" + idxMap.get("cz_id").toString());
                } else
                {
                    try
                    {
                        setStatus("1", idxMap.get("idx_id").toString());
                        writer.addDocument(document);
                        setStatus("8", idxMap.get("idx_id").toString());
                        createCount++;
                    } catch (Exception e)
                    {
                        setStatus("9", idxMap.get("idx_id").toString());
                    }
                }
            } else
            {
                // Delete from the index.
                Term term = new Term("Infocode", idxMap.get("cz_id").toString());
                try
                {
                    setStatus("1", idxMap.get("idx_id").toString());
                    writer.deleteDocuments(term);
                    setStatus("8", idxMap.get("idx_id").toString());
                    delCount++;
                } catch (Exception e)
                {
                    setStatus("9", idxMap.get("idx_id").toString());
                }
            }
        }
        logger.info("Added " + createCount + " index entries.");
        logger.info("Deleted " + delCount + " index entries.");
    } catch (IOException e)
    {
        e.printStackTrace();
    } finally
    {
        // Close the writer in finally so it is released even on failure
        // (the original closed it inside try, leaking it on exceptions).
        if (writer != null)
        {
            try { writer.close(); } catch (IOException e) { e.printStackTrace(); }
        }
    }
}

// Update the processing status of an index task.
// Status codes as used above (inferred from the calls): 0 = pending,
// 1 = in progress, 8 = done, 9 = failed.
private void setStatus(String status, String idxid)
{
    this.dao.update("update cec_idx set cz_status=" + status + " where idx_id='" + idxid + "'");
}
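
// Note (my addition): building SQL by string concatenation, as above, is open
// to SQL injection. A hedged sketch using a parameterized statement, assuming
// the dao can hand out a JDBC Connection (getConnection() is hypothetical,
// not part of the original code):
//
//     try (PreparedStatement ps = dao.getConnection().prepareStatement(
//             "update cec_idx set cz_status=? where idx_id=?")) {
//         ps.setString(1, status);
//         ps.setString(2, idxid);
//         ps.executeUpdate();
//     }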

public boolean isRun()
{
    return run;
}

[size=large][b]Implementing Index Creation[/b][/size]

/**
 * <ul>
 * <li>Project: ***</li>
 * <li>Purpose: wrap a record into an index document</li>
 * </ul>
 * @author Administrator
 * @version $Id: InfoDocument.java, 2013-7-22 4:52:01 PM Administrator Exp $
 */
public class InfoDocument
{
    public static Document document(String cz_table, String cz_id)
    {
        // Load the record to be indexed into a map.
        Map info = SWInfoFactory.getInfo().getInfo(cz_table, cz_id);
        if (info == null)
        {
            return null;
        }
        Document doc = new Document();

        // Stored, indexed, and tokenized: for full-text fields.
        FieldType ft = new FieldType();
        ft.setStored(true);    // store the original value
        ft.setIndexed(true);   // make it searchable
        ft.setTokenized(true); // run it through the analyzer

        // Stored and indexed, but not tokenized: for exact-match fields (ids, timestamps).
        FieldType ft2 = new FieldType();
        ft2.setStored(true);
        ft2.setIndexed(true);
        ft2.setTokenized(false);

        Field ci_id = new Field("ci_id", info.get("ci_id") == null ? "" : info.get("ci_id").toString(), ft2);
        Field ci_title = new Field("ci_title", info.get("ci_title") == null ? "" : info.get("ci_title").toString(), ft);
        Field ci_title2 = new Field("ci_title2", info.get("ci_title2") == null ? "" : info.get("ci_title2").toString(), ft);
        Field ci_ctime = new Field("ci_ctime", info.get("ci_ctime") == null ? "" : info.get("ci_ctime").toString(), ft2);
        Field ci_keys = new Field("ci_keys", info.get("ci_keys") == null ? "" : info.get("ci_keys").toString(), ft);

        Field ci_content = new Field("ci_content", info.get("ci_content") == null ? "" : info.get("ci_content").toString(), ft);
        Field cf_title = new Field("cf_title", info.get("cf_title") == null ? "" : info.get("cf_title").toString(), ft);
        Field s_cc_id = new Field("s_cc_id", info.get("s_cc_id") == null ? "" : info.get("s_cc_id").toString(), ft2);
        Field s_cc_name = new Field("s_cc_name", info.get("s_cc_name") == null ? "" : info.get("s_cc_name").toString(), ft);
        Field cc_supid = new Field("cc_supid", info.get("cc_supid") == null ? "" : info.get("cc_supid").toString(), ft2);

        Field f_cc_id = new Field("f_cc_id", info.get("f_cc_id") == null ? "" : info.get("f_cc_id").toString(), ft2);
        Field f_cc_name = new Field("f_cc_name", info.get("f_cc_name") == null ? "" : info.get("f_cc_name").toString(), ft);

        doc.add(ci_id);
        doc.add(ci_title);
        doc.add(ci_title2);
        doc.add(ci_ctime);
        doc.add(ci_keys);

        doc.add(ci_content);
        doc.add(cf_title);
        doc.add(s_cc_id);
        doc.add(s_cc_name);
        doc.add(cc_supid);

        doc.add(f_cc_id);
        doc.add(f_cc_name);

        return doc;
    }
}
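
In Lucene 4.x the two hand-rolled FieldType configurations above match the ready-made TextField (stored/indexed/tokenized) and StringField (stored/indexed/not tokenized) classes. A minimal sketch of the equivalent field construction, assuming the same info map (my rewrite, not the original code):

    Document doc = new Document();
    // StringField: exact-match fields such as ids and timestamps (like ft2 above).
    doc.add(new StringField("ci_id", info.get("ci_id") == null ? "" : info.get("ci_id").toString(), Field.Store.YES));
    // TextField: analyzed full-text fields (like ft above).
    doc.add(new TextField("ci_title", info.get("ci_title") == null ? "" : info.get("ci_title").toString(), Field.Store.YES));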

[size=large][b]Querying the Index[/b][/size]

public class SearchInfoutile
{
    private static String path = ConfigManager.getInstance().getConfigItem("cms.infoindex.path", "./eqinfoindex").toString();

    public static Map getInfoContent(String sareacode, int page)
    {
        Map map = new HashMap();
        map.put("resultlist", null);
        map.put("totalpage", 0);    // total pages
        map.put("totalpagenum", 0); // total hits

        int searchRows = 7; // rows per page

        int totalpage = 0;
        int iStartRow = (page - 1) * searchRows;
        int iEndRow = page * searchRows;

        List list = new ArrayList();
        Directory directory = null;
        IndexSearcher isearcher = null;
        String[] searchFields = { "ci_title", "ci_title2", "ci_keys", "ci_content", "cf_title", "s_cc_name", "f_cc_name" };

        try
        {
            // Collect enough hits to cover every page up to the requested one.
            TopScoreDocCollector collector = TopScoreDocCollector.create(searchRows * page, false);
            File indexDir = new File(path);
            directory = FSDirectory.open(indexDir);
            IKAnalyzer analyzer = new IKAnalyzer();

            IndexReader indexReader = DirectoryReader.open(directory);
            isearcher = new IndexSearcher(indexReader);
            BooleanQuery booleanQuery = new BooleanQuery();

            if (sareacode != null && !"".equals(sareacode))
            {
                // A search keyword was given: parse it across all search fields.
                QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_40, searchFields, analyzer);
                Query q1 = parser.parse(sareacode);
                booleanQuery.add(q1, BooleanClause.Occur.MUST);
            } else
            {
                // No keyword: match everything by creation time up to now.
                Date date = new Date();
                SimpleDateFormat time = new SimpleDateFormat("yyyyMMddHHmmss");
                String times = time.format(date);
                BytesRef lowerTerm = new BytesRef("19000101010100");
                BytesRef upperTerm = new BytesRef(times);
                TermRangeQuery query = new TermRangeQuery("ci_ctime", lowerTerm, upperTerm, true, true);
                booleanQuery.add(query, BooleanClause.Occur.MUST);
            }

            isearcher.search(booleanQuery, collector);
            int numTotalHits = collector.getTotalHits();
            iStartRow = Math.min(numTotalHits, iStartRow);
            iEndRow = Math.min(numTotalHits, iEndRow);
            TopDocs topDocs = collector.topDocs(iStartRow, iEndRow - iStartRow);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;

            list = toList(isearcher, scoreDocs);

            // Round the page count up.
            if (numTotalHits % searchRows == 0)
            {
                totalpage = numTotalHits / searchRows;
            } else
            {
                totalpage = numTotalHits / searchRows + 1;
            }

            map.put("resultlist", list);
            map.put("totalpage", totalpage);       // total pages
            map.put("totalpagenum", numTotalHits); // total hits
        } catch (IOException e)
        {
            e.printStackTrace();
        } catch (ParseException e)
        {
            e.printStackTrace();
        } finally
        {
            // Release the reader and directory (the original never closed them).
            if (isearcher != null)
            {
                try { isearcher.getIndexReader().close(); } catch (IOException e) { e.printStackTrace(); }
            }
            if (directory != null)
            {
                try { directory.close(); } catch (IOException e) { e.printStackTrace(); }
            }
        }
        return map;
    }

    // Convert the matched documents into a list of maps.
    public static List toList(IndexSearcher isearcher, ScoreDoc[] scoreDocs)
    {
        List list = new ArrayList();
        for (int i = 0; i < scoreDocs.length; i++)
        {
            Map map = new HashMap();
            try
            {
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);

                map.put("ci_id", targetDoc.get("ci_id"));
                map.put("ci_title", targetDoc.get("ci_title"));
                map.put("ci_title2", targetDoc.get("ci_title2"));
                map.put("ci_ctime", targetDoc.get("ci_ctime"));
                map.put("ci_keys", targetDoc.get("ci_keys"));

                map.put("ci_content", targetDoc.get("ci_content"));
                map.put("s_cc_id", targetDoc.get("s_cc_id"));
                map.put("s_cc_name", targetDoc.get("s_cc_name"));
                map.put("cc_supid", targetDoc.get("cc_supid"));

                map.put("f_cc_id", targetDoc.get("f_cc_id"));
                map.put("f_cc_name", targetDoc.get("f_cc_name"));

                list.add(map);
            } catch (CorruptIndexException e)
            {
                e.printStackTrace();
            } catch (IOException e)
            {
                e.printStackTrace();
            }
        }

        return list;
    }

    // Returns yyyyMMdd000000 for the given date shifted by a days.
    public static String torealtime(Date date, int a)
    {
        Calendar cal = Calendar.getInstance();
        cal.setTime(date);
        cal.add(Calendar.DATE, a);
        String starttime = (new SimpleDateFormat("yyyyMMdd")).format(cal.getTime()) + "000000";
        return starttime;
    }
}
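
Opening a new IndexReader for every query, as getInfoContent does above, is expensive. Lucene 4.0 ships SearcherManager to share one searcher across requests and refresh it after the index changes. A minimal sketch under that assumption (the wiring around it is mine, not from the original code):

    // Created once at startup:
    SearcherManager mgr = new SearcherManager(FSDirectory.open(new File(path)), new SearcherFactory());

    // Per query:
    IndexSearcher s = mgr.acquire();
    try {
        // ... run the search with s ...
    } finally {
        mgr.release(s);
    }

    // After the Index() job rewrites the index:
    mgr.maybeRefresh();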


[size=large][b]General Lucene Query Methods[/b][/size]


/**
 * Search method demonstrating several Lucene query types.
 * @throws IOException
 * @throws CorruptIndexException
 * @throws ParseException
 * @throws InvalidTokenOffsetsException
 */
public List Search(String searchString, LuceneResultCollector luceneResultCollector) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException {
    // Option 1: parse the query string against several fields at once.
    //String[] fields = {"fileName", "fieldid", "date"};
    //MultiFieldQueryParser is a QueryParser over multiple fields.
    //QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_40, fields, this.indexSettings.getAnalyzer());
    //Query query = parser.parse(searchString);

    // Option 2: a single-term query.
    //Term t = new Term("fileName", searchString);
    //Query query = new TermQuery(t);
    //String[] searchFields = {"Infotitle", "Linkaddr", "Skuname", "Skudetaile"};

    // Option 3 (used here): a BooleanQuery combining clauses.
    BooleanQuery query = new BooleanQuery();
    // Search keyword:
    /*QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_40, searchFields, this.indexSettings.getAnalyzer());
    Query q1 = parser.parse("苹果");
    query.add(q1, BooleanClause.Occur.MUST);*/

    // Which module the document belongs to:
    Term t2 = new Term("fileName", "java宝典");
    Query q2 = new TermQuery(t2);
    query.add(q2, BooleanClause.Occur.SHOULD);
    Term t2s = new Term("fileName", "java神器");
    Query q2s = new TermQuery(t2s);
    query.add(q2s, BooleanClause.Occur.SHOULD);

    // Supply/demand type:
    /*Term t3 = new Term("Infotype", "2");
    Query q3 = new TermQuery(t3);
    query.add(q3, BooleanClause.Occur.MUST);*/
    /*
    // Region code:
    Term t4 = new Term("Areacode", "");
    PrefixQuery q4 = new PrefixQuery(t4);
    query.add(q4, BooleanClause.Occur.MUST);

    // Product category:
    Term t5 = new Term("Infocateg", "");
    PrefixQuery q5 = new PrefixQuery(t5);
    query.add(q5, BooleanClause.Occur.MUST);*/
    /*Term t1 = new Term("fileName", "*");
    TermQuery q1 = new TermQuery(t1);

    Term t2 = new Term("date", "");
    PrefixQuery q2 = new PrefixQuery(t2);*/

    //Term t3 = new Term("fileName", "java");
    //PrefixQuery q3 = new PrefixQuery(t3);

    //query.add(q1, BooleanClause.Occur.MUST);
    //query.add(q3, BooleanClause.Occur.MUST);
    //query.add(q2, BooleanClause.Occur.MUST);

    // Range search (TermRangeQuery).
    // When the range is wide and many documents match, it can be slow.
    //Date date = new Date();
    //SimpleDateFormat time = new SimpleDateFormat("yyyyMMddHHmmss");
    //String times = time.format(date);
    //BytesRef lowerTerm = new BytesRef("19000101010100");
    //BytesRef upperTerm = new BytesRef(times);
    //TermRangeQuery query = new TermRangeQuery("Pubtime", lowerTerm, upperTerm, true, true);

    // Fuzzy search (FuzzyQuery).
    // In Lucene 4.0 the second argument is maxEdits (0-2, smaller = stricter);
    // the 3.x float minimumSimilarity (default 0.5) is gone.
    // prefixLength is how many leading characters must match exactly:
    // with 1, only terms whose first character matches the keyword are candidates.
    //FuzzyQuery fd = new FuzzyQuery(term, maxEdits);
    //FuzzyQuery fd = new FuzzyQuery(term, maxEdits, prefixLength);
    //FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term("fileName", "java"), 0);

    // Prefix search: matches terms starting with the given characters.
    //PrefixQuery query2 = new PrefixQuery(term);

    // Phrase search: the phrase is split into terms; conceptually like a
    // BooleanQuery whose MUST clauses take the intersection, but with positions.
    // setSlop(int) sets the allowed gap: the default 0 requires an exact phrase;
    // setSlop(1) allows one unrelated term between the phrase terms.
    //PhraseQuery query2 = new PhraseQuery();
    //query2.add(term);
    //query2.setSlop(1);

    // Multi-phrase search (rarely needed, but it exists):
    //MultiPhraseQuery query2 = new MultiPhraseQuery();

    // Wildcard search:
    //Term term = new Term("filed", "?o*");
    //WildcardQuery query2 = new WildcardQuery(term);

    // Span search:
    //SpanTermQuery query2 = new SpanTermQuery(term);

    // Sorted search:
    //Sort sort = new Sort();
    //SortField sortField = new SortField("fileid", FieldCache.DEFAULT_INT_PARSER, false);
    //sort.setSort(sortField);
    //ScoreDoc[] docs = this.indexSearcher.search(query, 100, sort).scoreDocs;

    ScoreDoc[] docs = this.indexSearcher.search(query, 100).scoreDocs;
    System.out.println("Total: " + docs.length + " hits");
    List result = luceneResultCollector.collect(docs, this.indexSearcher, query, this.indexSearcher, this.indexSettings.getAnalyzer());
    return result;
}
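
For reference, a short sketch assembling three of the commented-out query types in their Lucene 4.0 form (field names and values are illustrative, taken from the examples above):

    BooleanQuery q = new BooleanQuery();
    // Fuzzy: up to 2 edits away from "java", first character fixed.
    q.add(new FuzzyQuery(new Term("fileName", "java"), 2, 1), BooleanClause.Occur.SHOULD);
    // Phrase: "java" followed by "宝典" with at most one term in between.
    PhraseQuery phrase = new PhraseQuery();
    phrase.add(new Term("fileName", "java"));
    phrase.add(new Term("fileName", "宝典"));
    phrase.setSlop(1);
    q.add(phrase, BooleanClause.Occur.SHOULD);
    // Wildcard: any term starting with "jav".
    q.add(new WildcardQuery(new Term("fileName", "jav*")), BooleanClause.Occur.SHOULD);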