一、Lucene 基础知识介绍:http://lym6520.iteye.com/category/82172
二、以下对最近使用 Lucene 的经验做一个总结:
为了使 Lucene 创建的索引文件能够及时与数据库中的数据保持同步,这里使用了 Quartz 进行任务调度,具体可查看:
http://wuquanyin1011.iteye.com/admin/blogs/745382
下面是一个由任务调度定时执行的 Lucene 索引创建任务。
以下给出重建索引的大致实现,各字段的索引策略来自建模端的配置:
package com.fdauto.bws.business.module.lucene.index.job;
import java.io.File;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.quartz.Job;
import org.quartz.JobDataMap;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import com.fdauto.bws.business.datasource.DataSource;
import com.fdauto.bws.business.datasource.DataSourceHelper;
import com.fdauto.bws.common.logger.SystemLogsHelper;
import com.fdauto.bws.service.config.BWSConfigHelper;
/**
* 任务调度定时创建索引
*
* @author wu_quanyin(09817)
* @version 1.0
* @date 2010-7-16 上午10:54:01
*/
public class LuceneIndexJob implements Job {
private String indexParentDir = BWSConfigHelper.getBWSConfig()
.getProperties().getProperty("indexDir");
@SuppressWarnings("unchecked")
public void execute(JobExecutionContext context)
throws JobExecutionException {
JobDataMap jobDataMap = context.getJobDetail().getJobDataMap();
String sql = jobDataMap.getString("sql");
String indexDir = jobDataMap.getString("indexFileDir");
String dataSourceName = jobDataMap.getString("dataSource");
if (sql == null || indexDir == null || dataSourceName == null) {
SystemLogsHelper.error("索引数据集中:sql语句-->" + sql + "\n" + "索引文件:-->"
+ indexDir + "\n" + "数据源-->" + dataSourceName + "都不能为空!");
}
// 指定父目录
indexDir = indexParentDir + "/" + indexDir;
File indexFile = new File(indexDir);
if (!indexFile.exists()) {
indexFile.mkdirs();
}
// 获取字段索引策略
ColumnIndexStrategy columnIndexStrategy = new ColumnIndexStrategy();
// 删除sql字段后,,对查询字段进行处理
jobDataMap.remove("sql");
jobDataMap.remove("dataSource");
jobDataMap.remove("indexFileDir");
Set<Map.Entry<String, String>> columnSet = jobDataMap.entrySet();
for (Iterator<Map.Entry<String, String>> iter = columnSet.iterator(); iter
.hasNext();) {
Map.Entry<String, String> columnEntry = iter.next();
String columnKey = columnEntry.getKey();
String columnValue = columnEntry.getValue();
String[] strategys = columnValue.split(",");
if (strategys.length == 3) {
columnIndexStrategy.add(columnKey.toUpperCase(), strategys[0]
.toUpperCase(), strategys[1].toUpperCase(),
strategys[2], 1);
} else if (strategys.length == 4) {
columnIndexStrategy.add(columnKey.toUpperCase(), strategys[0]
.toUpperCase(), strategys[1].toUpperCase(),
strategys[2], Integer.parseInt(strategys[3]));
}
}
IndexWriter indexWriter = null;
Connection conn = null;
PreparedStatement ps = null;
ResultSet rs = null;
try {
DataSource dataSource = DataSourceHelper
.getDataSource(dataSourceName);
conn = dataSource.getConnection();
LuceneIndex luceneIndex = LuceneIndexFactory.getLuceneIndex();
indexWriter = luceneIndex.getIndexWriter(indexDir, true);
ps = conn.prepareStatement(sql);
rs = ps.executeQuery();
ResultSetMetaData rsmd = rs.getMetaData();
while (rs.next()) {
Document doc = new Document();
for (int i = 1; i <= rsmd.getColumnCount(); i++) {
String columnName = rsmd.getColumnName(i).toUpperCase();
if (!jobDataMap.containsKey(columnName)) {
continue;
}
String columnValue = SQLDataType.requireValueByColumnType(
rs, rsmd.getColumnType(i), i, false);
if (columnValue == null || columnValue.trim().length() == 0) {
continue;
}
if ("HTML".equalsIgnoreCase(columnIndexStrategy
.getFieldContentType(columnName))
&& columnIndexStrategy.getFieldIndex(columnName) != Index.NO) {
columnValue = filterHtmlLable(columnValue);
}
// ----判断如果是检索不分词时,,值转换为小写
if (columnIndexStrategy.getFieldIndex(columnName) == Index.NOT_ANALYZED) {
columnValue = columnValue.toLowerCase();
}
Field f = new Field(
columnName,// 名称
columnValue,// 值
// 对每一个字段执行不同的索引策略
columnIndexStrategy.getFieldStore(columnName),
columnIndexStrategy.getFieldIndex(columnName));
int boost = columnIndexStrategy.getBoost(columnName);
if (boost > 0)
f.setBoost(boost);
doc.add(f);
}
indexWriter.addDocument(doc);
}
//将索引的信息打印到控制台上。
if(SystemLogsHelper.isDebugger()){
System.out.println("index infos--------------------->");
indexWriter.setInfoStream(System.out);
}
indexWriter.optimize();
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
if (indexWriter != null)
indexWriter.close();
} catch (Exception e) {
SystemLogsHelper.trace("Could not close Lucene IndexWriter", e);
}
try {
if (rs != null)
rs.close();
} catch (Exception e) {
SystemLogsHelper.trace("Could not close JDBC ResultSet", e);
}
try {
if (ps != null)
ps.close();
} catch (Exception e) {
SystemLogsHelper.trace("Could not close JDBC Statement", e);
}
try {
if (conn != null)
conn.close();
} catch (Exception e) {
SystemLogsHelper.trace("Could not close JDBC Connection", e);
}
}
}
/**
* 对有字段标签先过滤再全文检索
*
* @param field
* 含有html标签的字段
* @return
*/
private static String filterHtmlLable(String field) {
StringBuffer result = new StringBuffer();
try {
String body = field;
Parser nodesParser = Parser.createParser(body, "UTF-8");
NodeFilter textFilter = new NodeClassFilter(TextNode.class);
NodeList nodeList = nodesParser.parse(textFilter);
Node[] nodes = nodeList.toNodeArray();
for (int i = 0; i < nodes.length; i++) {
Node nextNode = (Node) nodes[i];
String content = "";
if (nextNode instanceof TextNode) {
TextNode textnode = (TextNode) nextNode;
content = textnode.getText();
}
result.append(" ");
result.append(content);
}
} catch (Exception e) {
e.printStackTrace();
}
field = result.toString();
if (StringHelper.isEmpty(field))
return field;
// field = field.replaceAll("</p(?:\\s*)>(?:\\s*)<p(?:\\s*)>", "\n\n");
// field = field.replaceAll("<br(?:\\s*)/>", "\n");
// field = field.replaceAll("\"", "''");
field = field.replaceAll("<[^>]+>?", "");
return field;
}
public static void main(String[] args) {
System.out
.println(LuceneIndexJob
.filterHtmlLable("<p class=MsoNormal style=\"MARGIN: 0cm 0cm 0pt; TEXT-INDENT: 28pt; TEXT-ALIGN: left; mso-layout-grid-align: none\" align=left><span style=\"FONT-SIZE: 9pt; FONT-FAMILY: 宋体; mso-ascii-font-family: 'Times New Roman'; mso-hansi-font-family: 'Times New Roman'\">在主仓库内备货,按库存金额的</span><span lang=EN-US style=\"FONT-SIZE: 9pt\"><font face=\"Times New Roman\">0.1</font></span><span style=\"FONT-SIZE: 9pt; FONT-FAMILY: 宋体; mso-ascii-font-family: 'Times New Roman'; mso-hansi-font-family: 'Times New Roman'\">‰每天计算占库费用,在其他仓库则需按仓库库存金额的</span><span lang=EN-US style=\"FONT-SIZE: 9pt\"><font face=\"Times New Roman\">0.12</font></span><span style=\"FONT-SIZE: 9pt; FONT-FAMILY: 宋体; mso-ascii-font-family: 'Times New Roman'; mso-hansi-font-family: 'Times New Roman'\">‰计算占库费。</span><span lang=EN-US style=\"FONT-SIZE: 9pt\"><?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" /></span></p>"));
}
}
数据库类型判断:
/**
 * Converts JDBC column values to the string form used for indexing.
 */
public class SQLDataType {
    /**
     * Reads one column of the current row of {@code rs} as a string.
     *
     * <ul>
     * <li>{@code BOOLEAN}: {@code "TRUE"} or {@code "FALSE"}.</li>
     * <li>{@code TIMESTAMP}: {@code Timestamp.toString()}.</li>
     * <li>{@code BLOB} / {@code CLOB}: the stored content is Base64-decoded
     * and turned into a string using the platform default charset.</li>
     * <li>anything else: {@code ResultSet.getString}.</li>
     * </ul>
     *
     * <p>NOTE(review): LuceneIndexJob calls this method with a fourth boolean
     * argument; that overload is not part of this class as shown - confirm
     * which signature is the current one.
     *
     * @param rs          result set positioned on the row to read
     * @param columnType  the column's type, one of {@link Types}
     * @param columnIndex 1-based column index
     * @return the column value as a string, or {@code null} when a
     *         TIMESTAMP, BLOB, CLOB or default-branch column is SQL NULL
     * @throws SQLException if the driver fails to read the column
     */
    public static String requireValueByColumnType(ResultSet rs, int columnType,
            int columnIndex) throws SQLException {
        switch (columnType) {
        case Types.BOOLEAN:
            return rs.getBoolean(columnIndex) ? "TRUE" : "FALSE";
        case Types.TIMESTAMP: {
            java.sql.Timestamp timestamp = rs.getTimestamp(columnIndex);
            return timestamp == null ? null : timestamp.toString();
        }
        case Types.BLOB: {
            Blob b = rs.getBlob(columnIndex);
            if (b == null) {
                return null;
            }
            int length = (int) b.length();
            if (length == 0) {
                return "";
            }
            // Blob positions are 1-based: the first byte is at position 1.
            byte[] blobs = b.getBytes(1, length);
            return new String(Base64.decodeBase64(blobs));
        }
        case Types.CLOB: {
            String encoded = rs.getString(columnIndex);
            if (encoded == null) {
                return null;
            }
            return new String(Base64.decodeBase64(encoded.getBytes()));
        }
        default:
            return rs.getString(columnIndex);
        }
    }
}
Lucene进行查询:
/**
 * Lucene search over an index built from database rows.
 *
 * <p>The query combines per-field parameter constraints with an optional
 * free-text term (parameter {@code __q}) that is searched across all
 * configured fields, applies the store's sort rules and, when a unique field
 * is configured, wraps the query for duplicate filtering.
 */
public class DatabaseSearch extends SearchSupport {
    /** Name of the parameter carrying the free-text search term. */
    private static final String __SEARCHTEXT = "__q";

    /** Name of the score pseudo-field, which is not an index field. */
    private static final String SCORE_FIELD = "LuceneScore";

    /** Search type used when a field does not declare one (OR strategy). */
    private static final String DEFAULT_SEARCH_TYPE = "SEARCH_OR";

    // "=" means exact match, anything else (e.g. "like") means fuzzy match.
    private String operator = "like";

    // Copy of the query taken before duplicate filtering; used for highlighting.
    private Query useHighLightQuery;

    /**
     * Builds and executes the search.
     *
     * @param pageSize    page size; values {@code <= 0} fall back to 100 hits
     * @param pageIndex   page index; when positive, {@code pageSize * pageIndex}
     *                    hits are requested
     * @param ps          search parameters; missing ones may be filled in from
     *                    the store's SQL match rules
     * @param luceneStore field, match-rule and sort-rule configuration
     * @return the top documents matching the query
     * @throws NoKeywordsException if neither a field parameter nor the
     *         free-text term produced a query clause
     * @throws Exception if parsing or searching fails
     */
    @Override
    public TopDocs execSearch(int pageSize, int pageIndex, Parameters ps,
            LuceneStore luceneStore) throws Exception {
        // First field flagged as unique, if any.
        String uniqueField = "";

        // Collect the searchable fields and their clauses. Lists are used so
        // that skipping the score field can neither leave a hole in the arrays
        // nor overrun them, wherever that field sits in the field set.
        FieldSet fieldSet = luceneStore.getFields();
        ArrayList<String> indexFieldList = new ArrayList<String>();
        ArrayList<BooleanClause.Occur> clauseList = new ArrayList<BooleanClause.Occur>();
        for (int i = 0; i < fieldSet.fieldSize(); i++) {
            LuceneField luceneField = (LuceneField) fieldSet.getField(i);
            String fieldName = luceneField.getName();
            if (fieldName.equalsIgnoreCase(SCORE_FIELD))
                continue;
            String searchType = luceneField.getSearchType();
            if (searchType == null) {
                searchType = DEFAULT_SEARCH_TYPE;
            }
            indexFieldList.add(fieldName);
            clauseList.add(ColumnSearchStrategy.getClause(searchType
                    .toUpperCase()));
            if (luceneField.isUnique() && uniqueField.equals("")) {
                uniqueField = fieldName;
            }
        }
        String[] indexFields = indexFieldList
                .toArray(new String[indexFieldList.size()]);
        BooleanClause.Occur[] clauses = clauseList
                .toArray(new BooleanClause.Occur[clauseList.size()]);

        // Apply the SQL match rules: pick up the operator of the free-text
        // rule and fill in parameters that were not supplied by the caller.
        ArrayList<MatchRule> matchRules = luceneStore.getMatchRules();
        DefaultExprResolver der = new DefaultExprResolver();
        for (Iterator<MatchRule> itr = matchRules.iterator(); itr.hasNext();) {
            MatchRule matchRule = itr.next();
            if (matchRule instanceof SQLMatchRule) {
                SQLMatchRule sqlMatchRule = (SQLMatchRule) matchRule;
                if (__SEARCHTEXT.equalsIgnoreCase(sqlMatchRule.getName())) {
                    this.operator = sqlMatchRule.getOperator().trim();
                }
                if (ps.exists(sqlMatchRule.getName()))
                    continue;
                if (StringHelper.isNotEmpty(ps
                        .getString(sqlMatchRule.getName())))
                    continue;
                if (luceneStore.getFields().getField(sqlMatchRule.getName()) == null)
                    continue;
                ps.setDataType(sqlMatchRule.getName(), sqlMatchRule
                        .getDataType());
                ps.setValue(sqlMatchRule.getName(), QLExpressHelper.getRunner()
                        .execute(
                                (String) der.evaluate(null, sqlMatchRule
                                        .getRightSide()), null, null, false,
                                false));
            }
        }

        boolean addQuery = false;
        Query query = new BooleanQuery();

        // Every non-empty parameter that names a field becomes a MUST clause.
        for (int i = 0, ilen = ps.count(); i < ilen; ++i) {
            String name = ps.indexToName(i);
            if (name.equalsIgnoreCase(__SEARCHTEXT))
                continue;
            if (luceneStore.getFields().getField(name) == null)
                continue;
            String value = ps.getString(i);
            if (StringHelper.isEmpty(value))
                continue;
            Query q1 = BWSQueryParser.parseMultiField(new String[] { name },
                    value, new BooleanClause.Occur[] { Occur.SHOULD }, false);
            if (q1 != null) {
                ((BooleanQuery) query).add(q1, BooleanClause.Occur.MUST);
                addQuery = true;
            }
        }

        // Free-text term searched across all index fields.
        String queryValue = ps.getString(__SEARCHTEXT);
        if (!StringHelper.isEmpty(queryValue)) {
            // Exact match: the value is not analyzed again ("and" semantics).
            boolean exactMatch = operator.equalsIgnoreCase("=");
            if (!exactMatch) {
                // Fuzzy match ("or" semantics): additionally append the term as
                // a quoted phrase to raise the weight of full matches.
                StringBuilder buffers = new StringBuilder();
                buffers.append(queryValue);
                buffers.append(" ");
                buffers.append("\"");
                buffers.append(queryValue.replaceAll("[\\s| ]+", "-"));
                buffers.append("\"");
                queryValue = buffers.toString();
            }
            Query q2 = BWSQueryParser.parseMultiField(indexFields, queryValue,
                    clauses, exactMatch);
            ((BooleanQuery) query).add(q2, BooleanClause.Occur.MUST);
            addQuery = true;
        }
        if (!addQuery)
            throw new NoKeywordsException("查询关键字不能为空!");

        // Number of hits to fetch: everything up to the end of the requested page.
        int topSize = 100;
        if (pageSize > 0) {
            if (pageIndex > 0) {
                topSize = pageSize * pageIndex;
            } else
                topSize = pageSize;
        }

        // Translate the store's sort rules into a Lucene Sort.
        Sort sort = null;
        ArrayList<SortField> sortFields = new ArrayList<SortField>();
        for (Iterator<SortRule> itr = luceneStore.getSortRules().iterator(); itr
                .hasNext();) {
            SortRule sortRule = itr.next();
            Field f = luceneStore.getFields().getField(sortRule.getFieldName());
            if (f != null) {
                sortFields.add(new SortField(f.getName(), SortField.STRING,
                        sortRule.isDescent()));
            }
        }
        if (sortFields.size() > 0) {
            SortField[] sortArray = new SortField[sortFields.size()];
            sort = new Sort(sortFields.toArray(sortArray));
        }

        // The duplicate-filtering wrapper below causes problems when used for
        // highlighting, so keep a copy of the plain query first.
        setUseHighLightQuery((Query) query.clone());
        if (StringHelper.isNotEmpty(uniqueField)) {
            // This filter only works in combination with DuplicateQuery.
            DuplicateExtendFilter filter = new DuplicateExtendFilter(
                    uniqueField);
            query = new DuplicateQuery(query, filter);
        }
        if (sort == null)
            return getIndexSearcher().search(query, null, topSize);
        else
            return getIndexSearcher().search(query, null, topSize, sort);
    }

    /**
     * @return the query to use for highlighting, as captured by the last
     *         {@link #execSearch} call
     */
    public Query getUseHighLightQuery() {
        return useHighLightQuery;
    }

    /**
     * @param useHighLightQuery the query to use for highlighting
     */
    public void setUseHighLightQuery(Query useHighLightQuery) {
        this.useHighLightQuery = useHighLightQuery;
    }
}
将以上两个类与公司的建模端结合:根据建模端所配置的不同参数,创建不同的任务(Job),建立不同的索引。