java lucene_Lucene使用入门

依赖:

lucene-analyzers.jar

lucene-benchmark.jar

lucene-core.jar

lucene-highlighter.jar

lucene-memory.jar

lucene-parser.jar

lucene-remote.jar

lucene-smartcn.jar

实体类:

package com.h3c.lucence;

import java.io.Serializable;

/**
 * Searchable entity used by the Lucene demo.
 *
 * <p>Carries an id, a type, a "virtual document" (the text that is indexed for
 * full-text search), the best-matching summary fragment produced by the
 * highlighter, and the relevance score of the hit.
 */
public class Entity implements Serializable {

    private static final long serialVersionUID = 3701082756628915138L;

    /**
     * Markup the highlighter puts around matched terms; removed from the summary
     * before the summary is located inside the plain virtual document.
     * NOTE(review): both literals are empty in this listing, so stripping is a
     * no-op. They look like markup that was lost when the article was published;
     * confirm against the tags given to the highlighter's formatter.
     */
    private static final String HIGHLIGHT_PRE_TAG = "";

    private static final String HIGHLIGHT_POST_TAG = "";

    /** Marker added where the summary is cut off from the surrounding text. */
    private static final String ELLIPSIS = "...";

    private Integer id;

    private String type;

    private String virtualDoc;

    private String summary;

    private float score;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    /**
     * Returns the text that is indexed for full-text search.
     *
     * @return the virtual document, or {@code null} if none has been set
     */
    public String getVirtualDoc() {
        if (null == virtualDoc) {
            // TODO build the virtual document from this entity's values, covering
            // every attribute and its value, so it can be used for full-text search.
            // Format: field1:value1,field2:value2,...
        }
        return virtualDoc;
    }

    public void setVirtualDoc(String virtualDoc) {
        this.virtualDoc = virtualDoc;
    }

    /**
     * Returns the summary fragment, decorated with {@code "..."} on each side
     * where the fragment does not reach the start or end of the virtual document.
     *
     * <p>The summary is returned undecorated when there is no virtual document or
     * when the fragment cannot be located inside it, because in those cases there
     * is no way to tell whether text was cut off.
     *
     * @return the decorated summary, or {@code null} if no summary has been set
     */
    public String getSummary() {
        if (summary == null) {
            return null;
        }
        String doc = getVirtualDoc();
        if (doc == null) {
            return summary;
        }

        // Compare on plain text: the stored summary may contain highlight markup.
        String plain = summary.replace(HIGHLIGHT_PRE_TAG, "").replace(HIGHLIGHT_POST_TAG, "");
        int start = doc.indexOf(plain);
        if (start < 0) {
            // Not found: do not feed -1 into the end-of-fragment arithmetic below.
            return summary;
        }

        StringBuilder sb = new StringBuilder();
        if (start > 0) {
            sb.append(ELLIPSIS);
        }
        sb.append(summary);
        if (start + plain.length() < doc.length()) {
            sb.append(ELLIPSIS);
        }
        return sb.toString();
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }

}

Demo类:

package com.h3c.lucence;

import java.io.Closeable;

import java.io.File;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.BooleanClause;

import org.apache.lucene.search.BooleanQuery;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.PrefixQuery;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TermQuery;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.search.WildcardQuery;

import org.apache.lucene.search.highlight.Highlighter;

import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;

import org.apache.lucene.search.highlight.QueryScorer;

import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import org.apache.lucene.search.highlight.SimpleSpanFragmenter;

import org.apache.lucene.search.highlight.TokenSources;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

public class Demo {

/** lucene索引目录 */

private static Directory ciIndexDir;

private static final String CI_CONTENT_FLAG = "virtualDoc";

/** 分词分析工具,使用标准分析工具,单个含字和连续的英文单词作为索引。 */

private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

private static Pattern VALID_IPV4_PATTERN = null;

private static Pattern VALID_IPV6_PATTERN = null;

private static final String ipv4Pattern = "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.){3}([01]?\\d\\d?|2[0-4]\\d|25[0-5])";

private static final String ipv6Pattern = "([0-9a-f]{1,4}:){7}([0-9a-f]){1,4}";

private static IndexWriter indexWriter;

static {

VALID_IPV4_PATTERN = Pattern.compile(ipv4Pattern, Pattern.CASE_INSENSITIVE);

VALID_IPV6_PATTERN = Pattern.compile(ipv6Pattern, Pattern.CASE_INSENSITIVE);

IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);

conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

try {

indexWriter = new IndexWriter(getCiIndexDir(), conf);

} catch (IOException e) {

e.printStackTrace();

}

}

private static Directory getCiIndexDir() {

if (null == ciIndexDir) {

try {

ciIndexDir = FSDirectory.open(new File("D://indexs"));

} catch (IOException e) {

e.printStackTrace();

}

}

return ciIndexDir;

}

private static boolean isIpAddress(String ipAddress) {

Matcher m1 = VALID_IPV4_PATTERN.matcher(ipAddress);

Matcher m2 = VALID_IPV6_PATTERN.matcher(ipAddress);

return m1.matches() || m2.matches();

}

private static boolean isChinese(char c) {

Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);

if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS

|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B

|| ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS

|| ub == Character.UnicodeBlock.GENERAL_PUNCTUATION) {

return true;

}

return false;

}

private static BooleanQuery parseChineseCharacters(String inputString){

BooleanQuery query = new BooleanQuery();

if(isIpAddress(inputString)){

query.add(new TermQuery(new Term(CI_CONTENT_FLAG,inputString)), BooleanClause.Occur.MUST);

return query;

}

BooleanQuery fieldQuery = new BooleanQuery();

boolean isWord = false;

StringBuilder tempWord = new StringBuilder();

inputString = inputString.toLowerCase();

BooleanQuery booleanQuery = new BooleanQuery();

int length = inputString.length();

Query termQuery = null;

for(int i=0; i

char c = inputString.charAt(i);

if(c >= 'a' && c <= 'z' || c >= '0' && c <= '9'){//English character

isWord = true;

tempWord.append(c);

}

else{//Delimiter or Chinese character

isWord = false;

if(tempWord.length() > 0){

termQuery = new PrefixQuery(new Term(CI_CONTENT_FLAG,tempWord.toString()));

// booleanQuery.add(termQuery,BooleanClause.Occur.MUST);

booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);

tempWord = new StringBuilder();

}

}

if(!isWord){

termQuery = new TermQuery(new Term(CI_CONTENT_FLAG,String.valueOf(c)));

if(isChinese(c)){//Chinese character

// booleanQuery.add(termQuery,BooleanClause.Occur.MUST);

booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);

}

else{//Delimiter

booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);

}

}

}

if(tempWord.length() > 0){

termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,tempWord.toString()+"*"));

booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);

termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + tempWord.toString()));

booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);

}

// Begin 处理全局字段匹配

termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,inputString+"*"));

booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);

termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + inputString));

booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);

termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + inputString + "*"));

booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);

// End 处理全局字段匹配

BooleanClause clause = new BooleanClause(booleanQuery, BooleanClause.Occur.MUST);

fieldQuery.add(clause);

BooleanClause fieldClause = new BooleanClause(fieldQuery, BooleanClause.Occur.MUST);

query.add(fieldClause);

return query;

}

/**

* 全文检索

* @param queryStr

* @throws Exception

*/

private static void contentSearch(String queryStr, boolean highlight) throws Exception {

IndexReader indexReader = null;

IndexSearcher indexSearcher = null;

try {

indexReader = IndexReader.open(getCiIndexDir());

indexSearcher = new IndexSearcher(indexReader);

//组合查询条件,需要根据业务自己定义

Query query = parseChineseCharacters(queryStr);

TopDocs hits = indexSearcher.search(query, Integer.MAX_VALUE);

if(hits.totalHits > 0) {

if (highlight) {

QueryScorer scorer = new QueryScorer(query, CI_CONTENT_FLAG);

SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("", "");

Highlighter highlighter = new Highlighter(formatter, scorer);

highlighter

.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));

for (ScoreDoc scoreDoc : hits.scoreDocs) {

Document doc = indexSearcher.doc(scoreDoc.doc);

System.out.println(doc.get("virtualDoc"));

Entity entity = null;

entity = convertToEntity(doc, indexSearcher.getIndexReader(), scoreDoc.doc, highlighter);

entity.setScore(scoreDoc.score);

}

} else {

for (ScoreDoc scoreDoc : hits.scoreDocs) {

Document doc = indexSearcher.doc(scoreDoc.doc);

System.out.println(doc.get("virtualDoc"));

Entity entity = null;

entity = convertToEntity(doc);

entity.setScore(scoreDoc.score);

}

}

}

} catch (IOException ioe) {

ioe.printStackTrace();

} finally {

close(indexSearcher);

close(indexReader);

}

}

/**

* 对实现Closeable接口的统一关闭

* @param object

*/

private static void close(Closeable object) {

if(null != object) {

try {

object.close();

} catch (IOException e) {

}

}

}

/**

* 实体转换为Doc

* @param entity

* @return

*/

public static Document convertToDocument(Entity entity) {

Document doc = new Document();

String virtualDoc = entity.getVirtualDoc();

//Field.Store.Yes存储,Field.Index.ANALYZED分词

doc.add(new Field("id", String.valueOf(entity.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED));

doc.add(new Field("type", entity.getType(), Field.Store.YES, Field.Index.NOT_ANALYZED));

doc.add(new Field(CI_CONTENT_FLAG, null == virtualDoc ? " " : virtualDoc, Field.Store.YES, Field.Index.ANALYZED));

return doc;

}

/**

* Doc转换为实体

* @param doc

* @return

*/

public static Entity convertToEntity(Document doc) {

Entity ci = new Entity();

ci.setId(Integer.valueOf(doc.get("id")));

ci.setType(doc.get("type"));

ci.setVirtualDoc(doc.get(CI_CONTENT_FLAG));

return ci;

}

/**

* 检索Entity,含高亮信息

* @param doc

* @param indexReader

* @param docId

* @param highlighter

* @return

* @throws IOException

* @throws InvalidTokenOffsetsException

*/

public static Entity convertToEntity(Document doc, IndexReader indexReader, int docId, Highlighter highlighter)

throws IOException, InvalidTokenOffsetsException {

Entity entity = convertToEntity(doc);

String virtualDoc = entity.getVirtualDoc();

TokenStream stream = TokenSources.getAnyTokenStream(indexReader, docId, CI_CONTENT_FLAG, doc, analyzer);

String highlighterSummary = highlighter.getBestFragment(stream, virtualDoc);

if(highlighterSummary == null){

highlighterSummary = virtualDoc;

}

entity.setSummary(highlighterSummary);

return entity;

}

/**

* 给entity信息增加索引

* @param entity

*/

public static void addIndex(Entity entity) {

try {

deleteIndex(entity);

Document doc = convertToDocument(entity);

indexWriter.addDocument(doc);

indexWriter.commit();

} catch (Exception e) {

e.printStackTrace();

}

}

/**

* 批量增加索引

* @param list

*/

public static void addIndexs(List list) {

try {

List docs = new ArrayList();

deleteIndexs(list);

for (Entity entity : list) {

Document doc = convertToDocument(entity);

docs.add(doc);

}

indexWriter.addDocuments(docs);

indexWriter.commit();

} catch (Exception e) {

e.printStackTrace();

}

}

/**

* 给实体信息更新索引

* @param entity

*/

public static void updateIndex(Entity entity) {

try {

addIndex(entity);

} catch (Exception e) {

e.printStackTrace();

}

}

/**

* 删除entity列表信息对应的索引

* @param entity

*/

public static void deleteIndexs(List list) {

try {

int size = list.size();

Term[] terms = new Term[size];

for(int i=0; i

terms[i] = new Term("id", list.get(i).getId().toString());

}

indexWriter.deleteDocuments(terms);

indexWriter.commit();

} catch (Exception e) {

e.printStackTrace();

}

}

/**

* 删除实体信息对应的索引

* @param entity

*/

public static void deleteIndex(Entity entity) {

try {

indexWriter.deleteDocuments(new Term("id", entity.getId().toString()));

indexWriter.commit();

} catch (Exception e) {

e.printStackTrace();

}

}

/**

* 删除实体类型对应的所以索引信息

* @param type

*/

public static void deleteIndexByType(String type) {

try {

indexWriter.deleteDocuments(new Term("type", type));

indexWriter.commit();

} catch (Exception e) {

e.printStackTrace();

}

}

@Override

protected void finalize() throws Throwable {

indexWriter.close();

}

public static void main(String[] args) throws Exception {

String queryStr = "http://mail6c1.shenzhenair.com";

contentSearch(queryStr, true);

}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值