java lucene 分词,Lucene分词与查询 | 学步园

0_1319016815zxAr.gif

0_1319016368RPlp.gif

package com.demo.ajax;

/**
 * Plain bean describing one building record.
 *
 * <p>Instances are created by {@code InitTool}, indexed by {@code AnalyzerTool}
 * and rebuilt from index hits by {@code LuceneSearchTool}.
 */
public class Building
{
    /** Identifier of the building; {@code null} until set. */
    private Integer id;

    /** Display name of the building; {@code null} until set. */
    private String name;

    /** Free-text description of the building; {@code null} until set. */
    private String information;

    /**
     * @return the identifier, or {@code null} if none has been set
     */
    public Integer getId()
    {
        return id;
    }

    /**
     * @param id the identifier to store
     */
    public void setId(Integer id)
    {
        this.id = id;
    }

    /**
     * @return the display name, or {@code null} if none has been set
     */
    public String getName()
    {
        return name;
    }

    /**
     * @param name the display name to store
     */
    public void setName(String name)
    {
        this.name = name;
    }

    /**
     * @return the description text, or {@code null} if none has been set
     */
    public String getInformation()
    {
        return information;
    }

    /**
     * @param information the description text to store
     */
    public void setInformation(String information)
    {
        this.information = information;
    }
}

package com.demo.ajax;

import java.util.ArrayList;

import java.util.List;

/**
 * Produces the sample {@link Building} data that the indexing demo writes
 * into the Lucene index.
 */
public class InitTool
{
    /**
     * Creates the sample buildings.
     *
     * <p>One building is created for every id from 60 (inclusive) to 100
     * (exclusive). Each gets the id followed by "号楼" as its name and the
     * fixed description "总统套间".
     *
     * @return a new, mutable list holding the 40 sample buildings in
     *         ascending id order
     */
    public static List<Building> initBuilding()
    {
        // Parameterized so callers can read elements as Building without casts.
        List<Building> list = new ArrayList<Building>();
        for (int i = 60; i < 100; i++)
        {
            Building building = new Building();
            building.setId(i);
            building.setName(i + "号楼");
            building.setInformation("总统套间");
            list.add(building);
        }
        return list;
    }
}

package com.demo.ajax;

import java.io.File;

import java.net.URLDecoder;

import java.util.Iterator;

import java.util.List;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.store.FSDirectory;

/**
 * Builds the on-disk Lucene index for the sample buildings supplied by
 * {@code InitTool}.
 */
public class AnalyzerTool
{
    /** Directory in which the index files are stored. */
    private static final String INDEX_PATH = "d:\\data\\index";

    /** Upper bound on the number of buildings echoed by the first preview loop. */
    private static final int PREVIEW_LIMIT = 30;

    /**
     * Creates (overwriting any existing one) the index under
     * {@link #INDEX_PATH} and adds one document per sample building.
     *
     * <p>Each document stores three fields: {@code id} (stored, not
     * tokenized), {@code building_name} and {@code building_information}
     * (both stored and tokenized with the Paoding analyzer).
     *
     * @throws Exception if the index cannot be opened, written or closed
     */
    public void createIndex() throws Exception
    {
        // Chinese word-segmentation analyzer.
        Analyzer analyzer = new PaodingAnalyzer();

        // Make sure the target directory exists before handing it to Lucene.
        File file = new File(INDEX_PATH);
        if (!file.exists())
        {
            file.mkdirs();
        }
        FSDirectory directory = FSDirectory.getDirectory(INDEX_PATH);

        // "true" overwrites an existing index; "false" would append to it.
        IndexWriter writer = new IndexWriter(directory, analyzer, true);
        try
        {
            List<Building> list = InitTool.initBuilding();

            // Echo the first entries by index; capped at the list size so a
            // shorter list cannot cause an IndexOutOfBoundsException.
            int previewCount = Math.min(PREVIEW_LIMIT, list.size());
            for (int i = 0; i < previewCount; i++)
            {
                Building building = list.get(i);
                System.out.println(building.getId()+"-------------->"+building.getName()+"---------->"+building.getInformation());
            }

            // Echo every entry.
            for (Building building : list)
            {
                System.out.println(building.getId()+"-------------->"+building.getName()+"---------->"+building.getInformation());
            }

            // One document per building.
            for (Building building : list)
            {
                Document doc = new Document();
                doc.add(new Field("id", String.valueOf(building.getId()), Field.Store.YES,
                        Field.Index.UN_TOKENIZED));
                doc.add(new Field("building_name", building.getName(), Field.Store.YES,
                        Field.Index.TOKENIZED));
                doc.add(new Field("building_information", building.getInformation(), Field.Store.YES,
                        Field.Index.TOKENIZED));
                writer.addDocument(doc);
            }

            writer.optimize();
        }
        finally
        {
            // Always release the writer (and its lock on the index directory),
            // even when indexing fails part-way through.
            writer.close();
        }
    }

    /**
     * Command-line entry point: builds the index once.
     *
     * @param args ignored
     * @throws Exception if index creation fails
     */
    public static void main(String[] args) throws Exception
    {
        AnalyzerTool analyzerTool = new AnalyzerTool();
        analyzerTool.createIndex();
    }
}

package com.demo.ajax;

import java.io.StringReader;

import java.util.ArrayList;

import java.util.List;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryParser.MultiFieldQueryParser;

import org.apache.lucene.search.BooleanClause;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.Searcher;

import org.apache.lucene.search.highlight.Fragmenter;

import org.apache.lucene.search.highlight.Highlighter;

import org.apache.lucene.search.highlight.QueryScorer;

import org.apache.lucene.search.highlight.SimpleFragmenter;

import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import org.apache.lucene.store.FSDirectory;

/**
 * Searches the building index written by {@code AnalyzerTool} and converts
 * the hits back into {@link Building} objects with highlighted text.
 */
public class LuceneSearchTool
{
    /** Result of the most recent successful search (initially empty). */
    List<Building> searcheResult = new ArrayList<Building>();

    /**
     * @return the list produced by the most recent successful search, or the
     *         list last passed to {@link #setSearcheResult(List)}
     */
    public List<Building> getSearcheResult()
    {
        return searcheResult;
    }

    /**
     * @param searcheResult the list to expose through {@link #getSearcheResult()}
     */
    public void setSearcheResult(List<Building> searcheResult)
    {
        this.searcheResult = searcheResult;
    }

    /**
     * Searches the index stored under {@code d:\data\index}.
     *
     * @param keywords the query text
     * @return the matching buildings, in the order Lucene returned them
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    public List<Building> search(String keywords) throws Exception
    {
        String path = "d:\\data\\index";
        return searchIndex(path, keywords);
    }

    /**
     * Searches the {@code building_name} and {@code building_information}
     * fields of the index at {@code path}.
     *
     * <p>Every call builds a fresh result list, so hits from earlier searches
     * on the same instance are not carried over. The list is also stored so
     * that {@link #getSearcheResult()} returns it afterwards.
     *
     * @param path     directory containing the index
     * @param keywords the query text
     * @return the matching buildings; name and information are replaced by
     *         the highlighter's best fragment when one is available
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    public List<Building> searchIndex(String path, String keywords) throws Exception
    {
        FSDirectory directory = FSDirectory.getDirectory(path);
        IndexReader reader = IndexReader.open(directory);
        try
        {
            // Reuse the reader instead of opening the index a second time.
            Searcher searcher = new IndexSearcher(reader);
            try
            {
                // Fields to query.
                String[] field = { "building_name", "building_information" };
                // A match in either field is enough.
                BooleanClause.Occur[] flags = new BooleanClause.Occur[] {
                        BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
                // Same analyzer type as the one used when the index was built.
                Analyzer analyzer = new PaodingAnalyzer();
                Query query = MultiFieldQueryParser.parse(keywords, field, flags,
                        analyzer);

                // The highlighter depends only on the query, so build it once
                // rather than once per hit.
                // NOTE(review): both formatter tags are empty strings, so
                // highlighted terms carry no markup - confirm the intended tags.
                SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(
                        "", "");
                Highlighter highlighter = new Highlighter(formatter,
                        new QueryScorer(query));
                // Fragment size passed to SimpleFragmenter.
                Fragmenter fragmenter = new SimpleFragmenter(120);
                highlighter.setTextFragmenter(fragmenter);

                List<Building> results = new ArrayList<Building>();
                Hits hits = searcher.search(query);
                for (int i = 0; i < hits.length(); i++)
                {
                    Document doc = hits.doc(i);
                    Building building = new Building();
                    building.setId(Integer.valueOf(doc.get("id")));
                    building.setName(highlight(highlighter, analyzer,
                            "building_name", doc.get("building_name")));
                    building.setInformation(highlight(highlighter, analyzer,
                            "building_information", doc.get("building_information")));
                    results.add(building);
                }

                searcheResult = results;
                return results;
            }
            finally
            {
                searcher.close();
            }
        }
        finally
        {
            // Runs on failure paths too, so the index files are always released.
            reader.close();
        }
    }

    /**
     * Returns the highlighter's best fragment of {@code text}, or {@code text}
     * itself when it is {@code null} or no fragment is produced.
     */
    private static String highlight(Highlighter highlighter, Analyzer analyzer,
            String fieldName, String text) throws Exception
    {
        if (text == null)
        {
            // Nothing to tokenize; StringReader would reject null.
            return null;
        }
        TokenStream tokenStream = analyzer.tokenStream(fieldName,
                new StringReader(text));
        String fragment = highlighter.getBestFragment(tokenStream, text);
        return fragment != null ? fragment : text;
    }
}

package com.demo.ajax;

import java.util.List;

/**
 * Manual smoke test: runs one search against the building index and prints
 * the hits. Despite its name, this class does not use the JUnit framework.
 */
public class Junit
{
    /**
     * Searches for "号楼" and prints the hit count followed by one line per hit.
     *
     * @param args ignored
     * @throws Exception if the search fails
     */
    public static void main(String[] args) throws Exception
    {
        LuceneSearchTool luceneSearchTool = new LuceneSearchTool();
        // Parameterized so the loop below can read elements as Building.
        List<Building> searcheResult = luceneSearchTool.search("号楼");
        System.out.println(searcheResult.size());
        for (Building building : searcheResult)
        {
            System.out.println(building.getId()+"------------->"+building.getName()+"-------------->"+building.getInformation());
        }
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值