Using Lucene Analyzers (Tokenizers)
The analyzer

The IK Analyzer configuration consists of the following files:
ext.dic (user extension dictionary, one custom word per line):

gsdgdg
IKAnalyzer.cfg.xml:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- users can configure their own extension dictionary here -->
    <entry key="ext_dict">ext.dic;</entry>
    <!-- users can configure their own extension stopword dictionary here -->
    <entry key="ext_stopwords">stopword.dic;</entry>
</properties>
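Both dictionary files are loaded from the classpath at runtime (e.g. placed under src/main/resources). If several extension dictionaries are needed, the IK convention is to list them in a single entry separated by semicolons, for example (my_words.dic is a hypothetical second file, shown only to illustrate the separator):

<entry key="ext_dict">ext.dic;my_words.dic;</entry>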

 

 

 

stopword.dic (extension stopword dictionary, one word per line):

a
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
such
that
the
their
then
there
these
they
this
to
was
will
with

 

 

 

 

The Lucene test class:

package lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

 

public class LuceneAnalyzerTest {

    String path="D:/workforce/lucene/hello"; // directory where the index files live
    Version version=Version.LUCENE_4_10_4;   // Lucene version in use

    String cn="我对什么书表示兴趣,父亲就把那部书放在我书桌上,有时他得爬梯到书橱高处去拿; 假如我长期不读,那部书就不见了---这就等于谴责。";
    String en="When I expressed interest in any book, my father put it on my desk. Sometimes he had to climb the ladder to get it from the top of the bookcase. If I didn't read it for a long time, it would disappear - that would be condemnation.";

    // sample documents to index
    String content1="走好选择的路,别选择好走的路,你才能拥有真正的自己。";
    String content2="惟有身处卑微的人,最有机缘看到世态人情的真相。一个人不想攀高就不怕下跌,也不用倾轧排挤,可以保其天真,成其自然,潜心一志完成自己能做的事。";
    String content3="我甘心当个,人家不把我当个东西,我正好可以把看不起我的人看个透 ";

    // Indexing with Lucene
    @Test
    public void testLucene() throws Exception{
        // 1. open the directory where the index will be stored
        Directory directory= FSDirectory.open(new File(path));
        // 2. create the analyzer
        Analyzer analyzer=new StandardAnalyzer();
        // 3. configure and create the writer
        IndexWriterConfig config=new IndexWriterConfig(version,analyzer);
        IndexWriter writer=new IndexWriter(directory,config);
        // 4. write documents into the index
        FieldType type=new FieldType();
        type.setStored(true);    // store the raw value
        type.setIndexed(true);   // index the field
        type.setTokenized(true); // analyze (tokenize) the field

        // create the document objects
        Document doc=new Document();
        doc.add(new Field("title","doc1",type));
        doc.add(new Field("content",content1,type));
        writer.addDocument(doc);

        Document doc2=new Document();
        doc2.add(new Field("title","doc2",type));
        doc2.add(new Field("content",content2,type));
        writer.addDocument(doc2);

        Document doc3=new Document();
        doc3.add(new Field("title","doc3",type));
        doc3.add(new Field("content",content3,type));
        writer.addDocument(doc3);

        // 5. commit the changes
        writer.commit();
        // 6. release resources
        writer.close();
    }
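    // Note (a sketch): Lucene 4.x also ships ready-made field classes, so the
    // hand-built FieldType above could be replaced with, e.g.:
    //   doc.add(new StringField("title","doc1",Field.Store.YES));   // indexed, not tokenized
    //   doc.add(new TextField("content",content1,Field.Store.YES)); // indexed and tokenized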

 

 

 

    // Searching with Lucene
    @Test
    public void testSearch()throws Exception{
        // 1. open the directory where the index is stored
        Directory directory=FSDirectory.open(new File(path));
        // 2. create the reader
        IndexReader reader= DirectoryReader.open(directory);
        // 3. create the searcher
        IndexSearcher searcher=new IndexSearcher(reader);
        // QueryParser arguments: the default field to search in, and the analyzer
        Analyzer analyzer=new StandardAnalyzer();
        QueryParser parser=new QueryParser("content",analyzer);
        // example keyword; parsing an empty string would throw a ParseException
        Query query=parser.parse("选择");
        // the second argument caps the number of matching records returned
        TopDocs tds=searcher.search(query,10000);
        System.out.println("total hits: "+tds.totalHits);
        ScoreDoc[] scoreDocs=tds.scoreDocs;
        ScoreDoc scoredoc=null;
        Document doc=null;
        for (int i=0;i<scoreDocs.length;i++){
            scoredoc=scoreDocs[i];
            System.out.println("score: "+scoredoc.score);
            System.out.println("doc id: "+scoredoc.doc);
            doc=searcher.doc(scoredoc.doc); // fetch the document by its id
            System.out.println("title>>>>>>>>>>>>>"+doc.get("title"));    // the title field
            System.out.println("content>>>>>>>>>>>"+doc.get("content")); // the content field
            System.out.println("-------------------------");
        }
        reader.close();
    }
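    // Alternative (a minimal sketch): build a TermQuery directly instead of going
    // through the QueryParser. StandardAnalyzer indexes Chinese text one character
    // at a time, so a single-character term such as "路" matches content1.
    @Test
    public void testTermQuerySearch() throws Exception{
        Directory directory=FSDirectory.open(new File(path));
        IndexReader reader=DirectoryReader.open(directory);
        IndexSearcher searcher=new IndexSearcher(reader);
        Query query=new TermQuery(new Term("content","路")); // exact, un-analyzed term
        TopDocs tds=searcher.search(query,10);
        System.out.println("total hits: "+tds.totalHits);
        reader.close();
    }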

 

 

 

 

 

    // Using the analyzers (Chinese and English)
    // SimpleAnalyzer
    @Test
    public void testAnalyzer() throws IOException {
        // Chinese
        Analyzer analyzer=new SimpleAnalyzer();
        TokenStream token=analyzer.tokenStream("content",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        // English
        Analyzer analy=new SimpleAnalyzer();
        TokenStream stream=analy.tokenStream("content",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
        // Conclusion: SimpleAnalyzer splits English on spaces
        // (more precisely, on any non-letter character).
    }
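    // Helper (a minimal sketch): System.out.println(token) above prints the whole
    // TokenStream via its reflective toString(). The idiomatic way is to read the
    // CharTermAttribute, which holds just the term text:
    private void printTokens(Analyzer analyzer,String text) throws IOException {
        TokenStream stream=analyzer.tokenStream("content",text);
        CharTermAttribute term=stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()){
            System.out.println(term.toString());
        }
        stream.end();   // finalize end-of-stream state
        stream.close(); // release resources
    }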

 

    @Test
    public void testStandAnalyzer() throws IOException {
        // Chinese
        Analyzer analyzer=new StandardAnalyzer();
        TokenStream token=analyzer.tokenStream("content",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        // English
        Analyzer analy=new StandardAnalyzer();
        TokenStream stream=analy.tokenStream("content",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
        // Conclusion: StandardAnalyzer splits English on whitespace/punctuation
        // and splits Chinese into single characters.
    }

 

 

    @Test
    public void testPerFieldAnalyzerWrapper() throws IOException {
        Map<String,Analyzer> fieldAnalyzer=new HashMap<>();
        fieldAnalyzer.put("en",new SimpleAnalyzer());
        fieldAnalyzer.put("cn",new StandardAnalyzer());

        PerFieldAnalyzerWrapper wrapper=new PerFieldAnalyzerWrapper(new SimpleAnalyzer(),fieldAnalyzer);
        // a field name not present in the map falls back to the default analyzer
        // passed as the first argument, e.g. wrapper.tokenStream("content",cn)
        // would use SimpleAnalyzer
        TokenStream token=wrapper.tokenStream("cn",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        TokenStream stream=wrapper.tokenStream("en",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
        // Conclusion: the wrapper picks an analyzer based on the field being analyzed.
    }
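    // At index time the same wrapper can be handed straight to IndexWriterConfig,
    // so every field is analyzed by its mapped analyzer (a sketch, reusing the
    // map built above):
    //   IndexWriterConfig cfg=new IndexWriterConfig(version,
    //           new PerFieldAnalyzerWrapper(new SimpleAnalyzer(),fieldAnalyzer));
    //   IndexWriter w=new IndexWriter(directory,cfg);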

 

 

 

 

    // Chinese analyzers
    @Test
    public void testCJKAnalyzer() throws IOException {
        // Chinese
        Analyzer analyzer=new CJKAnalyzer();
        TokenStream token=analyzer.tokenStream("content",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        // English
        Analyzer analy=new CJKAnalyzer();
        TokenStream stream=analy.tokenStream("content",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
        // Conclusion: CJKAnalyzer splits Chinese into overlapping two-character tokens (bigrams).
    }

 

 

 

 

    @Test
    public void testSmartCn() throws IOException {
        // Chinese
        Analyzer analyzer=new SmartChineseAnalyzer();
        TokenStream token=analyzer.tokenStream("content",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        // English
        Analyzer analy= new SmartChineseAnalyzer();
        TokenStream stream=analy.tokenStream("content",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
        // Conclusion: SmartChineseAnalyzer segments against a built-in dictionary,
        // so newly coined words it has never seen are not segmented well.
    }

 

 

 

 

    @Test
    public void testIKAnalyzer() throws IOException {
        // Chinese
        Analyzer analyzer=new IKAnalyzer();
        TokenStream token=analyzer.tokenStream("content",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        // English
        Analyzer analy= new IKAnalyzer();
        TokenStream stream=analy.tokenStream("content",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
    }
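    // Note (a sketch): besides the fine-grained default used above, IK Analyzer
    // offers a "smart" mode that keeps only the coarsest segmentation:
    //   Analyzer smart=new IKAnalyzer(true); // true = smart mode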

}

 

 

 

The test output is as follows: (result screenshots omitted)

 
