Using Lucene Analyzers (Tokenizers)
The analyzer

The IK Analyzer configuration consists of the following files:
ext.dic (user extension dictionary, one custom word per line):

gsdgdg
IKAnalyzer.cfg.xml:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- users can configure their own extension dictionary here -->
    <entry key="ext_dict">ext.dic;</entry>
    <!-- users can configure their own extension stopword dictionary here -->
    <entry key="ext_stopwords">stopword.dic;</entry>
</properties>
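Both dictionary files are loaded from the classpath at runtime (e.g. placed under src/main/resources). If several extension dictionaries are needed, the IK convention is to list them in a single entry separated by semicolons, for example (my_words.dic is a hypothetical second file, shown only to illustrate the separator):

<entry key="ext_dict">ext.dic;my_words.dic;</entry>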

 

 

 

stopword.dic (extension stopword dictionary, one word per line):

a
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
such
that
the
their
then
there
these
they
this
to
was
will
with

 

 

 

 

The Lucene test class:

package lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

 

public class LuceneAnalyzerTest {

    String path="D:/workforce/lucene/hello"; // directory where the index files live
    Version version=Version.LUCENE_4_10_4;   // Lucene version in use

    String cn="我对什么书表示兴趣,父亲就把那部书放在我书桌上,有时他得爬梯到书橱高处去拿; 假如我长期不读,那部书就不见了---这就等于谴责。";
    String en="When I expressed interest in any book, my father put it on my desk. Sometimes he had to climb the ladder to get it from the top of the bookcase. If I didn't read it for a long time, it would disappear - that would be condemnation.";

    // sample documents to index
    String content1="走好选择的路,别选择好走的路,你才能拥有真正的自己。";
    String content2="惟有身处卑微的人,最有机缘看到世态人情的真相。一个人不想攀高就不怕下跌,也不用倾轧排挤,可以保其天真,成其自然,潜心一志完成自己能做的事。";
    String content3="我甘心当个,人家不把我当个东西,我正好可以把看不起我的人看个透 ";

    // Indexing with Lucene
    @Test
    public void testLucene() throws Exception{
        // 1. open the directory where the index will be stored
        Directory directory= FSDirectory.open(new File(path));
        // 2. create the analyzer
        Analyzer analyzer=new StandardAnalyzer();
        // 3. configure and create the writer
        IndexWriterConfig config=new IndexWriterConfig(version,analyzer);
        IndexWriter writer=new IndexWriter(directory,config);
        // 4. write documents into the index
        FieldType type=new FieldType();
        type.setStored(true);    // store the raw value
        type.setIndexed(true);   // index the field
        type.setTokenized(true); // analyze (tokenize) the field

        // create the document objects
        Document doc=new Document();
        doc.add(new Field("title","doc1",type));
        doc.add(new Field("content",content1,type));
        writer.addDocument(doc);

        Document doc2=new Document();
        doc2.add(new Field("title","doc2",type));
        doc2.add(new Field("content",content2,type));
        writer.addDocument(doc2);

        Document doc3=new Document();
        doc3.add(new Field("title","doc3",type));
        doc3.add(new Field("content",content3,type));
        writer.addDocument(doc3);

        // 5. commit the changes
        writer.commit();
        // 6. release resources
        writer.close();
    }
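    // Note (a sketch): Lucene 4.x also ships ready-made field classes, so the
    // hand-built FieldType above could be replaced with, e.g.:
    //   doc.add(new StringField("title","doc1",Field.Store.YES));   // indexed, not tokenized
    //   doc.add(new TextField("content",content1,Field.Store.YES)); // indexed and tokenized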

 

 

 

    // Searching with Lucene
    @Test
    public void testSearch()throws Exception{
        // 1. open the directory where the index is stored
        Directory directory=FSDirectory.open(new File(path));
        // 2. create the reader
        IndexReader reader= DirectoryReader.open(directory);
        // 3. create the searcher
        IndexSearcher searcher=new IndexSearcher(reader);
        // QueryParser arguments: the default field to search in, and the analyzer
        Analyzer analyzer=new StandardAnalyzer();
        QueryParser parser=new QueryParser("content",analyzer);
        // example keyword; parsing an empty string would throw a ParseException
        Query query=parser.parse("选择");
        // the second argument caps the number of matching records returned
        TopDocs tds=searcher.search(query,10000);
        System.out.println("total hits: "+tds.totalHits);
        ScoreDoc[] scoreDocs=tds.scoreDocs;
        ScoreDoc scoredoc=null;
        Document doc=null;
        for (int i=0;i<scoreDocs.length;i++){
            scoredoc=scoreDocs[i];
            System.out.println("score: "+scoredoc.score);
            System.out.println("doc id: "+scoredoc.doc);
            doc=searcher.doc(scoredoc.doc); // fetch the document by its id
            System.out.println("title>>>>>>>>>>>>>"+doc.get("title"));    // the title field
            System.out.println("content>>>>>>>>>>>"+doc.get("content")); // the content field
            System.out.println("-------------------------");
        }
        reader.close();
    }
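    // Alternative (a minimal sketch): build a TermQuery directly instead of going
    // through the QueryParser. StandardAnalyzer indexes Chinese text one character
    // at a time, so a single-character term such as "路" matches content1.
    @Test
    public void testTermQuerySearch() throws Exception{
        Directory directory=FSDirectory.open(new File(path));
        IndexReader reader=DirectoryReader.open(directory);
        IndexSearcher searcher=new IndexSearcher(reader);
        Query query=new TermQuery(new Term("content","路")); // exact, un-analyzed term
        TopDocs tds=searcher.search(query,10);
        System.out.println("total hits: "+tds.totalHits);
        reader.close();
    }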

 

 

 

 

 

    // Using the analyzers (Chinese and English)
    // SimpleAnalyzer
    @Test
    public void testAnalyzer() throws IOException {
        // Chinese
        Analyzer analyzer=new SimpleAnalyzer();
        TokenStream token=analyzer.tokenStream("content",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        // English
        Analyzer analy=new SimpleAnalyzer();
        TokenStream stream=analy.tokenStream("content",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
        // Conclusion: SimpleAnalyzer splits English on spaces
        // (more precisely, on any non-letter character).
    }
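    // Helper (a minimal sketch): System.out.println(token) above prints the whole
    // TokenStream via its reflective toString(). The idiomatic way is to read the
    // CharTermAttribute, which holds just the term text:
    private void printTokens(Analyzer analyzer,String text) throws IOException {
        TokenStream stream=analyzer.tokenStream("content",text);
        CharTermAttribute term=stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()){
            System.out.println(term.toString());
        }
        stream.end();   // finalize end-of-stream state
        stream.close(); // release resources
    }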

 

    @Test
    public void testStandAnalyzer() throws IOException {
        // Chinese
        Analyzer analyzer=new StandardAnalyzer();
        TokenStream token=analyzer.tokenStream("content",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        // English
        Analyzer analy=new StandardAnalyzer();
        TokenStream stream=analy.tokenStream("content",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
        // Conclusion: StandardAnalyzer splits English on whitespace/punctuation
        // and splits Chinese into single characters.
    }

 

 

    @Test
    public void testPerFieldAnalyzerWrapper() throws IOException {
        Map<String,Analyzer> fieldAnalyzer=new HashMap<>();
        fieldAnalyzer.put("en",new SimpleAnalyzer());
        fieldAnalyzer.put("cn",new StandardAnalyzer());

        PerFieldAnalyzerWrapper wrapper=new PerFieldAnalyzerWrapper(new SimpleAnalyzer(),fieldAnalyzer);
        // a field name not present in the map falls back to the default analyzer
        // passed as the first argument, e.g. wrapper.tokenStream("content",cn)
        // would use SimpleAnalyzer
        TokenStream token=wrapper.tokenStream("cn",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        TokenStream stream=wrapper.tokenStream("en",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
        // Conclusion: the wrapper picks an analyzer based on the field being analyzed.
    }
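    // At index time the same wrapper can be handed straight to IndexWriterConfig,
    // so every field is analyzed by its mapped analyzer (a sketch, reusing the
    // map built above):
    //   IndexWriterConfig cfg=new IndexWriterConfig(version,
    //           new PerFieldAnalyzerWrapper(new SimpleAnalyzer(),fieldAnalyzer));
    //   IndexWriter w=new IndexWriter(directory,cfg);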

 

 

 

 

    // Chinese analyzers
    @Test
    public void testCJKAnalyzer() throws IOException {
        // Chinese
        Analyzer analyzer=new CJKAnalyzer();
        TokenStream token=analyzer.tokenStream("content",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        // English
        Analyzer analy=new CJKAnalyzer();
        TokenStream stream=analy.tokenStream("content",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
        // Conclusion: CJKAnalyzer splits Chinese into overlapping two-character tokens (bigrams).
    }

 

 

 

 

    @Test
    public void testSmartCn() throws IOException {
        // Chinese
        Analyzer analyzer=new SmartChineseAnalyzer();
        TokenStream token=analyzer.tokenStream("content",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        // English
        Analyzer analy= new SmartChineseAnalyzer();
        TokenStream stream=analy.tokenStream("content",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
        // Conclusion: SmartChineseAnalyzer segments against a built-in dictionary,
        // so newly coined words it has never seen are not segmented well.
    }

 

 

 

 

    @Test
    public void testIKAnalyzer() throws IOException {
        // Chinese
        Analyzer analyzer=new IKAnalyzer();
        TokenStream token=analyzer.tokenStream("content",cn);
        token.reset(); // rewind the stream to the first token
        while (token.incrementToken()){ // advance to the next token
            System.out.println(token);
        }

        System.out.println("------------------------------- Chinese vs. English tokenization -------------------------------");

        // English
        Analyzer analy= new IKAnalyzer();
        TokenStream stream=analy.tokenStream("content",en);
        stream.reset(); // rewind the stream to the first token
        while (stream.incrementToken()){ // advance to the next token
            System.out.println(stream);
        }
    }
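    // Note (a sketch): besides the fine-grained default used above, IK Analyzer
    // offers a "smart" mode that keeps only the coarsest segmentation:
    //   Analyzer smart=new IKAnalyzer(true); // true = smart mode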

}

 

 

 

The test output is as follows: (result screenshots omitted)

 
