Chinese word segmentation with Lucene 2.4.0

Jars used:
lucene-core-2.4.0.jar //Lucene core jar.
lucene-analyzers-2.4.0.jar //analyzers jar.
paoding-analysis-2.0.4-alpha2 //paoding analyzer package.
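
The snippets below are shown without their import statements; on Lucene 2.4.0 they correspond roughly to the following (a sketch covering the classes used in this post):

import java.util.List;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;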


//To segment and search Chinese text we first need to build an index.
public void createLuceneIndex(List<Bean> beanList) throws Exception {
		System.out.println("--------- start creating the index ------------");
		Directory fsDir = FSDirectory.getDirectory("E:\\Workspaces\\Lucene\\FileIndex");
		//IndexWriter parameters: directory, analyzer, whether to re-create the index, maximum field length.
		//"analyzer" here is assumed to be an instance field, e.g. private Analyzer analyzer = new PaodingAnalyzer();
		IndexWriter fsindexWriter = new IndexWriter(fsDir, analyzer, true, MaxFieldLength.LIMITED);
		
		//Create an in-memory (RAM) directory initialized from the on-disk index.
		Directory ramDir = new RAMDirectory(fsDir);
		IndexWriter ramIndexWriter = new IndexWriter(ramDir, analyzer, MaxFieldLength.LIMITED);
		
		for (Bean bean : beanList) {
			//Convert the bean to a Document.
			Document doc = DocumentUtil.getDocuement(bean);
			System.out.println("docs buffered in RAM ------> : " + ramIndexWriter.numRamDocs());
			System.out.println("article text ----> : " + bean.getText());
			ramIndexWriter.addDocument(doc);
		}
		//Optimize and close the RAM index writer.
		ramIndexWriter.optimize();
		ramIndexWriter.close();
		
		//Merge the in-memory index into the on-disk index.
		fsindexWriter.addIndexesNoOptimize(new Directory[]{ ramDir });
		System.out.println("--------- index created successfully ------------");
		fsindexWriter.optimize();
		fsindexWriter.close();
	}
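
The Bean class itself is not shown in this post; a minimal sketch of what the indexing code assumes (only getTitle() and getText() are used):

//Assumed shape of Bean: a simple value object holding a title and the article text.
public class Bean {
	private String title;
	private String text;
	
	public Bean(String title, String text) {
		this.title = title;
		this.text = text;
	}
	
	public String getTitle() { return title; }
	public String getText()  { return text; }
}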


	//The data written to the index is of type Document, so we write a method that converts a Bean into a Document.
	private static int index = 0;	//A static counter used as the "gid" of each document, so a single document can be addressed later.
	public synchronized static Document getDocuement(Bean bean){
		Document doc = new Document();
		//Field parameters: field name, field value, whether to store it (Store.COMPRESS stores it compressed), and how to index it
		//(Index.NOT_ANALYZED indexes the value as a single term; Index.ANALYZED tokenizes it before indexing; Index.NO does not index it).
		
		//gid is indexed as a single, untokenized term so that deleteDocuments(new Term("gid", ...)) below can match it exactly.
		doc.add(new Field("gid", getNextIndex()+"", Store.COMPRESS, Index.NOT_ANALYZED));
		doc.add(new Field("title", bean.getTitle(), Store.COMPRESS, Index.ANALYZED));
		doc.add(new Field("text", bean.getText(), Store.COMPRESS, Index.ANALYZED));
		return doc;
	}
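
getNextIndex() is referenced above but not shown; a minimal sketch, assuming it simply hands out the next value of the static counter declared above:

	//Assumed helper: returns the current counter value as the gid, then increments it.
	private synchronized static int getNextIndex() {
		return index++;
	}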


	//Insert a new document into an existing index.
	public synchronized boolean insertIndex(Bean bean) throws Exception{
		System.out.println("start inserting data........");
		//Open a reader first just to look at the document count before the insert; this check is optional.
		IndexReader indexReader = IndexReader.open("E:\\Workspaces\\Lucene\\FileIndex");
		System.out.println("doc count before insert: --> " + indexReader.numDocs());
		indexReader.close();
		//Open the index directory.
		Directory fsDir = FSDirectory.getDirectory("E:\\Workspaces\\Lucene\\FileIndex");
		//Open a writer on the existing index (create = false, so the index is not re-created).
		IndexWriter fsindexWriter = new IndexWriter(fsDir, analyzer, false, MaxFieldLength.LIMITED);
		
		Document doc = DocumentUtil.getDocuement(bean);		//Convert the bean to a Document.
		fsindexWriter.addDocument(doc);							//Add the Document to the index.
		fsindexWriter.optimize();
		fsindexWriter.close();
		
		IndexReader indexReaderLast = IndexReader.open("E:\\Workspaces\\Lucene\\FileIndex");
		System.out.println("doc count after insert: --> " + indexReaderLast.numDocs());
		indexReaderLast.close();
		return true;
	}
	
	//Delete the corresponding document from the index.
	public synchronized boolean deleteIndex(int index) throws Exception {
	
		Directory fsDir = FSDirectory.getDirectory("E:\\Workspaces\\Lucene\\FileIndex");
		IndexWriter fsindexWriter = new IndexWriter(fsDir, analyzer, false, MaxFieldLength.LIMITED);
		
		//Delete by the gid we assigned ourselves; looking it up as a Term finds exactly the document we want to remove.
		fsindexWriter.deleteDocuments(new Term("gid", index+""));
		fsindexWriter.commit();
		fsindexWriter.close();
		
		IndexReader indexReader2 = IndexReader.open("E:\\Workspaces\\Lucene\\FileIndex");
		System.out.println("doc count after delete ----------> " + indexReader2.numDocs());
		indexReader2.close();
		return true;
	}
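
A possible way to exercise the methods above (a sketch; the enclosing class name IndexService and the sample beans are hypothetical, and the gid values follow the getNextIndex() sketch, which starts counting at 0):

	//Hypothetical driver: build the index, insert one more document, then delete the document whose gid is 0.
	public static void main(String[] args) throws Exception {
		IndexService service = new IndexService();				//hypothetical class holding the methods and the analyzer field.
		
		List<Bean> beans = new java.util.ArrayList<Bean>();
		beans.add(new Bean("中国新闻", "关于中国的一篇文章..."));
		service.createLuceneIndex(beans);
		
		service.insertIndex(new Bean("第二篇", "第二篇文章的内容..."));
		service.deleteIndex(0);									//deletes the document with gid 0.
	}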
	
public class LuceneTest {
	public static void main(String[] args) throws Exception {
		String queryStr = "中国";
		//It is best to query with the same analyzer that was used to build the index.
		Analyzer analyzer = new PaodingAnalyzer();
		
		//Parse the query string into a Query object.
		String[] fields = {"title"};
		QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer);
		Query query = queryParser.parse(queryStr);
		
		//Filter; null means no filtering.
		Filter filter = null;
		IndexSearcher indexSearcher = new IndexSearcher("E:\\Workspaces\\Lucene\\FileIndex");
		//search parameters: the query, a filter, and the maximum number of documents to return in one call.
		TopDocs topDocs = indexSearcher.search(query, filter, 10000);
		System.out.println("匹配条数:" + topDocs.totalHits);
		
		for (ScoreDoc topDoc : topDocs.scoreDocs) {
			int docid = topDoc.doc;
			System.out.println(docid);		//Lucene's internal document number.
			Document doc = indexSearcher.doc(docid);
			System.out.println(doc.get("text"));
		}
		indexSearcher.close();
	}
}
	
	
//	Example of running the analyzer on its own.
package com.testLucene.analyzer;
import java.io.StringReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;


public class AnalyzerTest {
	public static void main(String[] args) throws Exception {
		Analyzer analyzer = new PaodingAnalyzer();
		String str = "中华人民共和国";
		new AnalyzerTest().analyzer(analyzer, str);
	}
	
	public void analyzer(Analyzer analyzer, String str) throws Exception{
		TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(str));
		
		//Pull tokens from the stream one at a time and print each segmented term.
		for(Token token = new Token(); (token = tokenStream.next(token)) != null;){
			System.out.println(token);
		}
	}
}


How to configure the paoding analyzer.
Version: paoding-analysis-2.0.4-alpha2
Paoding needs an environment variable pointing at its dictionary directory (I have not tried whether it works without it):
1. PAODING_DIC_HOME = E:\MyDocument\paoding-analysis-2.0.4-alpha2\dic
2. Copy E:\MyDocument\paoding-analysis-2.0.4-alpha2\src\paoding-dic-home.properties into the project's src directory.
3. Edit paoding-dic-home.properties so that it contains:
paoding.dic.home=/MyDocument/paoding-analysis-2.0.4-alpha2/dic

If you get: Caused by: java.lang.ClassNotFoundException: org.apache.commons.logging.LogFactory
add commons-logging-1.0.4.jar to the classpath.
With that, the paoding analyzer is ready to use.
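
A quick way to confirm the setup before indexing anything (a minimal sketch; it only prints the environment variable from step 1 and then constructs the analyzer):

//Sanity check: print the dictionary home the JVM sees, then try to construct a PaodingAnalyzer.
public class PaodingCheck {
	public static void main(String[] args) {
		System.out.println("PAODING_DIC_HOME = " + System.getenv("PAODING_DIC_HOME"));
		//If the environment variable and paoding-dic-home.properties are in place, this should not throw.
		org.apache.lucene.analysis.Analyzer analyzer = new net.paoding.analysis.analyzer.PaodingAnalyzer();
		System.out.println("PaodingAnalyzer created: " + analyzer);
	}
}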