lucene入门-使用JE中文分词

最新推荐文章于 2014-11-07 20:26:44 发布

deepfuture

最新推荐文章于 2014-11-07 20:26:44 发布

阅读量91

点赞数

分类专栏：搜索引擎　文章标签：lucene Apache 算法 F#

搜索引擎专栏收录该内容

147 篇文章 0 订阅

订阅专栏

JE 分词器（MMAnalyzer）采用基于词库的分词算法，是较好的中文分词器。

package busetoken;
import java.io.IOException;

import jeasy.analysis.MMAnalyzer;

public class UseJe {

/**
* @param args
* @throws IOException
*/
public static void main(String[] args) throws IOException {
// TODO Auto-generated method stub
String s="编码规范从根本上解决了程序维护员的难题；规范的编码阅读和理解起来更容易，也可以快速的不费力气的借鉴别人的编码。对将来维护你编码的人来说，你的编码越优化，他们就越喜欢你的编码，理解起来也就越快。";
MMAnalyzer mm=new MMAnalyzer();
System.out.print(mm.segment(s, "|"));

}

效果如下

编码|规范|从根本上|解决|程序|维护|员|难题|规范|编码|阅读|理解|起来|更|容易|也可以|快速|不费力气|借鉴|别人|编码|将来|维护|你|编码|的人|来说|你的|编码|越|优化|他们|就越|喜欢|你的|编码|理解|起来|也就|越快|

建立索引

package bindex;
import java.io.File;
import tool.FileText;
import tool.FileList;
import java.io.*;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.store.LockObtainFailedException;
public class FileIndexer {

    /**
     * Builds a Lucene index in the "indexes" directory from the files returned
     * by {@code FileList.getFiles("htmls")}.
     *
     * <p>Each file becomes one document with three stored fields:
     * <ul>
     *   <li>{@code name} - the file name, tokenized</li>
     *   <li>{@code content} - the text returned by {@code FileText.getText}, tokenized</li>
     *   <li>{@code path} - the file path, stored but not indexed</li>
     * </ul>
     * Text is analyzed with the JE {@code MMAnalyzer}. Failures are reported by
     * printing the stack trace; the writer is closed in every case.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String indexPath = "indexes";
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(indexPath, new MMAnalyzer());
            String[] files = FileList.getFiles("htmls");
            for (int i = 0; i < files.length; i++) {
                File f = new File(files[i]);
                String name = f.getName();
                String content = FileText.getText(f);
                String path = f.getPath();

                Document doc = new Document();
                doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
                doc.add(new Field("content", content, Field.Store.YES, Field.Index.TOKENIZED));
                doc.add(new Field("path", path, Field.Store.YES, Field.Index.NO));
                indexWriter.addDocument(doc);

                // getPath() already ends with the file name, so the name is not appended again.
                System.out.println("File:" + path + " indexed!");
            }
            System.out.println("OK!");
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close on every path so a failure mid-run does not leave the writer open.
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

}

搜索

package bindex;
import java.io.IOException;
import java.lang.StringBuffer;

import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.document.*;

public class BindexSearcher {

/**
 * Runs a single {@code TermQuery} for the term "安全" in the {@code content}
 * field of the index stored in the "indexes" directory, then prints each
 * hit's stored name and a preview of at most 100 characters of its content.
 *
 * <p>Failures are reported by printing the stack trace; the searcher is
 * closed in every case.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String indexPath = "indexes";
    String searchField = "content";
    String searchPhrase = "安全";
    int maxPreviewChars = 100;
    StringBuilder sb = new StringBuilder();

    IndexSearcher searcher = null;
    try {
        searcher = new IndexSearcher(indexPath);
        Query q = new TermQuery(new Term(searchField, searchPhrase));
        Hits hs = searcher.search(q);
        int num = hs.length();
        for (int i = 0; i < num; i++) {
            Document doc = hs.doc(i);
            String name = stringValueOrEmpty(doc.getField("name"));
            String content = stringValueOrEmpty(doc.getField("content"));
            // Clamp the preview: substring(0, 100) throws on content shorter than 100 chars.
            int previewEnd = Math.min(content.length(), maxPreviewChars);

            sb.append("name:\n");
            sb.append(name).append("\n");
            sb.append("content:\n");
            sb.append(content.substring(0, previewEnd)).append("\n");
            sb.append("------------").append("\n");
        }
        System.out.println(sb);
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close on every path so a failed search does not leak the searcher.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

/** Returns the field's string value, or an empty string when the field or its value is absent. */
private static String stringValueOrEmpty(Field field) {
    if (field == null) {
        return "";
    }
    String value = field.stringValue();
    return value == null ? "" : value;
}