The code is attached below.
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using Lucene.Net;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.KTDictSeg
{
    public class KTDictSegAnalyzer : Analyzer
    {
        public KTDictSegAnalyzer()
        {
        }

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Segment the input with the KTDictSeg tokenizer, then lowercase each token.
            TokenStream result = new KTDictSegTokenizer(reader);
            result = new LowerCaseFilter(result);
            return result;
        }
    }
}
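Nothing about this chain is specific to lowercasing: you can keep wrapping the stream with more filters. Below is a minimal sketch of that pattern (my addition, not in the original code; the class name KTDictSegStopAnalyzer is made up for illustration), appending Lucene.Net's stock StopFilter with its built-in English stop-word list:

using System.IO;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.KTDictSeg
{
    // Hypothetical variant of the analyzer above with one more filter in the chain.
    public class KTDictSegStopAnalyzer : Analyzer
    {
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new KTDictSegTokenizer(reader);
            result = new LowerCaseFilter(result);
            // Drop common English stop words using Lucene.Net's built-in list.
            result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
            return result;
        }
    }
}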
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Collections;
using Lucene.Net;
using Lucene.Net.Analysis;
using KTDictSeg;

namespace Lucene.Net.Analysis.KTDictSeg
{
    public class KTDictSegTokenizer : Tokenizer
    {
        public static CSimpleDictSeg m_SimpleDictSeg;
        private ArrayList ioBuffer;
        private int offSet = 0;    // running offset into the input.
        private int position = -1; // index of the current token in the buffer.
        private int length = 0;    // length of the current token.
        private int start = 0;     // start offset of the current token.

        public KTDictSegTokenizer(System.IO.TextReader input)
            : base(input)
        {
            // A third-party Chinese word segmentation component is used here.
            //ioBuffer = Sj110.Com.Chinese.Tokenizer.Tokenize(input.ReadToEnd());

            // Load the dictionary once and cache it in the static field.
            if (m_SimpleDictSeg == null)
            {
                try
                {
                    m_SimpleDictSeg = new CSimpleDictSeg();
                    m_SimpleDictSeg.DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                    m_SimpleDictSeg.LoadDict();
                }
                catch
                {
                    m_SimpleDictSeg = null;
                    throw; // rethrow without losing the original stack trace.
                }
            }
            m_SimpleDictSeg.FilterStopWords = true;
            m_SimpleDictSeg.MatchName = true;
            ioBuffer = m_SimpleDictSeg.Segment(input.ReadToEnd());
        }

        // In short, a DotLucene tokenizer implements Tokenizer's Next method and wraps each
        // segmented word in a Token, because Token is DotLucene's basic unit of analysis.
        public override Token Next()
        {
            position++;
            if (position < ioBuffer.Count)
            {
                length = ioBuffer[position].ToString().Length;
                start = offSet;
                offSet += length;
                return new Token(ioBuffer[position].ToString(), start, start + length);
            }
            return null;
        }
    }
}
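To sanity-check the tokenizer, you can run a sample sentence through the analyzer and print each token with its offsets. This little harness is my own addition (the field name and sample sentence are arbitrary); it relies only on the old-style Token accessors TermText/StartOffset/EndOffset:

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.KTDictSeg;

class TokenDump
{
    static void Main()
    {
        Analyzer analyzer = new KTDictSegAnalyzer();
        // The field name is ignored by this analyzer; any string works.
        TokenStream stream = analyzer.TokenStream("prodname", new StringReader("欢迎使用中文分词组件"));
        Token token;
        while ((token = stream.Next()) != null)
        {
            // Print each token's text and its [start,end] offsets.
            Console.WriteLine("{0} [{1},{2}]", token.TermText(), token.StartOffset(), token.EndOffset());
        }
    }
}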
The code above borrows from code posted by other people, which I reorganized myself. With this tokenizer, indexing ran about six times faster than with Lucene.Net's built-in StandardAnalyzer.
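If you want to reproduce that comparison yourself, something like the rough timing sketch below should do (my addition, not the original measurement; sample.txt is a placeholder for any representative product text). Note that the first KTDictSegAnalyzer run also pays the one-time dictionary load, so do a warm-up pass before timing:

using System;
using System.Diagnostics;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.KTDictSeg;

class AnalyzerBenchmark
{
    // Tokenize the same text repeatedly with one analyzer and report the elapsed time.
    static long Run(Analyzer analyzer, string text, int iterations)
    {
        Stopwatch watch = Stopwatch.StartNew();
        for (int i = 0; i < iterations; i++)
        {
            TokenStream stream = analyzer.TokenStream("prodname", new StringReader(text));
            while (stream.Next() != null) { } // drain the stream.
        }
        watch.Stop();
        return watch.ElapsedMilliseconds;
    }

    static void Main()
    {
        string text = File.ReadAllText("sample.txt"); // placeholder: any representative text.
        Run(new KTDictSegAnalyzer(), text, 1);        // warm-up: loads the dictionary.
        Console.WriteLine("KTDictSeg:        {0} ms", Run(new KTDictSegAnalyzer(), text, 100));
        Console.WriteLine("StandardAnalyzer: {0} ms", Run(new StandardAnalyzer(), text, 100));
    }
}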
Below is the function that builds the index.
private void MakeIndex()
{
    Analyzer analyzer = new KTDictSegAnalyzer();
    // Lucene.Net's default analyzer:
    //Analyzer analyzer = new StandardAnalyzer();
    FSDirectory fsDir = FSDirectory.GetDirectory(Index_Store_Path, true);
    IndexWriter fsWriter = new IndexWriter(fsDir, analyzer, true);
    ProductDao productDao = new ProductDao();
    // Fetch the data source.
    IList<Product> pList = productDao.GetProduct();
    IEnumerator<Product> p = pList.GetEnumerator();
    // Build one Document per record in the data source.
    while (p.MoveNext())
    {
        Document doc = new Document();
        Field prodName = new Field("prodname", p.Current.Proname, Field.Store.YES, Field.Index.TOKENIZED);
        if (p.Current.Proshuoming == null)
        {
            p.Current.Proshuoming = "null";
        }
        Field proFunction = new Field("profunction", p.Current.Proshuoming, Field.Store.YES, Field.Index.UN_TOKENIZED);
        doc.Add(prodName);
        doc.Add(proFunction);
        fsWriter.AddDocument(doc);
    }
    fsWriter.Close();
}
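That only covers the indexing side; for completeness, here is a minimal matching search sketch (my own addition; SearchIndex is a made-up name). The key point is to parse the query with the same KTDictSegAnalyzer used at index time, so the keyword gets segmented exactly the way the indexed text was:

using System;
using Lucene.Net.Analysis.KTDictSeg;
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;

// Hypothetical companion to MakeIndex: search the "prodname" field for a keyword.
private void SearchIndex(string keyword)
{
    IndexSearcher searcher = new IndexSearcher(Index_Store_Path);
    // Use the same analyzer as at index time so query terms match indexed terms.
    QueryParser parser = new QueryParser("prodname", new KTDictSegAnalyzer());
    Query query = parser.Parse(keyword);
    Hits hits = searcher.Search(query);
    for (int i = 0; i < hits.Length(); i++)
    {
        Document doc = hits.Doc(i);
        Console.WriteLine(doc.Get("prodname"));
    }
    searcher.Close();
}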
That's my approach. If anyone has a better way, please share it. Also, if anyone has a copy of the free 雨痕 v3 segmenter, please send it my way. Thanks in advance!
OK, back to studying hard........