1.引用文件下载地址:
http://www.piaoyi.org/upimg/file071127_08/02/ChineseAnalyzer.rar
2.在项目中添加对 Lucene.Net.dll 的引用
3.添加新类库文件 WordTree.cs
using System;
using System.Collections;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace A.SplitString
{
/// <summary>
/// Character-level dictionary tree used for word segmentation.
/// The tree is stored in nested <see cref="Hashtable"/> instances: every level maps a
/// one-character string to the child <see cref="Hashtable"/> for that character, and the
/// presence of the key "word" in a node marks the end of a dictionary entry.
/// The root node also carries the "word" key.
/// </summary>
public class WordTree
{
    // Dictionary file that has to be added to the web application root: sDict.txt.
    // NOTE(review): this initializer dereferences HttpContext.Current, so the type presumably
    // has to be touched for the first time inside a web request - confirm.
    private static string DictPath = System.Web.HttpContext.Current.Server.MapPath("~/sDict.txt");

    // Guards the one-time dictionary load so that two callers never fill the shared
    // static Hashtable at the same time.
    private static readonly object DictLock = new object();

    /// <summary>Root node of the dictionary tree, shared by all instances.</summary>
    public static Hashtable chartable = new Hashtable();

    /// <summary>True once the dictionary file has been read into <see cref="chartable"/>.</summary>
    public static bool DictLoaded = false;

    // Not written anywhere in this class; kept because it is part of the public surface.
    public static double DictLoad_Span = 0.0;

    /// <summary>Regex pattern whose match makes <see cref="GetCharType"/> return 0.</summary>
    public string strChinese = "[一-龥]";

    /// <summary>Regex pattern whose match makes <see cref="GetCharType"/> return 2.</summary>
    public string strNumber = "[0-9]";

    /// <summary>Regex pattern whose match makes <see cref="GetCharType"/> return 1.</summary>
    public string strEnglish = "[a-zA-Z]";

    /// <summary>
    /// Classifies a string (callers in this file pass a single character) by testing it
    /// against the three pattern fields, in the order Chinese, English, number.
    /// </summary>
    /// <param name="Char">Text to classify.</param>
    /// <returns>
    /// 0 if <see cref="strChinese"/> matches, 1 if <see cref="strEnglish"/> matches,
    /// 2 if <see cref="strNumber"/> matches, otherwise -1.
    /// </returns>
    public int GetCharType(string Char)
    {
        // The static Regex.IsMatch overload is used instead of constructing a new Regex
        // per call; the pattern is still read from the instance field on every call, so
        // changing the public pattern fields keeps working.
        if (Regex.IsMatch(Char, this.strChinese))
        {
            return 0;
        }
        if (Regex.IsMatch(Char, this.strEnglish))
        {
            return 1;
        }
        if (Regex.IsMatch(Char, this.strNumber))
        {
            return 2;
        }
        return -1;
    }

    /// <summary>
    /// Loads the dictionary file into <see cref="chartable"/> the first time it is called;
    /// later calls return immediately once <see cref="DictLoaded"/> is true.
    /// </summary>
    public void LoadDict()
    {
        if (WordTree.DictLoaded)
        {
            return;
        }

        lock (WordTree.DictLock)
        {
            // Re-check under the lock: another caller may have finished loading while
            // this one was waiting.
            if (!WordTree.DictLoaded)
            {
                this.BuidDictTree();
                WordTree.DictLoaded = true;
            }
        }
    }

    /// <summary>
    /// Reads the dictionary file (UTF-8, one entry per line) and inserts every entry into
    /// the tree, one character per level.
    /// </summary>
    private void BuidDictTree()
    {
        // 'using' closes the file even when reading or inserting throws.
        using (StreamReader reader = new StreamReader(WordTree.DictPath, Encoding.UTF8))
        {
            if (!WordTree.chartable.Contains("word"))
            {
                WordTree.chartable.Add("word", null);
            }

            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // Skip blank lines instead of treating the first one as end of file,
                // so a stray empty line cannot silently truncate the dictionary.
                if (line.Length == 0)
                {
                    continue;
                }

                Hashtable node = WordTree.chartable;
                for (int i = 0; i < line.Length; i++)
                {
                    string key = line.Substring(i, 1);
                    if (!node.Contains(key))
                    {
                        node.Add(key, new Hashtable());
                    }
                    node = (Hashtable)node[key];
                }

                // Mark the node reached by the last character as the end of an entry.
                if (!node.Contains("word"))
                {
                    node.Add("word", null);
                }
            }
        }
    }
}
}
4.添加cs文件 ChineseTokenizer.cs
using Lucene.Net.Analysis;
using System;
using System.Collections;
using System.IO;
namespace A.SplitString
{
/// <summary>
/// Tokenizer that reads the whole input up front and splits it by walking the
/// <see cref="WordTree"/> dictionary tree character by character, keeping the longest
/// dictionary match. Characters that do not start a dictionary entry are emitted as a
/// single Chinese character, a run of English letters (lower-cased) or a run of digits;
/// anything else is skipped.
/// </summary>
internal class ChineseTokenizer : Tokenizer
{
    // Index of the first character of the token currently being assembled.
    private int bufferIndex = 0;
    // Length of the buffered input text.
    private int dataLen = 0;
    // Index of the next character to examine; persists between Next() calls.
    private int start;
    // Entire input, read once in the constructor.
    private string text;

    /// <summary>Creates a tokenizer over the full content of <paramref name="reader"/>.</summary>
    /// <param name="reader">Source text; it is read to the end immediately.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public ChineseTokenizer(TextReader reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }
        this.input = reader;
        this.text = this.input.ReadToEnd();
        this.dataLen = this.text.Length;
    }

    /// <summary>Returns the next token, or null when the input is exhausted.</summary>
    public override Token Next()
    {
        WordTree wordTree = new WordTree();
        wordTree.LoadDict();

        Hashtable node = WordTree.chartable;   // current position in the dictionary tree
        string current = string.Empty;         // characters consumed for the pending token
        this.bufferIndex = this.start;
        int lastMatchIndex = this.start;       // index of the last character of the best match so far
        int lastMatchStart = this.bufferIndex; // start offset of the best match so far
        string lastMatch = string.Empty;       // text of the best match so far

        while (this.start < this.dataLen)
        {
            string ch = this.text.Substring(this.start, 1);

            if (string.IsNullOrEmpty(ch.Trim()))
            {
                // Whitespace. Consume it; if a token is pending, whitespace ends it.
                // Previously the pending text was kept while bufferIndex was reset, which
                // produced wrong offsets and dropped the final token when the input ended
                // in whitespace.
                this.start++;
                if (current.Length > 0)
                {
                    return this.EmitPending(node, current, lastMatch, lastMatchIndex, lastMatchStart);
                }
                this.bufferIndex = this.start;
                continue;
            }

            if (node.Contains(ch))
            {
                // The character continues a dictionary path: descend one level.
                current += ch;
                node = (Hashtable)node[ch];
                if (node.Contains("word") || current.Length == 1)
                {
                    // Remember this prefix as the fallback; a single character always
                    // counts so that the fallback is never empty.
                    lastMatch = current;
                    lastMatchIndex = this.start;
                    lastMatchStart = this.bufferIndex;
                }
                this.start++;
                if (this.start != this.dataLen)
                {
                    continue;
                }
                // End of input reached while inside the tree.
                return this.EmitPending(node, current, lastMatch, lastMatchIndex, lastMatchStart);
            }

            if (current.Length == 0)
            {
                // Nothing pending and the character does not start a dictionary entry:
                // classify it and build the token directly.
                int i = this.start + 1;
                switch (wordTree.GetCharType(ch))
                {
                    case 0:
                        current += ch;
                        break;
                    case 1:
                        while (i < this.dataLen)
                        {
                            if (wordTree.GetCharType(this.text.Substring(i, 1)) != 1)
                            {
                                break;
                            }
                            i++;
                        }
                        // Invariant lower-casing: the token must not depend on the
                        // server's current culture.
                        current += this.text.Substring(this.start, i - this.start).ToLowerInvariant();
                        break;
                    case 2:
                        while (i < this.dataLen)
                        {
                            if (wordTree.GetCharType(this.text.Substring(i, 1)) != 2)
                            {
                                break;
                            }
                            i++;
                        }
                        current += this.text.Substring(this.start, i - this.start);
                        break;
                    default:
                        // Unclassified character with nothing pending: skip it.
                        this.start++;
                        this.bufferIndex = this.start;
                        continue;
                }
                this.start = i;
            }
            else if (wordTree.GetCharType(ch) == -1)
            {
                // An unclassified character ends the pending token and is consumed.
                this.start++;
            }

            return this.EmitPending(node, current, lastMatch, lastMatchIndex, lastMatchStart);
        }

        return null;
    }

    // Builds the token for the pending text: the full text when the current tree node
    // is marked as a word end, otherwise the last remembered match, in which case
    // scanning resumes right after that match.
    private Token EmitPending(Hashtable node, string current, string lastMatch, int lastMatchIndex, int lastMatchStart)
    {
        if (node.Contains("word"))
        {
            return new Token(current, this.bufferIndex, this.bufferIndex + current.Length);
        }
        this.start = lastMatchIndex + 1;
        return new Token(lastMatch, lastMatchStart, lastMatchStart + lastMatch.Length);
    }
}
}
5.添加cs文件 SplitAdapter.cs
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace A.SplitString
{
/// <summary>
/// Analyzer that chains <see cref="ChineseTokenizer"/>, <see cref="StandardFilter"/> and a
/// <see cref="StopFilter"/> whose stop words are read from a text file.
/// </summary>
public class SplitAdapter : Analyzer
{
    // Not used by this class; kept because it is part of the public surface.
    public static string[] CHINESE_ENGLISH_STOP_WORDS;

    /// <summary>
    /// Legacy fixed-size stop-word buffer. The constructor still copies the loaded words
    /// into it (as many as fit), but the analyzer itself no longer depends on its size.
    /// </summary>
    public static readonly string[] Filter = new string[321];

    // Stop words loaded for this instance; exactly the non-empty lines of the file.
    private readonly string[] stopWords;

    /// <summary>Loads the stop-word list from a UTF-8 text file, one word per line.</summary>
    /// <param name="path">Path of the stop-word file.</param>
    public SplitAdapter(string path)
    {
        List<string> words = new List<string>();

        // 'using' guarantees the file handle is released; it used to stay open.
        using (StreamReader reader = new StreamReader(path, Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // Skip blank lines instead of treating the first one as end of file.
                if (line.Length == 0)
                {
                    continue;
                }
                words.Add(line);
            }
        }

        this.stopWords = words.ToArray();

        // Mirror into the legacy static buffer without overrunning it; files longer
        // than the buffer used to throw IndexOutOfRangeException.
        int mirrored = Math.Min(this.stopWords.Length, SplitAdapter.Filter.Length);
        Array.Copy(this.stopWords, SplitAdapter.Filter, mirrored);
    }

    /// <summary>Builds the token stream for the given reader.</summary>
    /// <param name="fieldName">Not used by this implementation.</param>
    /// <param name="reader">Text to tokenize.</param>
    /// <returns>The tokenizer output with the loaded stop words filtered out.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        TokenStream stream = new ChineseTokenizer(reader);
        stream = new StandardFilter(stream);
        // Pass only the words actually loaded, so the filter never receives the
        // null padding of the fixed-size buffer.
        return new StopFilter(stream, this.stopWords);
    }
}
}
6.实现调用类 MatchingHelper(调用上面的分词器,返回拆分后的词汇列表)
using Lucene.Net.Analysis;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace A.Helper
{
/// <summary>Entry point that splits a string into words with the SplitAdapter analyzer.</summary>
public class MatchingHelper
{
    /// <summary>
    /// Tokenizes <paramref name="inputString"/> using the stop-word file "~/sNoise.config"
    /// and returns the term text of every token in order.
    /// </summary>
    /// <param name="inputString">Text to split; null or empty yields an empty list.</param>
    /// <returns>The list of words produced by the analyzer.</returns>
    public static List<string> GetMatchingList(string inputString)
    {
        List<string> resultList = new List<string>();

        // Nothing to tokenize: avoid the file access and the ArgumentNullException that
        // StringReader would throw for null.
        if (string.IsNullOrEmpty(inputString))
        {
            return resultList;
        }

        string snoisePath = System.Web.HttpContext.Current.Server.MapPath("~/sNoise.config");

        // SplitAdapter lives in A.SplitString, which this file does not import, so the
        // type is referenced by its fully qualified name.
        global::A.SplitString.SplitAdapter analyzer = new global::A.SplitString.SplitAdapter(snoisePath);

        using (StringReader reader = new StringReader(inputString))
        {
            TokenStream tokenStream = analyzer.TokenStream(null, reader);
            for (Token token = tokenStream.Next(); token != null; token = tokenStream.Next())
            {
                resultList.Add(token.TermText());
            }
        }

        // This list holds the words the input was split into.
        return resultList;
    }
}
}