using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
namespace WMCSharp
{
//++++++++++++++++++++++++++++++++++++++//
// 保存敏感词前缀及所有前缀相同的词 //
//++++++++++++++++++++++++++++++++++++++//
/// <summary>
/// One prefix bucket: a prefix string together with the list of words
/// stored under that prefix.
/// </summary>
class Node
{
    /// <summary>Words stored in this bucket; starts out empty.</summary>
    public ArrayList str = new ArrayList();

    /// <summary>Prefix owned by this bucket; null until one is assigned.</summary>
    public string strpre = null;

    /// <summary>Creates an empty bucket with no prefix assigned.</summary>
    public Node()
    {
    }
}
//++++++++++++++++++++++++++++++++++++++//
// 前缀表的建立、查找及完整字符串的匹配 //
//++++++++++++++++++++++++++++++++++++++//
/// <summary>
/// Prefix table: an array of <see cref="Node"/> buckets, each holding one
/// prefix and the words inserted under it. Buckets are filled front to back,
/// so the first bucket whose prefix is null marks the end of the used part.
/// </summary>
class PreNode : Node
{
    // Initial number of buckets. The table grows when more distinct
    // prefixes than this are inserted.
    const int NODE_NUM = 100;

    /// <summary>Bucket array; every slot always holds a non-null Node.</summary>
    public Node[] pre = new Node[NODE_NUM];

    /// <summary>Creates a table whose buckets are all empty.</summary>
    public PreNode()
    {
        for (int i = 0; i < pre.Length; i++)
        {
            pre[i] = new Node();
        }
    }

    /// <summary>
    /// Collects every word of bucket <paramref name="n"/> that is a leading
    /// substring of <paramref name="str"/>, shortest first.
    /// </summary>
    /// <param name="str">Text whose leading substrings are tested.</param>
    /// <param name="n">Index of the bucket to match against.</param>
    /// <param name="len_mi">Shortest candidate length to try.</param>
    /// <param name="result">List that receives each matched word.</param>
    /// <returns>The same <paramref name="result"/> instance.</returns>
    public ArrayList searchrstr(string str, int n, int len_mi, ArrayList result)
    {
        ArrayList words = pre[n].str;

        // No candidate can be longer than the longest word in the bucket,
        // so that length (not a fixed constant) bounds the scan.
        int longest = 0;
        foreach (object entry in words)
        {
            string word = entry as string;
            if (word != null && word.Length > longest)
            {
                longest = word.Length;
            }
        }
        int length = longest < str.Length ? longest : str.Length;

        int found = 0;
        for (int i = len_mi; i <= length; i++)
        {
            // Each hit is a different length and therefore a different word;
            // once every stored word has been seen nothing else can match.
            if (found >= words.Count)
            {
                break;
            }
            string ss = str.Substring(0, i);
            if (words.Contains(ss))
            {
                found++;
                Console.WriteLine(ss);
                result.Add(ss);
            }
        }
        return result;
    }

    /// <summary>
    /// Stores <paramref name="str"/> in the bucket keyed by its first
    /// <paramref name="len"/> characters, creating the bucket if needed.
    /// </summary>
    /// <param name="str">Word to store.</param>
    /// <param name="len">Number of leading characters used as the prefix.</param>
    public void insertpre(string str, int len = 2)
    {
        string ss = str.Substring(0, len);

        // Find the bucket already owning this prefix, or the first free one.
        int i = 0;
        while (i < pre.Length && pre[i].strpre != null && !pre[i].strpre.Equals(ss))
        {
            i++;
        }

        // Every bucket is taken by another prefix: enlarge the table instead
        // of indexing past its end.
        if (i == pre.Length)
        {
            Grow();
        }

        pre[i].strpre = ss;
        // Keep each word once so the bucket does not bloat on repeated input.
        if (!pre[i].str.Contains(str))
        {
            pre[i].str.Add(str);
        }
    }

    /// <summary>
    /// Looks up the bucket whose prefix equals <paramref name="str"/>.
    /// </summary>
    /// <param name="str">Prefix to look for.</param>
    /// <returns>The bucket index, or -1 when no bucket has that prefix.</returns>
    public int serchpre(string str)
    {
        for (int i = 0; i < pre.Length; i++)
        {
            string candidate = pre[i].strpre;
            if (candidate == null)
            {
                // First unused bucket reached: nothing further is populated.
                return -1;
            }
            if (candidate.Equals(str))
            {
                return i;
            }
        }
        return -1;
    }

    // Doubles the bucket array, keeping existing buckets in place and
    // filling the new slots with empty buckets.
    private void Grow()
    {
        int oldSize = pre.Length;
        Node[] bigger = new Node[oldSize == 0 ? NODE_NUM : oldSize * 2];
        for (int i = 0; i < bigger.Length; i++)
        {
            bigger[i] = i < oldSize ? pre[i] : new Node();
        }
        pre = bigger;
    }
}
//++++++++++++++++++++++++++++++++++++++//
// WM算法 //
//++++++++++++++++++++++++++++++++++++++//
/// <summary>
/// Multi-pattern matcher in the style of the Wu-Manber algorithm, using
/// two-character blocks. <see cref="init"/> builds a shift table and per-block
/// prefix tables from a word file; <see cref="Search"/> scans a text with them.
/// </summary>
class WM
{
    // Upper bound on the prefix length used for the tables.
    const int MaxPrefixLength = 128;

    // Length of the word prefix covered by the tables: the shortest word
    // length, capped at MaxPrefixLength.
    int len_min = MaxPrefixLength;
    // Number of distinct two-character blocks registered in str_Block.
    int COUNT = 0;
    // Shift[k]: smallest distance from block k to the end of a word prefix.
    private int[] Shift;
    // ArrPre[k]: words whose prefix ends with block k; null when block k
    // never ends a prefix.
    PreNode[] ArrPre;
    // Maps a two-character block to its index in Shift / ArrPre.
    Dictionary<string, int> str_Block = new Dictionary<string, int>();

    /// <summary>
    /// Loads the word list and rebuilds the shift and prefix tables.
    /// Previously loaded tables are replaced only when loading succeeds.
    /// </summary>
    /// <param name="FilePath">Path of the word file, one word per line, read as gb2312.</param>
    /// <returns>
    /// 1 on success; 0 when the file does not exist or contains a word
    /// shorter than two characters. Blank lines are skipped.
    /// </returns>
    public int init(string FilePath)
    {
        if (!System.IO.File.Exists(FilePath))
        {
            Console.WriteLine("词库不存在!");
            return 0;
        }

        List<string> words = new List<string>();
        // The using block closes the file even when reading throws.
        using (StreamReader reader = new StreamReader(FilePath, System.Text.Encoding.GetEncoding("gb2312")))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (line.Length > 0)
                {
                    words.Add(line);
                }
            }
        }

        // Shortest word length decides how many leading characters are indexed.
        int shortest = MaxPrefixLength;
        foreach (string word in words)
        {
            if (word.Length < shortest)
            {
                shortest = word.Length;
            }
        }
        if (shortest < 2)
        {
            Console.WriteLine("词库不允许有单字存在!");
            return 0;
        }

        // Build into locals that grow with the number of distinct blocks;
        // that number can exceed the number of words.
        Dictionary<string, int> blocks = new Dictionary<string, int>();
        List<int> shifts = new List<int>();
        List<PreNode> prefixes = new List<PreNode>();

        foreach (string term in words)
        {
            for (int start = 0; start + 2 <= shortest; start++)
            {
                string block = term.Substring(start, 2);
                // Characters between the end of this block and the end of the prefix.
                int shift = shortest - 2 - start;

                int index;
                if (!blocks.TryGetValue(block, out index))
                {
                    index = shifts.Count;
                    blocks.Add(block, index);
                    shifts.Add(shift);
                    prefixes.Add(null);
                }
                else if (shift < shifts[index])
                {
                    shifts[index] = shift;
                }

                // The block closes the prefix: file the word under this block.
                if (shift == 0)
                {
                    if (prefixes[index] == null)
                    {
                        prefixes[index] = new PreNode();
                    }
                    prefixes[index].insertpre(term, shortest);
                }
            }
        }

        len_min = shortest;
        str_Block = blocks;
        Shift = shifts.ToArray();
        ArrPre = prefixes.ToArray();
        COUNT = shifts.Count;
        return 1;
    }

    /// <summary>
    /// Scans <paramref name="text"/> and appends every matched word to
    /// <paramref name="result"/>.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <param name="result">List receiving the matches; a new list is created when null.</param>
    /// <returns>
    /// The result list, or a new list containing the single entry
    /// "The text is null!" when <paramref name="text"/> is null.
    /// </returns>
    public ArrayList Search(string text, ArrayList result)
    {
        if (text == null)
        {
            ArrayList error = new ArrayList();
            error.Add("The text is null!");
            return error;
        }
        if (result == null)
        {
            result = new ArrayList();
        }

        int len = len_min;
        int len_text = text.Length;
        int i = 0;
        // i is the index of the first character of the current block.
        while (i + 1 < len_text)
        {
            int k;
            if (!str_Block.TryGetValue(text.Substring(i, 2), out k))
            {
                i++;
                continue;
            }

            int step = Shift[k];
            if (step > 0)
            {
                i += step;
                continue;
            }

            // Zero shift: a word prefix may end at this block. Advance first so
            // that i is the block's second character and the loop always progresses.
            i++;
            int start = i + 1 - len;
            if (start < 0)
            {
                // Not enough text before the block to hold a whole prefix.
                continue;
            }

            PreNode table = ArrPre[k];
            if (table == null)
            {
                continue;
            }

            int pos = table.serchpre(text.Substring(start, len));
            if (pos != -1)
            {
                result = table.searchrstr(text.Substring(start), pos, len, result);
            }
        }
        return result;
    }
}
}
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
namespace WMCSharp
{
//++++++++++++++++++++++++++++++++++++++//
// 保存敏感词前缀及所有前缀相同的词 //
//++++++++++++++++++++++++++++++++++++++//
/// <summary>
/// One prefix bucket: a prefix string together with the list of words
/// stored under that prefix.
/// </summary>
class Node
{
    /// <summary>Words stored in this bucket; starts out empty.</summary>
    public ArrayList str = new ArrayList();

    /// <summary>Prefix owned by this bucket; null until one is assigned.</summary>
    public string strpre = null;

    /// <summary>Creates an empty bucket with no prefix assigned.</summary>
    public Node()
    {
    }
}
//++++++++++++++++++++++++++++++++++++++//
// 前缀表的建立、查找及完整字符串的匹配 //
//++++++++++++++++++++++++++++++++++++++//
/// <summary>
/// Prefix table: an array of <see cref="Node"/> buckets, each holding one
/// prefix and the words inserted under it. Buckets are filled front to back,
/// so the first bucket whose prefix is null marks the end of the used part.
/// </summary>
class PreNode : Node
{
    // Initial number of buckets. The table grows when more distinct
    // prefixes than this are inserted.
    const int NODE_NUM = 100;

    /// <summary>Bucket array; every slot always holds a non-null Node.</summary>
    public Node[] pre = new Node[NODE_NUM];

    /// <summary>Creates a table whose buckets are all empty.</summary>
    public PreNode()
    {
        for (int i = 0; i < pre.Length; i++)
        {
            pre[i] = new Node();
        }
    }

    /// <summary>
    /// Collects every word of bucket <paramref name="n"/> that is a leading
    /// substring of <paramref name="str"/>, shortest first.
    /// </summary>
    /// <param name="str">Text whose leading substrings are tested.</param>
    /// <param name="n">Index of the bucket to match against.</param>
    /// <param name="len_mi">Shortest candidate length to try.</param>
    /// <param name="result">List that receives each matched word.</param>
    /// <returns>The same <paramref name="result"/> instance.</returns>
    public ArrayList searchrstr(string str, int n, int len_mi, ArrayList result)
    {
        ArrayList words = pre[n].str;

        // No candidate can be longer than the longest word in the bucket,
        // so that length (not a fixed constant) bounds the scan.
        int longest = 0;
        foreach (object entry in words)
        {
            string word = entry as string;
            if (word != null && word.Length > longest)
            {
                longest = word.Length;
            }
        }
        int length = longest < str.Length ? longest : str.Length;

        int found = 0;
        for (int i = len_mi; i <= length; i++)
        {
            // Each hit is a different length and therefore a different word;
            // once every stored word has been seen nothing else can match.
            if (found >= words.Count)
            {
                break;
            }
            string ss = str.Substring(0, i);
            if (words.Contains(ss))
            {
                found++;
                Console.WriteLine(ss);
                result.Add(ss);
            }
        }
        return result;
    }

    /// <summary>
    /// Stores <paramref name="str"/> in the bucket keyed by its first
    /// <paramref name="len"/> characters, creating the bucket if needed.
    /// </summary>
    /// <param name="str">Word to store.</param>
    /// <param name="len">Number of leading characters used as the prefix.</param>
    public void insertpre(string str, int len = 2)
    {
        string ss = str.Substring(0, len);

        // Find the bucket already owning this prefix, or the first free one.
        int i = 0;
        while (i < pre.Length && pre[i].strpre != null && !pre[i].strpre.Equals(ss))
        {
            i++;
        }

        // Every bucket is taken by another prefix: enlarge the table instead
        // of indexing past its end.
        if (i == pre.Length)
        {
            Grow();
        }

        pre[i].strpre = ss;
        // Keep each word once so the bucket does not bloat on repeated input.
        if (!pre[i].str.Contains(str))
        {
            pre[i].str.Add(str);
        }
    }

    /// <summary>
    /// Looks up the bucket whose prefix equals <paramref name="str"/>.
    /// </summary>
    /// <param name="str">Prefix to look for.</param>
    /// <returns>The bucket index, or -1 when no bucket has that prefix.</returns>
    public int serchpre(string str)
    {
        for (int i = 0; i < pre.Length; i++)
        {
            string candidate = pre[i].strpre;
            if (candidate == null)
            {
                // First unused bucket reached: nothing further is populated.
                return -1;
            }
            if (candidate.Equals(str))
            {
                return i;
            }
        }
        return -1;
    }

    // Doubles the bucket array, keeping existing buckets in place and
    // filling the new slots with empty buckets.
    private void Grow()
    {
        int oldSize = pre.Length;
        Node[] bigger = new Node[oldSize == 0 ? NODE_NUM : oldSize * 2];
        for (int i = 0; i < bigger.Length; i++)
        {
            bigger[i] = i < oldSize ? pre[i] : new Node();
        }
        pre = bigger;
    }
}
//++++++++++++++++++++++++++++++++++++++//
// WM算法 //
//++++++++++++++++++++++++++++++++++++++//
/// <summary>
/// Multi-pattern matcher in the style of the Wu-Manber algorithm, using
/// two-character blocks. <see cref="init"/> builds a shift table and per-block
/// prefix tables from a word file; <see cref="Search"/> scans a text with them.
/// </summary>
class WM
{
    // Upper bound on the prefix length used for the tables.
    const int MaxPrefixLength = 128;

    // Length of the word prefix covered by the tables: the shortest word
    // length, capped at MaxPrefixLength.
    int len_min = MaxPrefixLength;
    // Number of distinct two-character blocks registered in str_Block.
    int COUNT = 0;
    // Shift[k]: smallest distance from block k to the end of a word prefix.
    private int[] Shift;
    // ArrPre[k]: words whose prefix ends with block k; null when block k
    // never ends a prefix.
    PreNode[] ArrPre;
    // Maps a two-character block to its index in Shift / ArrPre.
    Dictionary<string, int> str_Block = new Dictionary<string, int>();

    /// <summary>
    /// Loads the word list and rebuilds the shift and prefix tables.
    /// Previously loaded tables are replaced only when loading succeeds.
    /// </summary>
    /// <param name="FilePath">Path of the word file, one word per line, read as gb2312.</param>
    /// <returns>
    /// 1 on success; 0 when the file does not exist or contains a word
    /// shorter than two characters. Blank lines are skipped.
    /// </returns>
    public int init(string FilePath)
    {
        if (!System.IO.File.Exists(FilePath))
        {
            Console.WriteLine("词库不存在!");
            return 0;
        }

        List<string> words = new List<string>();
        // The using block closes the file even when reading throws.
        using (StreamReader reader = new StreamReader(FilePath, System.Text.Encoding.GetEncoding("gb2312")))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (line.Length > 0)
                {
                    words.Add(line);
                }
            }
        }

        // Shortest word length decides how many leading characters are indexed.
        int shortest = MaxPrefixLength;
        foreach (string word in words)
        {
            if (word.Length < shortest)
            {
                shortest = word.Length;
            }
        }
        if (shortest < 2)
        {
            Console.WriteLine("词库不允许有单字存在!");
            return 0;
        }

        // Build into locals that grow with the number of distinct blocks;
        // that number can exceed the number of words.
        Dictionary<string, int> blocks = new Dictionary<string, int>();
        List<int> shifts = new List<int>();
        List<PreNode> prefixes = new List<PreNode>();

        foreach (string term in words)
        {
            for (int start = 0; start + 2 <= shortest; start++)
            {
                string block = term.Substring(start, 2);
                // Characters between the end of this block and the end of the prefix.
                int shift = shortest - 2 - start;

                int index;
                if (!blocks.TryGetValue(block, out index))
                {
                    index = shifts.Count;
                    blocks.Add(block, index);
                    shifts.Add(shift);
                    prefixes.Add(null);
                }
                else if (shift < shifts[index])
                {
                    shifts[index] = shift;
                }

                // The block closes the prefix: file the word under this block.
                if (shift == 0)
                {
                    if (prefixes[index] == null)
                    {
                        prefixes[index] = new PreNode();
                    }
                    prefixes[index].insertpre(term, shortest);
                }
            }
        }

        len_min = shortest;
        str_Block = blocks;
        Shift = shifts.ToArray();
        ArrPre = prefixes.ToArray();
        COUNT = shifts.Count;
        return 1;
    }

    /// <summary>
    /// Scans <paramref name="text"/> and appends every matched word to
    /// <paramref name="result"/>.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <param name="result">List receiving the matches; a new list is created when null.</param>
    /// <returns>
    /// The result list, or a new list containing the single entry
    /// "The text is null!" when <paramref name="text"/> is null.
    /// </returns>
    public ArrayList Search(string text, ArrayList result)
    {
        if (text == null)
        {
            ArrayList error = new ArrayList();
            error.Add("The text is null!");
            return error;
        }
        if (result == null)
        {
            result = new ArrayList();
        }

        int len = len_min;
        int len_text = text.Length;
        int i = 0;
        // i is the index of the first character of the current block.
        while (i + 1 < len_text)
        {
            int k;
            if (!str_Block.TryGetValue(text.Substring(i, 2), out k))
            {
                i++;
                continue;
            }

            int step = Shift[k];
            if (step > 0)
            {
                i += step;
                continue;
            }

            // Zero shift: a word prefix may end at this block. Advance first so
            // that i is the block's second character and the loop always progresses.
            i++;
            int start = i + 1 - len;
            if (start < 0)
            {
                // Not enough text before the block to hold a whole prefix.
                continue;
            }

            PreNode table = ArrPre[k];
            if (table == null)
            {
                continue;
            }

            int pos = table.serchpre(text.Substring(start, len));
            if (pos != -1)
            {
                result = table.searchrstr(text.Substring(start), pos, len, result);
            }
        }
        return result;
    }
}
}