// 中英文关键字生成器:
// 中文部分按"2 字 + 3 字"的组合切分,并选取词典命中数最多的切分方式;英文保留原词,长度至少为 2 个字符。
// 参见我的文章:http://www.cnblogs.com/dullwolf/archive/2011/04/14/2015539.html
// 文章标题:倒排索引,中文维持 2+3 长度的重要性。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace ConsoleApplication2
{
class Program
{
static Dictionary<string, int> WordIndex = new Dictionary<string, int>();
/// <summary>
/// Demo entry point: registers a small sample vocabulary in <c>WordIndex</c>,
/// prints the keywords generated for two sample sentences, prints a separator
/// line and then reads one character from standard input before returning.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Sample vocabulary; every word is registered with the value 0.
    string[] vocabulary =
    {
        "上海", "上海市", "质量", "技术", "监督", "监督局", "吊销", "染色",
        "馒头", "加工", "工厂", "加工厂", "生产", "许可", "许可证"
    };
    foreach (string word in vocabulary)
    {
        WordIndex.Add(word, 0);
    }

    // The second sample interleaves Latin letters so the Chinese text is
    // broken into several shorter runs.
    string[] samples =
    {
        @"上海市质量技术监督局吊销了Shanghai ABC染色馒头加工厂的生产许可证",
        @"上A海市c质量c技术监督局aa吊销了Shanghai ABC染色馒头厂的生产许可证"
    };

    KeyMaker maker = new KeyMaker();
    foreach (string sample in samples)
    {
        Console.WriteLine(maker.GetMaxHitKey(sample, WordIndex));
    }

    Console.WriteLine("----");
    Console.Read();
}
/// <summary>
/// Builds a space-separated keyword string from mixed Chinese/English text.
/// The text is lower-cased, then split into runs of two or more CJK
/// ideographs (U+4E00..U+9FA5) and runs of two or more ASCII letters.
/// Each Chinese run is cut into consecutive pieces of 2 or 3 characters,
/// choosing the cut that contains the most dictionary words; English runs
/// are kept verbatim.
/// </summary>
public class KeyMaker
{
    // Marks a suffix that cannot be tiled with 2- and 3-character pieces
    // (only a single trailing character is in that situation).
    private const int Unreachable = -1;

    // Compiled once instead of on every call.
    private static readonly Regex ChineseRun = new Regex(@"[\u4e00-\u9fa5]{2,}");
    private static readonly Regex EnglishWord = new Regex(@"[a-z]{2,}");

    /// <summary>
    /// Returns the distinct keywords of <paramref name="text"/> joined by single
    /// spaces: first the pieces of every Chinese run (in order of appearance),
    /// then every English word (in order of appearance).
    /// </summary>
    /// <param name="text">Input text; lower-cased with the invariant culture before matching.</param>
    /// <param name="dict">
    /// Vocabulary used to score candidate cuts. Only key membership is consulted;
    /// the values are ignored. Must be non-null when the text contains a Chinese run.
    /// </param>
    /// <returns>The keywords separated by single spaces, or an empty string when none are found.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public string GetMaxHitKey(string text, Dictionary<string, int> dict)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        // Invariant lower-casing: a culture-sensitive ToLower() maps "I" to a
        // dotless i under Turkish cultures, which [a-z] would then miss.
        string lowered = text.ToLowerInvariant();

        // The list keeps first-seen order; the set makes de-duplication O(1).
        List<string> keywords = new List<string>();
        HashSet<string> seen = new HashSet<string>();

        foreach (Match run in ChineseRun.Matches(lowered))
        {
            foreach (string piece in SegmentChineseRun(run.Value, dict))
            {
                if (seen.Add(piece))
                {
                    keywords.Add(piece);
                }
            }
        }

        foreach (Match word in EnglishWord.Matches(lowered))
        {
            if (seen.Add(word.Value))
            {
                keywords.Add(word.Value);
            }
        }

        return string.Join(" ", keywords.ToArray());
    }

    /// <summary>
    /// Cuts <paramref name="run"/> into consecutive 2- or 3-character pieces so
    /// that as many pieces as possible are keys of <paramref name="dict"/>.
    /// When several cuts score the same, a 3-character piece is preferred at the
    /// earliest position where the cuts differ.
    /// </summary>
    /// <remarks>
    /// Dynamic programming over suffixes, linear in the run length, so runs of
    /// any length are handled and a short run never yields an empty piece.
    /// </remarks>
    private static List<string> SegmentChineseRun(string run, Dictionary<string, int> dict)
    {
        List<string> pieces = new List<string>();
        int length = run.Length;
        if (length < 2)
        {
            return pieces;
        }

        // bestHits[i]: the highest hit count achievable for run.Substring(i).
        // pieceSize[i]: the size (2 or 3) of the first piece of that best cut.
        int[] bestHits = new int[length + 1];
        int[] pieceSize = new int[length + 1];
        bestHits[length] = 0;
        bestHits[length - 1] = Unreachable;

        for (int start = length - 2; start >= 0; start--)
        {
            int withThree = ScorePiece(run, start, 3, bestHits, dict);
            int withTwo = ScorePiece(run, start, 2, bestHits, dict);

            // ">=" makes the 3-character piece win ties.
            if (withThree >= withTwo)
            {
                bestHits[start] = withThree;
                pieceSize[start] = 3;
            }
            else
            {
                bestHits[start] = withTwo;
                pieceSize[start] = 2;
            }
        }

        // Every suffix of length >= 2 can be tiled, so following pieceSize from
        // position 0 always lands exactly on the end of the run.
        int position = 0;
        while (position < length)
        {
            int size = pieceSize[position];
            pieces.Add(run.Substring(position, size));
            position += size;
        }

        return pieces;
    }

    /// <summary>
    /// Scores taking a piece of <paramref name="size"/> characters at
    /// <paramref name="start"/>: 1 if the piece is a dictionary key, plus the best
    /// score of the remaining suffix. Returns <c>Unreachable</c> when the piece
    /// does not fit or the remaining suffix cannot be tiled.
    /// </summary>
    private static int ScorePiece(string run, int start, int size, int[] bestHits, Dictionary<string, int> dict)
    {
        int next = start + size;
        if (next > run.Length || bestHits[next] == Unreachable)
        {
            return Unreachable;
        }

        int hit = dict.ContainsKey(run.Substring(start, size)) ? 1 : 0;
        return hit + bestHits[next];
    }
}
}
}