中英文关键字生成器



 中英文关键字生成器:

中文部分按 2 字和 3 字的片段进行切分,并选取词典命中数最多的切分方式;英文部分保留原词,且长度至少为 2 个字母。

参见我的这篇文章:http://www.cnblogs.com/dullwolf/archive/2011/04/14/2015539.html
(倒排索引:中文维持 2+3 长度的重要性)。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace ConsoleApplication2
{
    class Program
    {
        // Demo word list handed to KeyMaker; Main stores every entry with the value 0.
        static Dictionary<string, int> WordIndex = new Dictionary<string, int>();

        /// <summary>
        /// Demo entry point: fills <c>WordIndex</c> with sample words, prints the keywords
        /// generated for two sample sentences, prints a separator line and then waits for
        /// console input before returning.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Same words, in the same insertion order, as the demo has always registered.
            string[] knownWords =
            {
                "上海", "上海市", "质量", "技术", "监督", "监督局", "吊销", "染色",
                "馒头", "加工", "工厂", "加工厂", "生产", "许可", "许可证"
            };
            foreach (string word in knownWords)
            {
                WordIndex.Add(word, 0);
            }

            KeyMaker maker = new KeyMaker();

            // First sample is one clean sentence; the second has stray Latin letters
            // breaking up the Chinese runs.
            string[] samples =
            {
                @"上海市质量技术监督局吊销了Shanghai ABC染色馒头加工厂的生产许可证",
                @"上A海市c质量c技术监督局aa吊销了Shanghai ABC染色馒头厂的生产许可证"
            };
            foreach (string sample in samples)
            {
                Console.WriteLine(maker.GetMaxHitKey(sample, WordIndex));
            }

            Console.WriteLine("----");
            // Keep the console window open until a key is pressed.
            Console.Read();
        }

        /// <summary>
        /// Generates search keywords from mixed Chinese/English text.
        /// Each run of Chinese characters is cut into pieces of 2 or 3 characters, choosing
        /// the cut whose pieces hit the supplied dictionary most often. Each run of ASCII
        /// letters (at least 2 long) is kept whole, in lower case.
        /// </summary>
        public class KeyMaker
        {
            // Runs of two or more characters in U+4E00..U+9FA5; a lone character is ignored.
            private static readonly Regex ChineseRun = new Regex(@"[\u4e00-\u9fa5]{2,}");

            // Runs of two or more lowercase ASCII letters (input is lowercased before matching).
            private static readonly Regex EnglishRun = new Regex(@"[a-z]{2,}");

            // Marks a suffix that cannot be cut into pieces of 2 or 3 (i.e. a suffix of length 1).
            private const int Unreachable = -1;

            /// <summary>
            /// Builds the space-separated keyword string for <paramref name="text"/>.
            /// Keywords from Chinese runs come first (in order of appearance), followed by
            /// English words (in order of appearance); each keyword is emitted once.
            /// </summary>
            /// <param name="text">Input text; null or empty yields an empty string.</param>
            /// <param name="dict">
            /// Known words. Only key membership is consulted; the values are not read.
            /// </param>
            /// <returns>Keywords joined by single spaces, or an empty string if there are none.</returns>
            public string GetMaxHitKey(string text, Dictionary<string, int> dict)
            {
                if (string.IsNullOrEmpty(text))
                {
                    return "";
                }

                // Invariant lowering: the result is matched against [a-z], so a culture-specific
                // mapping (e.g. Turkish 'I' -> dotless 'ı') must not be applied.
                string lowered = text.ToLowerInvariant();

                // List + set gives first-seen order explicitly instead of relying on
                // Dictionary enumeration order, which is unspecified.
                List<string> ordered = new List<string>();
                HashSet<string> seen = new HashSet<string>();

                foreach (Match run in ChineseRun.Matches(lowered))
                {
                    foreach (string piece in SegmentChineseRun(run.Value, dict))
                    {
                        if (seen.Add(piece))
                        {
                            ordered.Add(piece);
                        }
                    }
                }

                foreach (Match word in EnglishRun.Matches(lowered))
                {
                    if (seen.Add(word.Value))
                    {
                        ordered.Add(word.Value);
                    }
                }

                return string.Join(" ", ordered.ToArray());
            }

            /// <summary>
            /// Cuts one Chinese run into consecutive pieces of length 2 or 3 so that the number
            /// of pieces present in <paramref name="dict"/> is maximal. When several cuts tie,
            /// the one that takes a 3-character piece at the earliest differing position wins.
            /// </summary>
            /// <param name="run">A run of Chinese characters; runs shorter than 2 give no pieces.</param>
            /// <param name="dict">Known words; only key membership is consulted.</param>
            /// <returns>The pieces in left-to-right order (never containing an empty string).</returns>
            private static List<string> SegmentChineseRun(string run, Dictionary<string, int> dict)
            {
                List<string> pieces = new List<string>();
                int n = run.Length;
                if (n < 2)
                {
                    return pieces;
                }

                // best[i]: highest hit count achievable for run[i..n), or Unreachable.
                // step[i]: length of the first piece (2 or 3) in that best cut.
                int[] best = new int[n + 1];
                int[] step = new int[n + 1];
                best[n] = 0;
                best[n - 1] = Unreachable; // a single trailing character cannot form a piece

                for (int i = n - 2; i >= 0; i--)
                {
                    int bestScore = Unreachable;
                    int bestLen = 0;

                    // Length 3 is tried first and only a strictly better score replaces it,
                    // so ties resolve in favour of the 3-character piece.
                    for (int len = 3; len >= 2; len--)
                    {
                        int next = i + len;
                        if (next > n || best[next] == Unreachable)
                        {
                            continue;
                        }

                        int score = best[next] + (dict.ContainsKey(run.Substring(i, len)) ? 1 : 0);
                        if (score > bestScore)
                        {
                            bestScore = score;
                            bestLen = len;
                        }
                    }

                    best[i] = bestScore;
                    step[i] = bestLen;
                }

                // Every suffix of length >= 2 is reachable, so step[pos] is 2 or 3 along this walk.
                for (int pos = 0; pos < n; pos += step[pos])
                {
                    pieces.Add(run.Substring(pos, step[pos]));
                }

                return pieces;
            }
        }

    }
}

转载于:https://www.cnblogs.com/dullwolf/archive/2011/04/14/2016107.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值