页面访问统计 提取url中的关键字

表结构:

-- Search-engine configuration table: one row per search site.
CREATE TABLE [dbo].[SearchSet]
(
    [SearchSetID] [int] IDENTITY(1,1) NOT NULL,  -- auto-incrementing surrogate key
    [SiteName]    [varchar](50)   NULL,          -- short site name (sample rows: google, yahoo, bing)
    [SiteFlag]    [varchar](1000) NULL,          -- semicolon-separated host names in the sample rows
    [KeyWordFlag] [varchar](255)  NULL,          -- presumably the query-string parameter that carries the keyword (sample rows: q, p) -- TODO confirm
    CONSTRAINT [PK_SEARCHSET] PRIMARY KEY CLUSTERED
    (
        [SearchSetID] ASC
    )
    WITH
    (
        PAD_INDEX = OFF,
        STATISTICS_NORECOMPUTE = OFF,
        IGNORE_DUP_KEY = OFF,
        ALLOW_ROW_LOCKS = ON,
        ALLOW_PAGE_LOCKS = ON
    )
    ON [PRIMARY]
)

数据:

SearchSetID    SiteName    SiteFlag                                                                                                       KeyWordFlag
1                       google         www.google.com;www.google.ca;www.google.co.uk;www.google.de;        q
2                       yahoo          cn.search.yahoo.com;search.yahoo.com                                                     p
3                       bing            www.bing.com;cn.bing.com                                                                           q

 

-- Extracted search keywords, one row per recorded keyword.
-- No primary key or foreign key is declared on this table.
CREATE TABLE [dbo].[SearchKeyWord]
(
    [ClickID]     [int]          NULL,  -- presumably identifies the page visit the keyword came from -- TODO confirm
    [SearchSetID] [int]          NULL,  -- presumably refers to [dbo].[SearchSet].[SearchSetID]; not enforced by a constraint
    [KeyWord]     [varchar](255) NULL   -- keyword text
)
ON [PRIMARY]

 

//分析entryurl 获取关键字
        /// <summary>
        /// Extracts the search keyword from a search-engine URL and URL-decodes it.
        /// </summary>
        /// <remarks>
        /// The engine is recognised by a host fragment contained in the URL
        /// (baidu, soso, google, yahoo, bing, sogou, youdao, checked in that order);
        /// the first matching engine decides which query parameter is read.
        /// Runs of two or more consecutive %XX escapes are decoded as UTF-8 or GB2312
        /// (see <see cref="DecodeEscapedRun"/>); all other text is decoded with
        /// <c>Encoding.Default</c>.
        /// </remarks>
        /// <param name="url">The URL to analyse; may be null or empty.</param>
        /// <returns>
        /// The decoded keyword, or an empty string when the URL is null/empty, the
        /// engine is not recognised, or the expected parameter is absent.
        /// </returns>
        private static string GetKeyWord(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return "";
            }

            return DecodeKeyWordText(ExtractRawKeyWord(url));
        }

        /// <summary>
        /// Picks the still-encoded keyword value out of <paramref name="url"/> according to
        /// the search engine it belongs to. Returns "" when nothing applies.
        /// </summary>
        private static string ExtractRawKeyWord(string url)
        {
            if (url.Contains("baidu.com"))
            {
                // Baidu: "wd=" takes precedence, "word=" is only tried when "wd=" is absent.
                return ExtractQueryValue(url, "wd=") ?? ExtractQueryValue(url, "word=") ?? "";
            }
            if (url.Contains("soso.com"))
            {
                return ExtractQueryValue(url, "w=") ?? "";
            }
            if (url.Contains("google.com"))
            {
                return ExtractQueryValue(url, "q=") ?? "";
            }
            if (url.Contains("yahoo.com") || url.Contains("yahoo.cn"))
            {
                // NOTE(review): the sample SearchSet data lists "p" as Yahoo's keyword
                // parameter, but this code has always read "q=" - confirm which is intended.
                return ExtractQueryValue(url, "q=") ?? "";
            }
            if (url.Contains("bing.com"))
            {
                return ExtractQueryValue(url, "q=") ?? "";
            }
            if (url.Contains("sogou.com"))
            {
                return ExtractQueryValue(url, "query=") ?? "";
            }
            if (url.Contains("youdao.com"))
            {
                return ExtractQueryValue(url, "q=") ?? "";
            }

            return "";
        }

        /// <summary>
        /// Returns the text following <paramref name="key"/> (e.g. "q=") up to the next '&amp;'
        /// or the end of the URL, or null when the key is not found after position 0.
        /// </summary>
        private static string ExtractQueryValue(string url, string key)
        {
            int keyStart = FindQueryKey(url, key);
            if (keyStart <= 0)
            {
                return null;
            }

            string value = url.Substring(keyStart + key.Length);
            int ampersand = value.IndexOf('&');
            return ampersand >= 0 ? value.Substring(0, ampersand) : value;
        }

        /// <summary>
        /// Locates <paramref name="key"/> in <paramref name="url"/>, preferring an occurrence that
        /// starts a parameter (directly after '?', '&amp;' or '#').
        /// </summary>
        private static int FindQueryKey(string url, string key)
        {
            // A plain IndexOf("q=") would also hit "aq=" or "oq=", returning the wrong
            // parameter's value, so look for a real parameter boundary first.
            int at = url.IndexOf(key, System.StringComparison.Ordinal);
            while (at >= 0)
            {
                if (at > 0)
                {
                    char before = url[at - 1];
                    if (before == '?' || before == '&' || before == '#')
                    {
                        return at;
                    }
                }
                at = url.IndexOf(key, at + 1, System.StringComparison.Ordinal);
            }

            // Legacy behaviour: fall back to the first occurrence anywhere in the URL.
            return url.IndexOf(key);
        }

        /// <summary>
        /// URL-decodes a raw keyword value, choosing the encoding separately for every run
        /// of consecutive %XX escapes.
        /// </summary>
        private static string DecodeKeyWordText(string raw)
        {
            if (!raw.Contains("%"))
            {
                return System.Web.HttpUtility.UrlDecode(raw, Encoding.Default);
            }

            Regex escapedRuns = new Regex(@"(?<name>((%\w{2}){2,}))");
            StringBuilder decoded = new StringBuilder();
            int consumed = 0; // index of the first character of raw not yet emitted

            foreach (Match m in escapedRuns.Matches(raw))
            {
                // Use the match's own position: searching for the run's text again
                // (IndexOf) finds the FIRST occurrence and corrupts the output when the
                // same escaped run appears more than once, e.g. "a%CC%CCa%CC%CC".
                if (m.Index > consumed)
                {
                    decoded.Append(System.Web.HttpUtility.UrlDecode(
                        raw.Substring(consumed, m.Index - consumed), Encoding.Default));
                }

                decoded.Append(DecodeEscapedRun(m.Groups["name"].Value));
                consumed = m.Index + m.Length;
            }

            if (consumed < raw.Length)
            {
                decoded.Append(System.Web.HttpUtility.UrlDecode(raw.Substring(consumed), Encoding.Default));
            }

            return decoded.ToString();
        }

        /// <summary>
        /// Decodes one run of %XX escapes as UTF-8 when it looks like UTF-8, otherwise as GB2312.
        /// </summary>
        private static string DecodeEscapedRun(string run)
        {
            string asUtf8 = System.Web.HttpUtility.UrlDecode(run, Encoding.UTF8);

            // Heuristic kept from the original code: the run is taken to be UTF-8 when its
            // escape count equals 3 per decoded character, minus 2 for every reserved
            // single-byte escape counted by GetSubstrNum (%20, %2B, ...).
            int escapeCount = run.Split('%').Length - 1;
            int reservedCount = GetSubstrNum(run);
            if (escapeCount == (asUtf8.Length * 3) - 2 * reservedCount)
            {
                return asUtf8;
            }

            try
            {
                return System.Web.HttpUtility.UrlDecode(run, Encoding.GetEncoding("GB2312"));
            }
            catch (System.ArgumentException)
            {
                // GB2312 is not available on this runtime: keep the UTF-8 reading
                // rather than failing the whole extraction.
                return asUtf8;
            }
            catch (System.NotSupportedException)
            {
                return asUtf8;
            }
        }


        // Reserved URL characters and their percent-encoded forms:
        //   1. +      a '+' in a URL stands for a space              -> %2B
        //   2. space  may be written as '+' or percent-encoded       -> %20
        //   3. /      separates directories and sub-directories      -> %2F
        //   4. ?      separates the actual URL from its parameters   -> %3F
        //   5. %      introduces an escaped character                -> %25
        //   6. #      marks a bookmark                               -> %23
        //   7. &      separates parameters                           -> %26
        //   8. =      assigns a value to a parameter                 -> %3D

        /// <summary>
        /// Counts how many of the reserved-character escapes listed above occur in
        /// <paramref name="url"/>; hex digits are matched case-insensitively.
        /// </summary>
        /// <param name="url">Text to scan (typically a run of %XX escapes).</param>
        /// <returns>The number of matches found.</returns>
        private static int GetSubstrNum(string url)
        {
            const string reservedEscapes = "(%2B)|(%20)|(%2F)|(%3F)|(%25)|(%23)|(%26)|(%3D)";

            return Regex.Matches(url, reservedEscapes, RegexOptions.IgnoreCase).Count;
        }
    }


 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值