表结构:
-- Search-engine lookup table: one row per engine (the sample rows listed
-- after this statement show google / yahoo / bing).
CREATE TABLE dbo.SearchSet
(
    -- Auto-incrementing surrogate key.
    SearchSetID int IDENTITY(1, 1) NOT NULL,
    -- Short engine name, e.g. 'google'.
    SiteName    varchar(50)   NULL,
    -- Host names of the engine; the sample rows store several hosts in one
    -- value separated by ';'.
    SiteFlag    varchar(1000) NULL,
    -- Sample rows hold 'q' / 'p' here -- presumably the query-string
    -- parameter that carries the search keyword (TODO confirm).
    KeyWordFlag varchar(255)  NULL,
    CONSTRAINT PK_SEARCHSET
        PRIMARY KEY CLUSTERED (SearchSetID ASC)
        WITH (
            PAD_INDEX              = OFF,
            STATISTICS_NORECOMPUTE = OFF,
            IGNORE_DUP_KEY         = OFF,
            ALLOW_ROW_LOCKS        = ON,
            ALLOW_PAGE_LOCKS       = ON
        )
        ON [PRIMARY]
);
数据:
SearchSetID SiteName SiteFlag KeyWordFlag
1 google www.google.com;www.google.ca;www.google.co.uk;www.google.de; q
2 yahoo cn.search.yahoo.com;search.yahoo.com p
3 bing www.bing.com;cn.bing.com q
-- One row per recorded search keyword. The table declares no primary key,
-- no foreign keys, and every column is nullable.
CREATE TABLE dbo.SearchKeyWord
(
    -- Identifier of the click the keyword belongs to
    -- (NOTE(review): the referenced table is not shown here).
    ClickID     int          NULL,
    -- Presumably points at dbo.SearchSet.SearchSetID; no FK is declared.
    SearchSetID int          NULL,
    -- The keyword text.
    KeyWord     varchar(255) NULL
)
ON [PRIMARY];
/// <summary>
/// Parses a search-engine entry URL and returns the decoded search keyword.
/// </summary>
/// <param name="url">
/// The entry URL. Recognised engines are matched by substring: baidu.com,
/// soso.com, google.com, yahoo.com / yahoo.cn, bing.com, sogou.com and
/// youdao.com.
/// </param>
/// <returns>
/// The URL-decoded keyword, or an empty string when <paramref name="url"/>
/// is null or empty, the engine is not recognised, or the keyword parameter
/// is absent.
/// </returns>
private static string GetKeyWord(string url)
{
    // A null url previously slipped past the (url != "") check and crashed
    // on the first Contains call; treat it the same as an empty url.
    if (string.IsNullOrEmpty(url))
    {
        return "";
    }

    string raw = "";
    foreach (string paramName in GetKeyWordParamNames(url))
    {
        string value = FindQueryValue(url, paramName);
        if (value != null)
        {
            // The first parameter that is present wins, even if it is empty.
            raw = value;
            break;
        }
    }

    return DecodeKeyWord(raw);
}

/// <summary>
/// Returns, in priority order, the query parameter names that may carry the
/// keyword for the engine that <paramref name="url"/> belongs to. Engines
/// are tested in a fixed order and the first match wins. Returns an empty
/// array for an unrecognised engine.
/// </summary>
private static string[] GetKeyWordParamNames(string url)
{
    if (url.Contains("baidu.com")) { return new string[] { "wd", "word" }; }
    if (url.Contains("soso.com")) { return new string[] { "w" }; }
    if (url.Contains("google.com")) { return new string[] { "q" }; }
    if (url.Contains("yahoo.com") || url.Contains("yahoo.cn"))
    {
        // The SearchSet sample data lists "p" as Yahoo's keyword parameter;
        // "q" is still accepted as a fallback for existing callers.
        return new string[] { "p", "q" };
    }
    if (url.Contains("bing.com")) { return new string[] { "q" }; }
    if (url.Contains("sogou.com")) { return new string[] { "query" }; }
    if (url.Contains("youdao.com")) { return new string[] { "q" }; }
    return new string[0];
}

/// <summary>
/// Returns the still-encoded value of query parameter
/// <paramref name="name"/>, or null when the parameter is not present.
/// The name must directly follow '?', '&amp;' or '#', so that "q=" is not
/// found inside "aq=" or "oq=". The value runs up to the next '&amp;' or to
/// the end of the URL.
/// </summary>
private static string FindQueryValue(string url, string name)
{
    string token = name + "=";
    int pos = url.IndexOf(token, System.StringComparison.Ordinal);
    while (pos >= 0)
    {
        if (pos > 0 && IsQueryDelimiter(url[pos - 1]))
        {
            int start = pos + token.Length;
            int end = url.IndexOf('&', start);
            return end < 0 ? url.Substring(start) : url.Substring(start, end - start);
        }
        // Not at a parameter boundary (e.g. the "q=" inside "aq="); keep looking.
        pos = url.IndexOf(token, pos + 1, System.StringComparison.Ordinal);
    }
    return null;
}

/// <summary>True for the characters that may precede a parameter name.</summary>
private static bool IsQueryDelimiter(char c)
{
    return c == '?' || c == '&' || c == '#';
}

/// <summary>
/// URL-decodes a raw keyword such as "+asd+%B0%A212+asdf120+%B0%A2%CB%B9asdf".
/// Runs of two or more consecutive %XX escapes are decoded as UTF-8 or
/// GB2312 (decided per run); everything between the runs is decoded with
/// Encoding.Default.
/// </summary>
private static string DecodeKeyWord(string raw)
{
    if (!raw.Contains("%"))
    {
        return System.Web.HttpUtility.UrlDecode(raw, Encoding.Default);
    }

    Regex escapeRuns = new Regex(@"(?<name>((%\w{2}){2,}))");
    string decoded = "";
    int cursor = 0; // index of the first character of raw not yet consumed

    foreach (Match run in escapeRuns.Matches(raw))
    {
        // Use the match position rather than searching for the run's text:
        // a text search finds the first occurrence, which is the wrong one
        // when the same run repeats (e.g. "a%CC%CCa%CC%CC").
        if (run.Index > cursor)
        {
            decoded += System.Web.HttpUtility.UrlDecode(
                raw.Substring(cursor, run.Index - cursor), Encoding.Default);
        }
        decoded += DecodeEscapeRun(run.Value);
        cursor = run.Index + run.Length;
    }

    if (cursor < raw.Length)
    {
        decoded += System.Web.HttpUtility.UrlDecode(raw.Substring(cursor), Encoding.Default);
    }
    return decoded;
}

/// <summary>
/// Decodes one run of %XX escapes, choosing between UTF-8 and GB2312.
/// The run is taken to be UTF-8 when its escape count equals three per
/// decoded character, less two for every escape counted by GetSubstrNum;
/// otherwise it is decoded as GB2312.
/// </summary>
private static string DecodeEscapeRun(string run)
{
    string asUtf8 = System.Web.HttpUtility.UrlDecode(run, Encoding.UTF8);

    int escapeCount = run.Split('%').Length - 1;
    int specialCount = GetSubstrNum(run);
    if (escapeCount == (asUtf8.Length * 3) - 2 * specialCount)
    {
        return asUtf8;
    }

    Encoding gb2312 = GetGb2312OrNull();
    if (gb2312 == null)
    {
        // Best effort: without a GB2312 code page keep the UTF-8 reading of
        // this run instead of failing the whole keyword.
        return asUtf8;
    }
    return System.Web.HttpUtility.UrlDecode(run, gb2312);
}

/// <summary>
/// Returns the GB2312 encoding, or null when the runtime cannot provide it.
/// </summary>
private static Encoding GetGb2312OrNull()
{
    try
    {
        return Encoding.GetEncoding("GB2312");
    }
    catch (System.ArgumentException)
    {
        // Encoding name not known to this runtime.
        return null;
    }
    catch (System.NotSupportedException)
    {
        // Code page not supported on this platform.
        return null;
    }
}
/// <summary>
/// Counts the escapes in <paramref name="url"/> that stand for one of the
/// reserved URL characters below (matched case-insensitively):
///   %2B  '+'  (a literal plus; an unescaped '+' in a URL means a space)
///   %20  ' '  (a space; may also be written as '+')
///   %2F  '/'  (separates directories and sub-directories)
///   %3F  '?'  (separates the URL proper from its parameters)
///   %25  '%'  (introduces an escaped character)
///   %23  '#'  (bookmark / fragment marker)
///   %26  '&amp;'  (separator between parameters)
///   %3D  '='  (separates a parameter name from its value)
/// </summary>
/// <param name="url">The still-encoded text to scan.</param>
/// <returns>The number of matching escapes found.</returns>
private static int GetSubstrNum(string url)
{
    const string specialEscapes = "(%2B)|(%20)|(%2F)|(%3F)|(%25)|(%23)|(%26)|(%3D)";
    return Regex.Matches(url, specialEscapes, RegexOptions.IgnoreCase).Count;
}
}