暂时没有事情做,所以就研究一些小东西。以前经常听人家说抓取网站数据,感觉很厉害,所以自己也来研究一下。虽然还没有成为牛人一族,但先写了一段代码,以后再慢慢改进,希望能改成像搜索引擎那样,可以随意抓取各大网站的数据。
/// <summary>
/// Scans HTML text for <c>href</c> attributes and collects their quoted values.
/// </summary>
/// <param name="strWebContent">HTML source to scan. Null or empty yields an empty result.</param>
/// <returns>
/// Every matched attribute value, still wrapped in its original quote characters,
/// each followed by a CR-LF line break. An empty string when nothing matches.
/// </returns>
private string GetUrl(string strWebContent)
{
    if (string.IsNullOrEmpty(strWebContent))
    {
        return "";
    }

    // Group 1 captures only the quoted value, so the attribute name is dropped
    // regardless of its casing or of spaces around '='. The character class
    // rejects values containing quotes, '#' (fragment links) or '>'.
    string strRef = @"href[ ]*=[ ]*([""'][^""'#>]+[""'])";

    StringBuilder result = new StringBuilder();
    foreach (Match match in Regex.Matches(strWebContent, strRef, RegexOptions.IgnoreCase))
    {
        // A real line break is used as the separator; a slash-based separator
        // would be indistinguishable from the path separators inside URLs.
        result.Append(match.Groups[1].Value).Append("\r\n");
    }
    return result.ToString();
}
/// <summary>
/// Returns the regular-expression pattern that matches a quoted <c>href</c>
/// attribute (lowercase or uppercase attribute name, optional spaces around '=').
/// This overload does not perform any matching itself.
/// </summary>
/// <returns>The pattern text.</returns>
private string GetUrl()
{
    // The original built a Regex from this pattern and discarded it; only the
    // pattern string was ever returned, so the unused object is gone.
    return @"(href|HREF)[ ]*=[ ]*[""'][^""'#>]+[""']";
}
/// <summary>
/// Removes HTML tags from a string and then decodes the <c>&amp;lt;</c> and
/// <c>&amp;gt;</c> entities into literal angle brackets.
/// </summary>
/// <param name="strHtml">The text to clean. Null or empty yields an empty string.</param>
/// <returns>The text with every <c>&lt;...&gt;</c> tag deleted (not replaced by a space).</returns>
private string stripHtml(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return "";
    }

    // Singleline lets '.' match line breaks as well, so a tag whose attributes
    // wrap onto the next line is still removed as one unit.
    Regex tagPattern = new Regex("<.+?>", RegexOptions.Singleline);
    string strOutput = tagPattern.Replace(strHtml, "");

    // Decode entities only after tag removal; decoding first would turn
    // escaped text such as "&lt;b&gt;" into a tag and delete it.
    strOutput = strOutput.Replace("&lt;", "<");
    strOutput = strOutput.Replace("&gt;", ">");
    return strOutput;
}
/// <summary>
/// Extracts the text between the first <c>&lt;title&gt;</c> and
/// <c>&lt;/title&gt;</c> tags of an HTML document, ignoring tag case.
/// </summary>
/// <param name="strWebContent">HTML source to search.</param>
/// <returns>The title text, or an empty string when no title element is found.</returns>
private string GetTitle(string strWebContent)
{
    const string titlePattern = "<title>([^<]*)</title>";
    RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline;

    Regex titleRegex = new Regex(titlePattern, options);
    Match hit = titleRegex.Match(strWebContent);

    // Group 1 of an unsuccessful match has an empty Value, so no Success check is needed.
    Group titleText = hit.Groups[1];
    return titleText.Value;
}
/// <summary>
/// Extracts the <c>content</c> value of the page's
/// <c>&lt;meta name="description" content="..."&gt;</c> element.
/// Matching ignores case and accepts both <c>&gt;</c> and <c>/&gt;</c> tag endings.
/// </summary>
/// <param name="strWebContent">HTML source to search.</param>
/// <returns>The description text, or an empty string when no such meta element is found.</returns>
private string GetDescription(string strWebContent)
{
    // A verbatim string keeps the embedded quotes readable. The capture stops
    // at the closing quote ([^"]*) so it cannot run on to a later '">' in the
    // document, and \s+ tolerates any whitespace between attributes.
    const string pattern = @"<meta\s+name=""description""\s+content=""([^""]*)""\s*/?>";

    Match Desc = Regex.Match(strWebContent, pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    return Desc.Groups[1].Value;
}
/// <summary>
/// Downloads the page at the given address and returns its body as text,
/// decoded as GB2312.
/// </summary>
/// <param name="Url">Address of the page to fetch.</param>
/// <returns>
/// The response body, or an empty string when the request cannot be created,
/// fails, or times out. This method is best-effort and does not throw.
/// </returns>
private string GetWebContent(string Url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        // 30-second timeout (the property is expressed in milliseconds).
        request.Timeout = 30000;
        // Ask intermediaries not to serve a cached copy.
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, stream and reader deterministically; leaving
        // them to the garbage collector keeps the underlying connection open.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
        {
            // NOTE(review): the encoding is fixed to GB2312; pages served in
            // another charset will decode incorrectly.
            return streamReader.ReadToEnd();
        }
    }
    catch (System.Exception ex)
    {
        // Callers rely on "" meaning "could not fetch", so the failure is
        // still absorbed here, but it is traced instead of vanishing silently.
        System.Diagnostics.Trace.TraceWarning("GetWebContent failed for '{0}': {1}", Url, ex.Message);
        return "";
    }
}