/// <summary>
/// Downloads the HTML source of the given URL and returns it as decoded text.
/// The encoding is chosen in this order: the charset named in the Content-Type
/// response header, then the charset declared in a meta tag of the page, then
/// <paramref name="charSet"/> (or gb2312 when that is empty).
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="charSet">
/// Encoding name used for the first decoding pass when the response header names no
/// usable charset. Pass "" to fall back to gb2312 and let a meta-tag charset take over.
/// </param>
/// <returns>
/// The decoded page text; an empty string when the response has no body stream; or
/// "Error:" followed by the exception message when anything fails.
/// </returns>
public string GetHttpSource(string url, string charSet = "")
{
    HttpWebRequest request = null;
    try
    {
        request = (HttpWebRequest)WebRequest.Create(url);
        request.Proxy = null;
        request.Timeout = 15 * 1000; // 15-second request timeout
        request.Accept = "*/*";
        request.UserAgent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/536.1";
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate; // transparently unpack gzip/deflate

        // No ReadTimeout is set on the response stream: the original author noted it is
        // not supported once AutomaticDecompression is enabled.
        using (var response = (HttpWebResponse)request.GetResponse())
        using (var stream = response.GetResponseStream())
        {
            if (stream == null)
            {
                // Nothing to read; the previous implementation spun forever in this case.
                return string.Empty;
            }

            // 1) Prefer the charset announced in the Content-Type header (the header may be absent).
            var contentType = response.Headers["Content-Type"] ?? string.Empty;
            var headerMatch = Regex.Match(contentType, "charset\\s*=\\s*[\"']?([^\\s;\"']+)", RegexOptions.IgnoreCase);
            var headerEncoding = ResolveEncodingOrNull(headerMatch.Groups[1].Value);
            if (headerEncoding != null)
            {
                using (var reader = new StreamReader(stream, headerEncoding))
                {
                    return reader.ReadToEnd();
                }
            }

            // 2) Buffer the raw bytes so they can be decoded more than once.
            byte[] rawBytes;
            using (var buffered = new MemoryStream())
            {
                var chunk = new byte[1024];
                int read;
                while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
                {
                    buffered.Write(chunk, 0, read);
                }
                rawBytes = buffered.ToArray();
            }

            // 3) First pass with the caller-supplied (or default) encoding. An invalid
            //    caller-supplied name still surfaces as an "Error:" result, as before.
            if (string.IsNullOrEmpty(charSet))
            {
                charSet = "gb2312";
            }
            var fallbackEncoding = Encoding.GetEncoding(charSet);
            var pageText = DecodeBytes(rawBytes, fallbackEncoding);

            // 4) If the page declares a different, recognizable charset in a meta tag, decode again with it.
            var metaMatch = Regex.Match(pageText, "<meta[^>]*?charset\\s*=\\s*[\"']?([^\\s\"'/>;]+)", RegexOptions.IgnoreCase);
            var metaEncoding = ResolveEncodingOrNull(metaMatch.Groups[1].Value);
            if (metaEncoding != null && !metaEncoding.Equals(fallbackEncoding))
            {
                pageText = DecodeBytes(rawBytes, metaEncoding);
            }

            return pageText;
        }
    }
    catch (Exception ex)
    {
        return "Error:" + ex.Message;
    }
    finally
    {
        // Release the connection on both the success and the failure path.
        if (request != null)
        {
            request.Abort();
        }
    }
}

/// <summary>
/// Looks up an encoding by name, returning null instead of throwing when the name is
/// empty or not recognized.
/// </summary>
private static Encoding ResolveEncodingOrNull(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name.Trim());
    }
    catch (ArgumentException)
    {
        return null;
    }
    catch (NotSupportedException)
    {
        return null;
    }
}

/// <summary>
/// Decodes a byte buffer through a StreamReader using the given encoding.
/// </summary>
private static string DecodeBytes(byte[] data, Encoding encoding)
{
    using (var reader = new StreamReader(new MemoryStream(data), encoding))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Removes HTML markup from the given content by running it through a fixed,
/// ordered list of patterns, each replaced with an empty string via ReplaceHtml.
/// </summary>
/// <param name="Content">Text to clean.</param>
/// <returns>The text after every pattern in the list has been applied.</returns>
public string ClearHtml(string Content)
{
    // Order matters: each pass works on the output of the previous one.
    // ReplaceHtml is defined outside this block; presumably it performs a regex replace.
    string[] patterns =
    {
        "&#[^>]*;",
        "</?marquee[^>]*>",
        "</?object[^>]*>",
        "</?param[^>]*>",
        "</?embed[^>]*>",
        "</?table[^>]*>",
        // NOTE(review): this pattern is a single whitespace character as it appears in the file — confirm it is intended.
        " ",
        "</?tr[^>]*>",
        "</?th[^>]*>",
        "</?p[^>]*>",
        "</?a[^>]*>",
        "</?img[^>]*>",
        "</?tbody[^>]*>",
        "</?li[^>]*>",
        "</?span[^>]*>",
        "</?div[^>]*>",
        "</?th[^>]*>",
        "</?td[^>]*>",
        "</?script[^>]*>",
        "(javascript|jscript|vbscript|vbs):",
        "on(mouse|exit|error|click|key)",
        "<\\?xml[^>]*>",
        "<\\/?[a-z]+:[^>]*>",
        "</?font[^>]*>",
        "</?b[^>]*>",
        "</?u[^>]*>",
        "</?i[^>]*>",
        "</?strong[^>]*>"
    };

    var cleaned = Content;
    foreach (var pattern in patterns)
    {
        cleaned = ReplaceHtml(pattern, "", cleaned);
    }

    return cleaned;
}
/// <summary>
/// Extracts links from HTML source using a caller-supplied regular expression.
/// When the HTML has no closing body tag it is treated as incomplete and the page is
/// fetched again from <paramref name="url"/> (a bounded number of times) before matching.
/// </summary>
/// <param name="html">HTML source to scan; null is treated as empty.</param>
/// <param name="url">Address used to re-fetch the page when <paramref name="html"/> looks incomplete.</param>
/// <param name="strReg">
/// Regular expression applied with RegexOptions.Singleline; the value of its named
/// group "key" is collected from every match.
/// </param>
/// <returns>The "key" group value of every match, in match order.</returns>
public string[] GetLinks(string html, string url, string strReg)
{
    // Bounded so that a page that never yields a closing body tag (including the
    // "Error:..." text GetHttpSource returns on failure) cannot recurse forever.
    const int maxRefetchAttempts = 3;

    var source = html ?? string.Empty;
    for (var attempt = 0; attempt < maxRefetchAttempts && !HasClosingBodyTag(source); attempt++)
    {
        var refetched = GetHttpSource(url, "");
        if (HasClosingBodyTag(refetched))
        {
            // Use the complete document for matching instead of the truncated one.
            source = refetched;
        }
    }

    return new Regex(strReg, RegexOptions.Singleline)
        .Matches(source)
        .Cast<Match>()
        .Select(match => match.Groups["key"].Value)
        .ToArray();
}

/// <summary>
/// Reports whether the text contains a closing body tag, ignoring letter case.
/// </summary>
private static bool HasClosingBodyTag(string text)
{
    return !string.IsNullOrEmpty(text)
        && text.IndexOf("</body>", StringComparison.OrdinalIgnoreCase) >= 0;
}