获取HTML页面内容大致分为两类:一类是获取整个页面的内容,典型的应用场景是访问量较大、又需要频繁刷新的页面——此时可将该页面生成静态页面,并定期更新静态页面即可;
另一类则只需要获取HTML中的部分内容,这时可以使用HtmlAgilityPack,将HTML当作XML来处理,沿着文档结构一层一层往下读取。
下面我们一一介绍下使用方法:
一、使用WebRequest:
调用方式如下:
HttpWebResponseUtility.CreateGetHttpResponse(url, null);拿到字符串格式的HTML文本后,就可以将其写入新的HTML文件中。
方法具体实现为:
/// <summary>
/// Helper for issuing a simple HTTP GET request and returning the response body as text.
/// </summary>
public class HttpWebResponseUtility
{
    // User-Agent header sent when the caller does not supply one.
    private static readonly string DefaultUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";

    /// <summary>
    /// Sends an HTTP GET request and returns the response body decoded as UTF-8 text.
    /// </summary>
    /// <param name="url">URL to request; must be non-empty and resolve to an HTTP(S) request.</param>
    /// <param name="timeout">Request timeout (milliseconds, as defined by <c>HttpWebRequest.Timeout</c>); when null the request's default is kept.</param>
    /// <param name="userAgent">User-Agent header value; when null or empty the built-in default is used.</param>
    /// <param name="cookies">Cookies to send with the request; may be null when none are needed.</param>
    /// <returns>
    /// The response body as a string. If reading the body fails, the text of the
    /// exception (<c>ToString()</c>) is returned instead of throwing.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null or empty.</exception>
    /// <exception cref="ArgumentException"><paramref name="url"/> does not produce an <c>HttpWebRequest</c> (e.g. a file:// URL).</exception>
    public static string CreateGetHttpResponse(string url, int? timeout, string userAgent = null, CookieCollection cookies = null)
    {
        if (string.IsNullOrEmpty(url))
        {
            throw new ArgumentNullException("url");
        }

        // WebRequest.Create returns other request types for non-HTTP schemes;
        // fail with a clear argument error instead of a NullReferenceException.
        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            throw new ArgumentException("Only HTTP/HTTPS URLs are supported.", "url");
        }

        request.Method = "GET";
        request.UserAgent = string.IsNullOrEmpty(userAgent) ? DefaultUserAgent : userAgent;
        if (timeout.HasValue)
        {
            request.Timeout = timeout.Value;
        }
        if (cookies != null)
        {
            request.CookieContainer = new CookieContainer();
            request.CookieContainer.Add(cookies);
        }

        // Exceptions raised by GetResponse itself propagate to the caller, as before.
        // The response is disposed so the underlying connection is always released.
        using (HttpWebResponse webResponse = (HttpWebResponse)request.GetResponse())
        {
            try
            {
                using (StreamReader reader = new StreamReader(webResponse.GetResponseStream(), System.Text.Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception exp)
            {
                // Kept for backward compatibility: a failure while reading the body
                // is reported to the caller as the exception text.
                return exp.ToString();
            }
        }
    }
}
二、使用HtmlAgilityPack:
// Load the page with HtmlAgilityPack and read the basic-info table, retrying the
// download while the server answers with a non-OK status. At least one attempt is
// always made; `retry` caps the total number of attempts.
HtmlWeb web = new HtmlWeb();
HtmlAgilityPack.HtmlDocument htmldoc = null;
// Use "?" when the URL has no query string yet, otherwise "&".
string separator = url.IndexOf('?') >= 0 ? "&" : "?";
int i = retry;
do
{
    // Reload on every attempt (previously the page was fetched only once, so a
    // "retry" merely slept). The random "r" parameter makes each URL unique.
    htmldoc = web.Load(url + separator + "r=" + Guid.NewGuid().ToString("N"));
    if (web.StatusCode != System.Net.HttpStatusCode.OK)
    {
        i--;
        this._code = "202";
        this._msg = "网页获取失败,错误:" + web.StatusCode.ToString();
        if (i > 0)
        {
            // Back off briefly, but only when another attempt will follow.
            Thread.Sleep(1000);
        }
    }
    else
    {
        var basicTable = htmldoc.DocumentNode.SelectSingleNode("//*[@id=\"layout-01_01_01\"]/div/table");
        if (basicTable == null)
        {
            this._code = "201";
            this._msg = "未找到基本信息";
            break;
        }
        foreach (var tr in basicTable.ChildNodes)
        {
            // Child nodes also include text/whitespace nodes; only handle rows.
            if (string.Equals(tr.Name, "tr", StringComparison.OrdinalIgnoreCase))
            {
                // Read the contents of this row's cells here.
            }
        }
        this._code = "100";
        this._msg = "读取成功";
    }
}
// `i > 0` (rather than `i != 0`) also terminates when retry was zero or negative.
while (web.StatusCode != System.Net.HttpStatusCode.OK && i > 0);
要获取某个节点的路径(即上面SelectSingleNode中使用的XPath),可使用下图所示的办法: