使用 HtmlAgilityPack 类库解析 HTML 非常方便,网上的相关资料很多,可以自行搜索了解。
下面上一个非常简单的小例子
要爬取的信息如下:
首先要引用HtmlAgilityPack.dll文件
上代码:
/// <summary>
/// Downloads one page and extracts the title and address text from it
/// using HtmlAgilityPack XPath queries.
/// </summary>
/// <remarks>
/// The extracted values are only held in locals; this method is a usage
/// sample and does not return or store them.
/// </remarks>
internal void Run()
{
    string url = "爬取网站URL";
    string res = HttpTool.Excute(url); // send the request and get the page

    // Excute returns null when every attempt failed; there is nothing to parse then.
    if (string.IsNullOrEmpty(res))
    {
        return;
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(res); // load the html
    HtmlNode documentNode = doc.DocumentNode;

    // SelectSingleNode returns null when the XPath matches nothing, so check
    // each node before reading InnerText instead of dereferencing blindly.
    HtmlNode titleNode = documentNode.SelectSingleNode("//div[@class='main-title']/h1");
    HtmlNode addressNode = documentNode.SelectSingleNode("//div[@class='location']/span");

    string hname = titleNode == null ? string.Empty : titleNode.InnerText.Trim();
    string address = addressNode == null ? string.Empty : addressNode.InnerText.Trim();
}
HttpTool 类的完整代码如下:
using System;
using System.IO;
using System.Text;
using System.Net;
namespace Spider
{
    /// <summary>
    /// Small synchronous HTTP helper built on <see cref="HttpWebRequest"/>.
    /// Constructing an instance sends the request (GET, or POST when post data
    /// is supplied) and reads the whole response body into <see cref="Result"/>.
    /// </summary>
    public class HttpTool : IDisposable
    {
        /// <summary>Number of attempts <see cref="Excute"/> makes before giving up.</summary>
        private const int MaxAttempts = 3;

        /// <summary>Set once the response has been released by <see cref="Dispose()"/>.</summary>
        private bool _disposed;

        /// <summary>
        /// Requests <paramref name="url"/> and returns the response body as text,
        /// trying up to three times.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="data">Optional POST body; when null or empty a GET is sent.</param>
        /// <returns>The response body, or null when every attempt failed.</returns>
        public static string Excute(string url, string data = null)
        {
            for (int attempt = 1; attempt <= MaxAttempts; attempt++)
            {
                try
                {
                    using (HttpTool tool = new HttpTool(url, data))
                    {
                        return tool.Result;
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort by design: the caller gets null after the last
                    // attempt, but the failure is traced instead of vanishing.
                    System.Diagnostics.Trace.TraceWarning(
                        "HttpTool.Excute attempt {0}/{1} for '{2}' failed: {3}",
                        attempt, MaxAttempts, url, ex.Message);
                }
            }
            return null;
        }

        /// <summary>Address this instance requests.</summary>
        public string Url { get; set; }

        /// <summary>The underlying request, created by <see cref="InitRequest"/>.</summary>
        public HttpWebRequest Request { get; private set; }

        /// <summary>The response obtained for <see cref="Request"/>; closed by <see cref="Dispose()"/>.</summary>
        public HttpWebResponse Response { get; private set; }

        /// <summary>The complete response body read as text.</summary>
        public string Result { get; private set; }

        /// <summary>
        /// Sends the request and reads the full response body into <see cref="Result"/>.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="postData">Optional POST body; when null or empty a GET is sent.</param>
        public HttpTool(string url, string postData = null)
        {
            Url = url;
            try
            {
                // NOTE: InitRequest is virtual, so an override runs before the
                // derived class constructor body has executed.
                InitRequest();
                if (string.IsNullOrEmpty(postData))
                {
                    InitResponse();
                }
                else
                {
                    InitResponse(postData);
                }

                // Fetch the response stream once and let using blocks release it.
                // StreamReader is created without an explicit encoding, i.e. with
                // its default (UTF-8) decoding.
                using (Stream stream = Response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream))
                {
                    Result = reader.ReadToEnd();
                }
            }
            catch
            {
                // A failed constructor never reaches the caller's using block,
                // so release the response here before propagating the error.
                CloseResponse();
                throw;
            }
        }

        /// <summary>
        /// Creates the request and sets its headers and timeouts. Override to add
        /// or change request headers as needed.
        /// </summary>
        public virtual void InitRequest()
        {
            Request = (HttpWebRequest)WebRequest.Create(Url);
            // These properties take the header VALUE only; the header name must
            // not be repeated inside the value.
            Request.Accept = "text/html, application/xhtml+xml, */*";
            Request.UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
            Request.Timeout = 6000;
            Request.ReadWriteTimeout = 6000;
        }

        /// <summary>
        /// Sends the request as a GET and stores the response.
        /// </summary>
        public void InitResponse()
        {
            Request.Method = "GET";
            Response = (HttpWebResponse)Request.GetResponse();
        }

        /// <summary>
        /// Sends the request as a POST with the given body and stores the response.
        /// </summary>
        /// <param name="postData">POST body text.</param>
        /// <param name="encoding">Encoding used to turn the body into bytes.</param>
        /// <exception cref="ArgumentNullException"><paramref name="encoding"/> is null.</exception>
        public void InitResponse(string postData, Encoding encoding)
        {
            if (encoding == null)
            {
                throw new ArgumentNullException("encoding");
            }

            Request.Method = "POST";
            byte[] body = encoding.GetBytes(postData);
            Request.ContentLength = body.Length;
            using (Stream requestStream = Request.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }
            Response = (HttpWebResponse)Request.GetResponse();
        }

        /// <summary>
        /// Sends the request as a POST with the body encoded as UTF-8.
        /// </summary>
        /// <param name="postData">POST body text.</param>
        public void InitResponse(string postData)
        {
            InitResponse(postData, Encoding.UTF8);
        }

        /// <summary>
        /// Closes the response held by this instance. Safe to call more than once.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Releases the response when called from <see cref="Dispose()"/>.
        /// </summary>
        /// <param name="disposing">True when called from <see cref="Dispose()"/>.</param>
        protected virtual void Dispose(bool disposing)
        {
            if (_disposed)
            {
                return;
            }

            if (disposing)
            {
                CloseResponse();
            }
            _disposed = true;
        }

        /// <summary>Closes the response if one was obtained.</summary>
        private void CloseResponse()
        {
            if (Response != null)
            {
                Response.Close();
            }
        }
    }
}