C#使用HtmlAgilityPack爬虫实例

使用HtmlAgilityPack类库解析html非常方便,网上的资料有很多,可以自行搜索了解。

下面上一个非常简单的小例子

要爬取的信息如下:



首先要引用HtmlAgilityPack.dll文件

上代码:

 /// <summary>
        /// Downloads one page and extracts the title and address text from it.
        /// </summary>
        /// <remarks>
        /// The extracted values are only held in locals; this is a demonstration of
        /// locating nodes by XPath. Either value is null when its node is not present.
        /// </remarks>
        internal void Run()
        {
            string url = "爬取网站URL";
            string res = HttpTool.Excute(url); // send the request and get the page markup

            // Excute returns null once all of its attempts have failed; there is
            // nothing to parse in that case (LoadHtml would throw on null).
            if (string.IsNullOrEmpty(res))
            {
                return;
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(res); // parse the markup
            HtmlNode documentNode = doc.DocumentNode;

            // Locate each node by XPath and read its trimmed text.
            string hname = GetTrimmedText(documentNode, "//div[@class='main-title']/h1");
            string address = GetTrimmedText(documentNode, "//div[@class='location']/span");
        }

        /// <summary>
        /// Returns the trimmed inner text of the first node matching <paramref name="xpath"/>,
        /// or null when no node matches (SelectSingleNode returns null in that case).
        /// </summary>
        private static string GetTrimmedText(HtmlNode root, string xpath)
        {
            HtmlNode node = root.SelectSingleNode(xpath);
            return node == null ? null : node.InnerText.Trim();
        }

HttpTool类

using System;
using System.IO;
using System.Text;
using System.Net;


namespace Spider
{
    /// <summary>
    /// Small synchronous HTTP helper: builds an <see cref="HttpWebRequest"/> for a URL,
    /// sends it as GET (no body) or POST (with body), and reads the whole response
    /// body into <see cref="Result"/>.
    /// </summary>
    /// <remarks>
    /// The request is sent from the constructor, so request customisation has to be done
    /// by overriding <see cref="InitRequest"/>. Because the constructor calls that virtual
    /// method, overrides must not rely on state initialised by a derived constructor.
    /// </remarks>
    public class HttpTool : IDisposable
    {
        /// <summary>How many times <see cref="Excute"/> tries before giving up.</summary>
        private const int MaxAttempts = 3;

        /// <summary>Value applied to both Timeout and ReadWriteTimeout, in milliseconds.</summary>
        private const int TimeoutMilliseconds = 6000;

        private bool _disposed;

        /// <summary>
        /// Requests <paramref name="url"/> and returns the response body as text,
        /// trying up to three times.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="data">Optional POST body; null or empty sends a GET.</param>
        /// <returns>The response body, or null when every attempt failed.</returns>
        public static string Excute(string url, string data = null)
        {
            for (int attempt = 1; attempt <= MaxAttempts; attempt++)
            {
                try
                {
                    using (HttpTool tool = new HttpTool(url, data))
                    {
                        return tool.Result;
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort contract: callers get null instead of an exception.
                    // Trace the failure so it is not lost silently.
                    System.Diagnostics.Trace.TraceWarning(
                        "HttpTool attempt {0}/{1} for {2} failed: {3}",
                        attempt, MaxAttempts, url, ex.Message);
                }
            }
            return null;
        }

        /// <summary>Address this instance requests.</summary>
        public string Url { get; set; }

        /// <summary>The underlying request, created by <see cref="InitRequest"/>.</summary>
        public HttpWebRequest Request { get; private set; }

        /// <summary>The response obtained by one of the InitResponse overloads; closed by <see cref="Dispose()"/>.</summary>
        public HttpWebResponse Response { get; private set; }

        /// <summary>The complete response body read as text.</summary>
        public string Result { get; private set; }

        /// <summary>
        /// Sends the request immediately and reads the full response body into <see cref="Result"/>.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="postData">Optional POST body (encoded as UTF-8); null or empty sends a GET.</param>
        public HttpTool(string url, string postData = null)
        {
            Url = url;
            try
            {
                InitRequest();
                if (string.IsNullOrEmpty(postData))
                {
                    InitResponse();
                }
                else
                {
                    InitResponse(postData);
                }

                // Fetch the response stream once and let using-blocks release the
                // stream and reader even when ReadToEnd throws (e.g. on a read timeout).
                using (Stream stream = Response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream))
                {
                    Result = reader.ReadToEnd();
                }
            }
            catch
            {
                // Dispose is never invoked on an object whose constructor threw,
                // so release the response here before propagating the failure.
                CloseResponse();
                throw;
            }
        }

        /// <summary>
        /// Creates the request and sets its headers and timeouts.
        /// Override to add or change request headers as needed.
        /// </summary>
        public virtual void InitRequest()
        {
            Request = (HttpWebRequest)WebRequest.Create(Url);

            // Header values only: the header names are supplied by the Accept/UserAgent properties.
            Request.Accept = "text/html, application/xhtml+xml, */*";
            Request.UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
            Request.Timeout = TimeoutMilliseconds;
            Request.ReadWriteTimeout = TimeoutMilliseconds;
        }

        /// <summary>
        /// Sends the request as GET and stores the response.
        /// </summary>
        public void InitResponse()
        {
            Request.Method = "GET";
            Response = (HttpWebResponse)Request.GetResponse();
        }

        /// <summary>
        /// Sends the request as POST with the given body and stores the response.
        /// </summary>
        /// <param name="postData">POST body text.</param>
        /// <param name="encoding">Encoding used to turn the body into bytes.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="postData"/> or <paramref name="encoding"/> is null.
        /// </exception>
        public void InitResponse(string postData, Encoding encoding)
        {
            if (postData == null)
            {
                throw new ArgumentNullException("postData");
            }
            if (encoding == null)
            {
                throw new ArgumentNullException("encoding");
            }

            Request.Method = "POST";
            // NOTE(review): no Content-Type is set here; a server expecting form or JSON
            // data may need one, which an InitRequest override can supply.
            byte[] body = encoding.GetBytes(postData);
            Request.ContentLength = body.Length;
            using (Stream requestStream = Request.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }
            Response = (HttpWebResponse)Request.GetResponse();
        }

        /// <summary>
        /// Sends the request as POST with the body encoded as UTF-8 and stores the response.
        /// </summary>
        /// <param name="postData">POST body text.</param>
        public void InitResponse(string postData)
        {
            InitResponse(postData, Encoding.UTF8);
        }

        /// <summary>
        /// Closes the response held by this instance.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Releases the response when called from <see cref="Dispose()"/>.
        /// </summary>
        /// <param name="disposing">True when called from <see cref="Dispose()"/>.</param>
        protected virtual void Dispose(bool disposing)
        {
            if (_disposed)
            {
                return;
            }
            if (disposing)
            {
                CloseResponse();
            }
            _disposed = true;
        }

        /// <summary>Closes the response if one was obtained.</summary>
        private void CloseResponse()
        {
            if (Response != null)
            {
                Response.Close();
            }
        }
    }

}





评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值