用 DOM 实现文章采集 -- 采集到网页源码

下面先给出采集(下载)网页源码所用的 C# 辅助类 NetHelper 及其返回结果类 RemoteRes 的完整代码。

using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Text;
namespace TopWinCMS.Common
{
    /// <summary>
    /// Synchronous HTTP helper built on <see cref="HttpWebRequest"/>: fetches pages as text
    /// (GET / POST), downloads raw resources as bytes, and probes URLs with HEAD requests.
    /// </summary>
    /// <remarks>
    /// <c>Get</c> and <c>Post</c> honour <see cref="UserAgent"/>, <see cref="HttpEncoding"/>,
    /// <see cref="ProxyHost"/>/<see cref="ProxyInt"/> and <see cref="TimeOut"/>.
    /// <c>GetRemoteResource</c> and <c>UrlExist</c> send a fixed browser user agent and do not
    /// apply the configured proxy or timeout.
    /// NOTE(review): confirm whether those two should also use the proxy/timeout settings.
    /// </remarks>
    public class NetHelper
    {
        // Alternative browser-like agent kept from the original source for reference:
        // "Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.2;+SV1;+.NET+CLR+1.1.4322;+.NET+CLR+2.0.50727)"
        private string _UserAgent = "Googlebot/2.1 (+http://www.google.com/bot.html)";
        private Encoding _HttpEncoding = null;
        private string _ProxyHost = string.Empty;
        private int _ProxyInt = 8080;
        private int _TimeOut = 200000;

        // Fixed agent strings sent by GetRemoteResource and UrlExist(string) respectively.
        private const string ResourceUserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506)";
        private const string ProbeUserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; .NET CLR 3.5.21022; .NET CLR 1.0.3705; .NET CLR 1.1.4322)";

        // Prefix of every error message stored in RemoteRes.Code / RemoteRes.HTML.
        private const string ErrorPrefix = "错误:";

        #region Properties
        /// <summary>
        /// User-Agent header sent by <c>Get</c> and <c>Post</c>.
        /// </summary>
        public string UserAgent
        {
            get
            {
                return this._UserAgent;
            }
            set
            {
                this._UserAgent = value;
            }
        }
        /// <summary>
        /// Encoding used to decode response bodies (and to encode the POST body).
        /// When null (the default) the response charset is used for decoding.
        /// </summary>
        public Encoding HttpEncoding
        {
            get
            {
                return this._HttpEncoding;
            }
            set
            {
                this._HttpEncoding = value;
            }
        }
        /// <summary>
        /// Proxy server host; a null or empty value means no proxy is configured on the request.
        /// </summary>
        public string ProxyHost
        {
            get
            {
                return this._ProxyHost;
            }
            set
            {
                this._ProxyHost = value;
            }
        }
        /// <summary>
        /// Proxy server port (defaults to 8080).
        /// </summary>
        public int ProxyInt
        {
            get
            {
                return this._ProxyInt;
            }
            set
            {
                this._ProxyInt = value;
            }
        }
        /// <summary>
        /// Value assigned to <see cref="HttpWebRequest.Timeout"/> (milliseconds) by <c>Get</c> and <c>Post</c>.
        /// </summary>
        public int TimeOut
        {
            get
            {
                return this._TimeOut;
            }
            set
            {
                this._TimeOut = value;
            }
        }
        #endregion

        /// <summary>
        /// Fetches a page with GET; see <see cref="Get(Uri)"/>.
        /// </summary>
        /// <param name="uri">Absolute URL of the page.</param>
        /// <returns>The request outcome.</returns>
        public RemoteRes Get(string uri)
        {
            return Get(new Uri(uri));
        }

        /// <summary>
        /// Fetches a page with GET and returns its body decoded as text.
        /// </summary>
        /// <param name="uri">Absolute URL of the page.</param>
        /// <returns>
        /// On success: HTML, ContentType and StatusCode are filled in.
        /// On failure: Code holds the error message and StatusCode holds the server status
        /// (ServiceUnavailable when no response was received, InternalServerError for
        /// non-network errors).
        /// </returns>
        public RemoteRes Get(Uri uri)
        {
            RemoteRes info = new RemoteRes();

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            request.Timeout = this._TimeOut;
            request.UserAgent = this._UserAgent;
            request.Method = "GET";
            request.Referer = string.Concat("http://", uri.Host);
            ApplyProxy(request);

            HttpWebResponse response = null;
            Stream responseStream = null;
            try
            {
                response = (HttpWebResponse)request.GetResponse();
                responseStream = response.GetResponseStream();

                // Only Content-Encoding describes how the body is encoded; Accept-Encoding
                // says nothing about this payload and must not trigger decompression.
                if (IsGzipEncoded(response))
                {
                    responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
                }

                info.HTML = ReadText(responseStream, ResolveEncoding(response, Encoding.UTF8));
                info.ContentType = response.ContentType;
                info.StatusCode = response.StatusCode;
            }
            catch (WebException WE)
            {
                info.StatusCode = ReadStatusAndClose(WE);
                info.Code = ErrorPrefix + WE.Message;
            }
            catch (Exception ex)
            {
                info.Code = ErrorPrefix + ex.Message;
                info.StatusCode = HttpStatusCode.InternalServerError;
            }
            finally
            {
                if (responseStream != null)
                {
                    responseStream.Close();
                }
                if (response != null)
                {
                    response.Close();
                }
            }

            return info;
        }

        #region Remote resource download
        /// <summary>
        /// Downloads a remote resource (e.g. an image) as raw bytes, following redirects.
        /// </summary>
        /// <param name="strUrl">Absolute URL of the resource.</param>
        /// <returns>
        /// A result with Bytes, ContentType and StatusCode filled in, or <c>null</c> when the
        /// request fails for any reason (callers must null-check).
        /// </returns>
        public RemoteRes GetRemoteResource(string strUrl)
        {
            HttpWebResponse response = null;
            Stream stream = null;
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
                request.AllowAutoRedirect = true;
                request.UserAgent = ResourceUserAgent;
                request.Referer = "http://" + new Uri(strUrl).Host;

                response = (HttpWebResponse)request.GetResponse();
                stream = response.GetResponseStream();

                RemoteRes info = new RemoteRes();
                info.ContentType = response.ContentType;
                info.Bytes = ReadAllBytes(stream);
                info.StatusCode = response.StatusCode;
                return info;
            }
            catch (WebException WE)
            {
                // The failure contract is "return null"; the error response still has to be
                // closed so its connection is released.
                ReadStatusAndClose(WE);
                return null;
            }
            catch (Exception)
            {
                return null;
            }
            finally
            {
                if (stream != null)
                {
                    stream.Close();
                }
                if (response != null)
                {
                    response.Close();
                }
            }
        }
        #endregion

        /// <summary>
        /// Sends <paramref name="postData"/> as an application/x-www-form-urlencoded POST body
        /// and returns the response body decoded as text.
        /// </summary>
        /// <param name="strUrl">Absolute URL to post to.</param>
        /// <param name="postData">Form data; encoded with HttpEncoding, or UTF-8 when that is null.</param>
        /// <returns>
        /// On success: HTML and ContentType are filled in and StatusCode is OK.
        /// On failure: HTML and Code both hold the error message, and StatusCode is set the
        /// same way as in <see cref="Get(Uri)"/>.
        /// </returns>
        public RemoteRes Post(string strUrl, string postData)
        {
            RemoteRes info = new RemoteRes();
            Stream responseStream = null;
            HttpWebResponse response = null;
            try
            {
                // HttpEncoding defaults to null; fall back to UTF-8 instead of dereferencing it.
                Encoding bodyEncoding = this._HttpEncoding ?? Encoding.UTF8;
                byte[] bytes = bodyEncoding.GetBytes(postData);

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
                request.Method = "POST";
                request.ContentType = "application/x-www-form-urlencoded";
                request.ContentLength = bytes.Length;
                request.Timeout = this._TimeOut;
                request.UserAgent = this._UserAgent;
                ApplyProxy(request);

                using (Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(bytes, 0, bytes.Length);
                }

                response = (HttpWebResponse)request.GetResponse();
                responseStream = response.GetResponseStream();
                if (IsGzipEncoded(response))
                {
                    responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
                }

                info.HTML = ReadText(responseStream, ResolveEncoding(response, Encoding.Default));
                info.ContentType = response.ContentType;
                // Kept as OK (not response.StatusCode) so existing success checks keep working.
                info.StatusCode = HttpStatusCode.OK;
            }
            catch (WebException WE)
            {
                info.StatusCode = ReadStatusAndClose(WE);
                info.Code = ErrorPrefix + WE.Message;
                info.HTML = info.Code;
            }
            catch (Exception ex)
            {
                info.StatusCode = HttpStatusCode.InternalServerError;
                info.Code = ErrorPrefix + ex.Message;
                info.HTML = info.Code;
            }
            finally
            {
                if (responseStream != null)
                {
                    responseStream.Close();
                }
                if (response != null)
                {
                    response.Close();
                }
            }
            return info;
        }

        #region Link checking
        /// <summary>
        /// Checks whether a URL exists by issuing a HEAD request without following redirects.
        /// </summary>
        /// <param name="strURL">Absolute URL to probe.</param>
        /// <returns>
        /// <c>true</c> only when the server answers 200 OK; <c>false</c> for any other status
        /// and for network failures reported as <see cref="WebException"/>.
        /// </returns>
        public bool UrlExist(string strURL)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strURL);
            request.Method = "HEAD";
            request.AllowAutoRedirect = false;
            request.UserAgent = ProbeUserAgent;

            HttpWebResponse response = null;
            try
            {
                response = (HttpWebResponse)request.GetResponse();
                return response.StatusCode == HttpStatusCode.OK;
            }
            catch (WebException WE)
            {
                // GetResponse throws for 4xx/5xx and connection errors: the link is dead.
                ReadStatusAndClose(WE);
                return false;
            }
            finally
            {
                // Always release the connection, including on the success path.
                if (response != null)
                {
                    response.Close();
                }
            }
        }
        /// <summary>
        /// Checks whether the number of dead links in a list stays within a tolerated limit.
        /// </summary>
        /// <param name="URLs">URLs to probe with <see cref="UrlExist(string)"/>.</param>
        /// <param name="AllowBadNum">Maximum number of dead links that is still acceptable.</param>
        /// <returns>
        /// <c>false</c> as soon as more than <paramref name="AllowBadNum"/> URLs are found dead;
        /// otherwise <c>true</c>.
        /// </returns>
        public bool UrlExist(List<string> URLs, int AllowBadNum)
        {
            // With no more URLs than the tolerated number of failures the limit cannot be
            // exceeded, so no request is issued at all.
            if (URLs.Count <= AllowBadNum)
            {
                return true;
            }
            int deadCount = 0;
            foreach (string strUrl in URLs)
            {
                if (UrlExist(strUrl))
                {
                    continue;
                }
                deadCount++;
                if (deadCount > AllowBadNum)
                {
                    return false;
                }
            }
            return true;
        }
        #endregion

        #region Private helpers
        /// <summary>Routes the request through the configured proxy, if a proxy host is set.</summary>
        private void ApplyProxy(HttpWebRequest request)
        {
            if (!string.IsNullOrEmpty(this._ProxyHost))
            {
                request.Proxy = new WebProxy(this._ProxyHost, this._ProxyInt);
            }
        }

        /// <summary>Tells whether the response body is gzip-compressed according to Content-Encoding.</summary>
        private static bool IsGzipEncoded(HttpWebResponse response)
        {
            string contentEncoding = response.Headers["Content-Encoding"];
            // A substring match covers both "gzip" and the legacy "x-gzip" token.
            return contentEncoding != null
                && contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0;
        }

        /// <summary>
        /// Picks the encoding for decoding a response body: HttpEncoding when set, otherwise the
        /// response charset, otherwise <paramref name="fallback"/> when no charset is reported.
        /// </summary>
        private Encoding ResolveEncoding(HttpWebResponse response, Encoding fallback)
        {
            if (this._HttpEncoding != null)
            {
                return this._HttpEncoding;
            }

            string charset = response.CharacterSet;
            if (charset != null)
            {
                charset = charset.Trim();
            }
            // Checked first: Encoding.GetEncoding rejects an empty name, so the fallback has to
            // be applied before any lookup.
            if (string.IsNullOrEmpty(charset))
            {
                return fallback;
            }
            // Charsets starting with "iso" are decoded with the system default code page
            // (presumably because servers commonly report ISO-8859-1 for locally-encoded pages).
            if (charset.Length > 3 && charset.StartsWith("iso", StringComparison.OrdinalIgnoreCase))
            {
                return Encoding.Default;
            }
            return Encoding.GetEncoding(charset);
        }

        /// <summary>Reads the whole stream as text using the given encoding.</summary>
        private static string ReadText(Stream stream, Encoding encoding)
        {
            using (StreamReader reader = new StreamReader(stream, encoding))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>Reads the whole stream into a byte array.</summary>
        private static byte[] ReadAllBytes(Stream stream)
        {
            using (MemoryStream buffer = new MemoryStream())
            {
                byte[] chunk = new byte[4096];
                int read;
                while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
                {
                    buffer.Write(chunk, 0, read);
                }
                return buffer.ToArray();
            }
        }

        /// <summary>
        /// Extracts the HTTP status carried by a failed request and closes the error response.
        /// Returns ServiceUnavailable when the exception carries no HTTP response.
        /// </summary>
        private static HttpStatusCode ReadStatusAndClose(WebException error)
        {
            WebResponse failed = error.Response;
            if (failed == null)
            {
                return HttpStatusCode.ServiceUnavailable;
            }
            try
            {
                HttpWebResponse http = failed as HttpWebResponse;
                return http != null ? http.StatusCode : HttpStatusCode.ServiceUnavailable;
            }
            finally
            {
                failed.Close();
            }
        }
        #endregion
    }

    /// <summary>
    /// Plain data holder describing the outcome of a remote request: textual body,
    /// raw bytes, content type, HTTP status and a message code.
    /// </summary>
    public class RemoteRes
    {
        private HttpStatusCode _statusCode;
        private string _contentType;
        private byte[] _rawBytes;
        private string _htmlText;
        private string _messageCode;

        /// <summary>Code/message text of the returned information.</summary>
        public string Code
        {
            get { return _messageCode; }
            set { _messageCode = value; }
        }

        /// <summary>Textual information (the response body as a string).</summary>
        public string HTML
        {
            get { return _htmlText; }
            set { _htmlText = value; }
        }

        /// <summary>Raw bytes of the remote resource.</summary>
        public byte[] Bytes
        {
            get { return _rawBytes; }
            set { _rawBytes = value; }
        }

        /// <summary>Content type of the response.</summary>
        public string ContentType
        {
            get { return _contentType; }
            set { _contentType = value; }
        }

        /// <summary>HTTP status code of the response.</summary>
        public HttpStatusCode StatusCode
        {
            get { return _statusCode; }
            set { _statusCode = value; }
        }
    }
}



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值