网络爬虫

网络爬虫,主要用于抓取其他网站上的数据。基本原理是先把网页下载下来,再通过正则表达式或XPath对其中的数据进行解析,最后组装成自己需要的数据。【本文使用了第三方组件HtmlAgilityPack(通过NuGet下载),并以XPath方式解析页面】

我们先以中国民航局官网—法律法规的数据抓取为例子,网站地址:http://www.caac.gov.cn

封装代码:

public class Service
    {
        // URL template of the CAAC search endpoint: {0} = page number, {1} = column id (the "fl" query parameter).
        const string CaacGovUrl = @"http://www.caac.gov.cn/was5/web/search?page={0}&channelid=211383&fl={1}";

        /// <summary>
        /// Crawls every result page of one CAAC column and returns all manuals found.
        /// </summary>
        /// <param name="cmColumnId">Column id, sent as the <c>fl</c> query parameter.</param>
        /// <returns>
        /// All parsed manuals. The list is empty when the first page cannot be downloaded,
        /// when the pager element (<c>id='outlinebar'</c>) is missing, or when the page count cannot be read from it.
        /// </returns>
        public List<ManualCivilEntity> GetManualListFromCivilList(int cmColumnId)
        {
            var list = new List<ManualCivilEntity>();

            string firstPageHtml = HttpHelper.DownloadUrl(string.Format(CaacGovUrl, 1, cmColumnId));
            if (string.IsNullOrEmpty(firstPageHtml))
            {
                // The download helper reports failures as null/empty; LoadHtml would throw on null.
                return list;
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(firstPageHtml);

            HtmlNode pageNumberNode = doc.DocumentNode.SelectSingleNode(@"//*[@id='outlinebar']");
            if (pageNumberNode == null)
            {
                return list;
            }

            int pageNumber;
            if (!TryParsePageCount(pageNumberNode.InnerText, out pageNumber))
            {
                return list;
            }

            for (int i = 1; i <= pageNumber; i++)
            {
                // Page 1 is already in memory (same URL as above), so it is parsed directly instead of being fetched again.
                var pageItems = i == 1
                    ? ParseCivilManualList(firstPageHtml, cmColumnId)
                    : GetCivilManualList(i, cmColumnId);
                list.AddRange(pageItems);
            }
            return list;
        }

        /// <summary>
        /// Extracts the number between "共" and the next "页" in the pager text.
        /// </summary>
        /// <param name="pagerText">Inner text of the pager element.</param>
        /// <param name="pageCount">Parsed page count; 0 when parsing fails.</param>
        /// <returns><c>true</c> when a page count was found and parsed.</returns>
        private static bool TryParsePageCount(string pagerText, out int pageCount)
        {
            pageCount = 0;
            if (string.IsNullOrEmpty(pagerText))
            {
                return false;
            }

            int start = pagerText.IndexOf("共", StringComparison.Ordinal);
            if (start < 0)
            {
                return false;
            }

            // Search for "页" only after "共": an earlier "页" in the pager text would otherwise
            // produce a negative length for Substring.
            int end = pagerText.IndexOf("页", start + 1, StringComparison.Ordinal);
            if (end < 0)
            {
                return false;
            }

            string digits = pagerText.Substring(start + 1, end - start - 1).Replace("&nbsp;", "").Trim();
            return int.TryParse(
                digits,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out pageCount);
        }

        /// <summary>
        /// Downloads one result page and parses the manuals listed on it.
        /// </summary>
        /// <param name="page">1-based page number.</param>
        /// <param name="cmColumnId">Column id.</param>
        /// <returns>The manuals on that page; empty when the download fails or no rows are found.</returns>
        private List<ManualCivilEntity> GetCivilManualList(int page, int cmColumnId)
        {
            var html = HttpHelper.DownloadUrl(string.Format(CaacGovUrl, page, cmColumnId));
            return ParseCivilManualList(html, cmColumnId);
        }

        /// <summary>
        /// Parses the manuals contained in the HTML of one result page.
        /// </summary>
        /// <param name="html">Raw page HTML; may be null or empty when the download failed.</param>
        /// <param name="cmColumnId">Column id stamped on every parsed entity.</param>
        /// <returns>One entity per matched table row; empty when there is nothing to parse.</returns>
        private List<ManualCivilEntity> ParseCivilManualList(string html, int cmColumnId)
        {
            var result = new List<ManualCivilEntity>();
            if (string.IsNullOrEmpty(html))
            {
                return result;
            }

            // Strip tbody/span tags and the "</td>" directly before "</tr>" so the XPath below matches the row structure.
            html = html.Replace("<tbody>", "").Replace("</tbody>", "").Replace("</td></tr>", "</tr>").Replace("</span>", "").Replace("<span>", "");
            var doc = new HtmlDocument();
            doc.LoadHtml(html);

            string trPath = "//*[@id='tbRe']/tr/td/table/tr";
            HtmlNodeCollection rowNodes = doc.DocumentNode.SelectNodes(trPath);
            if (rowNodes == null || rowNodes.Count == 0)
            {
                return result;
            }

            foreach (var node in rowNodes)
            {
                result.Add(GetCivilManual(node, cmColumnId));
            }
            return result;
        }

        /// <summary>
        /// Builds a manual entity from one result row.
        /// </summary>
        /// <param name="node">The row node.</param>
        /// <param name="cmColumnId">Column id.</param>
        /// <returns>The populated entity; fields whose nodes are missing keep their defaults or become empty.</returns>
        private ManualCivilEntity GetCivilManual(HtmlNode node, int cmColumnId)
        {
            // Re-parse the row on its own so the XPaths below are evaluated against this row only.
            HtmlDocument docChild = new HtmlDocument();
            docChild.LoadHtml(node.OuterHtml);

            var objCM = new ManualCivilEntity();

            #region Field assignment

            // Title and link
            string urlPath = "//*/td[2]/a";
            HtmlNode urlNode = docChild.DocumentNode.SelectSingleNode(urlPath);
            if (urlNode != null)
            {
                objCM.CMTitle = urlNode.InnerText;
                // GetAttributeValue avoids a NullReferenceException when the anchor has no href.
                objCM.CMUrl = urlNode.GetAttributeValue("href", string.Empty);
            }

            // Publish date
            objCM.CMCreateTime = ChineseTimeToDateTime(GetInnerTextByPath(docChild, "//*/td[2]/div/ul[1]/li[2]").Replace("发文日期:", ""));
            // Issuing unit
            objCM.CMCreator = GetInnerTextByPath(docChild, "//*/td[2]/div/ul[1]/li[1]").Replace("办文单位:", "");
            // Document number
            objCM.CMVersion = GetInnerTextByPath(docChild, "//*/td[2]/div/ul[3]/li[1]").Replace("&nbsp;", "").Replace("文号:", "");
            // Validity: 1 when the cell text is exactly "有效", otherwise -1
            objCM.RowStatus = GetInnerTextByPath(docChild, "//*/td[2]/div/ul[3]/li[2]").Replace("&nbsp;", "").Replace("有效性:", "") == "有效" ? 1 : -1;

            objCM.CMColumnId = cmColumnId;
            #endregion

            return objCM;
        }

        /// <summary>
        /// Returns the inner text of the first node matching an XPath.
        /// </summary>
        /// <param name="htmlDoc">Document to search.</param>
        /// <param name="path">XPath expression.</param>
        /// <returns>The node's inner text, or an empty string when the document, path or node is missing.</returns>
        private string GetInnerTextByPath(HtmlDocument htmlDoc, string path)
        {
            if (htmlDoc == null || string.IsNullOrWhiteSpace(path))
            {
                return string.Empty;
            }

            HtmlNode objNode = htmlDoc.DocumentNode.SelectSingleNode(path);
            return objNode != null ? objNode.InnerText : string.Empty;
        }

        /// <summary>
        /// Converts a Chinese-formatted date (e.g. "2019年1月5日") to a <see cref="DateTime"/>.
        /// </summary>
        /// <param name="dateTime">Date text; may be null or empty when the cell was missing.</param>
        /// <returns>The parsed date, or 1900-01-01 (the entity's default publish date) when the text cannot be parsed.</returns>
        private DateTime ChineseTimeToDateTime(string dateTime)
        {
            var fallback = new DateTime(1900, 1, 1);
            if (string.IsNullOrWhiteSpace(dateTime))
            {
                return fallback;
            }

            string normalized = dateTime.Replace("年", "-").Replace("月", "-").Replace("日", "").Trim();

            DateTime parsed;
            // Invariant culture: the normalized text is machine data ("yyyy-M-d"), not user-facing text.
            return DateTime.TryParse(
                normalized,
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None,
                out parsed)
                ? parsed
                : fallback;
        }
    }

辅助代码:

public class HttpHelper
    {
        /// <summary>
        /// Downloads the content at <paramref name="url"/> and decodes it as UTF-8
        /// (an earlier version of this helper used GB2312).
        /// </summary>
        /// <param name="url">Absolute URL to fetch.</param>
        /// <returns>See <see cref="DownloadHtml"/> for the meaning of null and empty results.</returns>
        public static string DownloadUrl(string url)
        {
            return DownloadHtml(url, Encoding.UTF8);
        }

        /// <summary>
        /// Downloads the response body of a GET request using <see cref="HttpWebRequest"/>.
        /// </summary>
        /// <param name="url">Absolute HTTP(S) URL to fetch.</param>
        /// <param name="encode">Encoding used to decode the response body.</param>
        /// <returns>
        /// The decoded body on HTTP 200.
        /// An empty string when the server answers with another status, or when a
        /// <see cref="WebException"/> other than status 306 occurs.
        /// <c>null</c> when the URL is not an HTTP(S) URL, when the body cannot be read,
        /// on status 306, or on any other exception.
        /// </returns>
        public static string DownloadHtml(string url, Encoding encode)
        {
            string html = string.Empty;
            try
            {
                HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest;
                if (request == null)
                {
                    // Non-HTTP scheme: fail explicitly instead of relying on a caught NullReferenceException.
                    return null;
                }

                request.Timeout = 30 * 1000; // 30 second timeout
                request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
                request.ContentType = "text/html; charset=utf-8";

                using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
                {
                    if (response.StatusCode == HttpStatusCode.OK)
                    {
                        try
                        {
                            // If the body turns out garbled, call with a different encoding (e.g. GB2312).
                            using (StreamReader sr = new StreamReader(response.GetResponseStream(), encode))
                            {
                                html = sr.ReadToEnd();
                            }
                        }
                        catch (Exception)
                        {
                            // Reading/decoding failed; hook logging in here if needed.
                            html = null;
                        }
                    }
                    // Non-200 responses leave html as string.Empty; hook logging in here if needed.
                }
            }
            catch (WebException ex)
            {
                // Dispose the error response so the underlying connection is released.
                using (WebResponse errorResponse = ex.Response)
                {
                    HttpWebResponse httpError = errorResponse as HttpWebResponse;
                    // Check the status code rather than the localized exception message,
                    // which only matched on a Chinese-language runtime.
                    if (httpError != null && httpError.StatusCode == HttpStatusCode.Unused) // 306
                    {
                        html = null;
                    }
                }
            }
            catch (Exception)
            {
                // Malformed URL, null arguments, etc.; hook logging in here if needed.
                html = null;
            }
            return html;
        }
    }

public class ManualCivilEntity
    {
        /// <summary>Title of the manual.</summary>
        public string CMTitle { get; set; } = "";

        /// <summary>Id of the column (menu) the manual belongs to.</summary>
        public int CMColumnId { get; set; }

        /// <summary>Link to the manual.</summary>
        public string CMUrl { get; set; } = "";

        /// <summary>Publish time; initialized to 1900-01-01.</summary>
        public DateTime CMCreateTime { get; set; } = new DateTime(1900, 1, 1);

        /// <summary>Issuing unit.</summary>
        public string CMCreator { get; set; } = "";

        /// <summary>Document number.</summary>
        public string CMVersion { get; set; } = "";

        /// <summary>
        /// Validity flag (0: default; 1: valid; -1: invalid). The property is initialized to 1.
        /// </summary>
        public int RowStatus { get; set; } = 1;
    }

调用端代码:

class Program
    {
        /// <summary>
        /// Console entry point: crawls one CAAC column and prints every manual found.
        /// </summary>
        static void Main(string[] args)
        {
            var crawler = new Service();

            Console.WriteLine("开始爬取数据....");

            // Fetch the CAAC laws-and-regulations data.
            // The argument is the "fl" query value of http://www.caac.gov.cn/XXGK/XXGK/index_172.html?fl=12
            var manuals = crawler.GetManualListFromCivilList(12);

            for (int index = 0; index < manuals.Count; index++)
            {
                PrintManual(manuals[index]);
            }

            Console.WriteLine("爬取数据结束!!!");
            Console.ReadLine();
        }

        /// <summary>
        /// Writes one manual to the console, followed by a separator line surrounded by blank lines.
        /// </summary>
        private static void PrintManual(ManualCivilEntity item)
        {
            Console.WriteLine($"标题:{item.CMTitle}\n链接:{item.CMUrl}\n发布时间:{item.CMCreateTime}\n办文单位:{item.CMCreator}\n文号:{item.CMVersion}\n有效性:{item.RowStatus}");
            Console.WriteLine();
            Console.WriteLine("***************************************************************");
            Console.WriteLine();
        }
    }

效果:

源码下载:https://pan.baidu.com/s/1719aeG2h_r2JF96RKhMLSw  提取码:baq7

注:如果想获取页面上指定的内容,需要先确认该内容对应的XPath路径,可以直接使用html工具(例如浏览器的开发者工具)查看,如图:

取到该路径后,可以将其传给doc.DocumentNode.SelectSingleNode来读取对应的节点

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值