C#爬取数据_详细篇

第一步

我们先找到要爬取数据的网站,然后按F12打开浏览器的开发者工具,找到返回数据的那个json请求。

 第二步

观察json的格式,然后在程序里建立对应的模型类(属性名必须和json里的键名一模一样,属性类型则根据json里值的类型自行判断)

/// <summary>
/// Top-level envelope of the notice-list JSON response.
/// Property names are kept identical to the JSON keys so the mapper can bind them by name.
/// </summary>
public class SichuangJsonModel
    {
        /// <summary>Value of the "msg" key.</summary>
        public string msg { get; set; }

        /// <summary>Value of the "total" key.</summary>
        public int total { get; set; }

        /// <summary>Value of the "code" key.</summary>
        public string code { get; set; }

        /// <summary>Notice entries carried under the "data" key.</summary>
        public List<SichuanData> data { get; set; }
    }

    /// <summary>
    /// One notice entry of the list response.
    /// Property names are kept identical to the JSON keys so the mapper can bind them by name.
    /// </summary>
    public class SichuanData
    {
        /// <summary>Publish time.</summary>
        public string noticeTime { get; set; }

        /// <summary>Title.</summary>
        public string title { get; set; }

        /// <summary>Announcement URL.</summary>
        public string pageurl { get; set; }

        /// <summary>Content.</summary>
        public string content { get; set; }

        /// <summary>Purchaser name.</summary>
        public string purchaser { get; set; }

        /// <summary>Budget amount.</summary>
        public string budget { get; set; }

        /// <summary>Purchaser address.</summary>
        public string purchaserAddr { get; set; }

        /// <summary>Purchaser contact information.</summary>
        public string purchaserLinkPhone { get; set; }

        /// <summary>Project number.</summary>
        public string openTenderCode { get; set; }

        /// <summary>Procurement agency name.</summary>
        public string agency { get; set; }

        /// <summary>Procurement method.</summary>
        public string noticeType { get; set; }
    }

或者你不想自己手写这些模型类的话,也可以直接自动生成!

把json放到类中

 

 

 就自动生成了

第三步

开始编写代码了,建议你先建个测试类,这样调试方便,当然你自己不嫌麻烦能一下子调试成功的当我没说!

我这边用测试类做的,测试完了,直接搬过去用就行,话不多说,上代码

3.1  json,get方式

注:这个request里面的参数是在网站上找的

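 响应是不是压缩过的看这里(注意:代码里的 AutomaticDecompression 对应的是响应头 Content-Encoding: gzip 这种压缩,而不是 Transfer-Encoding: chunked 分块传输,后者 HttpWebRequest 会自动处理)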

  json,get方式

using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using LitJson;
using System;
 
/// <summary>
/// Demo test: downloads one page of the notice list from the Sichuan procurement site
/// with an HTTP GET, maps the JSON onto <c>SichuangJsonModel</c> and prints every title.
/// </summary>
[TestMethod]
        public void TestSichuanService()
        {
            var pageNum = 1; // page number to request
            var noticeType = "001004,001006"; // notice type codes to request
            var url = @"http://www.ccgp-sichuan.gov.cn/freecms/rest/v1/notice/selectInfoMoreChannel.do?=&siteId=94c965cc-c55d-4f92-8469-d5875c68bd04&channel=c5bff13f-21ca-4dac-b158-cb40accd3035&currPage=" + pageNum + "&pageSize=10&noticeType=" + noticeType + "&regionCode=&purchaseManner=&title=&openTenderCode=&purchaser=&agency=&purchaseNature=&operationStartTime=&operationEndTime=&selectTimeName=noticeTime&cityOrArea=";

            // The header values below mirror the request the browser sends to the site.
            var request = (HttpWebRequest)WebRequest.Create(url);
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
            request.KeepAlive = true;
            request.ContentType = "application/json;charset=utf-8";
            request.Host = "www.ccgp-sichuan.gov.cn";
            request.UserAgent = @"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36 Maxthon/5.3.8.2000";

            request.Headers.Add("Accept-Language", "zh-CN");
            request.Headers.Add("DNT", "1");
            request.Headers.Add("Cache-Control", "max-age=0");
            request.Headers.Add("Upgrade-Insecure-Requests", "1");

            // HTTP verbs are case-sensitive tokens; send the canonical upper-case form.
            request.Method = "GET";

            // The site compresses its JSON (Content-Encoding), which is unrelated to chunked transfer.
            // AutomaticDecompression both advertises the encodings and decodes them, so the
            // Accept-Encoding header is not added by hand; every advertised encoding must be
            // one that is actually decoded here.
            request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

            string jsonString;
            // Dispose the response so the underlying connection is released.
            using (var response = request.GetResponse())
            {
                jsonString = Encoding.UTF8.GetString(GetContentBytes(response));
            }

            // Bind the JSON text onto the model.
            var listJson = JsonMapper.ToObject<SichuangJsonModel>(jsonString);
            if (listJson == null || listJson.data == null || listJson.data.Count < 1)
            {
                Console.WriteLine("no json");
                return;
            }

            foreach (var noteListModel in listJson.data)
            {
                Console.WriteLine(noteListModel.title);
            }
        }





        /// <summary>
        /// Reads the complete body of a web response into a byte array.
        /// </summary>
        /// <param name="response">Response whose stream is read; the stream is closed afterwards.</param>
        /// <returns>The body bytes, or an empty array when the response exposes no stream.</returns>
        private byte[] GetContentBytes(WebResponse response)
        {
            using (var memory = new MemoryStream())
            using (var body = response.GetResponseStream())
            {
                // A response without a stream simply yields zero bytes.
                body?.CopyTo(memory);
                return memory.ToArray();
            }
        }

3.2  json post方式

我用另一个post方式的网站做的,跟上面那个网站不一样

不过,还是分析json,建模型,建模型的步骤我就省略了,跟上面差不多

注:datas的参数在这里找

注:request参数在这里找

using System.Net;
using System.Text;
using BaseHelper;
using LitJson;
using System;
using System.Collections.Generic;
using System.IO;


 /// <summary>
 /// Demo test: obtains the <c>proxy_server</c> cookie from the portal page, then POSTs a JSON
 /// query to the noteList endpoint, maps the reply onto <c>GjdlJsonModel</c> and prints every title.
 /// </summary>
 [TestMethod]
        public void TestGjdlService()
        {
            // Cookies that will be attached to the list request.
            var cookies = new Dictionary<string, string>();
            // This page is requested only to obtain the Set-Cookie header.
            var firstUrl = @"https://ecp.sgcc.com.cn/ecp2.0/portal/#/list/list-com/2018032600289606_1_2018060501171111";

            var result = HttpHelper.Send(firstUrl, new Dictionary<string, string>(), new Dictionary<string, string>(),
                "Get");
            // Raw Set-Cookie header values of the first response.
            var cid = result.Headers.GetValues("Set-Cookie");
            if (cid == null || cid.Length < 1) return;
            // Cut the first header at ';' and at the "proxy_server=" prefix; the first remaining piece is used as the value.
            var cids = cid[0].Split(new[] { ";", "proxy_server=" }, StringSplitOptions.RemoveEmptyEntries);
            if (cids.Length < 1) return;
            cookies.Add("proxy_server", cids[0]);

            // Query parameters. Unlike GET, where they sit in the URL, POST sends them in the request body.
            var pageNum = 1;
            var datas = new Dictionary<string, string>
            {
                { "firstPageMenuId", "2018060501171111" },
                { "index", pageNum.ToString() },
                { "key", "" },
                { "orgId", "" },
                { "orgName", "" },
                { "purOrgCode", "" },
                { "purOrgStatus", "" },
                { "purType", "" },
                { "size", "20" }
            };

            // Endpoint that returns the list data.
            var url = @"https://ecp.sgcc.com.cn/ecp2.0/ecpwcmcore//index/noteList";
            var request = (HttpWebRequest)WebRequest.Create(url);

            // The header values below mirror the request the browser sends to the site.
            request.Accept = "application/json, text/plain, */*";
            request.KeepAlive = true;
            request.ContentType = "application/json";
            request.Host = "ecp.sgcc.com.cn";
            request.Referer = @"https://ecp.sgcc.com.cn/ecp2.0/portal/";
            request.UserAgent = @"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36";

            request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.9");
            request.Headers.Add("DNT", "1");
            request.Headers.Add("sec-ch-ua-mobile", "?0");
            request.Headers.Add("sec-ch-ua-platform", "Windows");
            request.Headers.Add("Sec-Fetch-Dest", "empty");
            request.Headers.Add("Sec-Fetch-Mode", "cors");
            request.Headers.Add("Sec-Fetch-Site", "same-origin");
            request.Headers.Add("Origin", @"https://ecp.sgcc.com.cn");

            // Advertise only encodings that are decoded here; a hand-written "gzip, deflate, br"
            // header without decompression would hand compressed bytes to the UTF-8 decoder.
            request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

            // HTTP verbs are case-sensitive tokens; send the canonical upper-case form.
            request.Method = "POST";
            _InjectCookies(request, cookies);

            // Serialize the parameters to the JSON body. ContentLength is taken from the real
            // payload instead of a hard-coded number copied from the browser.
            var data = _ConvertKeyValuePayLoadToBytes(datas);
            if (data.Length > 0)
            {
                request.ContentLength = data.Length;
                using (var requestStream = request.GetRequestStream())
                {
                    requestStream.Write(data, 0, data.Length);
                    requestStream.Flush();
                }
            }

            string jsonString;
            // Dispose the response so the underlying connection is released.
            using (var response = request.GetResponse())
            {
                jsonString = Encoding.UTF8.GetString(GetContentBytes(response));
            }

            // Bind the JSON text onto the model.
            var listJson = JsonMapper.ToObject<GjdlJsonModel>(jsonString);
            if (listJson == null || listJson.resultValue == null || listJson.resultValue.noteList == null || listJson.resultValue.noteList.Count < 1)
            {
                Console.WriteLine("no json");
                return;
            }

            foreach (var noteListModel in listJson.resultValue.noteList)
            {
                Console.WriteLine(noteListModel.title);
            }
        }


-----------------------后面这三个方法直接用就行,不用改啥


/// <summary>
/// Attaches the given name/value pairs to the request as cookies scoped to the request host.
/// </summary>
/// <param name="request">Request that receives a new cookie container holding the cookies.</param>
/// <param name="cookies">Cookie names mapped to values; null or empty leaves the request untouched.</param>
        private static void _InjectCookies(HttpWebRequest request, IDictionary<string, string> cookies)
        {
            if (cookies == null || cookies.Count == 0)
            {
                return;
            }

            var container = new CookieContainer();
            request.CookieContainer = container;
            foreach (var entry in cookies)
            {
                // Host may carry a ":port" suffix; only the part before ':' is used as the cookie domain.
                var domain = request.Host.Split(':')[0];
                var cookie = new Cookie(entry.Key, entry.Value, "/", domain);
                cookie.Expires = DateTime.Now.AddDays(60);
                container.Add(cookie);
            }
        }

        /// <summary>
        /// Serializes string key/value pairs into a flat JSON object and returns it as UTF-8 bytes.
        /// Keys and values are escaped, so quotes, backslashes and control characters in the
        /// input still produce valid JSON.
        /// </summary>
        /// <param name="dataDictionary">Pairs to serialize; a null value is written as an empty string.</param>
        /// <returns>UTF-8 bytes of <c>{"k1":"v1", "k2":"v2"}</c>, or of <c>{}</c> for an empty dictionary.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="dataDictionary"/> is null.</exception>
        private byte[] _ConvertKeyValuePayLoadToBytes(IDictionary<string, string> dataDictionary)
        {
            if (dataDictionary == null)
            {
                throw new ArgumentNullException(nameof(dataDictionary));
            }

            var json = new StringBuilder("{");
            var isFirst = true;
            foreach (var pair in dataDictionary)
            {
                if (!isFirst)
                {
                    json.Append(", ");
                }
                isFirst = false;

                _AppendJsonString(json, pair.Key);
                json.Append(':');
                _AppendJsonString(json, pair.Value);
            }
            json.Append('}');

            return Encoding.UTF8.GetBytes(json.ToString());
        }

        /// <summary>
        /// Appends <paramref name="text"/> to <paramref name="target"/> as a double-quoted JSON string,
        /// escaping the quote, the backslash and all characters below U+0020.
        /// </summary>
        private static void _AppendJsonString(StringBuilder target, string text)
        {
            target.Append('"');
            foreach (var ch in text ?? string.Empty)
            {
                switch (ch)
                {
                    case '"': target.Append("\\\""); break;
                    case '\\': target.Append("\\\\"); break;
                    case '\b': target.Append("\\b"); break;
                    case '\f': target.Append("\\f"); break;
                    case '\n': target.Append("\\n"); break;
                    case '\r': target.Append("\\r"); break;
                    case '\t': target.Append("\\t"); break;
                    default:
                        if (ch < ' ')
                        {
                            // Remaining control characters have no short form and need \uXXXX.
                            target.Append("\\u").Append(((int)ch).ToString("x4"));
                        }
                        else
                        {
                            target.Append(ch);
                        }
                        break;
                }
            }
            target.Append('"');
        }

        /// <summary>
        /// Reads the complete body of a web response into a byte array.
        /// </summary>
        /// <param name="response">Response whose stream is read; the stream is closed afterwards.</param>
        /// <returns>The body bytes, or an empty array when the response exposes no stream.</returns>
        private byte[] GetContentBytes(WebResponse response)
        {
            using (var memory = new MemoryStream())
            using (var body = response.GetResponseStream())
            {
                // A response without a stream simply yields zero bytes.
                body?.CopyTo(memory);
                return memory.ToArray();
            }
        }

3.3 HTML方式

有些网站不是用json方式给数据的,而是直接以纯HTML的方式返回,这个时候就得分析网站HTML的结构了

例子 

 /// <summary>
 /// Demo: downloads one notice detail page as HTML, reads the agency and contact fields
 /// from their spans inside the content div, collects the href of every link in that div,
 /// and prints what was found.
 /// </summary>
 public void TestSichuanghtml()
        {
            // Detail page of a failed-bid notice.
            var url = @"http://www.ccgp-sichuan.gov.cn/freecms/site/sichuan/ggxx/info/2022/8a69ce41836122f801837c83882606b3.html?noticeType=001004,001006";

            var htmlDoc = GetHtmlDocument(url); // fetch the page with GET
            var div = htmlDoc.FindFirstOrDefault(@"div[id=content]"); // div that holds the data
            if (div == null)
            {
                Console.WriteLine("no content");
                return;
            }

            // Reads the title (or inner text) of the first element matching the selector;
            // yields null when the page has no such element instead of crashing.
            Func<string, string> readField = selector =>
            {
                var element = div.FindFirstOrDefault(selector);
                return element == null ? null : GetTitle(element);
            };

            // Procurement agency address.
            var agentAddress = readField(@"span[id=_notice_content_noticeAgency-agentAddress]");
            // Procurement agency phone.
            var agentTel = readField(@"span[id=_notice_content_noticeAgency-agentLinkTel]");
            // Project contact person.
            var managerName = readField(@"span[id=_notice_content_projectContact-managerName]");
            // Project contact phone.
            var managerPhone = readField(@"span[id=_notice_content_projectContact-managerLinkPhone]");

            // Attachments: every link inside the content div.
            var attachmentUrls = new List<string>();
            foreach (var anchor in div.Find(@"a"))
            {
                if (anchor == null) continue;

                var attachmentName = anchor.InnerText().Trim();
                var href = GetHref(anchor.Attributes());
                Console.WriteLine(attachmentName + " -> " + href);

                // Only real hrefs are collected, so the joined string never starts with a separator.
                if (!string.IsNullOrEmpty(href)) attachmentUrls.Add(href);
            }
            var joinedUrls = string.Join(";;;", attachmentUrls);

            Console.WriteLine(agentAddress);
            Console.WriteLine(agentTel);
            Console.WriteLine(managerName);
            Console.WriteLine(managerPhone);
            Console.WriteLine(joinedUrls);
        }


        /// <summary>
        /// Downloads a page with GET (decoded as UTF-8) and parses it into an HTML document.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The parsed document.</returns>
        /// <exception cref="Exception">
        /// Download or parsing failed; the original exception is available as <c>InnerException</c>.
        /// </exception>
        public IHtmlDocument GetHtmlDocument(string url)
        {
            try
            {
                // No cookies and no form data are sent for a plain page fetch.
                var cookies = new Dictionary<string, string>();
                var datas = new Dictionary<string, string>();
                var pageContent = HttpHelper.DownloadString(url, cookies, datas, Encoding.UTF8, HttpHelper.GetMethod, BaseAppConfig.DefaultWebRequestTimeout);
                var parser = new JumonyParser();
                return parser.Parse(pageContent);
            }
            catch (Exception e)
            {
                // Keep the original exception as InnerException so its type and stack trace survive.
                throw new Exception("服务器访问出错:" + e.InnerMessage(), e);
            }
        }


        /// <summary>
        /// Returns the trimmed value of the element's <c>title</c> attribute, falling back to the
        /// element's trimmed inner text when it has no such attribute.
        /// </summary>
        /// <param name="aElement">Element to read; may be null.</param>
        /// <returns>The title or inner text; an empty string when the element is null.</returns>
        public string GetTitle(IHtmlElement aElement)
        {
            if (aElement == null)
            {
                return string.Empty;
            }

            foreach (var attribute in aElement.Attributes())
            {
                // Ordinal comparison: ToLower() is culture-sensitive and turns "TITLE" into
                // "tıtle" under Turkish culture, which would never match.
                if (string.Equals(attribute.Name, "title", StringComparison.OrdinalIgnoreCase))
                {
                    return (attribute.AttributeValue ?? string.Empty).Trim();
                }
            }

            return (aElement.InnerText() ?? string.Empty).Trim();
        }



        /// <summary>
        /// Returns the trimmed value of the first <c>href</c> attribute in the sequence.
        /// </summary>
        /// <param name="attrs">Attributes to search; may be null.</param>
        /// <returns>The link address, or an empty string when there is no href attribute.</returns>
        public string GetHref(IEnumerable<IHtmlAttribute> attrs)
        {
            if (attrs == null)
            {
                return string.Empty;
            }

            foreach (var attribute in attrs)
            {
                // HTML attribute names are case-insensitive; match "HREF" too, as the
                // title/onclick helpers already do for their attributes.
                if (string.Equals(attribute.Name, "href", StringComparison.OrdinalIgnoreCase))
                {
                    return (attribute.AttributeValue ?? string.Empty).Trim();
                }
            }

            return string.Empty;
        }



        /// <summary>
        /// Returns the trimmed value of the first <c>onclick</c> attribute in the sequence.
        /// </summary>
        /// <param name="attrs">Attributes to search; may be null.</param>
        /// <returns>The handler text, or an empty string when there is no onclick attribute.</returns>
        public string GetOnclick(IEnumerable<IHtmlAttribute> attrs)
        {
            if (attrs == null)
            {
                return string.Empty;
            }

            foreach (var attribute in attrs)
            {
                // Ordinal comparison: ToLower() is culture-sensitive and turns "ONCLICK" into
                // "onclıck" under Turkish culture, which would never match.
                if (string.Equals(attribute.Name, "onclick", StringComparison.OrdinalIgnoreCase))
                {
                    return (attribute.AttributeValue ?? string.Empty).Trim();
                }
            }

            return string.Empty;
        }





        /// <summary>
        /// Finds the first element whose trimmed inner text contains the keyword and returns that
        /// text with every occurrence of the keyword removed.
        /// </summary>
        /// <param name="list">Elements to search, in order.</param>
        /// <param name="keyWord">Keyword to look for; surrounding whitespace is ignored.</param>
        /// <returns>
        /// The remaining text of the first match; null when the list is null or empty, the keyword
        /// is blank, or no element contains it.
        /// </returns>
        public string GetElementValue(List<IHtmlElement> list, string keyWord)
        {
            if (list == null || list.Count < 1 || string.IsNullOrWhiteSpace(keyWord)) return null;

            // Use the same trimmed keyword for matching and for removal; otherwise a keyword
            // passed with padding would match but never be stripped from the result.
            var needle = keyWord.Trim();
            foreach (var htmlElement in list)
            {
                if (htmlElement == null) continue;

                var text = htmlElement.InnerText();
                if (text == null) continue;

                text = text.Trim();
                if (text.Contains(needle))
                {
                    return text.Replace(needle, "");
                }
            }
            return null;
        }




        /// <summary>
        /// Returns the index of the first element whose trimmed inner text contains any of the given texts.
        /// </summary>
        /// <param name="list">Elements to search, in order.</param>
        /// <param name="texts">Candidate texts; null or blank entries are ignored.</param>
        /// <returns>
        /// Zero-based index of the first match; -1 when either list is null or empty, every text
        /// is blank, or nothing matches.
        /// </returns>
        public int GetElementIndex(List<IHtmlElement> list, List<string> texts)
        {
            if (list == null || list.Count < 1 || texts == null || texts.Count < 1) return -1;

            // Trim once up front and drop blank entries: Contains("") is true for every string,
            // so a blank text would make the very first element "match".
            var needles = new List<string>(texts.Count);
            foreach (var text in texts)
            {
                if (!string.IsNullOrWhiteSpace(text)) needles.Add(text.Trim());
            }
            if (needles.Count < 1) return -1;

            for (var index = 0; index < list.Count; index++)
            {
                var htmlElement = list[index];
                if (htmlElement == null) continue;

                var content = htmlElement.InnerText();
                if (content == null) continue;

                content = content.Trim();
                foreach (var needle in needles)
                {
                    if (content.Contains(needle)) return index;
                }
            }
            return -1;
        }




        

以上这三种C#爬取数据的方法(json的get方式、json的post方式、HTML方式)能应付大部分场景了

  • 3
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值