爬虫Crawler

using Crawler.Helper;
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;

namespace Crawler
{
    /// <summary>
    /// Console entry point for a small JD (jd.com) crawler sample: downloads listing/search
    /// pages through <c>HttpRequestHelper</c> and extracts product data with HtmlAgilityPack XPath queries.
    /// </summary>
    class Program
    {
        // XPath for the product title inside a single product <li>. It is an absolute ("//") path,
        // so it must be evaluated against a document that contains only that one <li>.
        private const string ProductNameXPath = "//div[@class=\"tab-content-item tab-cnt-i-selected\"]/div[@class=\"p-name p-name-type-3\"]/a/em|//div[@class=\"gl-i-wrap\"]/div[@class=\"p-name p-name-type-3\"]/a/em";

        // XPath for the product price inside a single product <li> (same scoping rule as above).
        private const string ProductPriceXPath = "//div[@class=\"tab-content-item tab-cnt-i-selected\"]/div[@class=\"p-price\"]/strong/i|//div[@class=\"gl-i-wrap\"]/div[@class=\"p-price\"]/strong/i";

        /// <summary>
        /// Program entry point; currently runs the JD search crawl.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // XPath tutorial: https://www.w3school.com.cn/xpath/index.asp
            // Web crawler pipeline:
            //   Application -> WebRequest -> Filter -> Thread -> Data

            // What crawlers are used for:
            //   1. Content sites: novels / comics / movies
            //   2. Data collection and competitor analysis

            // Things a crawler has to deal with:
            //   - the robots protocol
            //   - user-vs-crawler detection: User-Agent / Referer
            //   - login detection: send the cookie along
            //   - IP blacklists / whitelists: go through a proxy
            //   - a CAPTCHA challenge once a crawler is suspected
            //   - content loaded or modified by JavaScript, or data rendered as images

            // HTTP clients: HttpWebRequest, WebClient

            // Data extraction: regular expressions, or the third-party HtmlAgilityPack (supports XPath)
            // Alternative entry point (CPU category listing):
            //GetCPUListFromJD();

            // Baidu crawler rules:
            //   prioritises title, keyword, description and a few fixed tags

            // Cross-domain requests: JSONP
            GetGoodsFromJD();
        }

        #region JD category listing
        /// <summary>
        /// Downloads page 1 of the JD category listing and prints every product, then requests the
        /// lazily-loaded second half of that page (page=2) and prints those products too.
        /// </summary>
        public static void GetCPUListFromJD()
        {
            string timestamp = GetTimestamp().ToString();
            List<string> idList = new List<string>();

            // First half of the first visual page.
            // Older URL form: https://list.jd.com/list.html?cat=670%2C677%2C678&psort=3&cid3=678&cid2=677
            string url = "https://list.jd.com/listNew.php?cat=670%2C677%2C678&psort=3&psort=3&page=1&click=0";

            string html = HttpRequestHelper.GetWithCookie(url, timestamp);
            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(html);
            ShowProduct(htmlDoc, idList);

            Console.WriteLine("*****************************懒加载*************************************");

            // Lazy-load the remaining 30 items. For later pages just keep incrementing "page": every
            // visual page is made of two "page" values, and the last visual page may only have one,
            // so the returned IDs would need to be compared with the ones already seen.
            url = $"https://list.jd.com/listNew.php?cat=670%2C677%2C678&psort=3&psort=3&page=2&scrolling=y&log_id={timestamp}.7563&tpl=1_M&isList=1";
            // The request lists the SKUs that were already shown on the first half of the page.
            url += $"&show_items={string.Join(",", idList)}";
            string lazyHtml = HttpRequestHelper.GetWithCookie(url, timestamp);
            HtmlDocument lazyHtmlDoc = new HtmlDocument();
            lazyHtmlDoc.LoadHtml(lazyHtml);
            ShowProductLazy(lazyHtmlDoc);
        }

        /// <summary>
        /// Prints the products found under the <c>J_goodsList</c> element and records their SKU ids.
        /// </summary>
        /// <param name="htmlDoc">Parsed listing page.</param>
        /// <param name="idList">Receives the <c>data-sku</c> value of every printed product.</param>
        private static void ShowProduct(HtmlDocument htmlDoc, List<string> idList)
        {
            PrintProducts(htmlDoc, "//*[@id=\"J_goodsList\"]/ul/li", idList);
        }

        /// <summary>
        /// Prints the products of a lazily-loaded fragment, whose product items are bare <c>li</c> elements.
        /// </summary>
        /// <param name="htmlDoc">Parsed lazy-load response.</param>
        private static void ShowProductLazy(HtmlDocument htmlDoc)
        {
            PrintProducts(htmlDoc, "//li", null);
        }

        /// <summary>
        /// Shared implementation of <see cref="ShowProduct"/> and <see cref="ShowProductLazy"/>:
        /// for every node matched by <paramref name="itemXPath"/> prints its position, name, price and SKU.
        /// </summary>
        /// <param name="htmlDoc">Parsed page or fragment.</param>
        /// <param name="itemXPath">XPath selecting the product <c>li</c> elements.</param>
        /// <param name="idList">Optional list that collects the SKU ids; may be null.</param>
        private static void PrintProducts(HtmlDocument htmlDoc, string itemXPath, List<string> idList)
        {
            // HtmlAgilityPack returns null (not an empty collection) when the XPath matches nothing,
            // e.g. when the request failed or the site answered with a different page.
            HtmlNodeCollection items = htmlDoc.DocumentNode.SelectNodes(itemXPath);
            if (items == null)
            {
                Console.Error.WriteLine("No product nodes matched XPath: " + itemXPath);
                return;
            }

            for (int i = 0; i < items.Count; i++)
            {
                HtmlNode item = items[i];

                // An <li> without data-sku is not a product entry; skip it instead of crashing.
                string idStr = item.GetAttributeValue("data-sku", string.Empty);
                if (string.IsNullOrEmpty(idStr))
                {
                    continue;
                }

                if (idList != null)
                {
                    idList.Add(idStr);
                }

                // Re-parse the single <li> so the absolute "//" XPaths only see this product.
                HtmlDocument itemDoc = new HtmlDocument();
                itemDoc.LoadHtml(item.OuterHtml);

                string name = ReadLastChildText(itemDoc, ProductNameXPath);
                Console.WriteLine(i + 1 + "***" + name);

                string price = ReadLastChildText(itemDoc, ProductPriceXPath);
                Console.WriteLine("\t" + price);
                Console.WriteLine("\t" + idStr);
            }
        }

        /// <summary>
        /// Returns the trimmed text of the last child of the first node matching <paramref name="xpath"/>,
        /// or an empty string when the node (or its last child) does not exist.
        /// </summary>
        private static string ReadLastChildText(HtmlDocument itemDoc, string xpath)
        {
            HtmlNode node = itemDoc.DocumentNode.SelectSingleNode(xpath);
            HtmlNode last = node == null ? null : node.LastChild;
            return last == null ? string.Empty : last.InnerText.Trim();
        }

        /// <summary>
        /// Gets the current UTC time as milliseconds elapsed since 1970-01-01T00:00:00.
        /// </summary>
        /// <returns>Milliseconds since the Unix epoch, rounded to a whole number.</returns>
        public static long GetTimestamp()
        {
            DateTime epoch = new DateTime(1970, 1, 1, 0, 0, 0, 0, DateTimeKind.Utc);
            TimeSpan ts = DateTime.UtcNow - epoch;
            return Convert.ToInt64(ts.TotalMilliseconds);
        }
        #endregion

        /// <summary>
        /// Downloads a JD search result page and hands the parsed document to <see cref="ShowGoods"/>.
        /// </summary>
        public static void GetGoodsFromJD()
        {
            // "keyword" is the URL-encoded (UTF-8) search term.
            string url = "https://search.jd.com/Search?keyword=%E7%83%9F%E7%81%B6&enc=utf-8&wq=&pvid=b16267386d2746149da973289c0eb547";
            string html = HttpRequestHelper.Get(url);
            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(html);
            ShowGoods(htmlDoc);
        }

        /// <summary>
        /// Placeholder for printing the goods of a search result page; not implemented yet.
        /// </summary>
        /// <param name="htmlDoc">Parsed search result page.</param>
        private static void ShowGoods(HtmlDocument htmlDoc)
        {
            // TODO: extract and print the goods from the search result page.
        }
    }
}
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using System.Text;

namespace Crawler.Helper
{
    /// <summary>
    /// Minimal synchronous HTTP GET helpers built on <see cref="HttpWebRequest"/>.
    /// Both methods return the response body decoded as UTF-8, or an empty string when the request fails.
    /// </summary>
    public class HttpRequestHelper
    {
        /// <summary>
        /// Performs a plain GET request and returns the response body.
        /// </summary>
        /// <param name="strUrl">Absolute URL to request.</param>
        /// <returns>The response body, or <see cref="string.Empty"/> when the request did not succeed.</returns>
        public static string Get(string strUrl)
        {
            HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(strUrl);
            webRequest.Method = "GET";
            webRequest.ContentType = "text/html;charset=utf-8";

            // A cookie container lets cookies set by the server be carried across redirects.
            webRequest.CookieContainer = new CookieContainer();

            return ReadBody(webRequest);
        }

        /// <summary>
        /// Performs a GET request that imitates a browser: it sends a browser User-Agent, a Referer
        /// and a fixed Cookie header, and returns the response body.
        /// </summary>
        /// <param name="strUrl">Absolute URL to request.</param>
        /// <param name="timestamp">Currently unused; kept so existing callers keep compiling.</param>
        /// <returns>The response body, or <see cref="string.Empty"/> when the request did not succeed.</returns>
        public static string GetWithCookie(string strUrl, string timestamp)
        {
            HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(strUrl);
            webRequest.Accept = "*/*";
            webRequest.Method = "GET";
            webRequest.ContentType = "text/html;charset=utf-8";
            webRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36";
            // NOTE(review): this is a hard-coded cookie string copied from a browser session; it will
            // presumably stop working once it expires and would be better supplied from configuration.
            webRequest.Headers.Add("cookie", "__jdv=76161171|direct|-|none|-|1600765774351; __jdu=16007657743431160837809; areaId=27; ipLoc-djd=27-2376-4343-0; PCSYCityID=CN_610000_610100_610113; shshshfpa=44240f70-4882-bdfe-8f4d-f151f25e6528-1600765778; 3AB9D23F7A4B3C9B=NYT6ANOUVL54GXWA6X62NQBHXA5IP2IO6HQ62JZLUMM3X2X3DUHLU2OZVZPP3GHHSCY3KONPWPRMP4B6QMFTXFGKQA; shshshfp=da950713a5d998500611bbdd0cf7fbf6; shshshfpb=uEKYPWO%20GAUdAXA0E6dsM8w%3D%3D; __jda=122270672.16007657743431160837809.1600765774.1601343776.1601362283.8; __jdc=122270672; __jdb=122270672.2.16007657743431160837809|8.1601362283; shshshsID=9839a741b5e632add50573bd32d82a5a_1_1601362284090");
            webRequest.Referer = "https://list.jd.com/list.html?cat=670%2C677%2C678&psort=3&cid3=678&cid2=677";

            // NOTE(review): no CookieContainer is assigned here on purpose. On .NET Framework,
            // HttpWebRequest rebuilds the Cookie header from the container when one is set, which
            // would discard the header added above — confirm on the target runtime.

            return ReadBody(webRequest);
        }

        /// <summary>
        /// Sends the prepared request and reads the whole response body as UTF-8 text.
        /// </summary>
        /// <param name="webRequest">Fully configured request.</param>
        /// <returns>The body on HTTP 200; otherwise <see cref="string.Empty"/> after logging to the console.</returns>
        private static string ReadBody(HttpWebRequest webRequest)
        {
            try
            {
                using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
                {
                    if (webResponse.StatusCode != HttpStatusCode.OK)
                    {
                        Console.WriteLine($"请求出错");
                        return string.Empty;
                    }

                    using (Stream body = webResponse.GetResponseStream())
                    using (StreamReader sr = new StreamReader(body, Encoding.UTF8))
                    {
                        return sr.ReadToEnd();
                    }
                }
            }
            catch (WebException ex)
            {
                // GetResponse throws for error statuses (4xx/5xx), timeouts and connection failures,
                // so this is where most failed requests actually end up.
                if (ex.Response != null)
                {
                    ex.Response.Close();
                }

                Console.WriteLine($"请求出错: {ex.Status} {ex.Message}");
                return string.Empty;
            }
        }
    }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值