using Crawler.Helper;
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
namespace Crawler
{
class Program
{
/// <summary>
/// Program entry point: runs the JD search crawl via <see cref="GetGoodsFromJD"/>.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
// XPath tutorial: https://www.w3school.com.cn/xpath/index.asp
// Web crawler notes
// Pipeline: Application -- WebRequest -- Filter -- Thread -- Data
// What is a crawler used for?
// 1. Content sites: novels / comics / movies
// 2. Data collection and competitor analysis
// The robots protocol (robots.txt)
// Telling users from crawlers: User-Agent / Referer
// Login detection: send the cookie along with the request
// IP blacklist / whitelist: send requests through proxies
// When a crawler is suspected, the site answers with a captcha that must be solved
// JS dynamic loading / dynamic modification / data rendered as images
// HttpWebRequest
// WebClient
// Data filtering: regular expressions, or the third-party HtmlAgilityPack package (supports XPath)
// Crawl
// Alternative entry point (CPU category list), currently disabled:
//GetCPUListFromJD();
// Baidu crawler rules:
// title, keyword, description and some other fixed tags take priority
// Cross-origin requests: JSONP
GetGoodsFromJD();
}
#region MyRegion
/// <summary>
/// Downloads the first JD CPU category list page in two requests and prints the products of each.
/// The first request returns the initially rendered items; the second is the "lazy load"
/// request, which passes the SKUs already shown via the <c>show_items</c> query parameter.
/// </summary>
public static void GetCPUListFromJD()
{
    // The same timestamp is passed to both requests and embedded in the lazy-load log_id.
    var timestamp = GetTimestamp().ToString();
    var shownSkus = new List<string>();

    // First request (page=1).
    // Older endpoint, kept for reference:
    //   https://list.jd.com/list.html?cat=670%2C677%2C678&psort=3&cid3=678&cid2=677
    string firstUrl = "https://list.jd.com/listNew.php?cat=670%2C677%2C678&psort=3&psort=3&page=1&click=0";
    string firstHtml = HttpRequestHelper.GetWithCookie(firstUrl, timestamp);
    var firstDoc = new HtmlDocument();
    firstDoc.LoadHtml(firstHtml);
    ShowProduct(firstDoc, shownSkus);

    Console.WriteLine("*****************************懒加载*************************************");

    // Lazy-load the remaining 30 items (original author's note). For further pages just keep
    // incrementing `page`: every visible page consists of two `page` values, the last visible
    // page may have only one, so returned IDs must be compared with the ones already seen.
    string lazyUrl =
        "https://list.jd.com/listNew.php?cat=670%2C677%2C678&psort=3&psort=3&page=2&scrolling=y&log_id="
        + timestamp
        + ".7563&tpl=1_M&isList=1"
        + "&show_items="
        + string.Join(",", shownSkus);
    string lazyBody = HttpRequestHelper.GetWithCookie(lazyUrl, timestamp);
    var lazyDoc = new HtmlDocument();
    lazyDoc.LoadHtml(lazyBody);
    ShowProductLazy(lazyDoc);
}
/// <summary>
/// Prints name, price and SKU of every product under the <c>J_goodsList</c> element and
/// appends each non-empty SKU to <paramref name="idList"/>.
/// </summary>
/// <param name="htmlDoc">Parsed list page.</param>
/// <param name="idList">Receives the SKUs found, in page order.</param>
private static void ShowProduct(HtmlDocument htmlDoc, List<string> idList)
{
    PrintProducts(htmlDoc, "//*[@id=\"J_goodsList\"]/ul/li", idList);
}

/// <summary>
/// Prints name, price and SKU of every <c>li</c> element in a lazy-load response.
/// </summary>
/// <param name="htmlDoc">Parsed lazy-load fragment.</param>
private static void ShowProductLazy(HtmlDocument htmlDoc)
{
    PrintProducts(htmlDoc, "//li", null);
}

/// <summary>
/// Shared implementation of <see cref="ShowProduct"/> and <see cref="ShowProductLazy"/>:
/// selects the product items with <paramref name="itemsXpath"/> and writes each one to the console.
/// Missing nodes or attributes produce empty strings instead of a NullReferenceException.
/// </summary>
/// <param name="htmlDoc">Document to search.</param>
/// <param name="itemsXpath">XPath selecting one node per product.</param>
/// <param name="idList">Optional collector for the SKUs; may be null.</param>
private static void PrintProducts(HtmlDocument htmlDoc, string itemsXpath, List<string> idList)
{
    const string nameXpath = "//div[@class=\"tab-content-item tab-cnt-i-selected\"]/div[@class=\"p-name p-name-type-3\"]/a/em|//div[@class=\"gl-i-wrap\"]/div[@class=\"p-name p-name-type-3\"]/a/em";
    const string priceXpath = "//div[@class=\"tab-content-item tab-cnt-i-selected\"]/div[@class=\"p-price\"]/strong/i|//div[@class=\"gl-i-wrap\"]/div[@class=\"p-price\"]/strong/i";

    // HtmlAgilityPack's SelectNodes returns null, not an empty collection, when nothing
    // matches (e.g. the request failed or the page layout changed).
    HtmlNodeCollection items = htmlDoc.DocumentNode.SelectNodes(itemsXpath);
    if (items == null)
    {
        Console.WriteLine("No product nodes matched: " + itemsXpath);
        return;
    }

    for (int i = 0; i < items.Count; i++)
    {
        HtmlNode item = items[i];

        // The attribute indexer yields null when the attribute is absent.
        string sku = item.Attributes["data-sku"]?.Value ?? "";
        if (idList != null && sku.Length > 0)
        {
            idList.Add(sku);
        }

        // NOTE(review): each item is re-parsed into its own document, presumably because the
        // XPath expressions start with `//` and would otherwise search the whole page —
        // confirm before switching to relative (`.//`) paths.
        HtmlDocument itemDoc = new HtmlDocument();
        itemDoc.LoadHtml(item.OuterHtml);

        string name = LastChildText(itemDoc.DocumentNode.SelectSingleNode(nameXpath));
        string price = LastChildText(itemDoc.DocumentNode.SelectSingleNode(priceXpath));

        Console.WriteLine($"{i + 1}***{name}");
        Console.WriteLine("\t" + price);
        Console.WriteLine("\t" + sku);
    }
}

/// <summary>
/// Returns the trimmed inner text of the node's last child, or an empty string when the node
/// or its last child is missing.
/// </summary>
private static string LastChildText(HtmlNode node)
{
    return node?.LastChild?.InnerText.Trim() ?? "";
}
/// <summary>
/// Returns the number of milliseconds elapsed between 1970-01-01 00:00:00 and the current
/// UTC time, converted to a 64-bit integer with <see cref="Convert.ToInt64(double)"/>.
/// </summary>
public static long GetTimestamp()
{
    DateTime unixEpoch = new DateTime(1970, 1, 1, 0, 0, 0, 0);
    double elapsedMilliseconds = DateTime.UtcNow.Subtract(unixEpoch).TotalMilliseconds;
    return Convert.ToInt64(elapsedMilliseconds);
}
#endregion
/// <summary>
/// Downloads a JD search result page (the keyword in the URL is the percent-encoded UTF-8
/// form of "烟灶"), parses it and hands the document to <see cref="ShowGoods"/>.
/// </summary>
public static void GetGoodsFromJD()
{
    const string searchUrl = "https://search.jd.com/Search?keyword=%E7%83%9F%E7%81%B6&enc=utf-8&wq=&pvid=b16267386d2746149da973289c0eb547";

    // Plain GET: no browser headers or preset cookie are attached for the search page.
    string body = HttpRequestHelper.Get(searchUrl);

    var searchDoc = new HtmlDocument();
    searchDoc.LoadHtml(body);
    ShowGoods(searchDoc);
}
/// <summary>
/// Unimplemented stub called by <see cref="GetGoodsFromJD"/> with the parsed search page.
/// Presumably meant to print the goods found in the document — TODO confirm and implement.
/// </summary>
/// <param name="htmlDoc">Parsed search result page; currently ignored.</param>
private static void ShowGoods(HtmlDocument htmlDoc)
{
// Intentionally empty for now: nothing is extracted or printed.
}
}
}
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using System.Text;
namespace Crawler.Helper
{
/// <summary>
/// Synchronous HTTP GET helpers built on <see cref="HttpWebRequest"/>.
/// Both methods return the response body decoded as UTF-8, or an empty string when the
/// request fails or the status is not 200 OK.
/// </summary>
public class HttpRequestHelper
{
    /// <summary>
    /// Performs a plain GET request.
    /// </summary>
    /// <param name="strUrl">Absolute URL to request.</param>
    /// <returns>The response body, or <see cref="string.Empty"/> on failure.</returns>
    public static string Get(string strUrl)
    {
        HttpWebRequest webRequest = CreateGetRequest(strUrl);
        return ReadBody(webRequest);
    }

    /// <summary>
    /// Performs a GET request that imitates a browser: it sends a Chrome User-Agent, a JD list
    /// page Referer and a hard-coded cookie header.
    /// </summary>
    /// <param name="strUrl">Absolute URL to request.</param>
    /// <param name="timestamp">Not used by this method; kept so existing callers keep compiling.</param>
    /// <returns>The response body, or <see cref="string.Empty"/> on failure.</returns>
    public static string GetWithCookie(string strUrl, string timestamp)
    {
        HttpWebRequest webRequest = CreateGetRequest(strUrl);
        webRequest.Accept = "*/*";
        webRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36";

        // Hard-coded cookie values; they look like a captured browser session and will
        // presumably go stale — consider moving them to configuration.
        // NOTE(review): ReadBody assigns a CookieContainer before sending. On .NET Framework
        // the Cookie header is rebuilt from the container at send time, which may discard this
        // manually added header — confirm the cookie actually reaches the server.
        webRequest.Headers.Add("cookie", "__jdv=76161171|direct|-|none|-|1600765774351; __jdu=16007657743431160837809; areaId=27; ipLoc-djd=27-2376-4343-0; PCSYCityID=CN_610000_610100_610113; shshshfpa=44240f70-4882-bdfe-8f4d-f151f25e6528-1600765778; 3AB9D23F7A4B3C9B=NYT6ANOUVL54GXWA6X62NQBHXA5IP2IO6HQ62JZLUMM3X2X3DUHLU2OZVZPP3GHHSCY3KONPWPRMP4B6QMFTXFGKQA; shshshfp=da950713a5d998500611bbdd0cf7fbf6; shshshfpb=uEKYPWO%20GAUdAXA0E6dsM8w%3D%3D; __jda=122270672.16007657743431160837809.1600765774.1601343776.1601362283.8; __jdc=122270672; __jdb=122270672.2.16007657743431160837809|8.1601362283; shshshsID=9839a741b5e632add50573bd32d82a5a_1_1601362284090");
        webRequest.Referer = "https://list.jd.com/list.html?cat=670%2C677%2C678&psort=3&cid3=678&cid2=677";

        return ReadBody(webRequest);
    }

    /// <summary>
    /// Creates a GET request for <paramref name="strUrl"/> with the content type both public
    /// methods use.
    /// </summary>
    private static HttpWebRequest CreateGetRequest(string strUrl)
    {
        HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(strUrl);
        webRequest.Method = "GET";
        webRequest.ContentType = "text/html;charset=utf-8";
        return webRequest;
    }

    /// <summary>
    /// Sends the request and returns the body decoded as UTF-8.
    /// Returns <see cref="string.Empty"/> after logging to the console when the status is not
    /// 200 OK or when <see cref="WebException"/> is thrown.
    /// </summary>
    private static string ReadBody(HttpWebRequest webRequest)
    {
        // Fresh container so cookies set by the server are collected for this request.
        webRequest.CookieContainer = new CookieContainer();

        try
        {
            // Direct cast instead of `as`: a wrong type fails here with a clear exception
            // rather than as a NullReferenceException on the next line.
            using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
            {
                if (webResponse.StatusCode != HttpStatusCode.OK)
                {
                    Console.WriteLine("请求出错");
                    return string.Empty;
                }

                using (StreamReader reader = new StreamReader(webResponse.GetResponseStream(), Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch (WebException ex)
        {
            // GetResponse throws for HTTP error statuses and transport failures, so the
            // status check above never sees them; report them the same way here.
            Console.WriteLine($"请求出错: {ex.Status} {ex.Message}");
            return string.Empty;
        }
    }
}
}