网络爬虫主要用于抓取其他网站上的数据。基本原理是:先把网页下载(down)下来,再通过XPath或正则表达式对其中的数据进行解析,最后组装成自己需要的数据。【本文使用了第三方组件HtmlAgilityPack(可通过NuGet下载),并用XPath来定位HTML节点】
我们先以中国民航局官网—法律法规的数据抓取为例子,网站地址:http://www.caac.gov.cn
封装代码:
/// <summary>
/// Scrapes the paged search results of the CAAC website (http://www.caac.gov.cn)
/// and maps every result row to a <see cref="ManualCivilEntity"/>.
/// </summary>
public class Service
{
    /// <summary>
    /// Search URL template: {0} is the 1-based page number, {1} is the column id
    /// (the "fl" query parameter).
    /// </summary>
    const string CaacGovUrl = @"http://www.caac.gov.cn/was5/web/search?page={0}&channelid=211383&fl={1}";

    /// <summary>
    /// Downloads every result page of one column and returns all parsed entries.
    /// </summary>
    /// <param name="cmColumnId">Column id, sent as the "fl" query parameter.</param>
    /// <returns>
    /// All entries found. The list is empty when the first page cannot be downloaded,
    /// has no element with id "outlinebar", or its page count cannot be read.
    /// </returns>
    public List<ManualCivilEntity> GetManualListFromCivilList(int cmColumnId)
    {
        var list = new List<ManualCivilEntity>();

        // DownloadUrl returns null or "" on failure; LoadHtml(null) would throw.
        string firstPageHtml = HttpHelper.DownloadUrl(string.Format(CaacGovUrl, 1, cmColumnId));
        if (string.IsNullOrEmpty(firstPageHtml))
        {
            return list;
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(firstPageHtml);
        HtmlNode pageNumberNode = doc.DocumentNode.SelectSingleNode(@"//*[@id='outlinebar']");
        if (pageNumberNode == null)
        {
            return list;
        }

        int pageNumber;
        if (!TryReadPageCount(pageNumberNode.InnerText, out pageNumber) || pageNumber < 1)
        {
            return list;
        }

        // Page 1 is already in memory: parse it instead of requesting it a second time.
        list.AddRange(ParseCivilManualList(firstPageHtml, cmColumnId));
        for (int page = 2; page <= pageNumber; page++)
        {
            list.AddRange(GetCivilManualList(page, cmColumnId));
        }
        return list;
    }

    /// <summary>
    /// Extracts the number between "共" and the following "页" from the pager text.
    /// </summary>
    /// <param name="pagerText">Inner text of the pager element.</param>
    /// <param name="pageCount">Parsed page count, or 0 when parsing fails.</param>
    /// <returns><c>true</c> when both markers are present and the text between them is an integer.</returns>
    private static bool TryReadPageCount(string pagerText, out int pageCount)
    {
        pageCount = 0;
        if (string.IsNullOrEmpty(pagerText))
        {
            return false;
        }

        int start = pagerText.IndexOf("共", StringComparison.Ordinal);
        if (start < 0)
        {
            return false;
        }

        // Search for the closing marker only after the opening one so the length is never negative.
        int end = pagerText.IndexOf("页", start, StringComparison.Ordinal);
        if (end < 0)
        {
            return false;
        }

        string digits = pagerText.Substring(start + 1, end - start - 1).Trim();
        return int.TryParse(digits, out pageCount);
    }

    /// <summary>
    /// Downloads one result page and parses its entries.
    /// </summary>
    /// <param name="page">1-based page number.</param>
    /// <param name="cmColumnId">Column id, sent as the "fl" query parameter.</param>
    /// <returns>Entries on that page; empty when the download fails or no rows match.</returns>
    private List<ManualCivilEntity> GetCivilManualList(int page, int cmColumnId)
    {
        string html = HttpHelper.DownloadUrl(string.Format(CaacGovUrl, page, cmColumnId));
        return ParseCivilManualList(html, cmColumnId);
    }

    /// <summary>
    /// Parses the result rows out of one page of markup.
    /// </summary>
    /// <param name="html">Raw page markup; may be null or empty.</param>
    /// <param name="cmColumnId">Column id stored on every produced entry.</param>
    /// <returns>One entry per matched table row; empty when nothing matches.</returns>
    private List<ManualCivilEntity> ParseCivilManualList(string html, int cmColumnId)
    {
        var result = new List<ManualCivilEntity>();
        if (string.IsNullOrEmpty(html))
        {
            return result;
        }

        // Strip tbody/span tags and the stray "</td>" before "</tr>" so the row XPath below matches.
        html = html.Replace("<tbody>", "").Replace("</tbody>", "").Replace("</td></tr>", "</tr>").Replace("</span>", "").Replace("<span>", "");

        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//*[@id='tbRe']/tr/td/table/tr");
        if (rows == null || rows.Count == 0)
        {
            return result;
        }

        foreach (var row in rows)
        {
            result.Add(GetCivilManual(row, cmColumnId));
        }
        return result;
    }

    /// <summary>
    /// Maps one result row to an entity.
    /// </summary>
    /// <param name="node">The table row node.</param>
    /// <param name="cmColumnId">Column id stored on the entity.</param>
    /// <returns>
    /// The populated entity. Fields whose markup is missing or unparsable keep the
    /// defaults assigned by <see cref="ManualCivilEntity"/>.
    /// </returns>
    private ManualCivilEntity GetCivilManual(HtmlNode node, int cmColumnId)
    {
        // Re-parse the row as its own document so the "//" XPaths below are scoped to this row only.
        var docChild = new HtmlDocument();
        docChild.LoadHtml(node.OuterHtml);
        var objCM = new ManualCivilEntity();

        // Title and link
        HtmlNode urlNode = docChild.DocumentNode.SelectSingleNode("//*/td[2]/a");
        if (urlNode != null)
        {
            objCM.CMTitle = urlNode.InnerText;
            var href = urlNode.Attributes["href"];
            if (href != null)
            {
                objCM.CMUrl = href.Value;
            }
        }

        // Publication date: only overwrite the entity default when the text is a valid date.
        DateTime createTime;
        if (TryParseChineseDate(GetInnerTextByPath(docChild, "//*/td[2]/div/ul[1]/li[2]").Replace("发文日期:", ""), out createTime))
        {
            objCM.CMCreateTime = createTime;
        }

        // Issuing unit
        objCM.CMCreator = GetInnerTextByPath(docChild, "//*/td[2]/div/ul[1]/li[1]").Replace("办文单位:", "");
        // Document number
        objCM.CMVersion = GetInnerTextByPath(docChild, "//*/td[2]/div/ul[3]/li[1]").Replace(" ", "").Replace("文号:", "");
        // Validity: 1 when the page says "有效", otherwise -1
        objCM.RowStatus = GetInnerTextByPath(docChild, "//*/td[2]/div/ul[3]/li[2]").Replace(" ", "").Replace("有效性:", "") == "有效" ? 1 : -1;
        objCM.CMColumnId = cmColumnId;

        return objCM;
    }

    /// <summary>
    /// Returns the inner text of the first node matching an XPath.
    /// </summary>
    /// <param name="htmlDoc">Document to search.</param>
    /// <param name="path">XPath expression.</param>
    /// <returns>The node's inner text, or an empty string when the arguments are invalid or nothing matches.</returns>
    private string GetInnerTextByPath(HtmlDocument htmlDoc, string path)
    {
        if (htmlDoc == null || string.IsNullOrWhiteSpace(path))
        {
            return string.Empty;
        }
        HtmlNode objNode = htmlDoc.DocumentNode.SelectSingleNode(path);
        return objNode != null ? objNode.InnerText : string.Empty;
    }

    /// <summary>
    /// Parses a Chinese-formatted date such as "2018年01月02日".
    /// </summary>
    /// <param name="dateTime">Date text; may be null, empty or padded with whitespace.</param>
    /// <param name="value">Parsed date, or <c>default(DateTime)</c> on failure.</param>
    /// <returns><c>true</c> when the text is a valid date.</returns>
    private static bool TryParseChineseDate(string dateTime, out DateTime value)
    {
        value = default(DateTime);
        if (string.IsNullOrWhiteSpace(dateTime))
        {
            return false;
        }

        string normalized = dateTime.Trim().Replace("年", "-").Replace("月", "-").Replace("日", "");
        // Invariant culture: the scraped text has a fixed format and must not depend on the machine locale.
        return DateTime.TryParse(
            normalized,
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out value);
    }
}
辅助代码:
/// <summary>
/// Minimal HTTP GET helper used to download page markup.
/// </summary>
public class HttpHelper
{
    /// <summary>
    /// Downloads the content at <paramref name="url"/>, decoding the body as UTF-8.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <returns>Same contract as <see cref="DownloadHtml"/>.</returns>
    public static string DownloadUrl(string url)
    {
        return DownloadHtml(url, Encoding.UTF8);
    }

    /// <summary>
    /// Sends a GET request with a 30 second timeout and a browser-like User-Agent,
    /// and returns the response body decoded with <paramref name="encode"/>.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="encode">Encoding used to decode the response body.</param>
    /// <returns>
    /// The response body on HTTP 200. An empty string when the response status is not 200
    /// or when the request fails with a <see cref="WebException"/> other than a 306 response.
    /// <c>null</c> when the server answers 306, when reading the body fails, or when any
    /// other exception occurs (for example an invalid or non-HTTP URL). No exception is
    /// propagated to the caller, so callers must check for null/empty.
    /// </returns>
    public static string DownloadHtml(string url, Encoding encode)
    {
        string html = string.Empty;
        try
        {
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                // Not an http/https address: there is nothing this helper can download.
                return null;
            }

            request.Timeout = 30 * 1000; // 30 seconds
            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
            request.ContentType = "text/html; charset=utf-8";

            using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
            {
                if (response == null)
                {
                    return null;
                }

                if (response.StatusCode == HttpStatusCode.OK)
                {
                    try
                    {
                        // "using" guarantees the reader is released even when ReadToEnd throws.
                        using (StreamReader sr = new StreamReader(response.GetResponseStream(), encode))
                        {
                            html = sr.ReadToEnd();
                        }
                    }
                    catch (Exception)
                    {
                        // Body could not be read or decoded: report failure as null.
                        html = null;
                    }
                }
            }
        }
        catch (WebException ex)
        {
            if (IsStatus306(ex))
            {
                html = null;
            }
        }
        catch (Exception)
        {
            html = null;
        }
        return html;
    }

    /// <summary>
    /// Tells whether a failed request was answered with HTTP status 306.
    /// </summary>
    /// <param name="ex">Exception raised by the request.</param>
    /// <returns><c>true</c> for a 306 response.</returns>
    private static bool IsStatus306(WebException ex)
    {
        // Prefer the numeric status code: the exception message is localized and only
        // matches the literal below on a Chinese-language runtime.
        HttpWebResponse failed = ex.Response as HttpWebResponse;
        if (failed != null && (int)failed.StatusCode == 306)
        {
            return true;
        }
        return ex.Message.Equals("远程服务器返回错误: (306)。");
    }
}
/// <summary>
/// One scraped document entry.
/// </summary>
public class ManualCivilEntity
{
    /// <summary>
    /// Creates an entry with its default values: empty strings, column id 0,
    /// creation time 1900-01-01 and row status 1.
    /// </summary>
    public ManualCivilEntity()
    {
        CMTitle = string.Empty;
        CMColumnId = 0;
        CMUrl = string.Empty;
        CMCreateTime = new DateTime(1900, 1, 1);
        CMCreator = string.Empty;
        CMVersion = string.Empty;
        RowStatus = 1;
    }

    /// <summary>Title.</summary>
    public string CMTitle { get; set; }

    /// <summary>Id of the menu (column) the entry belongs to.</summary>
    public int CMColumnId { get; set; }

    /// <summary>Link.</summary>
    public string CMUrl { get; set; }

    /// <summary>Publication time.</summary>
    public DateTime CMCreateTime { get; set; }

    /// <summary>Issuing unit.</summary>
    public string CMCreator { get; set; }

    /// <summary>Document number.</summary>
    public string CMVersion { get; set; }

    /// <summary>Validity (0: default; 1: valid; -1: invalid). Initialised to 1.</summary>
    public int RowStatus { get; set; }
}
调用端代码:
/// <summary>
/// Console entry point that runs the crawl and prints every entry.
/// </summary>
class Program
{
    /// <summary>
    /// Crawls column 12 of the CAAC site, writes each entry to the console and
    /// then waits for a line of input before exiting.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        var crawler = new Service();
        Console.WriteLine("开始爬取数据....");

        // The column id is the "fl" query parameter of
        // http://www.caac.gov.cn/XXGK/XXGK/index_172.html?fl=12
        var entries = crawler.GetManualListFromCivilList(12);
        entries.ForEach(WriteEntry);

        Console.WriteLine("爬取数据结束!!!");
        Console.ReadLine();
    }

    /// <summary>
    /// Writes one entry followed by a separator line.
    /// </summary>
    /// <param name="entry">Entry to print.</param>
    private static void WriteEntry(ManualCivilEntity entry)
    {
        var text = $"标题:{entry.CMTitle}\n链接:{entry.CMUrl}\n发布时间:{entry.CMCreateTime}\n办文单位:{entry.CMCreator}\n文号:{entry.CMVersion}\n有效性:{entry.RowStatus}";
        Console.WriteLine(text);
        Console.WriteLine();
        Console.WriteLine("***************************************************************");
        Console.WriteLine();
    }
}
效果:
源码下载:https://pan.baidu.com/s/1719aeG2h_r2JF96RKhMLSw baq7
注:想要获取页面上指定的内容、需要确认其路径(XPath)时,可以直接使用HTML工具查看,如图:
取到该路径后,可以将其传给doc.DocumentNode.SelectSingleNode进行读取