爬虫过程记录
特别说明
- 单纯使用 HtmlAgilityPack 做爬虫,需要掌握 XPath 语法;这里借助 Fizzler,可以直接用 CSS 选择器来选取节点。
- 不过还是用 Python 做爬虫更快一些,功能也更全。
第一步,建立控制台程序
第二步,引入 Fizzler.Systems.HtmlAgilityPack 和 HtmlAgilityPack 这两个库。
第三步,上代码
using System.IO;
using System.Net;
using System.Text;
namespace HtmlAgilityPackSpider
{
    /// <summary>
    /// Helper for downloading the raw text of a web page.
    /// </summary>
    class HtmlHelper
    {
        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its content as a string.
        /// </summary>
        /// <param name="url">Address of the resource to download.</param>
        /// <returns>The complete response body, decoded with <see cref="Encoding.Default"/>.</returns>
        /// <remarks>
        /// NOTE(review): <see cref="Encoding.Default"/> depends on the runtime/platform, so a page
        /// served in a different charset may be decoded incorrectly - confirm against the target site.
        /// </remarks>
        public static string GetWebClient(string url)
        {
            // The using blocks release the client, the response stream and the reader
            // even when OpenRead or ReadToEnd throws; the original only closed the
            // stream on the success path and never disposed the WebClient.
            using (WebClient client = new WebClient())
            using (Stream stream = client.OpenRead(url))
            using (StreamReader reader = new StreamReader(stream, Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }
    }
}
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
using System;
namespace HtmlAgilityPackSpider
{
    /// <summary>
    /// Console entry point: downloads one page and prints the text of the node
    /// matched by a CSS selector.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Fetches the hard-coded page, extracts the node matched by the hard-coded
        /// selector and prints the result, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("要输出了\n");
            string url = "http://xxgk.chd.edu.cn/info/1066/2419.htm";
            string htmlNode = "td.titlestyle46105";
            string res = zySpider(url, htmlNode);
            Console.WriteLine("爬虫链接是:"+url+"\n 新闻标题是:\n"+res);
            // Keep the console window open until the user presses a key.
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the inner text of
        /// the first node matching the CSS selector <paramref name="htmlNode"/>.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="htmlNode">CSS selector identifying the node to extract.</param>
        /// <returns>
        /// The inner text of the first matching node, or an empty string when the
        /// selector matches nothing.
        /// </returns>
        public static string zySpider(string url, string htmlNode)
        {
            string pageText = HtmlHelper.GetWebClient(url);

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(pageText);

            // QuerySelector yields null when nothing matches; without this guard the
            // InnerText access below would throw a NullReferenceException.
            var match = document.DocumentNode.QuerySelector(htmlNode);
            if (match == null)
            {
                return string.Empty;
            }

            return match.InnerText;
        }
    }
}
结果
小技巧,用谷歌浏览器开发者工具查询节点。