前言
最近在项目上使用了 HtmlAgilityPack 抓取网页内容。由于需要根据页面节点获取信息,所以需要使用 XPath;通过谷歌浏览器的 XPath Helper 插件可以很方便地找到对应的节点,方法如下。
使用方法
1、在谷歌应用商店中搜索 xpath,在结果中找到并安装 XPath Helper。
2、安装完成后会在谷歌浏览器右上角出现X图标
3、在浏览器中打开需要抓取内容的网址,按 Ctrl+Shift+X 打开 XPath Helper 面板,然后按住 Shift 键并将鼠标移到目标元素上,面板中会实时显示该元素对应的 XPath,即可用它来获取 node。
4、代码实例如下
/// <summary>
/// Single <see cref="HttpClient"/> instance reused by every request issued from this type.
/// </summary>
private static readonly HttpClient client = new HttpClient();
/// <summary>
/// Downloads the news listing page at https://news.163.com/domestic/, decodes the
/// response bytes as GB2312, parses the markup with HtmlAgilityPack and looks up
/// the top-news container node by XPath.
/// </summary>
/// <returns>
/// A task that completes once the page has been fetched and parsed. The task
/// completes without further work when the XPath matches no node.
/// </returns>
/// <exception cref="HttpRequestException">
/// Propagated from <see cref="HttpClient.GetByteArrayAsync(string)"/> when the request fails.
/// </exception>
public async Task GetNews()
{
    // Presumably filled by the follow-up processing at the end of this method.
    var articles = new List<Article>();
    var urlNewsList = "https://news.163.com/domestic/";

    // Fetch the HTML as raw bytes so the text encoding can be chosen explicitly below.
    byte[] responseNewsList = await client.GetByteArrayAsync(urlNewsList);

    // Decode the bytes as GB2312. The code-pages provider is registered first because
    // GB2312 is not among the encodings .NET Core exposes by default.
    // NOTE(review): assumes the page is served as GB2312 - confirm against the response charset.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    string tempNewsList = Encoding.GetEncoding("GB2312").GetString(responseNewsList);

    // Parse the decoded HTML.
    HtmlDocument htmlNewsList = new HtmlDocument();
    htmlNewsList.LoadHtml(tempNewsList);

    // NOTE(review): this XPath was taken from the browser (XPath Helper), so it describes the
    // rendered DOM. Class values that scripts add at runtime would be absent from the raw
    // HTML parsed here - verify that every [@class=...] predicate matches the downloaded markup.
    var post_listnode = htmlNewsList.DocumentNode.SelectSingleNode("/html[@class=' ua-win']/body[@class='ns9']/div[@class='second2016_wrap guonei_second_wrap']/div[@class='second2016_content']/div[@class='ns_area top_news clearfix']");

    // SelectSingleNode returns null when nothing matches; stop here so the follow-up
    // processing never dereferences a missing node.
    if (post_listnode == null)
    {
        return;
    }

    // Follow-up processing of the fetched site content goes here.
}