使用C#爬虫去爬取百度搜索,由自己组合成关键词,搜索各区县的工作报告,获取到百度搜索第一条的网页,爬取上面的标题,摘要,正文。
下面只举了三个例子,如果你想多尝试一下,也可以多输入几个。
使用了两个包:HtmlAgilityPack 和 ScrapySharp,都可以在 NuGet 里面下载安装。
废话不多说,开始上代码。
using System;
using System.Collections.Generic;
using ScrapySharp.Network;
using ScrapySharp.Extensions;
using HtmlAgilityPack;
using System.Text;
namespace ExtractBaiduFirstLink
{
class Program
{
    static void Main(string[] args)
    {
        // County names to search for; add more entries here to crawl more reports.
        string[] counties = { "高唐县", "阳谷县", "东平县" };

        // Iterate over the whole array instead of a hard-coded bound of 3.
        foreach (string county in counties)
        {
            try
            {
                string report = DownloadFile(county);
                Console.WriteLine(report);
            }
            catch (Exception ex)
            {
                // Surface the actual failure instead of swallowing it silently.
                Console.WriteLine("失败");
                Console.WriteLine(ex.Message);
            }
            Console.ReadKey();
        }
    }

    /// <summary>
    /// Builds the Baidu search URI for the given keyword, appending
    /// "人民政府工作报告" (people's government work report) to the county name.
    /// </summary>
    /// <param name="keyWord">The search keyword (a county name).</param>
    /// <returns>The full Baidu search URL with a percent-encoded wd parameter.</returns>
    private static string GetUri(string keyWord)
    {
        string wd = UriEncode(keyWord + "人民政府工作报告");
        return String.Format("https://www.baidu.com/s?wd={0}&rsv_spt=1&rsv_iqid=0x88b43a5c00074f90&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=10&rsv_sug1=2&rsv_sug7=100", wd);
    }

    /// <summary>
    /// Percent-encodes a string as UTF-8 bytes (the encoding Baidu expects
    /// for the wd query parameter).
    /// </summary>
    /// <param name="originStr">The raw keyword string.</param>
    /// <returns>The percent-encoded string, e.g. "%E9%AB%98".</returns>
    private static string UriEncode(string originStr)
    {
        StringBuilder encoded = new StringBuilder();
        byte[] bytes = Encoding.UTF8.GetBytes(originStr);
        foreach (byte b in bytes)
        {
            // "X2" zero-pads to two hex digits. The original
            // Convert.ToString(b, 16) emitted a single digit for bytes < 0x10,
            // producing invalid escapes such as "%e".
            encoded.Append('%').Append(b.ToString("X2"));
        }
        return encoded.ToString();
    }

    /// <summary>
    /// Searches Baidu for the county's government work report, follows the
    /// first result link and extracts the page title, abstract and body text.
    /// </summary>
    /// <param name="conties">The county name.</param>
    /// <returns>Title, abstract and body separated by newlines, or null on failure.</returns>
    private static string DownloadFile(string conties)
    {
        Uri searchUri = new Uri(GetUri(conties));
        ScrapingBrowser searchBrowser = new ScrapingBrowser { Encoding = Encoding.UTF8 };
        string searchHtml = searchBrowser.DownloadString(searchUri);
        HtmlDocument doc = new HtmlDocument();
        string returnStr = "";
        try
        {
            doc.LoadHtml(searchHtml);
            HtmlNode root = doc.DocumentNode;

            // Baidu organic result links live inside <h3 class="t"><a href="...">.
            List<string> links = new List<string>();
            foreach (var node in root.CssSelect("h3.t").CssSelect("a"))
            {
                links.Add(node.Attributes["href"].Value);
            }
            if (links.Count == 0)
            {
                // Original indexed links[0] unconditionally and threw here.
                Console.WriteLine("读取失败");
                return null;
            }
            string firstLink = links[0];

            // NOTE(review): assumes the target page is GB2312-encoded; on .NET
            // Core this requires registering CodePagesEncodingProvider — confirm.
            ScrapingBrowser pageBrowser = new ScrapingBrowser { Encoding = Encoding.GetEncoding("gb2312") };

            // The original called an undefined GetUA() (compile error) and set
            // the user agent on the search browser AFTER it had already been
            // used. Set a fixed UA on the browser that performs this request.
            pageBrowser.UserAgent = new FakeUserAgent("Chrome",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0 Safari/537.36");

            string pageHtml = pageBrowser.DownloadString(new Uri(firstLink));
            doc.LoadHtml(pageHtml);
            root = doc.DocumentNode;

            // Build the result with StringBuilder instead of repeated string
            // concatenation in loops.
            StringBuilder result = new StringBuilder();
            foreach (var node in root.CssSelect("div.title"))   // <div class="title"> — page title
            {
                result.Append(node.InnerText);
            }
            result.Append('\n');
            foreach (var node in root.CssSelect("div.des"))     // <div class="des"> — abstract
            {
                result.Append(node.InnerText);
            }
            result.Append('\n');
            foreach (var node in root.CssSelect("p"))           // body paragraphs
            {
                // Guard: an empty <p></p> has no FirstChild (original NRE'd here).
                if (node.FirstChild != null)
                {
                    result.Append(node.FirstChild.InnerText);
                }
            }
            returnStr = result.ToString();
        }
        catch (Exception ex)
        {
            Console.WriteLine("读取失败");
            Console.WriteLine(ex.Message);
            return null;
        }
        Console.WriteLine("结束");
        return returnStr;
    }
}
}