最近想要通过C#调用Python脚本,奈何在第三方库的调用上一直出现问题,于是我退而求其次,首先尝试用C#实现爬虫,期望达到触类旁通的效果。
一、效果(以某小说网站为例)
1、实现小说的搜索功能(post请求)
注:获取到书籍名与其目录的链接。
// Holds one "title,tocUrl" entry for every search hit that carries a link.
List<String> SaveString = new List<string>();
#region Search for books and collect each title with its table-of-contents link
// The keyword is URL-encoded so that characters such as '&', '=', '+' or spaces
// cannot corrupt the form-urlencoded request body.
string getValue = PostInf(@"https://www.xbiquge.la/modules/article/waps.php", "searchkey=" + Uri.EscapeDataString(textBox1.Text));
richTextBox1.AppendText(getValue);
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(getValue); // parse the response text into an HtmlDocument
Console.WriteLine(getValue);
// SelectSingleNode yields null when the page contains no result table.
var node = doc.DocumentNode.SelectSingleNode("//table[@class='grid']");
HtmlNodeCollection aCollection = node == null ? null : node.ChildNodes;
if (aCollection != null)
{
    // Built once instead of once per row; capture group 1 is the (possibly quoted) href value.
    Regex reg = new Regex(@"href\s*=\s*((""[^""]*"")|(\'[^\']*\')|[^>^\s]+)");
    foreach (var item in aCollection)
    {
        // Children of the table that have no second child node are skipped explicitly;
        // previously the out-of-range index was swallowed by an empty catch.
        if (item.ChildNodes.Count < 2)
        {
            continue;
        }

        var titleCell = item.ChildNodes[1];
        Match m = reg.Match(titleCell.InnerHtml);
        if (m.Success)
        {
            // Take the captured value and strip the surrounding quotes. Splitting the
            // match on '=' would truncate any link that itself contains '='.
            string link = m.Groups[1].Value.Trim('"', '\'');
            SaveString.Add(titleCell.InnerText + "," + link);
        }

        // The cell text is listed even when no link was found in it.
        listBox1.Items.Add(titleCell.InnerText);
    }
}
#endregion
/// <summary>
/// Sends an HTTP POST request with an <c>application/x-www-form-urlencoded</c> body
/// and returns the response body as text.
/// </summary>
/// <param name="url">Address the request is sent to.</param>
/// <param name="str">Request body, written to the request stream exactly as given.</param>
/// <returns>The complete response body.</returns>
public static string PostInf(string url, string str)
{
    // SSL 3.0 is intentionally not enabled: it is an insecure protocol and some
    // .NET runtimes refuse the value outright. The numeric casts keep the code
    // compilable against framework versions whose enum lacks Tls11/Tls12.
    ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls
        | (SecurityProtocolType)0x300   // Tls11
        | (SecurityProtocolType)0xC00;  // Tls12

    // A direct cast fails with a descriptive InvalidCastException for non-HTTP URLs,
    // where "as" would produce a null that is dereferenced on the next line.
    var re = (HttpWebRequest)WebRequest.Create(url);
    re.ContentType = "application/x-www-form-urlencoded";
    re.Timeout = 20000;
    re.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0";
    re.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
    re.Method = "POST";

    using (StreamWriter sw = new StreamWriter(re.GetRequestStream()))
    {
        // Write, not WriteLine: a trailing line break would become part of the last form value.
        sw.Write(str);
    }

    // The response is disposed as well, so the underlying connection is released.
    using (var ans = re.GetResponse())
    using (var st = new StreamReader(ans.GetResponseStream()))
    {
        return st.ReadToEnd();
    }
}
2、根据小说名展现对应目录,并可选择目录查看文章内容(get请求)
注:我的正则学得不好,用的多是C#的那一套写法,读者可自行修改。
#region Fetch the chapter links from a book's table of contents
// `i` is declared outside this snippet; presumably a "title,tocUrl" entry — verify against the caller.
var html = i.Split(',')[1];
HtmlWeb web = new HtmlWeb();
HtmlAgilityPack.HtmlDocument htmlDoc = web.Load(html);
// Both the list container and its second child are checked before use, so a page
// with a different layout is skipped instead of raising a NullReferenceException.
var listDiv = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='list']");
var node = (listDiv != null && listDiv.ChildNodes.Count > 1) ? listDiv.ChildNodes[1] : null;
HtmlNodeCollection aCollection = node == null ? null : node.ChildNodes;
if (aCollection != null)
{
    // Built once instead of once per entry; capture group 1 is the (possibly quoted) href value.
    Regex reg = new Regex(@"href\s*=\s*((""[^""]*"")|(\'[^\']*\')|[^>^\s]+)");
    // Avoids a doubled slash when the table-of-contents URL already ends with '/'.
    string baseUrl = html.TrimEnd('/');
    foreach (var item in aCollection)
    {
        Match m = reg.Match(item.InnerHtml);
        if (!m.Success)
        {
            continue;
        }

        // Strip the surrounding quotes from the captured href value.
        string href = m.Groups[1].Value.Trim('"', '\'');
        // The chapter file name is taken from the fourth '/'-separated segment of the href.
        // Links with fewer segments are skipped explicitly rather than via a swallowed exception.
        string[] segments = href.Split('/');
        if (segments.Length < 4)
        {
            continue;
        }

        // Store "chapterTitle,chapterUrl" and show the title in the list.
        string value = item.InnerText + "," + baseUrl + "/" + segments[3];
        SaveString1.Add(value);
        listBox2.Items.Add(item.InnerText);
    }
}
#endregion
/// <summary>
/// Loads the page at <paramref name="url"/> and appends the text of its
/// <c>content</c> element to <c>richTextBox1</c>.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <exception cref="InvalidOperationException">
/// The loaded page has no element with id <c>content</c>.
/// </exception>
public void getUrlText(string url)
{
    HtmlWeb web = new HtmlWeb();
    var htmlDoc = web.Load(url);
    var node = htmlDoc.GetElementbyId("content");
    if (node == null)
    {
        // Fail with a message that names the page instead of a bare NullReferenceException.
        throw new InvalidOperationException("No element with id 'content' was found at " + url);
    }

    // Each space in the extracted text is replaced by: quote, line break, quote.
    string saveText = node.InnerText.Replace(" ", "\"\n\"");
    richTextBox1.AppendText(saveText);
}
3、实现文字的修饰(随便写写,加了修改字号与字体)
// Font size setting: applies the size entered in comboBox1 to richTextBox1.
private void comboBox1_TextChanged(object sender, EventArgs e)
{
    // The text may be empty or not (yet) a number when this handler runs, and the
    // Font constructor rejects sizes that are not positive, so such input is ignored.
    int size;
    if (!int.TryParse(comboBox1.Text, out size) || size <= 0)
    {
        return;
    }

    richTextBox1.Font = new Font(richTextBox1.Font.FontFamily, size, richTextBox1.Font.Style);
}
// Font family setting: applies the family named in comboBox2 to richTextBox1.
private void comboBox2_TextChanged(object sender, EventArgs e)
{
    // Keep the current size unless comboBox1 holds a valid positive number, so that
    // changing the family never fails merely because the size box is empty.
    float size = richTextBox1.Font.SizeInPoints;
    int chosenSize;
    if (int.TryParse(comboBox1.Text, out chosenSize) && chosenSize > 0)
    {
        size = chosenSize;
    }

    richTextBox1.Font = new Font(comboBox2.Text, size, richTextBox1.Font.Style);
}
4、最终效果图
图片涉及违规,没办法,我只好给它加上了马赛克。
资源已经上传,自取地址如下:
小说爬取窗体:https://download.csdn.net/download/beichuanshangren/86406320