花了几个小时写了一个小程序,没什么技术含量,第一次写博客。本人是个菜鸟,想记录一下自己的学习。
运行效果如图:
代码如下:
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace ConsoleApplication2
{
    /// <summary>
    /// Console crawler that prints the post titles and links found on the
    /// cnblogs.com home page and appends them to a text file.
    /// </summary>
    class Program
    {
        // Home page address. The page marker is appended to this constant for
        // every request, so the base itself is never modified.
        private const string BaseUrl = "https://www.cnblogs.com/";

        // Text file that receives one "title + link" line per matched post.
        private const string OutputPath = @"d:\cnblog.txt";

        // Per-request timeout in milliseconds.
        private const int RequestTimeoutMs = 3000;

        // Sample of the tag being matched:
        //   <a class="titlelnk" href="https://www.xxxxx.html" target="_blank">post title</a>
        // Capture group 1 is the href, capture group 2 is the title text.
        // ".*?" is a lazy match-anything, used because both values vary per post.
        private static readonly Regex TitleRegex =
            new Regex("<a class=\"titlelnk\" href=\"(.*?)\" target=\"_blank\">(.*?)</a>");

        /// <summary>
        /// Requests each page in turn, extracts every title link, prints it to
        /// the console and appends it to <c>d:\cnblog.txt</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            int num = 0;        // running count of titles found so far
            int pagSize = 100;  // number of pages to crawl, e.g. 10 means pages 1..10

            // Inclusive upper bound so that exactly pagSize pages are requested.
            for (int i = 1; i <= pagSize; i++)
            {
                // Build the address from the constant base each time; appending to
                // a mutable variable would yield ".../#p1#p2#p3...".
                // NOTE(review): "#pN" is a URL fragment, which is normally not sent
                // to the server in an HTTP request, so every iteration may receive
                // the same HTML. Confirm the site's real paging address.
                string url = BaseUrl + "#p" + i;

                string html = GetHtmlString(url);
                if (string.IsNullOrEmpty(html))
                {
                    // Request failed (already logged) or returned nothing: next page.
                    continue;
                }

                foreach (Match ma in TitleRegex.Matches(html))
                {
                    // The groups are already captured on the match itself.
                    string titlelnk = ma.Groups[1].Value;
                    string title = ma.Groups[2].Value;
                    num++;

                    // NOTE(review): "\b" is the backspace escape in a C# string;
                    // it may have been meant as "\t". Left unchanged to preserve output.
                    Console.WriteLine("第" + num + "个标题:\b" + title + "Url:" + titlelnk);
                    File.AppendAllText(OutputPath, title + titlelnk + "\r\n");
                }
            }

            Console.WriteLine("结束一共爬了" + num + "个标题");
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body
        /// decoded as UTF-8 text.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <returns>
        /// The response body, or <c>null</c> if anything fails; the exception is
        /// written to the console in that case.
        /// </returns>
        public static string GetHtmlString(string url)
        {
            try
            {
                WebRequest request = WebRequest.Create(url);

                // The timeout must be set before GetResponse() to have any effect.
                request.Timeout = RequestTimeoutMs;

                // Dispose response, stream and reader so the connection is released.
                using (WebResponse response = request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: report the failure and let the caller
                // skip this page instead of aborting the whole crawl.
                Console.WriteLine(ex.ToString());
                return null;
            }
        }
    }
}