C#超简单爬虫demo

最新推荐文章于 2024-08-16 18:14:18 发布

xgq_Star

最新推荐文章于 2024-08-16 18:14:18 发布

阅读量948

点赞数 5

分类专栏：爬虫文章标签：正则表达式 c# regex

本文链接：https://blog.csdn.net/xgq_Star/article/details/106024418

版权

爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

运用正则表达式匹配页面中的图片链接，实现对煎蛋网图片的爬取。代码很短，值得新手一试。

不说废话了，直接上图。

在这里插入图片描述

using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace Crawler
{
    /// <summary>
    /// Minimal console crawler: downloads one page, extracts the <c>src</c> of every
    /// <c>&lt;img&gt;</c> tag that points at a png/jpg/gif file, and saves each image
    /// into a local folder under a sequential file name.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Matches an <c>&lt;img&gt;</c> tag and captures, in the group <c>Collect</c>,
        /// a quoted <c>src</c> value that ends in <c>.png</c>, <c>.jpg</c> or <c>.gif</c>.
        /// The quote class is <c>['"]</c>; a <c>|</c> inside a character class is a
        /// literal pipe, not an alternation, so it must not be listed there.
        /// </summary>
        private static readonly Regex ImageTagRegex = new Regex(
            "<img.*?src=['\"](?<Collect>(.*?(?:\\.(?:png|jpg|gif))))['\"]");

        /// <summary>
        /// Entry point. Fetches the page, then downloads every matched image.
        /// Errors are printed to the console; a failing image does not stop the run.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string url = "http://jandan.net/top-zoo";
            string path = @"D:\Picture\";

            string str;
            try
            {
                str = DownloadPage(url);
            }
            catch (Exception ex)
            {
                // Top-level boundary of a console tool: report and exit instead of crashing.
                Console.WriteLine("-------------" + ex);
                Console.ReadKey();
                return;
            }

            if (string.IsNullOrEmpty(str))
            {
                Console.WriteLine("————————-错误—————————");
                Console.ReadKey();
                // Nothing to parse; stop here rather than falling through to the crawl.
                return;
            }

            try
            {
                // DownloadFile does not create missing folders, so make sure it exists first.
                Directory.CreateDirectory(path);
            }
            catch (Exception ex)
            {
                Console.WriteLine("-------------" + ex);
                Console.ReadKey();
                return;
            }

            Uri pageUri = new Uri(url);
            int name = 0;
            using (WebClient client = new WebClient())
            {
                foreach (Match match1 in ImageTagRegex.Matches(str))
                {
                    string src = match1.Groups["Collect"].Value;
                    name++;

                    // Per-image try/catch: one broken link must not abort the remaining downloads.
                    try
                    {
                        Uri imageUri = ResolveImageUri(pageUri, src);
                        if (imageUri == null)
                        {
                            Console.WriteLine("-------------" + src);
                            continue;
                        }

                        // The regex guarantees the captured value ends in .png/.jpg/.gif,
                        // so the text from the last dot onward is the real extension.
                        string extension = src.Substring(src.LastIndexOf('.'));
                        string target = Path.Combine(path, name + extension);

                        Console.WriteLine("\n正在爬取———————" + "|" + imageUri.AbsoluteUri);
                        client.DownloadFile(imageUri, target);
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("-------------" + ex);
                    }
                }
            }

            Console.ReadKey();
        }

        /// <summary>
        /// Performs a GET request against <paramref name="url"/> and returns the
        /// response body decoded as UTF-8. The response and reader are disposed
        /// before returning.
        /// </summary>
        /// <param name="url">Absolute HTTP(S) address of the page to fetch.</param>
        /// <returns>The full response body as text.</returns>
        private static string DownloadPage(string url)
        {
            HttpWebRequest webRequest = WebRequest.CreateHttp(url);
            webRequest.Method = "GET";
            webRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ";

            using (WebResponse webResponse = webRequest.GetResponse())
            using (StreamReader streamReader = new StreamReader(webResponse.GetResponseStream(), Encoding.UTF8))
            {
                return streamReader.ReadToEnd();
            }
        }

        /// <summary>
        /// Turns a raw <c>src</c> attribute value into an absolute URI, resolving it
        /// against the page address. This covers protocol-relative values
        /// (<c>//host/a.jpg</c>), already-absolute values and page-relative paths,
        /// instead of blindly prefixing <c>http:</c>.
        /// </summary>
        /// <param name="pageUri">Address of the page the tag was found on.</param>
        /// <param name="src">Raw <c>src</c> attribute value.</param>
        /// <returns>The absolute URI, or <c>null</c> when the value cannot be resolved.</returns>
        private static Uri ResolveImageUri(Uri pageUri, string src)
        {
            Uri resolved;
            return Uri.TryCreate(pageUri, src, out resolved) ? resolved : null;
        }

    }
}