C#简单爬虫（爬取图片）

最新推荐文章于 2024-07-25 14:16:13 发布

xgq_Star

最新推荐文章于 2024-07-25 14:16:13 发布

阅读量2.9k

点赞数 6

分类专栏：爬虫　文章标签：c# http

本文链接：https://blog.csdn.net/xgq_Star/article/details/105727892

版权

爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

爬虫是比较有趣的一个东西，绝大多数爬虫都是用Python来写的，的确Python在爬虫、人工智能这些领域有其独特的优势，但是这并不代表其他语言就不可以了。

今天分享一个C#写的简单的爬虫小程序，最基本的只能爬一些图片，真真是最基本。我觉得吧，什么东西只要与吃饭的家伙连在一起也就显得不那么纯粹有趣了（这只是我的个人兴趣而已，记录一下）。

简单爬虫基本就分为几步

（一）HTTP请求与响应

C#有好几个类可以实现，比如

WebClient、WebRequest、WebResponse、HttpWebRequest

（二）网页流的获取

这个主要是获取网页流并读取，有时还会涉及到解压之类的操作

（三）根据分析得到相关资源

可以用正则表达式，也可以用其他抓包工具分析处理得到你想要的链接或文本之类的

（四）下载并进行处理整合

WebClient类的DownloadFile等方法可以下载

然后根据需要进行处理整合，比如是图片就需要保存路径，名称等，比如是小说就需要文本的拼接，排版等等。

以上就是个人的一点看法，其实还有许多细节没有说到，也不是三言两语就能说清楚的。
————————————————————————————
首先建一个控制台应用，添加一个类，进行功能的实现。

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace Test_01
{
    /// <summary>
    /// Minimal image crawler: fetches one HTML page, extracts the <c>src</c> of every
    /// <c>&lt;img&gt;</c> tag and saves each referenced image to disk.
    /// </summary>
    class HttpCrawelHelper
    {
        // Matches an <img ...> tag and captures its src attribute value in the named group "Group".
        // Built once and reused; IgnoreCase because HTML tag and attribute names are case-insensitive.
        private static readonly Regex ImgSrcRegex = new Regex(
            @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<Group>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
            RegexOptions.IgnoreCase);

        #region Image crawling
        /// <summary>
        /// Downloads the page at <paramref name="url"/>, finds every absolute http/https image
        /// URL (SVG files are skipped) and saves each one as <c>&lt;name&gt;.jpg</c> inside
        /// <paramref name="path"/>, incrementing the number after every successful download.
        /// Progress and a final summary are written to the console.
        /// </summary>
        /// <param name="url">Address of the HTML page to scan.</param>
        /// <param name="path">Directory the images are written to; it must already exist.</param>
        /// <param name="name">Number used as the file name of the first saved image.</param>
        /// <exception cref="WebException">The page itself could not be fetched.</exception>
        public static void HttpGetHandle(string url, string path, int name)
        {
            Stopwatch stopwatch = Stopwatch.StartNew();

            HttpWebRequest webRequest = WebRequest.CreateHttp(url);
            webRequest.Method = "GET";
            webRequest.UserAgent = " Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0";

            // Dispose the response and reader deterministically so the connection is released.
            string html;
            using (WebResponse webResponse = webRequest.GetResponse())
            using (StreamReader streamReader = new StreamReader(webResponse.GetResponseStream(), Encoding.UTF8))
            {
                html = streamReader.ReadToEnd();
            }

            if (string.IsNullOrEmpty(html))
            {
                // Nothing to scan: report and stop instead of falling through to the
                // success summary. A helper should not block waiting for a key press.
                Console.WriteLine("————————-错误—————————");
                return;
            }

            int downloaded = 0;
            using (WebClient client = new WebClient())
            {
                foreach (Match imgTag in ImgSrcRegex.Matches(html))
                {
                    // Attribute values are HTML-encoded in the page source (e.g. "&amp;"),
                    // so decode before treating the value as a URL.
                    string src = WebUtility.HtmlDecode(imgTag.Groups["Group"].Value);

                    Uri imageUri;
                    if (!Uri.TryCreate(src, UriKind.Absolute, out imageUri))
                    {
                        continue;
                    }

                    bool isHttp = imageUri.Scheme == Uri.UriSchemeHttp || imageUri.Scheme == Uri.UriSchemeHttps;
                    bool isSvg = src.IndexOf(".svg", StringComparison.OrdinalIgnoreCase) >= 0;
                    if (!isHttp || isSvg)
                    {
                        continue;
                    }

                    // Path.Combine inserts the separator when path lacks a trailing one;
                    // a null path falls back to the current directory.
                    string target = Path.Combine(path ?? string.Empty, name + ".jpg");
                    try
                    {
                        client.DownloadFile(imageUri, target);
                    }
                    catch (WebException ex)
                    {
                        // One broken image must not abort the remaining downloads.
                        Console.WriteLine("-------------" + ex);
                        continue;
                    }

                    // Count and advance the file number only after a successful download.
                    downloaded++;
                    name++;
                    Console.WriteLine("\n正在爬取———————" + "|" + downloaded);
                }
            }

            stopwatch.Stop();
            Console.WriteLine("————-———爬取成功！—————");
            Console.WriteLine("\n_______总共爬取了" + downloaded + "张图片!_______________");
            Console.WriteLine("\n一共耗时" + stopwatch.ElapsedMilliseconds / 1000 + "秒");
        }
        #endregion

        #region Create a folder
        /// <summary>
        /// Ensures the output directory exists (creating it when missing) and prints the
        /// start banner.
        /// </summary>
        /// <param name="path">Directory to create; defaults to <c>D:\Picture\</c>.</param>
        public static void CreatFile(string path = @"D:\Picture\")
        {
            // CreateDirectory is a no-op when the directory already exists.
            Directory.CreateDirectory(path);
            Console.WriteLine("\n————————开始——————————");
        }
        #endregion
    }
}

接下来就可以在主函数里直接调用了。

 /// <summary>
 /// Entry point: prepares the output folder, crawls one Bing image-search results page
 /// and waits for a key press so the console window stays open.
 /// </summary>
 static void Main(string[] args)
        {
            // Folder the images are saved into (single-argument Path.Combine returned this same string).
            const string saveDir = @"D:\Picture\";
            // Bing image-search results page; the query text is URL-encoded.
            const string searchUrl = "https://cn.bing.com/images/search?q=%e6%84%8f%e5%a2%83%e5%9b%be%e7%89%87&qpvt=%e6%84%8f%e5%a2%83%e5%9b%be%e7%89%87&FORM=IGRE";

            HttpCrawelHelper.CreatFile();
            HttpCrawelHelper.HttpGetHandle(searchUrl, saveDir, 1);

            Console.ReadKey();
        }