Getting Started with Web Crawling in C#

C# is a high-level programming language developed by Microsoft for the .NET platform; it is powerful and integrates seamlessly with Windows.

In this post I will show how to use C# to fetch the content of a web page, that is, how to write a simple web crawler.

When crawlers come up, Python is usually the first language people think of, and it is undeniably convenient for the job, but C# on the .NET platform is just as capable!

Before crawling web pages with C#, install the HtmlAgilityPack NuGet package for your project (Project) in Visual Studio (VS): right-click the project, choose "Manage NuGet Packages", and search for HtmlAgilityPack, or run Install-Package HtmlAgilityPack in the Package Manager Console.

Let's start with a helper class (Class), crawlerHelper:

using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;

static class crawlerHelper
    {
        // Directory where downloaded files are saved
        public static string path = "E:\\Download";
        // Issue a GET request and return the response body as a string
        public static string CreateGetHttpResponse(string url)
        {
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            request.Method = "GET";
            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0";
            try
            {
                HttpWebResponse webresponse = request.GetResponse() as HttpWebResponse;
                using (Stream s = webresponse.GetResponseStream())
                {
                    StreamReader reader = new StreamReader(s, Encoding.UTF8);
                    return reader.ReadToEnd();
                }
            }
            catch (Exception)
            {
                return "requestFalse";
            }

        }
        // Issue a POST request with form parameters and return the response body as a string
        public static string CreatePostHttpResponse(string url, IDictionary<string, string> parameters, string ContentType = "application/x-www-form-urlencoded")
        {
            // WebRequest.Create handles both HTTP and HTTPS URLs
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            request.Method = "POST";
            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0";
            request.ContentType = ContentType;
            // Build and send the POST body (application/x-www-form-urlencoded)
            if (parameters != null && parameters.Count > 0)
            {
                StringBuilder buffer = new StringBuilder();
                foreach (string key in parameters.Keys)
                {
                    if (buffer.Length > 0)
                    {
                        buffer.Append('&');
                    }
                    // Percent-encode keys and values so '&', '=' and non-ASCII characters stay intact
                    buffer.AppendFormat("{0}={1}", Uri.EscapeDataString(key), Uri.EscapeDataString(parameters[key]));
                }
                byte[] data = Encoding.UTF8.GetBytes(buffer.ToString());
                using (Stream stream = request.GetRequestStream())
                {
                    stream.Write(data, 0, data.Length);
                }
            }
            try
            {
                HttpWebResponse webresponse = request.GetResponse() as HttpWebResponse;
                using (Stream s = webresponse.GetResponseStream())
                {
                    StreamReader reader = new StreamReader(s, Encoding.UTF8);
                    return reader.ReadToEnd();
                }
            }
            catch (Exception)
            {
                return "requestFalse";
            }
        }
        // Download a binary resource (e.g. an image) from url and save it under path as filename
        public static bool Download(string url, string filename)
        {
            string tempPath = Path.Combine(Path.GetDirectoryName(path), "temp");
            string filepath = Path.Combine(path, filename);
            Directory.CreateDirectory(path);      // make sure the target directory exists
            Directory.CreateDirectory(tempPath);  // create the temporary directory
            string tempFile = Path.Combine(tempPath, filename + ".temp"); // temporary file
            if (File.Exists(tempFile))
            {
                File.Delete(tempFile);    // delete a leftover temp file if present
            }
            FileStream fs = new FileStream(tempFile, FileMode.Create, FileAccess.Write, FileShare.ReadWrite);
            try
            {
                // Set up the request
                HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
                // Nothing is actually sent until GetResponse() is called
                HttpWebResponse response = request.GetResponse() as HttpWebResponse;
                // Stream the response body to the temp file in 1 KB chunks
                Stream responseStream = response.GetResponseStream();
                byte[] bArr = new byte[1024];
                int size = responseStream.Read(bArr, 0, bArr.Length);
                while (size > 0)
                {
                    fs.Write(bArr, 0, size);
                    size = responseStream.Read(bArr, 0, bArr.Length);
                }
                responseStream.Close();
                responseStream.Dispose();
            }
            catch (Exception ex)
            {
                Console.WriteLine("错误:{0}", ex.Message);
                return false;
            }
            finally
            {
                fs.Close();
                fs.Dispose();
            }
            if (File.Exists(filepath))
            {
                File.Delete(filepath);    // replace an existing file with the same name
            }
            File.Move(tempFile, filepath);
            return true;
        }
        ///<summary>
        ///生成随机字符串 
        ///</summary>
        ///<param name="length">目标字符串的长度</param>
        ///<param name="useNum">是否包含数字,1=包含,默认为包含</param>
        ///<param name="useLow">是否包含小写字母,1=包含,默认为包含</param>
        ///<param name="useUpp">是否包含大写字母,1=包含,默认为包含</param>
        ///<param name="useSpe">是否包含特殊字符,1=包含,默认为不包含</param>
        ///<param name="custom">要包含的自定义字符,直接输入要包含的字符列表</param>
        ///<returns>指定长度的随机字符串</returns>
        public static string GetRandomString(int length, bool useNum = true, bool useLow = true, bool useUpp = true, bool useSpe = false, string custom = null)
        {
            byte[] b = new byte[4];
            new System.Security.Cryptography.RNGCryptoServiceProvider().GetBytes(b);
            Random r = new Random(BitConverter.ToInt32(b, 0));
            string s = null, str = custom;
            if (useNum == true) { str += "0123456789"; }
            if (useLow == true) { str += "abcdefghijklmnopqrstuvwxyz"; }
            if (useUpp == true) { str += "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; }
            if (useSpe == true) { str += "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
            for (int i = 0; i < length; i++)
            {
                s += str.Substring(r.Next(0, str.Length - 1), 1);
            }
            return s;
        }

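The class also includes a POST helper and a random-string helper that the demo below does not use. As a minimal sketch of how the POST helper might be called (the URL and form field names here are placeholders, not a real endpoint):

static void PostExample()
        {
            var parameters = new Dictionary<string, string>
            {
                { "keyword", "demo" },   // hypothetical form field
                { "page", "1" }          // hypothetical form field
            };
            string result = crawlerHelper.CreatePostHttpResponse("https://example.com/search", parameters);
            if (result != "requestFalse")   // the helper returns "requestFalse" on failure
            {
                Console.WriteLine(result);
            }
        }
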
Here is a quick demonstration: grabbing a cover image from Tencent Comics (ac.qq.com)!

static void Main(string[] args)
        {
            string url = "https://ac.qq.com/Comic/ComicInfo/id/542330";
            // Fetch the page's HTML source
            string htmlContext = crawlerHelper.CreateGetHttpResponse(url);
            // Create an HtmlAgilityPack document (requires "using HtmlAgilityPack;")
            HtmlDocument doc = new HtmlDocument();
            // Load the HTML into the document
            doc.LoadHtml(htmlContext);
            // Locate the cover image by its XPath and read its src attribute
            HtmlNode img = doc.DocumentNode.SelectSingleNode("/html/body/div[3]/div[3]/div[1]/div[1]/div[1]/a/img");
            string imgUrl = img.Attributes["src"].Value;
            crawlerHelper.Download(imgUrl, "123.jpg");
            //Console.ReadKey();
        }

For this to work, declare the crawlerHelper class in the same source file ahead of the class that contains Main, or put it in a separate class file of its own within the project.
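
As a rough sketch of that layout (the namespace and file name below are arbitrary examples, not something the project requires):

// Program.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using HtmlAgilityPack;

namespace CrawlerDemo
{
    static class crawlerHelper
    {
        // ... the helper methods shown above ...
    }

    class Program
    {
        static void Main(string[] args)
        {
            // ... the demo code shown above ...
        }
    }
}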

Here the image is saved under E:\Download; change the path field in crawlerHelper if you want to download it somewhere else.
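
If you download several images in a row, each call would otherwise overwrite the previous file; one option is to build a unique filename with the GetRandomString helper from the class above, for example (imgUrl refers to the address obtained in the demo):

            // Hypothetical usage: an 8-character random filename for each download
            string filename = crawlerHelper.GetRandomString(8) + ".jpg";
            crawlerHelper.Download(imgUrl, filename);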

However, when crawling an image (or any other resource) from a site, you first need to locate that element within the page's HTML:

Once you have found it, use the image's XPath:
/html/body/div[3]/div[3]/div[1]/div[1]/div[1]/a/img

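The quickest way to get this path is the browser's developer tools (F12, right-click the <img> element, then Copy XPath in Chrome-style tools). If the copied path does not match what HtmlAgilityPack sees, because the HTML the server returns can differ from the DOM the browser renders, a fallback is to list every <img> src in the fetched page and pick out the cover by eye. A small sketch, reusing htmlContext from the demo above:

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(htmlContext);
            // SelectNodes returns null when no <img> elements are found
            HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img");
            if (imgs != null)
            {
                foreach (HtmlNode node in imgs)
                {
                    // GetAttributeValue falls back to "" when the attribute is missing
                    Console.WriteLine(node.GetAttributeValue("src", ""));
                }
            }
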
That is all for this introductory look at crawlers in C#. Comments and corrections are welcome!
