C#实现网页爬虫

最新推荐文章于 2023-05-06 14:21:23 发布

weixin_34376562

最新推荐文章于 2023-05-06 14:21:23 发布

阅读量369

点赞数

文章标签：爬虫 javascript ViewUI

原文链接：http://www.cnblogs.com/s0611163/p/5170263.html

版权

HTTP请求工具类(功能：1、获取网页html；2、下载网络图片；)：

using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// HTTP helper with two features: fetching a page's HTML and downloading
    /// an image to the local "download" folder when it meets a minimum size.
    /// </summary>
    public class HttpRequestUtil
    {
        /// <summary>User-Agent header sent with every request.</summary>
        private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> with an HTTP GET and
        /// returns its body decoded as UTF-8 text.
        /// </summary>
        /// <param name="url">Absolute http/https URL of the page.</param>
        /// <returns>The response body as a string.</returns>
        /// <exception cref="NotSupportedException">The URL is not an http/https URL.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string GetPageHtml(string url)
        {
            HttpWebRequest request = CreateRequest(url);

            // The GET request is only sent once GetResponse() is called.
            // All three objects are disposed even if reading fails, so the
            // underlying connection is always returned to the pool.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads the image at <paramref name="url"/> into the "download"
        /// folder under the application start-up path, but only if its real
        /// pixel size is at least <paramref name="minWidth"/> x <paramref name="minHeight"/>.
        /// Does nothing when a file with the same name already exists or when
        /// the URL has no file name part.
        /// </summary>
        /// <param name="url">Absolute http/https URL of the image.</param>
        /// <param name="minWidth">Minimum accepted image width in pixels.</param>
        /// <param name="minHeight">Minimum accepted image height in pixels.</param>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="NotSupportedException">The URL is not an http/https URL.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        /// <exception cref="ArgumentException">The downloaded data is not a valid image.</exception>
        public static void HttpDownloadFile(string url, int minWidth, int minHeight)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            string fileName = GetFileNameFromUrl(url);
            if (fileName.Length == 0)
            {
                // URL ends with '/', so there is nothing to name the file after.
                return;
            }

            string path = Path.Combine(Application.StartupPath, "download");
            Directory.CreateDirectory(path); // no-op when the folder already exists
            string filePathName = Path.Combine(path, fileName);
            if (File.Exists(filePathName))
            {
                return;
            }

            HttpWebRequest request = CreateRequest(url);
            request.Proxy = null; // skip proxy auto-detection for image downloads

            using (MemoryStream memoryStream = new MemoryStream())
            {
                // Buffer the whole body first: the image must be decoded to
                // check its size before deciding whether to keep it.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                {
                    responseStream.CopyTo(memoryStream);
                }

                // Rewind so decoding starts at the first byte regardless of how
                // the decoder treats the stream's current position.
                memoryStream.Seek(0, SeekOrigin.Begin);

                bool largeEnough;
                // The stream must stay open while the Image is alive, so the
                // image is disposed before the enclosing MemoryStream.
                using (Image tempImage = Image.FromStream(memoryStream, true))
                {
                    largeEnough = tempImage.Height >= minHeight && tempImage.Width >= minWidth;
                }

                if (largeEnough)
                {
                    using (FileStream fs = new FileStream(filePathName, FileMode.Create))
                    {
                        // WriteTo copies the full buffer independent of Position.
                        memoryStream.WriteTo(fs);
                    }
                }
            }
        }

        /// <summary>
        /// Creates an HTTP request for <paramref name="url"/> with the shared User-Agent.
        /// </summary>
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                // WebRequest.Create returns other request types for ftp:, file: etc.
                throw new NotSupportedException("Only http/https URLs are supported: " + url);
            }
            request.UserAgent = UserAgent;
            return request;
        }

        /// <summary>
        /// Extracts the last path segment of <paramref name="url"/> (without query
        /// string or fragment) and replaces characters that are not allowed in
        /// file names with '_'. Returns an empty string when there is no segment.
        /// </summary>
        private static string GetFileNameFromUrl(string url)
        {
            Uri uri;
            string pathPart = Uri.TryCreate(url, UriKind.Absolute, out uri) ? uri.AbsolutePath : url;
            string fileName = pathPart.Substring(pathPart.LastIndexOf('/') + 1);
            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                fileName = fileName.Replace(invalid, '_');
            }
            return fileName;
        }
    }
}

View Code

VisitedHelper类：

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// Process-wide, thread-safe registry of URLs that have already been visited.
    /// </summary>
    public class VisitedHelper
    {
        /// <summary>Guards every access to <see cref="m_VisitedList"/>.</summary>
        private static readonly object m_SyncRoot = new object();

        /// <summary>
        /// Visited URLs. A hash set gives constant-time lookups and makes
        /// repeated <see cref="Add"/> calls for the same URL harmless.
        /// URLs are compared ordinally (case-sensitive), as before.
        /// </summary>
        private static readonly HashSet<string> m_VisitedList = new HashSet<string>(StringComparer.Ordinal);

        /// <summary>
        /// Reports whether <paramref name="url"/> has been recorded via <see cref="Add"/>.
        /// </summary>
        /// <param name="url">The URL to look up.</param>
        /// <returns>true if the URL was added before; otherwise false.</returns>
        public static bool IsVisited(string url)
        {
            lock (m_SyncRoot)
            {
                return m_VisitedList.Contains(url);
            }
        }

        /// <summary>
        /// Records <paramref name="url"/> as visited. Adding the same URL again has no effect.
        /// </summary>
        /// <param name="url">The URL to record.</param>
        public static void Add(string url)
        {
            lock (m_SyncRoot)
            {
                m_VisitedList.Add(url);
            }
        }
    }
}

View Code

多线程爬取网页代码：

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utils;

namespace 爬虫
{
    /// <summary>
    /// Main window of the image crawler. Starting from the URL entered by the
    /// user it downloads every sufficiently large image it finds and follows
    /// links that stay on the same host, using thread-pool workers.
    /// </summary>
    public partial class Form1 : Form
    {
        private static int m_MinWidth = 300;
        private static int m_MinHeight = 300;

        /// <summary>Number of crawl tasks that have finished (successfully or not).</summary>
        private static int m_CompletedCount = 0;

        /// <summary>Number of crawl tasks that are queued or currently running.</summary>
        private static int m_PendingCount = 0;

        /// <summary>Makes the "already visited?" check and the "mark visited" step one atomic operation.</summary>
        private static readonly object m_VisitedLock = new object();

        // Regex instances are immutable and safe to share between threads,
        // so they are built once instead of once per crawled page.
        private static readonly Regex m_RegA = new Regex(@"<a[\s]+[^<>]*href=(?:""|')([^<>""']+)(?:""|')[^<>]*>[^<>]+</a>", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegImg = new Regex(@"<img[\s]+[^<>]*src=(?:""|')([^<>""']+(?:jpg|jpeg|png|gif))(?:""|')[^<>]*>", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegHost = new Regex(@"(?:http|https)://[a-z0-9\-\.:]+", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegWidthAttr = BuildAttributeRegex("width");
        private static readonly Regex m_RegHeightAttr = BuildAttributeRegex("height");
        private static readonly Regex m_RegWidthCss = BuildCssRegex("width");
        private static readonly Regex m_RegHeightCss = BuildCssRegex("height");

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts crawling from the URL in the text box on a background thread.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            ThreadPool.SetMaxThreads(100, 100);

            // Keep the current minimum when the text box does not contain a
            // number; writing TryParse's out value directly would reset it to 0.
            int parsed;
            if (int.TryParse(txtMinWidth.Text, out parsed))
            {
                m_MinWidth = parsed;
            }
            if (int.TryParse(txtMinHeight.Text, out parsed))
            {
                m_MinHeight = parsed;
            }

            button1.Enabled = false;
            lblMsg.Text = "正在爬取图片…";

            // Controls may only be touched from the UI thread, so the URL is
            // read here and handed to the worker as a plain string.
            string startUrl = txtUrl.Text;

            // Count the root task before the timer starts so the timer can
            // never observe "nothing pending" while the crawl is starting up.
            Interlocked.Increment(ref m_PendingCount);
            timer1.Start();
            new Thread(new ThreadStart(delegate()
            {
                try
                {
                    Crawling(startUrl, null);
                }
                catch
                {
                    // Best effort: an unreachable or invalid start URL must not
                    // take the whole application down with an unhandled exception.
                }
                finally
                {
                    // Order matters: mark completion before releasing the pending slot.
                    Interlocked.Increment(ref m_CompletedCount);
                    Interlocked.Decrement(ref m_PendingCount);
                }
            })).Start();
        }

        /// <summary>
        /// Crawls one page: downloads its qualifying images and queues every
        /// same-host link for crawling. A URL that was already visited is skipped.
        /// </summary>
        /// <param name="url">Absolute URL of the page to crawl.</param>
        /// <param name="host">
        /// Host prefix (as returned by <see cref="GetHost"/>) the crawl is
        /// restricted to, or null to derive it from <paramref name="url"/>.
        /// </param>
        private void Crawling(string url, string host)
        {
            // Check-and-mark under one lock so two workers cannot both decide
            // to crawl the same URL.
            lock (m_VisitedLock)
            {
                if (VisitedHelper.IsVisited(url))
                {
                    return;
                }
                VisitedHelper.Add(url);
            }

            Uri pageUri;
            bool isAbsolute = Uri.TryCreate(url, UriKind.Absolute, out pageUri);

            if (host == null)
            {
                // Derive the host from the normalized form so it compares equal
                // to the normalized URLs produced for the links below.
                host = GetHost(isAbsolute ? pageUri.AbsoluteUri : url);
            }

            string pageHtml = HttpRequestUtil.GetPageHtml(url);
            if (!isAbsolute)
            {
                return; // cannot resolve relative links without a base URL
            }

            foreach (Match mImg in m_RegImg.Matches(pageHtml))
            {
                try
                {
                    // Both dimensions are read from the whole <img ...> tag.
                    int imageWidth = GetImageWidthOrHeight(mImg.Value, true);
                    int imageHeight = GetImageWidthOrHeight(mImg.Value, false);
                    if (imageWidth < m_MinWidth || imageHeight < m_MinHeight)
                    {
                        continue;
                    }

                    string imageUrl = ResolveHttpUrl(pageUri, mImg.Groups[1].Value);
                    if (imageUrl != null)
                    {
                        HttpRequestUtil.HttpDownloadFile(imageUrl, m_MinWidth, m_MinHeight);
                    }
                }
                catch
                {
                    // Best effort: one broken image must not stop the page.
                }
            }

            // Follow links, one thread-pool task per link.
            foreach (Match mA in m_RegA.Matches(pageHtml))
            {
                try
                {
                    string nextUrl = ResolveHttpUrl(pageUri, mA.Groups[1].Value);
                    if (nextUrl == null)
                    {
                        continue;
                    }

                    // Stay on the start host: compare the LINK's host, not the
                    // host of the page that contains the link.
                    if (!string.Equals(GetHost(nextUrl), host, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    QueueCrawl(nextUrl, host);
                }
                catch
                {
                    // Best effort: skip links that cannot be queued.
                }
            }
        } //end Crawling

        /// <summary>
        /// Queues a crawl of <paramref name="nextUrl"/> on the thread pool and
        /// keeps the pending/completed counters in step with it.
        /// </summary>
        private void QueueCrawl(string nextUrl, string host)
        {
            Interlocked.Increment(ref m_PendingCount);
            bool queued = false;
            try
            {
                queued = ThreadPool.QueueUserWorkItem(new WaitCallback(delegate(object obj)
                {
                    try
                    {
                        Crawling(nextUrl, host);
                    }
                    catch
                    {
                        // Best effort: a page that fails to load is simply skipped.
                    }
                    finally
                    {
                        Interlocked.Increment(ref m_CompletedCount);
                        Interlocked.Decrement(ref m_PendingCount);
                    }
                }));
            }
            finally
            {
                if (!queued)
                {
                    // The work item will never run, so release its slot here.
                    Interlocked.Decrement(ref m_PendingCount);
                }
            }
        }

        /// <summary>
        /// Resolves a URL taken from an HTML attribute against the page it was
        /// found on. Returns the absolute URL without its fragment, or null when
        /// the value cannot be resolved or is not an http/https URL
        /// (for example javascript: or mailto: links).
        /// </summary>
        private static string ResolveHttpUrl(Uri pageUri, string rawUrl)
        {
            if (string.IsNullOrEmpty(rawUrl))
            {
                return null;
            }

            // Attribute values are HTML-encoded ("&amp;" etc.); decode before use.
            string decoded = System.Net.WebUtility.HtmlDecode(rawUrl).Trim();

            Uri resolved;
            if (!Uri.TryCreate(pageUri, decoded, out resolved))
            {
                return null;
            }
            if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
            {
                return null;
            }

            // Drop "#fragment" so the same page is not visited once per anchor.
            return resolved.GetLeftPart(UriPartial.Query);
        }

        /// <summary>
        /// Returns the scheme-and-authority prefix of <paramref name="url"/>
        /// followed by "/", e.g. "http://example.com/". When the URL does not
        /// start with http:// or https:// the result is just "/".
        /// </summary>
        private string GetHost(string url)
        {
            Match mHost = m_RegHost.Match(url);
            return mHost.Value + "/";
        }

        /// <summary>
        /// Timer callback: shows whether the crawl is still running.
        /// The crawl is finished once at least one task has completed and no
        /// task is queued or running any more.
        /// </summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            // CompareExchange(x, 0, 0) is used as an atomic read of the counters.
            int pending = Interlocked.CompareExchange(ref m_PendingCount, 0, 0);
            int completed = Interlocked.CompareExchange(ref m_CompletedCount, 0, 0);
            if (pending == 0 && completed > 0)
            {
                lblMsg.Text = "已结束";
            }
            else
            {
                lblMsg.Text = "正在爬取图片…";
            }
        }

        /// <summary>
        /// Reads the declared width or height of an image from its HTML tag.
        /// Looks for a <c>width="N"</c>/<c>height="N"</c> attribute first and
        /// for an inline-style <c>width: Npx;</c>/<c>height: Npx;</c> second.
        /// </summary>
        /// <param name="imageTagString">The complete &lt;img ...&gt; tag text.</param>
        /// <param name="isWidth">true to read the width, false to read the height.</param>
        /// <returns>
        /// The declared size truncated to an integer, or <see cref="int.MaxValue"/>
        /// when the tag declares no usable size (so the image is not filtered
        /// out before its real size has been checked after download).
        /// </returns>
        private int GetImageWidthOrHeight(string imageTagString, bool isWidth)
        {
            int value;
            Regex attributeRegex = isWidth ? m_RegWidthAttr : m_RegHeightAttr;
            if (TryReadDimension(attributeRegex.Match(imageTagString), out value))
            {
                return value;
            }

            Regex cssRegex = isWidth ? m_RegWidthCss : m_RegHeightCss;
            if (TryReadDimension(cssRegex.Match(imageTagString), out value))
            {
                return value;
            }

            return int.MaxValue;
        }

        /// <summary>
        /// Converts the first capture group of <paramref name="match"/> to an
        /// integer size. Returns false when there is no match or the text is not a number.
        /// </summary>
        private static bool TryReadDimension(Match match, out int value)
        {
            value = 0;
            if (!match.Success)
            {
                return false;
            }

            // HTML numbers always use '.', independent of the machine's culture.
            double parsed;
            if (!double.TryParse(match.Groups[1].Value,
                                 System.Globalization.NumberStyles.Float,
                                 System.Globalization.CultureInfo.InvariantCulture,
                                 out parsed))
            {
                return false;
            }

            // Clamp instead of letting the double-to-int cast overflow.
            value = parsed >= int.MaxValue ? int.MaxValue : (int)parsed;
            return true;
        }

        /// <summary>Builds the regex for a <c>tag="number"</c> HTML attribute.</summary>
        private static Regex BuildAttributeRegex(string tag)
        {
            return new Regex(string.Format(@"{0}=""([\d\.]+)""", tag), RegexOptions.IgnoreCase);
        }

        /// <summary>Builds the regex for a <c>tag: number px;</c> inline-style declaration.</summary>
        private static Regex BuildCssRegex(string tag)
        {
            return new Regex(string.Format(@"{0}[\s]*:[\s]*([\d\.]+)[\s]*px[\s]*;", tag), RegexOptions.IgnoreCase);
        }

    } //end Form1

    /// <summary>
    /// Parameterless delegate intended for marshalling calls onto the UI
    /// thread when a control has to be accessed from another thread.
    /// It is not referenced anywhere in the code visible in this listing.
    /// </summary>
    public delegate void InvokeDelegate();
}

View Code

截图：

转载于:https://www.cnblogs.com/s0611163/p/5170263.html

weixin_34376562

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
C#实现网页爬虫

HTTP请求工具类(功能：1、获取网页html；2、下载网络图片；)：using System;using System.Collections.Generic;using System.Drawing;using System.IO;using System.Linq;using System.Net;using System.Text;using Sys...
复制链接

扫一扫