C#实现网页爬虫

HTTP请求工具类(功能:1、获取网页html;2、下载网络图片;):

using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// HTTP request helper. Provides two operations:
    /// 1. fetch the HTML of a page;
    /// 2. download a remote image into the local "download" folder.
    /// </summary>
    public class HttpRequestUtil
    {
        /// <summary>
        /// User-Agent header sent with every request.
        /// </summary>
        private const string DefaultUserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

        /// <summary>
        /// Fetches a page with an HTTP GET and returns its body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Absolute HTTP or HTTPS URL of the page.</param>
        /// <returns>The response body as text.</returns>
        /// <exception cref="NotSupportedException">The URL is not an HTTP/HTTPS URL.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string GetPageHtml(string url)
        {
            HttpWebRequest request = CreateRequest(url);

            // The request is only sent when GetResponse() is called.
            // Every disposable is wrapped in using so the connection is
            // released even when reading the body throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads an image and saves it under "&lt;startup path&gt;\download"
        /// when it is at least <paramref name="minWidth"/> x <paramref name="minHeight"/> pixels.
        /// Does nothing when a file with the same name already exists.
        /// </summary>
        /// <param name="url">Absolute HTTP or HTTPS URL of the image.</param>
        /// <param name="minWidth">Minimum accepted image width in pixels.</param>
        /// <param name="minHeight">Minimum accepted image height in pixels.</param>
        /// <exception cref="ArgumentException">The URL is empty or contains no file name.</exception>
        /// <exception cref="NotSupportedException">The URL is not an HTTP/HTTPS URL.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static void HttpDownloadFile(string url, int minWidth, int minHeight)
        {
            string fileName = GetFileNameFromUrl(url);
            string directory = Path.Combine(Application.StartupPath, "download");
            // CreateDirectory is a no-op when the directory already exists.
            Directory.CreateDirectory(directory);

            string filePathName = Path.Combine(directory, fileName);
            if (File.Exists(filePathName))
            {
                return;
            }

            HttpWebRequest request = CreateRequest(url);
            request.Proxy = null;

            using (MemoryStream buffer = new MemoryStream())
            {
                // Buffer the whole body in memory so the image can be measured
                // before anything is written to disk.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                {
                    responseStream.CopyTo(buffer);
                }

                // Rewind so decoding starts at the first byte of the image.
                buffer.Position = 0;

                bool largeEnough;
                using (Image image = Image.FromStream(buffer, true))
                {
                    largeEnough = image.Height >= minHeight && image.Width >= minWidth;
                }

                if (!largeEnough)
                {
                    return;
                }

                using (FileStream fileStream = new FileStream(filePathName, FileMode.Create))
                {
                    // WriteTo copies the entire buffer regardless of its current position.
                    buffer.WriteTo(fileStream);
                }
            }
        }

        /// <summary>
        /// Creates an HTTP request for the URL with the shared User-Agent applied.
        /// </summary>
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                // WebRequest.Create returns other request types for non-HTTP schemes.
                throw new NotSupportedException("Only HTTP and HTTPS URLs are supported: " + url);
            }
            request.UserAgent = DefaultUserAgent;
            return request;
        }

        /// <summary>
        /// Derives a local file name from the last path segment of the URL,
        /// ignoring any query string or fragment and replacing characters that
        /// are not allowed in file names with '_'.
        /// </summary>
        private static string GetFileNameFromUrl(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                throw new ArgumentException("The URL must not be empty.", "url");
            }

            int cut = url.IndexOfAny(new char[] { '?', '#' });
            string withoutQuery = cut >= 0 ? url.Substring(0, cut) : url;
            string name = withoutQuery.Substring(withoutQuery.LastIndexOf('/') + 1);

            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                name = name.Replace(invalid, '_');
            }

            if (name.Length == 0)
            {
                throw new ArgumentException("The URL does not contain a file name: " + url, "url");
            }
            return name;
        }
    }
}
View Code

VisitedHelper类:

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// Process-wide set of URLs that have already been visited.
    /// All members are safe to call from multiple threads concurrently.
    /// </summary>
    public class VisitedHelper
    {
        // HashSet gives constant-time membership checks; the default string
        // comparer is ordinal, so matching stays case-sensitive.
        private static readonly HashSet<string> m_VisitedSet = new HashSet<string>();

        // Guards m_VisitedSet, which is not thread-safe on its own.
        private static readonly object m_SyncRoot = new object();

        /// <summary>
        /// Returns whether the URL has been recorded as visited.
        /// </summary>
        /// <param name="url">URL to look up (compared case-sensitively).</param>
        /// <returns><c>true</c> when the URL was previously added; otherwise <c>false</c>.</returns>
        public static bool IsVisited(string url)
        {
            lock (m_SyncRoot)
            {
                return m_VisitedSet.Contains(url);
            }
        }

        /// <summary>
        /// Records the URL as visited. Adding the same URL again has no effect.
        /// </summary>
        /// <param name="url">URL to record.</param>
        public static void Add(string url)
        {
            lock (m_SyncRoot)
            {
                m_VisitedSet.Add(url);
            }
        }
    }
}
View Code

多线程爬取网页代码:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utils;

namespace 爬虫
{
    /// <summary>
    /// Main window of the crawler. Starting from the URL typed by the user it
    /// downloads sufficiently large images and follows links that stay on the
    /// same host, processing each linked page on the thread pool.
    /// </summary>
    public partial class Form1 : Form
    {
        // Minimum image size in pixels; kept at 300 unless the text boxes
        // contain valid integers.
        private static int m_MinWidth = 300;
        private static int m_MinHeight = 300;

        // Number of queued pages that finished without throwing.
        private static int m_CompletedCount = 0;

        // Number of crawl jobs that are queued or running; 0 means the crawl is over.
        private static int m_PendingCount = 0;

        // Makes "check visited, then mark visited" a single atomic step.
        private static readonly object m_VisitedLock = new object();

        // Patterns are built once instead of on every call.
        private static readonly Regex m_RegA = new Regex(@"<a[\s]+[^<>]*href=(?:""|')([^<>""']+)(?:""|')[^<>]*>[^<>]+</a>", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegImg = new Regex(@"<img[\s]+[^<>]*src=(?:""|')([^<>""']+(?:jpg|jpeg|png|gif))(?:""|')[^<>]*>", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegHost = new Regex(@"(?:http|https)://[a-z0-9\-\.:]+", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegWidthAttr = new Regex(@"width=""([\d\.]+)""", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegHeightAttr = new Regex(@"height=""([\d\.]+)""", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegWidthStyle = new Regex(@"width[\s]*:[\s]*([\d\.]+)[\s]*px[\s]*;", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegHeightStyle = new Regex(@"height[\s]*:[\s]*([\d\.]+)[\s]*px[\s]*;", RegexOptions.IgnoreCase);

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts the crawl from the URL in txtUrl on a dedicated thread.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            ThreadPool.SetMaxThreads(100, 100);

            // TryParse writes 0 on failure, so only accept the value when parsing succeeded.
            int parsed;
            if (int.TryParse(txtMinWidth.Text, out parsed))
            {
                m_MinWidth = parsed;
            }
            if (int.TryParse(txtMinHeight.Text, out parsed))
            {
                m_MinHeight = parsed;
            }

            button1.Enabled = false;
            lblMsg.Text = "正在爬取图片…";

            // Controls must only be touched on the UI thread, so read the URL here.
            string startUrl = txtUrl.Text;

            // Count the root job before the timer starts so it never sees 0 prematurely.
            Interlocked.Increment(ref m_PendingCount);
            timer1.Start();
            new Thread(new ThreadStart(delegate()
            {
                try
                {
                    Crawling(startUrl, null);
                }
                catch (Exception ex)
                {
                    // An unhandled exception on this thread would terminate the process.
                    LogError("Crawling " + startUrl, ex);
                }
                finally
                {
                    Interlocked.Decrement(ref m_PendingCount);
                }
            })).Start();
        }

        /// <summary>
        /// Crawls one page: downloads its qualifying images, then queues every
        /// link that stays on <paramref name="host"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the page to crawl.</param>
        /// <param name="host">
        /// Host root (as returned by GetHost) the crawl is restricted to;
        /// null to derive it from <paramref name="url"/>.
        /// </param>
        private void Crawling(string url, string host)
        {
            lock (m_VisitedLock)
            {
                if (VisitedHelper.IsVisited(url))
                {
                    return;
                }
                VisitedHelper.Add(url);
            }

            if (host == null)
            {
                host = GetHost(url);
            }

            Uri pageUri;
            if (!Uri.TryCreate(url, UriKind.Absolute, out pageUri))
            {
                return;
            }

            string pageHtml = HttpRequestUtil.GetPageHtml(url);

            foreach (Match mImg in m_RegImg.Matches(pageHtml))
            {
                try
                {
                    // Both dimensions are read from the whole <img> tag. A dimension
                    // that is not declared comes back as int.MaxValue, which defers
                    // the decision to the real size check in HttpDownloadFile.
                    int imageWidth = GetImageWidthOrHeight(mImg.Value, true);
                    int imageHeight = GetImageWidthOrHeight(mImg.Value, false);
                    if (imageWidth < m_MinWidth || imageHeight < m_MinHeight)
                    {
                        continue;
                    }

                    string imageUrl = ResolveHttpUrl(pageUri, mImg.Groups[1].Value);
                    if (imageUrl != null)
                    {
                        HttpRequestUtil.HttpDownloadFile(imageUrl, m_MinWidth, m_MinHeight);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: one broken image must not stop the page.
                    LogError("Downloading " + mImg.Groups[1].Value, ex);
                }
            }

            // Follow links, each on its own thread-pool work item.
            foreach (Match mA in m_RegA.Matches(pageHtml))
            {
                string nextUrl = ResolveHttpUrl(pageUri, mA.Groups[1].Value);
                if (nextUrl == null)
                {
                    continue;
                }

                // Compare the host of the link target (not of the current page)
                // so the crawl stays on the starting site.
                if (!string.Equals(GetHost(nextUrl), host, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                QueueCrawl(nextUrl, host);
            }
        } // end of Crawling

        /// <summary>
        /// Queues a crawl of <paramref name="nextUrl"/> on the thread pool and
        /// keeps the pending/completed counters in step with it.
        /// </summary>
        private void QueueCrawl(string nextUrl, string host)
        {
            Interlocked.Increment(ref m_PendingCount);
            bool queued = false;
            try
            {
                queued = ThreadPool.QueueUserWorkItem(new WaitCallback(delegate(object obj)
                {
                    try
                    {
                        Crawling(nextUrl, host);
                        Interlocked.Increment(ref m_CompletedCount);
                    }
                    catch (Exception ex)
                    {
                        LogError("Crawling " + nextUrl, ex);
                    }
                    finally
                    {
                        Interlocked.Decrement(ref m_PendingCount);
                    }
                }));
            }
            finally
            {
                // The work item will never run, so undo its reservation.
                if (!queued)
                {
                    Interlocked.Decrement(ref m_PendingCount);
                }
            }
        }

        /// <summary>
        /// Resolves a raw href/src value against the page it was found on.
        /// </summary>
        /// <returns>
        /// The absolute URL without its fragment, or null when the value is
        /// empty, malformed, or not an http/https URL (e.g. "javascript:", "mailto:").
        /// </returns>
        private static string ResolveHttpUrl(Uri pageUri, string rawUrl)
        {
            if (string.IsNullOrWhiteSpace(rawUrl))
            {
                return null;
            }

            // Attribute values are HTML-encoded in the markup (e.g. "&amp;").
            string decoded = System.Net.WebUtility.HtmlDecode(rawUrl.Trim());

            Uri resolved;
            if (!Uri.TryCreate(pageUri, decoded, out resolved))
            {
                return null;
            }
            if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
            {
                return null;
            }

            // Drop "#fragment" so the same page is not treated as several URLs.
            return resolved.GetLeftPart(UriPartial.Query);
        }

        /// <summary>
        /// Returns the scheme-and-authority part of the URL followed by "/".
        /// </summary>
        private string GetHost(string url)
        {
            Uri uri;
            if (Uri.TryCreate(url, UriKind.Absolute, out uri)
                && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps))
            {
                return uri.GetLeftPart(UriPartial.Authority) + "/";
            }

            // Fall back to pattern matching for text that is not a well-formed URL.
            Match mHost = m_RegHost.Match(url);
            return mHost.Value + "/";
        }

        /// <summary>
        /// Timer tick: updates the status label depending on whether any crawl
        /// job is still queued or running.
        /// </summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            // CompareExchange(x, 0, 0) is an atomic read that never changes the value.
            if (Interlocked.CompareExchange(ref m_PendingCount, 0, 0) == 0)
            {
                lblMsg.Text = "已结束";
            }
            else
            {
                lblMsg.Text = "正在爬取图片…";
            }
        }

        /// <summary>
        /// Reads the declared width or height of an image from its tag, looking
        /// first at the width="…"/height="…" attribute and then at a
        /// "width: …px;" / "height: …px;" style declaration.
        /// </summary>
        /// <param name="imageTagString">The complete &lt;img …&gt; tag text.</param>
        /// <param name="isWidth">true to read the width, false to read the height.</param>
        /// <returns>
        /// The declared size truncated to an integer, or int.MaxValue when the
        /// tag declares none or the value cannot be parsed.
        /// </returns>
        private int GetImageWidthOrHeight(string imageTagString, bool isWidth)
        {
            Regex attributeRegex = isWidth ? m_RegWidthAttr : m_RegHeightAttr;
            Regex styleRegex = isWidth ? m_RegWidthStyle : m_RegHeightStyle;

            Match match = attributeRegex.Match(imageTagString);
            if (!match.Success)
            {
                match = styleRegex.Match(imageTagString);
            }

            double value;
            if (match.Success
                && double.TryParse(
                    match.Groups[1].Value,
                    System.Globalization.NumberStyles.Float,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out value)
                && value < int.MaxValue)
            {
                return (int)value;
            }
            return int.MaxValue;
        }

        /// <summary>
        /// Writes a failure to the trace listeners instead of discarding it.
        /// </summary>
        private static void LogError(string context, Exception ex)
        {
            System.Diagnostics.Trace.WriteLine(context + " failed: " + ex.Message);
        }

    } // end of class Form1

    /// <summary>
    /// Parameterless, void-returning delegate intended for accessing controls
    /// across threads. It is not referenced anywhere in the visible source.
    /// </summary>
    public delegate void InvokeDelegate();
}
View Code

截图:

 

转载于:https://www.cnblogs.com/s0611163/p/5170263.html

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值