豆瓣相册图片爬虫

FrmMain.cs

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Threading.Tasks;

namespace DouBanDownLoad
{
    /// <summary>
    /// Main window of the album downloader: lets the user pick a target folder and an
    /// album URL, then downloads every listed photo on a background task while
    /// reporting progress in <c>lvstate</c>.
    /// </summary>
    public partial class FrmMain : Form
    {
        /// <summary>
        /// Offset step between consecutive album list pages; page <c>i</c> is requested
        /// with <c>?start=i*18</c> (e.g. .../photos/album/62113919/?start=18).
        /// </summary>
        private const int PageStep = 18;

        /// <summary>The background download task, or null before the first run.</summary>
        private Task t1;

        public FrmMain()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts a download run unless one is already in progress.
        /// </summary>
        private void btnstart_Click(object sender, EventArgs e)
        {
            // IsCompleted is true for RanToCompletion, Faulted and Canceled, so a failed
            // run no longer blocks every later attempt.
            if (t1 == null || t1.IsCompleted)
            {
                t1 = new Task(DoDownLoad);
            }

            if (t1.Status == TaskStatus.Created)
            {
                t1.Start();
            }
        }

        /// <summary>
        /// Worker body executed on the background task: determines the page count,
        /// walks every page and downloads each image with a one second pause between
        /// files. All UI access is marshalled to the UI thread.
        /// </summary>
        private void DoDownLoad()
        {
            string url = null;
            string localpath = null;

            // Snapshot the user input on the UI thread; controls must not be read
            // from the worker thread.
            RunOnUi(delegate
            {
                btnstart.Enabled = false;
                url = txturl.Text;
                localpath = label1.Text;
            });

            try
            {
                if (string.IsNullOrWhiteSpace(url))
                {
                    RunOnUi(delegate { MessageBox.Show(this, "错误,请输入网址信息!"); });
                    return;
                }

                DownLoadHelper dhelper = new DownLoadHelper();
                int pagecount = dhelper.GetThePageCount(url);
                Log(string.Format("======我们有{0}页要下载,开心~======", pagecount));

                for (int i = 0; i < pagecount; i++)
                {
                    Log(string.Format("======我们正在下载页面: {0}======", i + 1));

                    List<ImgInfo> list = FetchPage(url, localpath, i * PageStep);
                    foreach (ImgInfo img in list)
                    {
                        // Pause between files so the server is not hit in a tight loop.
                        System.Threading.Thread.Sleep(1 * 1000);
                        dhelper.DownLoadFile(img.Downloadurl, img.ImgName);
                    }
                }

                Log(string.Format("====== {0} 页面成功下载,开心^_^======", pagecount));
            }
            catch (Exception ex)
            {
                // Top-level boundary of the worker: report the failure instead of
                // letting the task fault unobserved.
                Log(string.Format("======下载出错: {0}======", ex.Message));
            }
            finally
            {
                // Always give the button back, including on the early return and on errors.
                RunOnUi(delegate { btnstart.Enabled = true; });
            }
        }

        /// <summary>
        /// Loads the image list for one album page using the URL and target folder
        /// currently shown in the form.
        /// </summary>
        /// <param name="pagestart">Value of the <c>start</c> query parameter.</param>
        /// <returns>The images found on that page.</returns>
        public List<ImgInfo> GetList(int pagestart)
        {
            return FetchPage(ReadText(txturl), ReadText(label1), pagestart);
        }

        /// <summary>
        /// Builds the page URL (any existing query string is replaced by
        /// <c>?start=N</c>) and asks <see cref="ImgInfo"/> for the images on it.
        /// </summary>
        private static List<ImgInfo> FetchPage(string albumUrl, string localpath, int pagestart)
        {
            ImgInfo img = new ImgInfo();
            img.Localpath = localpath;

            string url = albumUrl;
            int queryIndex = url.IndexOf('?');
            if (queryIndex >= 0)
            {
                url = url.Substring(0, queryIndex);
            }
            url = url + "?start=" + pagestart.ToString();

            return img.GetListByXpath(url, ImgType.Medium);
        }

        /// <summary>Inserts a status line at the top of <c>lvstate</c> on the UI thread.</summary>
        private void Log(string message)
        {
            RunOnUi(delegate { lvstate.Items.Insert(0, message); });
        }

        /// <summary>
        /// Runs <paramref name="action"/> synchronously on the UI thread. Does nothing
        /// when the form has no handle or is disposed (e.g. closed during a download).
        /// </summary>
        private void RunOnUi(MethodInvoker action)
        {
            if (IsDisposed || !IsHandleCreated)
            {
                return;
            }

            try
            {
                Invoke(action);
            }
            catch (ObjectDisposedException)
            {
                // The form was closed between the check and the call; nothing to update.
            }
        }

        /// <summary>Reads a control's text, marshalling to the UI thread when required.</summary>
        private static string ReadText(Control control)
        {
            if (control.InvokeRequired)
            {
                return (string)control.Invoke((Func<string>)delegate { return control.Text; });
            }
            return control.Text;
        }

        /// <summary>
        /// Enables the start button only when a target folder has been chosen and the
        /// URL passes <see cref="ToolKit.CheckUrl"/>, so both handlers apply one rule.
        /// </summary>
        private void UpdateStartEnabled()
        {
            string path = label1.Text;
            bool pathChosen = !string.IsNullOrWhiteSpace(path) && !path.Contains("未选择");
            btnstart.Enabled = pathChosen && ToolKit.CheckUrl(txturl.Text);
        }

        private void label1_TextChanged(object sender, EventArgs e)
        {
            // The label shows the chosen folder; re-evaluate whether a download can start.
            UpdateStartEnabled();
        }

        private void Form1_Load(object sender, EventArgs e)
        {
            btnstart.Enabled = false;
        }

        /// <summary>Lets the user pick the target folder.</summary>
        private void btnpath_Click(object sender, EventArgs e)
        {
            // Only take over the path when the dialog was confirmed; a cancelled
            // dialog must not overwrite the label.
            if (fbpath.ShowDialog() == DialogResult.OK)
            {
                label1.Text = fbpath.SelectedPath;
            }
        }

        private void txturl_TextChanged(object sender, EventArgs e)
        {
            // Re-validate the album URL whenever it changes.
            UpdateStartEnabled();
        }
    }
}

DownLoadHelper.cs

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using HtmlAgilityPack;

namespace DouBanDownLoad
{
    /// <summary>
    /// Network helper: downloads single files and reads the total page count of an
    /// album page.
    /// </summary>
    public class DownLoadHelper
    {
        /// <summary>
        /// Downloads <paramref name="url"/> to <paramref name="localfilename"/>.
        /// This is best-effort: a failure is traced and the method returns normally so
        /// that one bad image does not abort a whole run.
        /// </summary>
        /// <param name="url">Address of the file to download.</param>
        /// <param name="localfilename">Path of the file to write.</param>
        public void DownLoadFile(string url, string localfilename)
        {
            if (string.IsNullOrWhiteSpace(url) || string.IsNullOrWhiteSpace(localfilename))
            {
                System.Diagnostics.Trace.TraceWarning(
                    "Download skipped: url or local file name is empty (url='{0}', file='{1}').",
                    url, localfilename);
                return;
            }

            try
            {
                // A WebClient per call, disposed deterministically.
                using (WebClient wc = new WebClient())
                {
                    wc.DownloadFile(url, localfilename);
                }
            }
            catch (Exception ex)
            {
                // Deliberately broad: the caller relies on this method never throwing.
                // The failure is recorded instead of being silently dropped.
                System.Diagnostics.Trace.TraceWarning(
                    "Download of '{0}' to '{1}' failed: {2}", url, localfilename, ex.Message);
            }
        }

        /// <summary>
        /// Loads <paramref name="url"/> with HtmlWeb and reads the total number of pages
        /// from the <c>data-total-page</c> attribute of <c>span.thispage</c>.
        /// </summary>
        /// <param name="url">Address of the album page.</param>
        /// <returns>
        /// The page count, or 0 when the element or attribute is missing or the value is
        /// not a non-negative integer that fits in an <see cref="int"/>.
        /// </returns>
        public int GetThePageCount(string url)
        {
            HtmlWeb web = new HtmlWeb();
            var doc = web.Load(url);
            var pager = doc.DocumentNode.SelectSingleNode("//span[@class='thispage']");

            // NOTE(review): a page without the pager element yields 0, so nothing is
            // downloaded for it; confirm whether single-page albums should count as 1.
            if (pager == null)
            {
                return 0;
            }

            var totalAttribute = pager.Attributes["data-total-page"];
            if (totalAttribute == null)
            {
                return 0;
            }

            // TryParse with NumberStyles.None accepts plain ASCII digits only and
            // reports overflow as failure instead of throwing.
            int pages;
            string raw = (totalAttribute.Value ?? string.Empty).Trim();
            bool parsed = int.TryParse(
                raw,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out pages);

            return parsed ? pages : 0;
        }
    }
}

ImgInfo.cs

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using HtmlAgilityPack;

namespace DouBanDownLoad
{
    /// <summary>
    /// Describes one image to download (source URL and local target path) and provides
    /// the page scrapers that build lists of such images.
    /// </summary>
    public class ImgInfo
    {
        // Matches the first img src inside each photo_wrap div of a list page.
        private static readonly Regex PhotoWrapImage =
            new Regex(@"(?is)<div class=""photo_wrap"">\s*.*?<img src=""(?<imgurl>.*?)"">");

        // Extracts the file name that follows "public/" in an image URL.
        private static readonly Regex PublicFileName = new Regex(@"public/(.*?jpg)");

        /// <summary>Address the image is downloaded from.</summary>
        public string Downloadurl { get; set; }

        /// <summary>
        /// Absolute local path (folder plus file name) the image is saved to.
        /// </summary>
        public string ImgName { get; set; }

        /// <summary>Target folder used when building <see cref="ImgName"/>.</summary>
        public string Localpath { get; set; }

        /// <summary>
        /// Loads a list page and extracts its images with a regular expression.
        /// </summary>
        /// <param name="urlinfo">Address of the list page.</param>
        /// <returns>One entry per matched image; empty when nothing matches.</returns>
        /// <exception cref="ArgumentException"><paramref name="urlinfo"/> is null or empty.</exception>
        public List<ImgInfo> GetListByContent(string urlinfo)
        {
            if (string.IsNullOrEmpty(urlinfo))
            {
                throw new ArgumentException("请输入网址信息!!!");
            }

            HtmlWeb web = new HtmlWeb();
            string html = web.Load(urlinfo).DocumentNode.InnerHtml;

            return PhotoWrapImage.Matches(html)
                .OfType<Match>()
                .Select(m => m.Groups["imgurl"].Value)
                .Select(src => new ImgInfo { ImgName = DoPath(src), Downloadurl = src })
                .ToList();
        }

        /// <summary>
        /// Loads a list page and extracts its images with XPath
        /// (<c>//a[@class='photolst_photo']/img</c>).
        /// </summary>
        /// <param name="url">Address of the list page.</param>
        /// <param name="imgtype">
        /// For <see cref="ImgType.Medium"/> the "thumb" part of each download URL is
        /// replaced by "photo"; other values leave the URL as found on the page.
        /// </param>
        /// <returns>One entry per image; empty when the page contains none.</returns>
        public List<ImgInfo> GetListByXpath(string url, ImgType imgtype)
        {
            HtmlWeb web = new HtmlWeb();
            var nodes = web.Load(url)
                .DocumentNode
                .SelectNodes("//a[@class='photolst_photo']/img");

            // SelectNodes yields null rather than an empty collection when nothing matches.
            if (nodes == null)
            {
                return new List<ImgInfo>();
            }

            List<ImgInfo> list = nodes
                .Select(node => node.Attributes["src"])
                .Where(src => src != null && !string.IsNullOrEmpty(src.Value))
                .Select(src => new ImgInfo { ImgName = DoPath(src.Value), Downloadurl = src.Value })
                .ToList();

            if (imgtype == ImgType.Medium)
            {
                foreach (ImgInfo item in list)
                {
                    // Replace is a no-op when "thumb" is absent, so no Contains check is needed.
                    item.Downloadurl = item.Downloadurl.Replace("thumb", "photo");
                }
            }

            return list;
        }

        /// <summary>
        /// Builds the local target path for an image URL: <see cref="Localpath"/>, a
        /// backslash, and the file name taken from the URL.
        /// </summary>
        private string DoPath(string nameinfo)
        {
            string fileName = PublicFileName.Match(nameinfo).Groups[1].Value;

            if (fileName.Length == 0)
            {
                // Fallback for URLs without a "public/....jpg" part: use the last path
                // segment so the result is a file path rather than just the folder.
                string path = nameinfo;
                int queryIndex = path.IndexOf('?');
                if (queryIndex >= 0)
                {
                    path = path.Substring(0, queryIndex);
                }
                fileName = path.Substring(path.LastIndexOf('/') + 1);
            }

            return Localpath + "\\" + fileName;
        }
    }
}

ToolKit.cs

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace DouBanDownLoad
{
    /// <summary>
    /// Small validation helpers for user input and scraped values.
    /// </summary>
    public static class ToolKit
    {
        /// <summary>
        /// Checks whether the text is a Douban album URL of the form
        /// <c>http(s)://www.douban.com/photos/album/&lt;digits&gt;/</c>.
        /// </summary>
        /// <param name="txt">The text to check; null is treated as not matching.</param>
        /// <returns>true when the text matches the album URL pattern.</returns>
        public static bool CheckUrl(string txt)
        {
            if (txt == null)
            {
                return false;
            }

            // Both http and https album links are accepted.
            return Regex.IsMatch(txt, "^https?://www[.]douban[.]com/photos/album/[0-9]+/$");
        }

        /// <summary>
        /// Checks whether the text consists only of ASCII digits.
        /// </summary>
        /// <param name="txt">The text to check; null is treated as not matching.</param>
        /// <returns>true when the text is one or more characters from 0-9.</returns>
        public static bool CheckIsNumber(string txt)
        {
            if (txt == null)
            {
                return false;
            }

            // [0-9] instead of \d: \d also matches non-ASCII Unicode digits, which
            // integer parsing would then reject.
            return Regex.IsMatch(txt, "^[0-9]+$");
        }
    }

    /// <summary>
    /// Image size variant passed to <c>ImgInfo.GetListByXpath</c>.
    /// </summary>
    public enum ImgType
    {
        // Not referenced by the code visible in this file.
        Big = 1,
        // GetListByXpath rewrites "thumb" to "photo" in each download URL for this value.
        Medium = 2,
        // Not referenced by the code visible in this file; presumably the thumbnail
        // URL as found on the page - TODO confirm.
        Small = 3
    }
}

运行结果如图:

这里写图片描述


这里写图片描述


这里写图片描述


这里写图片描述

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值