利用HtmlAgilityPack抓取网站图片并下载

利用HtmlAgilityPack抓取网站图片并下载~~~~~~邪恶完善版

 

今日看博客园时发现一个不错的抓取帖(主要是那个 url……你懂的),花了几分钟改了一下:代码增加了按年月日建立目录、按文章建立子目录的功能,图片都保存在对应目录内;程序以命令行方式运行,并增加了全站抓取的参数。

原始版本:

利用HtmlAgilityPack抓取XX网站图片并下载~~~~~~邪恶版。。。。

 

老版本代码:(记住:运行前需在 E 盘下新建一个 DownLoadImg 文件夹)

主要代码如下:
// Web client instance held by the page; its use is not visible in this excerpt
// (presumably consumed by the DownLoadImg helper — TODO confirm).
WebClient wc = new WebClient();
    // Static counter initialised to 0; not referenced anywhere in the visible excerpt
    // (presumably used to number downloaded files — TODO confirm).
    private static int i = 0;
    /// <summary>
    /// Page load handler. Performs no work; the crawl is started from <c>Button1_Click</c>.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        // Nothing to initialise on load.
    }
    /// <summary>
    /// Crawls a range of article pages and hands every image found inside the element
    /// with id <c>ks_xp</c> to <c>DownLoadImg</c>.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Button1_Click(object sender, EventArgs e)
    {
        HtmlWeb web = new HtmlWeb();

        // Target sites typically use URLs shaped like .../section/yyyymmdd/<digits>.html.
        // A site is usually split into several sections, so the .html ids inside one
        // section are not contiguous.
        // Two nested loops are used: the outer one walks the date (yyyymmdd, e.g. 20120215),
        // the inner one walks the page ids. Adjust both ranges as needed.
        for (int k = 20120215; k <= 20120215; k++)
        {
            for (int j = 124289; j <= 124306; j++)
            {
                // Fill in the target site's URL format here (it is in the downloadable source).
                string cnblogs = "";
                HtmlDocument doc = web.Load(cnblogs);

                HtmlNode node = doc.GetElementbyId("ks_xp");
                if (node == null)
                {
                    continue;
                }

                // ".//img" is relative to the container; a bare "//img" is an absolute XPath
                // and would match every image in the whole document.
                // SelectNodes returns null (not an empty collection) when nothing matches.
                var images = node.SelectNodes(".//img");
                if (images == null)
                {
                    continue;
                }

                foreach (HtmlNode child in images)
                {
                    if (child.Attributes["src"] == null)
                    {
                        continue;
                    }

                    DownLoadImg(child.Attributes["src"].Value);
                }
            }
        }
    }

 

新版本代码:

 

复制代码
#region Using namespace

using System;
using System.IO;
using System.Linq;
using System.Net;
using HtmlAgilityPack;

#endregion

namespace DownloadImages
{
    /// <summary>
    /// Console crawler: walks article pages by calendar day and page id, and saves every image
    /// found inside each article's <c>ks_xp</c> element to
    /// <c>Images\{yyyyMMdd}\{article title}\</c> below the current directory.
    /// </summary>
    internal class Program
    {
        /// <summary>Name of the download root folder, created under the current directory.</summary>
        private const string ImagesFolderName = "Images";

        private static readonly WebClient Wc = new WebClient();

        /// <summary>
        /// Characters stripped from file and folder names: the control characters U+0000..U+001F
        /// plus <c>" &lt; &gt; | : * ? \ /</c>. Kept as an explicit list so the result does not
        /// depend on the platform the program runs on.
        /// </summary>
        private static readonly char[] InvalidFileNameChars =
            Enumerable.Range(0, 32)
                      .Select(code => (char) code)
                      .Concat(new[] {'"', '<', '>', '|', ':', '*', '?', '\\', '/'})
                      .ToArray();

        /// <summary>
        /// Removes every character listed in <see cref="InvalidFileNameChars"/> from
        /// <paramref name="fileName"/>. If the result is longer than one character and starts
        /// with a dot, the leading dots are replaced by the literal prefix <c>dot</c>.
        /// </summary>
        /// <param name="fileName">Raw name; <c>null</c> is treated as an empty string.</param>
        /// <returns>The cleaned name (possibly empty).</returns>
        public static string CleanInvalidFileName(string fileName)
        {
            fileName = fileName ?? "";
            fileName = new string(fileName.Where(c => Array.IndexOf(InvalidFileNameChars, c) < 0).ToArray());

            if (fileName.Length > 1 && fileName[0] == '.')
            {
                fileName = "dot" + fileName.TrimStart('.');
            }

            return fileName;
        }

        private static void Main(string[] args)
        {
            Start();
        }

        /// <summary>
        /// Crawls every page id for every calendar day from 2010-08-18 through today,
        /// then removes folders that ended up without any file.
        /// </summary>
        private static void Start()
        {
            var web = new HtmlWeb();
            var startDate = new DateTime(2010, 8, 18);
            var endDate = DateTime.Now.Date;
            const int startPageId = 49430;
            const int endPageId = 124621;

            // Step through real calendar days. Counting the yyyyMMdd value up as an integer
            // would also visit non-existent dates such as 20100832..20100900.
            for (var day = startDate; day <= endDate; day = day.AddDays(1))
            {
                var dateSegment = day.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);
                for (int pageId = startPageId; pageId <= endPageId; pageId++)
                {
                    // Host elided by the author; the real address is in the downloadable source.
                    string cnblogs = "http://xxxxxxxx/" + dateSegment + "/" + pageId + ".html";
                    DownloadPageImages(web, cnblogs, dateSegment, pageId.ToString());
                }
            }

            // Clean up afterwards.
            CleanEmptyFolders();
        }

        /// <summary>
        /// Loads one article page and downloads every image inside its <c>ks_xp</c> element.
        /// </summary>
        /// <param name="web">Loader used to fetch the page.</param>
        /// <param name="pageUrl">Address of the article page.</param>
        /// <param name="dateFolder">First-level folder name (the yyyyMMdd date).</param>
        /// <param name="fallbackTitle">Folder name used when the page has no usable title.</param>
        private static void DownloadPageImages(HtmlWeb web, string pageUrl, string dateFolder, string fallbackTitle)
        {
            HtmlDocument doc = web.Load(pageUrl);

            var titles = doc.DocumentNode.SelectNodes("//title");
            var titleName = fallbackTitle;
            if (titles != null && titles.Count > 0)
            {
                titleName = titles[0].InnerText;
            }

            HtmlNode node = doc.GetElementbyId("ks_xp");
            if (node == null)
            {
                return;
            }

            // ".//img" is relative to the container; a bare "//img" is an absolute XPath and
            // would match every image in the document.
            // SelectNodes returns null (not an empty collection) when nothing matches.
            var images = node.SelectNodes(".//img");
            if (images == null)
            {
                return;
            }

            // A title made up only of stripped characters would otherwise collapse the
            // article folder into the date folder.
            var subFolder = CleanInvalidFileName(titleName);
            if (subFolder.Length == 0)
            {
                subFolder = fallbackTitle;
            }

            foreach (HtmlNode child in images)
            {
                if (child.Attributes["src"] == null)
                {
                    continue;
                }

                string imgurl = child.Attributes["src"].Value;
                if (string.IsNullOrEmpty(imgurl))
                {
                    continue;
                }

                DownLoadImg(imgurl, dateFolder, subFolder);
                Console.WriteLine("正在下载:" + titleName + " " + imgurl);
            }
        }

        /// <summary>
        /// Deletes every folder below the download root that contains no files at any depth.
        /// Does nothing when the download root does not exist.
        /// </summary>
        private static void CleanEmptyFolders()
        {
            var rootFolder = Path.Combine(Environment.CurrentDirectory, ImagesFolderName);
            if (!Directory.Exists(rootFolder))
            {
                return;
            }

            var folders = Directory.GetDirectories(rootFolder, "*.*", SearchOption.AllDirectories);
            foreach (var f in folders)
            {
                // A parent deleted earlier in this loop takes its subfolders with it.
                if (!Directory.Exists(f))
                {
                    continue;
                }

                if (Directory.GetFiles(f, "*.*", SearchOption.AllDirectories).Length == 0)
                {
                    // Recursive: the folder may still hold (empty) subfolders, which would
                    // make a non-recursive delete throw.
                    Directory.Delete(f, true);
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> into
        /// <c>Images\{folderName}\{subFolderName}\</c> under the current directory, naming the
        /// file after the cleaned last segment of the URL. Failures are written to the console
        /// and do not stop the crawl.
        /// </summary>
        /// <param name="url">Address of the image.</param>
        /// <param name="folderName">First-level folder name.</param>
        /// <param name="subFolderName">Second-level folder name; expected to be already cleaned.</param>
        private static void DownLoadImg(string url, string folderName, string subFolderName)
        {
            var fileName = CleanInvalidFileName(url.Substring(url.LastIndexOf('/') + 1));
            var rootFolder = Path.Combine(Environment.CurrentDirectory, ImagesFolderName);
            var fileFolder = Path.Combine(Path.Combine(rootFolder, folderName), subFolderName);

            try
            {
                // CreateDirectory is a no-op when the folder already exists.
                Directory.CreateDirectory(fileFolder);
                Wc.DownloadFile(url, Path.Combine(fileFolder, fileName));
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and move on to the next image.
                Console.WriteLine(ex.Message);
            }
        }
    }
}
 

测试程序和源码下载:

/Files/Chinasf/DownloadImages.rar

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值