利用HtmlAgilityPack抓取网站图片并下载~~~~~~邪恶完善版

  今天在博客园看到一篇不错的抓取帖(主要是那个 URL……你懂的),花了几分钟改了一下:代码增加了按年月日建立目录、按文章建立子目录的功能,图片都保存在对应的子目录内;程序以命令行方式运行,并增加了全站的参数。

原始版本:

利用HtmlAgilityPack抓取XX网站图片并下载~~~~~~邪恶版。。。。

新版本代码:

复制代码
#region Using namespace

using System;
using System.IO;
using System.Linq;
using System.Net;
using HtmlAgilityPack;

#endregion

namespace DownloadImages
{
internal class Program
{
// Single WebClient instance, created once and used by DownLoadImg for every image download.
private static readonly WebClient Wc = new WebClient();
// Characters that CleanInvalidFileName strips from a name: the ASCII control
// characters (U+0000 - U+001F) plus " < > | : * ? \ /.
// The set is spelled out here instead of being read from
// Path.GetInvalidFileNameChars(), whose contents can differ between platforms.
private static readonly char[] InvalidFileNameChars = new[]
{
    '"',
    '<',
    '>',
    '|',
    '\0',
    '\u0001',
    '\u0002',
    '\u0003',
    '\u0004',
    '\u0005',
    '\u0006',
    '\a',
    '\b',
    '\t',
    '\n',
    '\v',
    '\f',
    '\r',
    '\u000e',
    '\u000f',
    '\u0010',
    '\u0011',
    '\u0012',
    '\u0013',
    '\u0014',
    '\u0015',
    '\u0016',
    '\u0017',
    '\u0018',
    '\u0019',
    '\u001a',
    '\u001b',
    '\u001c',
    '\u001d',
    '\u001e',
    '\u001f',
    ':',
    '*',
    '?',
    '\\',
    '/'
};

/// <summary>
/// Turns an arbitrary string into one that can be used as a single file or
/// folder name by removing every character listed in InvalidFileNameChars.
/// </summary>
/// <param name="fileName">Candidate name; may be null.</param>
/// <returns>
/// The cleaned name. Null or empty input yields an empty string. If the cleaned
/// name starts with one or more dots, that run of dots is replaced by the text
/// "dot", so the result never begins with '.'.
/// </returns>
public static string CleanInvalidFileName(string fileName)
{
    if (string.IsNullOrEmpty(fileName))
    {
        return "";
    }

    // One pass over the input instead of one full Replace per forbidden character.
    var cleaned = new string(
        fileName.Where(c => Array.IndexOf(InvalidFileNameChars, c) < 0).ToArray());

    // Applies to a lone "." as well: that is not a usable file name either.
    if (cleaned.Length > 0 && cleaned[0] == '.')
    {
        cleaned = "dot" + cleaned.TrimStart('.');
    }

    return cleaned;
}
/// <summary>
/// Program entry point. Hands control straight to <see cref="Start"/>;
/// the command-line arguments are not read.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
private static void Main(string[] args)
{
    Start();
}

/// <summary>
/// Runs the crawl: for every calendar day from 2010-08-18 through today and
/// every page id from 49430 through 124621, requests
/// "&lt;site root&gt;/&lt;yyyyMMdd&gt;/&lt;page id&gt;.html" and downloads the images found
/// in that page's "ks_xp" element. Finishes by removing folders left empty.
/// </summary>
private static void Start()
{
    // Placeholder host: the real site root was left out of the published listing.
    const string siteRoot = "http://xxxxxxxx/";
    const int startPageId = 49430;
    const int endPageId = 124621;

    var web = new HtmlWeb();
    var firstDay = new DateTime(2010, 8, 18);
    var lastDay = DateTime.Today;

    // Step through real calendar days. Counting the yyyyMMdd value up as a plain
    // integer would also visit numbers such as 20100832 that are not dates.
    for (var day = firstDay; day <= lastDay; day = day.AddDays(1))
    {
        var dateFolder = day.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);

        for (var pageId = startPageId; pageId <= endPageId; pageId++)
        {
            var pageUrl = siteRoot + dateFolder + "/" + pageId + ".html";
            try
            {
                DownloadPageImages(web, pageUrl, dateFolder, pageId);
            }
            catch (Exception ex)
            {
                // Best effort, same policy as DownLoadImg: report the failure
                // and move on so one bad page does not end the whole crawl.
                Console.WriteLine(ex.Message);
            }
        }
    }

    // Clean-up: remove folders that ended up without any downloaded file.
    CleanEmptyFolders();
}

/// <summary>
/// Loads one page and downloads every image inside its "ks_xp" element into
/// Images\&lt;dateFolder&gt;\&lt;cleaned page title&gt;. Pages without that element, or
/// without images in it, are skipped.
/// </summary>
/// <param name="web">Loader used to fetch and parse the page.</param>
/// <param name="pageUrl">Absolute URL of the page.</param>
/// <param name="dateFolder">Name of the per-day folder (yyyyMMdd).</param>
/// <param name="pageId">Page id; used as folder name when the page has no usable title.</param>
private static void DownloadPageImages(HtmlWeb web, string pageUrl, string dateFolder, int pageId)
{
    HtmlDocument doc = web.Load(pageUrl);

    HtmlNode container = doc.GetElementbyId("ks_xp");
    if (container == null)
    {
        return;
    }

    // ".//img" restricts the search to descendants of the container; a bare
    // "//img" would match every image in the document. SelectNodes returns
    // null, not an empty collection, when nothing matches.
    var images = container.SelectNodes(".//img");
    if (images == null)
    {
        return;
    }

    var titleNode = doc.DocumentNode.SelectSingleNode("//title");
    var titleName = titleNode != null ? titleNode.InnerText : pageId.ToString();

    // Computed once per page rather than once per image.
    var subFolder = CleanInvalidFileName(titleName).Trim();
    if (subFolder.Length == 0)
    {
        subFolder = pageId.ToString();
    }

    var pageUri = new Uri(pageUrl);
    foreach (HtmlNode image in images)
    {
        var srcAttribute = image.Attributes["src"];
        if (srcAttribute == null)
        {
            continue;
        }

        // Resolve relative references ("/upload/1.jpg", "../1.jpg") against the
        // page URL; absolute URLs pass through unchanged.
        Uri imageUri;
        if (!Uri.TryCreate(pageUri, srcAttribute.Value.Trim(), out imageUri))
        {
            continue;
        }

        string imgurl = imageUri.AbsoluteUri;
        DownLoadImg(imgurl, dateFolder, subFolder);
        Console.WriteLine("正在下载: " + titleName + " " + imgurl);
    }
}

/// <summary>
/// Deletes every folder below "Images" (in the current directory) whose
/// whole subtree contains no file. Does nothing when "Images" does not exist.
/// </summary>
private static void CleanEmptyFolders()
{
    var root = Path.Combine(Environment.CurrentDirectory, "Images");
    if (!Directory.Exists(root))
    {
        // Nothing was ever written, so there is nothing to tidy up.
        return;
    }

    foreach (var folder in Directory.GetDirectories(root, "*", SearchOption.AllDirectories))
    {
        // The list was captured up front; a folder may already be gone because
        // one of its ancestors was deleted earlier in this loop.
        if (!Directory.Exists(folder))
        {
            continue;
        }

        if (Directory.GetFiles(folder, "*", SearchOption.AllDirectories).Length == 0)
        {
            // Recursive delete: the folder holds no files but may still hold
            // empty subfolders, which the one-argument overload refuses to remove.
            Directory.Delete(folder, true);
        }
    }
}

/// <summary>
/// Downloads one image to Images\&lt;folderName&gt;\&lt;subFolderName&gt;\&lt;file name&gt;
/// below the current directory, creating the folders as needed. The file name
/// is the last path segment of the URL, cleaned by CleanInvalidFileName.
/// Failures are written to the console and otherwise ignored.
/// </summary>
/// <param name="url">Absolute URL of the image.</param>
/// <param name="folderName">First-level folder name under Images.</param>
/// <param name="subFolderName">Second-level folder name; must already be a valid folder name.</param>
private static void DownLoadImg(string url, string folderName, string subFolderName)
{
    if (string.IsNullOrEmpty(url))
    {
        return;
    }

    // Leave the query string and fragment out of the file name:
    // ".../a.jpg?w=200" should be saved as "a.jpg", not "a.jpgw=200".
    var resource = url;
    var cut = resource.IndexOfAny(new[] { '?', '#' });
    if (cut >= 0)
    {
        resource = resource.Substring(0, cut);
    }

    var fileName = CleanInvalidFileName(resource.Substring(resource.LastIndexOf('/') + 1));
    if (fileName.Length == 0)
    {
        // The URL names no file (for example it ends with '/').
        return;
    }

    try
    {
        // Path building and folder creation sit inside the try as well, so a bad
        // path is reported like any other failed download instead of ending the run.
        var fileFolder = Path.Combine(
            Path.Combine(Environment.CurrentDirectory, "Images"),
            Path.Combine(folderName, subFolderName));
        Directory.CreateDirectory(fileFolder); // no-op when the folder already exists
        Wc.DownloadFile(url, Path.Combine(fileFolder, fileName));
    }
    catch (Exception ex)
    {
        // Deliberate best effort: one failed image must not stop the crawl.
        Console.WriteLine(ex.Message);
    }
}
}
}
复制代码

测试程序和源码下载:

/Files/Chinasf/DownloadImages.rar




本文转自suifei博客园博客,原文链接:http://www.cnblogs.com/Chinasf/archive/2012/02/16/2354971.html,如需转载请自行联系原作者

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值