一、工具
NCrawler
http://ncrawler.codeplex.com/
Html Agility Pack
http://htmlagilitypack.codeplex.com/
ScrapySharp
https://bitbucket.org/rflechner/scrapysharp
Quartz.NET
http://www.quartz-scheduler.net/
(注:CodePlex 已于 2017 年关闭,上述 codeplex.com 链接已失效;NCrawler 与 Html Agility Pack 目前均托管于 GitHub,可通过 NuGet 获取。)
二、先看几个基本的
//demo:基础案例
using System;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Xml;
using NCrawler;
using NCrawler.HtmlProcessor;
using NCrawler.Interfaces;
// Minimal pipeline step: logs the URI of every page the crawler visits.
public class DumperStep : IPipelineStep
{
    // Invoked by NCrawler for each downloaded step; dump its URI to stdout.
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        var visitedUri = propertyBag.Step.Uri;
        Console.WriteLine(visitedUri);
    }
}
// Entry point for the basic NCrawler demo: crawls www.csdn.net two levels
// deep with three worker threads and dumps every visited URI.
class Program
{
    static void Main(string[] args)
    {
        Uri uri = new Uri("http://www.csdn.net/");

        // Crawler implements IDisposable — dispose it deterministically so
        // worker threads and internal services are released when done.
        using (Crawler c = new Crawler(uri, new HtmlDocumentProcessor(), new DumperStep()))
        {
            c.MaximumThreadCount = 3; // number of worker threads
            c.MaximumCrawlDepth = 2;  // how many link levels to follow
            // Skip static resources (images/css/js): they contain no links worth following.
            c.BlackListedUriRegexMatchers = new[]
            {
                new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png)",
                    RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase)
            };
            c.Crawl(); // blocks until the crawl completes
        }
    }
}
//demo:死链检查完整代码
/// <summary>
/// Crawls a site and collects every dead link (non-200 response, recorded by
/// <c>ScanResultWriter</c>) into an XML report that is saved to the current
/// directory when the crawl finishes.
/// </summary>
public class InvalidLinkDetector
{
    private XmlDocument _writerDoc = null;
    private Crawler _crawler = null;

    /// <param name="DetectUrl">Root URL the crawl starts from.</param>
    /// <param name="MaxThreadCount">Number of crawler worker threads.</param>
    /// <param name="MaxCrawlDepth">Maximum link depth to follow.</param>
    public InvalidLinkDetector(string DetectUrl, int MaxThreadCount, int MaxCrawlDepth)
    {
        this._writerDoc = new XmlDocument();
        this._writerDoc.LoadXml("<invalidUrls></invalidUrls>");
        this._crawler = new Crawler(DetectUrl, new HtmlDocumentProcessor(), new ScanResultWriter(this._writerDoc))
        {
            MaximumThreadCount = MaxThreadCount,
            MaximumCrawlDepth = MaxCrawlDepth
        };
        // Method-group subscriptions: the explicit `new EventHandler<...>(...)`
        // wrappers in the original were redundant.
        this._crawler.CrawlFinished += CrawlFinished;
        this._crawler.PipelineException += PipelineException;
    }

    /// <summary>Starts the crawl; blocks until it completes.</summary>
    public void Run()
    {
        this._crawler.Crawl();
    }

    // Logs pipeline failures (e.g. a processor throwing) without aborting the crawl.
    private void PipelineException(object sender, NCrawler.Events.PipelineExceptionEventArgs e)
    {
        Console.WriteLine("Exception occurred in pipeline: {0}, message: {1}", e.PropertyBag.Step.GetType().Name, e.Exception.Message);
    }

    // Saves the accumulated report once the crawl is done, then drops the document.
    private void CrawlFinished(object sender, NCrawler.Events.CrawlFinishedEventArgs e)
    {
        // Path.Combine instead of manual '\' concatenation: correct regardless
        // of platform and of whether CurrentDirectory ends with a separator.
        string fileName = DateTime.Now.ToString("yyyyMMdd HHmmss") + ".xml";
        this._writerDoc.Save(Path.Combine(Environment.CurrentDirectory, fileName));
        this._writerDoc = null; // report written; release the document
    }
}
/// <summary>
/// Pipeline step that appends an &lt;invalidUrl url=".." refer=".." reason=".."/&gt;
/// element to the shared XML report for every response whose status is not 200 OK.
/// </summary>
public class ScanResultWriter : IPipelineStep
{
    // Shared report document, owned by InvalidLinkDetector.
    // NOTE(review): the crawler may run multiple worker threads and XmlDocument
    // is not thread-safe — confirm Process is not invoked concurrently.
    private XmlDocument _writerDoc = null;

    public ScanResultWriter(XmlDocument doc)
    {
        this._writerDoc = doc;
    }

    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        if (propertyBag.StatusCode != HttpStatusCode.OK)
        {
            Console.WriteLine("Found an invalid link...");
            // CreateElement/SetAttribute is the direct API for building an
            // attributed element — replaces the original's four CreateNode
            // calls plus SetNamedItem plumbing.
            XmlElement node = this._writerDoc.CreateElement("invalidUrl");
            node.SetAttribute("url", propertyBag.OriginalUrl);
            node.SetAttribute("refer", propertyBag.OriginalReferrerUrl);
            node.SetAttribute("reason", ((int)propertyBag.StatusCode).ToString());
            this._writerDoc.DocumentElement.AppendChild(node);
        }
    }
}
// Demo entry point: scan msdn.microsoft.com for dead links using
// 10 threads and a crawl depth of 4; the XML report is written on finish.
static void Main(string[] args)
{
    Console.WriteLine("Scanning...");
    var detector = new InvalidLinkDetector("http://msdn.microsoft.com", 10, 4);
    detector.Run();
}
三、html解析
demo:基本使用方法
// Demo: download a blog post with ScrapySharp's ScrapingBrowser and query
// it with Html Agility Pack CSS selectors (tag, class, and id selectors).
static void Main(string[] args)
{
    var uri = new Uri("http://www.cnblogs.com/shanyou/archive/2012/05/20/2509435.html");
    var browser1 = new ScrapingBrowser();
    var html1 = browser1.DownloadString(uri);

    var htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(html1);
    var html = htmlDocument.DocumentNode;

    // Local helper: print the inner HTML of every node the selector matches.
    void DumpSelect(string selector)
    {
        foreach (var node in html.CssSelect(selector))
        {
            Console.WriteLine(node.InnerHtml);
        }
    }

    DumpSelect("title");                // tag selector
    DumpSelect("div.postBody");         // class selector
    DumpSelect("#cnblogs_post_body");   // id selector
}
示例:像一个浏览器一样
// Drive Bing like a real browser: fill in the search form, submit it,
// gather the result links, then click through to one specific result.
ScrapingBrowser scraper = new ScrapingBrowser();
//scraper.UseDefaultCookiesParser = false; // enable if a site returns an invalid cookie format
WebPage homePage = scraper.NavigateToPage(new Uri("http://www.bing.com/"));

PageWebForm searchForm = homePage.FindFormById("sb_form");
searchForm["q"] = "scrapysharp";
searchForm.Method = HttpVerb.Get;
WebPage resultsPage = searchForm.Submit();

HtmlNode[] resultsLinks = resultsPage.Html.CssSelect("div.sb_tlst h3 a").ToArray();
WebPage blogPage = resultsPage.FindLinks(By.Text("romcyber blog | Just another WordPress site")).Single().Click();
参考:
https://msdn.microsoft.com/zh-tw/ee939355
http://ncrawler.codeplex.com
dotnet下的采集
最新推荐文章于 2024-06-11 09:36:25 发布