一、工具
NCrawler
http://ncrawler.codeplex.com/
Html Agility Pack
http://htmlagilitypack.codeplex.com/
ScrapySharp
https://bitbucket.org/rflechner/scrapysharp
Quartz.NET
http://www.quartz-scheduler.net/
(注:CodePlex 已于 2017 年关闭,上述 codeplex.com 链接已失效;NCrawler 与 Html Agility Pack 目前均托管于 GitHub,可通过 NuGet 获取。)
二、先看几个基本的
//demo:基础案例
using System;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Xml;
using NCrawler;
using NCrawler.HtmlProcessor;
using NCrawler.Interfaces;
// Minimal pipeline step: logs the URI of every page the crawler visits.
public class DumperStep : IPipelineStep
{
    // Invoked by NCrawler for each downloaded step; dump its URI to stdout.
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        var visitedUri = propertyBag.Step.Uri;
        Console.WriteLine(visitedUri);
    }
}
// Entry point for the basic NCrawler demo: crawls www.csdn.net two levels
// deep with three worker threads and dumps every visited URI.
class Program
{
    static void Main(string[] args)
    {
        Uri uri = new Uri("http://www.csdn.net/");

        // Crawler implements IDisposable — dispose it deterministically so
        // worker threads and internal services are released when done.
        using (Crawler c = new Crawler(uri, new HtmlDocumentProcessor(), new DumperStep()))
        {
            c.MaximumThreadCount = 3; // number of worker threads
            c.MaximumCrawlDepth = 2;  // how many link levels to follow
            // Skip static resources (images/css/js): they contain no links worth following.
            c.BlackListedUriRegexMatchers = new[]
            {
                new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png)",
                    RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase)
            };
            c.Crawl(); // blocks until the crawl completes
        }
    }
}
//demo:死链检查完整代码
/// <summary>
/// Crawls a site and collects every dead link (non-200 response, recorded by
/// <c>ScanResultWriter</c>) into an XML report that is saved to the current
/// directory when the crawl finishes.
/// </summary>
public class InvalidLinkDetector
{
    private XmlDocument _writerDoc = null;
    private Crawler _crawler = null;

    /// <param name="DetectUrl">Root URL the crawl starts from.</param>
    /// <param name="MaxThreadCount">Number of crawler worker threads.</param>
    /// <param name="MaxCrawlDepth">Maximum link depth to follow.</param>
    public InvalidLinkDetector(string DetectUrl, int MaxThreadCount, int MaxCrawlDepth)
    {
        this._writerDoc = new XmlDocument();
        this._writerDoc.LoadXml("<invalidUrls></invalidUrls>");
        this._crawler = new Crawler(DetectUrl, new HtmlDocumentProcessor(), new ScanResultWriter(this._writerDoc))
        {
            MaximumThreadCount = MaxThreadCount,
            MaximumCrawlDepth = MaxCrawlDepth
        };
        // Method-group subscriptions: the explicit `new EventHandler<...>(...)`
        // wrappers in the original were redundant.
        this._crawler.CrawlFinished += CrawlFinished;
        this._crawler.PipelineException += PipelineException;
    }

    /// <summary>Starts the crawl; blocks until it completes.</summary>
    public void Run()
    {
        this._crawler.Crawl();
    }

    // Logs pipeline failures (e.g. a processor throwing) without aborting the crawl.
    private void PipelineException(object sender, NCrawler.Events.PipelineExceptionEventArgs e)
    {
        Console.WriteLine("Exception occurred in pipeline: {0}, message: {1}", e.PropertyBag.Step.GetType().Name, e.Exception.Message);
    }

    // Saves the accumulated report once the crawl is done, then drops the document.
    private void CrawlFinished(object sender, NCrawler.Events.CrawlFinishedEventArgs e)
    {
        // Path.Combine instead of manual '\' concatenation: correct regardless
        // of platform and of whether CurrentDirectory ends with a separator.
        string fileName = DateTime.Now.ToString("yyyyMMdd HHmmss") + ".xml";
        this._writerDoc.Save(Path.Combine(Environment.CurrentDirectory, fileName));
        this._writerDoc = null; // report written; release the document
    }
}
/// <summary>
/// Pipeline step that appends an &lt;invalidUrl url=".." refer=".." reason=".."/&gt;
/// element to the shared XML report for every response whose status is not 200 OK.
/// </summary>
public class ScanResultWriter : IPipelineStep
{
    // Shared report document, owned by InvalidLinkDetector.
    // NOTE(review): the crawler may run multiple worker threads and XmlDocument
    // is not thread-safe — confirm Process is not invoked concurrently.
    private XmlDocument _writerDoc = null;

    public ScanResultWriter(XmlDocument doc)
    {
        this._writerDoc = doc;
    }

    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        if (propertyBag.StatusCode != HttpStatusCode.OK)
        {
            Console.WriteLine("Found an invalid link...");
            // CreateElement/SetAttribute is the direct API for building an
            // attributed element — replaces the original's four CreateNode
            // calls plus SetNamedItem plumbing.
            XmlElement node = this._writerDoc.CreateElement("invalidUrl");
            node.SetAttribute("url", propertyBag.OriginalUrl);
            node.SetAttribute("refer", propertyBag.OriginalReferrerUrl);
            node.SetAttribute("reason", ((int)propertyBag.StatusCode).ToString());
            this._writerDoc.DocumentElement.AppendChild(node);
        }
    }
}
// Demo entry point: scan msdn.microsoft.com for dead links using
// 10 threads and a crawl depth of 4; the XML report is written on finish.
static void Main(string[] args)
{
    Console.WriteLine("Scanning...");
    var detector = new InvalidLinkDetector("http://msdn.microsoft.com", 10, 4);
    detector.Run();
}
三、html解析
demo:基本使用方法
// Demo: download a blog post with ScrapySharp's ScrapingBrowser and query
// it with Html Agility Pack CSS selectors (tag, class, and id selectors).
static void Main(string[] args)
{
    var uri = new Uri("http://www.cnblogs.com/shanyou/archive/2012/05/20/2509435.html");
    var browser1 = new ScrapingBrowser();
    var html1 = browser1.DownloadString(uri);

    var htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(html1);
    var html = htmlDocument.DocumentNode;

    // Local helper: print the inner HTML of every node the selector matches.
    void DumpSelect(string selector)
    {
        foreach (var node in html.CssSelect(selector))
        {
            Console.WriteLine(node.InnerHtml);
        }
    }

    DumpSelect("title");                // tag selector
    DumpSelect("div.postBody");         // class selector
    DumpSelect("#cnblogs_post_body");   // id selector
}
示例:像一个浏览器一样
// Drive Bing like a real browser: fill in the search form, submit it,
// gather the result links, then click through to one specific result.
ScrapingBrowser scraper = new ScrapingBrowser();
//scraper.UseDefaultCookiesParser = false; // enable if a site returns an invalid cookie format
WebPage homePage = scraper.NavigateToPage(new Uri("http://www.bing.com/"));

PageWebForm searchForm = homePage.FindFormById("sb_form");
searchForm["q"] = "scrapysharp";
searchForm.Method = HttpVerb.Get;
WebPage resultsPage = searchForm.Submit();

HtmlNode[] resultsLinks = resultsPage.Html.CssSelect("div.sb_tlst h3 a").ToArray();
WebPage blogPage = resultsPage.FindLinks(By.Text("romcyber blog | Just another WordPress site")).Single().Click();
参考:
https://msdn.microsoft.com/zh-tw/ee939355
http://ncrawler.codeplex.com
dotnet下的采集
最新推荐文章于 2024-06-11 09:36:25 发布