dotnet下的采集

一、工具
NCrawler
http://ncrawler.codeplex.com/

Html Agility Pack
http://htmlagilitypack.codeplex.com/

ScrapySharp
https://bitbucket.org/rflechner/scrapysharp

Quartz.NET
http://www.quartz-scheduler.net/

二、先看几个基本的
//demo:基础案例
using System;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Xml;

using NCrawler;
using NCrawler.HtmlProcessor;
using NCrawler.Interfaces;

/// <summary>
/// Minimal pipeline step: writes the URI of every crawled page to the console.
/// </summary>
public class DumperStep : IPipelineStep
{
    public void Process(Crawler crawler, PropertyBag propertyBag) =>
        Console.WriteLine(propertyBag.Step.Uri);
}

class Program
{
    static void Main(string[] args)
    {
        // Crawl csdn.net, piping every page through the HTML processor and the dumper step.
        var seed = new Uri("http://www.csdn.net/");

        // Skip static assets (images, stylesheets, scripts).
        var assetFilter = new Regex(
            @"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png)",
            RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase);

        var crawler = new Crawler(seed, new HtmlDocumentProcessor(), new DumperStep())
        {
            MaximumThreadCount = 3,                          // worker threads
            MaximumCrawlDepth = 2,                           // how deep to follow links
            BlackListedUriRegexMatchers = new[] { assetFilter },
        };

        crawler.Crawl(); // blocks until the crawl completes
    }
}



//demo:死链检查完整代码
/// <summary>
/// Crawls a site and collects every dead link (non-200 response) into an XML
/// document, which is written to disk when the crawl finishes.
/// </summary>
public class InvalidLinkDetector
{
    // Accumulates <invalidUrl> entries; nulled after the report is saved.
    private XmlDocument _writerDoc = null;
    private Crawler _crawler = null;

    /// <summary>
    /// Configures (but does not start) the crawl.
    /// </summary>
    /// <param name="DetectUrl">Root URL to scan.</param>
    /// <param name="MaxThreadCount">Number of concurrent crawler threads.</param>
    /// <param name="MaxCrawlDepth">Maximum link depth to follow.</param>
    public InvalidLinkDetector(string DetectUrl, int MaxThreadCount, int MaxCrawlDepth)
    {
        this._writerDoc = new XmlDocument();
        this._writerDoc.LoadXml("<invalidUrls></invalidUrls>");

        this._crawler = new Crawler(DetectUrl, new HtmlDocumentProcessor(), new ScanResultWriter(this._writerDoc))
        {
            MaximumThreadCount = MaxThreadCount,
            MaximumCrawlDepth = MaxCrawlDepth
        };

        // Method-group subscription; the explicit `new EventHandler<...>` wrapper is redundant.
        this._crawler.CrawlFinished += CrawlFinished;
        this._crawler.PipelineException += PipelineException;
    }

    /// <summary>Starts the crawl; blocks until it completes.</summary>
    public void Run()
    {
        this._crawler.Crawl();
    }

    private void PipelineException(object sender, NCrawler.Events.PipelineExceptionEventArgs e)
    {
        Console.WriteLine("Exception occurred in pipeline: {0}, message: {1}", e.PropertyBag.Step.GetType().Name, e.Exception.Message);
    }

    private void CrawlFinished(object sender, NCrawler.Events.CrawlFinishedEventArgs e)
    {
        // Path.Combine instead of manual '\' concatenation: avoids malformed paths
        // and double separators if CurrentDirectory ends with one.
        string reportPath = Path.Combine(
            Environment.CurrentDirectory,
            DateTime.Now.ToString("yyyyMMdd HHmmss") + ".xml");
        this._writerDoc.Save(reportPath);
        this._writerDoc = null;
    }
}

/// <summary>
/// Pipeline step that records every non-200 response as an &lt;invalidUrl&gt;
/// element (with url / refer / reason attributes) in the shared XML document.
/// </summary>
public class ScanResultWriter : IPipelineStep
{
    // Report document owned by the caller; this step only appends to it.
    private XmlDocument _writerDoc = null;

    public ScanResultWriter(XmlDocument doc)
    {
        this._writerDoc = doc;
    }

    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        if (propertyBag.StatusCode != HttpStatusCode.OK)
        {
            Console.WriteLine("Found an invalid link...");

            // CreateElement/SetAttribute is the idiomatic XmlDocument API;
            // CreateNode(XmlNodeType.Attribute, ...) + SetNamedItem builds the
            // same structure with far more ceremony.
            XmlElement node = this._writerDoc.CreateElement("invalidUrl");
            node.SetAttribute("url", propertyBag.OriginalUrl);
            node.SetAttribute("refer", propertyBag.OriginalReferrerUrl);
            node.SetAttribute("reason", ((int)propertyBag.StatusCode).ToString());

            this._writerDoc.DocumentElement.AppendChild(node);
        }
    }
}


static void Main(string[] args)
{
    // Scan msdn.microsoft.com for dead links: 10 threads, crawl depth 4.
    Console.WriteLine("Scanning...");
    new InvalidLinkDetector("http://msdn.microsoft.com", 10, 4).Run();
}

三、html解析

demo:基本使用方法
static void Main(string[] args)
{
    // Download one blog post and dump selected fragments of its DOM.
    var pageUri = new Uri("http://www.cnblogs.com/shanyou/archive/2012/05/20/2509435.html");
    var browser = new ScrapingBrowser();
    var rawHtml = browser.DownloadString(pageUri);

    var document = new HtmlDocument();
    document.LoadHtml(rawHtml);
    var root = document.DocumentNode;

    // Same three queries as before, in the same order:
    // page title, post body by class, post body by id.
    foreach (var selector in new[] { "title", "div.postBody", "#cnblogs_post_body" })
    {
        foreach (var node in root.CssSelect(selector))
        {
            Console.WriteLine(node.InnerHtml);
        }
    }
}



示例:像一个浏览器一样
// Drive ScrapySharp like an interactive browser session.
var scraper = new ScrapingBrowser();

// set UseDefaultCookiesParser as false if a website returns invalid cookies format
// scraper.UseDefaultCookiesParser = false;

// Open Bing's home page.
var bingHome = scraper.NavigateToPage(new Uri("http://www.bing.com/"));

// Fill in the search form and submit it as a GET request.
var searchForm = bingHome.FindFormById("sb_form");
searchForm["q"] = "scrapysharp";
searchForm.Method = HttpVerb.Get;
var searchResults = searchForm.Submit();

// Collect the result links from the results page.
var resultAnchors = searchResults.Html.CssSelect("div.sb_tlst h3 a").ToArray();

// Follow a specific result link by its visible text.
var blogPage = searchResults.FindLinks(By.Text("romcyber blog | Just another WordPress site")).Single().Click();


参考:
https://msdn.microsoft.com/zh-tw/ee939355
http://ncrawler.codeplex.com
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值