using System.Net;
namespace Crawler
{
/// <summary>
/// A <see cref="WebClient"/> that attaches one shared cookie container to every HTTP
/// request it creates and keeps the cookie collection of the latest HTTP response.
/// </summary>
public class CookiesAwareWebClient : WebClient
{
    // Header values applied to every HTTP request created by this client.
    private const string BrowserUserAgent =
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705;)";
    private const string FormContentType = "application/x-www-form-urlencoded";

    private readonly CookieContainer _outbound = new CookieContainer();
    private CookieCollection _inbound = new CookieCollection();

    /// <summary>Cookie container assigned to each outgoing HTTP request.</summary>
    public CookieContainer OutboundCookies
    {
        get { return _outbound; }
    }

    /// <summary>
    /// Cookies of the most recent HTTP response that exposed a non-null collection;
    /// an empty collection until then.
    /// </summary>
    public CookieCollection InboundCookies
    {
        get { return _inbound; }
    }

    /// <summary>
    /// When <c>true</c>, requests are created with automatic redirect following disabled.
    /// </summary>
    public bool IgnoreRedirects { get; set; }

    /// <summary>
    /// Creates the request through the base class and, for HTTP requests, applies the
    /// cookie container, redirect policy, user agent and content type.
    /// </summary>
    /// <param name="address">The URI the request targets.</param>
    /// <returns>The request produced by the base class, configured when it is HTTP.</returns>
    protected override WebRequest GetWebRequest(System.Uri address)
    {
        WebRequest created = base.GetWebRequest(address);
        HttpWebRequest http = created as HttpWebRequest;
        if (http == null)
        {
            // Non-HTTP requests are passed through untouched.
            return created;
        }

        http.CookieContainer = _outbound;
        http.AllowAutoRedirect = !IgnoreRedirects;
        http.UserAgent = BrowserUserAgent;
        http.ContentType = FormContentType;
        return created;
    }

    /// <summary>
    /// Obtains the response through the base class and records its cookies when it is
    /// an HTTP response carrying a non-null cookie collection.
    /// </summary>
    /// <param name="request">The request whose response is fetched.</param>
    /// <returns>The response produced by the base class.</returns>
    protected override WebResponse GetWebResponse(WebRequest request)
    {
        WebResponse received = base.GetWebResponse(request);
        HttpWebResponse http = received as HttpWebResponse;
        if (http != null && http.Cookies != null)
        {
            // Replace (not merge) the previously captured collection.
            _inbound = http.Cookies;
        }

        return received;
    }
}
}
CustomDownloaderModule.cs Raw
using System.Net;
using NCrawler;
using Autofac;
using NCrawler.Interfaces;
namespace Crawler
{
/// <summary>
/// NCrawler module that registers <see cref="CustomWebDownloader"/> as the
/// <see cref="IWebDownloader"/> service, constructed with a caller-supplied
/// cookie container.
/// </summary>
public class CustomDownloaderModule : NCrawlerModule
{
    private readonly CookieContainer _cookieContainer;

    /// <summary>Creates the module.</summary>
    /// <param name="cookieContainer">
    /// Cookie container passed to the <see cref="CustomWebDownloader"/> this module registers.
    /// </param>
    public CustomDownloaderModule(CookieContainer cookieContainer)
    {
        _cookieContainer = cookieContainer;
    }

    /// <summary>
    /// Runs the base module's registrations and then registers
    /// <see cref="CustomWebDownloader"/> for <see cref="IWebDownloader"/>.
    /// </summary>
    /// <param name="builder">The Autofac container builder to register into.</param>
    protected override void Load(ContainerBuilder builder)
    {
        // Base registrations go first; the downloader registration below is added after them.
        base.Load(builder);
        builder.Register(c => new CustomWebDownloader(_cookieContainer))
            .As<IWebDownloader>()
            .SingleInstance()
            .ExternallyOwned();
    }

    /// <summary>
    /// Convenience entry point: configures NCrawler with a single
    /// <see cref="CustomDownloaderModule"/> built around <paramref name="cookieContainer"/>.
    /// </summary>
    /// <param name="cookieContainer">Cookie container handed to the registered downloader.</param>
    public static void Setup(CookieContainer cookieContainer)
    {
        // Previously this method called itself and recursed until the stack overflowed.
        // Delegate to the base-class setup with a module instance instead.
        NCrawlerModule.Setup(new Autofac.Module[] { new CustomDownloaderModule(cookieContainer) });
    }
}
}
CustomWebDownloader.cs Raw
using System.Net;
using NCrawler.Services;
namespace Crawler
{
/// <summary>
/// A <see cref="WebDownloaderV2"/> that assigns a fixed cookie container to every
/// request after the base class has applied its default request properties.
/// </summary>
public class CustomWebDownloader : WebDownloaderV2
{
    private readonly CookieContainer _sessionCookies;

    /// <summary>Creates the downloader.</summary>
    /// <param name="cookieContainer">Cookie container to assign to each request.</param>
    public CustomWebDownloader(CookieContainer cookieContainer)
    {
        this._sessionCookies = cookieContainer;
    }

    /// <summary>
    /// Applies the base defaults, then sets the request's cookie container to the one
    /// supplied at construction.
    /// </summary>
    /// <param name="request">The HTTP request being prepared.</param>
    protected override void SetDefaultRequestProperties(HttpWebRequest request)
    {
        base.SetDefaultRequestProperties(request);

        // Assigned after the base call so this container is the one left on the request.
        request.CookieContainer = this._sessionCookies;
    }
}
}
Program.cs Raw
using System;
using System.Collections.Specialized;
using System.Net;
using NCrawler;
using NCrawler.HtmlProcessor;
using NCrawler.Interfaces;
using NCrawler.Services;
using Module = Autofac.Module;
/// <summary>
/// Sample entry point: obtains cookies by posting credentials to a login page, then
/// crawls the site with a downloader that sends those cookies.
/// </summary>
public static class Program
{
    /// <summary>
    /// Fetches the login cookies, configures NCrawler with the cookie-aware downloader
    /// module and runs the crawl.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(string[] args)
    {
        var authorizedCookies = GetAuthorizationCookie(new Uri("http://mysecuresite.com/login.html"));

        // Namespace-qualified because this file has no using directive for "Crawler".
        var modules = new Module[] { new Crawler.CustomDownloaderModule(authorizedCookies) };
        NCrawlerModule.Setup(modules);

        // "NCrawler.Crawler" is spelled out: the bare name "Crawler" resolves to the
        // Crawler namespace, not the NCrawler class. The start address is passed as a Uri.
        using (var crawler = new NCrawler.Crawler(new Uri("http://mysecuresite.com/"), new HtmlDocumentProcessor()))
        {
            crawler.Crawl();
        }
    }

    /// <summary>
    /// Loads the login page, posts the login form to it and returns the cookie container
    /// used for both requests.
    /// </summary>
    /// <param name="loginPage">Address of the login page; used for both the GET and the POST.</param>
    /// <returns>
    /// The client's outbound cookie container. A console message reports whether it
    /// holds any cookies; the container is returned either way.
    /// </returns>
    private static CookieContainer GetAuthorizationCookie(Uri loginPage)
    {
        CookieContainer cookies;

        // Put all required form post data here.
        // NOTE(review): placeholder credentials; load real ones from configuration
        // rather than committing them to source.
        var postData = new NameValueCollection
        {
            {"userid", "user1"},
            {"pwd", "password"},
        };

        using (var client = new Crawler.CookiesAwareWebClient())
        {
            client.IgnoreRedirects = false;

            // Load the page via a GET request to initialize cookies.
            client.DownloadData(loginPage);

            // Add the cookies received so far to the outbound container.
            client.OutboundCookies.Add(client.InboundCookies);
            client.UploadValues(loginPage, "POST", postData);

            // Add the latest cookies (expected to include the authorization cookie).
            client.OutboundCookies.Add(client.InboundCookies);
            cookies = client.OutboundCookies;
        }

        if (cookies == null || cookies.Count == 0)
        {
            Console.WriteLine("Authorization Cookies are null or empty.");
        }
        else
        {
            Console.WriteLine("Authorization Cookies obtained.");
        }

        return cookies;
    }
}
Readme.txt Raw
How to add custom authentication using cookie-based authorization from a POST-based login page.
- Create cookies aware web client so that we can obtain the required cookies.
- Create CustomWebDownloader that inherits from WebDownloaderV2 and overrides the cookie behavior.
- Create custom NCrawlerModule that will implement the CustomWebDownloader.
- Get the required login cookies for the session.
- Register CustomDownloaderModule, passing in the authorized cookies. Note that the last item registered with Autofac will be the one used, so our CustomWebDownloader will now replace the default WebDownloaderV2.
- Crawl the site; it will now use the CustomWebDownloader for all links crawled.