转:利用WebRequest登录网站,抓取信息

利用WebRequest登录网站,抓取信息
以前也做过类似于新闻小偷之类的东西,就是利用WebRequest从请求的URL获取信息后进行分析,然后显示。记得当时老板让做一个搜索时抓取程序,想把这边几个BBS的信息都能纳入搜索范围。因为有些BBS的搜索功能必须登录后才能使用,所以当时我想破了脑子也没办法,最后认为这个任务根本没法做,就没往下思考。今天在CSDN上看到一个帖子,才知道原来这并非不可实现的,其实早已有人这样做过。要做到这些,其要点有两个:

1、通过附加一个cookiecontainer到httprequest对象中,可以得到登录后返回的代表SESSION ID的COOKIE。 

2、将此COOKIE包含在一个cookiecontainer中并附加到另一个HTTPREQUEST请求中,则可以实现SESSION的还原。

using System;
using System.Collections;
using System.ComponentModel;
using System.Data;
//using System.Data.OleDb;
using System.Drawing;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Data.Odbc;

namespace PdfTest
{
/// <summary>
/// Demo page that logs in to a remote site with <see cref="HttpWebRequest"/>,
/// captures the cookies returned by the login response, and replays them on a
/// second request so that request runs inside the logged-in session.
/// The fetched page is saved to c:/save.htm and echoed to the response.
/// </summary>
public class getHttpInfo : System.Web.UI.Page
{
    /// <summary>
    /// Cookie header ("name=value; name2=value2") captured by <see cref="Login"/>.
    /// It is also cached in Application["cookieheader"] so later page loads can
    /// skip the login request.
    /// </summary>
    protected static string cookieheader;


    /// <summary>
    /// Reuses the cached cookie header (or logs in to obtain one), fetches the
    /// target page with it, saves the result to disk and writes it to the response.
    /// </summary>
    private void Page_Load(object sender, System.EventArgs e)
    {
        object cachedHeader = HttpContext.Current.Application["cookieheader"];
        if (cachedHeader != null)
        {
            cookieheader = (string)cachedHeader;
        }
        else
        {
            // Login into the website; Login stores the session cookie in
            // cookieheader and in the application variable as a side effect,
            // so the returned page body is not needed here.
            Login("http://www.thesiteyouwanttovisit/theloginpage.asp", "Action=&USERID=&Password=");
        }

        string strResult = getPage("http://www.thesiteyouwanttovisit/theloginpage.asp", "Action=&data=");

        // Write the result to htm file.
        // FileMode.Create truncates an existing file; OpenOrCreate would leave
        // stale trailing bytes whenever the new page is shorter than the old one.
        using (FileStream htmFile = new FileStream("c:/save.htm", FileMode.Create))
        using (StreamWriter sw = new StreamWriter(htmFile))
        {
            sw.Write(strResult);
        }

        // output the result
        Response.Write(strResult);
    }


    /// <summary>
    /// POSTs <paramref name="paramList"/> to <paramref name="url"/> without
    /// following redirects, then stores the cookies the request's container
    /// holds for that URL in <see cref="cookieheader"/> and in
    /// Application["cookieheader"].
    /// </summary>
    /// <param name="url">Address of the login page.</param>
    /// <param name="paramList">
    /// Form data such as "a=1&amp;b=2"; names and values are URL-encoded before
    /// sending. When null, an empty body is sent.
    /// </param>
    /// <returns>
    /// The response body decoded as UTF-8, or the text of the exception if the
    /// request failed (exceptions are not propagated to the caller).
    /// </returns>
    public static string Login(String url, String paramList)
    {
        HttpWebResponse res = null;
        string strResult = "";

        try
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "POST";
            req.ContentType = "application/x-www-form-urlencoded";
            // Redirects are not followed, so the response read below is the
            // login response itself.
            req.AllowAutoRedirect = false;
            req.CookieContainer = new CookieContainer();

            WriteFormBody(req, paramList);

            res = (HttpWebResponse)req.GetResponse();

            cookieheader = req.CookieContainer.GetCookieHeader(new Uri(url));
            HttpContext.Current.Application.Lock();
            try
            {
                HttpContext.Current.Application["cookieheader"] = cookieheader;
            }
            finally
            {
                // Always release the application-state lock, even on failure.
                HttpContext.Current.Application.UnLock();
            }

            strResult = ReadResponseText(res);
        }
        catch (Exception e)
        {
            // Callers receive the failure as text rather than as an exception.
            strResult = e.ToString();
        }
        finally
        {
            if (res != null)
            {
                res.Close();
            }
        }

        return strResult;
    }


    /// <summary>
    /// POSTs <paramref name="paramList"/> to <paramref name="url"/>, sending the
    /// cookies previously captured in <see cref="cookieheader"/> so the request
    /// is made within the logged-in session.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <param name="paramList">
    /// Form data such as "a=1&amp;b=2"; names and values are URL-encoded before
    /// sending. When null, an empty body is sent.
    /// </param>
    /// <returns>
    /// The response body decoded as UTF-8, or the text of the exception if the
    /// request failed (exceptions are not propagated to the caller).
    /// </returns>
    public static string getPage(String url, String paramList)
    {
        HttpWebResponse res = null;
        string strResult = "";

        try
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "POST";
            req.KeepAlive = true;
            req.ContentType = "application/x-www-form-urlencoded";
            req.CookieContainer = new CookieContainer();
            RestoreCookies(req.CookieContainer, new Uri(url), cookieheader);

            WriteFormBody(req, paramList);

            res = (HttpWebResponse)req.GetResponse();
            strResult = ReadResponseText(res);
        }
        catch (Exception e)
        {
            // Callers receive the failure as text rather than as an exception.
            strResult = e.ToString();
        }
        finally
        {
            if (res != null)
            {
                res.Close();
            }
        }

        return strResult;
    }


    /// <summary>
    /// Adds every "name=value" pair of a Cookie request header to
    /// <paramref name="container"/> for <paramref name="uri"/>.
    /// </summary>
    /// <remarks>
    /// CookieContainer.GetCookieHeader separates cookies with "; ", whereas
    /// CookieContainer.SetCookies expects them separated by commas, so passing
    /// the header straight to SetCookies restores the first cookie only. The
    /// header is therefore split here and each cookie is added individually.
    /// </remarks>
    private static void RestoreCookies(CookieContainer container, Uri uri, string header)
    {
        // Nothing to restore, e.g. the login failed or returned no cookies.
        if (header == null || header.Length == 0)
        {
            return;
        }

        foreach (string part in header.Split(';'))
        {
            string pair = part.Trim();
            int eq = pair.IndexOf('=');

            // Skip empty fragments and "$Version"/"$Path"-style attributes,
            // which are not cookies and are rejected as cookie names.
            if (eq <= 0 || pair[0] == '$')
            {
                continue;
            }

            container.Add(uri, new Cookie(pair.Substring(0, eq), pair.Substring(eq + 1)));
        }
    }


    /// <summary>
    /// URL-encodes each name and value in <paramref name="paramList"/> while
    /// leaving the '?', '=' and '&amp;' separators between them untouched.
    /// </summary>
    private static string EncodeFormData(string paramList)
    {
        StringBuilder encoded = new StringBuilder();
        char[] reserved = {'?', '=', '&'};
        int start = 0;

        while (start < paramList.Length)
        {
            int sep = paramList.IndexOfAny(reserved, start);
            if (sep == -1)
            {
                // Last token: no further separator follows.
                encoded.Append(HttpUtility.UrlEncode(paramList.Substring(start)));
                break;
            }

            encoded.Append(HttpUtility.UrlEncode(paramList.Substring(start, sep - start)));
            encoded.Append(paramList[sep]);
            start = sep + 1;
        }

        return encoded.ToString();
    }


    /// <summary>
    /// Writes the encoded form data as the UTF-8 request body and sets
    /// ContentLength; a null <paramref name="paramList"/> sends an empty body.
    /// </summary>
    private static void WriteFormBody(HttpWebRequest req, string paramList)
    {
        if (paramList == null)
        {
            req.ContentLength = 0;
            return;
        }

        byte[] body = Encoding.UTF8.GetBytes(EncodeFormData(paramList));
        req.ContentLength = body.Length;

        // Dispose the request stream even if the write fails.
        using (Stream requestStream = req.GetRequestStream())
        {
            requestStream.Write(body, 0, body.Length);
        }
    }


    /// <summary>
    /// Reads the whole response body as UTF-8 text and closes the response stream.
    /// </summary>
    private static string ReadResponseText(HttpWebResponse res)
    {
        using (StreamReader reader = new StreamReader(res.GetResponseStream(), Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }


    #region Web Form Designer generated code
    override protected void OnInit(EventArgs e)
    {
        //
        // CODEGEN: This call is required by the ASP.NET Web Form Designer.
        //
        InitializeComponent();
        base.OnInit(e);
    }

    /// <summary>
    /// Required method for Designer support - do not modify
    /// the contents of this method with the code editor.
    /// </summary>
    private void InitializeComponent()
    {
        this.Load += new System.EventHandler(this.Page_Load);

    }
    #endregion


}
}


http://blog.csdn.net/waterboy/archive/2004/08/31/90427.aspx

http://msdn.microsoft.com/library/chs/default.asp?url=/library/CHS/cpref/html/frlrfsystemnet.asp

http://news.dvbbs.net/infoview/Article_2776.html

http://blog.joycode.com/yaodong/archive/2004/10/10/35129.aspx



阅读更多
个人分类: Asp.Net
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

关闭
关闭
关闭