领导要求公司旗下所有网站的51拉统计数据能自动抓取到我们自己的数据库中,心想这种抓取统计数据的程序以前肯定有人写过,到百度搜了一下没有找到抓取51la的,于是乎自己动手。
1、利用 HttpWatch 找到网站的登录入口及其参数(详情略,HttpWatch 的使用方法请自行百度)。
2、利用 HttpWebRequest 将 51.la 的查看密码 POST 到网站入口,登录成功后再请求想要抓取的页面,并读取该页面的内容。
/// <summary>
/// Logs in to 51.la with the view password, then downloads the search-engine
/// report page (<c>3_SE.asp</c>) for the given date range and returns its raw HTML.
/// </summary>
/// <param name="strStaticId">Statistics ID; sent as the <c>id</c> field of both requests.</param>
/// <param name="LookPass">View password; sent as the <c>lookpass</c> field of the login form.</param>
/// <param name="strBeginDate">Start of the range; sent as <c>d1</c>. Expected date format is not visible here - TODO confirm.</param>
/// <param name="strEndDate">End of the range; sent as <c>d2</c>. Expected date format is not visible here - TODO confirm.</param>
/// <returns>The report page body, decoded with <see cref="Encoding.Default"/>.</returns>
public string FiveLaSeo(string strStaticId, string LookPass, string strBeginDate, string strEndDate)
{
    // The same container is attached to both requests so that cookies set by the
    // login response are sent along with the report request.
    CookieContainer cookies = new CookieContainer();

    // Percent-encode every value so characters such as '&', '=' or '+' cannot
    // corrupt the form body / query string. Null is treated as empty, matching
    // what plain string concatenation did before.
    string id = System.Uri.EscapeDataString(strStaticId ?? "");
    string postData = "id=" + id
        + "&lookpass=" + System.Uri.EscapeDataString(LookPass ?? "")
        + "&t=chalogin";

    // After escaping, the payload is pure ASCII.
    byte[] data = Encoding.ASCII.GetBytes(postData);

    // Step 1: post the login form.
    HttpWebRequest loginRequest = (HttpWebRequest)WebRequest.Create("http://www.51.la/report/0_help.asp");
    loginRequest.Method = "POST";
    loginRequest.ContentType = "application/x-www-form-urlencoded";
    loginRequest.ContentLength = data.Length;
    loginRequest.CookieContainer = cookies;

    using (Stream requestBody = loginRequest.GetRequestStream())
    {
        requestBody.Write(data, 0, data.Length);
    }

    // The login page body itself is never used; the response is only obtained
    // (and released) so the request completes and the connection is freed.
    using (WebResponse loginResponse = loginRequest.GetResponse())
    {
    }

    // Step 2: fetch the report page with the cookies collected above.
    string reportUrl = "http://www.51.la/report/3_SE.asp?id=" + id
        + "&d1=" + System.Uri.EscapeDataString(strBeginDate ?? "")
        + "&d2=" + System.Uri.EscapeDataString(strEndDate ?? "");
    HttpWebRequest reportRequest = (HttpWebRequest)WebRequest.Create(reportUrl);
    reportRequest.Method = "GET";
    reportRequest.KeepAlive = false;
    reportRequest.CookieContainer = cookies;

    // NOTE(review): Encoding.Default depends on the runtime/OS; presumably the
    // page is served in a Chinese ANSI code page - confirm against the site.
    using (WebResponse reportResponse = reportRequest.GetResponse())
    using (StreamReader reader = new StreamReader(reportResponse.GetResponseStream(), Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Logs in to 51.la with the view password, then downloads the search-engine
/// report page (<c>3_SE.asp</c>) for the given date range and returns its raw HTML.
/// </summary>
/// <param name="strStaticId">Statistics ID; sent as the <c>id</c> field of both requests.</param>
/// <param name="LookPass">View password; sent as the <c>lookpass</c> field of the login form.</param>
/// <param name="strBeginDate">Start of the range; sent as <c>d1</c>. Expected date format is not visible here - TODO confirm.</param>
/// <param name="strEndDate">End of the range; sent as <c>d2</c>. Expected date format is not visible here - TODO confirm.</param>
/// <returns>The report page body, decoded with <see cref="Encoding.Default"/>.</returns>
public string FiveLaSeo(string strStaticId, string LookPass, string strBeginDate, string strEndDate)
{
    // The same container is attached to both requests so that cookies set by the
    // login response are sent along with the report request.
    CookieContainer cookies = new CookieContainer();

    // Percent-encode every value so characters such as '&', '=' or '+' cannot
    // corrupt the form body / query string. Null is treated as empty, matching
    // what plain string concatenation did before.
    string id = System.Uri.EscapeDataString(strStaticId ?? "");
    string postData = "id=" + id
        + "&lookpass=" + System.Uri.EscapeDataString(LookPass ?? "")
        + "&t=chalogin";

    // After escaping, the payload is pure ASCII.
    byte[] data = Encoding.ASCII.GetBytes(postData);

    // Step 1: post the login form.
    HttpWebRequest loginRequest = (HttpWebRequest)WebRequest.Create("http://www.51.la/report/0_help.asp");
    loginRequest.Method = "POST";
    loginRequest.ContentType = "application/x-www-form-urlencoded";
    loginRequest.ContentLength = data.Length;
    loginRequest.CookieContainer = cookies;

    using (Stream requestBody = loginRequest.GetRequestStream())
    {
        requestBody.Write(data, 0, data.Length);
    }

    // The login page body itself is never used; the response is only obtained
    // (and released) so the request completes and the connection is freed.
    using (WebResponse loginResponse = loginRequest.GetResponse())
    {
    }

    // Step 2: fetch the report page with the cookies collected above.
    string reportUrl = "http://www.51.la/report/3_SE.asp?id=" + id
        + "&d1=" + System.Uri.EscapeDataString(strBeginDate ?? "")
        + "&d2=" + System.Uri.EscapeDataString(strEndDate ?? "");
    HttpWebRequest reportRequest = (HttpWebRequest)WebRequest.Create(reportUrl);
    reportRequest.Method = "GET";
    reportRequest.KeepAlive = false;
    reportRequest.CookieContainer = cookies;

    // NOTE(review): Encoding.Default depends on the runtime/OS; presumably the
    // page is served in a Chinese ANSI code page - confirm against the site.
    using (WebResponse reportResponse = reportRequest.GetResponse())
    using (StreamReader reader = new StreamReader(reportResponse.GetResponseStream(), Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}
3、将抓取的页面进行正则表达式匹配,取出自己所需要的数据(这里我需要搜索引擎流量)
/// <summary>
/// Fetches the 51.la search-engine report via <c>FiveLa.FiveLaSeo</c> and extracts the
/// search-engine visit count (the value shown before "IP" inside the parentheses).
/// </summary>
/// <param name="strStaticId">Passed through unchanged to <c>FiveLaSeo</c>.</param>
/// <param name="LookPass">Passed through unchanged to <c>FiveLaSeo</c>.</param>
/// <param name="strBeginDate">Passed through unchanged to <c>FiveLaSeo</c>.</param>
/// <param name="strEndDate">Passed through unchanged to <c>FiveLaSeo</c>.</param>
/// <returns>
/// The trimmed text found between "(" and "IP)", or <c>"0"</c> when the marker
/// text is not present in the page or the captured text is empty.
/// </returns>
public string FivelaSeoPv(string strStaticId, string LookPass, string strBeginDate, string strEndDate)
{
    Tool.FiveLa fl = new FiveLa();
    string html = fl.FiveLaSeo(strStaticId, LookPass, strBeginDate, strEndDate);

    // The capture group isolates the value directly, so no prefix/suffix stripping
    // is needed afterwards. "*?" is lazy: it stops at the first "IP)" that is
    // followed by the trailing marker. "\s*" tolerates optional whitespace around
    // the punctuation, since the exact page layout is not visible from here.
    const string pattern = @"来自搜索引擎的访问量\s*\(\s*([\s\S]*?)\s*IP\s*\)\s*占总访问量";
    Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);

    string number = match.Success ? match.Groups[1].Value.Trim() : "";
    if (number == "")
    {
        // No match (or nothing captured): report zero visits.
        number = "0";
    }
    return number;
}
/// <summary>
/// Fetches the 51.la search-engine report via <c>FiveLa.FiveLaSeo</c> and extracts the
/// search-engine visit count (the value shown before "IP" inside the parentheses).
/// </summary>
/// <param name="strStaticId">Passed through unchanged to <c>FiveLaSeo</c>.</param>
/// <param name="LookPass">Passed through unchanged to <c>FiveLaSeo</c>.</param>
/// <param name="strBeginDate">Passed through unchanged to <c>FiveLaSeo</c>.</param>
/// <param name="strEndDate">Passed through unchanged to <c>FiveLaSeo</c>.</param>
/// <returns>
/// The trimmed text found between "(" and "IP)", or <c>"0"</c> when the marker
/// text is not present in the page or the captured text is empty.
/// </returns>
public string FivelaSeoPv(string strStaticId, string LookPass, string strBeginDate, string strEndDate)
{
    Tool.FiveLa fl = new FiveLa();
    string html = fl.FiveLaSeo(strStaticId, LookPass, strBeginDate, strEndDate);

    // The capture group isolates the value directly, so no prefix/suffix stripping
    // is needed afterwards. "*?" is lazy: it stops at the first "IP)" that is
    // followed by the trailing marker. "\s*" tolerates optional whitespace around
    // the punctuation, since the exact page layout is not visible from here.
    const string pattern = @"来自搜索引擎的访问量\s*\(\s*([\s\S]*?)\s*IP\s*\)\s*占总访问量";
    Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);

    string number = match.Success ? match.Groups[1].Value.Trim() : "";
    if (number == "")
    {
        // No match (or nothing captured): report zero visits.
        number = "0";
    }
    return number;
}
4、最后来看看利用ActiveReport显示取得数据的效果。