using System;
using System.Collections.Generic;
using System.Text;
using System.Threading;
using System.Text.RegularExpressions;
using IWOMWebCrawlerDbLayer.Model;
using IWOMWebCrawlerDbLayer.Common;
using System.Net;
using System.IO;
namespace IWOMWebCrawlerApp.Crawler
{
/// <summary>
/// Crawler for the viayun BBS search results ("行云社", search id 1039).
/// Logs in with an HTTP POST, keeps the returned cookies and parses the
/// Discuz-style search result list into <see cref="CrawlerResult"/> items.
/// </summary>
public class viayunBBS: AbstractLoginSearchEngine
{
    // Account: xxxx  Password: xxxx

    /// <summary>Login endpoint that the credentials are posted to.</summary>
    private const string LoginUrl = "http://bbs.viayun.com/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1";

    /// <summary>Fixed search-result URL used by createUrl, initTestUrl and getPageContent.</summary>
    private const string SearchUrl = "http://bbs.viayun.com/search.php?mod=forum&searchid=5&orderby=lastpost&ascdesc=desc&searchsubmit=yes&kw=aa";

    // NOTE(review): credentials are hard-coded in source; they should be moved to
    // configuration. The two login paths also post different form data (the one
    // used by getPageContent looks like a redacted placeholder) - confirm which is intended.
    private const string CookieLoginPostData = "username=ababs&password=19881014&quickforward=yes&handlekey=ls";
    private const string PageLoginPostData = @"username=xxxxx&password=xxxxx&quickforward=yes&handlekey=ls";

    // One shared generator: a new Random per call can repeat the same value when
    // several instances are created within the same clock tick.
    private static readonly Random SleepRandom = new Random();

    // Patterns are compiled once instead of on every parsed page.
    private static readonly Regex ResultItemRegex = new Regex(@"<li class=""pbw"" id=""[\d]*"">[\s\S]+?</li>");
    // The <h3> element carries both the title text and the link.
    private static readonly Regex HeadingRegex = new Regex(@"<h3[\s\S]+?</h3>");
    // Reply count (R) and view count (V).
    private static readonly Regex ReplyViewRegex = new Regex(@"<p class=""xg1"">(?<R>[\d]+)[\s]+个回复[\s]+-[\s]+(?<V>[\d]+)[\s]+次查看</p>");
    // Summary paragraph (T) that follows the reply/view line.
    private static readonly Regex SummaryRegex = new Regex(@"次查看</p>[\s]*<p>(?<T>[\s\S]+?)</p>");
    // Post time (T) and author (A). The author group is lazy so it stops at the
    // first closing span instead of running on to the last one in the entry.
    private static readonly Regex TimeAuthorRegex = new Regex(@"<p>[\s]+<span>(?<T>[\s\S]+?)</span>[\s]+-[\s]+<span>(?<A>[\s\S]+?)</span>");
    // Value (U) of the first href attribute.
    private static readonly Regex HrefRegex = new Regex(@"href=""(?<U>.*?)""");

    /// <summary>
    /// Sets the search id, the display name and the cookie-based download method.
    /// </summary>
    public viayunBBS()
    {
        this.SearchID = 1039;
        this.SearchName = "行云社";
        this.Methord = downWebFileMethod.ByCookieCollection;
    }

    /// <summary>
    /// Builds the POST data that simulates a login and stores the cookie
    /// collection returned by the login request, valid for four hours.
    /// </summary>
    protected override void getCookieCollection()
    {
        DownWebFile dw = new DownWebFile();
        this.Collection = dw.GetCookie(LoginUrl, CookieLoginPostData, Encoding.GetEncoding("utf-8"), "http://bbs.viayun.com/");
        this.CookieDomain = "http://bbs.viayun.com";
        this.dtCookieExpires = DateTime.Now.AddHours(4);
    }

    /// <summary>
    /// Creates the crawl parameters (keyword, day range, page size, position) from a task.
    /// </summary>
    /// <param name="taskItem">Task that supplies the keyword and limits.</param>
    protected override void initCrawlerModel(IwomTask taskItem)
    {
        crawlerModel = new CrawlerModel();
        crawlerModel.Keyword = CommonFunction.AssembledKeyword(taskItem.KeyWord, IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal, true); // keyword
        crawlerModel.GetDays = taskItem.GetDays;
        crawlerModel.PageSize = taskItem.GetItems > 30 ? 30 : taskItem.GetItems; // page size, capped at 30
        crawlerModel.Postion = taskItem.Task_Postion;
    }

    /// <summary>
    /// Returns the URL to crawl.
    /// </summary>
    /// <param name="pageIndex">Requested page index (currently unused).</param>
    /// <returns>The fixed search-result URL.</returns>
    protected override string createUrl(int pageIndex)
    {
        // NOTE(review): neither the task keyword nor pageIndex is used; the URL is a
        // fixed sample query (kw=aa, searchid=5).
        return SearchUrl;
    }

    /// <summary>
    /// Pauses between page downloads for a random 6 to 18 seconds.
    /// </summary>
    protected override void PageSleep()
    {
        int sleepTime;
        lock (SleepRandom)
        {
            // Random is not thread-safe, so access to the shared instance is serialized.
            sleepTime = SleepRandom.Next(6000, 18000);
        }
        Thread.Sleep(sleepTime);
    }

    /// <summary>
    /// Returns the encoding of the crawled pages.
    /// </summary>
    /// <returns>UTF-8.</returns>
    protected override Encoding getPageEncoding()
    {
        return Encoding.UTF8;
    }

    /// <summary>
    /// Checks the page content for the CAPTCHA prompt shown to blocked clients.
    /// </summary>
    /// <param name="HTMLContent">Downloaded page HTML.</param>
    /// <returns>false when the CAPTCHA prompt is present, true otherwise.</returns>
    protected override bool checkContentIsForbat(string HTMLContent)
    {
        // NOTE(review): the method name suggests "true = blocked", yet the CAPTCHA
        // page yields false. Left unchanged because the base-class contract is not
        // visible here - confirm against the caller.
        return HTMLContent.IndexOf("要继续访问，请输入下图中的四位验证码", StringComparison.Ordinal) < 0;
    }

    /// <summary>
    /// Checks the page content for the "no results found" message.
    /// </summary>
    /// <param name="HTMLContent">Downloaded page HTML.</param>
    /// <returns>true when the message is present, including at the very start of the content.</returns>
    protected override bool checkContentIsLastPage(string HTMLContent)
    {
        return HTMLContent.IndexOf("抱歉，没有找到与", StringComparison.Ordinal) >= 0;
    }

    /// <summary>
    /// Parses the search result page into a list of articles.
    /// </summary>
    /// <param name="HTMLContent">Page HTML supplied by the caller (currently replaced, see note).</param>
    /// <param name="task_ID">Task id stamped on every result.</param>
    /// <returns>One entry per result whose link starts with "http://"; empty when nothing matches.</returns>
    protected override List<CrawlerResult> GetArticleByHtml(string HTMLContent, int task_ID)
    {
        // NOTE(review): the content passed in is discarded and the page is fetched
        // again through the class's own login + GET. Kept as in the original.
        HTMLContent = getPageContent("");

        List<CrawlerResult> results = new List<CrawlerResult>();
        foreach (Match entry in ResultItemRegex.Matches(HTMLContent))
        {
            string entryHtml = entry.Value;

            // Link and title both come from the <h3> element; skip entries without one.
            Match heading = HeadingRegex.Match(entryHtml);
            if (!heading.Success)
            {
                continue;
            }

            string link = GetURL(heading.Value);
            if (!link.StartsWith("http://", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            CrawlerResult item = new CrawlerResult();
            item.Task_ID = task_ID;
            item.Url = link;
            item.Title = CommonFunction.DeleteHTMLElement(heading.Value);
            item.SiteName = "xxx"; // media / site name

            Match timeAndAuthor = TimeAuthorRegex.Match(entryHtml);
            if (timeAndAuthor.Success)
            {
                // Rely on TryParse's result instead of comparing a culture-dependent
                // string rendering of DateTime.MinValue.
                DateTime created;
                if (DateTime.TryParse(timeAndAuthor.Groups["T"].Value, out created))
                {
                    item.CreateTime = created;
                }
                item.Author = CommonFunction.DeleteHTMLElement(timeAndAuthor.Groups["A"].Value);
            }

            Match counts = ReplyViewRegex.Match(entryHtml);
            if (counts.Success)
            {
                // TryParse leaves 0 in the out value when the text is not a valid int.
                int number;
                int.TryParse(counts.Groups["R"].Value, out number);
                item.ReplyCount = number;
                int.TryParse(counts.Groups["V"].Value, out number);
                item.ViewCount = number;
            }

            Match summary = SummaryRegex.Match(entryHtml);
            if (summary.Success)
            {
                item.Summary = CommonFunction.DeleteHTMLElement(summary.Groups["T"].Value);
            }

            item.FilterType = FilterType.FilterNo;
            results.Add(item);
        }
        return results;
    }

    /// <summary>
    /// Declares which fields this crawler extracts and returns the URL used for test tasks.
    /// </summary>
    /// <returns>The fixed search-result URL.</returns>
    protected override string initTestUrl()
    {
        this.HaseCreateTime = true;
        this.HasePageSize = 30;
        this.HaseAuthor = true;
        this.HaseSummary = true;
        return SearchUrl;
    }

    /// <summary>
    /// Extracts the value of the first href attribute from an HTML fragment.
    /// </summary>
    /// <param name="urlHtml">HTML fragment containing an anchor.</param>
    /// <returns>The trimmed href value, or an empty string when no href is present.</returns>
    private string GetURL(string urlHtml)
    {
        Match href = HrefRegex.Match(urlHtml);
        if (href.Success)
        {
            return href.Groups["U"].Value.Trim();
        }
        return "";
    }

    #region Page download
    /// <summary>
    /// Logs in with a POST request, then downloads a page with the login cookies.
    /// Both steps are best-effort: a failure is traced and does not throw.
    /// </summary>
    /// <param name="url">Page to download; the fixed search URL is used when null or empty.</param>
    /// <returns>The page HTML, or an empty string when the download fails.</returns>
    private string getPageContent(string url)
    {
        string target = string.IsNullOrEmpty(url) ? SearchUrl : url;
        CookieContainer cookies = new CookieContainer();

        try
        {
            PostLogin(cookies);
        }
        catch (Exception ex)
        {
            // The GET is still attempted without login cookies, as before.
            System.Diagnostics.Trace.WriteLine("viayunBBS: login POST failed: " + ex.Message);
        }

        try
        {
            return DownloadPage(target, cookies);
        }
        catch (Exception ex)
        {
            System.Diagnostics.Trace.WriteLine("viayunBBS: page GET failed: " + ex.Message);
            return "";
        }
    }

    /// <summary>Posts the login form; cookies set by the response are stored in <paramref name="cookies"/>.</summary>
    private static void PostLogin(CookieContainer cookies)
    {
        byte[] payload = Encoding.UTF8.GetBytes(PageLoginPostData);

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(LoginUrl);
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = payload.Length;
        request.Referer = "bbs.viayun.com";
        request.AllowAutoRedirect = false;
        request.CookieContainer = cookies;
        request.KeepAlive = true;

        using (Stream body = request.GetRequestStream())
        {
            body.Write(payload, 0, payload.Length);
        }

        // Only the cookies matter; the response body is not needed.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
        }
    }

    /// <summary>Downloads <paramref name="address"/> as UTF-8 text, sending the cookies held in <paramref name="cookies"/>.</summary>
    private static string DownloadPage(string address, CookieContainer cookies)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
        request.Method = "GET";
        request.KeepAlive = true;
        // The container already holds the cookies from the login response, so no
        // manual Cookie header is added.
        request.CookieContainer = cookies;
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
    #endregion
}
}
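// A small example of crawling a page after an HTTP POST login.
// (Source page footer: latest recommended article published on 2022-11-23 18:50:10.)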