post抓取小例

最新推荐文章于 2022-11-23 18:50:10 发布
ArvinStudy
最新推荐文章于 2022-11-23 18:50:10 发布
阅读量1.1k
点赞数
分类专栏：抓取 文章标签：regex string exception stream random url
本文链接：https://blog.csdn.net/ArvinStudy/article/details/7890132
版权
抓取 专栏收录该内容
70 篇文章 0 订阅
订阅专栏
using System;
using System.Collections.Generic;
using System.Text;
using System.Threading;
using System.Text.RegularExpressions;
using IWOMWebCrawlerDbLayer.Model;
using IWOMWebCrawlerDbLayer.Common;
using System.Net;
using System.IO;
namespace IWOMWebCrawlerApp.Crawler
{
   public class viayunBBS: AbstractLoginSearchEngine
    {
        //帐号 xxxx  密码：xxxx

       /// <summary>
       /// Sets this crawler's search id, display name and download method.
       /// </summary>
       public viayunBBS()
        {
            SearchID = 1039;
            SearchName = "行云社";
            // Presumably makes the base class download pages with the cookies
            // obtained in getCookieCollection() — confirm against the base class.
            Methord = downWebFileMethod.ByCookieCollection;
        }
        /// <summary>
        /// Builds the login POST data, submits it through DownWebFile.GetCookie and
        /// stores the returned value in Collection together with the cookie domain
        /// and an expiry time four hours from now.
        /// </summary>
        protected override void getCookieCollection()
        {
            // NOTE(review): the account name and password are embedded in source;
            // consider moving them to configuration.
            const string loginUrl = "http://bbs.viayun.com/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1";
            const string loginForm = "username=ababs&password=19881014&quickforward=yes&handlekey=ls";
            const string referer = "http://bbs.viayun.com/";

            DownWebFile downloader = new DownWebFile();
            Encoding formEncoding = Encoding.GetEncoding("utf-8");
            Collection = downloader.GetCookie(loginUrl, loginForm, formEncoding, referer);

            CookieDomain = "http://bbs.viayun.com";
            dtCookieExpires = DateTime.Now.AddHours(4);
        }
        /// <summary>
        /// Creates a fresh crawlerModel and fills it from the task: keyword,
        /// day range, page size and position.
        /// </summary>
        /// <param name="taskItem">Task whose settings are copied into the model.</param>
        protected override void initCrawlerModel(IwomTask taskItem)
        {
            crawlerModel = new CrawlerModel();

            // Keyword assembled with the "Normal" URL-encoding mode.
            crawlerModel.Keyword = CommonFunction.AssembledKeyword(
                taskItem.KeyWord,
                IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal,
                true);
            crawlerModel.GetDays = taskItem.GetDays;

            // Page size is the requested item count, capped at 30.
            if (taskItem.GetItems > 30)
            {
                crawlerModel.PageSize = 30;
            }
            else
            {
                crawlerModel.PageSize = taskItem.GetItems;
            }

            crawlerModel.Postion = taskItem.Task_Postion;
        }
        /// <summary>
        /// Returns the search-result URL to crawl.
        /// </summary>
        /// <param name="pageIndex">
        /// Requested page number. NOTE(review): currently unused — the same fixed URL
        /// (keyword "aa") is returned for every page; confirm this is intended.
        /// </param>
        /// <returns>The fixed forum search URL.</returns>
        protected override string createUrl(int pageIndex)
        {
            return "http://bbs.viayun.com/search.php?mod=forum&searchid=5&orderby=lastpost&ascdesc=desc&searchsubmit=yes&kw=aa";
        }
        // One generator shared by every call and every instance. Creating a new
        // Random per call seeds it from the clock on .NET Framework, so crawlers
        // that reach this point in the same tick would all draw the same delay.
        private static readonly Random viayunSleepRandom = new Random();

        /// <summary>
        /// Blocks the calling thread for a random interval of 6000 to 17999 ms
        /// between page downloads.
        /// </summary>
        protected override void PageSleep()
        {
            // The original note here referred to heavy banning on Sina Weibo; it was
            // apparently copied from another crawler. The delay throttles requests.
            int sleepTime;

            // Random is not thread-safe, so draws are serialized.
            lock (viayunSleepRandom)
            {
                sleepTime = viayunSleepRandom.Next(6000, 18000);
            }

            Thread.Sleep(sleepTime);
        }
        /// <summary>
        /// Text encoding of the site's pages.
        /// </summary>
        /// <returns>UTF-8.</returns>
        protected override Encoding getPageEncoding()
        {
            Encoding pageEncoding = Encoding.UTF8;
            return pageEncoding;
        }
        /// <summary>
        /// Looks for the site's "enter the four-digit verification code" prompt
        /// in the downloaded page.
        /// </summary>
        /// <param name="HTMLContent">Downloaded page HTML.</param>
        /// <returns>false when the prompt text is present; true otherwise.</returns>
        protected override bool checkContentIsForbat(string HTMLContent)
        {
            // NOTE(review): the method name suggests "true means blocked", yet true is
            // returned when the prompt is absent — confirm the base-class contract.
            int promptIndex = HTMLContent.IndexOf("要继续访问，请输入下图中的四位验证码");
            bool promptShown = promptIndex > -1;
            return !promptShown;
        }
        /// <summary>
        /// Decides whether the page is the last one by looking for the site's
        /// "sorry, nothing found matching ..." message.
        /// </summary>
        /// <param name="HTMLContent">Downloaded page HTML.</param>
        /// <returns>true when the message is present anywhere in the page; false otherwise.</returns>
        protected override bool checkContentIsLastPage(string HTMLContent)
        {
            // The marker is a fixed literal, so an ordinal search is used. The test is
            // ">= 0" rather than "> 0" so that a match at the very start also counts.
            int markerIndex = HTMLContent.IndexOf("抱歉，没有找到与", StringComparison.Ordinal);
            return markerIndex >= 0;
        }

        // Patterns are built once and reused across calls.
        // One search-result entry.
        private static readonly Regex viayunItemRegex = new Regex(@"<li class=""pbw"" id=""[\d]*"">[\s\S]+?</li>");
        // The entry's heading; supplies both the link and the title.
        private static readonly Regex viayunHeadingRegex = new Regex(@"<h3[\s\S]+?</h3>");
        // Reply count (R) and view count (V).
        private static readonly Regex viayunReplyViewRegex = new Regex(@"<p class=""xg1"">(?<R>[\d]+)[\s]+个回复[\s]+-[\s]+(?<V>[\d]+)[\s]+次查看</p>");
        // Summary paragraph (T) following the reply/view line.
        private static readonly Regex viayunSummaryRegex = new Regex(@"次查看</p>[\s]*<p>(?<T>[\s\S]+?)</p>");
        // Post time (T) and author (A). The author group is lazy so that it stops at
        // the first closing span instead of running on to the last one in the entry.
        private static readonly Regex viayunTimeAuthorRegex = new Regex(@"<p>[\s]+<span>(?<T>[\s\S]+?)</span>[\s]+-[\s]+<span>(?<A>[\s\S]+?)</span>");

        /// <summary>
        /// Parses a search-result page into a list of articles.
        /// </summary>
        /// <param name="HTMLContent">
        /// HTML of the search-result page. When null or empty, the page is fetched
        /// through getPageContent instead.
        /// </param>
        /// <param name="task_ID">Task id stamped on every returned item.</param>
        /// <returns>
        /// One item per result entry whose link starts with "http://"; an empty list
        /// when nothing matches.
        /// </returns>
        protected override List<CrawlerResult> GetArticleByHtml(string HTMLContent, int task_ID)
        {
            List<CrawlerResult> results = new List<CrawlerResult>();

            // Parse what the caller supplied. Previously the argument was always
            // overwritten by a fresh login and download, which made it meaningless.
            if (string.IsNullOrEmpty(HTMLContent))
            {
                HTMLContent = getPageContent("");
            }
            if (string.IsNullOrEmpty(HTMLContent))
            {
                return results;
            }

            foreach (Match entry in viayunItemRegex.Matches(HTMLContent))
            {
                string entryHtml = entry.Value;

                Match heading = viayunHeadingRegex.Match(entryHtml);
                if (!heading.Success)
                {
                    continue;
                }

                // Entries whose link is not an absolute http:// URL are skipped.
                string url = GetURL(heading.Value);
                if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                CrawlerResult item = new CrawlerResult();
                item.Task_ID = task_ID;
                item.Url = url;
                item.Title = CommonFunction.DeleteHTMLElement(heading.Value);
                item.SiteName = "xxx";

                Match timeAuthor = viayunTimeAuthorRegex.Match(entryHtml);
                if (timeAuthor.Success)
                {
                    // Use TryParse's own result; comparing the formatted default
                    // DateTime against a literal only worked under one culture.
                    DateTime created;
                    if (DateTime.TryParse(timeAuthor.Groups["T"].Value, out created))
                    {
                        item.CreateTime = created;
                    }
                    item.Author = CommonFunction.DeleteHTMLElement(timeAuthor.Groups["A"].Value);
                }

                Match replyView = viayunReplyViewRegex.Match(entryHtml);
                if (replyView.Success)
                {
                    // A failed parse leaves the count at 0.
                    int count;
                    int.TryParse(replyView.Groups["R"].Value, out count);
                    item.ReplyCount = count;
                    int.TryParse(replyView.Groups["V"].Value, out count);
                    item.ViewCount = count;
                }

                Match summary = viayunSummaryRegex.Match(entryHtml);
                if (summary.Success)
                {
                    item.Summary = CommonFunction.DeleteHTMLElement(summary.Groups["T"].Value);
                }

                item.FilterType = FilterType.FilterNo;
                results.Add(item);
            }

            return results;
        }


        /// <summary>
        /// Prepares a test task: sets the create-time, author and summary flags,
        /// sets the page size to 30, and returns the URL to test against.
        /// </summary>
        /// <returns>The fixed forum search URL.</returns>
        protected override string initTestUrl()
        {
            const string testUrl = "http://bbs.viayun.com/search.php?mod=forum&searchid=5&orderby=lastpost&ascdesc=desc&searchsubmit=yes&kw=aa";

            HaseCreateTime = true;
            HasePageSize = 30;
            HaseAuthor = true;
            HaseSummary = true;

            return testUrl;
        }
        /// <summary>
        /// Extracts the value of the first attribute whose name ends in "f"
        /// (in practice href) from an HTML fragment.
        /// </summary>
        /// <param name="urlHtml">HTML fragment containing the link.</param>
        /// <returns>The trimmed attribute value, or an empty string when none is found.</returns>
        private string GetURL(string urlHtml)
        {
            Match attribute = new Regex(@"f="".*?""").Match(urlHtml);
            if (!attribute.Success)
            {
                return "";
            }

            // The match looks like f="value": skip the three-character f=" prefix
            // and drop the closing quote.
            string matched = attribute.Value;
            int valueLength = matched.Length - 4;
            return matched.Substring(3, valueLength).Trim();
        }
        #region Page fetching
        // Login endpoint that receives the credential form.
        private const string viayunLoginUrl = "http://bbs.viayun.com/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1";
        // Search page fetched when the caller does not supply a URL.
        private const string viayunDefaultSearchUrl = "http://bbs.viayun.com/search.php?mod=forum&searchid=5&orderby=lastpost&ascdesc=desc&searchsubmit=yes&kw=aa";

        /// <summary>
        /// Logs in with a form POST, then downloads a page using the cookies the
        /// login produced. Both steps are best-effort: a failure is traced and the
        /// method carries on, so it never throws for network errors.
        /// </summary>
        /// <param name="url">
        /// Page to download. When null or empty, the fixed forum search URL is used.
        /// </param>
        /// <returns>The page text decoded as UTF-8, or an empty string when the download fails.</returns>
        private  string getPageContent(string url)
        {
            CookieContainer cookies = new CookieContainer();

            try
            {
                postLoginForm(cookies);
            }
            catch (Exception ex)
            {
                // The GET below is still attempted without login cookies.
                System.Diagnostics.Trace.TraceWarning("viayunBBS: login POST failed: " + ex.Message);
            }

            string target = string.IsNullOrEmpty(url) ? viayunDefaultSearchUrl : url;
            try
            {
                return downloadWithCookies(target, cookies);
            }
            catch (Exception ex)
            {
                System.Diagnostics.Trace.TraceWarning("viayunBBS: page GET failed: " + ex.Message);
                return "";
            }
        }

        // Submits the login form; cookies set by the response end up in the container.
        private static void postLoginForm(CookieContainer cookies)
        {
            // NOTE(review): placeholder credentials; a real account has to be supplied.
            string postdata = @"username=xxxxx&password=xxxxx&quickforward=yes&handlekey=ls";
            byte[] body = Encoding.UTF8.GetBytes(postdata);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(viayunLoginUrl);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = body.Length;
            request.Referer = "bbs.viayun.com";
            request.AllowAutoRedirect = false;
            request.CookieContainer = cookies;
            request.KeepAlive = true;

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }

            // Only the cookies matter; the response body is not needed.
            request.GetResponse().Close();
        }

        // Downloads one page with the given cookies and returns its text decoded as UTF-8.
        private static string downloadWithCookies(string url, CookieContainer cookies)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";
            request.KeepAlive = true;
            // The container supplies the Cookie header, so none is added by hand.
            request.CookieContainer = cookies;
            request.AllowAutoRedirect = false;

            using (WebResponse response = request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
        #endregion
    }
}