获取搜狗搜索引擎在添加任意关键词后的 HTML 源码(C#)

(一直在不断测试,希望有大神指点如何获取搜狗微信文章。)

class Program
    {
        // Business-layer object used by Main to load the keyword list (GetList).
        private static ManageKeywordBll mkBll = new ManageKeywordBll();
        // Business-layer object used by Main to store each parsed article (Add).
        private static WeixinquanMessageBll wqBll = new WeixinquanMessageBll();
        // HTTP helper used for every request in this class (getHtml, getImage,
        // PostAndGetResult); it also exposes the Cookie / MyCookieContainer state.
        private static CookieHelp ch = new CookieHelp();
        // User-agent string most recently picked by RandomUserAgent.
        private static string userAgentLiShi = "";
        private static string result = "";// HTML of the last search page fetched by GetHtmlByYzm
        private static string[] kw;// keyword(s) of the search being processed; assigned in GetTitleContent
        private static string id = "";// id returned by the captcha check; sent as the value of the SNUID cookie
        
        /// <summary>
        /// Entry point. Loads every keyword configured for module "6,", runs a Sogou
        /// WeChat search for each one, parses the result page and stores the
        /// articles that GetTitleContent returns.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            ManageKeyword model = new ManageKeyword();
            model.Module = "6,";
            List<ManageKeyword> list = mkBll.GetList(model);
            if (list == null)
            {
                // Nothing configured: fall through to the final "OK" prompt.
                list = new List<ManageKeyword>();
            }

            foreach (ManageKeyword data in list)
            {
                if (string.IsNullOrEmpty(data.KeyWord))
                {
                    continue;
                }

                // A '*' in the stored keyword separates two search terms.
                string keyword = data.KeyWord.Replace("*", " ");

                // The keyword is escaped so that characters such as '&', '#', '+' or '%'
                // cannot terminate or corrupt the query parameter.
                // NOTE(review): "page=(*)" is kept exactly as in the original; it looks
                // like a placeholder for a page number - confirm how it is meant to be filled.
                string url = "http://weixin.sogou.com/weixin?"
                    + "query=" + Uri.EscapeDataString(keyword) + "&"
                    + "_sug_type_=1&"
                    + "sut=0&"
                    + "sourceid=inttime_all&" // inttime_day = today only, inttime_all = all time
                    + "ri=0&"
                    + "_sug_=n&"
                    + "type=2&" // original author's note: type=2 "WeChat id", type=1 "official account"
                    + "ie=utf-8&"
                    + "interation=&"
                    + "tsn=0&" // 0 = all time, 1 = within one day, 2 = within one week
                    + "page=(*)";

                // Handling the entries of one result page takes a while. By then the cookie
                // container may hold a different SUID than the one issued with the search
                // page, which invalidates the article links. The cookie string captured at
                // search time is therefore passed along explicitly.
                string cookieAll = GetHtmlByYzm(url, keyword);
                List<WeixinquanMessage> listWeixin = GetTitleContent(result, keyword, cookieAll);
                foreach (WeixinquanMessage wm in listWeixin)
                {
                    // Store the keyword in its original form (with '*'), not the search form.
                    wm.KeyWord = data.KeyWord;
                    wqBll.Add(wm);
                }
            }

            Console.WriteLine("OK");
            Console.ReadKey();
        }

        /// <summary>
        /// Converts a date/time into the time elapsed since the Unix epoch
        /// (1970-01-01 00:00:00 UTC).
        /// </summary>
        /// <param name="flag">
        /// true to return whole milliseconds; false to return 100-nanosecond ticks.
        /// </param>
        /// <param name="time">
        /// The moment to convert. Local and UTC values are both handled; a value of
        /// unspecified kind is treated as local time.
        /// </param>
        /// <returns>Elapsed milliseconds or ticks since the epoch, depending on <paramref name="flag"/>.</returns>
        private static long ConvertDateTimeToInt(bool flag, System.DateTime time)
        {
            // Both operands are taken in UTC. Subtracting a "local epoch" from a local
            // time is off by one hour whenever the daylight-saving offset of `time`
            // differs from the offset in force on 1 January 1970, and is wrong by the
            // whole zone offset for UTC inputs.
            System.DateTime epochUtc = new System.DateTime(1970, 1, 1, 0, 0, 0, System.DateTimeKind.Utc);
            long elapsedTicks = time.ToUniversalTime().Ticks - epochUtc.Ticks;

            return flag ? elapsedTicks / System.TimeSpan.TicksPerMillisecond : elapsedTicks;
        }

        /// <summary>
        /// Converts a Unix timestamp given in seconds into a local date/time.
        /// </summary>
        /// <param name="timeStamp">
        /// Decimal number of seconds since 1970-01-01 00:00:00 UTC. A null or empty
        /// string is treated as 0.
        /// </param>
        /// <returns>The corresponding moment expressed in local time.</returns>
        /// <exception cref="System.FormatException">The text is not a valid integer.</exception>
        private static DateTime ConvertStringToDateTime(string timeStamp)
        {
            DateTime epochUtc = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);

            // A missing timestamp (for example a failed regex match upstream) maps to
            // the epoch, as it always did.
            long seconds = string.IsNullOrEmpty(timeStamp)
                ? 0L
                : long.Parse(timeStamp, System.Globalization.CultureInfo.InvariantCulture);

            // Add in UTC first and convert afterwards, so the daylight-saving offset of
            // the resulting date is applied rather than the offset of 1 January 1970.
            return epochUtc.AddSeconds(seconds).ToLocalTime();
        }

        // One generator for the whole run: separate Random instances created in quick
        // succession can start from the same time-based seed and repeat their picks.
        private static readonly Random userAgentRandom = new Random();

        /// <summary>
        /// Picks a random entry from UserAgentList.userAgentShuZu and stores it in
        /// userAgentLiShi. Leaves userAgentLiShi unchanged when the list is missing
        /// or empty.
        /// </summary>
        private static void RandomUserAgent()
        {
            var candidates = UserAgentList.userAgentShuZu;
            if (candidates == null || candidates.Length == 0)
            {
                return;
            }

            userAgentLiShi = candidates[userAgentRandom.Next(candidates.Length)];
        }

        /// <summary>
        /// Solves the anti-spider captcha and returns the id contained in the
        /// server's success response. Up to ten captchas are attempted; the cookie
        /// state held by the HTTP helper is reset before each attempt.
        /// </summary>
        /// <param name="keywords">Search keyword that is echoed back in the unblock request.</param>
        /// <returns>
        /// The id value from the success response, or an empty string when no attempt
        /// succeeded or the response did not contain an id.
        /// </returns>
        private static string GetSNUID(string keywords)
        {
            const int maxAttempts = 10;
            const string successMarker = "解封成功,正在为您跳转来源地址...";

            string response = "";
            bool unblocked = false;

            for (int attempt = 0; attempt < maxAttempts && !unblocked; attempt++)
            {
                // Start every attempt without any previously collected cookies.
                ch.MyCookieContainer = new CookieContainer();
                ch.Cookie = "";

                // The captcha URL carries the current Unix time in seconds.
                long tc = ConvertDateTimeToInt(true, DateTime.Now) / 1000;
                string imageUrl = "http://weixin.sogou.com/antispider/util/seccode.php?tc=" + tc.ToString() + "";

                string captchaCode;
                // The downloaded image is only needed while the code is recognised;
                // dispose it afterwards instead of leaking one image per attempt.
                using (Image image = ch.getImage(imageUrl, "UTF-8"))
                {
                    CodeFactory cf = new CodeFactory(CodeType.YDM);
                    CodeModel cm = cf.GetCode(image);
                    captchaCode = cm.Code;
                }

                string url = "http://weixin.sogou.com/antispider/thank.php";
                string postdata = "c=" + captchaCode + "&r=/weixin?type=2&query=" + keywords + "&ie=utf8&_sug_=n&_sug_type_=&v=5";
                string referer = HttpUtility.UrlEncode("http://weixin.sogou.com/antispider/?from=/weixin?type=2&query=" + keywords + "&ie=utf8&_sug_=n&_sug_type_=");
                response = ch.PostAndGetResult(url, "UTF-8", postdata, "", "", referer) ?? "";

                unblocked = response.IndexOf(successMarker, StringComparison.Ordinal) > -1;
            }

            if (!unblocked)
            {
                // All attempts failed: there is no id to extract from a failure response.
                return "";
            }

            int idAt = response.IndexOf("id", StringComparison.Ordinal);
            if (idAt < 0)
            {
                return "";
            }

            // From the "id" key onwards, drop quotes, colons and spaces; what is left is
            // "id" + value + one trailing character, so cut two characters at the front
            // and one at the end.
            string cleaned = response.Substring(idAt).Replace("\"", "").Replace(":", "").Replace(" ", "");
            if (cleaned.Length < 3)
            {
                return "";
            }

            return cleaned.Substring(2, cleaned.Length - 3);
        }

        /// <summary>
        /// Fetches a search page and, when the server answers with its captcha page,
        /// retries with an unblock cookie set (at most ten rounds). The HTML of the
        /// last response is left in the static <c>result</c> field.
        /// </summary>
        /// <param name="urls">URL to fetch.</param>
        /// <param name="keywords">Search keyword; passed to GetSNUID when a captcha has to be solved.</param>
        /// <returns>The cookie string that was used for the last request.</returns>
        private static string GetHtmlByYzm(string urls,string keywords)
        {
            // NOTE(review): this first user-agent (leading tab, blanks around '/') is kept
            // exactly as in the original and differs from the one used for the retries -
            // confirm whether that is intentional.
            result = ch.getHtml(urls, "UTF-8", "", "", "", "\tMozilla / 5.0(Windows NT 10.0; WOW64; rv: 45.0) Gecko / 20100101 Firefox / 45.0");
            string cookies = ch.Cookie ?? "";

            for (int round = 0; round < 10; round++)
            {
                if (!IsCaptchaPage(result))
                {
                    break;
                }

                // Drop an existing SNUID entry (and everything after it) together with the
                // separator in front of it. Math.Max covers SNUID being the very first
                // cookie, where the original length computation became negative and threw.
                int snuidAt = cookies.IndexOf("SNUID", StringComparison.Ordinal);
                if (snuidAt > -1)
                {
                    cookies = cookies.Substring(0, Math.Max(0, snuidAt - 1));
                }

                // Author's test result: once a captcha has appeared, seccodeRight=success and
                // refresh=1 are required, while the successCount cookie can be omitted.
                cookies += ";SUV=00677DCD6F11A86256FCD05375656513;seccodeRight=success;refresh=1;SNUID=" + id;
                result = ch.getHtml(urls, "UTF-8", cookies, "", "", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0");

                // Still blocked with the current id: solve a captcha to obtain a new one.
                // Author's observation: one id is good for roughly 4-5 requests.
                if (IsCaptchaPage(result))
                {
                    id = GetSNUID(keywords);
                }
            }

            return cookies;
        }

        /// <summary>
        /// Tells whether the given HTML is the "please enter the captcha" page.
        /// </summary>
        private static bool IsCaptchaPage(string html)
        {
            return html != null && html.IndexOf("请输入验证码", StringComparison.Ordinal) > -1;
        }

        /// <summary>
        /// Decodes the HTML character entities that appear in titles and article text
        /// (non-breaking space, quotes, apostrophe, curly quotes, em dash, angle
        /// brackets, middle dot, ellipsis and ampersand) back into plain characters.
        /// </summary>
        /// <param name="parm">Text that may contain HTML entities; may be null or empty.</param>
        /// <returns>The text with the listed entities decoded; null or empty input is returned unchanged.</returns>
        private static string ReplaceZhongWenBiaoDian(string parm)
        {
            if (string.IsNullOrEmpty(parm))
            {
                return parm;
            }

            // Entity names are stored without the leading ampersand and trailing semicolon
            // and assembled at run time. The lookup table was previously stored as full
            // entity text and got HTML-decoded in transit, which left it listing the plain
            // characters and broke the quote entry; this form cannot be damaged that way.
            // The ampersand entity is decoded last so that doubly-escaped text is only
            // unescaped once.
            string[] entityNames = { "nbsp", "quot", "#39", "apos", "ldquo", "rdquo", "mdash", "lt", "gt", "middot", "hellip", "amp" };
            string[] plainText = { " ", "\"", "'", "'", "“", "”", "—", "<", ">", "·", "…", "&" };

            for (int i = 0; i < entityNames.Length; i++)
            {
                parm = parm.Replace("&" + entityNames[i] + ";", plainText[i]);
            }

            return parm;
        }

        /// <summary>
        /// Simplifies an HTML fragment: removes all attributes (class, style, ...)
        /// from opening tags, turns h1-h6 headings into paragraphs and drops the
        /// resulting bare img and iframe tags.
        /// </summary>
        /// <param name="htmltag">HTML fragment to clean; may be null or empty.</param>
        /// <returns>The simplified HTML, or an empty string for null or empty input.</returns>
        private static string RegexReplaceHtmlTag(string htmltag)
        {
            if (string.IsNullOrEmpty(htmltag))
            {
                return "";
            }

            // Every opening tag that carries attributes is reduced to its bare tag name
            // in one pass; no tag name is ever spliced into a pattern.
            string cleaned = Regex.Replace(htmltag, @"<([a-z0-9]+) [^>]*?>", "<${1}>", RegexOptions.IgnoreCase);

            // Headings become paragraphs whether or not they had attributes and whatever
            // their letter case; previously only lower-case headings that carried
            // attributes were converted.
            cleaned = Regex.Replace(cleaned, @"<(/?)h[1-6]>", "<${1}p>", RegexOptions.IgnoreCase);

            // Without attributes an img or iframe tag carries no information.
            return Regex.Replace(cleaned, @"<img>|</?iframe>", "", RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Runs a case-insensitive regular expression against a text and returns
        /// part of the first match.
        /// </summary>
        /// <param name="source">Text to search (typically HTML source).</param>
        /// <param name="format">Regular-expression pattern.</param>
        /// <param name="isFrist">
        /// true to return the whole match (group 0); false to return the first
        /// capture group (group 1).
        /// </param>
        /// <returns>The selected group's text, or an empty string when nothing matched.</returns>
        private static string RegexPP(string source, string format, bool isFrist)
        {
            Match firstHit = Regex.Match(source, format, RegexOptions.IgnoreCase);
            int wantedGroup = isFrist ? 0 : 1;

            return firstHit.Groups[wantedGroup].Value;
        }

        /// <summary>
        /// Parses a Sogou WeChat search result page. For every result entry it reads
        /// title, link, publish time and account name, downloads the article page,
        /// extracts author, account id and content, and keeps the entry when the
        /// tracked keyword occurs in the title or in the text content.
        /// Also assigns the static <c>kw</c> field.
        /// </summary>
        /// <param name="str">HTML source of the search result page.</param>
        /// <param name="keywords">Search keyword(s); several terms are separated by a blank.</param>
        /// <param name="cookies">Cookie string to send when downloading each article page.</param>
        /// <returns>The list of WeixinquanMessage objects that matched; empty when none did.</returns>
        private static List<WeixinquanMessage> GetTitleContent(string str,string keywords,string cookies)
        {
            const string browserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0";
            const string contentPattern = @"<div class=""rich_media_content ""[^>]*?>([\s|\S]*?)</div>";
            const string accountPattern = @"<div class=""s-p"" [^>]*?>([\s|\S]*?)</a>";

            // These remove the literal two-character sequences backslash + r / t / n,
            // exactly as before; real line breaks are removed per entry further down.
            str = (str ?? "").Replace("\\r", "").Replace("\\t", "").Replace("\\n", "");
            List<WeixinquanMessage> list = new List<WeixinquanMessage>();

            // A blank in the keyword (a '*' in the stored form) separates several terms.
            kw = keywords.Contains(" ") ? keywords.Split(' ') : new string[] { keywords };

            // One generator for all entries instead of a new one per entry.
            Random fallbackIdRandom = new Random();

            // Each search result is a "wx-rb wx-rb3" div closed by three </div> tags.
            Regex regexSource = new Regex(@"<div class=""wx-rb wx-rb3"" [^>]*?>(\s|\S)+?(</div>\s*?){3}", RegexOptions.IgnoreCase);
            foreach (Match entry in regexSource.Matches(str))
            {
                string entryHtml = entry.Groups[0].Value.Replace("\n", "");
                WeixinquanMessage wm = new WeixinquanMessage();

                // Title block: holds the link and the title text.
                string titleBlock = RegexPP(entryHtml, @"<div class=""txt-box""><h4>([\s|\S]*?)</h4>", false);
                string relativeUrl = RegexPP(titleBlock, @"<a [^>]*? href=""([\s\S]*?)""[^>]*?>", false).Replace("amp;", "");
                if (relativeUrl.Length == 0)
                {
                    // No article link could be parsed. Previously the site root was fetched
                    // instead and a record with a meaningless URL could be stored.
                    continue;
                }
                wm.Url = "http://weixin.sogou.com" + relativeUrl;
                wm.WeixinTitle = ReplaceZhongWenBiaoDian(Regex.Replace(titleBlock, @"<[\s\S]*?>", "", RegexOptions.IgnoreCase));

                bool isKeyWord = ContainsTrackedKeyword(wm.WeixinTitle);

                // Publish time: the "t" attribute of the "s-p" div, a Unix time in seconds.
                string accountBlock = RegexPP(entryHtml, accountPattern, true);
                string publishTime = RegexPP(accountBlock, @"<div class=""s-p"" t=""([\d]*?)""[^>]*?>", false);
                wm.PublishTime = ConvertStringToDateTime(publishTime);

                // Account display name: the title attribute of the link inside the same div.
                string accountInner = RegexPP(entryHtml, accountPattern, false);
                wm.UserName = RegexPP(accountInner, @"<a [^>]*? title=""([\s|\S]*?)""[^>]*?>", false);

                // Download the article page with the cookies captured at search time.
                string allContent = ch.getHtml(wm.Url, "utf-8", cookies, "", "", browserAgent);

                // Rebuild a permanent article URL from the script variables of the page.
                string biz = ExtractScriptVariable(allContent, "biz");
                string sn = ExtractScriptVariable(allContent, "sn");
                string mid = ExtractScriptVariable(allContent, "mid");
                string idx = ExtractScriptVariable(allContent, "idx");
                wm.Url = "http://mp.weixin.qq.com/s?__" + biz + "&" + mid + "&" + idx + "&" + sn + "&3rd=MzA3MDU4NTYzMw==&scene=6#rd";

                string headTitle = RegexPP(allContent, @"<div class=""rich_media_meta_list"">(\s|\S)+?(</div>\s*?){2}", true);

                // Author. The pattern now ends at the closing </em>; it used to end at a
                // second opening <em>, so it matched only if another bare <em> followed.
                wm.Author = RegexPP(headTitle, @"<em class=""rich_media_meta rich_media_meta_text"">([\s|\S]*?)</em>", false);
                if (String.IsNullOrEmpty(wm.Author))
                {
                    wm.Author = "空";
                }

                // WeChat account id; when the page shows none, generate a substitute from
                // the display name, a timestamp and a random offset.
                wm.UserId = RegexPP(headTitle, @"<span class=""profile_meta_value"">([\s|\S]*?)</span>", false);
                if (String.IsNullOrEmpty(wm.UserId))
                {
                    int random = fallbackIdRandom.Next(0, 1000);
                    // "fff" is the millisecond specifier; the former "ms" appended minute
                    // and second a second time.
                    string stamp = DateTime.Now.ToString("yyyyMMddHHmmssfff", System.Globalization.CultureInfo.InvariantCulture);
                    wm.UserId = wm.UserName + (Convert.ToInt64(stamp) + random).ToString();
                }

                // Plain-text content: inner HTML of the content div with all tags removed.
                string txt = RegexPP(allContent, contentPattern, false);
                wm.WeixinViewContent = Regex.Replace(txt, @"<[\s\S]*?>", "", RegexOptions.IgnoreCase);
                wm.WeixinViewContent = ReplaceZhongWenBiaoDian(wm.WeixinViewContent.Replace("\\r", "").Replace("\\n", "").Replace("\\t", "").Replace(" ", ""));

                // HTML content: the content div itself, reduced to attribute-free markup.
                string section = RegexPP(allContent, contentPattern, true);
                wm.WeixinAllContent = RegexReplaceHtmlTag(section);
                wm.WeixinAllContent = ReplaceZhongWenBiaoDian(wm.WeixinAllContent.Replace("\\r", "").Replace("\\n", "").Replace("\\t", "").Replace(" ", ""));

                if (String.IsNullOrEmpty(wm.WeixinViewContent))
                {
                    wm.WeixinViewContent = "文章内容为空或为图片格式。";
                }

                // Only when the title did not match is the text content consulted.
                if (!isKeyWord)
                {
                    isKeyWord = ContainsTrackedKeyword(wm.WeixinViewContent);
                }

                // Keyword found neither in the title nor in the content: drop the entry.
                if (!isKeyWord)
                {
                    continue;
                }

                list.Add(wm);
            }

            return list;
        }

        /// <summary>
        /// Tells whether the text contains the keyword that decides if an article is kept.
        /// </summary>
        private static bool ContainsTrackedKeyword(string text)
        {
            // NOTE(review): with several terms only the second one (kw[1]) is checked and
            // the first is ignored. This reproduces the original behaviour; confirm whether
            // "all terms" or "any term" was intended before changing it.
            string needle = kw.Length > 1 ? kw[1] : kw[0];
            return text.IndexOf(needle, StringComparison.Ordinal) != -1;
        }

        /// <summary>
        /// Finds the script statement "var NAME = ...;" in the page and returns it with
        /// "||", quotes, blanks, the word "var" and the semicolon removed, which leaves
        /// text of the form NAME=value.
        /// </summary>
        private static string ExtractScriptVariable(string html, string name)
        {
            return RegexPP(html, @"var " + name + @" = (\s|\S)+?;", true)
                .Replace("||", "").Replace("\"", "").Replace(" ", "").Replace("var", "").Replace(";", "");
        }
    }





  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值