C# Google 网络获取

最新推荐文章于 2024-05-06 22:23:44 发布

xingtianzhang2008

最新推荐文章于 2024-05-06 22:23:44 发布

阅读量647

点赞数

分类专栏： C# 文章标签： google c# 网络 regex string html

本文链接：https://blog.csdn.net/xingtianzhang2008/article/details/2087195

版权

C# 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

using System;
using System.Web;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using Boolue.Real;

namespace GoogleRefer
{
    /// <summary>
    /// Scrapes Google News (news.google.cn) search results for a keyword and appends
    /// every news item found to the text file <c>C:/aaa.txt</c>.
    /// </summary>
    /// <remarks>
    /// All work happens inside the constructor: the first result page is downloaded through
    /// <c>RealPage.Get</c>, the number of result pages is estimated from the pager links,
    /// and each page is then downloaded and parsed with regular expressions.
    /// </remarks>
    public class Google
    {
        // Search URL template; "|" is the placeholder that receives the URL-encoded keyword.
        private const string URLFormat = "http://news.google.cn/news?tab=vn&hl=zh-CN&ned=cn&scoring=n&q=|&ie=UTF-8&sa=N&start=0";

        // Patterns applied to a single result table (one news item).
        private static readonly Regex ReImage = new Regex(@"src=(.*?)\s");                        // image address
        private static readonly Regex ReURL = new Regex(@"href=""(.*?)""");                      // article URL
        private static readonly Regex ReCaption = new Regex(@"target=_blank>(.*?)</a>");          // headline
        private static readonly Regex ReFrom = new Regex(@"size=-1.*?>(.*?)-");                   // news source
        private static readonly Regex ReMark = new Regex(@"size=-1.*?size=-1>(.*?)</div>");       // summary
        private static readonly Regex ReTime = new Regex(@"<nobr>(.*?)</nobr>");                  // publication time

        // Patterns applied to a whole result page.
        private static readonly Regex ReList = new Regex(@"<table\s{1,10}border=0\s{1,10}valign=top.*?>(.*?)</table>");
        // The leading greedy ".*" makes the match end at the LAST "width=16" link on its line.
        private static readonly Regex RePage = new Regex(@".*width=16.*?>(.*?)</a>");

        // Helpers for cleaning captured text.
        private static readonly Regex ReTag = new Regex("<.*?>");
        private static readonly Regex ReNonDigit = new Regex(@"[^\d]");

        // Search URL for this instance's keyword (still carrying "start=0").
        // Kept per instance so that constructing a second Google object does not reuse
        // the URL of the first one.
        private string searchUrl;

        /// <summary>
        /// Runs the search for <paramref name="StrKey"/> and writes all parsed items to disk.
        /// </summary>
        /// <param name="StrKey">The keyword to search for.</param>
        /// <exception cref="InvalidOperationException">
        /// The page could not be downloaded, the search returned no results, or a downloaded
        /// page contained no result tables.
        /// </exception>
        public Google(string StrKey)
        {
            GetKey(StrKey);
        }

        /// <summary>
        /// Builds the search URL for the keyword, downloads the first result page,
        /// validates it and starts processing.
        /// </summary>
        private void GetKey(string keyword)
        {
            searchUrl = URLFormat.Replace("|", HttpUtility.UrlEncode(keyword, Encoding.UTF8));
            string html = RealPage.Get(searchUrl);
            if (string.IsNullOrEmpty(html))
            {
                // Nothing came back: the URL is wrong or the download failed.
                GetError("No content was returned for " + searchUrl);
            }
            if (html.IndexOf("找不到和您的查询") != -1)
            {
                // The page contains Google's "no results for your query" message.
                GetError("The search returned no results for " + searchUrl);
            }
            GetPageCount(html);
        }

        /// <summary>
        /// Processes the first page, works out how many result pages exist and processes them.
        /// </summary>
        /// <param name="HTML">HTML of the first result page.</param>
        private void GetPageCount(string HTML)
        {
            GetHtml(HTML, 0); // first page

            int pageCount;
            if (!TryReadPageCount(HTML, out pageCount) || pageCount <= 0)
            {
                // No pager on the page: the results fit on a single page, which is already done.
                return;
            }

            // A result page only links to a limited window of pages. While the highest linked
            // page is above the threshold, jump to that page and re-read the pager from there.
            // At most three probes are made (thresholds 9, 18 and 27).
            int[] thresholds = new int[] { 9, 18, 27 };
            foreach (int threshold in thresholds)
            {
                if (pageCount <= threshold)
                {
                    break;
                }
                string probeHtml = RealPage.Get(BuildPageUrl(pageCount * 10 - 10));
                pageCount = JudgePageCount(probeHtml, threshold + 1);
            }

            GetHtml("", pageCount);
        }

        /// <summary>
        /// Reads the page count from a probe page.
        /// </summary>
        /// <param name="HTML">HTML of the probe page.</param>
        /// <param name="CurrentPageCount">Value returned when the page has no "next page" link.</param>
        /// <returns>
        /// The number shown by the last pager link when the page contains a "next page" link
        /// and that number can be parsed; otherwise <paramref name="CurrentPageCount"/>.
        /// </returns>
        private int JudgePageCount(string HTML, int CurrentPageCount)
        {
            if (string.IsNullOrEmpty(HTML) || HTML.IndexOf("<b>下一页</b>") == -1)
            {
                return CurrentPageCount;
            }

            int pageCount;
            return TryReadPageCount(HTML, out pageCount) ? pageCount : CurrentPageCount;
        }

        /// <summary>
        /// Extracts the number shown by the last pager link of a result page.
        /// </summary>
        /// <returns><c>true</c> when a pager link was found and its text is an integer.</returns>
        private static bool TryReadPageCount(string html, out int pageCount)
        {
            pageCount = 0;
            Match match = RePage.Match(html);
            if (!match.Success)
            {
                return false;
            }
            return int.TryParse(ReTag.Replace(match.Groups[1].Value, ""), out pageCount);
        }

        /// <summary>
        /// Returns the search URL with its "start" offset replaced by <paramref name="start"/>.
        /// </summary>
        private string BuildPageUrl(int start)
        {
            return searchUrl.Replace("start=0", "start=" + start);
        }

        /// <summary>
        /// Parses the supplied page when <paramref name="PageCount"/> is 0; otherwise downloads
        /// and parses the pages at offsets 10, 20, ... up to <paramref name="PageCount"/> * 10.
        /// </summary>
        private void GetHtml(string HTML, int PageCount)
        {
            if (PageCount == 0)
            {
                GetList(HTML);
                return;
            }

            // NOTE(review): the first page (start=0) is handled separately, so the final
            // iteration (start = PageCount * 10) looks like it requests one page past the
            // last one - confirm against the live pager before changing the bound.
            for (int i = 1; i <= PageCount; i++)
            {
                GetList(RealPage.Get(BuildPageUrl(i * 10)));
            }
        }

        /// <summary>
        /// Splits a result page into its result tables and processes each one.
        /// </summary>
        /// <exception cref="InvalidOperationException">The page contains no result tables.</exception>
        private void GetList(string HTML)
        {
            if (string.IsNullOrEmpty(HTML))
            {
                GetError("The result page is empty.");
            }

            MatchCollection tables = ReList.Matches(HTML);
            if (tables.Count == 0)
            {
                // No data on this page.
                GetError("The result page contains no news items.");
            }
            foreach (Match table in tables)
            {
                GetItem(table.Groups[1].Value);
            }
        }

        /// <summary>
        /// Extracts the fields of one news item and appends them to <c>C:/aaa.txt</c>.
        /// </summary>
        /// <param name="Html">Inner HTML of one result table.</param>
        private void GetItem(string Html)
        {
            string image = ReImage.Match(Html).Groups[1].Value;
            string url = ReURL.Match(Html).Groups[1].Value;
            string caption = ReTag.Replace(ReCaption.Match(Html).Groups[1].Value, "");
            string from = ReTag.Replace(ReFrom.Match(Html).Groups[1].Value.Replace(" ", ""), "");
            string mark = ReTag.Replace(ReMark.Match(Html).Groups[1].Value, "").Replace(" ", "");
            string time = ReTime.Match(Html).Groups[1].Value;

            // Relative times ("N minutes ago" / "N hours ago") are converted to an absolute
            // local timestamp. Text without a parsable number is kept unchanged.
            int amount;
            if (time.IndexOf("分钟前") != -1)
            {
                if (int.TryParse(ReNonDigit.Replace(time, ""), out amount))
                {
                    time = DateTime.Now.AddMinutes(-amount).ToString();
                }
            }
            else if (time.IndexOf("小时前") != -1)
            {
                if (int.TryParse(ReNonDigit.Replace(time, ""), out amount))
                {
                    time = DateTime.Now.AddHours(-amount).ToString();
                }
            }

            // Append mode; the using block closes the file even if a write fails.
            using (System.IO.StreamWriter sw = new System.IO.StreamWriter(@"C:/aaa.txt", true, Encoding.Default))
            {
                sw.WriteLine("图片地址:" + image);
                sw.WriteLine("新闻网址:" + url);
                sw.WriteLine("新闻日期:" + time);
                sw.WriteLine("新闻来源:" + from);
                sw.WriteLine("新闻标题:" + caption);
                sw.WriteLine("新闻简介:" + mark);
                sw.WriteLine("\n\n");
            }
        }

        /// <summary>
        /// Reports an unrecoverable scraping failure.
        /// </summary>
        /// <exception cref="InvalidOperationException">Always thrown.</exception>
        private void GetError(string message)
        {
            throw new InvalidOperationException(message);
        }
    }
}

xingtianzhang2008

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
C# Google 网络获取

using System;using System.Web;using System.Collections.Generic;using System.Text;using System.Text.RegularExpressions;using Boolue.Real;namespace GoogleRefer{ public class Google { static
复制链接

扫一扫