C#winform抓取百度,Google搜索关键词结果

出于网站 SEO 的需要,做了一个采集百度和 Google 搜索关键字结果的小工具,在这里与大家分享一下。

先看下效果图:

代码附加:

 View Code

 
 1    private  void baidu_Click( object sender, EventArgs e)
 2         {
 3              int num =  100; // 搜索条数
 4              string url =  " http://www.baidu.com/s?wd= " + txtSearch.Text.Trim() +  " &rn= " + num +  "";
 5              string html = search(url,  " gb2312 ");
 6             BaiduSearch baidu =  new BaiduSearch();
 7              if (! string.IsNullOrEmpty(html))
 8             {
 9                  int count = baidu.GetSearchCount(html); // 搜索条数
10                  if (count >  0)
11                 {
12                     List<Keyword> keywords = baidu.GetKeywords(html, txtSearch.Text.Trim());
13                     dataGridView1.DataSource = keywords;
14                 }
15 
16             }
17         }
18 
/// <summary>
/// Click handler for the Google button: requests a Google (google.com.hk) result
/// page for the keyword typed into <c>txtSearch</c> and binds the parsed results
/// to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void google_Click(object sender, EventArgs e)
{
    const int num = 100; // number of results requested per page ("num" parameter)

    string word = txtSearch.Text.Trim();
    if (word.Length == 0)
    {
        // Nothing to search for; avoid issuing a pointless request.
        return;
    }

    // Escape the keyword so characters such as '&', '#', '+' or spaces cannot
    // break or truncate the query string.
    string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + Uri.EscapeDataString(word)
               + "&aq=f&aqi=&aql=&oq=&num=" + num;
    string html = search(url, "utf-8");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    googleSearch google = new googleSearch();
    dataGridView1.DataSource = google.GetKeywords(html, word);
}
/// <summary>
/// Issues an HTTP GET for <paramref name="url"/> and returns the response body,
/// decoded with the given charset and passed through <c>readResponseStream</c>.
/// </summary>
/// <param name="url">Absolute URL of the search page to fetch.</param>
/// <param name="Chareset">Name of the text encoding of the page, e.g. "gb2312" or "utf-8".</param>
/// <returns>
/// The processed page source, or an empty string when the request or the read fails.
/// Callers test the result with <c>string.IsNullOrEmpty</c>, so a failure is reported
/// as "no content" instead of handing the exception text to the HTML parsers.
/// </returns>
public string search(string url, string Chareset)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    // NOTE(review): UseDefaultCredentials offers the current Windows credentials to the
    // remote server if it challenges for them; confirm this is really wanted for
    // public search engines.
    request.UseDefaultCredentials = true;
    request.ContentType = "text/html";
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
    request.Method = "GET";
    request.CookieContainer = new CookieContainer();

    try
    {
        // Dispose the response so the underlying connection is released back to the pool.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            return readResponseStream(response, Chareset);
        }
    }
    catch (Exception ex)
    {
        // Kept as a catch-all so the method stays non-throwing for callers, as before;
        // the failure is traced instead of being returned as if it were page content.
        System.Diagnostics.Trace.WriteLine("search failed for " + url + ": " + ex);
        return string.Empty;
    }
}
/// <summary>
/// Reads the complete body of <paramref name="response"/> as text and returns it
/// after running it through <c>formatHTML</c>.
/// </summary>
/// <param name="response">Response whose body stream is read to the end.</param>
/// <param name="Chareset">Name of the encoding used to decode the body.</param>
/// <returns>The decoded and formatted page source.</returns>
public string readResponseStream(HttpWebResponse response, string Chareset)
{
    Stream body = response.GetResponseStream();
    Encoding charset = Encoding.GetEncoding(Chareset);

    using (StreamReader reader = new StreamReader(body, charset))
    {
        string raw = reader.ReadToEnd();
        return formatHTML(raw);
    }
}
/// <summary>
/// Cleans up raw page source before it is parsed: removes the entities
/// <c>&amp;raquo;</c>, <c>&amp;nbsp;</c> and <c>&amp;copy;</c>, removes carriage returns,
/// tabs and line feeds, and finally turns <c>&amp;amp;</c> into a plain ampersand.
/// </summary>
/// <param name="htmlContent">Raw page source; may be null or empty.</param>
/// <returns>The cleaned source, or an empty string for null/empty input.</returns>
public string formatHTML(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    // The control characters must be the escapes "\r", "\t" and "\n". Replacing the
    // literal texts "/r", "/t" and "/n" would eat parts of URLs such as "/news".
    return htmlContent
        .Replace("&raquo;", "")
        .Replace("&nbsp;", "")
        .Replace("&copy;", "")
        .Replace("\r", "")
        .Replace("\t", "")
        .Replace("\n", "")
        .Replace("&amp;", "&");
}

 把百度和Google两个类抽取了出来

1.百度Search类

 View Code

 
 1  class BaiduSearch
 2     {
 3          protected  string uri =  " http://www.baidu.com/s?wd= ";
 4          protected Encoding queryEncoding = Encoding.GetEncoding( " gb2312 ");
 5          protected Encoding pageEncoding = Encoding.GetEncoding( " gb2312 ");
 6          protected  string resultPattern =  @" (?<=找到相关结果[约]?)[0-9,]*?(?=个) ";
 7          public  int GetSearchCount( string html)
 8         {
 9              int result =  0;
10              string searchcount =  string.Empty;
11 
12             Regex regex =  new Regex(resultPattern);
13             Match match = regex.Match(html);
14 
15              if (match.Success)
16             {
17                 searchcount = match.Value;
18             }
19              else
20             {
21                 searchcount =  " 0 ";
22             }
23 
24              if (searchcount.IndexOf( " , ") >  0)
25             {
26                 searchcount = searchcount.Replace( " , "string.Empty);
27             }
28 
29              int.TryParse(searchcount,  out result);
30 
31              return result;
32         }
33 
34          public List<Keyword> GetKeywords( string html,  string word)
35         {
36              int i =  1;
37             List<Keyword> keywords =  new List<Keyword>();
38              string ss= " <h3 class=\"t\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a> ";
39             MatchCollection mcTable = Regex.Matches(html,ss);
40              foreach (Match mTable  in mcTable)
41             {
42                  if (mTable.Success)
43                 {
44                     Keyword keyword =  new Keyword();
45                     keyword.ID = i++;
46                     keyword.Title = Regex.Replace(mTable.Groups[ " content "].Value,  " <[^>]*> "string.Empty);
47                     keyword.Link = mTable.Groups[ " url "].Value;
48                     keywords.Add(keyword);
49 
50                 }
51             }
52 
53              return keywords;
54         }
55

2 .GoogleSearch类

 View Code

 
 1    class googleSearch
 2     {
 3 
 4          public List<Keyword> GetKeywords( string html,  string word)
 5         {
 6              int i =  1;
 7             List<Keyword> keywords =  new List<Keyword>();
 8 
 9             Regex regTable =  new Regex( " <h3 class=\"r\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a> ", RegexOptions.IgnoreCase);
10             Regex regA =  new Regex( @" (?is)<a/b[^>]*?href=(['""]?)(?<link>[^'""/s>]+)/1[^>]*>(?<title>.*?)</a> ", RegexOptions.IgnoreCase);
11 
12             MatchCollection mcTable = regTable.Matches(html);
13              foreach (Match mTable  in mcTable)
14             {
15                  if (mTable.Success)
16                 {
17                     Keyword keyword =  new Keyword();
18                     keyword.ID = i++;
19                     keyword.Title = Regex.Replace(mTable.Groups[ " content "].Value,  " <[^>]*> "string.Empty);
20                     keyword.Link = mTable.Groups[ " url "].Value;
21                     keywords.Add(keyword);
22                 }
23             }
24 
25              return keywords;
26         }
27

 对了,还有一个导出 Excel 的功能。这个大家应该都有自己的方法,我这里只简单写了一个 Excel 导出,也一并贴出来。

 1    public  void ExportDataGridViewToExcel(DataGridView dataGridview1)
 2         {
 3             SaveFileDialog saveFileDialog =  new SaveFileDialog();
 4             saveFileDialog.Filter =  " Execl  files  (*.xls)|*.xls ";
 5             saveFileDialog.FilterIndex =  0;
 6             saveFileDialog.RestoreDirectory =  true;
 7             saveFileDialog.CreatePrompt =  true;
 8             saveFileDialog.Title =  " 导出Excel文件 ";
 9 
10             DateTime now = DateTime.Now;
11             saveFileDialog.FileName = now.Year.ToString().PadLeft( 2) + now.Month.ToString().PadLeft( 2' 0 ') + now.Day.ToString().PadLeft( 2' 0 ') +  " - " + now.Hour.ToString().PadLeft( 2' 0 ') + now.Minute.ToString().PadLeft( 2' 0 ') + now.Second.ToString().PadLeft( 2' 0 ');
12             saveFileDialog.ShowDialog();
13 
14             Stream myStream;
15             myStream = saveFileDialog.OpenFile();
16             StreamWriter sw =  new StreamWriter(myStream, System.Text.Encoding.GetEncoding( " gb2312 "));
17              string str =  "";
18              try
19             {
20                  // 写标题      
21                  for ( int i =  0; i < dataGridview1.ColumnCount; i++)
22                 {
23                      if (i >  0)
24                     {
25                         str +=  " \t ";
26                     }
27                     str += dataGridview1.Columns[i].HeaderText;
28                 }
29                 sw.WriteLine(str);
30                  // 写内容   
31                  for ( int j =  0; j < dataGridview1.Rows.Count; j++)
32                 {
33                      string tempStr =  "";
34                      for ( int k =  0; k < dataGridview1.Columns.Count; k++)
35                     {
36                          if (k >  0)
37                         {
38                             tempStr +=  " \t ";
39                         }
40                         tempStr += dataGridview1.Rows[j].Cells[k].Value.ToString();
41                     }
42                     sw.WriteLine(tempStr);
43                 }
44                 sw.Close();
45                 myStream.Close();
46                 MessageBox.Show( " 导出成功 ");
47             }
48              catch (Exception e)
49             {
50                 MessageBox.Show(e.ToString());
51             }
52              finally
53             {
54                 sw.Close();
55                 myStream.Close();
56             }

57         } 

我把 HttpState 类也贴出来。有需要 demo 的可以发邮件给我,或者留下邮箱。

Httpstatus.cs 

 /// <summary>
 /// Mutable container for the data of one HTTP exchange: URLs, cookies, page
 /// source, captcha (verification code) details and a free-form parameter table.
 /// </summary>
 class HttpState
    {
        /// <summary>
        /// Creates a state object with an empty cookie container, an empty
        /// parameter table and the placeholder captcha file name "emptyPic.gif".
        /// </summary>
        public HttpState()
        {
            CookieContainer = new CookieContainer();
            TmpValCodeFileName = "emptyPic.gif";
            OtherParams = new Hashtable();
        }

        /// <summary>Status description.</summary>
        public string StatusDescription { get; set; }

        /// <summary>Callback URL; used by the login test.</summary>
        public string CallBackUrl { get; set; }

        /// <summary>Page URL in absolute form.</summary>
        public string Url { get; set; }

        /// <summary>Cookie information in string form.</summary>
        public string Cookies { get; set; }

        /// <summary>Cookie information as a container.</summary>
        public CookieContainer CookieContainer { get; set; }

        /// <summary>Page source.</summary>
        public string Html { get; set; }

        /// <summary>Temporary captcha image file (absolute path).</summary>
        public string TmpValCodePic { get; set; }

        /// <summary>Temporary captcha image file name (relative path).</summary>
        public string TmpValCodeFileName { get; set; }

        /// <summary>Whether a captcha is present.</summary>
        public bool IsValCode { get; set; }

        /// <summary>URL of the captcha image.</summary>
        public string ValCodeURL { get; set; }

        /// <summary>Captcha value after recognition.</summary>
        public string ValCodeValue { get; set; }

        /// <summary>Additional parameters.</summary>
        public Hashtable OtherParams { get; set; }

        /// <summary>
        /// Stores <paramref name="value"/> under <paramref name="key"/>, replacing
        /// any value already stored for that key.
        /// </summary>
        public void addOtherParam(object key, object value)
        {
            // The indexer adds the entry when the key is new and overwrites it otherwise.
            OtherParams[key] = value;
        }

        /// <summary>Removes the entry stored under <paramref name="key"/>, if any.</summary>
        public void removeOtherParam(object key)
        {
            OtherParams.Remove(key);
        }

        /// <summary>
        /// Returns the value stored under <paramref name="key"/>, or null when there is none.
        /// </summary>
        public object getOtherParam(object key)
        {
            return OtherParams[key];
        }
    }

  

 KeyWord.cs

 

 

 /// <summary>
 /// One search result row as produced by the GetKeywords parsers and shown in the grid.
 /// </summary>
 class Keyword
    {
        /// <summary>Sequence number of the result; the parsers assign it starting from 1.</summary>
        public int ID { get; set; }
        /// <summary>Result title: the anchor content with HTML tags removed.</summary>
        public string Title { get; set; }
        /// <summary>Result link: the href value captured from the anchor.</summary>
        public string Link { get; set; }
    }

  鉴于大家都需要 demo,今天整理了一下发上来,并添加了导出 Word、导出 Excel 的功能。不过没找到怎么把文件放上来,有需要的可以发邮件给我。

 

 

 

 

 

 

 

 

 

转载于:https://www.cnblogs.com/liguanghui/archive/2011/11/07/2239161.html

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值