C#winform抓取百度,Google搜索关键词结果

最新推荐文章于 2025-09-12 10:15:49 发布

转载最新推荐文章于 2025-09-12 10:15:49 发布 · 2.7k 阅读

文章标签：

#c# #爬虫

本文介绍了一种用于抓取百度和Google搜索结果的方法，并实现了关键词及其链接的导出功能。

出于网站SEO的需要,做了一个采集百度和Google搜索关键字结果的小工具,在这里与大家分享一下。

先看下效果图

代码附加:

View Code

/// <summary>
/// Handles the Baidu button click: downloads the Baidu result page for the keyword
/// typed in <c>txtSearch</c> and binds the extracted results to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void baidu_Click(object sender, EventArgs e)
{
    int num = 100; // value sent as the "rn" query parameter (requested result count)
    string word = txtSearch.Text.Trim();

    // Percent-encode the keyword so that characters such as '&', '#', '+' or spaces
    // cannot truncate or corrupt the query string.
    string url = "http://www.baidu.com/s?wd=" + Uri.EscapeDataString(word) + "&rn=" + num;
    string html = search(url, "gb2312");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    BaiduSearch baidu = new BaiduSearch();
    int count = baidu.GetSearchCount(html); // hit count parsed from the page text
    if (count > 0)
    {
        List<Keyword> keywords = baidu.GetKeywords(html, word);
        dataGridView1.DataSource = keywords;
    }
}
18
/// <summary>
/// Handles the Google button click: downloads the Google result page for the keyword
/// typed in <c>txtSearch</c> and binds the extracted results to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void google_Click(object sender, EventArgs e)
{
    int num = 100; // value sent as the "num" query parameter (requested result count)
    string word = txtSearch.Text.Trim();

    // Percent-encode the keyword so that characters such as '&', '#', '+' or spaces
    // cannot truncate or corrupt the query string.
    string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + Uri.EscapeDataString(word)
        + "&aq=f&aqi=&aql=&oq=&num=" + num;
    string html = search(url, "utf-8");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    googleSearch google = new googleSearch();
    List<Keyword> keywords = google.GetKeywords(html, word);
    dataGridView1.DataSource = keywords;
}
/// <summary>
/// Requests <paramref name="url"/> with an HTTP GET and returns the page source
/// produced by <c>readResponseStream</c>.
/// </summary>
/// <param name="url">Address of the search page to request.</param>
/// <param name="Chareset">Name of the text encoding used to decode the response body.</param>
/// <returns>
/// The page source; if sending the request or reading the response throws, the
/// exception's <c>ToString()</c> text is returned instead.
/// </returns>
public string search(string url, string Chareset)
{
    HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);
    myHttpWebRequest.UseDefaultCredentials = true;
    myHttpWebRequest.ContentType = "text/html";
    myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
    myHttpWebRequest.Method = "GET";
    myHttpWebRequest.CookieContainer = new CookieContainer();

    try
    {
        // Dispose the response so the underlying connection is released
        // even when reading the body fails.
        using (HttpWebResponse response = (HttpWebResponse)myHttpWebRequest.GetResponse())
        {
            return readResponseStream(response, Chareset);
        }
    }
    catch (Exception ex)
    {
        // Kept for backward compatibility: failures are reported as text, not rethrown.
        return ex.ToString();
    }
}
/// <summary>
/// Reads the entire body of <paramref name="response"/> as text and returns it
/// after passing it through <c>formatHTML</c>.
/// </summary>
/// <param name="response">Response whose body stream is read to the end.</param>
/// <param name="Chareset">Name of the encoding used to decode the body.</param>
/// <returns>The decoded and formatted page source.</returns>
public string readResponseStream(HttpWebResponse response, string Chareset)
{
    Stream body = response.GetResponseStream();
    Encoding bodyEncoding = Encoding.GetEncoding(Chareset);
    using (StreamReader reader = new StreamReader(body, bodyEncoding))
    {
        string raw = reader.ReadToEnd();
        return formatHTML(raw);
    }
}
/// <summary>
/// Normalizes downloaded page source: drops the <c>&amp;raquo;</c>, <c>&amp;nbsp;</c> and
/// <c>&amp;copy;</c> entities, removes carriage returns, tabs and line feeds, and turns
/// <c>&amp;amp;</c> back into <c>&amp;</c>.
/// </summary>
/// <param name="htmlContent">Raw page source; <c>null</c> is treated as empty.</param>
/// <returns>The normalized page source.</returns>
public string formatHTML(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    // Entities are matched in their escaped form and control characters by their real
    // escape sequences; matching "/r", "/t" or "/n" would eat parts of URLs such as "/news".
    return htmlContent
        .Replace("&raquo;", "")
        .Replace("&nbsp;", "")
        .Replace("&copy;", "")
        .Replace("\r", "")
        .Replace("\t", "")
        .Replace("\n", "")
        .Replace("&amp;", "&");
}

把百度和Google两个类抽取了出来

1.百度Search类

View Code

1 class BaiduSearch
2     {
3         protected string uri = "http://www.baidu.com/s?wd=";
4         protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
5         protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");
6         protected string resultPattern = @"(?<=找到相关结果[约]?)[0-9,]*?(?=个)";
/// <summary>
/// Extracts the number matched by <c>resultPattern</c> from the page source.
/// </summary>
/// <param name="html">Page source to scan.</param>
/// <returns>The parsed number, or 0 when nothing matches or the text is not an integer.</returns>
public int GetSearchCount(string html)
{
    Match hit = Regex.Match(html, resultPattern);
    string digits = hit.Success ? hit.Value : "0";

    // The page groups digits with commas; strip them before parsing.
    if (digits.IndexOf(",") > 0)
    {
        digits = digits.Replace(",", string.Empty);
    }

    int total;
    int.TryParse(digits, out total); // leaves total at 0 when parsing fails
    return total;
}
33
/// <summary>
/// Collects one <c>Keyword</c> per <c>&lt;h3 class="t"&gt;</c> result link found in the page.
/// </summary>
/// <param name="html">Page source to scan.</param>
/// <param name="word">Search keyword; not used by this method.</param>
/// <returns>Entries numbered from 1 in page order, with tag-stripped titles and raw href values.</returns>
public List<Keyword> GetKeywords(string html, string word)
{
    List<Keyword> found = new List<Keyword>();
    MatchCollection hits = Regex.Matches(html, "<h3 class=\"t\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>");

    for (int n = 0; n < hits.Count; n++)
    {
        Match hit = hits[n];
        if (!hit.Success)
        {
            continue;
        }

        // Remove any markup nested inside the link text.
        string title = Regex.Replace(hit.Groups["content"].Value, "<[^>]*>", string.Empty);
        found.Add(new Keyword
        {
            ID = found.Count + 1,
            Title = title,
            Link = hit.Groups["url"].Value
        });
    }

    return found;
}
55

2. googleSearch类

View Code

1   class googleSearch
2     {
3
/// <summary>
/// Collects one <c>Keyword</c> per <c>&lt;h3 class="r"&gt;</c> result link found in the page
/// (matched case-insensitively).
/// </summary>
/// <param name="html">Page source to scan.</param>
/// <param name="word">Search keyword; not used by this method.</param>
/// <returns>Entries numbered from 1 in page order, with tag-stripped titles and raw href values.</returns>
public List<Keyword> GetKeywords(string html, string word)
{
    int i = 1;
    List<Keyword> keywords = new List<Keyword>();

    // The second, never-used anchor regex of the original listing was dropped.
    Regex regTable = new Regex("<h3 class=\"r\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>", RegexOptions.IgnoreCase);

    foreach (Match mTable in regTable.Matches(html))
    {
        if (!mTable.Success)
        {
            continue;
        }

        Keyword keyword = new Keyword();
        keyword.ID = i++;
        // Remove any markup nested inside the link text.
        keyword.Title = Regex.Replace(mTable.Groups["content"].Value, "<[^>]*>", string.Empty);
        keyword.Link = mTable.Groups["url"].Value;
        keywords.Add(keyword);
    }

    return keywords;
}
27

忘了.还有个导出Excel,这个友友们应该都有自己的方法,我这里就简单写了一个excel导出.也贴出来吧.

/// <summary>
/// Asks the user for a target file and writes the grid's column headers and rows to it
/// as tab-separated text encoded in gb2312.
/// </summary>
/// <param name="dataGridview1">Grid whose content is exported.</param>
public void ExportDataGridViewToExcel(DataGridView dataGridview1)
{
    using (SaveFileDialog saveFileDialog = new SaveFileDialog())
    {
        saveFileDialog.Filter = "Execl  files  (*.xls)|*.xls";
        saveFileDialog.FilterIndex = 0;
        saveFileDialog.RestoreDirectory = true;
        saveFileDialog.CreatePrompt = true;
        saveFileDialog.Title = "导出Excel文件";

        // Default file name is a timestamp such as 20250912-101549.
        saveFileDialog.FileName = DateTime.Now.ToString("yyyyMMdd-HHmmss", System.Globalization.CultureInfo.InvariantCulture);

        // Without this check a cancelled dialog still created the pre-filled file.
        if (saveFileDialog.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        try
        {
            using (Stream myStream = saveFileDialog.OpenFile())
            using (StreamWriter sw = new StreamWriter(myStream, System.Text.Encoding.GetEncoding("gb2312")))
            {
                int columnCount = dataGridview1.ColumnCount;
                string[] cells = new string[columnCount];

                // Header line
                for (int i = 0; i < columnCount; i++)
                {
                    cells[i] = dataGridview1.Columns[i].HeaderText;
                }
                sw.WriteLine(string.Join("\t", cells));

                // Data lines
                foreach (DataGridViewRow row in dataGridview1.Rows)
                {
                    // The placeholder row for adding records holds no data.
                    if (row.IsNewRow)
                    {
                        continue;
                    }

                    for (int k = 0; k < columnCount; k++)
                    {
                        // Empty cells have a null Value; export them as empty text.
                        object value = row.Cells[k].Value;
                        cells[k] = value == null ? string.Empty : value.ToString();
                    }
                    sw.WriteLine(string.Join("\t", cells));
                }
            }

            MessageBox.Show("导出成功");
        }
        catch (Exception e)
        {
            MessageBox.Show(e.ToString());
        }
    }
}

我把HttpState类也贴出来。有需要demo的可以发邮件给我,或者留下邮箱。

Httpstatus.cs

 
          /// <summary>
          /// Plain holder for the state of one HTTP exchange: URLs, cookies, page source,
          /// captcha details and a bag of extra parameters.
          /// </summary>
          class HttpState
          {
              // Members that start with a non-default value keep an explicit backing field.
              private CookieContainer _cookieContainer = new CookieContainer();
              private string _tmpValCodeFileName = "emptyPic.gif";
              private Hashtable _otherParams = new Hashtable();

              /// <summary>Status description.</summary>
              public string StatusDescription { get; set; }

              /// <summary>Callback address, used in login tests.</summary>
              public string CallBackUrl { get; set; }

              /// <summary>Page URL in absolute form.</summary>
              public string Url { get; set; }

              /// <summary>Cookie information as a string.</summary>
              public string Cookies { get; set; }

              /// <summary>Cookie information.</summary>
              public CookieContainer CookieContainer
              {
                  get { return _cookieContainer; }
                  set { _cookieContainer = value; }
              }

              /// <summary>Page source.</summary>
              public string Html { get; set; }

              /// <summary>Temporary captcha file (absolute path).</summary>
              public string TmpValCodePic { get; set; }

              /// <summary>Temporary captcha file name (relative path).</summary>
              public string TmpValCodeFileName
              {
                  get { return _tmpValCodeFileName; }
                  set { _tmpValCodeFileName = value; }
              }

              /// <summary>Whether a captcha is present.</summary>
              public bool IsValCode { get; set; }

              /// <summary>Captcha URL.</summary>
              public string ValCodeURL { get; set; }

              /// <summary>Value of the captcha after recognition.</summary>
              public string ValCodeValue { get; set; }

              /// <summary>Other parameters.</summary>
              public Hashtable OtherParams
              {
                  get { return _otherParams; }
                  set { _otherParams = value; }
              }

              /// <summary>
              /// Stores <paramref name="value"/> under <paramref name="key"/>, replacing any
              /// value already stored for that key (repeated-add handling, add by fengcj 09/11/19 PM).
              /// </summary>
              public void addOtherParam(object key, object value)
              {
                  // The Hashtable indexer adds a missing key and overwrites an existing one.
                  this.OtherParams[key] = value;
              }

              /// <summary>Removes the entry stored under <paramref name="key"/>, if any.</summary>
              public void removeOtherParam(object key)
              {
                  this.OtherParams.Remove(key);
              }

              /// <summary>Returns the value stored under <paramref name="key"/>, or null when absent.</summary>
              public object getOtherParam(object key)
              {
                  return this.OtherParams[key];
              }
          }

KeyWord.cs

 
          /// <summary>
          /// One search hit as produced by the GetKeywords methods.
          /// </summary>
          class Keyword
          {
              /// <summary>Sequence number of the hit.</summary>
              public int ID { get; set; }

              /// <summary>Title text of the hit.</summary>
              public string Title { get; set; }

              /// <summary>Link target of the hit.</summary>
              public string Link { get; set; }
          }