C#winform抓取百度,Google搜索关键词结果

基于网站SEO的需要,做了一个采集百度和Google搜索关键字结果的小工具,在这里与大家分享一下。

先看下效果图:

代码附加:

 View Code

复制代码
 1   private void baidu_Click(object sender, EventArgs e)
 2         {
 3             int num = 100;//搜索条数
 4             string url = "http://www.baidu.com/s?wd=" + txtSearch.Text.Trim() + "&rn=" + num + "";
 5             string html = search(url, "gb2312");
 6             BaiduSearch baidu = new BaiduSearch();
 7             if (!string.IsNullOrEmpty(html))
 8             {
 9                 int count = baidu.GetSearchCount(html);//搜索条数
10                 if (count > 0)
11                 {
12                     List<Keyword> keywords = baidu.GetKeywords(html, txtSearch.Text.Trim());
13                     dataGridView1.DataSource = keywords;
14                 }
15 
16             }
17         }
18 
19         private void google_Click(object sender, EventArgs e)
20         {
21             int num = 100;
22             string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + txtSearch.Text.Trim() + "&aq=f&aqi=&aql=&oq=&num=" + num + "";
23             string html = search(url, "utf-8");
24             if (!string.IsNullOrEmpty(html))
25             {
26 
27                 googleSearch google = new googleSearch();
28                 List<Keyword> keywords = google.GetKeywords(html, txtSearch.Text.Trim());
29                 dataGridView1.DataSource = keywords;
30 
31             }
32         }
33         /// <summary>
34         /// 搜索处理
35         /// </summary>
36         /// <param name="url">搜索网址</param>
37         /// <param name="Chareset">编码</param>
38         public string search(string url, string Chareset)
39         {
40             HttpState result = new HttpState();
41             Uri uri = new Uri(url);
42             HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);
43             myHttpWebRequest.UseDefaultCredentials = true;
44             myHttpWebRequest.ContentType = "text/html";
45             myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
46             myHttpWebRequest.Method = "GET";
47             myHttpWebRequest.CookieContainer = new CookieContainer();
48 
49             try
50             {
51                 HttpWebResponse response = (HttpWebResponse)myHttpWebRequest.GetResponse();
52                 // 从 ResponseStream 中读取HTML源码并格式化 add by cqp
53                 result.Html = readResponseStream(response, Chareset);
54                 result.CookieContainer = myHttpWebRequest.CookieContainer;
55                 return result.Html;
56             }
57             catch (Exception ex)
58             {
59                 return ex.ToString();
60             }
61 
62         }
63         public string readResponseStream(HttpWebResponse response, string Chareset)
64         {
65             string result = "";
66             using (StreamReader responseReader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(Chareset)))
67             {
68                 result = formatHTML(responseReader.ReadToEnd());
69             }
70 
71             return result;
72         }
73         /// <summary>
74         /// 描述:格式化网页源码
75         /// 
76         /// </summary>77         /// <param name="htmlContent"></param>78         /// <returns></returns>79         public string formatHTML(string htmlContent)80         {81             string result = "";82 83             result = htmlContent.Replace("&raquo;", "").Replace("&nbsp;", "")84                     .Replace("&copy;", "").Replace("/r", "").Replace("/t", "")85                     .Replace("/n", "").Replace("&amp;", "&");86             return result;87
复制代码

 把百度和Google两个类抽取了出来

1.百度Search类

 View Code

复制代码
 1 class BaiduSearch
 2     {
 3         protected string uri = "http://www.baidu.com/s?wd=";
 4         protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
 5         protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");
 6         protected string resultPattern = @"(?<=找到相关结果[约]?)[0-9,]*?(?=个)";
 7         public int GetSearchCount(string html)
 8         {
 9             int result = 0;
10             string searchcount = string.Empty;
11 
12             Regex regex = new Regex(resultPattern);
13             Match match = regex.Match(html);
14 
15             if (match.Success)
16             {
17                 searchcount = match.Value;
18             }
19             else
20             {
21                 searchcount = "0";
22             }
23 
24             if (searchcount.IndexOf(",") > 0)
25             {
26                 searchcount = searchcount.Replace(",", string.Empty);
27             }
28 
29             int.TryParse(searchcount, out result);
30 
31             return result;
32         }
33 
34         public List<Keyword> GetKeywords(string html, string word)
35         {
36             int i = 1;
37             List<Keyword> keywords = new List<Keyword>();
38             string ss="<h3 class=\"t\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>";
39             MatchCollection mcTable = Regex.Matches(html,ss);
40             foreach (Match mTable in mcTable)
41             {
42                 if (mTable.Success)
43                 {
44                     Keyword keyword = new Keyword();
45                     keyword.ID = i++;
46                     keyword.Title = Regex.Replace(mTable.Groups["content"].Value, "<[^>]*>", string.Empty);
47                     keyword.Link = mTable.Groups["url"].Value;
48                     keywords.Add(keyword);
49 
50                 }
51             }
52 
53             return keywords;
54         }
55
复制代码

2 .GoogleSearch类

 View Code

复制代码
 1   class googleSearch
 2     {
 3 
 4         public List<Keyword> GetKeywords(string html, string word)
 5         {
 6             int i = 1;
 7             List<Keyword> keywords = new List<Keyword>();
 8 
 9             Regex regTable = new Regex("<h3 class=\"r\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>", RegexOptions.IgnoreCase);
10             Regex regA = new Regex(@"(?is)<a/b[^>]*?href=(['""]?)(?<link>[^'""/s>]+)/1[^>]*>(?<title>.*?)</a>", RegexOptions.IgnoreCase);
11 
12             MatchCollection mcTable = regTable.Matches(html);
13             foreach (Match mTable in mcTable)
14             {
15                 if (mTable.Success)
16                 {
17                     Keyword keyword = new Keyword();
18                     keyword.ID = i++;
19                     keyword.Title = Regex.Replace(mTable.Groups["content"].Value, "<[^>]*>", string.Empty);
20                     keyword.Link = mTable.Groups["url"].Value;
21                     keywords.Add(keyword);
22                 }
23             }
24 
25             return keywords;
26         }
27
复制代码

 忘了.还有个导出Excel,这个友友们应该都有自己的方法,我这里就简单写了一个excel导出.也贴出来吧.

复制代码
 1   public void ExportDataGridViewToExcel(DataGridView dataGridview1)
 2         {
 3             SaveFileDialog saveFileDialog = new SaveFileDialog();
 4             saveFileDialog.Filter = "Execl  files  (*.xls)|*.xls";
 5             saveFileDialog.FilterIndex = 0;
 6             saveFileDialog.RestoreDirectory = true;
 7             saveFileDialog.CreatePrompt = true;
 8             saveFileDialog.Title = "导出Excel文件";
 9 
10             DateTime now = DateTime.Now;
11             saveFileDialog.FileName = now.Year.ToString().PadLeft(2) + now.Month.ToString().PadLeft(2, '0') + now.Day.ToString().PadLeft(2, '0') + "-" + now.Hour.ToString().PadLeft(2, '0') + now.Minute.ToString().PadLeft(2, '0') + now.Second.ToString().PadLeft(2, '0');
12             saveFileDialog.ShowDialog();
13 
14             Stream myStream;
15             myStream = saveFileDialog.OpenFile();
16             StreamWriter sw = new StreamWriter(myStream, System.Text.Encoding.GetEncoding("gb2312"));
17             string str = "";
18             try
19             {
20                 //写标题      
21                 for (int i = 0; i < dataGridview1.ColumnCount; i++)
22                 {
23                     if (i > 0)
24                     {
25                         str += "\t";
26                     }
27                     str += dataGridview1.Columns[i].HeaderText;
28                 }
29                 sw.WriteLine(str);
30                 //写内容   
31                 for (int j = 0; j < dataGridview1.Rows.Count; j++)
32                 {
33                     string tempStr = "";
34                     for (int k = 0; k < dataGridview1.Columns.Count; k++)
35                     {
36                         if (k > 0)
37                         {
38                             tempStr += "\t";
39                         }
40                         tempStr += dataGridview1.Rows[j].Cells[k].Value.ToString();
41                     }
42                     sw.WriteLine(tempStr);
43                 }
44                 sw.Close();
45                 myStream.Close();
46                 MessageBox.Show("导出成功");
47             }
48             catch (Exception e)
49             {
50                 MessageBox.Show(e.ToString());
51             }
52             finally
53             {
54                 sw.Close();
55                 myStream.Close();
56             }
复制代码

57         } 

我把HttpState类也贴出来。有需要demo的可以发邮件给我,或者留下邮箱。

HttpState.cs

/// <summary>
/// Plain data holder for the state of an HTTP exchange: URLs, cookies, page
/// source, captcha-related values and a bag of additional parameters.
/// </summary>
class HttpState
{
    /// <summary>
    /// Creates a state object with an empty cookie container, an empty
    /// parameter table and the default captcha file name.
    /// </summary>
    public HttpState()
    {
        CookieContainer = new CookieContainer();
        TmpValCodeFileName = "emptyPic.gif";
        OtherParams = new Hashtable();
    }

    /// <summary>Status description text.</summary>
    public string StatusDescription { get; set; }

    /// <summary>Callback address, used in login tests.</summary>
    public string CallBackUrl { get; set; }

    /// <summary>Page URL in absolute form.</summary>
    public string Url { get; set; }

    /// <summary>Cookie information as a string.</summary>
    public string Cookies { get; set; }

    /// <summary>Cookie information.</summary>
    public CookieContainer CookieContainer { get; set; }

    /// <summary>Page source.</summary>
    public string Html { get; set; }

    /// <summary>Temporary captcha file (absolute path).</summary>
    public string TmpValCodePic { get; set; }

    /// <summary>Temporary captcha file name (relative path).</summary>
    public string TmpValCodeFileName { get; set; }

    /// <summary>Whether a captcha is present.</summary>
    public bool IsValCode { get; set; }

    /// <summary>Captcha URL.</summary>
    public string ValCodeURL { get; set; }

    /// <summary>Value of the captcha after recognition.</summary>
    public string ValCodeValue { get; set; }

    /// <summary>Other parameters.</summary>
    public Hashtable OtherParams { get; set; }

    /// <summary>
    /// Stores <paramref name="value"/> under <paramref name="key"/>, replacing
    /// any value already stored for that key.
    /// </summary>
    public void addOtherParam(object key, object value)
    {
        // The Hashtable indexer adds the entry when the key is new and
        // overwrites it otherwise.
        OtherParams[key] = value;
    }

    /// <summary>Removes the entry stored under <paramref name="key"/>, if any.</summary>
    public void removeOtherParam(object key)
    {
        OtherParams.Remove(key);
    }

    /// <summary>Returns the value stored under <paramref name="key"/>, or null when there is none.</summary>
    public object getOtherParam(object key)
    {
        return OtherParams[key];
    }
}

  

 Keyword.cs

 

 

/// <summary>
/// One parsed search result, as produced by the GetKeywords methods of
/// BaiduSearch and googleSearch.
/// </summary>
class Keyword
   {
       /// <summary>Sequence number of the result; the parsers number results from 1 in page order.</summary>
       public int ID { get; set; }
       /// <summary>Result title: the anchor text with HTML tags removed.</summary>
       public string Title { get; set; }
       /// <summary>Result link: the value of the anchor's href attribute.</summary>
       public string Link { get; set; }
   }

  鉴于大家都需要demo,今天就整理一下发上来。添加了导出Word、导出Excel功能。晕……没找到怎么把文件路径放进来。有需要的可以email我。

 

展开阅读全文

怎样抓取Google搜索结果

05-28

为什么通过正则表达式分析google的搜索结果页面代码来抓取结果的标题和链接不成功,而百度的却能成功?rnrnpublic class SaveURLrn rn public static String savepath_SaveURL="d:\\";//默认路径为D盘根目录rn static String title=null;rn static String link=null;rn static String localFile=null;//保存为本地文件后的路径rn static String res=null;rn rn public static void go() rn //String url= "http://www.baidu.com/s?wd=accenture&rn=10";rn String url="http://www.google.com/search?hl=en&newwindow=1&q=hello&start=0&sa=N&num=30";rn String content = getPage(url);//得到url所对应的网页的内容rn rn rn// 对应百度 等的正则表达式rn// 为什么快照的链接没有被下载下来?rn rn //String reg = "(.*?).*? .*?"+rn //"";rnrn //Google对应的正则式rn String reg = " "+"(.*?)";rn Pattern p = Pattern.compile(reg,Pattern.CASE_INSENSITIVE | Pattern.DOTALL);rn Matcher m = p.matcher(content);rn rn int i = 1;rn rn while(m.find())rn rn title=m.group(2).replaceAll("<.*?>", "");//正则表达式rn link=m.group(1);rn rn System.out.println("----------------------------------------------");rn System.out.println("第"+i+"个标题:"+title);rn System.out.println("第"+i+"个链接:"+link); rn rn i++;rn //end of whilernrn rn rn public static String getPage(String page) rn try rn rn URL url = new URL(page);rn HttpURLConnection con = (HttpURLConnection) url.openConnection();rn rn// 以下是修正Server returned HTTP response code: 403 for URL的代码rn// 通常是因为服务器的安全设置不接受Java程序作为客户端访问,解决方案是设置客户端的User Agentrn con.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0;Windows NT; DigExt)");rn rn BufferedReader reader = new BufferedReader(new InputStreamReader(rn con.getInputStream()));rn StringBuilder b = new StringBuilder();rn String line;rn while ((line = reader.readLine()) != null) rn b.append(line);rn b.append("\r\n");rn rn return b.toString();rn catch (FileNotFoundException ex) rn System.out.println("NOT FOUND:" + page);rn return null;rn catch (ConnectException ex) rn System.out.println("Timeout:" + page);rn return null;rn catch (Exception ex) rn ex.printStackTrace();rn return null;rn rn rn rn public static void main(String[] args)rn go();rn 
如果打开google的结果目录页,查看源代码的结构,是和百度的差别不大啊,就是google的每个结果都是放在一个 中 里边,而百度里边的是放在一个 标签里,为什么Google的会得不到呢?

请高手帮忙解释下。
谢谢。 论坛

没有更多推荐了,返回首页