这两天公司让做一个小功能:抓取百度和谷歌的搜索结果,把搜索到的标题和链接一一提取出来。其实页面是很好提取的,主要的问题在于用正则表达式处理下载下来的页面。于是在论坛上请教了大家,在大家的帮助下,这个功能的核心代码已经完成,现在整理出来,供有需要的人参考。
C# 代码:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using httpState;
using System.Text.RegularExpressions;
using System.Collections;
namespace test
{
public partial class DownLoadTest : Form
{
/// <summary>
/// Creates the form.
/// </summary>
public DownLoadTest()
{
// InitializeComponent is declared outside this part of the partial class
// (presumably the designer-generated half that creates txtSearch,
// dataGridView1 and the buttons -- confirm).
InitializeComponent();
}
/// <summary>
/// Baidu search: downloads the Baidu result page for the text in
/// <c>txtSearch</c> and binds the extracted titles/links to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnBaidu_Click(object sender, EventArgs e)
{
    string query = txtSearch.Text.Trim();
    if (query.Length == 0)
    {
        // Nothing to search for; avoid a pointless request.
        return;
    }

    int num = 20; // number of results to request (rn parameter)

    // Percent-encode the query so that reserved characters such as '&', '#',
    // '+' or a space cannot truncate or corrupt the query string.
    // NOTE(review): EscapeDataString encodes as UTF-8 -- confirm that Baidu
    // decodes "wd" as UTF-8 for this endpoint.
    string url = "http://www.baidu.com/s?wd=" + Uri.EscapeDataString(query) + "&rn=" + num;

    string html = search(url, "gb2312");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    BaiduSearch baidu = new BaiduSearch();
    // Only parse the result list when the page reports at least one hit.
    if (baidu.GetSearchCount(html) > 0)
    {
        List<Keyword> keywords = baidu.GetKeywords(html, query);
        dataGridView1.DataSource = keywords;
    }
}
/// <summary>
/// Google search: downloads the Google result page for the text in
/// <c>txtSearch</c> and binds the extracted titles/links to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button2_Click(object sender, EventArgs e)
{
    string query = txtSearch.Text.Trim();
    if (query.Length == 0)
    {
        // Nothing to search for; avoid a pointless request.
        return;
    }

    int num = 100; // number of results to request (num parameter)

    // Percent-encode the query so that reserved characters such as '&', '#',
    // '+' or a space cannot truncate or corrupt the query string.
    string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + Uri.EscapeDataString(query)
        + "&aq=f&aqi=&aql=&oq=&num=" + num;

    string html = search(url, "utf-8");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    googleSearch google = new googleSearch();
    List<Keyword> keywords = google.GetKeywords(html, query);
    dataGridView1.DataSource = keywords;
}
/// <summary>
/// Downloads the page at <paramref name="url"/> with an HTTP GET and returns
/// its source after it has been passed through <c>formatHTML</c>.
/// </summary>
/// <param name="url">Absolute URL of the search page to download.</param>
/// <param name="Chareset">Name of the encoding used to decode the response body (e.g. "gb2312", "utf-8").</param>
/// <returns>
/// The cleaned page source, or an empty string when the request or the read
/// fails. Callers in this form test the result with string.IsNullOrEmpty.
/// </returns>
public string search(string url,string Chareset)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.UseDefaultCredentials = true;
    request.ContentType = "text/html";
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
    request.Method = "GET";
    request.CookieContainer = new CookieContainer();
    try
    {
        // Dispose the response so the underlying connection is released
        // even when reading or decoding the body throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            return readResponseStream(response, Chareset);
        }
    }
    catch (Exception ex)
    {
        // Keep the "never throws on a failed request" contract, but do not
        // hand the exception text back as if it were page HTML: callers would
        // run the result-extraction regexes over a stack trace.
        System.Diagnostics.Debug.WriteLine(ex);
        return string.Empty;
    }
}
/// <summary>
/// Reads the whole body of <paramref name="response"/>, decodes it with the
/// encoding named by <paramref name="Chareset"/> and returns it after
/// <c>formatHTML</c> has been applied.
/// </summary>
/// <param name="response">Response whose body stream is read to the end.</param>
/// <param name="Chareset">Encoding name passed to Encoding.GetEncoding.</param>
/// <returns>The formatted page source.</returns>
public string readResponseStream(HttpWebResponse response, string Chareset)
{
    Stream body = response.GetResponseStream();
    Encoding pageEncoding = Encoding.GetEncoding(Chareset);
    using (StreamReader reader = new StreamReader(body, pageEncoding))
    {
        string rawHtml = reader.ReadToEnd();
        return formatHTML(rawHtml);
    }
}
/// <summary>
/// Normalizes downloaded page source before it is searched with regular
/// expressions: removes a few decorative entities/characters and all CR, LF
/// and TAB characters, then decodes <c>&amp;amp;</c> to <c>&amp;</c>.
/// </summary>
/// <param name="htmlContent">Raw page source; may be null or empty.</param>
/// <returns>The cleaned source, or an empty string for null/empty input.</returns>
public string formatHTML(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    // Tokens removed outright, as entity and as literal character.
    // Ordinary spaces are deliberately NOT removed: they separate tag names
    // from attributes, which the link-extraction regexes rely on.
    string[] removed =
    {
        "&raquo;", "\u00BB",
        "&nbsp;", "\u00A0",
        "&copy;", "\u00A9",
        "\r", "\t", "\n"
    };

    string result = htmlContent;
    foreach (string token in removed)
    {
        result = result.Replace(token, "");
    }

    // Decoded last so that an escaped entity such as "&amp;nbsp;" survives
    // as the text "&nbsp;" instead of being stripped.
    return result.Replace("&amp;", "&");
}
/// <summary>
/// Extracts the reported hit count and the result titles/links from the
/// source of a Baidu result page.
/// </summary>
class BaiduSearch
{
    protected string uri = "http://www.baidu.com/s?wd=";
    // Second-page example: "http://www.baidu.com/s?wd=software&pn=10&usm=2"
    protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
    protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");

    // Digits and commas located between "找到相关结果" (optionally followed by
    // "约") and "个".
    protected string resultPattern = @"(?<=找到相关结果[约]?)[0-9,]*?(?=个)";

    // A <table> whose id attribute is a one- or two-digit number or 100
    // (optionally quoted; \1 requires the matching closing quote). The
    // balancing group "o" keeps nested <table>...</table> pairs inside the match.
    private static readonly Regex ResultTableRegex = new Regex(
        @"(?is)<table[^>]*?id=(['""]?)(\d{1,2}|100)\1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table\b).)*)*(?(o)(?!))</table>");

    // An <a> element with an href: "link" is the attribute value,
    // "title" is everything between the opening and closing tag.
    private static readonly Regex AnchorRegex = new Regex(
        @"(?is)<a\b[^>]*?href=(['""]?)(?<link>[^'""\s>]+)\1[^>]*>(?<title>.*?)</a>");

    /// <summary>
    /// Reads the hit count that the result page reports.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <returns>The parsed count, or 0 when no count is found or it cannot be parsed.</returns>
    public int GetSearchCount(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return 0;
        }

        Match match = Regex.Match(html, resultPattern);
        if (!match.Success)
        {
            return 0;
        }

        // Strip thousands separators; TryParse leaves 0 in result on failure.
        int result;
        int.TryParse(match.Value.Replace(",", string.Empty), out result);
        return result;
    }

    /// <summary>
    /// Collects the first link of every result table found in the page.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <param name="word">Search term (currently not used by the extraction).</param>
    /// <returns>
    /// One entry per result table that contains a link, numbered from 1 in
    /// page order. Title holds the anchor's inner markup as found in the page.
    /// </returns>
    public List<Keyword> GetKeywords(string html, string word)
    {
        List<Keyword> keywords = new List<Keyword>();
        if (string.IsNullOrEmpty(html))
        {
            return keywords;
        }

        int nextId = 1;
        foreach (Match table in ResultTableRegex.Matches(html))
        {
            Match anchor = AnchorRegex.Match(table.Value);
            if (!anchor.Success)
            {
                continue;
            }

            Keyword keyword = new Keyword();
            keyword.ID = nextId++;
            keyword.Link = anchor.Groups["link"].Value;
            keyword.Title = anchor.Groups["title"].Value;
            keywords.Add(keyword);
        }
        return keywords;
    }
}
/// <summary>
/// Extracts the result titles/links from the source of a Google result page.
/// </summary>
class googleSearch
{
    // An <h3> element including any nested <h3>...</h3> pairs (balancing
    // group "o").
    private static readonly Regex HeadingRegex = new Regex(
        @"(?is)<h3[^>]*?>(?><h3[^>]*>(?<o>)|</h3>(?<-o>)|(?:(?!</?h3\b).)*)*(?(o)(?!))</h3>");

    // An <a> element with an href: "link" is the attribute value (optionally
    // quoted; \1 requires the matching closing quote), "title" is everything
    // between the opening and closing tag.
    private static readonly Regex AnchorRegex = new Regex(
        @"(?is)<a\b[^>]*?href=(['""]?)(?<link>[^'""\s>]+)\1[^>]*>(?<title>.*?)</a>");

    /// <summary>
    /// Collects the first link of every &lt;h3&gt; heading found in the page.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <param name="word">Search term (currently not used by the extraction).</param>
    /// <returns>
    /// One entry per heading that contains a link, numbered from 1 in page
    /// order. Title holds the anchor's inner markup as found in the page.
    /// </returns>
    public List<Keyword> GetKeywords(string html, string word)
    {
        List<Keyword> keywords = new List<Keyword>();
        if (string.IsNullOrEmpty(html))
        {
            return keywords;
        }

        int nextId = 1;
        foreach (Match heading in HeadingRegex.Matches(html))
        {
            Match anchor = AnchorRegex.Match(heading.Value);
            if (!anchor.Success)
            {
                continue;
            }

            Keyword keyword = new Keyword();
            keyword.ID = nextId++;
            keyword.Link = anchor.Groups["link"].Value;
            keyword.Title = anchor.Groups["title"].Value;
            keywords.Add(keyword);
        }
        return keywords;
    }
}
/// <summary>
/// One extracted search result. Lists of this type are assigned to
/// dataGridView1.DataSource by the click handlers.
/// </summary>
class Keyword
{
/// <summary>Sequence number assigned by GetKeywords, starting at 1.</summary>
public int ID { get; set; }
/// <summary>Text captured between the result's opening and closing anchor tags.</summary>
public string Title { get; set; }
/// <summary>Value of the result anchor's href attribute.</summary>
public string Link { get; set; }
}
}
}
HttpState:
using System.Net;
using System.Collections;
namespace httpState
{
/// <summary>
/// Plain state holder for an HTTP exchange: URLs, cookies, page source,
/// captcha details and a bag of additional parameters.
/// </summary>
public class HttpState
{
    /// <summary>
    /// Sets the non-null defaults: an empty cookie container, the placeholder
    /// captcha file name and an empty parameter table.
    /// </summary>
    public HttpState()
    {
        CookieContainer = new CookieContainer();
        TmpValCodeFileName = "emptyPic.gif";
        OtherParams = new Hashtable();
    }

    /// <summary>Status description returned together with the response.</summary>
    public string StatusDescription { get; set; }

    /// <summary>Callback URL, used in login tests.</summary>
    public string CallBackUrl { get; set; }

    /// <summary>Page URL in absolute form.</summary>
    public string Url { get; set; }

    /// <summary>Cookie information in string form.</summary>
    public string Cookies { get; set; }

    /// <summary>Cookie information.</summary>
    public CookieContainer CookieContainer { get; set; }

    /// <summary>Page source.</summary>
    public string Html { get; set; }

    /// <summary>Captcha temporary file (absolute path).</summary>
    public string TmpValCodePic { get; set; }

    /// <summary>Captcha temporary file name (relative path).</summary>
    public string TmpValCodeFileName { get; set; }

    /// <summary>Whether a captcha is present.</summary>
    public bool IsValCode { get; set; }

    /// <summary>Captcha URL.</summary>
    public string ValCodeURL { get; set; }

    /// <summary>Value obtained after recognizing the captcha.</summary>
    public string ValCodeValue { get; set; }

    /// <summary>Other parameters.</summary>
    public Hashtable OtherParams { get; set; }

    /// <summary>
    /// Stores <paramref name="value"/> under <paramref name="key"/>; adding an
    /// existing key replaces its value instead of failing.
    /// </summary>
    public void addOtherParam(object key, object value)
    {
        // The Hashtable indexer adds the entry when the key is missing and
        // overwrites it otherwise.
        this.OtherParams[key] = value;
    }

    /// <summary>Removes the entry stored under <paramref name="key"/>, if any.</summary>
    public void removeOtherParam(object key)
    {
        this.OtherParams.Remove(key);
    }

    /// <summary>Returns the value stored under <paramref name="key"/>, or null when absent.</summary>
    public object getOtherParam(object key)
    {
        return this.OtherParams[key];
    }
}
}
界面很简单:一个输入框、两个搜索按钮和一个 DataGridView。