1.远程抓取:
using System;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
// Downloads the Sina news front page and prints the text and target URL of
// every simple <a href=...>text</a> link found in it.
byte[] bytes;
// WebClient is IDisposable; release it as soon as the download has finished.
using (WebClient w = new WebClient())
{
    bytes = w.DownloadData("http://news.sina.com.cn/");
}
// NOTE(review): Encoding.Default is platform-dependent; confirm it matches the
// charset the page is actually served in.
string strHtml = System.Text.Encoding.Default.GetString(bytes);
// Regex escapes must use backslashes (\s, \b). The previous pattern used
// forward slashes, which made it look for literal "/s", "/<" and "/>" text.
//   <a\b[^>]*?\bhref\s*=\s*   opening tag up to the href attribute (stays inside one tag)
//   "url" | 'url' | url       double-quoted, single-quoted or unquoted attribute value
//   [^>]*>                    remainder of the opening tag
//   (?<title>[^<>]*)          link text; links containing nested tags are not matched
//   <\s*/a\s*>                closing tag
string p = @"<a\b[^>]*?\bhref\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>""']+))[^>]*>(?<title>[^<>]*)<\s*/a\s*>";
Regex reg = new Regex(p, RegexOptions.IgnoreCase);
MatchCollection ms = reg.Matches(strHtml);
Console.WriteLine("总共抓取了:{0}条链接。",ms.Count);
foreach (Match m in ms)
{
    // "\n" is a real line break; the old "/n" was printed literally.
    Console.WriteLine("{0}\n{1}\n", m.Groups["title"].Value, m.Groups["url"].Value);
}
2.本地测试:
using System.Text.RegularExpressions;
using System.IO;
// Local check of the link-extraction pattern against a single hard-coded anchor
// tag; shows the captured link text and then the captured URL in message boxes.
string mystr=@"<A HREF=""http://www.csdn.net"">aaaa</A>";
// Regex escapes must use backslashes (\s, \b). The previous pattern used
// forward slashes, so it required literal "/" characters and could never match
// the sample above.
//   <a\b[^>]*?\bhref\s*=\s*   opening tag up to the href attribute (stays inside one tag)
//   "url" | 'url' | url       double-quoted, single-quoted or unquoted attribute value
//   [^>]*>                    remainder of the opening tag
//   (?<title>[^<>]*)          link text; links containing nested tags are not matched
//   <\s*/a\s*>                closing tag
string p = @"<a\b[^>]*?\bhref\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>""']+))[^>]*>(?<title>[^<>]*)<\s*/a\s*>";
// IgnoreCase lets the lowercase pattern match the upper-case <A HREF=...> sample.
Regex reg = new Regex(p, RegexOptions.IgnoreCase);
MatchCollection ms = reg.Matches(mystr);
foreach (Match m in ms)
{
    MessageBox.Show(m.Groups["title"].Value);
    MessageBox.Show(m.Groups["url"].Value);
}