最近的一个项目涉及对页面中链接的抓取。以下是我总结的一些代码,现贴出来与大家分享:
// Extracts every hyperlink from an HTML string: for each <a ...>...</a> element,
// the href value is appended to `urls` and the element's inner markup to `titles`.
// The two lists are index-aligned (titles[i] belongs to urls[i]).
List<String> titles = new List<string>();
List<String> urls = new List<string>();

// Placeholder input: assign the page markup to scan here.
// It must not be null, because Regex.Matches throws ArgumentNullException on null input.
String html = string.Empty;

// Pattern notes:
//   <a[\s\S]*?href=      opening tag, lazily skipping any attributes (including newlines) before href
//   "(?<url>...)"        href value in double quotes ("" is an escaped quote in a verbatim string)
//   '(?<url>...)'        href value in single quotes
//   (?<url>[^>\s]*)      unquoted href value, ending at whitespace or '>'
//   [^>]*?>              the rest of the opening tag
//   (?<title>[\s\S]*?)   inner content up to the nearest closing tag (may span lines)
// The three alternatives share the group name "url", so exactly one of them supplies the value.
String p = @"<a[\s\S]*?href=(""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^>\s]*))[^>]*?>(?<title>[\s\S]*?)</a>";
Regex reg = new Regex(p, RegexOptions.IgnoreCase | RegexOptions.Compiled);
MatchCollection ms = reg.Matches(html);
foreach (Match m in ms)
{
    titles.Add(m.Groups["title"].Value);
    urls.Add(m.Groups["url"].Value);
}