// 1. Remote fetch: download a page and print every link it contains with its title.
// Each numbered snippet is standalone: put its using directives at the top of the
// file and its statements inside a method body.
using System;                           // Console
using System.IO;
using System.Net;                       // WebClient
using System.Text.RegularExpressions;

byte[] bytes;
// WebClient is IDisposable; release it as soon as the download has completed.
using (WebClient w = new WebClient())
{
    bytes = w.DownloadData("http://news.sina.com.cn/");
}

// NOTE(review): Encoding.Default depends on the machine/runtime, so the page is only
// decoded correctly when that happens to match the page's charset. Confirm, and
// switch to the page's real encoding if the titles come out garbled.
string strHtml = System.Text.Encoding.Default.GetString(bytes);

// Matches <a ... href=VALUE ...>TITLE</a>, where VALUE is double-quoted,
// single-quoted or unquoted.
//   url   - the href value without its surrounding quotes
//   title - the link text (only anchors whose content is plain text, no nested tags)
// [^>]* keeps each part of the match inside a single tag, so several links on the
// same line are reported separately instead of being merged into one match.
// '<' and '>' are ordinary characters in a .NET pattern and are left unescaped.
string p = @"<a\b[^>]*?\shref\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))[^>]*>(?<title>[^<>]*)<\s*/a\s*>";
Regex reg = new Regex(p, RegexOptions.IgnoreCase);
MatchCollection ms = reg.Matches(strHtml);
Console.WriteLine("总共抓取了:{0}条链接。", ms.Count);
foreach (Match m in ms)
{
    // Title on one line, URL on the next, then a blank separator line.
    Console.WriteLine("{0}\n{1}\n", m.Groups["title"].Value, m.Groups["url"].Value);
}

// 2. Local test: run the same pattern against a fixed string.
using System.IO;
using System.Text.RegularExpressions;

string mystr = @"<A HREF=""aaaahttp://www.csdn.net"">aaaa</A>";
// Same pattern as in snippet 1 (repeated so this snippet can be pasted on its own).
string p = @"<a\b[^>]*?\shref\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))[^>]*>(?<title>[^<>]*)<\s*/a\s*>";
Regex reg = new Regex(p, RegexOptions.IgnoreCase);
MatchCollection ms = reg.Matches(mystr);
foreach (Match m in ms)
{
    // NOTE(review): MessageBox is not brought into scope by the directives above;
    // presumably System.Windows.Forms is expected - confirm against the host project.
    MessageBox.Show(m.Groups["title"].Value);   // expected: aaaa
    MessageBox.Show(m.Groups["url"].Value);     // expected: aaaahttp://www.csdn.net
}
提取网页中链接和标题的正则表达式
最新推荐文章于 2021-05-30 19:27:32 发布