尝试这个 :
public partial class Form1 : Form
{
public Form1()
{
InitializeComponent();
}
private void Form1_Load(object sender, EventArgs e)
{
var res = Find(html);
}
public static List Find(string file)
{
List list = new List();
// 1.
// Find all matches in file.
MatchCollection m1 = Regex.Matches(file, @"(.*?)",
RegexOptions.Singleline);
// 2.
// Loop over each match.
foreach (Match m in m1)
{
string value = m.Groups[1].Value;
LinkItem i = new LinkItem();
// 3.
// Get href attribute.
Match m2 = Regex.Match(value, @"href=\""(.*?)\""",
RegexOptions.Singleline);
if (m2.Success)
{
i.Href = m2.Groups[1].Value;
}
// 4.
// Remove inner tags from text.
string t = Regex.Replace(value, @"\s*<.>\s*", "",
RegexOptions.Singleline);
i.Text = t;
list.Add(i);
}
return list;
}
public struct LinkItem
{
public string Href;
public string Text;
public override string ToString()
{
return Href + "\n\t" + Text;
}
}
}
输入:
结果:
[0] = {www.aaa.xx/xx.zz?id=xxxx&name=xxxx}
[1] = {http://www.aaa.xx/xx.zz?id=xxxx&name=xxxx}
C#抓取HTML链接
刮HTML提取重要的页面元素。它对网站管理员和ASP.NET开发人员有许多法律用途。使用Regex类型和WebClient,我们实现了HTML的屏幕抓取。
已编辑
另一种简单的方法:您可以使用web browser控件href从tag 进行获取a,例如:(请参阅我的示例)
public Form1()
{
InitializeComponent();
webBrowser1.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser1_DocumentCompleted);
}
private void Form1_Load(object sender, EventArgs e)
{
webBrowser1.DocumentText = "";
}
void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
List href = new List();
foreach (HtmlElement el in webBrowser1.Document.GetElementsByTagName("a"))
{
href.Add(el.GetAttribute("href"));
}
}