效果:
主类:
/// <summary>
/// Downloads a web page and collects every hyperlink (<c>&lt;a ... href="..."&gt;</c>)
/// found in it together with the link's anchor text.
/// </summary>
/// <param name="url">Address of the page to fetch. A value that starts with "www" is prefixed with "http://".</param>
/// <param name="beginContent">Marker text; when it occurs in the page, only the content after its first occurrence is scanned. Pass an empty string to scan from the start.</param>
/// <param name="endContent">Marker text; when it occurs in the (already trimmed) content, only the content before its first occurrence is scanned. Pass an empty string to scan to the end.</param>
/// <param name="code">Name of the text encoding used to decode the page; "utf-8" is used when empty.</param>
/// <returns>
/// A collection keyed by link URL (fragment removed, root-relative links prefixed with the page's host)
/// whose values are the anchor texts. Returns <c>null</c> when <paramref name="url"/> is empty
/// or when downloading/parsing fails for any reason.
/// </returns>
public NameValueCollection linkUrl_Group(string url, string beginContent, string endContent, string code)
{
    if (string.IsNullOrEmpty(url))
    {
        return null;
    }

    try
    {
        if (url.StartsWith("www", StringComparison.Ordinal))
        {
            url = "http://" + url;
        }
        Uri pageUri = new Uri(url);

        if (string.IsNullOrEmpty(code))
        {
            code = "utf-8";
        }
        // Resolve the encoding before opening the connection so an unknown
        // encoding name cannot leave an open stream behind.
        Encoding encoding = Encoding.GetEncoding(code);

        string content;
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
            using (Stream data = client.OpenRead(url))
            using (StreamReader reader = new StreamReader(data, encoding))
            {
                content = reader.ReadToEnd();
            }
        }

        // Restrict the scan to the text between the two markers. The search is
        // case-insensitive and runs on the original string, so the index found
        // is always valid for the Substring call that follows.
        if (!string.IsNullOrEmpty(beginContent))
        {
            int beginIndex = content.IndexOf(beginContent, StringComparison.OrdinalIgnoreCase);
            if (beginIndex >= 0)
            {
                content = content.Substring(beginIndex + beginContent.Length);
            }
        }
        if (!string.IsNullOrEmpty(endContent))
        {
            int endIndex = content.IndexOf(endContent, StringComparison.OrdinalIgnoreCase);
            if (endIndex >= 0)
            {
                content = content.Substring(0, endIndex);
            }
        }

        // Match anchors and capture the href value and the anchor text.
        // "\s" requires whitespace after "<a" so tags such as <abbr> are not picked up.
        Regex anchorPattern = new Regex(@"<a(\s.*?)href=[""'](?<url>.*?)[""'](.*?)>(?<title>.*?)</a>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
        MatchCollection matches = anchorPattern.Matches(content);
        int count = matches.Count;

        // Strip the "#fragment" part once per match; duplicates are detected on this form.
        string[] links = new string[count];
        for (int i = 0; i < count; i++)
        {
            string href = matches[i].Groups["url"].Value;
            int hashIndex = href.IndexOf('#');
            links[i] = hashIndex > -1 ? href.Substring(0, hashIndex) : href;
        }

        NameValueCollection result = new NameValueCollection();
        for (int i = 0; i < count; i++)
        {
            // A URL that appears again later is skipped here, so each distinct
            // URL is added once, at its last occurrence, with that occurrence's title.
            bool repeatedLater = false;
            for (int j = i + 1; j < count; j++)
            {
                if (links[i] == links[j])
                {
                    repeatedLater = true;
                    break;
                }
            }
            if (repeatedLater)
            {
                continue;
            }

            string link = links[i];
            if (link.StartsWith("/", StringComparison.Ordinal))
            {
                // NOTE(review): root-relative links are always prefixed with "http://" + host,
                // which drops the page's own scheme and port; kept as-is for compatibility.
                link = "http://" + pageUri.Host + link;
            }
            result.Add(link, matches[i].Groups["title"].Value);
        }
        return result;
    }
    catch (Exception)
    {
        // Best effort: callers receive null for any download, decoding or parsing failure.
        return null;
    }
}