下列两个函数 GetHttp 和 GetLink 分别用于获取 HTML 页面和提取页面中的超链接。
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Extracts the target of every <c>href</c> attribute found in an HTML string.
/// </summary>
/// <param name="strHtml">HTML text to scan. May be null or empty.</param>
/// <returns>
/// One array element per <c>href</c> occurrence, in document order (the raw
/// attribute value, without surrounding quotes), or <c>null</c> when the input
/// is null/empty or contains no <c>href</c> attribute.
/// </returns>
string[] GetLink(string strHtml)
{
    // Regex.Matches rejects null input; treat "nothing to scan" like "no links".
    if (string.IsNullOrEmpty(strHtml))
    {
        return null;
    }

    // href, optional whitespace around '=', an optional opening quote, then the
    // value itself (group 1): everything up to a quote, whitespace or '>'.
    // \s already covers \r, so no separate \r is needed in the classes.
    Regex hrefPattern = new Regex(
        @"href\s*=\s*[""']?([^""'\s>]*)[""'\s]?",
        RegexOptions.IgnoreCase);

    MatchCollection matches = hrefPattern.Matches(strHtml);
    if (matches.Count == 0)
    {
        // Callers rely on null (not an empty array) to signal "no links".
        return null;
    }

    string[] links = new string[matches.Count];
    for (int i = 0; i < matches.Count; i++)
    {
        links[i] = matches[i].Groups[1].Value;
    }
    return links;
}
/// <summary>
/// Downloads the resource at <paramref name="strUrl"/> and returns its body as text.
/// </summary>
/// <param name="strUrl">URL of the page to fetch.</param>
/// <returns>
/// The response body decoded as GB2312, or an empty string when the request
/// could not be created or sent (invalid URL, network error, timeout, ...).
/// </returns>
/// <remarks>
/// Failures while creating or sending the request are deliberately swallowed
/// (best-effort fetch). Failures while reading or decoding the body propagate
/// to the caller.
/// NOTE(review): on .NET Core / .NET 5+ the GB2312 code page is presumably only
/// available after registering CodePagesEncodingProvider - confirm for the
/// target runtime.
/// </remarks>
string GetHttp(string strUrl)
{
    WebResponse response;
    try
    {
        WebRequest request = WebRequest.Create(strUrl);
        request.Timeout = 60000; // milliseconds
        response = request.GetResponse();
    }
    catch (Exception)
    {
        // Best-effort: any failure to obtain a response yields an empty page.
        return "";
    }

    // Read outside the try block so read errors are not silently swallowed;
    // the using statements release the connection even if reading throws.
    using (response)
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("GB2312")))
    {
        return reader.ReadToEnd();
    }
}
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Extracts the target of every <c>href</c> attribute found in an HTML string.
/// </summary>
/// <param name="strHtml">HTML text to scan. May be null or empty.</param>
/// <returns>
/// One array element per <c>href</c> occurrence, in document order (the raw
/// attribute value, without surrounding quotes), or <c>null</c> when the input
/// is null/empty or contains no <c>href</c> attribute.
/// </returns>
string[] GetLink(string strHtml)
{
    // Regex.Matches rejects null input; treat "nothing to scan" like "no links".
    if (string.IsNullOrEmpty(strHtml))
    {
        return null;
    }

    // href, optional whitespace around '=', an optional opening quote, then the
    // value itself (group 1): everything up to a quote, whitespace or '>'.
    // \s already covers \r, so no separate \r is needed in the classes.
    Regex hrefPattern = new Regex(
        @"href\s*=\s*[""']?([^""'\s>]*)[""'\s]?",
        RegexOptions.IgnoreCase);

    MatchCollection matches = hrefPattern.Matches(strHtml);
    if (matches.Count == 0)
    {
        // Callers rely on null (not an empty array) to signal "no links".
        return null;
    }

    string[] links = new string[matches.Count];
    for (int i = 0; i < matches.Count; i++)
    {
        links[i] = matches[i].Groups[1].Value;
    }
    return links;
}
/// <summary>
/// Downloads the resource at <paramref name="strUrl"/> and returns its body as text.
/// </summary>
/// <param name="strUrl">URL of the page to fetch.</param>
/// <returns>
/// The response body decoded as GB2312, or an empty string when the request
/// could not be created or sent (invalid URL, network error, timeout, ...).
/// </returns>
/// <remarks>
/// Failures while creating or sending the request are deliberately swallowed
/// (best-effort fetch). Failures while reading or decoding the body propagate
/// to the caller.
/// NOTE(review): on .NET Core / .NET 5+ the GB2312 code page is presumably only
/// available after registering CodePagesEncodingProvider - confirm for the
/// target runtime.
/// </remarks>
string GetHttp(string strUrl)
{
    WebResponse response;
    try
    {
        WebRequest request = WebRequest.Create(strUrl);
        request.Timeout = 60000; // milliseconds
        response = request.GetResponse();
    }
    catch (Exception)
    {
        // Best-effort: any failure to obtain a response yields an empty page.
        return "";
    }

    // Read outside the try block so read errors are not silently swallowed;
    // the using statements release the connection even if reading throws.
    using (response)
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("GB2312")))
    {
        return reader.ReadToEnd();
    }
}
可首先使用 GetHttp 获取指定 URL 的页面内容，然后将此内容作为参数传给 GetLink；GetLink 返回的 string 数组中的每个元素代表一个超链接。若页面中没有任何超链接，GetLink 返回 null，调用方在遍历结果前应先判空。