//得到网页源码
/// <summary>
/// Downloads the page at <paramref name="http"/> with an HTTP GET and returns its body split into lines.
/// </summary>
/// <param name="http">Address of the page to fetch; surrounding whitespace is trimmed before use.</param>
/// <returns>
/// The lines of the response body in order, starting at index 0. The array is never shorter than
/// 2048 elements; slots after the last line are <c>null</c>, so callers must skip null entries.
/// When the response status is not 200 OK, every slot is <c>null</c>.
/// </returns>
private string[] GetHtml(string http)
{
    // Earlier versions always returned a fixed 2048-slot array. The minimum length is kept so that
    // callers indexing up to that bound keep working, while longer pages no longer overflow.
    const int MinimumSlots = 2048;

    System.Collections.Generic.List<string> lines = new System.Collections.Generic.List<string>(MinimumSlots);

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(http.Trim());
    request.Accept = "image/gif, image/x-xbitmap, image/jpeg,image/pjpeg, application/x-shockwave-flash, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/vnd.ms-excel, application/vn";
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; windows NT 5.1; NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; Infopath.2)";
    request.Method = "GET";

    // Dispose the response so the underlying connection is released even if reading fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response != null && response.StatusCode == HttpStatusCode.OK)
        {
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                // Read each line exactly once: calling ReadLine() in the loop condition and again
                // in the body would silently drop every other line.
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    lines.Add(line);
                }
            }
        }
    }

    string[] html = new string[lines.Count > MinimumSlots ? lines.Count : MinimumSlots];
    lines.CopyTo(html);
    return html;
}
//得到源码中有URL的行
/// <summary>
/// Fetches the page whose address is in <c>textBox1</c> and collects, for each source line,
/// the first opening <c>&lt;a ...&gt;</c> tag whose <c>href</c> value starts with "http".
/// </summary>
/// <returns>
/// The matched anchor tags packed from index 0 in page order. The array is never shorter than
/// 2048 elements; slots after the last match are <c>null</c>, so callers must skip null entries.
/// </returns>
private string[] GetHref()
{
    // Earlier versions always returned a fixed 2048-slot array; keep that minimum length so
    // callers indexing up to that bound keep working.
    const int MinimumSlots = 2048;

    // Opening <a> tag with a quoted href beginning with "http". The trailing lazy "text" group
    // matches the empty string, so the match value ends at the tag's closing '>'.
    const string AnchorPattern = "(?m)<a[^><]+href=(\"|')+http(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)";

    System.Collections.Generic.List<string> anchors = new System.Collections.Generic.List<string>();

    foreach (string line in GetHtml(textBox1.Text.Trim()))
    {
        if (line == null)
        {
            continue; // unused slot in the line array
        }

        Match match = Regex.Match(line, AnchorPattern);
        if (match.Success)
        {
            anchors.Add(match.Value);
        }
    }

    string[] result = new string[anchors.Count > MinimumSlots ? anchors.Count : MinimumSlots];
    anchors.CopyTo(result);
    return result;
}
//得到URL
/// <summary>
/// Extracts the value of the first <c>href</c> attribute found in an anchor tag fragment.
/// </summary>
/// <param name="Href">Markup containing an <c>href=...</c> attribute, e.g. <c>&lt;a href="http://x"&gt;</c>.</param>
/// <returns>
/// The attribute value without its surrounding quotes. A quoted value ends at the matching quote;
/// an unquoted value ends at the first whitespace or '&gt;'. Returns an empty string when the input
/// is null or empty, or contains no <c>href=</c>.
/// </returns>
private string GetUrl(string Href)
{
    if (string.IsNullOrEmpty(Href))
    {
        return string.Empty;
    }

    int hrefIndex = Href.IndexOf("href", System.StringComparison.OrdinalIgnoreCase);
    if (hrefIndex < 0)
    {
        return string.Empty;
    }

    int equalsIndex = Href.IndexOf('=', hrefIndex);
    if (equalsIndex < 0)
    {
        return string.Empty;
    }

    // Skip optional whitespace between '=' and the value.
    int valueStart = equalsIndex + 1;
    while (valueStart < Href.Length && char.IsWhiteSpace(Href[valueStart]))
    {
        valueStart++;
    }

    if (valueStart >= Href.Length)
    {
        return string.Empty;
    }

    int valueEnd;
    char opener = Href[valueStart];
    if (opener == '"' || opener == '\'')
    {
        // Quoted value: stop at the matching closing quote rather than the last quote in the
        // tag, so later attributes (class="...", title="...") are not swallowed into the URL.
        valueStart++;
        valueEnd = Href.IndexOf(opener, valueStart);
    }
    else
    {
        // Unquoted value: runs until whitespace or the end of the tag.
        valueEnd = Href.IndexOfAny(new char[] { '>', ' ', '\t', '\r', '\n' }, valueStart);
    }

    if (valueEnd < 0)
    {
        // Unterminated value: take everything that is left.
        valueEnd = Href.Length;
    }

    return Href.Substring(valueStart, valueEnd - valueStart);
}