c#中如何得到网页中的URL

最新推荐文章于 2022-05-04 13:50:27 发布

super_duck

最新推荐文章于 2022-05-04 13:50:27 发布

阅读量588

点赞数

文章标签： url c# string textbox html null

本文链接：https://blog.csdn.net/super_duck/article/details/5994129

版权

//得到网页源码

/// <summary>
/// Downloads the page at <paramref name="http"/> with a GET request and returns
/// its source text split into lines.
/// </summary>
/// <param name="http">Address of the page to fetch; surrounding whitespace is trimmed.</param>
/// <returns>
/// A 2048-element array. The leading entries hold the page's lines in order and the
/// unused trailing entries are null. Lines after the 2048th are dropped. Every entry
/// is null when the response status is not 200 OK.
/// </returns>
private string[] GetHtml(string http)
{
    string[] html = new string[2048];
    HttpWebRequest cnblogs = (HttpWebRequest)WebRequest.Create(http.Trim());
    // NOTE(review): the Accept value ends in "application/vn", which looks truncated;
    // it is kept byte-for-byte because the intended tail is unknown.
    cnblogs.Accept = "image/gif, image/x-xbitmap, image/jpeg,image/pjpeg, application/x-shockwave-flash, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/vnd.ms-excel, application/vn";
    cnblogs.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; windows NT 5.1; NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; Infopath.2)";
    cnblogs.Method = "GET";

    // Dispose the response on every path so the connection is released.
    using (HttpWebResponse cnblogsRespone = (HttpWebResponse)cnblogs.GetResponse())
    {
        if (cnblogsRespone == null || cnblogsRespone.StatusCode != HttpStatusCode.OK)
        {
            return html;
        }

        using (StreamReader sr = new StreamReader(cnblogsRespone.GetResponseStream()))
        {
            int i = 0;
            string line;
            // Read each line exactly once: testing ReadLine() in the condition and calling
            // it again in the body would throw away every other line. The index check
            // keeps a long page from writing past the end of the fixed-size buffer.
            while (i < html.Length && (line = sr.ReadLine()) != null)
            {
                html[i] = line;
                i++;
            }
        }
    }

    return html;
}
//得到源码中有URL的行

/// <summary>
/// Fetches the page whose address is typed into <c>textBox1</c> and, for each source
/// line, captures the first anchor tag whose href value starts with "http".
/// </summary>
/// <returns>
/// A 2048-element array. Entry <c>i</c> corresponds to the i-th non-null line returned
/// by <c>GetHtml</c>: it holds the matched anchor text, or null when that line has no
/// such anchor. Callers must skip null entries.
/// </returns>
private string[] GetHref()
{
    string[] Href = new string[2048];
    int i = 0;
    foreach (string s in GetHtml(textBox1.Text.Trim()))
    {
        if (s == null)
        {
            continue;
        }

        if (i >= Href.Length)
        {
            // Result buffer is full; stop instead of indexing past its end.
            break;
        }

        // Anchors whose href begins with http. The quote and class escapes must be
        // backslashes (\" \\s \\w \\W); written with forward slashes the literal ends
        // at the first quote and the method does not compile.
        // Alternative pattern accepting any href value, not only http ones:
        //   "(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)"
        Match m = Regex.Match(s, "(?m)<a[^><]+href=(\"|')+http(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)");
        if (m.Success)
        {
            Href[i] = m.Value;
        }

        // Advance for every non-null line so slots stay aligned with line order.
        i++;
    }

    return Href;
}

//得到URL

/// <summary>
/// Extracts the value of the first <c>href</c> attribute found in a piece of anchor markup.
/// </summary>
/// <param name="Href">
/// Anchor markup such as <c>&lt;a href="http://example.com" class="x"&gt;</c>. The closing
/// <c>&lt;/a&gt;</c> tag is not required.
/// </param>
/// <returns>
/// The attribute value without its surrounding quotes. For an unquoted value, the text up
/// to the next whitespace or '&gt;'. An empty string when <paramref name="Href"/> is null,
/// empty, or contains no <c>href=</c> attribute.
/// </returns>
private string GetUrl(string Href)
{
    if (string.IsNullOrEmpty(Href))
    {
        return string.Empty;
    }

    const string attributeName = "href";
    int searchFrom = 0;

    while (searchFrom < Href.Length)
    {
        int attr = Href.IndexOf(attributeName, searchFrom, System.StringComparison.OrdinalIgnoreCase);
        if (attr < 0)
        {
            break;
        }

        int pos = attr + attributeName.Length;
        while (pos < Href.Length && char.IsWhiteSpace(Href[pos]))
        {
            pos++;
        }

        // Only an occurrence followed by '=' is the attribute; otherwise keep looking
        // (the text "href" can also appear inside another value).
        if (pos >= Href.Length || Href[pos] != '=')
        {
            searchFrom = attr + attributeName.Length;
            continue;
        }

        pos++;
        while (pos < Href.Length && char.IsWhiteSpace(Href[pos]))
        {
            pos++;
        }

        if (pos >= Href.Length)
        {
            return string.Empty;
        }

        char quote = Href[pos];
        if (quote == '"' || quote == '\'')
        {
            int start = pos + 1;
            // Close on the NEXT matching quote, not the last one in the string, so the
            // quotes of later attributes are never swallowed into the URL.
            int close = Href.IndexOf(quote, start);
            if (close < 0)
            {
                // Unterminated quote: take the remainder rather than throwing.
                close = Href.Length;
            }

            return Href.Substring(start, close - start);
        }

        // Unquoted value: runs until whitespace or the end of the tag.
        int stop = pos;
        while (stop < Href.Length && Href[stop] != '>' && !char.IsWhiteSpace(Href[stop]))
        {
            stop++;
        }

        return Href.Substring(pos, stop - pos);
    }

    return string.Empty;
}

super_duck

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
c#中如何得到网页中的URL

//得到网页源码 private string[] GetHtml(string http) { string[] html = new string[2048]; HttpWebRequest cnblogs = (HttpWebRequest)WebRequest.Create(http.Trim()); cnblogs.Accept = "image/gif, image/x-xbitmap, image/j
复制链接

扫一扫