C# 中如何得到网页中的 URL

//得到网页源码       

/// <summary>
/// Downloads the page at <paramref name="http"/> with an HTTP GET request and
/// returns its source split into lines.
/// </summary>
/// <param name="http">Address of the page; surrounding whitespace is trimmed.</param>
/// <returns>
/// One element per line of the response body, in order. The array is exactly as
/// long as the number of lines read (no null padding) and is empty when the
/// response status is not 200 OK.
/// </returns>
private string[] GetHtml(string http)
{
    // A growable list replaces the former fixed 2048-slot array, which threw
    // IndexOutOfRangeException on long pages.
    System.Collections.Generic.List<string> lines = new System.Collections.Generic.List<string>();

    HttpWebRequest cnblogs = (HttpWebRequest)WebRequest.Create(http.Trim());
    cnblogs.Accept = "image/gif, image/x-xbitmap, image/jpeg,image/pjpeg, application/x-shockwave-flash, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/vnd.ms-excel, application/vn";
    cnblogs.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; windows NT 5.1; NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; Infopath.2)";
    cnblogs.Method = "GET";

    // Dispose the response so the underlying connection is released.
    using (HttpWebResponse cnblogsRespone = (HttpWebResponse)cnblogs.GetResponse())
    {
        if (cnblogsRespone != null && cnblogsRespone.StatusCode == HttpStatusCode.OK)
        {
            using (StreamReader sr = new StreamReader(cnblogsRespone.GetResponseStream()))
            {
                // Read each line exactly once. Calling ReadLine() in the loop
                // condition and again in the body discarded every other line.
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    lines.Add(line);
                }
            }
        }
    }

    return lines.ToArray();
}

/// <summary>
/// Fetches the page whose address is in <c>textBox1</c> and collects the opening
/// <c>&lt;a ... href=...&gt;</c> tag from every source line that contains a link
/// starting with "http".
/// </summary>
/// <returns>
/// The matched tag text, one element per matching line (first match on that line
/// only). Lines without a match contribute nothing, so the array has no null entries.
/// </returns>
private string[] GetHref()
{
    System.Collections.Generic.List<string> anchors = new System.Collections.Generic.List<string>();

    foreach (string line in GetHtml(textBox1.Text.Trim()))
    {
        if (line == null)
        {
            continue;
        }

        // Links whose target starts with "http". To accept any target, replace
        // `(\"|')+http` with `(\"|')?` in the pattern.
        Match m = Regex.Match(line, "(?m)<a[^><]+href=(\"|')+http(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)");
        if (m.Success)
        {
            anchors.Add(m.Value);
        }
    }

    return anchors.ToArray();
}

 

//得到URL

        private string GetUrl(string Href)

        {

            int first_href = Href.IndexOf("href");

            string s1 = Href.Substring(first_href);

            int n1 = s1.IndexOf("=");

            int end_href = s1.IndexOf("/a");

            string s2 = s1.Substring(n1 + 1, end_href - n1);

            int begin = 0, end = 0;

            if (s2.Contains('/"'))

            {

                begin = s2.IndexOf("/"");

                end = s2.LastIndexOf("/"");

                return s2.Substring(begin + 1, end - 1);

            }

            else if (s2.Contains('/''))

            {

                begin = s2.IndexOf("/'");

                end = s2.LastIndexOf("/'");

                return s2.Substring(begin + 1, end - 1);

            }

            else

            {

                end = s2.IndexOf(">");

                return s2.Substring(begin, end);

            }

        }

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值