获取某个网页中的全部 URL 及对应的名称(使用 C# 实现的类)

        效果:

 

 

主类:

/// <summary>
        /// Downloads a web page and collects the hyperlinks (<c>&lt;a href="..."&gt;</c>)
        /// found in it, together with the inner text/markup of each anchor.
        /// </summary>
        /// <remarks>
        /// Links are compared after their <c>#fragment</c> has been stripped; when the
        /// same link occurs more than once only its last occurrence is kept.
        /// Root-relative links (<c>/path</c>) and protocol-relative links
        /// (<c>//host/path</c>) are expanded against the page address.
        /// </remarks>
        /// <param name="url">Address of the page to fetch. A value starting with "www" gets "http://" prepended.</param>
        /// <param name="beginContent">Optional marker; when found (case-insensitively), only the text after it is scanned.</param>
        /// <param name="endContent">Optional marker; when found (case-insensitively), only the text before it is scanned.</param>
        /// <param name="code">Name of the page encoding; "utf-8" is used when null or empty.</param>
        /// <returns>
        /// A collection mapping each link to its anchor content, or <c>null</c> when
        /// <paramref name="url"/> is null/empty or the page could not be fetched or parsed.
        /// </returns>
        public NameValueCollection linkUrl_Group(string url, string beginContent, string endContent, string code)
        {
            // An empty address has always produced null; a null one now does so
            // without going through an exception.
            if (string.IsNullOrEmpty(url))
                return null;

            try
            {
                if (url.StartsWith("www", StringComparison.Ordinal))
                    url = "http://" + url;
                Uri u = new Uri(url);

                if (string.IsNullOrEmpty(code))
                    code = "utf-8";
                // Resolve the encoding before opening the connection so an unknown
                // encoding name fails without leaving a stream behind.
                Encoding encoding = Encoding.GetEncoding(code);

                string content;
                using (WebClient client = new WebClient())
                {
                    client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
                    using (Stream data = client.OpenRead(url))
                    using (StreamReader reader = new StreamReader(data, encoding))
                    {
                        content = reader.ReadToEnd();
                    }
                }

                // Narrow the text to the region between the two markers.
                // ">= 0" so that a marker located at the very start is honoured too.
                if (!string.IsNullOrEmpty(beginContent))
                {
                    int m = content.IndexOf(beginContent, StringComparison.OrdinalIgnoreCase);
                    if (m >= 0)
                        content = content.Substring(m + beginContent.Length);
                }
                if (!string.IsNullOrEmpty(endContent))
                {
                    int n = content.IndexOf(endContent, StringComparison.OrdinalIgnoreCase);
                    if (n >= 0)
                        content = content.Substring(0, n);
                }

                // Match the url and the title of every anchor. "\s" (whitespace) replaces
                // the former "/s", which required a literal "/s" after "<a" and therefore
                // never matched a normal anchor tag. "[^>]*?" keeps the attribute scan
                // inside the opening tag.
                Regex regUrl = new Regex(@"<a\s[^>]*?href=[""'](?<url>.*?)[""'](.*?)>(?<title>.*?)</a>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
                MatchCollection matches = regUrl.Matches(content);
                NameValueCollection myCol = new NameValueCollection();

                // First pass: strip fragments and remember where each link occurs last.
                string[] links = new string[matches.Count];
                System.Collections.Generic.Dictionary<string, int> lastIndex =
                    new System.Collections.Generic.Dictionary<string, int>();
                for (int i = 0; i < matches.Count; i++)
                {
                    string link = matches[i].Groups["url"].Value;
                    int fragment = link.IndexOf('#');
                    if (fragment > -1)
                        link = link.Substring(0, fragment);
                    links[i] = link;
                    lastIndex[link] = i;
                }

                // Second pass: emit only the last occurrence of each link, in page order.
                for (int i = 0; i < links.Length; i++)
                {
                    string link = links[i];
                    if (lastIndex[link] != i)
                        continue;

                    if (link.StartsWith("//", StringComparison.Ordinal))
                    {
                        // Protocol-relative link: it already names its host, only the scheme is missing.
                        link = u.Scheme + ":" + link;
                    }
                    else if (link.StartsWith("/", StringComparison.Ordinal))
                    {
                        // Root-relative link: keep the page's scheme and port, not just its host.
                        link = u.GetLeftPart(UriPartial.Authority) + link;
                    }
                    myCol.Add(link, matches[i].Groups["title"].Value);
                }
                return myCol;
            }
            catch (Exception)
            {
                // Existing callers rely on null to signal any failure
                // (bad address, unknown encoding, network error).
                return null;
            }
        }

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值