asp.net C#抓取网页链接

采用了正则表达式来匹配链接,具体可以看这里,不过我用的不是这个。呵呵

代码还有点粗糙,比如还没有实现把结果统一写到 XML 中,然后再显示出来。

还有些东西还要过滤,一点一点来吧。先记录一下,免得以后忘记。

default.aspx

<%@ Page Language="C#" AutoEventWireup="true"  CodeFile="Default.aspx.cs" Inherits="_Default" ValidateRequest="false" %>
<%-- NOTE(review): ValidateRequest is switched off, presumably so that posting back the
     markup held in ContentHtml is not rejected by request validation. Confirm this is
     intended before exposing the page publicly. --%>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" >
<head runat="server">
    <title>Untitled Page</title>
</head>
<body>
    <%-- Attribute values are quoted (method="post") as required by the XHTML doctype above. --%>
    <form id="aspBuffer" method="post" runat="server">
    <div align="center" style="FONT-WEIGHT: bold">得到任意网页源代码</div>
    <br />
    <div>
        <%-- Address of the page to download. --%>
        <asp:TextBox ID="UrlText" runat="server" Style="z-index: 100; left: 9px; position: absolute; top: 47px" Width="400px"></asp:TextBox>
        &nbsp;&nbsp;
        <%-- Downloads the page at UrlText and shows its source in ContentHtml. --%>
        <asp:Button id="WebRequestButton" runat="server" Text="用WebRequest得到" style="z-index: 101; left: 444px; position: absolute; top: 45px" OnClick="WebRequestButton_Click"></asp:Button>
        &nbsp; &nbsp;
        <%-- Shows the downloaded source, or the extracted links after getUrl is clicked. --%>
        <asp:TextBox id="ContentHtml" runat="server" Width="100%" Height="360px" TextMode="MultiLine" style="z-index: 102; left: 3px; position: absolute; top: 92px"></asp:TextBox>
        <%-- Replaces the content of ContentHtml with the links found in it. --%>
        <asp:Button ID="getUrl" runat="server" OnClick="getUrl_Click" Style="z-index: 104; left: 675px; position: absolute; top: 45px" Text="得到网页链接" />
    </div>
    </form>
</body>
</html>

default.aspx.cs

using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;


/// <summary>
/// Code-behind for Default.aspx: downloads the source of a user-supplied web page
/// and extracts the distinct http/https links found in it.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    // Absolute http/https URLs containing at least one dot. Quotes and angle brackets
    // terminate a match so that surrounding attribute/tag syntax is not captured.
    // Built once and reused across requests.
    private static readonly Regex HyperLinkRegex = new Regex(
        @"https?://[^\s""'<>]+\.[^\s""'<>]+",
        RegexOptions.IgnoreCase);

    /// <summary>Address most recently entered in the URL text box.</summary>
    public string urlPage = "";

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Downloads the page whose address is in <c>UrlText</c> and shows its
    /// HTML-encoded source in <c>ContentHtml</c>. Invalid addresses and download
    /// failures are reported in <c>ContentHtml</c> instead of surfacing as an
    /// unhandled exception.
    /// </summary>
    protected void WebRequestButton_Click(object sender, EventArgs e)
    {
        urlPage = UrlText.Text.Trim();

        // Only absolute http/https addresses are accepted: WebRequest.Create would
        // otherwise throw on empty/malformed input and would also serve file:// URIs,
        // letting a visitor read files from the server.
        Uri target;
        if (!Uri.TryCreate(urlPage, UriKind.Absolute, out target)
            || (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps))
        {
            ContentHtml.Text = "请输入有效的 http:// 或 https:// 地址。";
            return;
        }

        try
        {
            WebRequest request = WebRequest.Create(target);

            // using blocks release the response, stream and reader even when
            // reading fails part-way through.
            using (WebResponse response = request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            using (StreamReader sr = new StreamReader(resStream, Encoding.Default))
            {
                // NOTE(review): the body is decoded with the server's default ANSI
                // code page; pages in another encoding will come out garbled.
                ContentHtml.Text = Server.HtmlEncode(sr.ReadToEnd());
            }
        }
        catch (WebException ex)
        {
            ContentHtml.Text = "获取网页失败:" + ex.Message;
        }
    }

    /// <summary>
    /// Replaces the content of <c>ContentHtml</c> with the distinct links found
    /// in it, sorted, one per line.
    /// </summary>
    protected void getUrl_Click(object sender, EventArgs e)
    {
        // WebRequestButton_Click stores the source HTML-encoded, so decode it first;
        // otherwise encoded quotes and brackets would be captured as part of a URL.
        string html = Server.HtmlDecode(ContentHtml.Text);
        ArrayList allLinks = GetHyperLinks(html);

        StringBuilder output = new StringBuilder();
        foreach (string link in allLinks)
        {
            output.Append(link).Append(Environment.NewLine);
        }

        ContentHtml.Text = output.ToString();
    }

    /// <summary>
    /// Extracts the distinct http/https URLs from <paramref name="htmlCode"/>.
    /// </summary>
    /// <param name="htmlCode">Text to scan; null or empty yields an empty list.</param>
    /// <returns>A sorted <see cref="ArrayList"/> of unique URL strings.</returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList links = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return links;
        }

        // Hashtable gives constant-time duplicate detection instead of rescanning
        // the result list for every match.
        Hashtable seen = new Hashtable();
        foreach (Match match in HyperLinkRegex.Matches(htmlCode))
        {
            string link = match.Value;
            if (!seen.ContainsKey(link))
            {
                seen.Add(link, null);
                links.Add(link);
            }
        }

        links.Sort();
        return links;
    }
}

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值