C#抓取网站内容

前台页面代码(ASPX):
<form id="form1" runat="server">
    <%-- Filled by btnok_ServerClick with the text extracted from the weather page. --%>
    <div id="div_html" runat="server">
    </div>

    <input type="button" id="btnok" runat="server" value="获取温度" onserverclick="btnok_ServerClick" />
    <asp:Button ID="btnNews" runat="server" OnClick="btnNews_Click" Text="新闻" />

    <%-- The caption row lives in HeaderTemplate so it is emitted once instead of once per item.
         Name and Pic come from a scraped third-party page, so they are HTML-encoded before output. --%>
    <table>
        <asp:Repeater ID="rptNews" runat="server">
            <HeaderTemplate>
                <tr>
                    <td>ID</td>
                    <td>Name</td>
                    <td>Pic</td>
                </tr>
            </HeaderTemplate>
            <ItemTemplate>
                <tr>
                    <td><%# Eval("ID") %></td>
                    <td><%# Server.HtmlEncode(Convert.ToString(Eval("Name"))) %></td>
                    <td><%# Server.HtmlEncode(Convert.ToString(Eval("Pic"))) %></td>
                </tr>
            </ItemTemplate>
        </asp:Repeater>
    </table>
</form>


 

后台代码(C#):
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;
using System.Text;

namespace sqlhelper
{
    
public partial class WebForm2 : System.Web.UI.Page
    {
        
/// <summary>
        /// Page load handler. It currently performs no work.
        /// </summary>
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        
#region  抓天气

        
/// <summary>
        /// Click handler of the weather button: downloads a Sina weather search page, pulls
        /// three fragments (city, date, weather) out of it with regular expressions and writes
        /// them, concatenated in that order, into <c>div_html</c>.
        /// </summary>
        protected void btnok_ServerClick(object sender, EventArgs e)
        {
            const string cityDataOpen = "<div class=\"City_Data\">";
            const string weatherOpen = "<div class=\"Weather_TP\">";

            string page = _GetHtml(@"http://php.weather.sina.com.cn/search.php?city=北京");

            // Every pattern has exactly one capturing group; that group is the text to display.
            string[] patterns = new string[]
            {
                cityDataOpen + @"(?:.|\n)*?<h3>(?:[\u4e00-\u9fa5]+)(?:\s+)([\u4e00-\u9fa5]+)</h3>",
                cityDataOpen + @"(?:.|\n)*?<p>(?:\d+[\u4e00-\u9fa5]+)+?-(?:\d+[\u4e00-\u9fa5]+)&nbsp;([\u4e00-\u9fa5]+)</p>",
                weatherOpen + @"[\u4e00-\u9fa5]+\s+(-?\d+℃~-?\d+℃)</div>"
            };

            // A pattern that does not match contributes an empty string.
            string combined = "";
            foreach (string pattern in patterns)
            {
                combined += Regex.Match(page, pattern).Groups[1].Value;
            }

            div_html.InnerHtml = combined;
        }

        
/// <summary>
        /// Downloads the response body of <paramref name="Url"/> and returns it as text.
        /// </summary>
        /// <param name="Url">Address to request.</param>
        /// <returns>
        /// The body decoded with <see cref="Encoding.Default"/>, or the literal "错误" when any
        /// exception is thrown while creating the request or reading the response.
        /// </returns>
        public static string _GetHtml(string Url)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

                // The using blocks release the response, stream and reader even when reading fails.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, Encoding.Default))
                {
                    // Replaces the hand-written 256-char read loop, which re-allocated the
                    // result string on every chunk.
                    return reader.ReadToEnd();
                }
            }
            catch (Exception)
            {
                // Best-effort contract of this helper: callers get a marker string, not an exception.
                return "错误";
            }
        }

        
#endregion

        
#region  抓新闻

        
/// <summary>
        /// Click handler of the news button: scrapes up to 7 thread links from the forum
        /// listing page and binds them to <c>rptNews</c>.
        /// </summary>
        protected void btnNews_Click(object sender, EventArgs e)
        {
            // The URL and the row count are two separate arguments.
            rptNews.DataSource = GetHtmlCode(@"http://bbs.le8le.com/forumdisplay.php?fid=62", 7);

            // Alternative source kept from the original:
            //rptNews.DataSource = GetHtmlCode(@"http://news.sina.com.cn/china/", 3);
            rptNews.DataBind();
        }

        DataTable dt 
= new DataTable();

        
private void Setdt()
        {

            dt.Columns.Add(
"ID");
            dt.Columns.Add(
"Name");
            dt.Columns.Add(
"URL");
            dt.Columns.Add(
"Pic");
        } 

        
/// <summary>
        /// Downloads a forum listing page and collects the thread links found in it.
        /// </summary>
        /// <remarks>
        /// The result is stored in the shared table <c>dt</c>. Its rows are cleared at the start
        /// of every call, so the returned table never holds more than <paramref name="i"/> rows.
        /// Exceptions from the download or from the table operations propagate unchanged.
        /// </remarks>
        /// <param name="Url">Address of the page to scrape.</param>
        /// <param name="i">Maximum number of rows to collect.</param>
        /// <returns>The table <c>dt</c> with the columns ID, Name, URL and Pic.</returns>
        public DataTable GetHtmlCode(string Url, int i)
        {
            string strHtml = GetURlHTML(Url);

            // Create the columns only once; adding them again would throw on a repeated call.
            if (dt.Columns.Count == 0)
            {
                Setdt();
            }
            dt.Clear();

            // [\S\s]*? lazily matches any characters, newlines included. Markup being matched, e.g.:
            // <span id="thread_2224"><a href="viewthread.php?tid=2224&extra=page%3D1">2008-09赛季前瞻</a></span>
            string TempRegex = "<span id=\"([\\S\\s]*?)\"><a href=\"([\\S\\s]*?)\">([\\S\\s]*?)</a></span>";
            Regex regex = new Regex(TempRegex, RegexOptions.Compiled | RegexOptions.IgnoreCase);

            // x is the 1-based row number and doubles as the ID column value.
            int x = 1;
            for (Match match = regex.Match(strHtml); match.Success && x <= i; match = match.NextMatch(), x++)
            {
                DataRow datarow = dt.NewRow();
                datarow["ID"] = x;
                // Group 3 is the link text, group 2 the href (group 1, the span id, is unused).
                datarow["Name"] = match.Groups[3].Value;
                datarow["URL"] = GetBBSUrl(Url, match.Groups[2].Value);
                datarow["Pic"] = datarow["URL"].ToString();

                dt.Rows.Add(datarow);
            }

            // One commit after the loop instead of one per row.
            dt.AcceptChanges();
            return dt;
        }

        
/// <summary>
        /// Builds the full address of a forum link by appending it to the directory part of the
        /// address of the page it was found on.
        /// </summary>
        /// <param name="Url">Address of the page that was scraped.</param>
        /// <param name="BBSUrl">Link taken from that page.</param>
        /// <returns>
        /// <paramref name="Url"/> up to and including the last '/' that precedes any query string
        /// or fragment, followed by <paramref name="BBSUrl"/>.
        /// </returns>
        private string GetBBSUrl(string Url, string BBSUrl)
        {
            // Cut off "?query" / "#fragment" first so a '/' inside them is not taken for a path separator.
            int suffixStart = Url.IndexOfAny(new char[] { '?', '#' });
            string pagePath = suffixStart < 0 ? Url : Url.Substring(0, suffixStart);

            // LastIndexOf yields -1 when there is no '/', which makes the prefix empty.
            string TempUrl = pagePath.Substring(0, pagePath.LastIndexOf('/') + 1);
            return TempUrl + BBSUrl;
        }


        
/// <summary>
        /// Downloads the response body of a URL as text.
        /// </summary>
        /// <param name="Url">Address to request.</param>
        /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
        /// <remarks>
        /// Exceptions raised while creating the request or reading the response propagate to the
        /// caller with their original type and stack trace.
        /// </remarks>
        private string GetURlHTML(string Url)
        {
            WebRequest request = WebRequest.Create(Url);

            // The using blocks close the response and reader even when reading throws.
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }


        
/// <summary>
        /// Downloads one forum thread page and collects the <c>src</c> values of the image tags
        /// found inside its post body.
        /// </summary>
        /// <param name="URl">
        /// NOTE(review): not used at present; a fixed thread address is downloaded instead.
        /// </param>
        /// <returns>
        /// Always an empty string. NOTE(review): the collected image addresses are neither
        /// returned nor saved; confirm the intended result before relying on this method.
        /// </returns>
        private string SaveImgToLocal(string URl)
        {
            DataTable dtimg = new DataTable();
            dtimg.Columns.Add("imgpath");

            // Original parameterised call, kept for reference:
            //string HTML = GetURlHTML(URl);
            string HTML = GetURlHTML(@"http://bbs.le8le.com/viewthread.php?tid=2223&extra=page%3D1");

            // Captures the src value of every self-closed <img ... /> tag.
            string Temp = "<img.*?src=\"(.*?)\".*?/>";

            // Cuts out the post body: from the postmessage_8769 div to the last </div> that is
            // still followed (lookahead) by the post_rate_div_8769 div.
            string Contm = "<div id=\"postmessage_8769\" class=\"t_msgfont\">(.|\\n)*</div>(?=(.|\\n)*?<div id=\"post_rate_div_8769\">)";
            Regex regex = new Regex(Contm, RegexOptions.Compiled | RegexOptions.IgnoreCase);
            string htm = regex.Match(HTML).Groups[0].Value;

            string PicUrl = "";
            regex = new Regex(Temp, RegexOptions.Compiled | RegexOptions.IgnoreCase);
            //PicUrl = regex.Match(htm).Groups[1].Value;

            for (Match match = regex.Match(htm); match.Success; match = match.NextMatch())
            {
                DataRow dtimgrow = dtimg.NewRow();
                dtimgrow["imgpath"] = match.Groups[1].Value;

                dtimg.Rows.Add(dtimgrow);
            }
            dtimg.AcceptChanges();

            return PicUrl;
        }

        
#endregion




    }
}

转载于:https://www.cnblogs.com/reommmm/articles/1343011.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值