通过Url抓取网页内容

最新推荐文章于 2024-07-02 09:04:22 发布

weixin_30311605

最新推荐文章于 2024-07-02 09:04:22 发布

阅读量720

点赞数

原文链接：http://www.cnblogs.com/kevin-wu/archive/2007/09/20/899786.html

版权

近来想学习一下网页抓取技术，鉴于之前没有这方面的基础，都只是做 socket 方面的编程，对 HTTP 方面了解很少，现在找到个较好的入门例子，共享出来学习一下，如果大家以前看过的话，就当是复习吧。还希望高手可以指导一下如何学习这方面的内容，给点指引。

using System;
using System.Text;
using System.Web;
using System.IO;
using System.Net;

/// <summary>
/// Downloads the resource at <paramref name="rUrl"/>, writes the raw response
/// bytes to the file whose path is resolved by <c>Server.MapPath</c>, and
/// returns the response body decoded as text.
/// </summary>
/// <param name="rUrl">The URL to request. It is passed to <c>WebRequest.Create</c>
/// and the result is cast to <see cref="HttpWebRequest"/>.</param>
/// <returns>The response body as a string, decoded with the charset reported by
/// the response (UTF-8 when that charset is missing or not recognised).</returns>
/// <remarks>
/// Exceptions raised while creating the request, reading the response or
/// writing the file are not caught here; the response, its stream and the
/// output file are disposed in every case.
/// </remarks>
public string ReadUrlContent(string rUrl)
{
    // Accumulates the decoded text of the whole response.
    StringBuilder sb = new StringBuilder();

    // Scratch buffer for each read from the response stream.
    byte[] buf = new byte[8192];

    // Build the request for the page.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(rUrl);

    // Issue the request; the using blocks guarantee the response, the stream and
    // the output file are released even if reading or writing throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream resStream = response.GetResponseStream())
    // NOTE(review): the file name literal contains a leading and a trailing
    // space; it is kept unchanged here — confirm whether that is intentional.
    using (FileStream fs = File.Create(Server.MapPath(" urltext.html ")))
    {
        // A stateful decoder keeps partial multi-byte sequences between reads, so
        // characters that straddle two buffer fills are decoded correctly.
        Encoding encoding = ResolveResponseEncoding(response.CharacterSet);
        Decoder decoder = encoding.GetDecoder();
        char[] chars = new char[encoding.GetMaxCharCount(buf.Length)];

        int count;
        // Read returns 0 once the end of the stream has been reached.
        while ((count = resStream.Read(buf, 0, buf.Length)) > 0)
        {
            // Persist the raw bytes exactly as received.
            fs.Write(buf, 0, count);

            // Decode this chunk and append it to the result text.
            int charCount = decoder.GetChars(buf, 0, count, chars, 0);
            sb.Append(chars, 0, charCount);
        }
    }

    return sb.ToString();
}

/// <summary>
/// Maps a charset label to an <see cref="Encoding"/>, falling back to UTF-8
/// when the label is empty or is not a name the platform recognises.
/// </summary>
/// <param name="charset">The charset label, e.g. the response's CharacterSet; may be null.</param>
/// <returns>The matching encoding, or <see cref="Encoding.UTF8"/> as the fallback.</returns>
private static Encoding ResolveResponseEncoding(string charset)
{
    if (!string.IsNullOrEmpty(charset))
    {
        try
        {
            // Strip whitespace/quotes that may surround the label in a header value.
            return Encoding.GetEncoding(charset.Trim(' ', '"', '\''));
        }
        catch (ArgumentException)
        {
            // Unknown or malformed charset label: use the fallback below.
        }
    }

    return Encoding.UTF8;
}