在.net中可以使用XmlHttp,WebClient,HttpWebRequest等方式下载网页html源码。
使用XmlHttp需要引用Microsoft XML(MSXML)COM组件。使用HttpWebRequest时,如果网站采用了反爬虫技术,则需要模拟浏览器环境(例如设置User-Agent、Accept、Cookie等请求头)进行访问,才能返回相应的html源码,否则返回的内容将为空,如下所示:
例如:某电子商务网站中有站内搜索
查看源码或浏览器上的URL
这样就可以使用GET直接请求。
{
// Some sites apply anti-crawler checks: the request has to look like it comes from a browser,
// otherwise the expected data is not returned. The fields below hold the browser-like values
// that GetHtmlSourceCode copies onto every request.
// NOTE(review): the string literals carry leading/trailing spaces, which looks like a
// formatting artifact of the original paste - confirm before relying on the exact values.

// Cookie container assigned to each request; being static, one instance is shared by all calls.
private static CookieContainer cookie = new CookieContainer();
// Value assigned to the request's ContentType property.
private static string contentType = " application/x-www-form-urlencoded; " ;
// Value assigned to the request's Accept property.
private static string accept = " image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */* " ;
// Value assigned to the request's UserAgent property (an Internet Explorer 7 identification string).
private static string userAgent = " Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022) " ;
/// <summary>
/// Downloads the HTML source of a page with an HTTP GET request that carries browser-like
/// headers (User-Agent, Accept, Content-Type and the shared cookie container).
/// Failures are reported through the returned tuple instead of being thrown.
/// </summary>
/// <param name="url">Base address the escaped keyword is appended to, e.g. a search URL ending in "?q=".</param>
/// <param name="keyword">Search keyword. It is percent-encoded as a URI data component before being appended; null is treated as an empty string.</param>
/// <param name="encoding">Text encoding used to decode the response body.</param>
/// <param name="newUrl">Receives the address that was actually requested (<paramref name="url"/> followed by the escaped keyword).</param>
/// <returns>
/// A tuple where Item1 tells whether the download succeeded, Item2 is the page source
/// (empty on failure) and Item3 is the exception message (empty on success).
/// </returns>
public static Tuple<bool, string, string> GetHtmlSourceCode(string url, string keyword, Encoding encoding, out string newUrl)
{
    bool methodStatus = false;
    string pageHtml = "", exceptionInfo = "";

    // EscapeDataString also escapes reserved characters such as '#', '&', '+' and '=',
    // so a keyword like "C#" or "a&b" cannot truncate or split the query string.
    newUrl = url + System.Uri.EscapeDataString(keyword ?? string.Empty);

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(newUrl);
        request.UserAgent = userAgent;
        request.ContentType = contentType;
        request.CookieContainer = cookie;
        request.Accept = accept;
        // The verb must not contain whitespace; a padded value is rejected with ArgumentException.
        request.Method = "GET";
        request.Timeout = 30 * 1000;

        // The using statements release the response, its stream and the reader even when reading fails.
        using (WebResponse response = request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, encoding))
        {
            pageHtml = reader.ReadToEnd();
        }

        methodStatus = true;
    }
    catch (Exception err)
    {
        // Covers WebException (network/HTTP errors) as well as malformed URLs and invalid arguments.
        exceptionInfo = err.Message;
    }

    return Tuple.Create(methodStatus, pageHtml, exceptionInfo);
}
调用时,直接将url和关键词组合后以GET方式就可以获取。
下载方法通过指定的URI从远程服务器下载数据到本地应用程序。
1. 获得远程服务器url地址;
2. 获得目标文件路径;
3. 使用WebRequest对象检查文件是否存在于服务器端 (导入命名空间System.Net的引用);
4. HTTP:创建WebClient(位于System.Net命名空间,类似于上面提到的UploadFile方法)实例,调用其DownloadData()方法,通过指定的URI将文件下载到内存缓冲区。实际上,对于HTTP资源,使用的是"GET"方法。
FTP:创建FtpWebRequest实例,通过使用WebRequestMethods.Ftp.DownloadFile方法,我们可以接受来自服务器的资源流,此方法使用"RETR"命令下载FTP资源;
5. DownloadData方法会返回下载资源的字节数组,我们只需要使用FileStream(需引用System.IO)将该缓冲区中的字节写入本地目标路径;
6. 最后关闭并释放FileStream资源。
可参阅:WebClient 以及WebClient.DownloadData。
/// <summary>
/// Base class for remote file downloads. It stores the source URL and the destination
/// directory; derived classes override <see cref="DownloadFile"/> with a protocol-specific
/// implementation.
/// </summary>
public class RemoteDownload
{
    /// <summary>Address of the remote file to download.</summary>
    public string UrlString { get; set; }

    /// <summary>Local directory the downloaded file is written to.</summary>
    public string DestDir { get; set; }

    /// <summary>
    /// Creates a download description from a source address and a destination directory.
    /// </summary>
    /// <param name="urlString">Address of the remote file.</param>
    /// <param name="destDir">Local destination directory.</param>
    public RemoteDownload(string urlString, string destDir)
    {
        this.UrlString = urlString;
        this.DestDir = destDir;
    }

    /// <summary>
    /// Downloads the file from the remote server. The base implementation performs no
    /// work and simply reports success.
    /// </summary>
    /// <returns>Always true in the base class.</returns>
    public virtual bool DownloadFile()
    {
        return true;
    }
}
/// <summary>
/// Downloads a file over HTTP: the resource is fetched into memory with
/// WebClient.DownloadData and then written into the destination directory under the
/// file name taken from the URL.
/// </summary>
public class HttpRemoteDownload : RemoteDownload
{
    /// <summary>
    /// Creates an HTTP downloader.
    /// </summary>
    /// <param name="urlString">Address of the remote file.</param>
    /// <param name="descFilePath">Local destination directory (forwarded to the base class as its destination directory).</param>
    public HttpRemoteDownload(string urlString, string descFilePath)
        : base(urlString, descFilePath)
    {
    }

    /// <summary>
    /// Downloads the file addressed by UrlString into DestDir.
    /// </summary>
    /// <returns>true when the file was downloaded and written.</returns>
    /// <exception cref="Exception">
    /// Thrown when the URL cannot be turned into a request or when downloading/writing
    /// fails; the original error is available through InnerException.
    /// </exception>
    public override bool DownloadFile()
    {
        string fileName = System.IO.Path.GetFileName(this.UrlString);
        string descFilePath = System.IO.Path.Combine(this.DestDir, fileName);

        try
        {
            // Creating the request only validates the URL and its scheme; it does not
            // contact the server, so a missing remote file is detected by the download below.
            WebRequest.Create(this.UrlString);
        }
        catch (Exception ex)
        {
            // Pass the caught exception itself: its InnerException is usually null,
            // which would discard the real cause.
            throw new Exception(" 服务器上不存在对应文件 ", ex);
        }

        try
        {
            byte[] fileData;
            using (WebClient client = new WebClient())
            {
                fileData = client.DownloadData(this.UrlString);
            }

            // FileMode.Create truncates an existing file, so no stale bytes remain after
            // the new content when the previous file was longer.
            using (FileStream fs = new FileStream(descFilePath, FileMode.Create))
            {
                fs.Write(fileData, 0, fileData.Length);
            }

            return true;
        }
        catch (Exception ex)
        {
            throw new Exception(" 下载失败 ", ex);
        }
    }
}
/// <summary>
/// FtpRemoteDownload class: downloads a file from an FTP server.
/// </summary>
public class FtpRemoteDownload : RemoteDownload
{
/// <summary>
/// Creates an FTP downloader; both arguments are handed unchanged to the base constructor.
/// </summary>
/// <param name="urlString">Address of the remote file.</param>
/// <param name="descFilePath">Local destination directory (the base class stores it as its destination directory).</param>
public FtpRemoteDownload(string urlString, string descFilePath) : base(urlString, descFilePath) { }
/// <summary>
/// Downloads the file addressed by UrlString from an FTP server (binary transfer, FTP
/// download method) and writes it into DestDir under the file name taken from the URL.
/// </summary>
/// <returns>true when the file was downloaded and written.</returns>
/// <exception cref="Exception">
/// Thrown when the request, the transfer or the local write fails; the original error is
/// available through InnerException.
/// </exception>
public override bool DownloadFile()
{
    string fileName = System.IO.Path.GetFileName(this.UrlString);
    string descFilePath = System.IO.Path.Combine(this.DestDir, fileName);

    try
    {
        FtpWebRequest reqFTP = (FtpWebRequest)WebRequest.Create(this.UrlString);
        reqFTP.Method = WebRequestMethods.Ftp.DownloadFile;
        reqFTP.UseBinary = true;

        // The local file is opened only after the server has answered, so a failed request
        // neither creates an empty file nor truncates an existing one. FileMode.Create
        // truncates on success so no stale bytes remain from a longer previous file.
        using (FtpWebResponse response = (FtpWebResponse)reqFTP.GetResponse())
        using (Stream ftpStream = response.GetResponseStream())
        using (FileStream outputStream = new FileStream(descFilePath, FileMode.Create))
        {
            // Copy in 2048-byte chunks, the same buffer size the manual read loop used.
            ftpStream.CopyTo(outputStream, 2048);
        }

        return true;
    }
    catch (Exception ex)
    {
        // Pass the caught exception itself: its InnerException is usually null,
        // which would discard the real cause.
        throw new Exception(" 下载失败 ", ex);
    }
}