c#完整地获取网页内容

最新推荐文章于 2021-09-10 20:34:24 发布

焦三仙本仙

最新推荐文章于 2021-09-10 20:34:24 发布

阅读量1.5k

点赞数

分类专栏： C#专区

本文链接：https://blog.csdn.net/ghevinn/article/details/8737980

版权

C#专区专栏收录该内容

114 篇文章 3 订阅

订阅专栏

之前看了很多获取网页源码的写法，要么有乱码，要么没考虑到 gzip 等压缩。比如有些网页的 HttpWebResponse 对象的 CharacterSet 是 iso-8859-1，这种情况下我们改为根据网页 <meta> 标签中声明的 charset 来读取。

另外还有一个将流转成 byte[] 数组的方法：因为 GetResponseStream() 返回的流不能获取 Length 属性，所以只能分块循环读取。

以下是全部源码

 
         /// <summary>
         /// Reads every remaining byte of <paramref name="stream"/> into a new array.
         /// </summary>
         /// <param name="stream">Source stream; it is read from its current position to the end.</param>
         /// <returns>The bytes that were read, in order.</returns>
         /// <remarks>
         /// If the source stream is seekable it is rewound to offset 0 before returning.
         /// </remarks>
         private static byte[] StreamToBytes(Stream stream)
         {
             byte[] collected;

             using (MemoryStream sink = new MemoryStream())
             {
                 // Copy in 4096-byte chunks; the source is drained until Read reports no more data.
                 stream.CopyTo(sink, 4096);
                 collected = sink.ToArray();
             }

             // Put a seekable source back at its beginning.
             if (stream.CanSeek)
             {
                 stream.Seek(0, SeekOrigin.Begin);
             }

             return collected;
         }
        
         /**  
        
         * 用getBytes(encoding)：返回字符串的一个byte数组  
        
         * 当b[0]为 63时，应该是转码错误  
        
         * A、不乱码的汉字字符串：  
        
         * 1、encoding用GB2312时，每byte是负数；  
        
         * 2、encoding用ISO8859_1时，b[i]全是63。  
        
         * B、乱码的汉字字符串：  
        
         * 1、encoding用ISO8859_1时，每byte也是负数；  
        
         * 2、encoding用GB2312时，b[i]大部分是63。  
        
         * C、英文字符串  
        
         * 1、encoding用ISO8859_1和GB2312时，每byte都大于0；  
        
         * <p/>  
        
         * 总结：给定一个字符串，用getBytes("iso8859_1")  
        
         * 1、如果b[i]有63，不用转码； A-2  
        
         * 2、如果b[i]全大于0，那么为英文字符串，不用转码； C-1  
        
         * 3、如果b[i]有小于0的，那么已经乱码，要转码。 B-1  
        
         */ 
        
         /// <summary>
         /// Downloads <paramref name="url"/> with an HTTP GET request and returns the response body as text.
         /// </summary>
         /// <param name="url">Address of the page to fetch.</param>
         /// <param name="charSet">
         /// Name of the encoding used to decode the body. When null or empty the encoding is chosen
         /// from the response: GB2312 if the response reports "gbk" or "gb2312", otherwise UTF-8.
         /// </param>
         /// <param name="aspnetSessionID">
         /// Optional session id; when non-empty it is sent as the "ASP.NET_SessionId" cookie.
         /// </param>
         /// <returns>The decoded response body.</returns>
         /// <remarks>
         /// Bodies whose Content-Encoding mentions gzip or deflate are decompressed first.
         /// When the response reports the character set "iso-8859-1", the charset declared in the
         /// page's &lt;meta&gt; tag is looked up and, if it names a known encoding with a different
         /// code page, the body is decoded again with that encoding (this applies even when
         /// <paramref name="charSet"/> was supplied).
         /// The response and its stream are always released, including when reading fails.
         /// </remarks>
         public static string DoGet(string url, string charSet = null, string aspnetSessionID = null)
         {
             HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

             if (!String.IsNullOrEmpty(aspnetSessionID))
             {
                 // The session cookie must be named exactly "ASP.NET_SessionId".
                 req.CookieContainer = new CookieContainer();
                 req.CookieContainer.Add(new Uri(url), new Cookie("ASP.NET_SessionId", aspnetSessionID));
             }

             // Some pages need more than this to download (extra cookies, explicit user/password
             // credentials, ...); add overloads for those cases as required.
             req.Credentials = CredentialCache.DefaultCredentials;
             req.Method = "GET";
             req.ContentType = "application/x-www-form-urlencoded";

             using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
             {
                 // Both header-derived values are normalised to non-null strings so the
                 // comparisons below cannot dereference null.
                 string contentEncoding = res.ContentEncoding ?? String.Empty;
                 string headerCharSet = res.CharacterSet ?? String.Empty;

                 // Ordinal, case-insensitive matching: ToLower() is culture-sensitive and can
                 // mangle ASCII tokens under some cultures.
                 Stream receiveStream = res.GetResponseStream();
                 if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
                 {
                     receiveStream = new GZipStream(receiveStream, CompressionMode.Decompress);
                 }
                 else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
                 {
                     receiveStream = new DeflateStream(receiveStream, CompressionMode.Decompress);
                 }

                 byte[] buffer;
                 using (receiveStream)
                 {
                     buffer = StreamToBytes(receiveStream);
                 }

                 Encoding encode = Encoding.UTF8;
                 if (!String.IsNullOrEmpty(charSet))
                 {
                     encode = Encoding.GetEncoding(charSet);
                 }
                 else if (String.Equals(headerCharSet, "gbk", StringComparison.OrdinalIgnoreCase)
                          || String.Equals(headerCharSet, "gb2312", StringComparison.OrdinalIgnoreCase))
                 {
                     encode = Encoding.GetEncoding("gb2312");
                 }

                 string result = encode.GetString(buffer);

                 if (String.Equals(headerCharSet, "iso-8859-1", StringComparison.OrdinalIgnoreCase))
                 {
                     // Capture only the encoding token after "charset=" so trailing attributes
                     // or quotes inside the <meta> tag never become part of the name.
                     Match charSetMatch = Regex.Match(
                         result,
                         "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_\\-]+)",
                         RegexOptions.IgnoreCase);

                     if (charSetMatch.Success)
                     {
                         Encoding declared = TryGetEncodingByName(charSetMatch.Groups[1].Value);

                         // Compare code pages rather than object references, and decode again
                         // only when the page declares a genuinely different encoding.
                         if (declared != null && declared.CodePage != encode.CodePage)
                         {
                             result = declared.GetString(buffer);
                         }
                     }
                 }

                 return result;
             }
         }

         /// <summary>
         /// Looks up an encoding by name, returning null instead of throwing when the name is
         /// not a recognised or supported encoding.
         /// </summary>
         private static Encoding TryGetEncodingByName(string name)
         {
             try
             {
                 return Encoding.GetEncoding(name);
             }
             catch (ArgumentException)
             {
                 return null;
             }
             catch (NotSupportedException)
             {
                 return null;
             }
         }

欢迎转载，请注明出处:http://kecq.com/?p=121

焦三仙本仙

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
c#完整地获取网页内容

之前看了很多获取网页源码的写法，要么有乱码，要么没考虑到 gzip 等压缩。比如有些网页的 HttpWebResponse 对象的 CharacterSet 是 iso-8859-1，这种情况下我们根据网页 <meta> 标签中声明的 charset 读取。还有一个将流转成 byte[] 数组的方法，因为 GetResponseStream() 返回的流不能获取 Length 属性。以下是全部源码。
复制链接

扫一扫