之前看了很多获取网页源码的写法,要么结果有乱码,要么没有考虑 gzip 等压缩。比如有些网页返回的 HttpWebResponse 对象的 CharacterSet 是 iso-8859-1(通常是响应头没有声明 charset 时的默认值),这种情况下
我们改为根据网页 meta 标签中声明的 charset 来解码。
另外还需要一个将流转成 byte[] 数组的方法,因为 GetResponseStream() 返回的流不支持 Length 属性,只能循环读取直到流结束。
以下是全部源码
/// <summary>
/// Reads <paramref name="stream"/> from its current position to the end and
/// returns everything that was read as a byte array.
/// </summary>
/// <remarks>
/// The stream's <c>Length</c> is never queried, so this also works for streams
/// that do not support it. The source stream is not closed. If it is seekable,
/// it is rewound to position 0 before returning.
/// </remarks>
/// <param name="stream">The stream to drain.</param>
/// <returns>A new array holding the bytes read; empty when nothing was left to read.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
private static byte[] StreamToBytes(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    byte[] allBytes;
    // The using block guarantees the temporary buffer is released even if reading fails.
    using (MemoryStream memory = new MemoryStream())
    {
        stream.CopyTo(memory);
        allBytes = memory.ToArray();
    }

    // Reset the source stream to its beginning when it supports seeking.
    if (stream.CanSeek)
    {
        stream.Seek(0, SeekOrigin.Begin);
    }

    return allBytes;
}
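/**
* Background notes. These appear to describe Java's String.getBytes(encoding),
* where bytes are signed; 63 is the ASCII code of '?'.
* When b[0] is 63, the conversion has most likely failed.
* A. A correctly decoded Chinese string:
* 1. with encoding GB2312, every byte is negative;
* 2. with encoding ISO8859_1, every b[i] is 63.
* B. A garbled (mis-decoded) Chinese string:
* 1. with encoding ISO8859_1, every byte is also negative;
* 2. with encoding GB2312, most b[i] are 63.
* C. An English string:
* 1. with encoding ISO8859_1 or GB2312, every byte is greater than 0.
* <p/>
* Summary: given a string, call getBytes("iso8859_1"):
* 1. if any b[i] is 63, no re-decoding is needed (case A-2);
* 2. if every b[i] is greater than 0, it is an English string and no re-decoding is needed (case C-1);
* 3. if any b[i] is less than 0, the string is already garbled and must be re-decoded (case B-1).
*/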
/// <summary>
/// Downloads <paramref name="url"/> with an HTTP GET request and returns the
/// response body decoded as text.
/// </summary>
/// <remarks>
/// Bodies whose Content-Encoding response header mentions gzip or deflate are
/// decompressed before decoding. The text encoding is chosen as follows:
/// an explicit <paramref name="charSet"/> wins; otherwise GB2312 is used when
/// the response reports "gbk" or "gb2312"; otherwise UTF-8. When the response
/// reports "iso-8859-1", the decoded text is additionally searched for a
/// meta-tag charset declaration and, if that names a different known encoding,
/// the body is decoded again with it.
/// </remarks>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="charSet">
/// Name of the encoding to decode the page with; pass null or "" to let the
/// method pick one as described above.
/// </param>
/// <param name="aspnetSessionID">
/// Optional value sent as the "ASP.NET_SessionId" cookie; no cookie is sent
/// when null or empty.
/// </param>
/// <returns>The decoded response body.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="charSet"/> is not a name accepted by <c>Encoding.GetEncoding</c>.
/// </exception>
public static string DoGet(string url, string charSet = null, string aspnetSessionID = null)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

    if (!String.IsNullOrEmpty(aspnetSessionID))
    {
        // ASP.NET looks the session up by the cookie named "ASP.NET_SessionId".
        req.CookieContainer = new CookieContainer();
        req.CookieContainer.Add(new Uri(url), new Cookie("ASP.NET_SessionId", aspnetSessionID));
    }

    // Some pages need more than this to download (extra cookies, explicit
    // user name / password credentials, ...); add overloads for those cases as needed.
    req.Credentials = CredentialCache.DefaultCredentials;
    req.Method = "GET";
    req.ContentType = "application/x-www-form-urlencoded";

    byte[] buffer;
    string responseCharSet;

    // Dispose the response and its stream even when reading fails, so the
    // underlying connection is always released.
    using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
    {
        // Either property may be null/empty when the server omits the header.
        responseCharSet = res.CharacterSet ?? String.Empty;
        string contentEncoding = res.ContentEncoding ?? String.Empty;

        using (Stream rawStream = res.GetResponseStream())
        {
            Stream bodyStream = rawStream;
            // Ordinal, case-insensitive matching avoids culture-specific lower-casing surprises.
            if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                bodyStream = new GZipStream(rawStream, CompressionMode.Decompress);
            }
            else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                bodyStream = new DeflateStream(rawStream, CompressionMode.Decompress);
            }

            using (bodyStream)
            {
                buffer = StreamToBytes(bodyStream);
            }
        }
    }

    Encoding encode = Encoding.UTF8;
    if (!String.IsNullOrEmpty(charSet))
    {
        encode = Encoding.GetEncoding(charSet);
    }
    else if (String.Equals(responseCharSet, "gbk", StringComparison.OrdinalIgnoreCase)
        || String.Equals(responseCharSet, "gb2312", StringComparison.OrdinalIgnoreCase))
    {
        encode = Encoding.GetEncoding("gb2312");
    }

    string result = encode.GetString(buffer);

    // "iso-8859-1" from the response is not trusted on its own: look for a
    // charset declared inside the page and decode again if it differs.
    if (String.Equals(responseCharSet, "iso-8859-1", StringComparison.OrdinalIgnoreCase))
    {
        Encoding declared = DetectMetaCharset(result);
        if (declared != null && !declared.Equals(encode))
        {
            result = declared.GetString(buffer);
        }
    }

    return result;
}

/// <summary>
/// Finds the first charset declared in a meta tag of <paramref name="html"/>
/// and resolves it to an <see cref="Encoding"/>.
/// </summary>
/// <param name="html">Page text to search.</param>
/// <returns>
/// The declared encoding, or null when no declaration is found or its name is
/// not a known encoding.
/// </returns>
private static Encoding DetectMetaCharset(string html)
{
    // Capture only the charset token itself, so attributes that follow it in
    // the same tag are never swallowed into the encoding name.
    Match charSetMatch = Regex.Match(
        html,
        "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_\\-]+)",
        RegexOptions.IgnoreCase);
    if (!charSetMatch.Success)
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(charSetMatch.Groups[1].Value);
    }
    catch (ArgumentException)
    {
        // The page declared a charset name this runtime does not know; keep
        // the text as already decoded instead of failing the whole download.
        return null;
    }
}
|
欢迎转载,请注明出处:http://kecq.com/?p=121