ASP.NET 中抓取网页内容是非常方便的,而其中更是解决了 ASP 中困扰我们的编码问题。
需要三个类:WebRequest、WebResponse、StreamReader。
WebRequest、WebResponse 的名称空间 是:
StreamReader 的名称空间是:
核心代码
WebResponse response = request.GetResponse();
StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312"));
- WebRequest 类的 Create 为静态方法,参数为要抓取的网页的网址;
- Encoding 指定编码,Encoding 中有属性 ASCII、UTF32、UTF8 等全球通用的编码,但没有 gb2312 这个编码属性,所以我们使用 GetEncoding 获得 gb2312 编码。
示例
private static string getContent(string Url)
{
string strResult = "";
try
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
//声明一个HttpWebRequest请求
request.Timeout = 30000;
//设置连接超时时间
request.Headers.Set("Pragma", "no-cache");
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Stream streamReceive = response.GetResponseStream();
Encoding encoding = Encoding.GetEncoding("GB2312");
StreamReader streamReader = new StreamReader(streamReceive, encoding);
strResult = streamReader.ReadToEnd();
streamReader.Close();
}
catch
{
throw;
}
return strResult;
}
private string GetUrl(string url)
{
string str = string.Empty;
try
{
WebRequest request = WebRequest.Create(url);
WebResponse response = request.GetResponse();
StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312"));
str = reader.ReadToEnd();
reader.Close();
reader.Dispose();
response.Close();
return str;
}
catch (Exception ex)
{
str = ex.Message;
return str;
}
}
private string GetPostContent(string strUrl)
{
string strMsg = string.Empty;
try
{
string data = "";
byte[] requestBuffer = System.Text.Encoding.GetEncoding("gb2312").GetBytes(data);
WebRequest request = WebRequest.Create(strUrl);
request.Method = "POST";
request.ContentType = "application/x-www-form-urlencoded";
request.ContentLength = requestBuffer.Length;
using (Stream requestStream = request.GetRequestStream())
{
requestStream.Write(requestBuffer, 0, requestBuffer.Length);
requestStream.Close();
}
WebResponse response = request.GetResponse();
using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))
{
strMsg = reader.ReadToEnd();
reader.Close();
}
}
catch
{ }
return strMsg;
}
The server committed a protocol violation. Section=ResponseHeader Detail=CR must be followed by LF
主体意思是微软没有容忍不符合RFC 822中的httpHeader必须以CRLF结束的规定的服务器响应。
一个解决方案是在application.config或web.config文件里加入
<system.net>
<settings>
<httpWebRequest useUnsafeHeaderParsing= "true " />
</settings>
</system.net>
允许系统容忍(tolerant)只以CR或LF结尾的hearder信息