/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <remarks>
/// The body is fetched once, buffered, and decoded with the encoding chosen by
/// <see cref="ResolveEncoding"/> (response charset first, then any
/// <c>charset=...</c> declaration found in the content itself).
/// </remarks>
/// <param name="url">Absolute URL to fetch with an HTTP GET.</param>
/// <returns>The decoded response body.</returns>
private string GetHtmlCode(string url)
{
    HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);
    webRequest.Timeout = 30000;
    webRequest.Method = "GET";
    webRequest.UserAgent = "Mozilla/4.0";
    // Let the framework negotiate and undo both gzip and deflate. Advertising
    // "gzip, deflate" by hand while only unpacking gzip would return garbage
    // whenever a server answered with deflate.
    webRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

    using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
    {
        byte[] body = ReadResponseBytes(webResponse);

        // Detect the encoding from the bytes already downloaded instead of
        // issuing a second request for the same URL.
        Encoding enc = ResolveEncoding(webResponse.CharacterSet, body);

        using (StreamReader sr = new StreamReader(new MemoryStream(body), enc))
        {
            return sr.ReadToEnd();
        }
    }
}

/// <summary>
/// Fetches <paramref name="strurl"/> and determines the text encoding of the page.
/// </summary>
/// <param name="strurl">URL of the page to inspect.</param>
/// <returns>
/// <see cref="Encoding.Default"/> when <paramref name="strurl"/> is null or empty;
/// otherwise the encoding chosen by <see cref="ResolveEncoding"/>.
/// </returns>
public Encoding GetEncoding(string strurl)
{
    if (string.IsNullOrEmpty(strurl))
    {
        return Encoding.Default;
    }

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(strurl);
    req.Method = "GET";
    req.Accept = "*/*";
    req.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
    req.ContentType = "text/xml";

    // Dispose the response so the underlying connection is released.
    using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
    {
        return ResolveEncoding(resp.CharacterSet, ReadResponseBytes(resp));
    }
}

/// <summary>Reads the whole response stream into memory and closes it.</summary>
private static byte[] ReadResponseBytes(WebResponse response)
{
    using (Stream stream = response.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        stream.CopyTo(buffer);
        return buffer.ToArray();
    }
}

/// <summary>
/// Picks the encoding for a downloaded body: start from the response charset,
/// then let a non-UTF-8 <c>charset=...</c> declaration inside the content override it.
/// </summary>
/// <param name="headerCharset">Value of <c>HttpWebResponse.CharacterSet</c>; may be null or empty.</param>
/// <param name="body">Raw (already decompressed) response bytes.</param>
private static Encoding ResolveEncoding(string headerCharset, byte[] body)
{
    Encoding enc = Encoding.UTF8;

    // "ISO-8859-1" falls back to UTF-8 here.
    // NOTE(review): presumably because the framework reports that value when the
    // server sent no charset at all - confirm.
    if (!string.IsNullOrEmpty(headerCharset) && headerCharset != "ISO-8859-1")
    {
        enc = TryGetEncoding(headerCharset) ?? Encoding.UTF8;
    }

    // Provisional decode, only to look for a charset declaration in the markup.
    string html;
    using (StreamReader reader = new StreamReader(new MemoryStream(body), enc))
    {
        html = reader.ReadToEnd();
    }

    Match charSetMatch = Regex.Match(html, "charset=(?<code>[a-zA-Z0-9\\-]+)", RegexOptions.IgnoreCase);
    string declared = charSetMatch.Groups["code"].Value;

    // A declared charset other than utf-8 overrides the header-derived encoding;
    // an unrecognized name is ignored rather than allowed to throw.
    if (!string.IsNullOrEmpty(declared) && !declared.Equals("utf-8", StringComparison.OrdinalIgnoreCase))
    {
        enc = TryGetEncoding(declared) ?? enc;
    }

    return enc;
}

/// <summary>
/// Looks up an encoding by name, returning null instead of throwing when the
/// name is not recognized.
/// </summary>
private static Encoding TryGetEncoding(string name)
{
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        return null;
    }
}
使用 C# 抓取网页时遇到了乱码问题,试过各种办法都没能妥善解决,后来发现原因在于响应内容经过了 gzip 压缩。于是,在参考了 CSDN 上两位达人的帖子之后,我对代码进行了修正,基本妥善解决了页面编码错误(乱码)的问题。欢迎大家试用上面的代码。
以下为参考帖:
http://blog.csdn.net/wsc449/article/details/7280646
http://bbs.csdn.net/topics/320213776