// Remote web pages may use many different character encodings and may also be
// served gzip-compressed, which makes scraping them painful. Drawing on material
// found online, this small sample detects the page encoding automatically and
// transparently decompresses gzip data.

// Matches "charset=<name>" (optionally quoted) in a Content-Type meta tag or an
// HTML5 <meta charset="..."> tag. The name ends at a quote, whitespace, ';', '/' or '>'.
private static readonly Regex CharsetRegex = new Regex(
    @"charset\s*=\s*[""']?(?<charset>[^""'\s;/>]+)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Downloads <paramref name="url"/> and returns the charset name declared in its markup.
/// </summary>
/// <param name="url">Address of the page to inspect.</param>
/// <returns>
/// "gb2312" or "utf-8" when the declared name contains one of those, otherwise the
/// declared name as-is; the web name of <see cref="Encoding.Default"/> when the page
/// could not be downloaded or declares no charset.
/// </returns>
private string GetChartset(string url)
{
    // getHTML returns null on failure; DetectCharset treats that as "no declaration".
    return DetectCharset(getHTML(url));
}

/// <summary>
/// Extracts the declared charset name from markup that was decoded as ASCII.
/// </summary>
/// <param name="html">Page markup; may be null or empty.</param>
/// <returns>A charset name; never null or empty.</returns>
private static string DetectCharset(string html)
{
    string charset = null;
    if (!string.IsNullOrEmpty(html))
    {
        Match match = CharsetRegex.Match(html);
        if (match.Success)
        {
            charset = match.Groups["charset"].Value;
        }
    }

    if (string.IsNullOrEmpty(charset))
    {
        // WebName (e.g. "utf-8") is accepted by Encoding.GetEncoding;
        // EncodingName is a human-readable display name and generally is not.
        return Encoding.Default.WebName;
    }

    // Normalize decorated variants to the canonical names.
    if (charset.IndexOf("gb2312", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        return "gb2312";
    }
    if (charset.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        return "utf-8";
    }
    return charset;
}

/// <summary>
/// Maps a charset name to an <see cref="Encoding"/>, falling back to
/// <see cref="Encoding.Default"/> when the name is not recognized.
/// </summary>
private static Encoding ResolveEncoding(string name)
{
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        return Encoding.Default;
    }
    catch (NotSupportedException)
    {
        return Encoding.Default;
    }
}

/// <summary>Reads <paramref name="input"/> to its end and returns the bytes.</summary>
private static byte[] ReadAllBytes(Stream input)
{
    using (MemoryStream buffer = new MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }
        return buffer.ToArray();
    }
}

/// <summary>
/// Downloads <paramref name="url"/> and decodes the raw body as ASCII. The result is
/// only meant for locating the ASCII-compatible charset declaration.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The ASCII-decoded body, or null when the URL is malformed or the request fails
/// (the error message is written to the console).
/// </returns>
private string getHTML(string url)
{
    try
    {
        WebRequest webRequest = WebRequest.Create(url);
        using (WebResponse webResponse = webRequest.GetResponse())
        using (Stream stream = webResponse.GetResponseStream())
        using (StreamReader sr = new StreamReader(stream, Encoding.ASCII))
        {
            return sr.ReadToEnd();
        }
    }
    catch (UriFormatException ex)
    {
        Console.WriteLine(ex.Message);
        return null;
    }
    catch (WebException ex)
    {
        Console.WriteLine(ex.Message);
        return null;
    }
}

/// <summary>
/// Downloads <paramref name="Url"/>, decompresses the body when the response declares
/// Content-Encoding gzip, detects the charset declared in the markup and returns the
/// decoded text.
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <returns>
/// The decoded page text, or an empty string when anything fails (the error message
/// is HTML-encoded and written to the current HTTP response, if there is one).
/// </returns>
private string Html(string Url)
{
    string strResult = "";
    try
    {
        // NOTE(review): no Accept-Encoding header is set here, so whether a server
        // answers with gzip is up to the server.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        {
            byte[] body;
            if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
            {
                // Compressed content.
                using (GZipStream gzip = new GZipStream(streamReceive, CompressionMode.Decompress))
                {
                    body = ReadAllBytes(gzip);
                }
            }
            else
            {
                body = ReadAllBytes(streamReceive);
            }

            // Sniff the charset from the already-downloaded (and decompressed) bytes
            // instead of requesting the page a second time.
            string charsetName = DetectCharset(Encoding.ASCII.GetString(body));
            Encoding encoding = ResolveEncoding(charsetName);

            // StreamReader honours and strips a leading byte-order mark.
            using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding))
            {
                strResult = reader.ReadToEnd();
            }
        }
    }
    catch (Exception ex)
    {
        // Report only the message, HTML-encoded, so that neither a stack trace nor
        // markup embedded in the message reaches the browser.
        if (HttpContext.Current != null)
        {
            HttpContext.Current.Response.Write(HttpUtility.HtmlEncode(ex.Message));
        }
    }
    return strResult;
}

/// <summary>
/// Click handler: fetches the page whose address is in TextBox1 and writes its text
/// to the response.
/// </summary>
protected void Button1_Click(object sender, EventArgs e)
{
    // NOTE(review): the URL comes straight from user input and the fetched markup is
    // echoed without encoding; restrict allowed hosts/schemes before exposing this
    // page to untrusted users.
    Response.Write(Html(TextBox1.Text));
}

/*
Usage is simple. The following compresses a string into a file (fs is an open,
writable FileStream):

    using (GZipStream gzip = new GZipStream(fs, CompressionMode.Compress))
    {
        byte[] buf = Encoding.UTF8.GetBytes(this.txbSource.Text);
        gzip.Write(buf, 0, buf.Length);
        gzip.Flush();
    }

Decompressing only takes this (use the same stream type that compressed the data;
GZipStream and DeflateStream produce different formats):

    gzip = new GZipStream(new MemoryStream(buf), CompressionMode.Decompress);
    using (StreamReader reader = new StreamReader(gzip))
    {
        this.txbTarget.Text = reader.ReadToEnd();
    }

To decompress from a file, just replace the MemoryStream with a FileStream.
*/