#region 获取页面源码(自动获取页面的编码格式)
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its source as text,
/// choosing the character encoding automatically.
/// </summary>
/// <remarks>
/// The body is first decoded with the charset named in the response's Content-Type
/// header (UTF-8 when that charset is missing or not recognised). If the decoded text
/// contains no run of two or more CJK ideographs (U+4E00..U+9FA5) and the markup itself
/// declares a charset, the raw bytes are decoded again with the declared charset.
/// A gzip-compressed body is decompressed before decoding.
/// </remarks>
/// <param name="url">URL of the page to download.</param>
/// <returns>
/// The decoded page source, or an empty string when the request, the download or the
/// decoding fails for any reason.
/// </returns>
private string GetHtmlAutoEncoding(string url)
{
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
        req.UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:6.0) Gecko/20100101 Firefox/6.0";
        req.Accept = "*/*";
        req.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        req.ContentType = "text/xml";
        req.Timeout = 20000;

        byte[] body;
        Encoding headerEncoding;

        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        {
            // A missing, quoted or unknown charset must not abort the download,
            // so fall back to UTF-8 instead of letting GetEncoding throw.
            headerEncoding = TryGetEncodingByName(resp.CharacterSet) ?? Encoding.UTF8;

            string contentEncoding = resp.ContentEncoding ?? string.Empty;
            bool isGzip = contentEncoding.IndexOf("gzip", System.StringComparison.OrdinalIgnoreCase) >= 0;

            // Buffer the raw (decompressed) bytes so the page can be decoded a second
            // time from the original data rather than from already-decoded text.
            using (Stream raw = resp.GetResponseStream())
            using (Stream payload = isGzip ? new GZipStream(raw, CompressionMode.Decompress) : raw)
            using (MemoryStream buffer = new MemoryStream())
            {
                payload.CopyTo(buffer);
                body = buffer.ToArray();
            }
        }

        string html = DecodeHtmlBytes(body, headerEncoding);

        // Heuristic kept from the original: text that already shows CJK ideographs is
        // assumed to be decoded correctly and is returned as is.
        if (Regex.IsMatch(html, "[\u4e00-\u9fa5]{2,4}"))
        {
            return html;
        }

        // Look for a charset declared in the markup; accepts both
        // "charset=gb2312" and the quoted HTML5 form charset="gb2312".
        Match charsetMatch = Regex.Match(
            html,
            "charset\\s*=\\s*[\"']?(?<code>[a-zA-Z0-9_\\-]+)",
            RegexOptions.IgnoreCase);
        Encoding declaredEncoding = TryGetEncodingByName(charsetMatch.Groups["code"].Value);

        // Decode again only when the page names a usable, different encoding.
        if (declaredEncoding != null && declaredEncoding.CodePage != headerEncoding.CodePage)
        {
            html = DecodeHtmlBytes(body, declaredEncoding);
        }

        return html;
    }
    catch
    {
        // Deliberate best effort: callers receive an empty string on any failure.
        return "";
    }
}

/// <summary>
/// Resolves an encoding from a charset name without throwing.
/// </summary>
/// <param name="name">Charset name; may be null, empty, padded or quoted.</param>
/// <returns>The matching encoding, or null when the name is blank or not recognised.</returns>
private static Encoding TryGetEncodingByName(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }

    string cleaned = name.Trim().Trim('"', '\'');
    if (cleaned.Length == 0)
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(cleaned);
    }
    catch (System.ArgumentException)
    {
        // Unknown or unsupported charset name.
        return null;
    }
}

/// <summary>
/// Decodes a byte buffer to text through a StreamReader, so that a leading byte order
/// mark is honoured and stripped exactly as it was when reading the response stream directly.
/// </summary>
/// <param name="bytes">Raw page bytes.</param>
/// <param name="encoding">Encoding to use when no byte order mark is present.</param>
/// <returns>The decoded text.</returns>
private static string DecodeHtmlBytes(byte[] bytes, Encoding encoding)
{
    using (MemoryStream source = new MemoryStream(bytes))
    using (StreamReader reader = new StreamReader(source, encoding))
    {
        return reader.ReadToEnd();
    }
}
#endregion
获取页面源码(自动获取页面的编码格式)
最新推荐文章于 2022-08-16 14:01:42 发布