一段我自己常用的 C# 读取页面源码的方法：
/// <summary>
/// Downloads the page at <paramref name="sUrl"/> and returns its body decoded as text.
/// </summary>
/// <remarks>
/// The text encoding is chosen in this order: the charset named in the Content-Type
/// response header, then a charset declaration found in the page markup, then
/// <see cref="Encoding.Default"/>. Automatic redirects are disabled on the request.
/// Failures (bad URL, network error, HTTP error status, timeout) are reported by
/// returning an empty string rather than by throwing.
/// </remarks>
/// <param name="sUrl">URL of the page to download.</param>
/// <returns>The decoded page content, or an empty string if the page could not be read.</returns>
public static string getHtmlContents(string sUrl)
{
    // At or above this many undecodable characters the sniffed charset is considered wrong.
    const int undecodableLimit = 100;

    if (string.IsNullOrEmpty(sUrl))
    {
        return "";
    }

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(sUrl);
        request.AllowAutoRedirect = false;
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
        request.KeepAlive = true;
        request.Timeout = 30000;

        // Read the raw bytes once and release the connection before decoding, so the
        // body can be decoded again with a different encoding without losing data.
        byte[] body;
        string contentType;
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        {
            contentType = response.ContentType ?? string.Empty;
            body = readAllBytes(stream);
        }

        // 1. Charset declared in the Content-Type header.
        Encoding declared = tryGetEncoding(findCharsetName(contentType));
        if (declared != null)
        {
            return decodeText(body, declared);
        }

        // 2. Charset declared inside the markup. Charset names are ASCII, so an ASCII
        //    view of the bytes is sufficient for locating the declaration.
        Encoding sniffed = tryGetEncoding(findCharsetName(Encoding.ASCII.GetString(body)));
        if (sniffed == null)
        {
            // 3. Nothing usable was declared anywhere.
            return decodeText(body, Encoding.Default);
        }

        string text = decodeText(body, sniffed);
        if (countUndecodable(text, body) >= undecodableLimit)
        {
            // The page lied about its charset; fall back to the system default.
            text = decodeText(body, Encoding.Default);
        }
        return text;
    }
    catch (System.Exception ex)
    {
        // Best-effort contract: callers receive "" on any failure. The exception is
        // still written to the debug output so failures are not completely invisible.
        System.Diagnostics.Debug.WriteLine(ex);
        return "";
    }
}

// Matches charset=NAME, charset="NAME" and charset='NAME', capturing only the name.
private static readonly Regex charsetRegex = new Regex(
    @"charset\s*=\s*[""']?(?<cding>[A-Za-z0-9_.:\-]+)",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

// Returns the first charset name declared in the given text, or an empty string if there is none.
private static string findCharsetName(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return string.Empty;
    }
    return charsetRegex.Match(text).Groups["cding"].Value;
}

// Resolves a charset name to an Encoding, or returns null when the name is empty or unknown.
private static Encoding tryGetEncoding(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (System.ArgumentException)
    {
        return null;
    }
    catch (System.NotSupportedException)
    {
        return null;
    }
}

// Reads the stream to its end and returns everything that was read; a null stream yields an empty array.
private static byte[] readAllBytes(Stream stream)
{
    if (stream == null)
    {
        return new byte[0];
    }
    using (MemoryStream buffer = new MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }
        return buffer.ToArray();
    }
}

// Decodes the bytes through a StreamReader so that a leading byte order mark is honoured and stripped.
private static string decodeText(byte[] body, Encoding encoding)
{
    using (MemoryStream stream = new MemoryStream(body))
    using (StreamReader reader = new StreamReader(stream, encoding))
    {
        return reader.ReadToEnd();
    }
}

// Estimates how many characters the decoder had to substitute: '?' and U+FFFD characters in the
// decoded text, minus the literal '?' bytes (0x3F) present in the raw body. This assumes an
// ASCII-compatible encoding, where a genuine question mark is always the single byte 0x3F.
private static int countUndecodable(string text, byte[] body)
{
    int markers = 0;
    foreach (char c in text)
    {
        if (c == '?' || c == '\uFFFD')
        {
            markers++;
        }
    }

    int literalQuestionMarks = 0;
    foreach (byte b in body)
    {
        if (b == 0x3F)
        {
            literalQuestionMarks++;
        }
    }

    return markers - literalQuestionMarks;
}