using System.Text.RegularExpressions;
/// <summary>
/// Extracts the text of the first <c>&lt;title&gt;</c> element found in an HTML string.
/// </summary>
/// <param name="html">HTML markup to search; may be null or empty.</param>
/// <returns>
/// The title text with surrounding whitespace trimmed, or an empty string when
/// <paramref name="html"/> is null/empty or contains no complete title element.
/// </returns>
static string GetTitle(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // "\b[^>]*" allows attributes on the opening tag without accepting other tag
    // names that merely begin with "title". The lazy "[\s\S]*?" stops at the FIRST
    // closing tag, so a later <title> element (for example inside inline SVG) is
    // not swallowed into the result.
    const string pattern = @"<title\b[^>]*>([\s\S]*?)</title\s*>";

    // The static Regex.Match overload reuses the framework's internal pattern cache,
    // so the expression is not re-parsed on every call.
    Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);

    // Groups[1].Value is an empty string when the match failed.
    return match.Groups[1].Value.Trim();
}
/// <summary>
/// Demo entry point: downloads a page, prints the fetched HTML and then its title,
/// and waits for a key press before exiting.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string host = "www.rczjp.cn";
    const int timeoutMs = 5000;

    string page = GetHtmlInfo(host, timeoutMs, Encoding.UTF8);
    Console.WriteLine(page);

    string title = GetTitle(page);
    Console.WriteLine(title);

    // Keep the console window open until the user presses a key.
    Console.Read();
}
/// <summary>
/// Downloads a page's HTML line by line and stops after the first line that
/// contains the closing <c>&lt;/title&gt;</c> tag.
/// </summary>
/// <param name="url">Page address; <c>http://</c> is prepended when no http/https scheme is present.</param>
/// <param name="timeout">Request timeout in milliseconds.</param>
/// <param name="EnCodeType">Encoding used to decode the response body.</param>
/// <returns>
/// The lines read, joined with CRLF, ending with the line that contains the closing
/// title tag (the rest of that line is included because whole lines are read).
/// An empty string when the response status is not 200 OK.
/// When the request or read fails, the exception's message is returned instead of
/// being thrown, so callers cannot tell an error text from page content by type alone.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
static string GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
{
    if (url == null)
    {
        throw new ArgumentNullException("url");
    }

    // Ordinal, case-insensitive: "HTTP://host" already has a scheme and must not be prefixed again.
    bool hasScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        url = "http://" + url;
    }

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = timeout;
        // Only the header VALUE belongs here; the framework emits the "User-Agent:" name itself.
        request.UserAgent = "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729)";
        request.Accept = "*/*";
        request.KeepAlive = true;
        request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

        // Dispose the response as well as the reader so the connection is released on every path.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != System.Net.HttpStatusCode.OK)
            {
                return string.Empty;
            }

            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, EnCodeType))
            {
                StringBuilder builder = new StringBuilder();
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    builder.Append(line);

                    // Inspect only the line just read: a tag cannot straddle the CRLF inserted
                    // between lines, and this avoids rescanning the whole buffer each time.
                    // Case-insensitive so "</TITLE>" also ends the download.
                    if (line.IndexOf("</title>", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        break;
                    }

                    builder.Append("\r\n");
                }

                return builder.ToString();
            }
        }
    }
    catch (Exception ex)
    {
        // Kept for existing callers: failures are reported as the returned text, never thrown.
        return ex.Message;
    }
}