/// <summary>
/// Downloads the resource at <paramref name="url"/> with an HTTP GET request and
/// returns the response body decoded as text.
/// </summary>
/// <remarks>
/// gzip and deflate compressed responses are decompressed by the framework.
/// The text encoding is taken from the <c>charset</c> parameter of the
/// <c>Content-Type</c> response header; when the header is absent, has no charset,
/// or names an encoding this system does not know, <see cref="Encoding.Default"/>
/// (the system ANSI code page, e.g. GB2312 on a Chinese-locale Windows) is used.
/// </remarks>
/// <param name="url">Absolute HTTP/HTTPS address of the page to download.</param>
/// <returns>The decoded response body.</returns>
/// <exception cref="System.Net.WebException">
/// The request failed or timed out (the timeout is 30 seconds).
/// </exception>
public static string GetWebContent(string url)
{
    HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    webRequest.KeepAlive = false;
    webRequest.Timeout = 30000; // milliseconds
    webRequest.Method = "GET";
    webRequest.UserAgent = "Mozilla/4.0";
    // Let the framework send Accept-Encoding and undo the compression itself, so that
    // every encoding we advertise (gzip AND deflate) is actually handled.
    webRequest.AutomaticDecompression =
        System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;

    // Dispose the response as well as the stream; otherwise the connection is leaked.
    using (HttpWebResponse webResponse = (System.Net.HttpWebResponse)webRequest.GetResponse())
    using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
    using (StreamReader sr = new System.IO.StreamReader(
        streamReceive, ResolveResponseEncoding(webResponse.Headers["Content-Type"])))
    {
        return sr.ReadToEnd();
    }
}

/// <summary>
/// Picks the text encoding named by the <c>charset</c> parameter of a Content-Type
/// header value, falling back to <see cref="Encoding.Default"/>.
/// </summary>
/// <param name="contentType">Raw Content-Type header value; may be null or empty.</param>
/// <returns>The matching encoding, or <see cref="Encoding.Default"/> when none can be resolved.</returns>
private static Encoding ResolveResponseEncoding(string contentType)
{
    if (!string.IsNullOrEmpty(contentType))
    {
        // The static Regex.Match overload uses the framework's regex cache,
        // so the pattern is not re-parsed on every call.
        Match match = Regex.Match(contentType, "charset\\s*=\\s*[\\W]?\\s*([\\w-]+)", RegexOptions.IgnoreCase);
        if (match.Success)
        {
            try
            {
                return Encoding.GetEncoding(match.Groups[1].Value.Trim());
            }
            catch (ArgumentException)
            {
                // The server named a charset this system does not support;
                // fall through to the default encoding instead of failing the download.
            }
        }
    }

    return Encoding.Default;
}
// Browser control used by SearchData and GatherData to parse HTML fragments into an HtmlDocument.
WebBrowser webb = new WebBrowser();// could instead be initialized in the constructor
/// <summary>
/// Downloads the page at <c>url</c>, isolates the fragment that starts at the first
/// "城市" marker and ends shortly after the following "价格" marker, and writes the
/// trimmed text of every <c>li</c> element whose class is <c>city</c> to the console.
/// </summary>
/// <remarks>
/// If either marker is missing from the page there is nothing to extract and the
/// method returns without output.
/// </remarks>
public void SearchData()
{
    string strWebContent = GetWebContent(url);

    // Cut out the part of the source that holds the data.
    // The <body tag is only a search starting point, so fall back to offset 0 when it is absent.
    int iBodyStart = Math.Max(0, strWebContent.IndexOf("<body", StringComparison.OrdinalIgnoreCase));
    int iStart = strWebContent.IndexOf("城市", iBodyStart, StringComparison.Ordinal);
    if (iStart < 0)
    {
        return;
    }

    int iEnd = strWebContent.IndexOf("价格", iStart, StringComparison.Ordinal);
    if (iEnd < 0)
    {
        return;
    }

    // Keep up to 50 characters after the end marker, but never read past the end of the page.
    int length = Math.Min((iEnd - iStart) + 50, strWebContent.Length - iStart);
    string strWeb = strWebContent.Substring(iStart, length);

    // Build an HtmlDocument from the fragment.
    // NOTE(review): Navigate is asynchronous; this assumes webb.Document is already
    // available right after the call — confirm on the target framework.
    webb.Navigate("about:blank");
    HtmlDocument htmldoc = webb.Document.OpenNew(true);
    htmldoc.Write(strWeb);

    foreach (HtmlElement li in htmldoc.GetElementsByTagName("li"))
    {
        if (li.GetAttribute("ClassName") == "city")
        {
            // InnerText may be null for an element without text.
            Console.WriteLine((li.InnerText ?? string.Empty).Trim());
        }
    }
}
/// <summary>
/// Example of collecting data laid out in an HTML table: downloads
/// <paramref name="webSite"/>, parses the first table found after the body tag, and
/// for every row whose class is <c>list_btr</c> adds a row to <c>dt</c> and appends
/// one line with the cell texts (separated by " , ") to <c>forbeschina.txt</c>.
/// </summary>
/// <remarks>
/// Failures are not propagated; they are appended to <c>error.txt</c>.
/// </remarks>
/// <param name="webSite">Page address, e.g. http://www.forbeschina.com/list/more/2473/page/1</param>
private void GatherData(string webSite)
{
    try
    {
        string strWebContent = GetWebContent(webSite);

        const string closingTag = "</table>";
        // The <body tag is only a search starting point, so fall back to offset 0 when it is absent.
        int iBodyStart = Math.Max(0, strWebContent.IndexOf("<body", StringComparison.OrdinalIgnoreCase));
        int iStart = strWebContent.IndexOf("<table", iBodyStart, StringComparison.OrdinalIgnoreCase);
        int iEnd = iStart < 0
            ? -1
            : strWebContent.IndexOf(closingTag, iStart, StringComparison.OrdinalIgnoreCase);
        if (iEnd < 0)
        {
            File.AppendAllText("error.txt", "No <table> element found in " + webSite + System.Environment.NewLine);
            return;
        }

        // Take exactly the table markup, up to and including its closing tag.
        string strWeb = strWebContent.Substring(iStart, (iEnd - iStart) + closingTag.Length);

        // Build an HtmlDocument from the fragment.
        // NOTE(review): Navigate is asynchronous; this assumes webb.Document is already
        // available right after the call — confirm on the target framework.
        webb.Navigate("about:blank");
        HtmlDocument htmldoc = webb.Document.OpenNew(true);
        htmldoc.Write(strWeb);

        StringBuilder line = new StringBuilder();
        foreach (HtmlElement tbItem in htmldoc.GetElementsByTagName("table"))
        {
            foreach (HtmlElement trItem in tbItem.GetElementsByTagName("tr"))
            {
                if (trItem.GetAttribute("ClassName") != "list_btr")
                {
                    // Not a data row: skip it without writing a blank line to the output file.
                    continue;
                }

                // NOTE(review): assumes the leading columns of dt are in the same order as
                // the row's cells and that dt has a "CreateTime" column — verify against
                // the definition of dt.
                int index = 0;
                DataRow dr = dt.NewRow();
                foreach (HtmlElement tdItem in trItem.GetElementsByTagName("td"))
                {
                    // InnerText may be null for an empty cell.
                    string cellText = (tdItem.InnerText ?? string.Empty).Trim();
                    if (index > 0)
                    {
                        // Separator goes between cells only, so the line has no trailing " , ".
                        line.Append(" , ");
                    }
                    line.Append(cellText);
                    dr[index] = cellText;
                    index++;
                }
                dr["CreateTime"] = DateTime.Now;
                dt.Rows.Add(dr);

                File.AppendAllText("forbeschina.txt", line.ToString() + System.Environment.NewLine);
                line.Clear();
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort collection: record the failure and carry on rather than crash the caller.
        File.AppendAllText("error.txt", ex.Message + "," + ex.StackTrace + System.Environment.NewLine);
    }
}