需要引用以下命名空间:
using Fizzler;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
/// <summary>
/// Helpers for downloading an HTML page and querying it with CSS selectors
/// (Fizzler's QuerySelectorAll on top of an HtmlAgilityPack document).
/// </summary>
public class FizzlerHelper
{
    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the nodes that match a CSS selector.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="cssLoad">CSS selector evaluated against the downloaded document.</param>
    /// <returns>
    /// The matching nodes. An empty sequence is returned when the download fails,
    /// because <see cref="HttpGet{T}"/> reports failure as <c>null</c>.
    /// </returns>
    public static IEnumerable<HtmlNode> GetUrlInfo(string url, string cssLoad)
    {
        // Delegating keeps the parsing options in one place; GetHtmlInfo copes with a null download.
        return GetHtmlInfo(HttpGet<string>(url), cssLoad);
    }

    /// <summary>
    /// Parses an HTML string and returns the nodes that match a CSS selector.
    /// </summary>
    /// <param name="html">HTML markup to parse.</param>
    /// <param name="cssLoad">CSS selector evaluated against the parsed document.</param>
    /// <returns>The matching nodes; an empty sequence when <paramref name="html"/> is <c>null</c>.</returns>
    public static IEnumerable<HtmlNode> GetHtmlInfo(string html, string cssLoad)
    {
        // LoadHtml throws ArgumentNullException on null; treat "no markup" as "no matches" instead.
        if (html == null)
        {
            return new HtmlNode[0];
        }

        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument
        {
            OptionAddDebuggingAttributes = false,
            OptionAutoCloseOnEnd = true,
            OptionFixNestedTags = true,
            OptionReadEncoding = true
        };
        htmlDoc.LoadHtml(html);

        // cssLoad is the selector path to query.
        return htmlDoc.DocumentNode.QuerySelectorAll(cssLoad);
    }

    #region GET request
    /// <summary>
    /// Issues an HTTP GET request, reads the response body as UTF-8 text and converts it to
    /// <typeparamref name="T"/> via <see cref="Convert.ChangeType(object, Type)"/>.
    /// </summary>
    /// <typeparam name="T">Target type of the response body (for example <see cref="string"/>).</typeparam>
    /// <param name="url">Address to request.</param>
    /// <returns>
    /// The converted response body, or <c>default(T)</c> when the request, the read or the
    /// conversion fails. This method never throws.
    /// </returns>
    public static T HttpGet<T>(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader streamReader = new StreamReader(stream, Encoding.UTF8))
            {
                string body = streamReader.ReadToEnd();
                return (T)Convert.ChangeType(body, typeof(T));
            }
        }
        catch (Exception ex)
        {
            // Best-effort by design: callers rely on default(T) instead of an exception.
            // Trace the cause so that failures are at least diagnosable.
            System.Diagnostics.Trace.TraceWarning("HttpGet failed for {0}: {1}", url, ex.Message);
            return default(T);
        }
    }
    #endregion
}
实现数据抓取(以透明售房网为例)。Fizzler 主要通过 HTML 标签的 CSS 选择器(样式)来定位并获取数据,省去了编写复杂正则表达式的麻烦。
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.Services;
using System.Web.UI;
using System.Web.UI.WebControls;
using Fizzler;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
using Newtonsoft.Json;
using DataCollectionCommon;
using System.Text;
using DataCollectionDAL;
using DataCollectionModel;
using System.Text.RegularExpressions;
namespace DataCollectionDemo
{
    /// <summary>
    /// Demo page that scrapes a table from a remote page and re-renders it as an HTML table.
    /// </summary>
    public partial class WebForm1 : System.Web.UI.Page
    {
        // NOTE(review): not used anywhere in this class; kept because it is a public member.
        public static FizzlerHelper fizzlerHelper = new FizzlerHelper();

        /// <summary>HTML table produced by the scrape on first page load.</summary>
        public string resultHtml = string.Empty;

        // Matches empty <span class="..."></span> elements. The named group is called "url"
        // but what it captures is the span's class attribute value.
        private static readonly Regex DigitSpanRegex = new Regex(
            @"<span class=""(?<url>.+?)""></span>",
            RegexOptions.Singleline | RegexOptions.Compiled);

        // Span class name -> the character it stands for. Class names are matched exactly
        // (ordinal, case-sensitive); unknown class names are ignored.
        private static readonly Dictionary<string, char> CharBySpanClass = new Dictionary<string, char>
        {
            { "numbdor", '.' },
            { "numbzero", '0' },
            { "numbone", '1' },
            { "numbtwo", '2' },
            { "numbthree", '3' },
            { "numbfour", '4' },
            { "numbfive", '5' },
            { "numbsix", '6' },
            { "numbseven", '7' },
            { "numbeight", '8' },
            { "numbnine", '9' }
        };

        /// <summary>
        /// Runs the scrape once on the initial (non-postback) request and stores the result
        /// in <see cref="resultHtml"/>.
        /// </summary>
        protected void Page_Load(object sender, EventArgs e)
        {
            if (!IsPostBack)
            {
                // Hangzhou
                resultHtml = StartDataCollection("http://www.tmsf.com/daily.htm");
            }
        }

        /// <summary>
        /// Starts the data collection: downloads the page, reads every row of the table inside
        /// <c>div.datanowin</c> (skipping the header row) and renders the cells as a new HTML table.
        /// </summary>
        /// <param name="url">Address of the web page to scrape.</param>
        /// <returns>An HTML <c>table</c> fragment containing a fixed header row plus one row per scraped row.</returns>
        // NOTE(review): as a [WebMethod] this fetches a caller-supplied URL server-side;
        // consider restricting it to the expected host(s).
        [WebMethod]
        public static string StartDataCollection(string url)
        {
            StringBuilder table = new StringBuilder();
            table.Append("<table width=\"100%\" border=\"0\" cellspacing=\"0\" cellpadding=\"0\">");
            table.Append("<tr><td>楼盘名称</td><td>城区</td><td>签约套数</td><td>预定套数</td><td>签约面积</td><td>签约均价</td></tr>");

            // Skip(1) drops the source table's own header row (and is a no-op when there are no rows).
            IEnumerable<HtmlNode> rows = FizzlerHelper.GetUrlInfo(url, "div.datanowin table tr").Skip(1);

            foreach (HtmlNode row in rows)
            {
                table.Append("<tr>");

                // Re-parse the row's markup to get at its td cells.
                foreach (HtmlNode cell in FizzlerHelper.GetHtmlInfo(row.InnerHtml, "td"))
                {
                    // Numbers appear as sequences of empty spans whose class names encode the digits.
                    string digits = GetValueBySpanClass(DigitSpanRegex.Matches(cell.InnerHtml));

                    // The cell text comes from a remote page: normalise its entities and
                    // re-encode it so it cannot inject markup into our output.
                    string text = HttpUtility.HtmlEncode(HtmlEntity.DeEntitize(cell.InnerText));

                    table.AppendFormat("<td>{0}{1}</td>", digits, text);
                }

                table.Append("</tr>");
            }

            table.Append("</table>");
            return table.ToString();
        }

        /// <summary>
        /// Translates the span class names captured by the matches into the characters they stand for.
        /// </summary>
        /// <param name="mcc_span">Matches whose "url" group holds a span class name.</param>
        /// <returns>
        /// The decoded characters concatenated in match order; matches with an unrecognised
        /// class name contribute nothing.
        /// </returns>
        private static string GetValueBySpanClass(MatchCollection mcc_span)
        {
            StringBuilder decoded = new StringBuilder(mcc_span.Count);

            foreach (Match match in mcc_span)
            {
                char symbol;
                if (CharBySpanClass.TryGetValue(match.Groups["url"].Value, out symbol))
                {
                    decoded.Append(symbol);
                }
            }

            return decoded.ToString();
        }
    }
}