功能:
1、轻松获取指定的HTML元素。
2、可以根据属性标签进行筛选
3、返回的都是List强类型集合,无需转换
用过XElement的都知道,用它来解析XML非常方便,但是HTML的格式多种多样,XElement实在没办法兼容。
所以我就写了这么一个类似XElement的类:XHtmlElement
用法:
string filePath = Server.MapPath("~/file/test.htm");
// Read the HTML source from the file
string mailBody = FileHelper.FileToString(filePath);
XHtmlElement xh = new XHtmlElement(mailBody);
// Child <a> elements of <body> whose class attribute equals "icon"
var link = xh.Descendants("body").ChildDescendants("a").Where(c => c.Attributes.Any(a => a.Key == "class" && a.Value == "icon")).ToList();
// <a> elements that carry an href attribute
var links = xh.Descendants("a").Where(c => c.Attributes.Any(a => a.Key == "href")).ToList();
foreach (var r in links)
{
Response.Write(r.Attributes.Single(c => c.Key == "href").Value); // write out the href value
}
// Nearest <img> elements (Descendants returns a list, not a single element)
var img = xh.Descendants("img");
// The nearest <p> element together with the other <p> elements on the same level
var ps = xh.Descendants("p");
代码:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;
namespace SyntacticSugar
{
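/// <summary>
/// ** Description: HTML parsing class
/// ** Created: 2015-4-23
/// ** Modified: -
/// ** Author: sunkaixuan
/// ** QQ: 610262374 - feedback is welcome so we can improve together; suggestions on naming, syntax and anything else that is poorly written are appreciated
/// </summary>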
public class XHtmlElement
{
// HTML text this instance parses; assigned once by the constructor and never replaced.
private readonly string _html;

/// <summary>
/// Creates a parser over the given HTML text.
/// </summary>
/// <param name="html">HTML source to parse. It is stored as-is; null is only rejected later, when Descendants is called.</param>
public XHtmlElement(string html)
{
    _html = html;
}
/// <summary>
/// Gets the nearest elements named <paramref name="elementName"/> that share one nesting level.
/// The first-level elements are checked first; when none of them match, the nested markup is
/// searched depth-first and the matches of the first level that has any are returned.
/// </summary>
/// <param name="elementName">Tag name to match, ignoring case; null matches every element.</param>
/// <returns>The matching elements, or an empty list when nothing matches.</returns>
/// <exception cref="ArgumentNullException">The HTML passed to the constructor was null.</exception>
public List<HtmlInfo> Descendants(string elementName = null)
{
    if (_html == null)
    {
        // The one-argument ArgumentNullException constructor takes a parameter name,
        // so the text has to go in as the message argument.
        throw new ArgumentNullException("html", "html不能这空!");
    }

    var allList = RootDescendants(_html);
    var reval = allList
        .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
        .ToList();
    if (reval.Count == 0)
    {
        // GetDescendantsSource returns null when no nested level matches; callers chain LINQ
        // operators on the result, so hand back an empty list instead.
        reval = GetDescendantsSource(allList, elementName) ?? new List<HtmlInfo>();
    }
    return reval;
}
/// <summary>
/// Gets the first-level elements of an HTML fragment.
/// Every element string collected by GetElementsStringList becomes one HtmlInfo whose OldFullHtml
/// is that string and whose SameLeveHtml is the whole fragment that was scanned.
/// Attributes are taken from eleBegin (presumably the opening tag): name="value" pairs whose name is
/// made of letters (the character class also admits a comma), split on '=' with the double quotes
/// trimmed from the value.
/// NOTE(review): in this copy the generic type arguments (List, Dictionary, Cast) and parts of the
/// TagName / InnerHtml / eleBegin patterns appear to have been stripped, and the InnerHtml statement
/// is cut off mid-pattern - restore them from the original source before building.
/// NOTE(review): the value is taken with Split('=').Last(), so a value that itself contains '='
/// keeps only the text after its last '='.
/// NOTE(review): Dictionary.Add is used, so an opening tag that repeats an attribute name would
/// presumably throw - confirm.
/// </summary>
/// <param name="html">Fragment to scan; when null, the HTML passed to the constructor is used.</param>
/// <returns>One HtmlInfo per collected element string, in collection order.</returns>
public List RootDescendants(string html = null)
{
/*
* Logic:
* 1. Take the first HTML tag and keep looking for its closing tag; each time the same tag is opened again on the way, one more closing tag has to be matched.
* 2. Once the first element has been extracted, repeat step 1 for the 2nd ... Nth element.
*/
if (html == null) html = _html;
// firstTag is not read anywhere else in this method.
var firstTag = Regex.Match(html, "<.>");
List eleList = new List();
List reval = new List();
GetElementsStringList(html, ref eleList);
foreach (var r in eleList)
{
HtmlInfo data = new HtmlInfo();
data.OldFullHtml = r;
data.SameLeveHtml = html;
// NOTE(review): the two patterns below are damaged in this copy (the second one is unterminated).
data.TagName = Regex.Match(r, @"(?<=\s{1}|\|\s)", RegexOptions.IgnoreCase).Value;
data.InnerHtml = Regex.Match(r, @"(?<=\>).+(?=
var eleBegin = Regex.Match(r, "<.>").Value;
var attrList = Regex.Matches(eleBegin, @"[a-z,A-Z]+\="".+?""").Cast().Select(c => new { key = c.Value.Split('=').First(), value = c.Value.Split('=').Last().TrimEnd('"').TrimStart('"') }).ToList();
data.Attributes = new Dictionary();
if (attrList != null && attrList.Count > 0)
{
foreach (var a in attrList)
{
data.Attributes.Add(a.key, a.value);
}
}
reval.Add(data);
}
return reval;
}
#region private
/// <summary>
/// Searches the nested markup of each element, depth-first, for the first nesting level that
/// contains elements named <paramref name="elementName"/>.
/// </summary>
/// <param name="allList">Elements whose InnerHtml is searched.</param>
/// <param name="elementName">Tag name to match, ignoring case; null matches every element.</param>
/// <returns>The matches of the first level that has any, or null when no level does.</returns>
private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
{
    foreach (var r in allList)
    {
        // Inner HTML without any '<' holds no child elements, so there is nothing to search.
        if (r.InnerHtml == null || !r.InnerHtml.Contains("<"))
        {
            continue;
        }

        // Parse the children once and reuse them for both the direct match and the recursion.
        var children = RootDescendants(r.InnerHtml);
        var childList = children
            .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
            .ToList();
        if (childList.Count > 0)
        {
            return childList;
        }

        childList = GetDescendantsSource(children, elementName);
        if (childList != null && childList.Count > 0)
        {
            return childList;
        }
    }
    return null;
}
/// <summary>
/// Cuts the first element out of <paramref name="html"/>, appends its text to
/// <paramref name="eleList"/>, removes it from the markup and recurses on what is left until only
/// whitespace remains. Returns without adding anything when no tag name can be read.
/// NOTE(review): most regex literals in this method are empty or cut off in this copy (tag-name,
/// opening/closing tag, single-tag and clean-up patterns) and the List type argument is missing -
/// restore them from the original source before building.
/// NOTE(review): html.Replace(eleHtml, "") removes every occurrence of the element text, so
/// identical sibling elements are presumably collected only once - confirm.
/// NOTE(review): the catch block replaces any exception with a new Exception and does not pass
/// the caught one along (ex is unused).
/// </summary>
/// <param name="html">Markup still to be split into elements.</param>
/// <param name="eleList">Receives the text of each element found, in document order.</param>
private void GetElementsStringList(string html, ref List eleList)
{
HtmlInfo info = new HtmlInfo();
info.TagName = Regex.Match(html, @"(?<=\|\s)", RegexOptions.IgnoreCase).Value;
string currentTagBeginReg = @"";//regex for the opening tag of the current element
string currentTagEndReg = @"\";//regex for the closing tag of the current element
if (string.IsNullOrEmpty(info.TagName)) return;
string eleHtml = "";
//Case 1 (the example markup is missing from this copy)
//Case 2 (the example markup is missing from this copy)
//Case 3: malformed markup (the example is missing from this copy)
//Case 4: "endif" - presumably a conditional-comment block; the example is missing from this copy
if (Regex.IsMatch(html, @""))//single (self-contained) tag
{
eleHtml = Regex.Match(html, @"").Value;
}
else if (!Regex.IsMatch(html, currentTagEndReg))//no closing tag
{
if (Regex.IsMatch(html, @"\s{0,10}\
{
eleHtml = GetElementString(html, @"\s{0,10}\", 1);
}
else
{
eleHtml = Regex.Match(html, currentTagBeginReg,RegexOptions.Singleline).Value;
}
}
else
{
eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);
}
try
{
eleList.Add(eleHtml);
html = html.Replace(eleHtml, "");
html = Regex.Replace(html, @"", "");
if (!Regex.IsMatch(html, @"^\s*$"))
{
GetElementsStringList(html, ref eleList);
}
}
catch (Exception ex)
{
throw new Exception("SORRY,您的HTML格式不能解析!!!");
}
}
/// <summary>
/// Returns the text of the first complete element in <paramref name="html"/>. Starting at the first
/// opening tag, the candidate is extended by one closing tag at a time until it holds as many
/// opening tags as closing tags, so nested elements of the same name stay inside their parent.
/// </summary>
/// <param name="html">Markup to search.</param>
/// <param name="currentTagBeginReg">Regex that matches the element's opening tag.</param>
/// <param name="currentTagEndReg">Regex that matches the element's closing tag.</param>
/// <param name="i">Number of closing tags the first candidate spans.</param>
/// <returns>The balanced element text, or an empty string when no balanced candidate exists.</returns>
private string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)
{
    for (var closeCount = i; ; closeCount++)
    {
        string candidate = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, closeCount);

        // Only the number of tags matters, so the match values are not copied into lists.
        int opened = Regex.Matches(candidate, currentTagBeginReg, RegexOptions.Singleline).Count;
        int closed = Regex.Matches(candidate, currentTagEndReg).Count;
        if (opened == closed)
        {
            // Also reached when nothing matched: the candidate is "" and both counts are 0.
            return candidate;
        }
    }
}
/// <summary>
/// Matches the first opening tag in <paramref name="val"/> followed by <paramref name="i"/>
/// lazily matched closing tags, ignoring case and letting '.' span line breaks.
/// </summary>
/// <param name="val">Text to search.</param>
/// <param name="currentTagBeginReg">Regex for the opening tag.</param>
/// <param name="currentTagEndReg">Regex for the closing tag.</param>
/// <param name="i">How many closing tags the match must contain.</param>
/// <returns>The matched text, or an empty string when there is no match.</returns>
private string GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)
{
    // begin((.*?)end){i}? - each lazy group stops at the next closing tag; the group repeats i times.
    string pattern = string.Concat(currentTagBeginReg, @"((.*?)", currentTagEndReg, "){", i.ToString(), "}?");
    var options = RegexOptions.IgnoreCase | RegexOptions.Singleline;
    Match found = Regex.Match(val, pattern, options);
    return found.Value;
}
#endregion
}
public static class XHtmlElementExtendsion
{
/// <summary>
/// Gets the nearest elements named <paramref name="elementName"/> that share one nesting level,
/// searching inside the first element of <paramref name="htmlInfoList"/>.
/// </summary>
/// <param name="htmlInfoList">Elements to search in; only the first one is used.</param>
/// <param name="elementName">Tag name to match; null matches every element.</param>
/// <returns>The matching elements, or an empty list when there is nothing to search or nothing matches.</returns>
public static List<HtmlInfo> Descendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
{
    // A previous query may have matched nothing, and an element may have no inner HTML;
    // in both cases there are simply no descendants, so do not throw.
    var first = htmlInfoList == null ? null : htmlInfoList.FirstOrDefault();
    if (first == null || first.InnerHtml == null)
    {
        return new List<HtmlInfo>();
    }

    XHtmlElement xhe = new XHtmlElement(first.InnerHtml);
    return xhe.Descendants(elementName) ?? new List<HtmlInfo>();
}
/// <summary>
/// Gets the direct child elements of the first element of <paramref name="htmlInfoList"/>,
/// optionally filtered by tag name.
/// </summary>
/// <param name="htmlInfoList">Elements to search in; only the first one is used.</param>
/// <param name="elementName">Tag name to match, ignoring case; null matches every child.</param>
/// <returns>The matching child elements, or an empty list when there is nothing to search or nothing matches.</returns>
public static List<HtmlInfo> ChildDescendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
{
    // A previous query may have matched nothing, and an element may have no inner HTML;
    // in both cases there are simply no children, so do not throw.
    var first = htmlInfoList == null ? null : htmlInfoList.FirstOrDefault();
    if (first == null || first.InnerHtml == null)
    {
        return new List<HtmlInfo>();
    }

    var html = first.InnerHtml;
    XHtmlElement xhe = new XHtmlElement(html);
    // Tag names are compared without regard to case, the same way XHtmlElement.Descendants compares them.
    return xhe.RootDescendants(html)
        .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
        .ToList();
}
/// <summary>
/// Gets the parent level of the first element in <paramref name="htmlInfoList"/>.
/// The element's SameLeveHtml is replaced inside <paramref name="fullHtml"/> by a GUID placeholder,
/// a pattern is matched against the result, the placeholder is swapped back, and the first-level
/// elements of the matched text are returned.
/// NOTE(review): the pattern passed to Regex.Match and the List / IEnumerable type arguments appear
/// to have been stripped in this copy; what remains of the pattern is not a complete expression -
/// restore it from the original source before building.
/// NOTE(review): First() throws on an empty sequence - confirm callers never pass one.
/// </summary>
/// <param name="htmlInfoList">Elements whose parent is wanted; only the first one is used.</param>
/// <param name="fullHtml">The complete document that contains the elements.</param>
/// <returns>The first-level elements parsed from the matched parent markup.</returns>
public static List ParentDescendant(this IEnumerable htmlInfoList,string fullHtml)
{
var saveLeveHtml = htmlInfoList.First().SameLeveHtml;
// Placeholder that stands in for the element's level while the surrounding markup is matched.
string replaceGuid=Guid.NewGuid().ToString();
fullHtml = fullHtml.Replace(saveLeveHtml,replaceGuid);
var parentHtml = Regex.Match(fullHtml, @"[^").Value;
// Put the original markup back in place of the placeholder.
parentHtml = parentHtml.Replace(replaceGuid, saveLeveHtml);
XHtmlElement xhe = new XHtmlElement(parentHtml);
return xhe.RootDescendants();
}
}
/// <summary>
/// Describes one parsed HTML element.
/// </summary>
public class HtmlInfo
{
    /// <summary>
    /// Tag name of the element.
    /// </summary>
    public string TagName { get; set; }

    /// <summary>
    /// Attributes of the element, keyed by attribute name.
    /// </summary>
    public Dictionary<string, string> Attributes { get; set; }

    /// <summary>
    /// Markup inside the element.
    /// </summary>
    public string InnerHtml { get; set; }

    /// <summary>
    /// Element text as the parser cut it out of the source.
    /// </summary>
    public string OldFullHtml { get; set; }

    /// <summary>
    /// The fragment in which the parser found this element as a first-level element.
    /// </summary>
    public string SameLeveHtml { get; set; }

    /// <summary>
    /// Rebuilds the element's markup from <see cref="TagName"/>, <see cref="Attributes"/> and
    /// <see cref="InnerHtml"/>: an opening tag with every attribute written as name="value",
    /// the inner HTML, then the closing tag. An element without attributes gets no trailing
    /// space in its opening tag, and a null InnerHtml renders as empty content.
    /// </summary>
    public string FullHtml
    {
        get
        {
            StringBuilder reval = new StringBuilder();
            reval.Append('<').Append(TagName);
            if (Attributes != null)
            {
                foreach (var attribute in Attributes)
                {
                    reval.AppendFormat(" {0}=\"{1}\"", attribute.Key, attribute.Value);
                }
            }
            reval.Append('>').Append(InnerHtml);
            reval.Append("</").Append(TagName).Append('>');
            return reval.ToString();
        }
    }
}
}
前台HTML: