c# .htm和.html,C#自写的一个HTML解析类(类似XElement语法)

功能:

1、轻松获取指定的HTML元素。

2、可以根据属性标签进行筛选

3、返回的都是List强类型集合,无需转换

用过XElement的都知道 用来解析XML非常的方便,但是对于HTML的格式多样化实在是没办法兼容。

所以我就写了这么一个类似XElement的 XHtmlElement

用法:

// Read the HTML text to parse from a file under the site root.
string filePath = Server.MapPath("~/file/test.htm");
string mailBody = FileHelper.FileToString(filePath);

XHtmlElement xh = new XHtmlElement(mailBody);

// <a> elements one level below <body> whose class attribute is exactly "icon".
var bodyAnchors = xh.Descendants("body").ChildDescendants("a");
var link = bodyAnchors
    .Where(c => c.Attributes.Any(a => a.Key == "class" && a.Value == "icon"))
    .ToList();

// <a> elements that carry an href attribute; each href is written to the response.
var links = xh.Descendants("a")
    .Where(c => c.Attributes.Any(a => a.Key == "href"))
    .ToList();
foreach (var anchor in links)
{
    var href = anchor.Attributes.Single(c => c.Key == "href").Value;
    Response.Write(href);
}

// The <img> elements on the nearest level that contains one (a list, not a single element).
var img = xh.Descendants("img");

// The nearest <p> element together with the other <p> elements on that same level.
var ps = xh.Descendants("p");

代码:

using System;

using System.Collections.Generic;

using System.Linq;

using System.Web;

using System.Text;

using System.Text.RegularExpressions;

namespace SyntacticSugar

{

/// <summary>
/// Description: a small HTML parsing helper with an XElement-like query style.
/// Created: 2015-4-23
/// Modified: -
/// Author: sunkaixuan
/// QQ: 610262374 (feedback and suggestions on naming and syntax are welcome)
/// </summary>
/// <remarks>
/// NOTE(review): the published listing had lost everything between angle brackets
/// (generic type arguments and most regular-expression literals). They are
/// reconstructed here from the surviving fragments and the surrounding logic;
/// verify the patterns against the original source before relying on exact
/// matching behaviour. This is a regex-based splitter, not a conforming HTML parser.
/// </remarks>
public class XHtmlElement
{
    private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Start of a conditional comment block and its terminator.
    private const string ConditionalBeginReg = @"\s{0,10}<!--\[if";
    private const string ConditionalEndReg = @"\[endif\]-->";

    // Message thrown when the markup cannot be split into elements.
    private const string UnparsableMessage = "SORRY,您的HTML格式不能解析!!!";

    // Name of the first real tag; declarations and comments ("<!...") are skipped
    // because the name must start with a letter.
    private static readonly Regex FirstTagNameRegex =
        new Regex(@"<\s{0,10}([a-zA-Z][a-zA-Z0-9\-]*)(?=[\s/>])");

    // Everything between the first '>' and the last '<' of an element.
    private static readonly Regex InnerHtmlRegex =
        new Regex(@"(?<=\>).+(?=<)", RegexOptions.Singleline);

    // The first tag of an element string, i.e. its opening tag.
    private static readonly Regex OpeningTagRegex = new Regex(@"<[^>]+>");

    // name="value" or name='value'; the value may be empty and may contain '='.
    private static readonly Regex AttributeRegex =
        new Regex(@"([a-zA-Z_:][\w:.\-]*)\s*=\s*(?:""([^""]*)""|'([^']*)')");

    private static readonly Regex DoctypeRegex =
        new Regex(@"<!DOCTYPE[^>]*>", RegexOptions.IgnoreCase);

    private readonly string _html;

    /// <summary>Creates a parser over the given HTML text.</summary>
    /// <param name="html">The HTML text to query.</param>
    public XHtmlElement(string html)
    {
        _html = html;
    }

    /// <summary>
    /// Returns the matching elements of the nearest level that contains a match:
    /// the root level is tried first, then the children of each root element in
    /// document order (depth first).
    /// </summary>
    /// <param name="elementName">Tag name to look for (case-insensitive); null matches every element.</param>
    /// <returns>The matching elements; an empty list when nothing matches.</returns>
    /// <exception cref="ArgumentNullException">The instance was created with null HTML.</exception>
    /// <exception cref="Exception">The markup could not be split into elements.</exception>
    public List<HtmlInfo> Descendants(string elementName = null)
    {
        if (_html == null)
        {
            // Two-argument overload: the single-argument one treats its argument as the
            // parameter name, which put the message into ParamName.
            throw new ArgumentNullException("html", "html不能这空!");
        }

        var allList = RootDescendants(_html);
        var reval = allList.Where(c => IsNameMatch(c, elementName)).ToList();
        if (reval.Count == 0)
        {
            reval = GetDescendantsSource(allList, elementName);
        }

        return reval;
    }

    /// <summary>
    /// Splits the HTML into its first-level elements. Starting from the first tag, the
    /// matching end tag is searched; nested tags of the same name are balanced by
    /// counting. The extracted element is removed and the step repeats on the rest.
    /// </summary>
    /// <param name="html">HTML to split; null means the HTML given to the constructor.</param>
    /// <returns>One <see cref="HtmlInfo"/> per first-level element; empty when there is none.</returns>
    /// <exception cref="Exception">The markup could not be split into elements.</exception>
    public List<HtmlInfo> RootDescendants(string html = null)
    {
        if (html == null)
        {
            html = _html;
        }

        var reval = new List<HtmlInfo>();
        if (string.IsNullOrEmpty(html))
        {
            return reval;
        }

        var eleList = new List<string>();
        GetElementsStringList(html, ref eleList);
        foreach (var r in eleList)
        {
            var data = new HtmlInfo();
            data.OldFullHtml = r;
            data.SameLeveHtml = html;
            data.TagName = GetFirstTagName(r);
            data.InnerHtml = InnerHtmlRegex.Match(r).Value;
            data.Attributes = ParseAttributes(OpeningTagRegex.Match(r).Value);
            reval.Add(data);
        }

        return reval;
    }

    #region private

    // True when no name filter is given or the element's tag name equals it, ignoring case.
    private static bool IsNameMatch(HtmlInfo info, string elementName)
    {
        return elementName == null
            || string.Equals(info.TagName, elementName, StringComparison.OrdinalIgnoreCase);
    }

    // Returns the name of the first tag in the text, or an empty string when there is none.
    private static string GetFirstTagName(string html)
    {
        return FirstTagNameRegex.Match(html).Groups[1].Value;
    }

    // Reads the attributes of an opening tag. On duplicate names the first one wins
    // instead of throwing from Dictionary.Add.
    private static Dictionary<string, string> ParseAttributes(string openingTag)
    {
        var attributes = new Dictionary<string, string>();
        foreach (Match m in AttributeRegex.Matches(openingTag))
        {
            string key = m.Groups[1].Value;
            if (attributes.ContainsKey(key))
            {
                continue;
            }

            // Group 2 holds a double-quoted value, group 3 a single-quoted one.
            attributes[key] = m.Groups[2].Success ? m.Groups[2].Value : m.Groups[3].Value;
        }

        return attributes;
    }

    // Depth-first search: returns the matches of the first nested level that has any,
    // or an empty list (never null) when no level has a match.
    private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
    {
        foreach (var r in allList)
        {
            // No '<' means there is no child markup to look into.
            if (r.InnerHtml == null || !r.InnerHtml.Contains("<"))
            {
                continue;
            }

            var children = RootDescendants(r.InnerHtml);
            var childList = children.Where(c => IsNameMatch(c, elementName)).ToList();
            if (childList.Count == 0)
            {
                childList = GetDescendantsSource(children, elementName);
            }

            if (childList.Count > 0)
            {
                return childList;
            }
        }

        return new List<HtmlInfo>();
    }

    // Appends the raw text of every first-level element of html to eleList.
    private void GetElementsStringList(string html, ref List<string> eleList)
    {
        while (!string.IsNullOrWhiteSpace(html))
        {
            string tagName = GetFirstTagName(html);
            if (string.IsNullOrEmpty(tagName))
            {
                return; // only text, comments or declarations are left
            }

            string name = Regex.Escape(tagName);
            // The lookahead stops "a" from also matching "abbr" or "article".
            string anyOpenReg = @"<\s{0,10}" + name + @"(?=[\s/>])[^>]*>";
            // Opening tag that is not self-closed; used for nesting balance.
            string currentTagBeginReg = @"<\s{0,10}" + name + @"(?=[\s/>])[^>]*(?<!/)>";
            string currentTagEndReg = @"<\/\s*" + name + @"\s*>";

            string firstOpen = Regex.Match(html, anyOpenReg, TagOptions).Value;
            string eleHtml;

            if (firstOpen.EndsWith("/>", StringComparison.Ordinal))
            {
                // Case 1: self-closed tag.
                eleHtml = firstOpen;
            }
            else if (!Regex.IsMatch(html, currentTagEndReg, TagOptions))
            {
                // No end tag anywhere: either a conditional comment block (case 4)
                // or an element that is never closed (case 3).
                if (Regex.IsMatch(html, ConditionalBeginReg, TagOptions))
                {
                    eleHtml = GetElementString(html, ConditionalBeginReg, ConditionalEndReg, 1);
                }
                else
                {
                    eleHtml = firstOpen;
                }
            }
            else
            {
                // Case 2: regular element with an end tag.
                eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);
            }

            if (string.IsNullOrEmpty(eleHtml))
            {
                // Nothing could be extracted (unbalanced or broken markup). The original
                // reached the same exception through String.Replace rejecting "".
                throw new Exception(UnparsableMessage);
            }

            eleList.Add(eleHtml);

            // Remove only the element just extracted; String.Replace would also drop
            // identical siblings that have not been collected yet.
            int index = html.IndexOf(eleHtml, StringComparison.Ordinal);
            if (index >= 0)
            {
                html = html.Remove(index, eleHtml.Length);
            }

            html = DoctypeRegex.Replace(html, "");
        }
    }

    // Grows the candidate from the first begin tag to the i-th end tag until begin and
    // end tags inside it are balanced. Returns "" when no balanced span exists.
    private string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)
    {
        while (true)
        {
            string newHtml = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, i);
            int beginCount = Regex.Matches(newHtml, currentTagBeginReg, TagOptions).Count;
            int endCount = Regex.Matches(newHtml, currentTagEndReg, TagOptions).Count;
            if (beginCount == endCount)
            {
                // Also reached with 0 == 0 once i exceeds the number of end tags.
                return newHtml;
            }

            i++;
        }
    }

    // Matches from the first begin tag up to and including the i-th following end tag.
    private string GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)
    {
        string pattern = currentTagBeginReg + @"(?:.*?" + currentTagEndReg + "){" + i + "}";
        return Regex.Match(val, pattern, TagOptions).Value;
    }

    #endregion
}

/// <summary>
/// Chaining helpers that continue a query from the first element of a previous result.
/// </summary>
public static class XHtmlElementExtendsion
{
    /// <summary>
    /// Searches inside the first element of the list and returns the matching elements
    /// of the nearest level that contains a match.
    /// </summary>
    /// <param name="htmlInfoList">Result of a previous query; only its first element is used.</param>
    /// <param name="elementName">Tag name to look for; null matches every element.</param>
    /// <returns>The matching elements; an empty list when the input is empty or nothing matches.</returns>
    public static List<HtmlInfo> Descendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
        string html = FirstInnerHtml(htmlInfoList);
        if (html == null)
        {
            return new List<HtmlInfo>();
        }

        XHtmlElement xhe = new XHtmlElement(html);
        return xhe.Descendants(elementName) ?? new List<HtmlInfo>();
    }

    /// <summary>
    /// Returns the direct children of the first element of the list, optionally
    /// filtered by tag name (case-insensitive, like <c>Descendants</c>).
    /// </summary>
    /// <param name="htmlInfoList">Result of a previous query; only its first element is used.</param>
    /// <param name="elementName">Tag name to keep; null keeps every child.</param>
    /// <returns>The matching children; an empty list when the input is empty or nothing matches.</returns>
    public static List<HtmlInfo> ChildDescendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
        string html = FirstInnerHtml(htmlInfoList);
        if (html == null)
        {
            return new List<HtmlInfo>();
        }

        XHtmlElement xhe = new XHtmlElement(html);
        return xhe.RootDescendants(html)
            .Where(c => elementName == null
                || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    /// <summary>
    /// Returns the parent of the first element of the list by locating that element's
    /// sibling-level HTML inside the complete document.
    /// </summary>
    /// <param name="htmlInfoList">Result of a previous query; only its first element is used.</param>
    /// <param name="fullHtml">The complete HTML the elements were parsed from.</param>
    /// <returns>The parent element as a one-item list; empty when no parent is found.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fullHtml"/> is null.</exception>
    public static List<HtmlInfo> ParentDescendant(this IEnumerable<HtmlInfo> htmlInfoList, string fullHtml)
    {
        if (fullHtml == null)
        {
            throw new ArgumentNullException("fullHtml");
        }

        HtmlInfo first = htmlInfoList == null ? null : htmlInfoList.FirstOrDefault();
        string saveLeveHtml = first == null ? null : first.SameLeveHtml;
        if (string.IsNullOrEmpty(saveLeveHtml))
        {
            // String.Replace rejects an empty search string; there is nothing to locate.
            return new List<HtmlInfo>();
        }

        // Swap the sibling-level HTML for a unique marker so that the enclosing
        // element can be matched without having to balance nested tags.
        string replaceGuid = Guid.NewGuid().ToString();
        string markedHtml = fullHtml.Replace(saveLeveHtml, replaceGuid);

        // NOTE(review): the original pattern was lost in the published listing. This one
        // matches "<open tag> marker </close tag>" with only text around the marker;
        // verify against the original source.
        string parentPattern = @"<[^<]+?>[^<]*?" + Regex.Escape(replaceGuid) + @"[^<]*?<\/[^<]+?>";
        string parentHtml = Regex.Match(markedHtml, parentPattern).Value;
        parentHtml = parentHtml.Replace(replaceGuid, saveLeveHtml);

        XHtmlElement xhe = new XHtmlElement(parentHtml);
        return xhe.RootDescendants();
    }

    // InnerHtml of the first element, or null when the sequence is null or empty or
    // the element has no inner HTML.
    private static string FirstInnerHtml(IEnumerable<HtmlInfo> htmlInfoList)
    {
        if (htmlInfoList == null)
        {
            return null;
        }

        HtmlInfo first = htmlInfoList.FirstOrDefault();
        return first == null ? null : first.InnerHtml;
    }
}

/// <summary>
/// Describes one parsed HTML element.
/// </summary>
public class HtmlInfo
{
    /// <summary>Tag name of the element.</summary>
    public string TagName { get; set; }

    /// <summary>Attributes of the element, keyed by attribute name.</summary>
    public Dictionary<string, string> Attributes { get; set; }

    /// <summary>HTML between the element's opening and closing tags.</summary>
    public string InnerHtml { get; set; }

    /// <summary>The element's HTML exactly as it appeared in the parsed text.</summary>
    public string OldFullHtml { get; set; }

    /// <summary>The HTML of the level this element was extracted from (the element and its siblings).</summary>
    public string SameLeveHtml { get; set; }

    /// <summary>
    /// Rebuilds the element's HTML from <see cref="TagName"/>, <see cref="Attributes"/>
    /// and <see cref="InnerHtml"/> as <c>&lt;tag name="value"&gt;inner&lt;/tag&gt;</c>.
    /// Attribute values are written as stored, without any encoding. A closing tag is
    /// always written; use <see cref="OldFullHtml"/> for the original text.
    /// </summary>
    public string FullHtml
    {
        get
        {
            StringBuilder reval = new StringBuilder();
            reval.Append('<').Append(TagName);

            if (Attributes != null)
            {
                foreach (var attribute in Attributes)
                {
                    reval.Append(' ')
                         .Append(attribute.Key)
                         .Append("=\"")
                         .Append(attribute.Value)
                         .Append('"');
                }
            }

            reval.Append('>')
                 .Append(InnerHtml)
                 .Append("</")
                 .Append(TagName)
                 .Append('>');

            return reval.ToString();
        }
    }
}

}

前台HTML:

我是1

icon

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值