这篇文章主要介绍了C#自写的一个HTML解析类(类似XElement语法),本文给出了实现代码和使用实例,同时给出了测试HTML实例,需要的朋友可以参考下
功能:
1、轻松获取指定的HTML元素。
2、可以根据属性标签进行筛选
3、返回的都是List强类型,无需转换
用过XElement的都知道 用来解析XML非常的方便,但是对于HTML的格式多样化实在是没办法兼容。
所以我就写了这么一个类似XElement的 XHtmlElement
用法:
// Read the sample page from disk and wrap its markup in a parser instance.
string htmlPath = Server.MapPath("~/file/test.htm");
string htmlSource = FileHelper.FileToString(htmlPath);
XHtmlElement document = new XHtmlElement(htmlSource);

// <a> elements that are direct children of <body> and whose class attribute is exactly "icon".
var iconLinks = document.Descendants("body")
    .ChildDescendants("a")
    .Where(element => element.Attributes.Any(attr => attr.Key == "class" && attr.Value == "icon"))
    .ToList();

// <a> elements (from the nearest level that contains any) that carry an href attribute.
var hrefLinks = document.Descendants("a")
    .Where(element => element.Attributes.Any(attr => attr.Key == "href"))
    .ToList();
foreach (var anchor in hrefLinks)
{
    // The filter above guarantees the key exists, so the indexer cannot miss.
    Response.Write(anchor.Attributes["href"]); // write the href value to the response
}

// Descendants returns a list: every <img> found at the nearest level that contains one.
var images = document.Descendants("img");

// Likewise for <p>: the nearest <p> element together with its same-level <p> siblings.
var paragraphs = document.Descendants("p");
代码:
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Web;
- using System.Text;
- using System.Text.RegularExpressions;
- namespace SyntacticSugar
- {
/// <summary>
/// HTML parsing helper with an XElement-like query API.
/// Created: 2015-4-23
/// Modified: -
/// Author: sunkaixuan
/// QQ: 610262374 (feedback on naming, syntax and anything else is welcome)
/// </summary>
/// <remarks>
/// Parsing is regex based and best effort. Elements are located by scanning for the first
/// opening tag and then balancing opening/closing tags of the same name. Markup that appears
/// inside HTML comments or script text is not distinguished from real elements, and an
/// element whose same-name tags cannot be balanced raises <see cref="FormatException"/>.
/// </remarks>
public class XHtmlElement
{
    /// <summary>Options used by every tag regex: HTML tag names are case-insensitive and tags may span lines.</summary>
    private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    /// <summary>Start of an IE conditional comment, e.g. the "if" marker that opens the block.</summary>
    private const string ConditionalCommentBeginReg = @"\s{0,10}<!--\[if";

    /// <summary>End of an IE conditional comment ("[endif]" followed by the comment terminator).</summary>
    private const string ConditionalCommentEndReg = @"\[endif\]\s*-->";

    /// <summary>
    /// One name="value" or name='value' pair inside an opening tag. The value may be empty and
    /// may contain '='; the name may contain digits, '-', ':' and '.' (e.g. data-id, xml:lang).
    /// </summary>
    private const string AttributeReg =
        @"(?<=\s)(?<name>[a-zA-Z_:][\w:.\-]*)\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)')";

    private readonly string _html;

    /// <summary>Creates a parser over the given HTML text.</summary>
    /// <param name="html">HTML to parse. Null is accepted here but rejected by <see cref="Descendants"/>.</param>
    public XHtmlElement(string html)
    {
        _html = html;
    }

    /// <summary>
    /// Returns the nearest elements with the given tag name: the matching root-level elements
    /// if there are any, otherwise the matching elements of the first (depth-first) nested
    /// level that contains a match.
    /// </summary>
    /// <param name="elementName">Tag name to look for (case-insensitive); null matches every element.</param>
    /// <returns>The matching elements; an empty list (never null) when nothing matches.</returns>
    /// <exception cref="ArgumentNullException">The HTML passed to the constructor was null.</exception>
    /// <exception cref="FormatException">An element could not be delimited.</exception>
    public List<HtmlInfo> Descendants(string elementName = null)
    {
        if (_html == null)
        {
            // Two-argument overload: the single-string overload treats its argument as the
            // parameter name, not as the message.
            throw new ArgumentNullException("html", "html不能为空!");
        }

        List<HtmlInfo> roots = RootDescendants(_html);
        List<HtmlInfo> matches = FilterByTagName(roots, elementName);
        return matches.Count > 0 ? matches : GetDescendantsSource(roots, elementName);
    }

    /// <summary>
    /// Parses the top-level elements of <paramref name="html"/> (no recursion into children).
    /// </summary>
    /// <param name="html">HTML fragment to split; null means the HTML given to the constructor.</param>
    /// <returns>One <see cref="HtmlInfo"/> per top-level element, in document order.</returns>
    /// <exception cref="FormatException">An element could not be delimited.</exception>
    public List<HtmlInfo> RootDescendants(string html = null)
    {
        /*
         * Approach:
         * 1. Take the first opening tag and search for its closing tag; every nested opening
         *    tag of the same name requires one more closing tag.
         * 2. Remove that element from the text and repeat for the 2nd .. Nth element.
         */
        if (html == null)
        {
            html = _html;
        }

        List<string> elementStrings = new List<string>();
        GetElementsStringList(html, ref elementStrings);

        List<HtmlInfo> reval = new List<HtmlInfo>(elementStrings.Count);
        foreach (string elementHtml in elementStrings)
        {
            // First tag of the element text; Singleline so an opening tag may span lines.
            string openingTag = Regex.Match(elementHtml, "<.+?>", RegexOptions.Singleline).Value;

            HtmlInfo data = new HtmlInfo();
            data.OldFullHtml = elementHtml;
            data.SameLeveHtml = html;
            // Name of the first real opening tag; accepts digits (h1..h6) and "<img/>" without a space.
            data.TagName = Regex.Match(elementHtml, @"<\s*(?<tag>[a-zA-Z][a-zA-Z0-9]*)(?=[\s/>])").Groups["tag"].Value;
            // Everything between the first '>' and the last '<' of the element text.
            data.InnerHtml = Regex.Match(elementHtml, @"(?<=\>).+(?=<)", RegexOptions.Singleline).Value;
            data.Attributes = ParseAttributes(openingTag);
            reval.Add(data);
        }
        return reval;
    }

    #region private
    /// <summary>Keeps the elements whose tag name equals <paramref name="elementName"/> (ordinal, case-insensitive); null keeps all.</summary>
    private static List<HtmlInfo> FilterByTagName(IEnumerable<HtmlInfo> elements, string elementName)
    {
        return elements
            .Where(e => elementName == null || string.Equals(e.TagName, elementName, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    /// <summary>Extracts the quoted attributes of an opening tag; on duplicate names the first occurrence wins.</summary>
    private static Dictionary<string, string> ParseAttributes(string openingTag)
    {
        Dictionary<string, string> attributes = new Dictionary<string, string>();
        foreach (Match attribute in Regex.Matches(openingTag, AttributeReg, RegexOptions.Singleline))
        {
            string name = attribute.Groups["name"].Value;
            if (!attributes.ContainsKey(name))
            {
                attributes.Add(name, attribute.Groups["value"].Value);
            }
        }
        return attributes;
    }

    /// <summary>
    /// Depth-first search below <paramref name="allList"/> for the first level that contains
    /// elements named <paramref name="elementName"/>.
    /// </summary>
    /// <returns>The matches of that level, or an empty list when no level contains a match.</returns>
    private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
    {
        foreach (HtmlInfo parent in allList)
        {
            // Without a '<' the inner text cannot contain child elements.
            if (string.IsNullOrEmpty(parent.InnerHtml) || !parent.InnerHtml.Contains("<"))
            {
                continue;
            }

            // Parse the children once and reuse them for both the filter and the recursion.
            List<HtmlInfo> children = RootDescendants(parent.InnerHtml);
            List<HtmlInfo> matches = FilterByTagName(children, elementName);
            if (matches.Count > 0)
            {
                return matches;
            }

            List<HtmlInfo> deeper = GetDescendantsSource(children, elementName);
            if (deeper.Count > 0)
            {
                return deeper;
            }
        }
        return new List<HtmlInfo>();
    }

    /// <summary>
    /// Splits <paramref name="html"/> into the source text of its top-level elements and
    /// appends each one to <paramref name="eleList"/>.
    /// </summary>
    /// <exception cref="FormatException">The first tag found could not be turned into an element.</exception>
    private void GetElementsStringList(string html, ref List<string> eleList)
    {
        // Iterative on purpose: one stack frame per element would overflow on large documents.
        while (true)
        {
            // First thing that looks like an opening tag name ('<' + optional spaces + name).
            string tagName = Regex.Match(html, @"(?<=<\s{0,5})[a-zA-Z][a-zA-Z0-9]*(?=[\s/>])").Value;
            if (string.IsNullOrEmpty(tagName))
            {
                return;
            }

            Match element = FindFirstElement(html, tagName);
            if (!element.Success || element.Length == 0)
            {
                throw new FormatException("SORRY,您的HTML格式不能解析!!!");
            }

            eleList.Add(element.Value);
            // Remove exactly the matched span so identical siblings are not dropped with it.
            html = html.Remove(element.Index, element.Length);
            html = Regex.Replace(html, @"<\!DOCTYPE.*?>", "");
            if (Regex.IsMatch(html, @"^\s*$"))
            {
                return;
            }
        }
    }

    /// <summary>
    /// Locates the first element named <paramref name="tagName"/> in <paramref name="html"/>.
    /// </summary>
    /// <returns>The match covering the element, or a failed match when it cannot be delimited.</returns>
    private Match FindFirstElement(string html, string tagName)
    {
        // The look-ahead stops "a" from matching "abbr", "i" from matching "img", and so on.
        string currentTagBeginReg = @"<\s{0,10}" + tagName + @"(?=[\s/>]).*?>";
        string currentTagEndReg = @"<\/" + tagName + @"\s*>";

        Match firstOpeningTag = Regex.Match(html, currentTagBeginReg, TagOptions);

        // Case 1: self-closing element such as <a/>.
        if (firstOpeningTag.Success && firstOpeningTag.Value.EndsWith("/>", StringComparison.Ordinal))
        {
            return firstOpeningTag;
        }

        // Case 2: <a></a>, possibly with nested elements of the same name.
        if (Regex.IsMatch(html, currentTagEndReg, TagOptions))
        {
            return FindBalancedElement(html, currentTagBeginReg, currentTagEndReg, 1);
        }

        // Case 4: the unclosed tag lives in text that has an IE conditional comment; take the whole comment block.
        if (Regex.IsMatch(html, ConditionalCommentBeginReg))
        {
            return FindBalancedElement(html, ConditionalCommentBeginReg, ConditionalCommentEndReg, 1);
        }

        // Case 3: <a> without a closing tag (void element or malformed); the tag alone is the element.
        return firstOpeningTag;
    }

    /// <summary>
    /// Extends the candidate by one closing tag at a time, starting with <paramref name="i"/>
    /// closing tags, until it contains as many opening tags as closing tags.
    /// </summary>
    /// <returns>The balanced match, or a failed match when the text runs out of closing tags.</returns>
    private Match FindBalancedElement(string html, string currentTagBeginReg, string currentTagEndReg, int i)
    {
        for (int closeCount = i; ; closeCount++)
        {
            Match candidate = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, closeCount);
            if (!candidate.Success)
            {
                return candidate;
            }

            // Self-closing occurrences need no closing tag, so they do not count as opened.
            int opened = Regex.Matches(candidate.Value, currentTagBeginReg, TagOptions)
                .Cast<Match>()
                .Count(m => !m.Value.EndsWith("/>", StringComparison.Ordinal));
            int closed = Regex.Matches(candidate.Value, currentTagEndReg, TagOptions).Count;
            if (opened == closed)
            {
                return candidate;
            }
        }
    }

    /// <summary>Matches an opening tag followed by exactly <paramref name="i"/> closing tags (shortest span).</summary>
    private Match GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)
    {
        return Regex.Match(val, currentTagBeginReg + "(?:.*?" + currentTagEndReg + "){" + i + "}", TagOptions);
    }
    #endregion
}
/// <summary>
/// Query extensions that continue a search from a list of already parsed elements.
/// Each method works on the FIRST element of the list only.
/// </summary>
public static class XHtmlElementExtendsion
{
    /// <summary>
    /// Returns the nearest elements named <paramref name="elementName"/> inside the first
    /// element of <paramref name="htmlInfoList"/>.
    /// </summary>
    /// <param name="htmlInfoList">Elements to search in; only the first one is used.</param>
    /// <param name="elementName">Tag name to look for; null matches every element.</param>
    /// <returns>The matching elements; an empty list when the input is null/empty or nothing matches.</returns>
    public static List<HtmlInfo> Descendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
        string html = FirstInnerHtml(htmlInfoList);
        if (string.IsNullOrEmpty(html))
        {
            return new List<HtmlInfo>();
        }

        XHtmlElement xhe = new XHtmlElement(html);
        // Normalise a null result so callers can always chain LINQ operators.
        return xhe.Descendants(elementName) ?? new List<HtmlInfo>();
    }

    /// <summary>
    /// Returns the direct children of the first element of <paramref name="htmlInfoList"/>.
    /// </summary>
    /// <param name="htmlInfoList">Elements to search in; only the first one is used.</param>
    /// <param name="elementName">Tag name to keep (case-insensitive); null keeps every child.</param>
    /// <returns>The matching children; an empty list when the input is null/empty or nothing matches.</returns>
    public static List<HtmlInfo> ChildDescendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
        string html = FirstInnerHtml(htmlInfoList);
        if (string.IsNullOrEmpty(html))
        {
            return new List<HtmlInfo>();
        }

        XHtmlElement xhe = new XHtmlElement(html);
        return xhe.RootDescendants(html)
            .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    /// <summary>
    /// Finds the element of <paramref name="fullHtml"/> that encloses the first element of
    /// <paramref name="htmlInfoList"/> and its same-level siblings.
    /// </summary>
    /// <param name="htmlInfoList">Elements whose parent is wanted; only the first one is used.</param>
    /// <param name="fullHtml">The complete HTML the elements were parsed from.</param>
    /// <returns>The parent element(s); an empty list when no parent can be located.</returns>
    public static List<HtmlInfo> ParentDescendant(this IEnumerable<HtmlInfo> htmlInfoList, string fullHtml)
    {
        HtmlInfo first = htmlInfoList == null ? null : htmlInfoList.FirstOrDefault();
        if (first == null || string.IsNullOrEmpty(first.SameLeveHtml) || string.IsNullOrEmpty(fullHtml))
        {
            return new List<HtmlInfo>();
        }

        // Swap the sibling-level markup for a unique placeholder so the enclosing tag pair
        // can be matched without having to balance the markup in between.
        string replaceGuid = Guid.NewGuid().ToString();
        string markedHtml = fullHtml.Replace(first.SameLeveHtml, replaceGuid);
        string parentHtml = Regex.Match(
            markedHtml,
            @"<[^<]+?>[^<]*?" + Regex.Escape(replaceGuid) + @".*?<\/.+?>",
            RegexOptions.Singleline).Value;
        parentHtml = parentHtml.Replace(replaceGuid, first.SameLeveHtml);

        XHtmlElement xhe = new XHtmlElement(parentHtml);
        return xhe.RootDescendants();
    }

    /// <summary>Inner HTML of the first element, or null when the sequence is null or empty.</summary>
    private static string FirstInnerHtml(IEnumerable<HtmlInfo> htmlInfoList)
    {
        HtmlInfo first = htmlInfoList == null ? null : htmlInfoList.FirstOrDefault();
        return first == null ? null : first.InnerHtml;
    }
}
/// <summary>
/// Describes one parsed HTML element.
/// </summary>
public class HtmlInfo
{
    /// <summary>
    /// Creates an element description with an empty (non-null) attribute collection.
    /// </summary>
    public HtmlInfo()
    {
        Attributes = new Dictionary<string, string>();
    }

    /// <summary>
    /// Tag name of the element.
    /// </summary>
    public string TagName { get; set; }

    /// <summary>
    /// Attributes of the element, keyed by attribute name.
    /// </summary>
    public Dictionary<string, string> Attributes { get; set; }

    /// <summary>
    /// Markup between the element's opening and closing tags.
    /// </summary>
    public string InnerHtml { get; set; }

    /// <summary>
    /// Source text of the element exactly as it was found by the parser.
    /// </summary>
    public string OldFullHtml { get; set; }

    /// <summary>
    /// The HTML fragment this element was parsed from, i.e. the element and its same-level siblings.
    /// </summary>
    public string SameLeveHtml { get; set; }

    /// <summary>
    /// Rebuilds the element's markup from <see cref="TagName"/>, <see cref="Attributes"/> and
    /// <see cref="InnerHtml"/>. Attribute values are written as-is (no HTML encoding).
    /// </summary>
    public string FullHtml
    {
        get
        {
            StringBuilder reval = new StringBuilder();
            reval.Append('<').Append(TagName);
            if (Attributes != null)
            {
                // Each attribute brings its own leading space, so a tag without attributes
                // is rendered as "<p>" rather than "<p >".
                foreach (KeyValuePair<string, string> attribute in Attributes)
                {
                    reval.Append(' ').Append(attribute.Key).Append("=\"").Append(attribute.Value).Append('"');
                }
            }
            reval.Append('>').Append(InnerHtml).Append("</").Append(TagName).Append('>');
            return reval.ToString();
        }
    }
}
- }
前台HTML:
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
- <html xmlns="http://www.w3.org/1999/xhtml">
- <head>
- <title></title>
- </head>
- <body>
- <a id="1">我是1</a>
- <a id="2" class="icon">icon</a>
- <img />
- </body>
- </html>