C#自写的一个HTML解析类（类似XElement语法）

最新推荐文章于 2022-08-30 17:27:00 发布

my98800

最新推荐文章于 2022-08-30 17:27:00 发布

阅读量1.8k

点赞数

分类专栏： .Net

.Net 专栏收录该内容

364 篇文章 5 订阅

订阅专栏

这篇文章主要介绍了C#自写的一个HTML解析类（类似XElement语法）,本文给出了实现代码和使用实例,同时给出了测试HTML实例,需要的朋友可以参考下

功能：

1、轻松获取指元素HTML元素。
2、可以根据属性标签进行筛选
3、返回的都是Llist强类型无需转换

用过XElement的都知道用来解析XML非常的方便，但是对于HTML的格式多样化实在是没办法兼容。

所以我就写了这么一个类似XElement的 XHTMLElement

用法：

[csharp]view plaincopy 
     
 string filePath = Server.MapPath("~/file/test.htm");  
       //获取HTML代码  
       string mailBody = FileHelper.FileToString(filePath);  
   
       XHtmlElement xh = new XHtmlElement(mailBody);  
   
       //获取body的子集a标签并且class="icon"  
       var link = xh.Descendants("body").ChildDescendants("a").Where(c => c.Attributes.Any(a => a.Key == "class" && a.Value == "icon")).ToList();  
   
       //获取带href的a元素  
       var links = xh.Descendants("a").Where(c => c.Attributes.Any(a => a.Key == "href")).ToList();  
       foreach (var r in links)  
       {  
         Response.Write(r.Attributes.Single(c => c.Key == "href").Value); //出输href  
       }  
   
       //获取第一个img  
       var img = xh.Descendants("img");  
   
       //获取最近的第一个p元素以及与他同一级的其它p元素  
       var ps = xh.Descendants("p");  

代码：

[csharp]view plaincopy 
     
 using System;  
 using System.Collections.Generic;  
 using System.Linq;  
 using System.Web;  
 using System.Text;  
 using System.Text.RegularExpressions;  
   
 namespace SyntacticSugar  
 {  
   /// <summary>  
   /// ** 描述：html解析类  
   /// ** 创始时间：2015-4-23  
   /// ** 修改时间：-  
   /// ** 作者：sunkaixuan  
   /// ** qq：610262374 欢迎交流,共同提高 ,命名语法等写的不好的地方欢迎大家的给出宝贵建议  
   /// </summary>  
   public class XHtmlElement  
   {  
     private string _html;  
     public XHtmlElement(string html)  
     {  
       _html = html;  
     }  
   
     /// <summary>  
     /// 获取最近的相同层级的HTML元素  
     /// </summary>  
     /// <param name="elementName">等于null为所有元素</param>  
     /// <returns></returns>  
     public List<HtmlInfo> Descendants(string elementName = null)  
     {  
       if (_html == null)  
       {  
         throw new ArgumentNullException("html不能这空！");  
       }  
       var allList = RootDescendants(_html);  
       var reval = allList.Where(c => elementName == null || c.TagName.ToLower() == elementName.ToLower()).ToList();  
       if (reval == null || reval.Count == 0)  
       {  
         reval = GetDescendantsSource(allList, elementName);  
       }  
       return reval;  
     }  
   
   
     /// <summary>  
     /// 获取第一级元素  
     /// </summary>  
     /// <param name="elementName"></param>  
     /// <returns></returns>  
     public List<HtmlInfo> RootDescendants(string html = null)  
     {  
       /* 
        * 业务逻辑: 
              * 1、获取第一个html标签一直找结尾标签，如果在这个过程中遇到相同的标签收尾标签就要加1 
              * 2、第一个标签取到后继续第一步操作，找第2个元素 。。第N个元素 
        */  
       if (html == null) html = _html;  
       var firstTag = Regex.Match(html, "<.+?>");  
   
       List<string> eleList = new List<string>();  
       List<HtmlInfo> reval = new List<HtmlInfo>();  
       GetElementsStringList(html, ref eleList);  
       foreach (var r in eleList)  
       {  
         HtmlInfo data = new HtmlInfo();  
         data.OldFullHtml = r;  
         data.SameLeveHtml = html;  
         data.TagName = Regex.Match(r, @"(?<=\s{1}|\<)[a-z,A-Z]+(?=\>|\s)", RegexOptions.IgnoreCase).Value;  
         data.InnerHtml = Regex.Match(r, @"(?<=\>).+(?=<)", RegexOptions.Singleline).Value;  
         var eleBegin = Regex.Match(r, "<.+?>").Value;  
         var attrList = Regex.Matches(eleBegin, @"[a-z,A-Z]+\="".+?""").Cast<Match>().Select(c => new { key = c.Value.Split('=').First(), value = c.Value.Split('=').Last().TrimEnd('"').TrimStart('"') }).ToList();  
         data.Attributes = new Dictionary<string, string>();  
         if (attrList != null && attrList.Count > 0)  
         {  
           foreach (var a in attrList)  
           {  
             data.Attributes.Add(a.key, a.value);  
           }  
         }  
         reval.Add(data);  
       }  
       return reval;  
   
     }  
  
  
  
  
  
     #region private  
     private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)  
     {  
       foreach (var r in allList)  
       {  
         if (r.InnerHtml == null || !r.InnerHtml.Contains("<")) continue;  
         var childList = RootDescendants(r.InnerHtml).Where(c => elementName == null || c.TagName.ToLower() == elementName.ToLower()).ToList();  
         if (childList == null || childList.Count == 0)  
         {  
           childList = GetDescendantsSource(RootDescendants(r.InnerHtml), elementName);  
           if (childList != null && childList.Count > 0)  
             return childList;  
         }  
         else  
         {  
           return childList;  
         }  
       }  
       return null;  
     }  
   
     private void GetElementsStringList(string html, ref List<string> eleList)  
     {  
       HtmlInfo info = new HtmlInfo();  
       info.TagName = Regex.Match(html, @"(?<=\<\s{0,5}|\<)([a-z,A-Z]+|h\d{1})(?=\>|\s)", RegexOptions.IgnoreCase).Value;  
       string currentTagBeginReg = @"<\s{0,10}" + info.TagName + @".*?>";//获取当前标签元素开始标签正则  
       string currentTagEndReg = @"\<\/" + info.TagName + @"\>";//获取当前标签元素收尾标签正则  
       if (string.IsNullOrEmpty(info.TagName)) return;  
   
       string eleHtml = "";  
       //情况1 <a/>  
       //情况2 <a></a>  
       //情况3 <a> 错误格式  
       //情况4endif  
       if (Regex.IsMatch(html, @"<\s{0,10}" + info.TagName + "[^<].*?/>"))//单标签  
       {  
         eleHtml = Regex.Match(html, @"<\s{0,10}" + info.TagName + "[^<].*?/>").Value;  
       }  
       else if (!Regex.IsMatch(html, currentTagEndReg))//没有收尾  
       {  
         if (Regex.IsMatch(html, @"\s{0,10}\<\!\-\-\[if"))  
         {  
          eleHtml = GetElementString(html, @"\s{0,10}\<\!\-\-\[if", @" 
       
        
        endif 
        
     
 \-\-\>", 1);  
         }  
         else  
         {  
           eleHtml = Regex.Match(html, currentTagBeginReg,RegexOptions.Singleline).Value;  
         }  
       }  
       else  
       {  
         eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);  
       }  
   
   
       try  
       {  
         eleList.Add(eleHtml);  
         html = html.Replace(eleHtml, "");  
         html = Regex.Replace(html, @"<\!DOCTYPE.*?>", "");  
         if (!Regex.IsMatch(html, @"^\s*$"))  
         {  
           GetElementsStringList(html, ref eleList);  
         }  
   
       }  
       catch (Exception ex)  
       {  
         throw new Exception("SORRY,您的HTML格式不能解析！！！");  
   
       }  
   
     }  
   
     private string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)  
     {  
   
       string newHtml = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, i);  
       var currentTagBeginMatches = Regex.Matches(newHtml, currentTagBeginReg, RegexOptions.Singleline).Cast<Match>().Select(c => c.Value).ToList();  
       var currentTagEndMatches = Regex.Matches(newHtml, currentTagEndReg).Cast<Match>().Select(c => c.Value).ToList();  
       if (currentTagBeginMatches.Count == currentTagEndMatches.Count)  
       { //两个签标元素相等  
         return newHtml;  
       }  
       return GetElementString(html, currentTagBeginReg, currentTagEndReg, ++i);  
     }  
   
     private string GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)  
     {  
       return Regex.Match(val, currentTagBeginReg + @"((.*?)" + currentTagEndReg + "){" + i + "}?", RegexOptions.IgnoreCase | RegexOptions.Singleline).Value;  
     }  
     #endregion  
   
   
   
   }  
   public static class XHtmlElementExtendsion  
   {  
     /// <summary>  
     /// 获取最近的相同层级的HTML元素  
     /// </summary>  
     /// <param name="elementName">等于null为所有元素</param>  
     /// <returns></returns>  
     public static List<HtmlInfo> Descendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)  
     {  
       var html = htmlInfoList.First().InnerHtml;  
       XHtmlElement xhe = new XHtmlElement(html);  
       return xhe.Descendants(elementName);  
     }  
     /// <summary>  
     /// 获取下级元素  
     /// </summary>  
     /// <param name="elementName"></param>  
     /// <returns></returns>  
     public static List<HtmlInfo> ChildDescendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)  
     {  
       var html = htmlInfoList.First().InnerHtml;  
       XHtmlElement xhe = new XHtmlElement(html);  
       return xhe.RootDescendants(html).Where(c => elementName == null || c.TagName == elementName).ToList();  
     }  
   
     /// <summary>  
     /// 获取父级  
     /// </summary>  
     /// <param name="htmlInfoList"></param>  
     /// <returns></returns>  
     public static List<HtmlInfo> ParentDescendant(this IEnumerable<HtmlInfo> htmlInfoList,string fullHtml)  
     {  
       var saveLeveHtml = htmlInfoList.First().SameLeveHtml;  
       string replaceGuid=Guid.NewGuid().ToString();  
       fullHtml = fullHtml.Replace(saveLeveHtml,replaceGuid);  
       var parentHtml = Regex.Match(fullHtml, @"<[^<]+?>[^<]*?" + replaceGuid + @".*?<\/.+?>").Value;  
       parentHtml = parentHtml.Replace(replaceGuid, saveLeveHtml);  
       XHtmlElement xhe = new XHtmlElement(parentHtml);  
       return xhe.RootDescendants();  
     }  
   }  
   /// <summary>  
   /// html信息类  
   /// </summary>  
   public class HtmlInfo  
   {  
     /// <summary>  
     /// 元素名  
     /// </summary>  
     public string TagName { get; set; }  
     /// <summary>  
     /// 元素属性  
     /// </summary>  
     public Dictionary<string, string> Attributes { get; set; }  
     /// <summary>  
     /// 元素内部html  
     /// </summary>  
     public string InnerHtml { get; set; }  
   
     public string OldFullHtml { get; set; }  
   
     public string SameLeveHtml { get; set; }  
   
     /// <summary>  
     /// 得到元素的html  
     /// </summary>  
     /// <returns></returns>  
     public string FullHtml  
     {  
       get  
       {  
         StringBuilder reval = new StringBuilder();  
         string attributesString = string.Empty;  
         if (Attributes != null && Attributes.Count > 0)  
         {  
           attributesString = string.Join(" ", Attributes.Select(c => string.Format("{0}=\"{1}\"", c.Key, c.Value)));  
         }  
         reval.AppendFormat("<{0} {2}>{1}</{0}>", TagName, InnerHtml, attributesString);  
         return reval.ToString();  
       }  
     }  
   }  
 }  

前台HTML:

[xhtml]view plaincopy 
     
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">  
 <html xmlns="http://www.w3.org/1999/xhtml">  
 <head>  
   <title></title>  
 </head>  
 <body>  
   <a id="1">我是1</a>   
   <a id="2" class="icon">icon</a>  
   <img />  
 </body>  
 </html>