html标签及xpath处理相关方法

最新推荐文章于 2024-05-25 19:30:16 发布
Jack2013tong
最新推荐文章于 2024-05-25 19:30:16 发布
阅读量3.4k
点赞数
分类专栏： asp.net C# Tools other 文章标签： html tag正则替换正则表达式 regex xpath
C# 同时被 3 个专栏收录
234 篇文章 4 订阅
订阅专栏
asp.net
203 篇文章 4 订阅
订阅专栏
other
61 篇文章 3 订阅
订阅专栏
html标签及xpath处理相关方法

直接上代码
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Web;
using HtmlAgilityPack;

namespace TL.NewsGatheringService.Business.Implement
{
    /// <summary>
    /// 字符串处理类
    /// </summary>
    public class StrHelperUtil
    {
        /// <summary>
        /// Strips HTML markup from a string and decodes a small set of common entities,
        /// producing plain text.
        /// </summary>
        /// <param name="strHtml">HTML fragment to clean; may be null or empty.</param>
        /// <returns>
        /// The input with script blocks, tags, comment remnants, CR/LF pairs and any remaining
        /// angle brackets removed. Returns an empty string when <paramref name="strHtml"/> is
        /// null or empty.
        /// </returns>
        public static string FormatHtml(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            // Remove script blocks. Singleline lets '.' cross line breaks so the body of a
            // multi-line script is removed together with its tags.
            strHtml = Regex.Replace(strHtml, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
            // Remove every remaining tag.
            strHtml = Regex.Replace(strHtml, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
            // Remove a line-break character together with the whitespace run that follows it.
            strHtml = Regex.Replace(strHtml, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
            // Remove comment remnants: a dangling terminator, then an unterminated opener
            // up to the end of its line.
            strHtml = Regex.Replace(strHtml, @"-->", "", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"<!--.*", "", RegexOptions.IgnoreCase);
            // Quote entities are removed rather than decoded.
            strHtml = Regex.Replace(strHtml, @"&(quot|#34);", "", RegexOptions.IgnoreCase);
            // Decode the named entities (and their numeric forms) this helper knows about.
            strHtml = Regex.Replace(strHtml, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(iexcl|#161);", "\u00a1", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(cent|#162);", "\u00a2", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(pound|#163);", "\u00a3", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(copy|#169);", "\u00a9", RegexOptions.IgnoreCase);
            // Any other numeric entity is dropped.
            strHtml = Regex.Replace(strHtml, @"&#(\d+);", "", RegexOptions.IgnoreCase);
            // Remove an <img ...> tag that is immediately followed by ';'.
            strHtml = Regex.Replace(strHtml, @"<img[^>]*>;", "", RegexOptions.IgnoreCase);
            // Remove all remaining angle brackets, including those decoded from &lt; / &gt; above.
            strHtml = strHtml.Replace("<", "");
            strHtml = strHtml.Replace(">", "");
            strHtml = strHtml.Replace("\r\n", "");
            return strHtml;
        }
        /// <summary>
        /// Extracts the first http, https or ftp URL found in a piece of HTML or text.
        /// </summary>
        /// <param name="strHtml">Text to search; may be null or empty.</param>
        /// <returns>
        /// The first matching URL, or an empty string when the input is null or empty or
        /// contains no URL.
        /// </returns>
        public static string GetUrlFromHtml(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return "";
            }

            // scheme://(host name | dotted IPv4)[:port][/path]
            // NOTE(review): inside the path character class, "/-~" is parsed as a range from
            // '/' to '~', so characters such as '<', '>', '?' and '=' are accepted in the path.
            // The pattern is left unchanged so that URLs matched today still match.
            const string urlPattern = @"((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-,]*)?";

            // Only the first match is returned; concatenating every match would glue
            // several URLs together into a single unusable string.
            Match firstUrl = Regex.Match(strHtml, urlPattern, RegexOptions.IgnoreCase);
            return firstUrl.Success ? firstUrl.Value : "";
        }
        /// <summary>
        /// Returns the concatenation of every substring of the input that matches a regular
        /// expression (case-insensitive).
        /// </summary>
        /// <param name="inputStr">Source string; may be null.</param>
        /// <param name="strPattern">Regular expression pattern, e.g. <c>\d{2,20}</c>; may be null.</param>
        /// <returns>
        /// All match values joined in order of occurrence with no separator. Returns an empty
        /// string when there is no match, when either argument is null, or when the pattern
        /// is not a valid regular expression.
        /// </returns>
        public static string GetStrByRegx(string inputStr, string strPattern)
        {
            if (inputStr == null || strPattern == null)
            {
                return "";
            }

            try
            {
                MatchCollection matches = Regex.Matches(inputStr, strPattern, RegexOptions.IgnoreCase);
                return string.Concat(matches.Cast<Match>().Select(m => m.Value));
            }
            catch (ArgumentException)
            {
                // The pattern could not be parsed. This helper is best-effort, so report
                // "nothing matched" instead of propagating the parse error.
                return "";
            }
        }
        /// <summary>
        /// Collects the value of the named group <c>gname</c> from every match of a regular
        /// expression (case-insensitive).
        /// </summary>
        /// <param name="inputStr">Source string.</param>
        /// <param name="strPattern">
        /// Regular expression containing a group named <c>gname</c>, for example
        /// <c>&lt;a id="ctl00_M_dtgResumeList(?&lt;gname&gt;.*?).*&gt;</c>.
        /// </param>
        /// <returns>
        /// The trimmed <c>gname</c> captures in match order; captures that are empty after
        /// trimming are left out.
        /// </returns>
        public static List<string> GetListStrByRegxGroup(string inputStr, string strPattern)
        {
            return Regex.Matches(inputStr, strPattern, RegexOptions.IgnoreCase)
                .Cast<Match>()
                .Select(hit => hit.Groups["gname"].Value.Trim())
                .Where(captured => captured.Length > 0)
                .ToList();
        }
        /// <summary>
        /// Selects nodes with an XPath expression and joins their inner HTML, appending a
        /// separator after each node.
        /// </summary>
        /// <param name="content">HTML content to parse.</param>
        /// <param name="xpath">XPath expression selecting the nodes.</param>
        /// <param name="separ">Separator appended after every node's inner HTML, including the last.</param>
        /// <returns>
        /// The concatenated inner HTML, or an empty string when the selection yields no
        /// node collection.
        /// </returns>
        public static string GetStrByXPath(string content, string xpath, string separ)
        {
            var document = new HtmlDocument();
            document.LoadHtml(content);

            var selected = document.DocumentNode.SelectNodes(xpath);
            if (selected == null)
            {
                return "";
            }

            // Each node contributes its inner HTML followed by the separator.
            return string.Concat(selected.Select(item => item.InnerHtml + separ));
        }
        /// <summary>
        /// Selects nodes with an XPath expression and joins their inner HTML, wrapping each
        /// node's inner HTML in a prefix and a suffix.
        /// </summary>
        /// <param name="content">HTML content to parse.</param>
        /// <param name="xpath">XPath expression selecting the nodes.</param>
        /// <param name="preSepar">Prefix written before every node's inner HTML.</param>
        /// <param name="lastSepar">Suffix written after every node's inner HTML.</param>
        /// <returns>
        /// The concatenated, wrapped inner HTML, or an empty string when the selection
        /// yields no node collection.
        /// </returns>
        public static string GetStrByXPath(string content, string xpath, string preSepar, string lastSepar)
        {
            var parsed = new HtmlDocument();
            parsed.LoadHtml(content);

            var hits = parsed.DocumentNode.SelectNodes(xpath);
            if (hits == null)
            {
                return "";
            }

            var wrapped = hits.Select(hit => preSepar + hit.InnerHtml + lastSepar);
            return string.Concat(wrapped);
        }
        /// <summary>
        /// Selects nodes with an XPath expression and concatenates the value of one
        /// attribute from each of them.
        /// </summary>
        /// <param name="content">HTML content to parse.</param>
        /// <param name="xpath">XPath expression selecting the nodes.</param>
        /// <param name="attrName">Name of the attribute to read from each selected node.</param>
        /// <returns>
        /// The attribute values joined with no separator. Nodes that do not carry the
        /// attribute contribute nothing. Returns an empty string when the selection yields
        /// no node collection.
        /// </returns>
        public static string GetAttrValueByXPath(string content, string xpath, string attrName)
        {
            var doc1 = new HtmlDocument();
            doc1.LoadHtml(content);

            var repeatNodes = doc1.DocumentNode.SelectNodes(xpath);
            if (repeatNodes == null)
            {
                return "";
            }

            var values = new System.Text.StringBuilder();
            foreach (var node in repeatNodes)
            {
                // The indexer yields null for a missing attribute; dereferencing it directly
                // would throw NullReferenceException, so such nodes are skipped.
                var attribute = node.Attributes[attrName];
                if (attribute != null)
                {
                    values.Append(attribute.Value);
                }
            }

            return values.ToString();
        }
        /// <summary>
        /// Selects nodes with an XPath expression and concatenates the value of one
        /// attribute from each of them, wrapping every value in a prefix and a suffix.
        /// </summary>
        /// <param name="content">HTML content to parse.</param>
        /// <param name="xpath">XPath expression selecting the nodes.</param>
        /// <param name="attrName">Name of the attribute to read from each selected node.</param>
        /// <param name="preSepar">Prefix written before every attribute value.</param>
        /// <param name="lastSepar">Suffix written after every attribute value.</param>
        /// <returns>
        /// The wrapped attribute values joined together. Nodes that do not carry the
        /// attribute contribute nothing (no prefix or suffix either). Returns an empty
        /// string when the selection yields no node collection.
        /// </returns>
        public static string GetAttrValueByXPath(string content, string xpath, string attrName, string preSepar, string lastSepar)
        {
            var doc1 = new HtmlDocument();
            doc1.LoadHtml(content);

            var repeatNodes = doc1.DocumentNode.SelectNodes(xpath);
            if (repeatNodes == null)
            {
                return "";
            }

            var values = new System.Text.StringBuilder();
            foreach (var node in repeatNodes)
            {
                // The indexer yields null for a missing attribute; dereferencing it directly
                // would throw NullReferenceException, so such nodes are skipped.
                var attribute = node.Attributes[attrName];
                if (attribute != null)
                {
                    values.Append(preSepar).Append(attribute.Value).Append(lastSepar);
                }
            }

            return values.ToString();
        }
        /// <summary>
        /// Encodes every character of a string as <c>/u</c> followed by its UTF-16 code
        /// unit in lowercase hexadecimal (no zero padding).
        /// </summary>
        /// <param name="str">Text to encode; may be null or empty.</param>
        /// <returns>The encoded text, or an empty string when the input is null or empty.</returns>
        public static string chinese_to_unicode(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return "";
            }

            // Every character is encoded, not only Chinese ones.
            return string.Concat(str.Select(ch => "/u" + ((int)ch).ToString("x")));
        }
        /// <summary>
        /// Decodes text made of <c>/u</c>-prefixed hexadecimal code units (the format
        /// produced by <c>chinese_to_unicode</c>) back into characters.
        /// </summary>
        /// <param name="str">Encoded text such as <c>/u4e2d/u6587</c>; may be null or empty.</param>
        /// <returns>
        /// The decoded text; an empty string when the input is null or empty. When a segment
        /// is not a valid hexadecimal number, or is too large for an Int32, the exception
        /// message is returned instead of decoded text.
        /// </returns>
        public static string unicode_to_chinese(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return "";
            }

            // Dropping every '/' and splitting on 'u' leaves one hex number per segment.
            // Segment 0 is whatever preceded the first 'u' and is ignored.
            string[] hexCodes = str.Replace("/", "").Split('u');
            var decoded = new System.Text.StringBuilder(hexCodes.Length);
            try
            {
                for (int i = 1; i < hexCodes.Length; i++)
                {
                    // Parse the hex number and convert the resulting code unit to a char.
                    int codeUnit = int.Parse(
                        hexCodes[i],
                        System.Globalization.NumberStyles.HexNumber,
                        System.Globalization.CultureInfo.InvariantCulture);
                    decoded.Append((char)codeUnit);
                }
            }
            catch (FormatException ex)
            {
                // NOTE(review): the message is returned as the result, so callers cannot
                // tell it apart from decoded text. Kept for backward compatibility.
                return ex.Message;
            }
            catch (OverflowException ex)
            {
                // Over-long hex segments are reported the same way as malformed ones
                // instead of escaping as an unhandled exception.
                return ex.Message;
            }

            return decoded.ToString();
        }


        /// <summary>
        /// Decodes JavaScript-style <c>\uXXXX</c> escape sequences in a string into the
        /// characters they denote; all other text is left untouched.
        /// </summary>
        /// <param name="str">Text that may contain <c>\uXXXX</c> escapes; may be null or empty.</param>
        /// <returns>The text with escapes decoded, or an empty string when the input is null or empty.</returns>
        public static string unicode_to_chinese_js(string str)
        {
            // Same null/empty contract as the other converters in this class.
            if (string.IsNullOrEmpty(str))
            {
                return "";
            }

            // (?i) makes both the 'u' and the four hex digits case-insensitive.
            return Regex.Replace(
                str,
                @"(?i)\\u([0-9a-f]{4})",
                escape => ((char)Convert.ToInt32(escape.Groups[1].Value, 16)).ToString());
        }
        /// <summary>
        /// Replaces every character in the range U+4E00..U+9FA5 with a JavaScript-style
        /// <c>\uXXXX</c> escape (lowercase hex); all other characters are copied unchanged.
        /// </summary>
        /// <param name="str">Text to encode; may be null or empty.</param>
        /// <returns>The encoded text, or an empty string when the input is null or empty.</returns>
        public static string chinese_to_unicode_js(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return "";
            }

            var escaped = new System.Text.StringBuilder(str.Length);
            foreach (char c in str)
            {
                // A plain range comparison selects the same characters as the
                // [\u4e00-\u9fa5] character class without running a regex per character.
                if (c >= '\u4e00' && c <= '\u9fa5')
                {
                    escaped.Append("\\u").Append(((int)c).ToString("x"));
                }
                else
                {
                    escaped.Append(c);
                }
            }

            return escaped.ToString();
        }
    }
}
Jack2013tong
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
html标签及xpath处理相关方法

收藏一个很有用的html tag替换的工具类，可用作替换、清除字符串中的html标签 /// /// 字符串处理类 /// public class StrHelperUtil { /// /// 格式化掉字符串中的html tag /// ///
复制链接

扫一扫