html标签及xpath处理相关方法

203 篇文章 4 订阅
61 篇文章 3 订阅

html标签及xpath处理相关方法

直接上代码

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Web;
using HtmlAgilityPack;

namespace TL.NewsGatheringService.Business.Implement
{
    /// <summary>
    /// 字符串处理类
    /// </summary>
    public class StrHelperUtil
    {
        /// <summary>
        /// Strips HTML markup from a string and decodes a small set of character entities.
        /// </summary>
        /// <remarks>
        /// Processing order: script blocks, then all tags, then line breaks followed by
        /// whitespace, then comment remnants, then entities, then any remaining angle
        /// brackets and CR/LF pairs. Because angle brackets are removed last, characters
        /// produced by decoding <c>&amp;lt;</c> and <c>&amp;gt;</c> do not survive in the result.
        /// </remarks>
        /// <param name="strHtml">HTML fragment to clean. May be null or empty.</param>
        /// <returns>
        /// The cleaned text, or an empty string when <paramref name="strHtml"/> is null or empty.
        /// </returns>
        public static string FormatHtml(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            // Remove script blocks. Singleline lets '.' cross line breaks so that
            // multi-line script bodies are removed together with their tags.
            strHtml = Regex.Replace(strHtml, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
            // Remove every remaining tag.
            strHtml = Regex.Replace(strHtml, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
            // Remove a line-break character together with the whitespace run that follows it.
            strHtml = Regex.Replace(strHtml, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
            // Remove comment remnants that the tag pattern above did not consume.
            strHtml = Regex.Replace(strHtml, @"-->", "", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"<!--.*", "", RegexOptions.IgnoreCase);
            // NOTE(review): quotation-mark entities are removed rather than decoded to '"';
            // kept as-is because callers may rely on it - confirm the intent.
            strHtml = Regex.Replace(strHtml, @"&(quot|#34);", "", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
            // Decode to the actual Latin-1 characters (previously the literal text "xa1" etc. was inserted).
            strHtml = Regex.Replace(strHtml, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);
            // Drop any other numeric character reference.
            strHtml = Regex.Replace(strHtml, @"&#(\d+);", "", RegexOptions.IgnoreCase);
            strHtml = Regex.Replace(strHtml, @"<img[^>]*>;", "", RegexOptions.IgnoreCase);
            // Remove stray angle brackets (including those decoded from entities above) and CR/LF pairs.
            strHtml = strHtml.Replace("<", "");
            strHtml = strHtml.Replace(">", "");
            strHtml = strHtml.Replace("\r\n", "");
            return strHtml;
        }
        /// <summary>
        /// Extracts the first http/https/ftp URL found in a piece of HTML.
        /// </summary>
        /// <remarks>
        /// Matching is case-insensitive. Only the first match is returned; earlier
        /// behaviour concatenated every URL in the input into one string, which
        /// contradicted the "get one URL" contract of this method.
        /// </remarks>
        /// <param name="strHtml">HTML or text to search. May be null or empty.</param>
        /// <returns>
        /// The first matching URL, or an empty string when the input is null, empty,
        /// or contains no URL.
        /// </returns>
        public static string GetUrlFromHtml(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return "";
            }

            // scheme :// (host name | dotted IPv4) optional :port optional /path
            const string urlPattern = @"((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-,]*)?";

            Match firstUrl = Regex.Match(strHtml, urlPattern, RegexOptions.IgnoreCase);
            return firstUrl.Success ? firstUrl.Value.Trim() : "";
        }
        /// <summary>
        /// Returns the concatenation of every substring of the input that matches a
        /// regular expression (case-insensitive).
        /// </summary>
        /// <param name="inputStr">Source string. May be null.</param>
        /// <param name="strPattern">Regular expression pattern, e.g. <c>\d{2,20}</c>. May be null.</param>
        /// <returns>
        /// All matches joined in order with no separator. An empty string is returned
        /// when there is no match, when either argument is null, when the pattern is
        /// invalid, or when matching times out.
        /// </returns>
        public static string GetStrByRegx(string inputStr, string strPattern)
        {
            if (inputStr == null || strPattern == null)
            {
                return "";
            }

            try
            {
                MatchCollection matches = Regex.Matches(inputStr, strPattern, RegexOptions.IgnoreCase);
                // Matches are evaluated while enumerating, so the join stays inside the try block.
                return string.Join("", matches.Cast<Match>().Select(m => m.Value).ToArray());
            }
            catch (ArgumentException)
            {
                // Invalid pattern: this helper is best-effort, so report "nothing matched".
                return "";
            }
            catch (RegexMatchTimeoutException)
            {
                // A configured match timeout elapsed: same best-effort result.
                return "";
            }
        }
        /// <summary>
        /// Collects the value of the named group "gname" from every match of a
        /// regular expression (case-insensitive).
        /// </summary>
        /// <param name="inputStr">Source string.</param>
        /// <param name="strPattern">
        /// Regular expression containing a named group called "gname",
        /// e.g. <c>id=(?&lt;gname&gt;\w+)</c>.
        /// </param>
        /// <returns>
        /// The trimmed group values in match order; values that are empty after
        /// trimming are skipped.
        /// </returns>
        public static List<string> GetListStrByRegxGroup(string inputStr, string strPattern)
        {
            return Regex.Matches(inputStr, strPattern, RegexOptions.IgnoreCase)
                .Cast<Match>()
                .Select(match => match.Groups["gname"].Value.Trim())
                .Where(value => value.Length > 0)
                .ToList();
        }
        /// <summary>
        /// Selects nodes from HTML with an XPath expression and joins their inner HTML,
        /// appending a separator after each node.
        /// </summary>
        /// <param name="content">HTML content to search.</param>
        /// <param name="xpath">XPath expression selecting the nodes.</param>
        /// <param name="separ">Separator appended after each node's inner HTML.</param>
        /// <returns>
        /// The concatenated inner HTML, or an empty string when the node selection is null.
        /// </returns>
        public static string GetStrByXPath(string content, string xpath, string separ)
        {
            var document = new HtmlDocument();
            document.LoadHtml(content);

            var selected = document.DocumentNode.SelectNodes(xpath);
            return selected == null
                ? ""
                : string.Join("", selected.Select(node => node.InnerHtml + separ).ToArray());
        }
        /// <summary>
        /// Selects nodes from HTML with an XPath expression and joins their inner HTML,
        /// wrapping each node's content in a prefix and a suffix.
        /// </summary>
        /// <param name="content">HTML content to search.</param>
        /// <param name="xpath">XPath expression selecting the nodes.</param>
        /// <param name="preSepar">Prefix written before each node's inner HTML.</param>
        /// <param name="lastSepar">Suffix written after each node's inner HTML.</param>
        /// <returns>
        /// The concatenated result, or an empty string when the node selection is null.
        /// </returns>
        public static string GetStrByXPath(string content, string xpath, string preSepar, string lastSepar)
        {
            var document = new HtmlDocument();
            document.LoadHtml(content);

            var selected = document.DocumentNode.SelectNodes(xpath);
            if (selected == null)
            {
                return "";
            }

            var combined = "";
            foreach (var current in selected)
            {
                combined = string.Concat(combined, preSepar, current.InnerHtml, lastSepar);
            }

            return combined;
        }
        /// <summary>
        /// Selects nodes from HTML with an XPath expression and concatenates the value
        /// of one attribute taken from each selected node.
        /// </summary>
        /// <param name="content">HTML content to search.</param>
        /// <param name="xpath">XPath expression selecting the nodes.</param>
        /// <param name="attrName">Name of the attribute to read.</param>
        /// <returns>
        /// The attribute values joined with no separator. Nodes that do not carry the
        /// attribute are skipped. An empty string is returned when the node selection is null.
        /// </returns>
        public static string GetAttrValueByXPath(string content, string xpath, string attrName)
        {
            var document = new HtmlDocument();
            document.LoadHtml(content);

            var selected = document.DocumentNode.SelectNodes(xpath);
            if (selected == null)
            {
                return "";
            }

            var text = "";
            foreach (var node in selected)
            {
                // The attribute lookup yields null for a missing attribute; skip such
                // nodes instead of failing with a NullReferenceException.
                var attribute = node.Attributes[attrName];
                if (attribute == null)
                {
                    continue;
                }

                text += attribute.Value;
            }

            return text;
        }
        /// <summary>
        /// Selects nodes from HTML with an XPath expression and concatenates the value
        /// of one attribute from each selected node, wrapping every value in a prefix
        /// and a suffix.
        /// </summary>
        /// <param name="content">HTML content to search.</param>
        /// <param name="xpath">XPath expression selecting the nodes.</param>
        /// <param name="attrName">Name of the attribute to read.</param>
        /// <param name="preSepar">Prefix written before each attribute value.</param>
        /// <param name="lastSepar">Suffix written after each attribute value.</param>
        /// <returns>
        /// The wrapped attribute values joined together. Nodes that do not carry the
        /// attribute are skipped entirely (no prefix or suffix is emitted for them).
        /// An empty string is returned when the node selection is null.
        /// </returns>
        public static string GetAttrValueByXPath(string content, string xpath, string attrName, string preSepar, string lastSepar)
        {
            var document = new HtmlDocument();
            document.LoadHtml(content);

            var selected = document.DocumentNode.SelectNodes(xpath);
            if (selected == null)
            {
                return "";
            }

            var text = "";
            foreach (var node in selected)
            {
                // The attribute lookup yields null for a missing attribute; skip such
                // nodes instead of failing with a NullReferenceException.
                var attribute = node.Attributes[attrName];
                if (attribute == null)
                {
                    continue;
                }

                text += preSepar + attribute.Value + lastSepar;
            }

            return text;
        }
        /// <summary>
        /// Encodes every character of a string as "/u" followed by its code unit in
        /// lowercase hexadecimal (no zero padding).
        /// </summary>
        /// <param name="str">Text to encode. May be null or empty.</param>
        /// <returns>The encoded text, or an empty string for null or empty input.</returns>
        public static string chinese_to_unicode(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return "";
            }

            var encodedUnits = str
                .Select(ch => "/u" + ((int)ch).ToString("x"))
                .ToArray();
            return string.Join("", encodedUnits);
        }
        /// <summary>
        /// Decodes text made of "/u" + hexadecimal code units back into characters.
        /// </summary>
        /// <remarks>
        /// All '/' characters are removed, the remainder is split on 'u', and every
        /// segment after the first is parsed as a hexadecimal number and cast to a char.
        /// </remarks>
        /// <param name="str">Encoded text. May be null or empty.</param>
        /// <returns>
        /// The decoded text; an empty string for null or empty input. When a segment is
        /// not valid hexadecimal, the <see cref="FormatException"/> message is returned
        /// in place of the decoded text.
        /// </returns>
        public static string unicode_to_chinese(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return "";
            }

            var hexSegments = str.Replace("/", "").Split('u');
            try
            {
                // The segment before the first 'u' is ignored.
                var decoded = hexSegments
                    .Skip(1)
                    .Select(hex => (char)int.Parse(hex, System.Globalization.NumberStyles.HexNumber))
                    .ToArray();
                return new string(decoded);
            }
            catch (FormatException ex)
            {
                return ex.Message;
            }
        }


        /// <summary>
        /// Decodes JavaScript-style escapes (a backslash, 'u', and exactly four
        /// hexadecimal digits) into the characters they denote.
        /// </summary>
        /// <param name="str">Text that may contain such escapes.</param>
        /// <returns>The text with every escape replaced; all other characters are unchanged.</returns>
        public static string unicode_to_chinese_js(string str)
        {
            return Regex.Replace(
                str,
                @"(?i)\\u([0-9a-f]{4})",
                escape =>
                {
                    var codeUnit = Convert.ToInt32(escape.Groups[1].Value, 16);
                    return ((char)codeUnit).ToString();
                });
        }
        /// <summary>
        /// Encodes characters in the range U+4E00..U+9FA5 as JavaScript-style escapes
        /// (a backslash, 'u', and the lowercase hexadecimal code unit); every other
        /// character is copied unchanged.
        /// </summary>
        /// <param name="str">Text to encode. May be null or empty.</param>
        /// <returns>The encoded text, or an empty string for null or empty input.</returns>
        public static string chinese_to_unicode_js(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return "";
            }

            var pieces = new List<string>(str.Length);
            foreach (char ch in str)
            {
                bool inEscapedRange = ch >= '\u4e00' && ch <= '\u9fa5';
                pieces.Add(inEscapedRange ? "\\u" + ((int)ch).ToString("x") : ch.ToString());
            }

            return string.Join("", pieces.ToArray());
        }
    }
}


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值