C# 清理文本中的所有 HTML 标签

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace WuZiFenGongSiInfomation.Common
{
    /// <summary>
    /// Converts HTML fragments to plain text.
    /// </summary>
    public class MyHtmlHelper
    {
        /// <summary>
        /// Marker that <see cref="ClearHtmlTag"/> leaves in the returned text in place of every tag
        /// whose attributes contain a <c>text-indent:Nem;</c> declaration (a first-line indent).
        /// NOTE(review): presumably callers swap this marker for their own indentation - confirm.
        /// </summary>
        public const string replaceTxt = "##em286.replace453##";

        // Patterns are built once; the method needs more patterns than the static
        // Regex.Replace cache holds by default, so per-call parsing would otherwise recur.

        // Singleline lets ".*?" span line breaks, so multi-line script bodies are removed too.
        private static readonly Regex ScriptBlock =
            new Regex(@"<script[^>]*?>.*?</script>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Whole comments are removed up front so a '>' inside a comment cannot end the match early.
        private static readonly Regex CommentBlock =
            new Regex(@"<!--.*?-->", RegexOptions.Singleline);

        // One single tag carrying text-indent:Nem; and ending in a quote. [^<>]* keeps the match
        // inside that tag instead of running across neighbouring tags and text on the same line.
        private static readonly Regex IndentTag =
            new Regex(@"<[^<>]*text-indent:\s*\d*em;[^<>]*[""']>", RegexOptions.IgnoreCase);

        private static readonly Regex AnyTag = new Regex(@"<.[^>]*>");

        // An opening comment delimiter that was never closed: drop it and the rest of its line.
        private static readonly Regex UnclosedComment = new Regex(@"<!--.*");

        // The marker is escaped because it contains '.', which is a regex metacharacter.
        private static readonly Regex MarkerPadding =
            new Regex(Regex.Escape(replaceTxt) + @"\s*", RegexOptions.IgnoreCase);

        private static readonly Regex NewlineRun = new Regex(@"\n{2,}");

        // Entity replacements, applied in this order.
        private static readonly KeyValuePair<Regex, string>[] EntityRules =
        {
            Entity(@"&(quot|#34);", "\""),
            Entity(@"&(amp|#38);", "&"),
            Entity(@"&(lt|#60);", "<"),
            Entity(@"&(gt|#62);", ">"),
            // A non-breaking space at the end of a line becomes four spaces and takes the line break with it.
            Entity(@"&nbsp;\s*\n", "    "),
            Entity(@"&(nbsp|#160);", "   "),
            Entity(@"&(iexcl|#161);", "\u00a1"),
            Entity(@"&(cent|#162);", "\u00a2"),
            Entity(@"&(pound|#163);", "\u00a3"),
            Entity(@"&(copy|#169);", "\u00a9"),
            // Any numeric entity not handled above is dropped.
            Entity(@"&#(\d+);", ""),
        };

        /// <summary>
        /// Removes all HTML tags from the given text and returns the remaining plain text.
        /// </summary>
        /// <remarks>
        /// Script blocks and comments are removed together with their content. Tags that declare
        /// <c>text-indent:Nem;</c> are replaced by <see cref="replaceTxt"/>; whitespace directly after
        /// that marker is removed. A fixed set of character entities is decoded, any '&lt;' or '&gt;'
        /// left afterwards is deleted, and runs of line feeds are collapsed to a single line feed.
        /// </remarks>
        /// <param name="htmlString">Text that may contain HTML markup.</param>
        /// <returns>
        /// The text without markup; an empty string when <paramref name="htmlString"/> is null or empty.
        /// </returns>
        public static string ClearHtmlTag(string htmlString)
        {
            if (string.IsNullOrEmpty(htmlString))
            {
                return string.Empty;
            }

            string text = ScriptBlock.Replace(htmlString, "");
            text = CommentBlock.Replace(text, "");

            // Must run before the generic tag removal, otherwise the indent information is lost.
            text = IndentTag.Replace(text, replaceTxt);

            text = AnyTag.Replace(text, "");
            text = text.Replace("-->", "");
            text = UnclosedComment.Replace(text, "");

            // The earlier "([/r/n])[/s]+" step is intentionally gone: written with forward slashes it
            // deleted ordinary text such as "ns", "rs" or "//". A literal [\r\n]\s+ removal is not used
            // either, because it would delete the line breaks that the collapsing steps below keep.

            foreach (KeyValuePair<Regex, string> rule in EntityRules)
            {
                text = rule.Key.Replace(text, rule.Value);
            }

            // Remove every angle bracket that is still present, including those just decoded from entities.
            text = text.Replace("<", "").Replace(">", "");

            text = MarkerPadding.Replace(text, replaceTxt);

            // Collapse blank lines, then lines holding only a tab, then whatever runs that produced.
            text = NewlineRun.Replace(text, "\n");
            text = text.Replace("\n\t\n\t", "\n\t");
            text = text.Replace("\n\t\n", "\n");
            text = NewlineRun.Replace(text, "\n");

            return text;
        }

        /// <summary>
        /// Builds one case-insensitive entity rule: a compiled pattern and its replacement text.
        /// </summary>
        private static KeyValuePair<Regex, string> Entity(string pattern, string replacement)
        {
            return new KeyValuePair<Regex, string>(new Regex(pattern, RegexOptions.IgnoreCase), replacement);
        }
    }
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

王焜棟琦

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值