using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace WuZiFenGongSiInfomation.Common
{
/// <summary>
/// Helpers for converting HTML markup into plain text.
/// </summary>
public class MyHtmlHelper
{
    /// <summary>
    /// Placeholder written into the output wherever an opening tag carrying a
    /// <c>text-indent:Nem;</c> style (a first-line indent) was found.
    /// </summary>
    public const string replaceTxt = "##em286.replace453##";

    /// <summary>
    /// Strips all HTML tags from the given text and returns the remaining plain text.
    /// Tags that declare a <c>text-indent:Nem;</c> style are replaced by
    /// <see cref="replaceTxt"/> instead of being removed; a small set of HTML entities
    /// is decoded and runs of consecutive line feeds are collapsed to a single one.
    /// </summary>
    /// <param name="htmlString">Text that may contain HTML markup.</param>
    /// <returns>
    /// The text without HTML tags; an empty string when <paramref name="htmlString"/>
    /// is null or empty.
    /// </returns>
    public static string ClearHtmlTag(string htmlString)
    {
        if (string.IsNullOrEmpty(htmlString))
        {
            return string.Empty;
        }

        // Remove script blocks together with their content. Singleline lets '.' cross
        // line breaks, so multi-line scripts are removed as a whole.
        htmlString = Regex.Replace(htmlString, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Replace tags such as <p style="font-family:'SimSun';text-indent:2em;"> with the
        // placeholder. [^<>]* keeps the match inside one tag, so text and tags that precede
        // the indent tag on the same line are not swallowed.
        htmlString = Regex.Replace(htmlString, @"<[^<>]*text-indent:\d*em;[^<>]*>\s*", replaceTxt, RegexOptions.IgnoreCase);

        // Remove every remaining tag.
        htmlString = Regex.Replace(htmlString, @"<(.[^>]*)>", "");

        // NOTE: the former step using the pattern "([/r/n])[/s]+" is intentionally gone.
        // It matched the letters r/n followed by s or '/', deleting ordinary text ("cars" -> "ca").
        // Line breaks are deliberately kept here; they are normalized at the end of the method.

        // Remove leftovers of HTML comments.
        htmlString = Regex.Replace(htmlString, @"-->", "");
        htmlString = Regex.Replace(htmlString, @"<!--.*", "");

        // Collapse "space + trailing whitespace + line feed" into a single space.
        htmlString = Regex.Replace(htmlString, @" \s*\n", " ");

        // Decode the known entities in a single pass so that already-escaped text such as
        // "&amp;lt;" is decoded only once. Unknown numeric entities are removed.
        htmlString = Regex.Replace(htmlString, @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);", DecodeEntity, RegexOptions.IgnoreCase);

        // Drop any remaining angle brackets.
        htmlString = htmlString.Replace("<", "");
        htmlString = htmlString.Replace(">", "");

        // Remove whitespace that directly follows a placeholder. The placeholder is escaped
        // because it contains '.', which would otherwise match any character.
        htmlString = Regex.Replace(htmlString, Regex.Escape(replaceTxt) + "\\s*", replaceTxt);

        // Collapse runs of line feeds (optionally interleaved with tabs) into one line feed.
        htmlString = Regex.Replace(htmlString, "\\n{2,}", "\n");
        htmlString = Regex.Replace(htmlString, "\\n\\t\\n\\t", "\n\t");
        htmlString = Regex.Replace(htmlString, "\\n\\t\\n", "\n");
        htmlString = Regex.Replace(htmlString, "\\n{2,}", "\n");

        return htmlString;
    }

    /// <summary>
    /// Maps one matched entity (named or numeric, without the surrounding '&amp;' and ';')
    /// to its replacement text. Numeric entities without a mapping yield an empty string.
    /// </summary>
    private static string DecodeEntity(Match match)
    {
        switch (match.Groups[1].Value.ToLowerInvariant())
        {
            case "quot":
            case "#34":
                return "\"";
            case "amp":
            case "#38":
                return "&";
            case "lt":
            case "#60":
                return "<";
            case "gt":
            case "#62":
                return ">";
            case "nbsp":
            case "#160":
                return " ";
            case "iexcl":
            case "#161":
                return "\u00A1";
            case "cent":
            case "#162":
                return "\u00A2";
            case "pound":
            case "#163":
                return "\u00A3";
            case "copy":
            case "#169":
                return "\u00A9";
            default:
                return string.Empty;
        }
    }
}
}