C# 通过正则表达式去除 HTML 标签的方法，记录一下

最新推荐文章于 2023-07-13 15:51:32 发布

avi9111

最新推荐文章于 2023-07-13 15:51:32 发布

阅读量306

点赞数

分类专栏：每天一点u3d

本文链接：https://blog.csdn.net/avi9111/article/details/119354176

版权

每天一点u3d 专栏收录该内容

382 篇文章 18 订阅

订阅专栏

/// <summary>
/// Strips HTML markup from <paramref name="htmlStream"/> and returns the remaining
/// text with whitespace normalised.
/// </summary>
/// <remarks>
/// Processing order:
/// 1. <c>&lt;script&gt;</c> and <c>&lt;style&gt;</c> elements are removed together with their content.
/// 2. Every remaining tag and a small set of common entities are replaced by a space.
/// 3. Runs of blank lines are collapsed into a single CRLF.
/// 4. Runs of spaces/tabs are collapsed into a single space and the result is trimmed.
/// This is a regex-based best-effort cleaner, not an HTML parser.
/// </remarks>
/// <param name="htmlStream">The HTML text to clean. Must not be null.</param>
/// <returns>The text content with tags removed and whitespace collapsed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlStream"/> is null.</exception>
public string RemoveHTMLTags(string htmlStream)
        {
            if (htmlStream == null)
            {
                // ArgumentNullException derives from Exception, so existing catch (Exception) handlers still work.
                throw new ArgumentNullException("htmlStream", "Your input html stream is null!");
            }

            // Script and style elements must be removed first, content included: their bodies
            // may themselves contain '<' / '>' that would confuse the generic tag pattern below.
            // The lazy quantifier stops at the first closing tag so text between two separate
            // script blocks is preserved.
            htmlStream = Regex.Replace(htmlStream, "<script[\\s\\S]*?</script>|<style[\\s\\S]*?</style>", " ", RegexOptions.IgnoreCase);

            // Replace ordinary tags and common entities with a space.
            // NOTE(review): the entity list is reconstructed; the previous pattern contained
            // already-decoded characters and empty alternatives ("||"), which matched at every
            // position and inserted a space between all characters. Confirm the list against
            // the entities actually expected in the input.
            htmlStream = Regex.Replace(htmlStream, "<[^>]+>|&nbsp;|&#160;|\u00A0|&amp;|&bull;|&lt;|&gt;", " ", RegexOptions.IgnoreCase);

            // Collapse blank lines (a newline, optional whitespace-only lines, a final newline)
            // into one CRLF. '|' is intentionally NOT in the character class: inside [...] it is
            // a literal pipe, not an alternation.
            htmlStream = Regex.Replace(htmlStream, "\r?\n[\r\n\t ]*\n", "\r\n");

            // Collapse runs of spaces and tabs into a single space.
            htmlStream = Regex.Replace(htmlStream, "[\t ]+", " ");

            return htmlStream.Trim();
        }

avi9111

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
打赏
0
评论
c#通过正则表达式去掉html的一些方法，记录一下

public string RemoveHTMLTags(string htmlStream) { if (htmlStream == null) { throw new Exception("Your input html stream is null!"); return null; } /* * 最好.
复制链接

扫一扫