使用C#代码清除多余HTML

最新推荐文章于 2022-12-16 15:30:08 发布

achir29946

最新推荐文章于 2022-12-16 15:30:08 发布

阅读量165

点赞数

文章标签：数据库

原文链接：http://www.cnblogs.com/Simcoder/archive/2010/09/11/1823869.html

版权

数据库中所有公司简介都是从文本编辑器存入的，当然样式也一并存到了数据库。在读取的时候，我只想获取文本内容，怎么办呢？

代码

#region Strip HTML
        // Regular expressions applied, in order, by StripHTML. The entry at index i
        // is replaced with StripHtmlReplacements[i]. They are built once and reused
        // instead of being re-parsed on every call.
        private static readonly System.Text.RegularExpressions.Regex[] StripHtmlPatterns =
        {
            // Whole <script> elements; '.' must also match newlines so that
            // multi-line script bodies are removed together with their tags.
            BuildStripHtmlRegex(@"<script[^>]*?>.*?</script>", true),
            // Any opening, closing or self-closing tag, including quoted attributes.
            BuildStripHtmlRegex(@"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", false),
            // A line break followed by further whitespace.
            BuildStripHtmlRegex(@"([\r\n])[\s]+", false),
            // Named / numeric character entities.
            BuildStripHtmlRegex(@"&(quot|#34);", false),
            BuildStripHtmlRegex(@"&(amp|#38);", false),
            BuildStripHtmlRegex(@"&(lt|#60);", false),
            BuildStripHtmlRegex(@"&(gt|#62);", false),
            BuildStripHtmlRegex(@"&(nbsp|#160);", false),
            BuildStripHtmlRegex(@"&(iexcl|#161);", false),
            BuildStripHtmlRegex(@"&(cent|#162);", false),
            BuildStripHtmlRegex(@"&(pound|#163);", false),
            BuildStripHtmlRegex(@"&(copy|#169);", false),
            // Any remaining decimal numeric entity is dropped.
            BuildStripHtmlRegex(@"&#(\d+);", false),
            // Comment terminator becomes a line break so that the next pattern
            // (which stops at '\n') can consume the comment from its opener onward.
            BuildStripHtmlRegex(@"-->", false),
            BuildStripHtmlRegex(@"<!--.*\n", false)
        };

        // Replacement text for the pattern at the same index in StripHtmlPatterns.
        private static readonly string[] StripHtmlReplacements =
        {
            "",
            "",
            "",
            "\"",
            "&",
            "<",
            ">",
            " ",
            "\u00a1", // inverted exclamation mark, chr(161)
            "\u00a2", // cent sign, chr(162)
            "\u00a3", // pound sign, chr(163)
            "\u00a9", // copyright sign, chr(169)
            "",
            "\r\n",
            ""
        };

        // Creates a case-insensitive regex; when dotMatchesNewline is true the
        // Singleline option is added so '.' also matches '\n'.
        private static System.Text.RegularExpressions.Regex BuildStripHtmlRegex(string pattern, bool dotMatchesNewline)
        {
            System.Text.RegularExpressions.RegexOptions options = System.Text.RegularExpressions.RegexOptions.IgnoreCase;
            if (dotMatchesNewline)
            {
                options |= System.Text.RegularExpressions.RegexOptions.Singleline;
            }
            return new System.Text.RegularExpressions.Regex(pattern, options);
        }

        /// <summary>
        /// Removes HTML markup from a string and returns the remaining text.
        /// Script blocks, tags and comments are stripped, a small set of character
        /// entities is decoded, and finally any '&lt;', '&gt;' and CR/LF pairs left
        /// in the text are deleted.
        /// </summary>
        /// <remarks>
        /// Because the final clean-up deletes every '&lt;' and '&gt;', characters that
        /// were decoded from <c>&amp;lt;</c> / <c>&amp;gt;</c> do not survive in the result.
        /// </remarks>
        /// <param name="strHtml">The HTML content to clean; may be null or empty.</param>
        /// <returns>The text with markup removed; an empty string for null or empty input.</returns>
        public static string StripHTML(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            string strOutput = strHtml;
            for (int i = 0; i < StripHtmlPatterns.Length; i++)
            {
                strOutput = StripHtmlPatterns[i].Replace(strOutput, StripHtmlReplacements[i]);
            }

            // string.Replace returns a new string, so the result has to be assigned
            // back; otherwise these clean-up steps have no effect.
            strOutput = strOutput.Replace("<", "");
            strOutput = strOutput.Replace(">", "");
            strOutput = strOutput.Replace("\r\n", "");
            return strOutput;
        }
#endregion

转载于:https://www.cnblogs.com/Simcoder/archive/2010/09/11/1823869.html