提取HTML代码中文字的C#函数

最新推荐文章于 2024-07-15 08:00:00 发布

hiaspx

最新推荐文章于 2024-07-15 08:00:00 发布

阅读量359

点赞数

文章标签： html c# string regex 平台 class

本文链接：https://blog.csdn.net/hiaspx/article/details/2552240

版权

using System;
using System.Text.RegularExpressions;

/// <summary>
/// Small console demo around <see cref="StripHTML"/>, a helper that extracts the
/// text content from a string of HTML source.
/// </summary>
public class StripHTMLTest
{
    // Ordered rewrite rules. Patterns[i] is replaced by Replacements[i], and the
    // rules run strictly in this order, so the two arrays must stay the same length.
    // The Regex instances are built once and reused; Regex is safe to share for matching.
    private static readonly Regex[] Patterns =
    {
        // Whole <script> elements. Singleline lets '.' cross line breaks so that
        // multi-line script bodies are removed instead of leaking into the text.
        new Regex(@"<script[^>]*?>.*?</script>", RegexOptions.IgnoreCase | RegexOptions.Singleline),

        // Opening/closing tags with optional namespace prefix and attributes.
        // \7 refers back to the quote character captured by ([""']).
        // NOTE(review): the nested quantifiers here may backtrack heavily on long,
        // unterminated tags - consider a match timeout if the input is untrusted.
        new Regex(@"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", RegexOptions.IgnoreCase),

        // A line break together with the whitespace that follows it.
        new Regex(@"([\r\n])[\s]+", RegexOptions.IgnoreCase),

        // Named / numeric character entities that are decoded to a character.
        new Regex(@"&(quot|#34);", RegexOptions.IgnoreCase),
        new Regex(@"&(amp|#38);", RegexOptions.IgnoreCase),
        new Regex(@"&(lt|#60);", RegexOptions.IgnoreCase),
        new Regex(@"&(gt|#62);", RegexOptions.IgnoreCase),
        new Regex(@"&(nbsp|#160);", RegexOptions.IgnoreCase),
        new Regex(@"&(iexcl|#161);", RegexOptions.IgnoreCase),
        new Regex(@"&(cent|#162);", RegexOptions.IgnoreCase),
        new Regex(@"&(pound|#163);", RegexOptions.IgnoreCase),
        new Regex(@"&(copy|#169);", RegexOptions.IgnoreCase),

        // Any numeric character reference not handled above is dropped.
        new Regex(@"&#(\d+);", RegexOptions.IgnoreCase),

        // Comment handling: the terminator becomes a line break, which gives the
        // following rule a line end to match up to.
        new Regex(@"-->", RegexOptions.IgnoreCase),
        new Regex(@"<!--.*\n", RegexOptions.IgnoreCase)
    };

    private static readonly string[] Replacements =
    {
        "",         // <script> elements
        "",         // tags
        "",         // line break + following whitespace
        "\"",       // &quot;
        "&",        // &amp;
        "<",        // &lt;
        ">",        // &gt;
        " ",        // &nbsp;
        "\u00A1",   // &iexcl;  chr(161)
        "\u00A2",   // &cent;   chr(162)
        "\u00A3",   // &pound;  chr(163)
        "\u00A9",   // &copy;   chr(169)
        "",         // other numeric references
        "\r\n",     // comment terminator
        ""          // comment start up to end of line
    };

    /// <summary>
    /// Demo entry point: strips a sample HTML document and prints the remaining text.
    /// </summary>
    public static void Main()
    {
        string s = StripHTML(" <HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML> ");
        Console.WriteLine(s);
    }

    /// <summary>
    /// Removes HTML markup from a string and decodes a small set of character entities.
    /// </summary>
    /// <remarks>
    /// After the pattern rules have run, every remaining '&lt;' and '&gt;' character
    /// and every remaining CR LF pair is deleted. This includes angle brackets that
    /// were produced by decoding <c>&amp;lt;</c> / <c>&amp;gt;</c>.
    /// </remarks>
    /// <param name="strHtml">Source text that contains HTML.</param>
    /// <returns>The text that is left after the markup has been removed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="strHtml"/> is null.</exception>
    public static string StripHTML(string strHtml)
    {
        if (strHtml == null)
        {
            throw new ArgumentNullException("strHtml");
        }

        string strOutput = strHtml;
        for (int i = 0; i < Patterns.Length; i++)
        {
            strOutput = Patterns[i].Replace(strOutput, Replacements[i]);
        }

        // string.Replace returns a new string; the result has to be assigned back,
        // otherwise these clean-up steps silently do nothing.
        strOutput = strOutput.Replace("<", "");
        strOutput = strOutput.Replace(">", "");
        strOutput = strOutput.Replace("\r\n", "");

        return strOutput;
    }
}