C# 提取 HTML 中纯文本内容的函数

 

  public     static     string    StripHTML( string    strHtml){
                  
string    []   aryReg    = {
                              
@" <script[^>]*?>.*?</script> " ,
                              
@" <(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?> " ,
                              
@" ([\r\n])[\s]+ " ,
                              
@" &(quot|#34); " ,
                              
@" &(amp|#38); " ,
                              
@" &(lt|#60); " ,
                              
@" &(gt|#62); " ,
                              
@" &(nbsp|#160); " ,
                              
@" &(iexcl|#161); " ,
                              
@" &(cent|#162); " ,
                              
@" &(pound|#163); " ,
                              
@" &(copy|#169); " ,
                              
@" (\d+); " ,
                              
@" --> " ,
                              
@" <!--.*\n "
                            };
                  
string    []   aryRep    =    {
                                
"" ,
                                
"" ,
                                
"" ,
                                
" \ "" ,
                                 " & " ,
                                
" < " ,
                                
" > " ,
                                
"     " ,
                                
" \xa1 " , // chr(161),
                                 " \xa2 " , // chr(162),
                                 " \xa3 " , // chr(163),
                                 " \xa9 " , // chr(169),
                                 "" ,
                                
" \r\n " ,
                                
""
                              }; 
                  
string    newReg    = aryReg[ 0 ];
                  
string    strOutput = strHtml;
                  
for ( int    i    =     0 ;i < aryReg.Length;i ++ ){
                      Regex   regex   
=     new    Regex(aryReg[i],RegexOptions.IgnoreCase);
                      strOutput   
=    regex.Replace(strOutput,aryRep[i]);
                  }
                  strOutput.Replace(
" < " , "" );
                  strOutput.Replace(
" > " , "" );
                  strOutput.Replace(
" \r\n " , "" );
                  
return    strOutput;
              }

转载于:https://www.cnblogs.com/breezeblew/archive/2009/03/10/1408214.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值