// C# function that extracts the text content from HTML source.
//
// StripHTML: removes HTML markup.
//   strHtml - source text that contains HTML.
//   returns - the text that remains once the markup has been removed.
using
System;
using
System.Text.RegularExpressions;
/// <summary>
/// Console sample that extracts the plain text from HTML source using regular expressions.
/// </summary>
public class StripHTMLTest
{
    /// <summary>
    /// Options shared by every pattern: element and entity names are matched without
    /// regard to case, independent of the current culture.
    /// </summary>
    private const RegexOptions CommonOptions = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    /// <summary>
    /// Markup-removal patterns, applied in order; every match is replaced by the empty string.
    /// They are built once instead of on every call.
    /// </summary>
    private static readonly Regex[] MarkupPatterns =
    {
        // <script> elements together with their body. Singleline lets '.' cross line
        // breaks, so scripts that span several lines are removed as well.
        new Regex(@"<script[^>]*?>.*?</script>", CommonOptions | RegexOptions.Singleline),

        // Comments, including multi-line ones.
        new Regex(@"<!--.*?-->", CommonOptions | RegexOptions.Singleline),

        // Opening, closing and declaration tags. Quoted attribute values are consumed as a
        // unit so a '>' inside them does not end the tag. The alternatives start with
        // disjoint characters, which avoids nested-quantifier backtracking blow-ups.
        new Regex(@"<(?:/\s*)?!?\w(?:""[^""]*""|'[^']*'|[^'"">])*>", CommonOptions),

        // A line break followed by more whitespace (this also covers every CR LF pair).
        new Regex(@"[\r\n]\s+", CommonOptions),

        // Angle brackets left over from malformed markup. This runs before entities are
        // decoded so that text written as &lt; / &gt; survives as real characters.
        new Regex(@"[<>]", CommonOptions),
    };

    /// <summary>
    /// Matches the character references handled by <see cref="DecodeEntity"/>:
    /// a fixed set of named entities and any decimal numeric reference.
    /// </summary>
    private static readonly Regex EntityPattern =
        new Regex(@"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);", CommonOptions);

    /// <summary>
    /// Runs <see cref="StripHTML"/> on a small sample document and prints the result.
    /// </summary>
    public static void Main()
    {
        string s = StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
        Console.WriteLine(s);
    }

    /// <summary>
    /// Removes HTML markup from <paramref name="strHtml"/> and decodes a small set of
    /// character entities.
    /// </summary>
    /// <param name="strHtml">Source text that contains HTML.</param>
    /// <returns>
    /// The text that remains after script elements, comments, tags, stray angle brackets
    /// and line breaks followed by whitespace have been removed and entities decoded.
    /// The result is plain text and may contain '&lt;', '&gt;' or '&amp;' produced by
    /// decoding, so it must be encoded again before being written into HTML.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="strHtml"/> is null.</exception>
    public static string StripHTML(string strHtml)
    {
        if (strHtml == null)
        {
            throw new ArgumentNullException("strHtml");
        }

        string strOutput = strHtml;
        foreach (Regex pattern in MarkupPatterns)
        {
            strOutput = pattern.Replace(strOutput, string.Empty);
        }

        // Entities are decoded in a single pass so that the output of one replacement is
        // never re-interpreted as another entity (e.g. "&amp;lt;" yields "&lt;", not "<").
        return EntityPattern.Replace(strOutput, new MatchEvaluator(DecodeEntity));
    }

    /// <summary>
    /// Maps one matched character reference to its replacement text.
    /// </summary>
    /// <param name="match">A match produced by <see cref="EntityPattern"/>.</param>
    /// <returns>
    /// The decoded character for the supported entities; the empty string for any other
    /// numeric reference, which is dropped from the output.
    /// </returns>
    private static string DecodeEntity(Match match)
    {
        switch (match.Groups[1].Value.ToLowerInvariant())
        {
            case "quot":
            case "#34":
                return "\"";
            case "amp":
            case "#38":
                return "&";
            case "lt":
            case "#60":
                return "<";
            case "gt":
            case "#62":
                return ">";
            case "nbsp":
            case "#160":
                return " ";
            case "iexcl":
            case "#161":
                return "\u00A1";
            case "cent":
            case "#162":
                return "\u00A2";
            case "pound":
            case "#163":
                return "\u00A3";
            case "copy":
            case "#169":
                return "\u00A9";
            default:
                return string.Empty;
        }
    }
}
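// Source page title: "A C# function that extracts the text from HTML code".
// Source page footer: latest recommended article published 2024-07-15 08:00:00.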