去除框架、去除链接、去除脚本等特殊字符的正则表达式

本文链接：https://blog.csdn.net/mhl0410/article/details/5293772

c# 过滤页面特殊字符

        /// <summary>
        /// Removes script blocks, anchor tags, iframes, framesets, and paragraph/strong tags
        /// from an HTML fragment, and renames inline <c>on…=</c> event-handler attributes.
        /// </summary>
        /// <remarks>
        /// This is a best-effort, regex-based filter and not a complete HTML sanitizer;
        /// do not rely on it as the only defence against script injection.
        /// </remarks>
        /// <param name="html">The HTML text to filter. May be null or empty.</param>
        /// <returns>
        /// The filtered text. Null or empty input is returned unchanged.
        /// </returns>
        private string checkStr(string html)
        {
            // Nothing to filter; also avoids an ArgumentNullException from Regex.Replace.
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            const System.Text.RegularExpressions.RegexOptions options =
                System.Text.RegularExpressions.RegexOptions.IgnoreCase;

            // The static Regex.Replace overloads use the framework's internal regex cache,
            // so the patterns are not re-parsed on every call.

            // Remove <script>…</script> blocks. The quantifier is lazy so that text lying
            // between two separate script blocks is kept.
            html = System.Text.RegularExpressions.Regex.Replace(
                html, @"<script\b[\s\S]*?</script\s*>", "", options);

            // Remove opening and closing <a> tags (the link text itself is kept).
            // \b stops the pattern from also eating tags such as <abbr> or <article>.
            html = System.Text.RegularExpressions.Regex.Replace(
                html, @"<a\b[^>]*>|</a\s*>", "", options);

            // Rename inline event-handler attributes (onclick=, onload=, …) so they are inert.
            html = System.Text.RegularExpressions.Regex.Replace(
                html, @"\son\w+\s*=", " _disibledevent=", options);

            // Remove <iframe>…</iframe> blocks.
            html = System.Text.RegularExpressions.Regex.Replace(
                html, @"<iframe\b[\s\S]*?</iframe\s*>", "", options);

            // Remove <frameset>…</frameset> blocks.
            html = System.Text.RegularExpressions.Regex.Replace(
                html, @"<frameset\b[\s\S]*?</frameset\s*>", "", options);

            // Optional filters that were disabled in the original code:
            //   <img> tags:     Regex.Replace(html, @"<img[^>]+>", "", options)
            //   all HTML tags:  Regex.Replace(html, @"<[^>]*>", "", options)
            //   all spaces:     html.Replace(" ", "")

            // Remove closing and then opening paragraph tags (case-insensitive).
            html = System.Text.RegularExpressions.Regex.Replace(html, @"</p>", "", options);
            html = System.Text.RegularExpressions.Regex.Replace(html, @"<p>", "", options);

            // Remove lower-case <strong> tags (plain, case-sensitive string replacement).
            html = html.Replace("</strong>", "");
            html = html.Replace("<strong>", "");
            return html;
        }

常用正则表达式

"^\d+$"　　//非负整数（正整数 + 0）
"^[0-9]*[1-9][0-9]*$"　　//正整数

"^((-\d+)|(0+))$"　　//非正整数（负整数 + 0）

"^-[0-9]*[1-9][0-9]*$"　　//负整数

"^-?\d+$"　　//整数

"^\d+(\.\d+)?$"　　//非负浮点数（正浮点数 + 0）

"^(([0-9]+\.[0-9]*[1-9][0-9]*)|([0-9]*[1-9][0-9]*\.[0-9]+)|([0-9]*[1-9][0-9]*))$"　　//正浮点数

"^((-\d+(\.\d+)?)|(0+(\.0+)?))$"　　//非正浮点数（负浮点数 + 0）

"^(-(([0-9]+\.[0-9]*[1-9][0-9]*)|([0-9]*[1-9][0-9]*\.[0-9]+)|([0-9]*[1-9][0-9]*)))$"　　//负浮点数

"^(-?\d+)(\.\d+)?$"　　//浮点数

"^[A-Za-z]+$"　　//由26个英文字母组成的字符串

"^[A-Z]+$"　　//由26个英文字母的大写组成的字符串

"^[a-z]+$"　　//由26个英文字母的小写组成的字符串

"^[A-Za-z0-9]+$"　　//由数字和26个英文字母组成的字符串

"^\w+$"　　//由数字、26个英文字母或者下划线组成的字符串

"^[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)+$" 或 "\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*"　　//email地址

"^[a-zA-Z]+://(\w+(-\w+)*)(\.(\w+(-\w+)*))*(\?\S*)?$"　　//url

"(\(\d{3,4}\)|\d{3,4}-)?\d{7,8}(-\d{3})*" //中国电话号码验证
匹配形式如:0511-4405222 或者021-87888822 或者 021-44055520-555 或者 (0511)4405222

"\d{6}" //中国邮政编码验证
匹配形式如:215421

"\d{18}|\d{15}" //身份证验证
匹配形式如:15位或者18位身份证

[^<>&/|'\\]+ //非法字符验证
匹配不包含非法字符（如 < > & / ' | \）的字符串
正则表达式

((((19){1}|(20){1})\d{2})|\d{2})[01]{1}\d{1}[0-3]{1}\d{1} //日期验证
匹配形式如:20030718,030718

^\d+$　　//匹配非负整数（正整数 + 0）
^[0-9]*[1-9][0-9]*$　　//匹配正整数
^((-\d+)|(0+))$　　//匹配非正整数（负整数 + 0）
^-[0-9]*[1-9][0-9]*$　　//匹配负整数
^-?\d+$　　//匹配整数
^\d+(\.\d+)?$　　//匹配非负浮点数（正浮点数 + 0）
^(([0-9]+\.[0-9]*[1-9][0-9]*)|([0-9]*[1-9][0-9]*\.[0-9]+)|([0-9]*[1-9][0-9]*))$　　//匹配正浮点数
^((-\d+(\.\d+)?)|(0+(\.0+)?))$　　//匹配非正浮点数（负浮点数 + 0）
^(-(([0-9]+\.[0-9]*[1-9][0-9]*)|([0-9]*[1-9][0-9]*\.[0-9]+)|([0-9]*[1-9][0-9]*)))$　　//匹配负浮点数
^(-?\d+)(\.\d+)?$　　//匹配浮点数
^[A-Za-z]+$　　//匹配由26个英文字母组成的字符串
^[A-Z]+$　　//匹配由26个英文字母的大写组成的字符串
^[a-z]+$　　//匹配由26个英文字母的小写组成的字符串
^[A-Za-z0-9]+$　　//匹配由数字和26个英文字母组成的字符串
^\w+$　　//匹配由数字、26个英文字母或者下划线组成的字符串
^[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)+$　　//匹配email地址
^[a-zA-Z]+://(\w+(-\w+)*)(\.(\w+(-\w+)*))*(\?\S*)?$　　//匹配url

匹配中文字符的正则表达式： [\u4e00-\u9fa5]
匹配双字节字符(包括汉字在内)：[^\x00-\xff]
匹配空行的正则表达式：\n[\s| ]*\r
匹配HTML标记的正则表达式：/<(.*)>.*<\/\1>|<(.*) \/>/
匹配首尾空格的正则表达式：(^\s*)|(\s*$)
匹配Email地址的正则表达式：\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*
匹配网址URL的正则表达式：^[a-zA-Z]+://(\w+(-\w+)*)(\.(\w+(-\w+)*))*(\?\S*)?$
匹配帐号是否合法(字母开头，允许5-16字节，允许字母数字下划线)：^[a-zA-Z][a-zA-Z0-9_]{4,15}$
匹配国内电话号码：(\d{3}-|\d{4}-)?(\d{8}|\d{7})?
匹配腾讯QQ号：^[1-9]*[1-9][0-9]*$
下表是元字符及其在正则表达式上下文中的行为的一个完整列表：
\ 将下一个字符标记为一个特殊字符、或一个原义字符、或一个后向引用、或一个八进制转义符。
^ 匹配输入字符串的开始位置。如果设置了 RegExp 对象的 Multiline 属性，^ 也匹配 '\n' 或 '\r' 之后的位置。
$ 匹配输入字符串的结束位置。如果设置了 RegExp 对象的 Multiline 属性，$ 也匹配 '\n' 或 '\r' 之前的位置。
* 匹配前面的子表达式零次或多次。
+ 匹配前面的子表达式一次或多次。+ 等价于 {1,}。
? 匹配前面的子表达式零次或一次。? 等价于 {0,1}。
{n} n 是一个非负整数，匹配确定的 n 次。
{n,} n 是一个非负整数，至少匹配 n 次。
{n,m} m 和 n 均为非负整数，其中 n <= m。最少匹配 n 次且最多匹配 m 次。在逗号和两个数之间不能有空格。
? 当该字符紧跟在任何一个其他限制符 (*, +, ?, {n}, {n,}, {n,m}) 后面时，匹配模式是非贪婪的。非贪婪模式尽可能少地匹配所搜索的字符串，而默认的贪婪模式则尽可能多地匹配所搜索的字符串。
. 匹配除 "\n" 之外的任何单个字符。要匹配包括 '\n' 在内的任何字符，请使用像 '[\s\S]' 这样的模式（在字符集合中 '.' 只表示句点本身，因此 '[.\n]' 并不能达到这个目的）。
(pattern) 匹配 pattern 并获取这一匹配。
(?:pattern) 匹配 pattern 但不获取匹配结果，也就是说这是一个非获取匹配，不进行存储供以后使用。
(?=pattern) 正向预查，在任何匹配 pattern 的字符串开始处匹配查找字符串。这是一个非获取匹配，也就是说，该匹配不需要获取供以后使用。
(?!pattern) 负向预查，与(?=pattern)作用相反
x|y 匹配 x 或 y。
[xyz] 字符集合。
[^xyz] 负值字符集合。
[a-z] 字符范围，匹配指定范围内的任意字符。
[^a-z] 负值字符范围，匹配任何不在指定范围内的任意字符。
\b 匹配一个单词边界，也就是指单词和空格间的位置。
\B 匹配非单词边界。
\cx 匹配由 x 指明的控制字符。
\d 匹配一个数字字符。等价于 [0-9]。
\D 匹配一个非数字字符。等价于 [^0-9]。
\f 匹配一个换页符。等价于 \x0c 和 \cL。
\n 匹配一个换行符。等价于 \x0a 和 \cJ。
\r 匹配一个回车符。等价于 \x0d 和 \cM。
\s 匹配任何空白字符，包括空格、制表符、换页符等等。等价于 [ \f\n\r\t\v]。
\S 匹配任何非空白字符。等价于 [^ \f\n\r\t\v]。
\t 匹配一个制表符。等价于 \x09 和 \cI。
\v 匹配一个垂直制表符。等价于 \x0b 和 \cK。
\w 匹配包括下划线的任何单词字符。等价于 '[A-Za-z0-9_]'。
\W 匹配任何非单词字符。等价于 '[^A-Za-z0-9_]'。
\xn 匹配 n，其中 n 为十六进制转义值。十六进制转义值必须为确定的两个数字长。
\num 匹配 num，其中 num 是一个正整数。对所获取的匹配的引用。
\n 标识一个八进制转义值或一个后向引用。如果 \n 之前至少 n 个获取的子表达式，则 n 为后