常用正则表达式

weixin_30249203

于 2008-05-04 17:15:00 发布

阅读量83

点赞数

文章标签： c#

原文链接：http://www.cnblogs.com/litsword/articles/1182053.html

版权

1、从 XML 声明中提取 encoding 值的正则表达式（XML 声明中的属性名为小写 encoding，匹配时需启用 RegexOptions.IgnoreCase，见文末示例）
<\?xml\s+[^>]*Encoding=('|")?(?<Encoding>[^>\s'"]*)('|")?[^>]*>

2、过滤所有HTML标签取纯文本

Code

/// <summary>
/// Strips every HTML tag from the input and returns the remaining plain text.
/// </summary>
/// <param name="html">HTML markup to reduce to text.</param>
/// <returns>
/// The input with line breaks and tabs turned into spaces, script and style
/// elements removed together with their contents, and every other tag removed.
/// </returns>
public static string FilterHTMLTags(string html)
{
    // Applied strictly in this order: each pass works on the output of the previous one.
    string[] whitespaceTokens = { "\n\r", "\r\n", "\n", "\r", "\t" };
    foreach (string token in whitespaceTokens)
    {
        html = html.Replace(token, " ");
    }

    // Script blocks first, then style blocks, then whatever tags are left.
    string[] tagPatterns =
    {
        @"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?script[\s]*?>",
        @"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?style[\s]*?>",
        @"<[^>]+>"
    };
    foreach (string pattern in tagPatterns)
    {
        html = Regex.Replace(html, pattern, string.Empty, RegexOptions.IgnoreCase);
    }

    return html;
}

3、过滤不安全的HTML脚本

Code

/// <summary>
/// Removes script elements, form tags and style elements from HTML using regular expressions.
/// </summary>
/// <param name="html">HTML to clean.</param>
/// <returns>
/// The HTML without script and style elements (contents included) and without
/// opening or closing form tags. Only these three constructs are removed.
/// </returns>
public static string Wipescript(string html)
{
    const RegexOptions options = RegexOptions.IgnoreCase;

    // <script ...> ... </script>, including everything in between.
    string cleaned = Regex.Replace(html, @"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?script[\s]*?>", "", options);

    // Only the <form ...> and </form> tags themselves; the enclosed markup stays.
    cleaned = Regex.Replace(cleaned, @"</?form[^>]*>", "", options);

    // <style ...> ... </style>, including everything in between.
    cleaned = Regex.Replace(cleaned, @"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?style[\s]*?>", "", options);

    return cleaned;
}

4、取所有src链接
(\ssrc=)(?<url>[^>\s]*)

5、取所有图片链接地址

Code

/// <summary>
/// Collects the distinct image addresses found in the <c>src</c> attributes of
/// <c>&lt;img&gt;</c> tags.
/// </summary>
/// <param name="html">HTML to scan. Passing null makes Regex.Matches throw ArgumentNullException.</param>
/// <returns>
/// A Hashtable whose keys are consecutive integers starting at 0, assigned in
/// order of first appearance, and whose values are the image addresses.
/// Addresses that differ only by letter case count as duplicates; the first
/// spelling encountered is the one stored.
/// </returns>
public static Hashtable GetImg(string html)
{
    MatchCollection matches = Regex.Matches(html, @"<img\s+[^>]*src=\s*('|"")*(?<url>[^>\s'""]*)('|"")*[^>]*>", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

    Hashtable htb = new Hashtable();

    // Case-insensitive set of addresses already stored; replaces a full scan of
    // the table (with culture-sensitive ToLower) for every match.
    System.Collections.Generic.HashSet<string> seen =
        new System.Collections.Generic.HashSet<string>(System.StringComparer.OrdinalIgnoreCase);

    foreach (Match match in matches)
    {
        string url = match.Groups["url"].Value;

        // Add returns false when an equal address is already present.
        if (seen.Add(url))
        {
            // Count equals the next free sequential key: 0, 1, 2, ...
            htb.Add(htb.Count, url);
        }
    }

    return htb;
}

6、高亮显示文本内容

Code

/// <summary>
/// Wraps every occurrence of the given keywords in a
/// <c>&lt;span class="Highlight"&gt;</c> element.
/// </summary>
/// <param name="textToShow">Text to display.</param>
/// <param name="keysString">
/// Keywords to highlight, separated by ASCII or full-width commas. Keywords are
/// matched literally and case-insensitively.
/// </param>
/// <returns>
/// The text with each keyword occurrence wrapped in the highlight span; the
/// original letter case of the matched text is kept. The input is returned
/// unchanged when either argument is null or empty or when no keyword remains
/// after splitting.
/// </returns>
public static string MarkHightLightText(string textToShow, string keysString)
{
    if (string.IsNullOrEmpty(textToShow) || string.IsNullOrEmpty(keysString))
    {
        return textToShow;
    }

    string[] keys = keysString.Split(new char[] { ',', '，' }, StringSplitOptions.RemoveEmptyEntries);
    if (keys.Length == 0)
    {
        // An empty alternation would match at every position.
        return textToShow;
    }

    // Longest keywords first so that, at a given position, "foobar" is preferred over "foo".
    Array.Sort(keys, delegate(string a, string b) { return b.Length.CompareTo(a.Length); });

    // Escape each keyword so characters such as '+', '(' or '.' are matched literally
    // instead of being interpreted as regex syntax.
    string[] escapedKeys = new string[keys.Length];
    for (int k = 0; k < keys.Length; k++)
    {
        escapedKeys[k] = Regex.Escape(keys[k]);
    }

    // One pass over the original text: markup inserted for one keyword can never be
    // matched by another keyword, and no manual offset bookkeeping is needed.
    string pattern = string.Join("|", escapedKeys);

    // $0 stands for the matched text itself.
    return Regex.Replace(textToShow, pattern, "<span class=\"Highlight\">$0</span>", RegexOptions.IgnoreCase);
}

7、在页面头部找feed链接

例:

Code

/// <summary>
/// Looks for a feed address among the <c>&lt;link&gt;</c> elements that appear
/// before <c>&lt;/head&gt;</c> in an HTML document.
/// </summary>
/// <param name="strHtml">HTML that may contain a feed address.</param>
/// <returns>
/// The href of the first link whose extension is not listed in
/// <c>_blackExtendNameString</c>, or an empty string when the input is null or
/// empty, has no <c>&lt;/head&gt;</c>, has no acceptable link, or an error occurs
/// (errors are logged, not rethrown).
/// </returns>
public static string GetFeedAddressInHtml(string strHtml)
{
    string strFeedAddress = string.Empty;

    try
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return strFeedAddress;
        }

        // Locate the end of the head once, ignoring case, without copying the document.
        int intHeaderEndIndex = strHtml.IndexOf("</head>", StringComparison.OrdinalIgnoreCase);
        if (intHeaderEndIndex < 0)
        {
            return strFeedAddress;
        }

        // Only the head section is searched.
        string strHeader = strHtml.Substring(0, intHeaderEndIndex);

        // href value in single quotes, double quotes, or unquoted.
        string strRegex = @"<link[^>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>";

        // IgnoreCase so that <LINK HREF=...> is found, in line with the case-insensitive </head> search.
        MatchCollection results = Regex.Matches(strHeader, strRegex, RegexOptions.IgnoreCase);

        foreach (Match res in results)
        {
            string strFeed = res.Groups["href"].Value.Replace("\"", string.Empty).Trim();

            // Text after the last '.'; when there is no '.', LastIndexOf gives -1 and the
            // whole address is used.
            string strExtend = strFeed.Substring(strFeed.LastIndexOf('.') + 1);

            // NOTE(review): _blackExtendNameString is declared elsewhere; presumably a
            // comma-delimited, lower-case list such as ",css,ico," - confirm.
            if (_blackExtendNameString.IndexOf("," + strExtend.Trim().ToLowerInvariant() + ",") >= 0)
            {
                continue;
            }

            strFeedAddress = strFeed;
            break;
        }
    }
    catch (Exception ex)
    {
        // Best effort: log and fall through to return whatever has been found so far.
        Log.Write("在Html中查找Feed地址出错");
        Log.Write(ex);
    }

    return strFeedAddress;
}

8、分离Url参数和锚点

Code

// Split the "idx" query-string value into its "#..." anchor and the remainder.
Regex rex = new Regex("#.+");
string rawIdx = Request.QueryString["idx"];
MatchCollection mct = rex.Matches(rawIdx);

// The anchor is the first "#..." match, trimmed; empty when nothing matches.
string strAnchor = string.Empty;
if (mct != null && mct.Count != 0)
{
    strAnchor = mct[0].Value.Trim();
}

// The parameter value is what remains once every "#..." match has been removed.
string strIdx = rex.Replace(rawIdx, "");

9、过滤 HTML 数字字符引用（形如 &#20013; 的十进制 Unicode 字符实体）

\&\#\d*;

附：C#中调用正则表达式匹配的代码示例

Code

// Find XML declarations in strHtml and read the value of their Encoding attribute.
string encodingPattern = @"<\?xml\s+[^>]*Encoding=('|"")?(?<Encoding>[^>\s'""]*)('|"")?[^>]*>";
MatchCollection results = Regex.Matches(strHtml, encodingPattern, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

// Every match overwrites strEncoding, so the last match wins.
for (int k = 0; k < results.Count; k++)
{
    strEncoding = results[k].Groups["Encoding"].Value;
}

转载于:https://www.cnblogs.com/litsword/articles/1182053.html

weixin_30249203

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
常用正则表达式

1、从Xml中提取Encoding值的<\?xml\s+[^>]*Encoding=('|")?(?<Encoding>[^>\s'"]*)('|")?[^>]*>2、过滤所有HTML标签取纯文本Code/**////<summary>///过滤所有HTML标签取纯文本///&lt...
复制链接

扫一扫