1、从Xml中提取Encoding值的正则表达式
<\?xml\s+[^>]*Encoding=('|")?(?<Encoding>[^>\s'"]*)('|")?[^>]*>
2、过滤所有HTML标签取纯文本
Code
/// <summary>
/// Strips all HTML markup from a string and returns the remaining plain text.
/// </summary>
/// <remarks>
/// Line breaks and tabs are first replaced with spaces. Whole <c>script</c> and
/// <c>style</c> elements (tags and their content) are then removed, and finally
/// every remaining <c>&lt;...&gt;</c> tag is removed. Character entities are not decoded.
/// </remarks>
/// <param name="html">The HTML to strip; may be null or empty.</param>
/// <returns>
/// The text with markup removed, or an empty string when
/// <paramref name="html"/> is null or empty.
/// </returns>
public static string FilterHTMLTags(string html)
{
    // Guard: without this check a null argument fails on the first Replace call.
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Flatten line breaks and tabs; "\n\r" and "\r\n" pairs each become a single space.
    html = html.Replace("\n\r", " ").Replace("\r\n", " ").Replace("\n", " ").Replace("\r", " ").Replace("\t", " ");

    // Remove <script>...</script> elements together with their content.
    html = Regex.Replace(html, @"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?script[\s]*?>", string.Empty, RegexOptions.IgnoreCase);

    // Remove <style>...</style> elements together with their content.
    html = Regex.Replace(html, @"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?style[\s]*?>", string.Empty, RegexOptions.IgnoreCase);

    // Remove every remaining tag. The pattern contains no letters, so no case option is needed.
    html = Regex.Replace(html, @"<[^>]+>", string.Empty);

    return html;
}
/// <summary>
/// Strips all HTML markup from a string and returns the remaining plain text.
/// </summary>
/// <remarks>
/// Line breaks and tabs are first replaced with spaces. Whole <c>script</c> and
/// <c>style</c> elements (tags and their content) are then removed, and finally
/// every remaining <c>&lt;...&gt;</c> tag is removed. Character entities are not decoded.
/// </remarks>
/// <param name="html">The HTML to strip; may be null or empty.</param>
/// <returns>
/// The text with markup removed, or an empty string when
/// <paramref name="html"/> is null or empty.
/// </returns>
public static string FilterHTMLTags(string html)
{
    // Guard: without this check a null argument fails on the first Replace call.
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Flatten line breaks and tabs; "\n\r" and "\r\n" pairs each become a single space.
    html = html.Replace("\n\r", " ").Replace("\r\n", " ").Replace("\n", " ").Replace("\r", " ").Replace("\t", " ");

    // Remove <script>...</script> elements together with their content.
    html = Regex.Replace(html, @"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?script[\s]*?>", string.Empty, RegexOptions.IgnoreCase);

    // Remove <style>...</style> elements together with their content.
    html = Regex.Replace(html, @"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?style[\s]*?>", string.Empty, RegexOptions.IgnoreCase);

    // Remove every remaining tag. The pattern contains no letters, so no case option is needed.
    html = Regex.Replace(html, @"<[^>]+>", string.Empty);

    return html;
}
3、过滤不安全的HTML脚本
Code
/// <summary>
/// Removes <c>script</c> elements, <c>style</c> elements and <c>form</c> tags from HTML.
/// </summary>
/// <remarks>
/// <c>script</c> and <c>style</c> elements are removed together with their content;
/// for <c>form</c> only the opening and closing tags are removed and the content is kept.
/// The removal passes repeat until the text stops changing, so fragments that
/// only form a tag after an inner tag has been removed (for example
/// <c>&lt;scr&lt;script&gt;&lt;/script&gt;ipt&gt;</c>) are removed as well.
/// NOTE(review): this is a regex filter, not a full HTML sanitizer; inline event
/// handler attributes, <c>javascript:</c> URLs and other elements are not touched.
/// </remarks>
/// <param name="html">The HTML to clean; may be null or empty.</param>
/// <returns>
/// The cleaned HTML, or an empty string when <paramref name="html"/> is null or empty.
/// </returns>
public static string Wipescript(string html)
{
    // Guard: Regex.Replace rejects a null input string.
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    string previous;
    do
    {
        previous = html;

        // <script ...> ... </script>, including the content.
        html = Regex.Replace(html, @"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?script[\s]*?>", string.Empty, RegexOptions.IgnoreCase);

        // <form ...> and </form> tags only; the form's content stays.
        html = Regex.Replace(html, @"</?form[^>]*>", string.Empty, RegexOptions.IgnoreCase);

        // <style ...> ... </style>, including the content.
        html = Regex.Replace(html, @"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?style[\s]*?>", string.Empty, RegexOptions.IgnoreCase);
    }
    // Each changing pass shortens the string, so this loop always terminates.
    while (html != previous);

    return html;
}
/// <summary>
/// Removes <c>script</c> elements, <c>style</c> elements and <c>form</c> tags from HTML.
/// </summary>
/// <remarks>
/// <c>script</c> and <c>style</c> elements are removed together with their content;
/// for <c>form</c> only the opening and closing tags are removed and the content is kept.
/// The removal passes repeat until the text stops changing, so fragments that
/// only form a tag after an inner tag has been removed (for example
/// <c>&lt;scr&lt;script&gt;&lt;/script&gt;ipt&gt;</c>) are removed as well.
/// NOTE(review): this is a regex filter, not a full HTML sanitizer; inline event
/// handler attributes, <c>javascript:</c> URLs and other elements are not touched.
/// </remarks>
/// <param name="html">The HTML to clean; may be null or empty.</param>
/// <returns>
/// The cleaned HTML, or an empty string when <paramref name="html"/> is null or empty.
/// </returns>
public static string Wipescript(string html)
{
    // Guard: Regex.Replace rejects a null input string.
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    string previous;
    do
    {
        previous = html;

        // <script ...> ... </script>, including the content.
        html = Regex.Replace(html, @"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?script[\s]*?>", string.Empty, RegexOptions.IgnoreCase);

        // <form ...> and </form> tags only; the form's content stays.
        html = Regex.Replace(html, @"</?form[^>]*>", string.Empty, RegexOptions.IgnoreCase);

        // <style ...> ... </style>, including the content.
        html = Regex.Replace(html, @"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?\/[\s]*?style[\s]*?>", string.Empty, RegexOptions.IgnoreCase);
    }
    // Each changing pass shortens the string, so this loop always terminates.
    while (html != previous);

    return html;
}
4、取所有src链接
(\ssrc=)(?<url>[^>\s]*)
5、取所有图片链接地址
Code
/// <summary>
/// Collects the distinct <c>src</c> addresses of all <c>img</c> tags in an HTML string.
/// </summary>
/// <remarks>
/// Addresses are compared case-insensitively; the first spelling encountered is kept.
/// </remarks>
/// <param name="html">The source HTML; may be null or empty.</param>
/// <returns>
/// A <see cref="Hashtable"/> whose keys are consecutive <c>int</c> indexes starting at 0
/// (in order of first appearance) and whose values are the address strings.
/// Empty when no image is found or <paramref name="html"/> is null or empty.
/// </returns>
public static Hashtable GetImg(string html)
{
    Hashtable htb = new Hashtable();

    // Guard: Regex.Matches rejects a null input string.
    if (string.IsNullOrEmpty(html))
    {
        return htb;
    }

    // Case-insensitive lookup of addresses already stored, replacing a full scan
    // of the result table for every match.
    Hashtable seen = new Hashtable(StringComparer.OrdinalIgnoreCase);

    // The pattern has no unescaped whitespace or '#', so IgnorePatternWhitespace is not needed.
    MatchCollection matches = Regex.Matches(html, @"<img\s+[^>]*src=\s*('|"")*(?<url>[^>\s'""]*)('|"")*[^>]*>", RegexOptions.IgnoreCase);
    foreach (Match match in matches)
    {
        string url = match.Groups["url"].Value;
        if (seen.ContainsKey(url))
        {
            continue;
        }

        seen.Add(url, null);
        // The next free index equals the number of entries stored so far.
        htb.Add(htb.Count, url);
    }

    return htb;
}
/// <summary>
/// Collects the distinct <c>src</c> addresses of all <c>img</c> tags in an HTML string.
/// </summary>
/// <remarks>
/// Addresses are compared case-insensitively; the first spelling encountered is kept.
/// </remarks>
/// <param name="html">The source HTML; may be null or empty.</param>
/// <returns>
/// A <see cref="Hashtable"/> whose keys are consecutive <c>int</c> indexes starting at 0
/// (in order of first appearance) and whose values are the address strings.
/// Empty when no image is found or <paramref name="html"/> is null or empty.
/// </returns>
public static Hashtable GetImg(string html)
{
    Hashtable htb = new Hashtable();

    // Guard: Regex.Matches rejects a null input string.
    if (string.IsNullOrEmpty(html))
    {
        return htb;
    }

    // Case-insensitive lookup of addresses already stored, replacing a full scan
    // of the result table for every match.
    Hashtable seen = new Hashtable(StringComparer.OrdinalIgnoreCase);

    // The pattern has no unescaped whitespace or '#', so IgnorePatternWhitespace is not needed.
    MatchCollection matches = Regex.Matches(html, @"<img\s+[^>]*src=\s*('|"")*(?<url>[^>\s'""]*)('|"")*[^>]*>", RegexOptions.IgnoreCase);
    foreach (Match match in matches)
    {
        string url = match.Groups["url"].Value;
        if (seen.ContainsKey(url))
        {
            continue;
        }

        seen.Add(url, null);
        // The next free index equals the number of entries stored so far.
        htb.Add(htb.Count, url);
    }

    return htb;
}
6、高亮显示文本内容
Code
/// <summary>
/// Wraps every occurrence of the given keywords in a highlight <c>span</c>.
/// </summary>
/// <remarks>
/// Keywords are matched literally and case-insensitively. All keywords are applied
/// in a single pass, so a keyword can never match inside markup inserted for
/// another keyword. When two keywords could match at the same position, the one
/// listed first wins.
/// </remarks>
/// <param name="textToShow">The text to display; returned unchanged when null or empty.</param>
/// <param name="keysString">Comma-separated keywords to highlight; may be null or empty.</param>
/// <returns>
/// <paramref name="textToShow"/> with each match wrapped in
/// <c>&lt;span class="Highlight"&gt;...&lt;/span&gt;</c>.
/// </returns>
public static string MarkHightLightText(string textToShow, string keysString)
{
    if (string.IsNullOrEmpty(textToShow) || string.IsNullOrEmpty(keysString))
    {
        return textToShow;
    }

    // NOTE(review): the separator list previously contained ',' twice; the second
    // entry may originally have been a full-width comma (U+FF0C) - confirm.
    string[] keys = keysString.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
    if (keys.Length == 0)
    {
        return textToShow;
    }

    // Escape each keyword so characters such as '+', '(' or '.' are matched
    // literally instead of being interpreted as regex syntax.
    for (int k = 0; k < keys.Length; k++)
    {
        keys[k] = Regex.Escape(keys[k]);
    }

    // One alternation over all keywords; "$0" re-inserts the matched text with its original casing.
    string pattern = string.Join("|", keys);
    return Regex.Replace(textToShow, pattern, "<span class=\"Highlight\">$0</span>", RegexOptions.IgnoreCase);
}
/// <summary>
/// Wraps every occurrence of the given keywords in a highlight <c>span</c>.
/// </summary>
/// <remarks>
/// Keywords are matched literally and case-insensitively. All keywords are applied
/// in a single pass, so a keyword can never match inside markup inserted for
/// another keyword. When two keywords could match at the same position, the one
/// listed first wins.
/// </remarks>
/// <param name="textToShow">The text to display; returned unchanged when null or empty.</param>
/// <param name="keysString">Comma-separated keywords to highlight; may be null or empty.</param>
/// <returns>
/// <paramref name="textToShow"/> with each match wrapped in
/// <c>&lt;span class="Highlight"&gt;...&lt;/span&gt;</c>.
/// </returns>
public static string MarkHightLightText(string textToShow, string keysString)
{
    if (string.IsNullOrEmpty(textToShow) || string.IsNullOrEmpty(keysString))
    {
        return textToShow;
    }

    // NOTE(review): the separator list previously contained ',' twice; the second
    // entry may originally have been a full-width comma (U+FF0C) - confirm.
    string[] keys = keysString.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
    if (keys.Length == 0)
    {
        return textToShow;
    }

    // Escape each keyword so characters such as '+', '(' or '.' are matched
    // literally instead of being interpreted as regex syntax.
    for (int k = 0; k < keys.Length; k++)
    {
        keys[k] = Regex.Escape(keys[k]);
    }

    // One alternation over all keywords; "$0" re-inserts the matched text with its original casing.
    string pattern = string.Join("|", keys);
    return Regex.Replace(textToShow, pattern, "<span class=\"Highlight\">$0</span>", RegexOptions.IgnoreCase);
}
7、在页面头部找feed链接
<link[^>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>
例:
Code
/// <summary>
/// Looks for a feed address in the head section of an HTML document.
/// </summary>
/// <remarks>
/// Only the markup before the first <c>&lt;/head&gt;</c> is inspected. The <c>href</c> of
/// the first <c>link</c> element whose extension is not found in
/// <c>_blackExtendNameString</c> is returned. Any exception is logged through
/// <c>Log.Write</c> and swallowed.
/// NOTE(review): <c>_blackExtendNameString</c> is declared elsewhere; it is presumably
/// a comma-delimited list of excluded extensions with leading and trailing commas - confirm.
/// </remarks>
/// <param name="strHtml">HTML that may contain a feed address; may be null or empty.</param>
/// <returns>
/// The address found, or an empty string when there is none, the input has no
/// <c>&lt;/head&gt;</c>, or an error occurred.
/// </returns>
public static string GetFeedAddressInHtml(string strHtml)
{
    string strFeedAddress = string.Empty;
    try
    {
        // Null is treated like empty input instead of surfacing as a logged exception.
        if (string.IsNullOrEmpty(strHtml))
        {
            return strFeedAddress;
        }

        // Locate the end of the head once, case-insensitively, without lower-casing the whole document.
        int intHeaderEndIndex = strHtml.IndexOf("</head>", StringComparison.OrdinalIgnoreCase);
        if (intHeaderEndIndex < 0)
        {
            return strFeedAddress;
        }

        // Keep only the head markup.
        string strHeader = strHtml.Substring(0, intHeaderEndIndex);
        string strRegex = @"<link[^>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>";

        // IgnoreCase so upper-case markup such as <LINK HREF=...> is found too.
        foreach (Match res in Regex.Matches(strHeader, strRegex, RegexOptions.IgnoreCase))
        {
            string strFeed = res.Groups["href"].Value.Replace("\"", string.Empty).Trim();

            // Text after the last '.'; the whole value when there is no '.' (LastIndexOf gives -1).
            string strExtend = strFeed.Substring(strFeed.LastIndexOf('.') + 1);

            // Skip addresses whose extension is listed in the exclusion string.
            if (_blackExtendNameString.IndexOf("," + strExtend.Trim().ToLowerInvariant() + ",") >= 0)
            {
                continue;
            }

            strFeedAddress = strFeed;
            break;
        }
    }
    catch (Exception ex)
    {
        Log.Write("在Html中查找Feed地址出错");
        Log.Write(ex);
    }
    return strFeedAddress;
}
/// <summary>
/// Looks for a feed address in the head section of an HTML document.
/// </summary>
/// <remarks>
/// Only the markup before the first <c>&lt;/head&gt;</c> is inspected. The <c>href</c> of
/// the first <c>link</c> element whose extension is not found in
/// <c>_blackExtendNameString</c> is returned. Any exception is logged through
/// <c>Log.Write</c> and swallowed.
/// NOTE(review): <c>_blackExtendNameString</c> is declared elsewhere; it is presumably
/// a comma-delimited list of excluded extensions with leading and trailing commas - confirm.
/// </remarks>
/// <param name="strHtml">HTML that may contain a feed address; may be null or empty.</param>
/// <returns>
/// The address found, or an empty string when there is none, the input has no
/// <c>&lt;/head&gt;</c>, or an error occurred.
/// </returns>
public static string GetFeedAddressInHtml(string strHtml)
{
    string strFeedAddress = string.Empty;
    try
    {
        // Null is treated like empty input instead of surfacing as a logged exception.
        if (string.IsNullOrEmpty(strHtml))
        {
            return strFeedAddress;
        }

        // Locate the end of the head once, case-insensitively, without lower-casing the whole document.
        int intHeaderEndIndex = strHtml.IndexOf("</head>", StringComparison.OrdinalIgnoreCase);
        if (intHeaderEndIndex < 0)
        {
            return strFeedAddress;
        }

        // Keep only the head markup.
        string strHeader = strHtml.Substring(0, intHeaderEndIndex);
        string strRegex = @"<link[^>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>";

        // IgnoreCase so upper-case markup such as <LINK HREF=...> is found too.
        foreach (Match res in Regex.Matches(strHeader, strRegex, RegexOptions.IgnoreCase))
        {
            string strFeed = res.Groups["href"].Value.Replace("\"", string.Empty).Trim();

            // Text after the last '.'; the whole value when there is no '.' (LastIndexOf gives -1).
            string strExtend = strFeed.Substring(strFeed.LastIndexOf('.') + 1);

            // Skip addresses whose extension is listed in the exclusion string.
            if (_blackExtendNameString.IndexOf("," + strExtend.Trim().ToLowerInvariant() + ",") >= 0)
            {
                continue;
            }

            strFeedAddress = strFeed;
            break;
        }
    }
    catch (Exception ex)
    {
        Log.Write("在Html中查找Feed地址出错");
        Log.Write(ex);
    }
    return strFeedAddress;
}
8、分离Url参数和锚点
Code
// Split the "idx" query-string value into the part before '#' (strIdx) and the
// anchor starting at '#' (strAnchor).
// Read the parameter once; a missing parameter is treated as empty so the regex
// calls below do not receive null.
string rawIdx = Request.QueryString["idx"] ?? string.Empty;
// "#.+" needs at least one character after '#', so a bare trailing '#' is not
// treated as an anchor and stays in strIdx.
Regex rex = new Regex("#.+");
MatchCollection mct = rex.Matches(rawIdx);
// Regex.Matches never returns null, so only the count needs checking.
string strAnchor = mct.Count == 0 ? string.Empty : mct[0].Value.Trim();
string strIdx = rex.Replace(rawIdx, "");
// Split the "idx" query-string value into the part before '#' (strIdx) and the
// anchor starting at '#' (strAnchor).
// Read the parameter once; a missing parameter is treated as empty so the regex
// calls below do not receive null.
string rawIdx = Request.QueryString["idx"] ?? string.Empty;
// "#.+" needs at least one character after '#', so a bare trailing '#' is not
// treated as an anchor and stays in strIdx.
Regex rex = new Regex("#.+");
MatchCollection mct = rex.Matches(rawIdx);
// Regex.Matches never returns null, so only the count needs checking.
string strAnchor = mct.Count == 0 ? string.Empty : mct[0].Value.Trim();
string strIdx = rex.Replace(rawIdx, "");
9、过滤unicode编码(即形如 &#12345; 的HTML数字字符引用)
\&\#\d*;
附:C#中调用正则表达式匹配的代码示例
Code
// Example: read the encoding name from an XML declaration found in strHtml.
// The attribute name is matched case-insensitively and the value may be wrapped
// in single quotes, double quotes or nothing.
MatchCollection results = Regex.Matches(
    strHtml,
    @"<\?xml\s+[^>]*Encoding=('|"")?(?<Encoding>[^>\s'""]*)('|"")?[^>]*>",
    RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

// When several declarations match, the value of the last one is kept;
// strEncoding is left untouched when nothing matches.
int lastIndex = results.Count - 1;
if (lastIndex >= 0)
{
    strEncoding = results[lastIndex].Groups["Encoding"].Value;
}
// Example: read the encoding name from an XML declaration found in strHtml.
// The attribute name is matched case-insensitively and the value may be wrapped
// in single quotes, double quotes or nothing.
MatchCollection results = Regex.Matches(
    strHtml,
    @"<\?xml\s+[^>]*Encoding=('|"")?(?<Encoding>[^>\s'""]*)('|"")?[^>]*>",
    RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

// When several declarations match, the value of the last one is kept;
// strEncoding is left untouched when nothing matches.
int lastIndex = results.Count - 1;
if (lastIndex >= 0)
{
    strEncoding = results[lastIndex].Groups["Encoding"].Value;
}