// Strip every tag except those whose name starts with one of the listed alternatives.
// The negative lookahead is a prefix test directly after '<', so besides <img>, <br>, <p>, </p>,
// <div> and </div> it also spares any other tag beginning with those letters (e.g. <pre>).
yHtmlDmQwStr = Regex.Replace(yHtmlDmQwStr, @"<(?!img|br|p|/p|div|/div).*?>", "", RegexOptions.IgnoreCase);//keeps p, br, img (and div)
//yHtmlDmQwStr = Regex.Replace(yHtmlDmQwStr, @"<[^>]*>", "", RegexOptions.IgnoreCase);//alternative: remove all markup
// Demo: pull the inner content out of every <p align="right">...</p> element of a sample
// string and split that content on <br>.
class Program
{
    // Entry point. Runs the extraction over a hard-coded sample; nothing is printed or returned.
    static void Main(string[] args)
    {
        // Extract the content found between tags.

        #region Single match only (does not handle repeated elements) - kept for reference
        //string str = "<p align=\"right\">北京市卫生健康委员会<br>2020年3月16日 <br></p>qqqqq<p align=\"right\">北京市卫生健康委员会2<br>2020年3月12日<br></p>";
        //Regex regex1 = new Regex("(?<=(<p align=\"right\">))[.\\s\\S]*?(?=(</p>))", RegexOptions.IgnoreCase);
        //string Pcontent = regex1.Match(str).Groups[0].Value;
        //string[] qfrq = Pcontent.Split(new string[] { "<br>", "<br>" }, StringSplitOptions.RemoveEmptyEntries);
        //Console.WriteLine(Pcontent);
        //Console.ReadLine();
        #endregion

        #region Repeated elements: iterate over every match
        string str = "<p align=\"right\">北京市卫生健康委员会<br>2020年3月16日 <br></p>qqqqq<p align=\"right\">北京市卫生健康委员会2<br>2020年3月12日<br></p>";

        // Lookbehind/lookahead keep the <p ...> and </p> tags out of the match itself.
        // [\s\S] matches any character including newlines; the lazy quantifier stops at the first </p>.
        Regex regex = new Regex("(?<=<p align=\"right\">)[\\s\\S]*?(?=</p>)", RegexOptions.IgnoreCase);

        for (Match match = regex.Match(str); match.Success; match = match.NextMatch())
        {
            // Content of one <p align="right"> ... </p> element.
            string content = match.Value;

            // Pieces separated by <br>; empty pieces (e.g. after a trailing <br>) are dropped.
            // NOTE(review): the results are not used further here - presumably this loop is
            // meant to be inspected in a debugger or extended by the reader.
            string[] parts = content.Split(new string[] { "<br>" }, StringSplitOptions.RemoveEmptyEntries);
        }
        #endregion
    }
}
/// <summary>
/// Removes HTML tags from <paramref name="html"/> while leaving img and br tags in place.
/// </summary>
/// <param name="html">Markup to clean; passed straight to Regex.Replace.</param>
/// <returns>The input with every matched tag replaced by an empty string.</returns>
public static string CHEO(string html)
{
    // The conditional group (?(?=img|br| @)notag|[a-zA-Z0-9]+) is what preserves tags:
    // when the text after '<' or '</' starts with "img", "br" or " @", the pattern demands the
    // literal "notag", which cannot match there, so the tag survives. Any other alphanumeric
    // tag name takes the second branch and the whole tag (with its attributes) is removed.
    const string tagPattern = @"</?(?(?=img|br| @)notag|[a-zA-Z0-9]+)(?:\s[a-zA-Z0-9\-]+=?(?:(["",']?).*?\1?)?)*\s*/?>";

    return Regex.Replace(html, tagPattern, "");
}
/// <summary>
/// Demonstrates three ways of reading the text captured between two anchor markers of a
/// sample string and writes the results to the console.
/// </summary>
public static void asdf()
{
    // Note: the sample closes the anchor with "<a>" instead of "</a>", so the pattern does too.
    // The pattern has exactly one capture group: the text between the two markers.
    const string anchorPattern = "<a[^>]*>([^<]*)<a>";

    var strJson = @"1231231231231<a style=\color:blue\ href=\javascript:GetOrderDetails('1156605')\>1156605<a>1231231231231233";

    // Approach 1: Regex instance + Matches, taking the first match.
    Regex rg = new Regex(anchorPattern);
    MatchCollection mc = rg.Matches(strJson);
    if (mc.Count > 0)
    {
        Console.WriteLine(mc[0].Groups[1].Value);
    }

    // Approach 2: static Regex.Match with an explicit success check.
    var dd = Regex.Match(strJson, anchorPattern);
    if (dd.Success)
    {
        Console.WriteLine(dd.Groups[1].Value);
    }

    // Approach 3: enumerate every match and report the captured text with its position.
    foreach (Match match in Regex.Matches(strJson, anchorPattern))
    {
        // Use group 1 for the index as well: group 2 does not exist in this pattern, and asking
        // for it yields an unsuccessful group whose Index is always 0.
        Console.WriteLine("Duplicate '{0}' found at position {1}.", match.Groups[1].Value, match.Groups[1].Index);
    }
}
// Collect the href value ("url") and the inner text ("text") of every <a> tag in HTMLstr.
// HTMLstr is the HTML input; it is not defined in this snippet.
//
// Pattern walk-through:
//   (?is)                     case-insensitive, and '.' also matches newlines
//   <a(?:(?!href=).)*href=    the opening tag up to its href attribute
//   (['""]?) ... \1           optional quote, which must be matched again after the value
//   (?<url>[^""\s>]*)         the attribute value
//   (?<text>(?:(?!</?a\b).)*) everything up to the next opening or closing anchor tag
Regex reg = new Regex(@"(?is)<a(?:(?!href=).)*href=(['""]?)(?<url>[^""\s>]*)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
MatchCollection mc = reg.Matches(HTMLstr);
foreach (Match m in mc)
{
    string url = m.Groups["url"].Value;
    string text = m.Groups["text"].Value;
}
// Extract tag attribute values
#region 案例
//Regex regex = new Regex(@"<a[^>]*>([^<]*)</a>", RegexOptions.IgnoreCase);
//for (Match match = regex.Match(yHtmlDmQwStr); match.Success; match = match.NextMatch())
//{
// string d = match.Groups[0].ToString();
// if (d.Contains("fjLink"))
// {
// string tmpStr = string.Format("<{0}[^>]*?{1}=(['\"\"]?)(?<href>[^'\"\"\\s>]+)\\1[^>]*>", "a", "href");
// MatchCollection titleMatch = Regex.Matches(d, tmpStr, RegexOptions.IgnoreCase);
// foreach (Match m in titleMatch)
// {
// HtmlMB[28] = HtmlMB[28] + m.Groups["href"].Value + "||";
// }
// }
//}
#endregion
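// Match whole <a> elements so they can be removed entirely.
// Pattern (written as the contents of a C# verbatim string, hence the doubled quotes):
// <a(?:(?!href=).)*href=(['""]?)(?<url>[^""'\s>]*)\1[^>]*>(?<text>(?:(?!</a>).)*)</a>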
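// Regexes that select the content of elements carrying a specific class attribute value.
// These two are written PCRE/PHP-style (delimiters and trailing "ism" flags), not as C# code:
//   "/<div class=\"tideTable\"(.*?)>(.*?)<\/div>/ism"
//   <a class=\"alink\"(.*?)>(.*?)</a>ism
// Regex reference: http://www.jb51.net/tools/zhengze.html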
// Matches the text between a literal <td> and the next </td>; the lookbehind and lookahead keep
// the tags themselves out of the match. Only attribute-less <td> tags qualify, and because
// RegexOptions.Singleline is not set, '.' stops at newlines, so multi-line cells are not matched.
Regex reg = new Regex(@"(?<=<td>)(.*?)(?=</td>)", RegexOptions.IgnoreCase);
/// <summary>
/// Strips HTML markup from <paramref name="str"/> and returns the remaining text.
/// </summary>
/// <remarks>
/// Removal order: script blocks, style blocks, object blocks, comments, all remaining tags,
/// line breaks that are followed by whitespace, and finally non-breaking spaces.
/// </remarks>
/// <param name="str">HTML fragment to clean; may be null.</param>
/// <returns>The cleaned text, or null when <paramref name="str"/> is null.</returns>
public static string noHtml(string str)
{
    if (str == null)
    {
        return str;
    }

    // Remove script blocks together with their content.
    str = Regex.Replace(str, @"<script[^>]*>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);

    // Remove style blocks. The match is lazy so that text sitting between two separate
    // style blocks is kept (a greedy match would swallow everything up to the last </style>).
    str = Regex.Replace(str, @"<style[^>]*>[\s\S]*?</style>", "", RegexOptions.IgnoreCase);

    // Remove object blocks; Singleline lets '.' cross line breaks for multi-line blocks.
    str = Regex.Replace(str, @"<object.*?/object>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove comments, including multi-line ones, without touching the text that follows
    // the closing "-->". An unterminated comment is removed up to the end of the input.
    str = Regex.Replace(str, @"<!--[\s\S]*?(?:-->|\z)", "");

    // Optional, more selective removals kept for reference:
    //str = Regex.Replace(str, @"<\/*[^<>]*>", "", RegexOptions.IgnoreCase);//remove all html
    //str = Regex.Replace(str, @"<(\/){0,1}div[^<>]*>", "", RegexOptions.IgnoreCase);//remove div tags
    //str = Regex.Replace(str, @"<(\/){0,1}a[^<>]*>", "", RegexOptions.IgnoreCase);//remove hyperlink tags
    //str = Regex.Replace(str, @"<(\/){0,1}font[^<>]*>", "", RegexOptions.IgnoreCase);//remove font tags
    //str = Regex.Replace(str, @"(class=){1,}(""|\'){0,1}\S+(""|\'|>|\s){0,1}", "", RegexOptions.IgnoreCase);//remove class attributes
    //str = Regex.Replace(str, @"(<iframe){1,}[^<>]*>[^\0]*(<\/iframe>){1,}", "", RegexOptions.IgnoreCase);//remove iframes
    //str = Regex.Replace(str, @"(<script){1,}[^<>]*>[^\0]*(<\/script>){1,}", "", RegexOptions.IgnoreCase);//remove scripts

    // Remove every remaining tag.
    str = Regex.Replace(str, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Remove a line-break character together with the whitespace run that follows it.
    // A single break followed directly by text is left alone.
    str = Regex.Replace(str, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Turn non-breaking spaces into ordinary spaces.
    // NOTE(review): this step previously replaced a space with a space, which does nothing;
    // presumably the pattern was "&nbsp;" before it was HTML-decoded - confirm.
    str = Regex.Replace(str, @"&nbsp;|&#160;|\u00A0", " ", RegexOptions.IgnoreCase);

    return str;
}
//正则表达式获取指定class的所有a标签
//<a\s+[^>]*class='fjLink'[^>]*>[^<]*</a>|<img\s+[^>]*class='fjLink'[^>]*>[^<]*</img>