正则表达式与HTML网页源码

最新推荐文章于 2021-12-20 22:59:53 发布

GaoXianSheng7

最新推荐文章于 2021-12-20 22:59:53 发布

阅读量671

点赞数

本文链接：https://blog.csdn.net/GaoXianSheng7/article/details/105994406

版权

 // Strip every tag except <img>, <br>, <p>/</p> and <div>/</div>.
 // \b keeps the whitelist exact: a bare "p" prefix would also spare <pre>, <param>, <picture>, ...
 // [^>]* (instead of .*?) lets a tag that wraps across several lines be removed too, because '.' stops at '\n'.
 yHtmlDmQwStr = Regex.Replace(yHtmlDmQwStr, @"<(?!(?:img|br|/?p|/?div)\b)[^>]*>", "", RegexOptions.IgnoreCase);
 // Alternative: strip all markup without exception.
 //yHtmlDmQwStr = Regex.Replace(yHtmlDmQwStr, @"<[^>]*>", "", RegexOptions.IgnoreCase);

  /// <summary>
  /// Demo: pulls the inner HTML of each &lt;p align="right"&gt; paragraph out of a sample string
  /// and splits it on &lt;br&gt;. The results are computed but not printed.
  /// </summary>
  class Program
    {
        static void Main(string[] args)
        {
            // Extract the content enclosed by a tag.
            #region Variant 1: first occurrence only
            //string str = "<p align=\"right\">北京市卫生健康委员会<br>2020年3月16日 <br></p>qqqqq<p align=\"right\">北京市卫生健康委员会2<br>2020年3月12日<br></p>";
            //Regex regex1 = new Regex("(?<=(<p align=\"right\">))[.\\s\\S]*?(?=(</p>))", RegexOptions.IgnoreCase);
            //string Pcontent = regex1.Match(str).Groups[0].Value;
            //string[] qfrq = Pcontent.Split(new string[] { "<br>", "<br>" }, StringSplitOptions.RemoveEmptyEntries);
            //Console.WriteLine(Pcontent);
            //Console.ReadLine();
            #endregion
            #region Variant 2: every occurrence
            string sample = "<p align=\"right\">北京市卫生健康委员会<br>2020年3月16日 <br></p>qqqqq<p align=\"right\">北京市卫生健康委员会2<br>2020年3月12日<br></p>";
            Regex paragraphBody = new Regex("(?<=(<p align=\"right\">))[.\\s\\S]*?(?=(</p>))", RegexOptions.IgnoreCase);
            foreach (Match hit in paragraphBody.Matches(sample))
            {
                // Inner HTML of one <p align="right"> ... </p> element (the whole match, group 0).
                string inner = hit.Value;
                // "<br>" is listed twice exactly as before; the duplicate separator has no effect on the split.
                string[] parts = inner.Split(new string[] { "<br>","<br>" }, StringSplitOptions.RemoveEmptyEntries);
            }
            #endregion
        }
    }

 /// <summary>
 /// Removes HTML tags from <paramref name="html"/> while keeping &lt;img&gt; and &lt;br&gt; tags
 /// (opening, closing and self-closing forms, in any letter case).
 /// </summary>
 /// <param name="html">Markup to clean; may be null or empty.</param>
 /// <returns>The input with every other tag removed; null or empty input is returned unchanged.</returns>
 public static string CHEO(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            // Conditional group (?(?=keep)notag|name): when the tag name is on the keep list the
            // regex must match the literal "notag", which cannot succeed there, so the tag is left
            // alone; otherwise the tag name is consumed and the whole tag is removed.
            // \b stops "br" from also protecting names that merely start with it (e.g. <broken>).
            const string stringPattern = @"</?(?(?=(?:img|br)\b)notag|[a-zA-Z0-9]+)(?:\s[a-zA-Z0-9\-]+=?(?:(["",']?).*?\1?)?)*\s*/?>";

            // HTML tag names are case-insensitive, so <IMG> and <BR> must be kept as well.
            return Regex.Replace(html, stringPattern, "", RegexOptions.IgnoreCase);
        }

/// <summary>
/// Demo: extracts the text between an opening &lt;a ...&gt; tag and the following &lt;a&gt; marker
/// and writes it to the console using three equivalent regex APIs.
/// </summary>
public static void asdf() 
        {
            // NOTE: the sample terminates the anchor with "<a>" rather than "</a>"; the pattern mirrors that.
            var strJson = @"1231231231231<a style=\color:blue\ href=\javascript:GetOrderDetails('1156605')\>1156605<a>1231231231231233";

            // One Regex instance serves all three approaches instead of re-parsing the pattern each time.
            Regex rg = new Regex(@"<a[^>]*>([^<]*)<a>");

            // Approach 1: Matches + index access (guarded so an empty collection cannot throw).
            MatchCollection mc = rg.Matches(strJson);
            if (mc.Count > 0)
            {
                Console.WriteLine(mc[0].Groups[1].Value);
            }

            // Approach 2: single Match with a Success check.
            var dd = rg.Match(strJson);
            if (dd.Success)
            {
                Console.WriteLine(dd.Groups[1].Value);
            }

            // Approach 3: enumerate every match.
            foreach (Match match in mc)
            {
                // The pattern defines only group 1; Groups[2] is an empty placeholder whose Index is
                // always 0, so the position has to come from group 1.
                Console.WriteLine("Duplicate '{0}' found at position {1}.", match.Groups[1].Value, match.Groups[1].Index);
            }
        }

// Collect the href value and the inner text of every <a> element in HTMLstr.
// (?is) = ignore case + let '.' match newlines; the named groups "url" and "text" hold the results.
Regex reg = new Regex(@"(?is)<a(?:(?!href=).)*href=(['""]?)(?<url>[^""\s>]*)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
MatchCollection mc = reg.Matches(HTMLstr);
foreach (Match m in mc)
{
    // Both statements were missing their terminating ';' and did not compile.
    string url = m.Groups["url"].Value;
    string text = m.Groups["text"].Value;
}

提取标签属性值 
 #region 案例
                                    //Regex regex = new Regex(@"<a[^>]*>([^<]*)</a>", RegexOptions.IgnoreCase);
                                    //for (Match match = regex.Match(yHtmlDmQwStr); match.Success; match = match.NextMatch())
                                    //{
                                    //    string d = match.Groups[0].ToString();
                                    //    if (d.Contains("fjLink"))
                                    //    {
                                    //        string tmpStr = string.Format("<{0}[^>]*?{1}=(['\"\"]?)(?<href>[^'\"\"\\s>]+)\\1[^>]*>", "a", "href");
                                    //        MatchCollection titleMatch = Regex.Matches(d, tmpStr, RegexOptions.IgnoreCase);
                                    //        foreach (Match m in titleMatch)
                                    //        {
                                    //            HtmlMB[28] = HtmlMB[28] + m.Groups["href"].Value + "||";
                                    //        }
                                    //    }
                                    //} 
                                    #endregion

//去掉a标签全部
<a(?:(?!href=).)*href=(['""]?)(?<url>[^""'\s>]*)\1[^>]*>(?<text>(?:(?!</a>).)*)</a>

//正则表达式获取a标签中class指定属性值的所有a标签内容
"/<div class=\"tideTable\"(.*?)>(.*?)<\/div>/ism"
"/<a class=\"alink\"(.*?)>(.*?)<\/a>/ism"

//正则表达式参考资料：http://www.jb51.net/tools/zhengze.html
// Matches the text between "<td>" and "</td>" without including the tags themselves:
// the lookbehind anchors on "<td>", the lazy group captures as little as possible,
// and the lookahead stops at the next "</td>". Only a bare "<td>" (no attributes) is
// recognised by the lookbehind; matching ignores letter case.
Regex reg = new Regex(@"(?<=<td>)(.*?)(?=</td>)", RegexOptions.IgnoreCase);
   /// <summary>
   /// Reduces an HTML fragment to plain text: drops script, style and object elements together
   /// with their content, removes comments and all remaining tags, removes each line break that
   /// is followed by further whitespace, and turns every pair of &amp;nbsp; into one full-width space.
   /// </summary>
   /// <param name="str">HTML to clean; may be null.</param>
   /// <returns>The cleaned text, or null when <paramref name="str"/> is null.</returns>
   public static string noHtml(string str)
        {
            if (str == null)
            {
                return null;
            }

            // [\s\S]*? is lazy and crosses line breaks, so each element is removed on its own and
            // text sitting between two blocks of the same kind is preserved.
            str = Regex.Replace(str, @"<script[^>]*>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);// remove scripts
            str = Regex.Replace(str, @"<style[^>]*>[\s\S]*?</style>", "", RegexOptions.IgnoreCase);// remove style sheets
            str = Regex.Replace(str, @"<object[^>]*>[\s\S]*?</object>", "", RegexOptions.IgnoreCase);// remove object elements

            // Remove complete comments only; the old "<!--.*" rule also deleted the visible text
            // that followed a comment on the same line and leaked the body of multi-line comments.
            str = Regex.Replace(str, @"<!--[\s\S]*?-->", "");
            // Malformed input: an unterminated comment still swallows the rest of its line, and a
            // stray terminator is dropped, exactly as before.
            str = Regex.Replace(str, @"<!--.*", "");
            str = Regex.Replace(str, @"-->", "");

            // Optional, more selective rules kept for reference:
            //str = Regex.Replace(str, @"<\/*[^<>]*>", "", RegexOptions.IgnoreCase);// remove all html
            //str = Regex.Replace(str, @"<(\/){0,1}div[^<>]*>", "", RegexOptions.IgnoreCase);// remove div
            //str = Regex.Replace(str, @"<(\/){0,1}a[^<>]*>", "", RegexOptions.IgnoreCase);// remove hyperlinks
            //str = Regex.Replace(str, @"<(\/){0,1}font[^<>]*>", "", RegexOptions.IgnoreCase);// remove font styling
            //str = Regex.Replace(str, @"(class=){1,}(""|\'){0,1}\S+(""|\'|>|\s){0,1}", "", RegexOptions.IgnoreCase);// remove class attributes
            //str = Regex.Replace(str, @"(<iframe){1,}[^<>]*>[^\0]*(<\/iframe>){1,}", "", RegexOptions.IgnoreCase);// remove frames
            //str = Regex.Replace(str, @"(<script){1,}[^<>]*>[^\0]*(<\/script>){1,}", "", RegexOptions.IgnoreCase);// remove scripts

            str = Regex.Replace(str, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);// remove every remaining tag
            str = Regex.Replace(str, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);// remove line breaks followed by whitespace
            // "\u3000" is the full-width (ideographic) space the original used as a literal character.
            str = Regex.Replace(str, @"&nbsp;&nbsp;", "\u3000", RegexOptions.IgnoreCase);// replace double non-breaking spaces
            return str;
        }

//正则表达式获取指定class的所有a标签
 //<a\s+[^>]*class='fjLink'[^>]*>[^<]*</a>|<img\s+[^>]*class='fjLink'[^>]*>[^<]*</img>

GaoXianSheng7

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
正则表达式与HTML网页源码

yHtmlDmQwStr = Regex.Replace(yHtmlDmQwStr, @"<(?!img|br|p|/p|div|/div).*?>", "", RegexOptions.IgnoreCase);//保留p，br,img//yHtmlDmQwStr = Regex.Replace(yHtmlDmQwStr, @"<[^>]*>", "", Reg...
复制链接

扫一扫