正则表达式过滤HTML、JS、CSS

功能用途

主要用于在提取 HTML 页面内容时,过滤其中的 JS、CSS 和 HTML 标签。

示例代码

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Net;
using System.Net.NetworkInformation;
using System.Net.Sockets;
using System.Threading;
using System.Text.RegularExpressions;
namespace HtmlRegex
{
    /// <summary>
    /// Helpers for downloading a page and reducing its HTML to plain text
    /// using regular expressions, plus a small append-to-file debug logger.
    /// </summary>
    public class BaseRegex
    {
        // All patterns let '.' cross line breaks (Singleline) and ignore tag case.
        private const RegexOptions PatternOptions = RegexOptions.Singleline | RegexOptions.IgnoreCase;

        // Comments, <script> blocks and <style> blocks are matched in ONE left-to-right
        // pass so that whichever construct opens first wins. Bodies use ".*?" (not ".+?")
        // so an empty "<script></script>" is matched on its own instead of running on
        // to the closing tag of a later block and swallowing the text in between.
        private static readonly Regex NonContentRegex = new Regex(
            @"\<!\-\-.*?\-\-\>|\<script.*?\>.*?\</script\>|\<style.*?\>.*?\</style\>",
            PatternOptions);

        // Any remaining tag, including self-closing ones.
        private static readonly Regex TagRegex = new Regex(@"\<.*?\>", PatternOptions);

        // Character entities such as "&nbsp;" (removed, not decoded).
        private static readonly Regex EntityRegex = new Regex(@"&\S{2,}?;", PatternOptions);

        // Runs of two or more CR/LF characters; note that a single "\r\n" pair qualifies.
        private static readonly Regex LineBreakRunRegex = new Regex(@"[\r\n]{2,}", PatternOptions);

        private readonly WebClient web = new WebClient();

        /// <summary>
        /// Maps the integer encoding selector used by this class to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
        private static Encoding ResolveEncoding(int encoding)
        {
            return encoding == 1 ? Encoding.UTF8 : Encoding.Default;
        }

        /// <summary>
        /// Appends <paramref name="content"/> as one line to the file at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">File to append to.</param>
        /// <param name="encoding">1 for UTF-8; any other value for <see cref="Encoding.Default"/>.</param>
        /// <param name="content">Text to write, followed by a line terminator.</param>
        public void DeBug(string path,int encoding,string content)
        {
            // 'using' guarantees the writer is flushed and the file handle released
            // even when WriteLine throws.
            using (StreamWriter writer = new StreamWriter(path, true, ResolveEncoding(encoding)))
            {
                writer.WriteLine(content);
            }
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and decodes it to a string.
        /// </summary>
        /// <param name="url">Address to download.</param>
        /// <param name="encoding">1 to decode as UTF-8; any other value to decode with <see cref="Encoding.Default"/>.</param>
        /// <returns>The decoded response body.</returns>
        public string getPageContent(string url, int encoding)
        {
            byte[] buff = web.DownloadData(url);
            return ResolveEncoding(encoding).GetString(buff);
        }

        /// <summary>
        /// Strips markup from an HTML string, leaving only its text.
        /// </summary>
        /// <remarks>
        /// Removes, in order: comments together with script and style blocks, all remaining
        /// tags, character entities, runs of two or more CR/LF characters, and finally
        /// every space character.
        /// </remarks>
        /// <param name="html">HTML to filter.</param>
        /// <returns>The filtered text; an empty string when <paramref name="html"/> is null or empty.</returns>
        public string checkHtml(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            // Comments must go before generic tag stripping: otherwise "<.*?>" cuts a
            // comment at the first '>' inside it and the remainder leaks into the output.
            html = NonContentRegex.Replace(html, "");
            html = TagRegex.Replace(html, "");
            html = EntityRegex.Replace(html, "");
            html = LineBreakRunRegex.Replace(html, "");
            return html.Replace(" ", "");
        }
    }
}

转载于:https://www.cnblogs.com/shya/p/2439443.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值