asp.net如何去掉HTML标记。.net去除html标记

最新推荐文章于 2021-06-09 21:29:55 发布

hljqfl

最新推荐文章于 2021-06-09 21:29:55 发布

阅读量678

点赞数

分类专栏： C# .NET

本文链接：https://blog.csdn.net/hljqfl/article/details/86316145

版权

C# 同时被 2 个专栏收录

10 篇文章 0 订阅

订阅专栏

.NET

10 篇文章 0 订阅

订阅专栏

asp.net如何去掉HTML标记

/// <summary>
/// Strips HTML markup from a string and returns the remaining text, HTML-encoded.
/// </summary>
/// <param name="Htmlstring">Source text that may contain HTML markup.</param>
/// <returns>
/// The text with script blocks, tags, comment remnants and angle brackets removed,
/// a handful of common entities decoded, then HTML-encoded and trimmed.
/// Returns an empty string when the input is null or empty.
/// </returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Remove script blocks. Singleline lets '.' cross line breaks so that a
    // multi-line script body is removed together with its tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove the remaining tags, line-leading whitespace and comment remnants.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode the entities this helper knows about.
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

    // Any other numeric character reference is dropped.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // Strings are immutable: the result of Replace has to be assigned back,
    // otherwise these three clean-up steps have no effect.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");

    // WebUtility.HtmlEncode does not need an active HTTP request, unlike
    // HttpContext.Current.Server.HtmlEncode, which fails when HttpContext.Current is null.
    return System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();
}

///提取HTML代码中文字的C#函数

/// <summary>

/// 去除HTML标记

/// </summary>

/// <param name="strHtml">包括HTML的源码 </param>

/// <returns>已经去除后的文字</returns>

using System;

using System.Text.RegularExpressions;

public class StripHTMLTest{

// Demo entry point: strips a fixed sample document and prints the remaining text.
public static void Main(){
    const string sampleHtml = "<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>";
    Console.WriteLine(StripHTML(sampleHtml));
}

/// <summary>
/// Extracts the text from an HTML fragment by applying an ordered list of
/// regular-expression rewrite rules.
/// </summary>
/// <param name="strHtml">Source text that may contain HTML markup.</param>
/// <returns>
/// The text with script blocks, tags and comments removed and a handful of common
/// entities decoded. Returns an empty string when the input is null or empty.
/// </returns>
public static string StripHTML(string strHtml){
    if (string.IsNullOrEmpty(strHtml)){
        return string.Empty;
    }

    // Each row is { pattern, replacement }. Keeping both in one table guarantees the
    // replacement stays next to the pattern it belongs to; with two parallel arrays
    // a missing entry shifts every later replacement and overruns the shorter array.
    string[,] rules = {
        // Script blocks; (?s) lets '.' cross line breaks.
        { @"(?s)<script[^>]*?>.*?</script>", "" },
        // Opening/closing tags, with quoted attribute values allowed to contain '>'.
        // The alternatives start with different characters, which avoids the
        // exponential backtracking of nested "\w+" quantifiers on malformed input.
        { @"<(\/\s*)?!?(\w+:)?\w+(?:[^<>""']|""[^""]*""|'[^']*')*>", "" },
        // A line break followed by further whitespace.
        { @"([\r\n])[\s]+", "" },
        { @"&(quot|#34);", "\"" },
        { @"&(amp|#38);", "&" },
        { @"&(lt|#60);", "<" },
        { @"&(gt|#62);", ">" },
        { @"&(nbsp|#160);", " " },
        { @"&(iexcl|#161);", "\u00A1" },
        { @"&(cent|#162);", "\u00A2" },
        { @"&(pound|#163);", "\u00A3" },
        { @"&(copy|#169);", "\u00A9" },
        // Any other numeric character reference is dropped.
        { @"&#(\d+);", "" },
        // Turn a comment terminator into a line break so that the next rule,
        // which runs to the end of the line, removes the whole comment.
        { @"-->", "\r\n" },
        { @"<!--.*\n", "" }
    };

    string strOutput = strHtml;
    for (int i = 0; i < rules.GetLength(0); i++){
        strOutput = Regex.Replace(strOutput, rules[i, 0], rules[i, 1], RegexOptions.IgnoreCase);
    }

    // Strings are immutable: assign the result back, otherwise Replace is a no-op.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");
    return strOutput;
}

写一个静态方法

#region 移除HTML标签

/// <summary>
/// Removes every "&lt;...&gt;" tag from the given text.
/// </summary>
/// <param name="HTMLStr">Text that may contain HTML tags.</param>
/// <returns>
/// The text with all tags removed; an empty string when the input is null or empty.
/// </returns>
public static string ParseTags(string HTMLStr)
{
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return string.Empty;
    }

    return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>", "");
}

#endregion

#region 取出文本中的图片地址

/// <summary>
/// Returns the value of the src attribute of the first img tag found in the text.
/// </summary>
/// <param name="HTMLStr">Text that may contain HTML img tags.</param>
/// <returns>
/// The src value exactly as written in the markup (original letter case, without the
/// surrounding quotes), or an empty string when no img tag with a src attribute is found.
/// </returns>
public static string GetImgUrl(string HTMLStr)
{
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return string.Empty;
    }

    // IgnoreCase matches "IMG"/"SRC" without lower-casing the input, which would
    // also lower-case the returned URL. The three alternatives cover double-quoted,
    // single-quoted and unquoted attribute values so no quote ends up in the result.
    Match m = Regex.Match(
        HTMLStr,
        @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))",
        RegexOptions.IgnoreCase);

    return m.Success ? m.Groups["url"].Value : string.Empty;
}

#endregion

=================================================================================

.net去除html标记

去除html标记

分类:.NET技术点滴

2007.10.25 11:23 作者：海浪 | 评论：0 | 阅读：337

相信大家在做博客，文章管理系统之类的时候经常会遇到这样的问题：

　　把一些文章抽出来放在首页，当然这些文章要显示内容简介。但是问题来了，这些内容简介有时候有HTML，有时候没有，如果单纯的去截断字符似乎有点笨拙，特别是在DIV页面上，很容易就截断了HTML，导致页面变形了。

　　解决办法有很多，有些人是用div的样式去隐藏来解决，但是还是不完整，导致页面变形，或者直接显示出整篇文章来。

　　下面的代码就可以解决这个问题，（:) 代码很简单，只是在于你有没有认真去想而已，在这里就不对代码做详细说明了。）

　　使用方法：直接调用 StripLongContent("你的内容",你要显示在页面上的长度);

// Matches one markup tag: '<', a run of characters other than '<' and '>', then '>'.
// The closing '>' is mandatory so that a bare '<' inside text is not taken for a tag.
private static readonly Regex Content_regex = new Regex("<[^<>]+>", RegexOptions.Compiled);

/// <summary>
/// Shortens HTML content to at most <paramref name="length"/> characters of text
/// while keeping the markup balanced. Characters inside tags do not count towards
/// the limit, and every element still open at the cut-off point gets a closing tag.
/// </summary>
/// <param name="content">HTML content to shorten.</param>
/// <param name="length">Maximum number of text characters (outside tags) to keep.</param>
/// <returns>
/// The content itself (after script removal) when it is not longer than
/// <paramref name="length"/>; otherwise the shortened markup. Returns an empty
/// string when <paramref name="content"/> is null or empty.
/// </returns>
/// <exception cref="System.ArgumentOutOfRangeException">
/// <paramref name="length"/> is negative.
/// </exception>
public static string StripLongContent(string content, int length)
{
    if (length < 0)
    {
        throw new System.ArgumentOutOfRangeException("length", "length must not be negative.");
    }

    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    // Remove script first.
    content = StripScriptTags(content);

    if (content.Length <= length)
    {
        return content;
    }

    System.Text.StringBuilder output = new System.Text.StringBuilder(content.Length);
    System.Collections.Generic.Stack<string> openTags = new System.Collections.Generic.Stack<string>();
    int position = 0;       // index of the first character not yet copied
    int remaining = length; // text characters that may still be emitted

    for (Match tag = Content_regex.Match(content); tag.Success; tag = tag.NextMatch())
    {
        int textLength = tag.Index - position;
        if (textLength >= remaining)
        {
            // The budget runs out inside (or exactly at the end of) this text run.
            break;
        }

        output.Append(content, position, textLength);
        remaining -= textLength;

        output.Append(tag.Value);
        TrackOpenTags(tag.Value, openTags);
        position = tag.Index + tag.Length;
    }

    // Whatever starts at 'position' is plain text for at least 'remaining' characters,
    // or runs to the end of the content when no further tag was found.
    output.Append(content, position, System.Math.Min(remaining, content.Length - position));

    // Close the elements that are still open, innermost first.
    while (openTags.Count > 0)
    {
        output.Append("</").Append(openTags.Pop()).Append('>');
    }

    return output.ToString();
}

// Keeps the stack of currently open element names in step with one emitted tag.
private static void TrackOpenTags(string tag, System.Collections.Generic.Stack<string> openTags)
{
    // Comments, doctype, processing instructions and self-closing tags open nothing.
    if (tag[1] == '!' || tag[1] == '?' || tag[tag.Length - 2] == '/')
    {
        return;
    }

    bool isClosing = tag[1] == '/';
    int start = isClosing ? 2 : 1;
    int end = start;
    while (end < tag.Length && tag[end] != '>' && tag[end] != '/' && !char.IsWhiteSpace(tag[end]))
    {
        end++;
    }

    string name = tag.Substring(start, end - start);
    if (name.Length == 0)
    {
        return;
    }

    if (isClosing)
    {
        // Only pop when the closer matches the innermost open element; a stray
        // closing tag must not underflow the stack.
        if (openTags.Count > 0 && string.Equals(openTags.Peek(), name, System.StringComparison.OrdinalIgnoreCase))
        {
            openTags.Pop();
        }
        return;
    }

    // Void elements never take a closing tag.
    switch (name.ToLowerInvariant())
    {
        case "area":
        case "base":
        case "br":
        case "col":
        case "embed":
        case "hr":
        case "img":
        case "input":
        case "link":
        case "meta":
        case "param":
        case "source":
        case "track":
        case "wbr":
            return;
    }

    openTags.Push(name);
}

/// <summary>
/// Removes script blocks and "javascript:" URL schemes from HTML content.
/// </summary>
/// <remarks>
/// NOTE(review): the published listing is truncated inside this method (both pattern
/// literals are cut off), so the two patterns below are a reconstruction based on the
/// method's stated purpose - confirm against the original source if it is available.
/// This is a best-effort clean-up, not a sanitizer for untrusted markup.
/// </remarks>
/// <param name="content">HTML content to clean.</param>
/// <returns>
/// The content without script blocks; an empty string when the input is null or empty.
/// </returns>
public static string StripScriptTags(string content)
{
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    // Singleline lets '.' cross line breaks so multi-line script bodies are removed too.
    string cleanText = Regex.Replace(
        content,
        @"<script\b[^>]*>.*?</script\s*>",
        "",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Drop a "javascript:" scheme that directly follows an opening quote, leaving the
    // quote itself in place so attribute quoting stays balanced.
    cleanText = Regex.Replace(
        cleanText,
        @"(?<=[""'])\s*javascript\s*:",
        "",
        RegexOptions.IgnoreCase);

    return cleanText;
}

hljqfl

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
asp.net如何去掉HTML标记。.net去除html标记

asp.net如何去掉HTML标记 /// &lt;summary&gt; /// 去除HTML标记 /// &lt;/summary&gt; /// &lt;param name="NoHTML"&gt;包括HTML的源码 &lt;/param&gt; /// &lt;returns&gt;已经去除后的文字&lt;/ret..
复制链接

扫一扫

专栏目录