html plain text,php - HTML to plain text (for email) - Stack Overflow

In C#:

/// <summary>
/// Converts an HTML fragment or document into an approximate plain-text rendering.
/// </summary>
/// <remarks>
/// The conversion is regex based and runs in this order: collapse source whitespace,
/// drop the head/script/style sections, turn structural tags into tabs and carriage
/// returns, strip every remaining tag, decode a small set of entities, and finally
/// normalise runs of line breaks and tabs. Line breaks in the output are bare "\r".
/// Entities that are not explicitly mapped (including the ampersand entity) are
/// removed by the final catch-all entity pattern rather than decoded.
/// </remarks>
/// <param name="source">HTML text to convert. May be null or empty.</param>
/// <returns>
/// The plain-text rendering, or <paramref name="source"/> unchanged when it is null,
/// empty, or when a regex match times out.
/// </returns>
private string StripHTML(string source)
{
    // Nothing to strip; also avoids a NullReferenceException on source.Replace.
    if (string.IsNullOrEmpty(source))
    {
        return source;
    }

    try
    {
        // Browsers render raw line breaks as a space and ignore tabs and
        // repeated spaces, so flatten the markup's own formatting first.
        string result = source
            .Replace("\r", " ")
            .Replace("\n", " ")
            .Replace("\t", string.Empty);
        result = ReplaceIgnoreCase(result, @"( )+", " ");

        // Remove the header (normalise the tags first by clearing attributes).
        // The match is lazy so text between two separate sections survives.
        result = ReplaceIgnoreCase(result, @"<( )*head\b([^>])*>", "<head>");
        result = ReplaceIgnoreCase(result, @"(<( )*(/)( )*head( )*>)", "</head>");
        result = ReplaceIgnoreCase(result, @"(<head>).*?(</head>)", string.Empty);

        // Remove all scripts (normalise the tags first by clearing attributes).
        result = ReplaceIgnoreCase(result, @"<( )*script\b([^>])*>", "<script>");
        result = ReplaceIgnoreCase(result, @"(<( )*(/)( )*script( )*>)", "</script>");
        result = ReplaceIgnoreCase(result, @"(<script>).*?(</script>)", string.Empty);

        // Remove all styles (normalise the tags first by clearing attributes).
        result = ReplaceIgnoreCase(result, @"<( )*style\b([^>])*>", "<style>");
        result = ReplaceIgnoreCase(result, @"(<( )*(/)( )*style( )*>)", "</style>");
        result = ReplaceIgnoreCase(result, @"(<style>).*?(</style>)", string.Empty);

        // Insert tabs in place of <td> tags.
        result = ReplaceIgnoreCase(result, @"<( )*td\b([^>])*>", "\t");

        // Insert line breaks in place of <br> and <li> tags. The \b keeps
        // look-alike tags such as <link> out; [^>]* admits "<br />" and attributes.
        result = ReplaceIgnoreCase(result, @"<( )*br\b([^>])*>", "\r");
        result = ReplaceIgnoreCase(result, @"<( )*li\b([^>])*>", "\r");

        // Insert paragraph breaks (double line breaks) in place of
        // <div>, <tr> and <p> tags.
        result = ReplaceIgnoreCase(result, @"<( )*div\b([^>])*>", "\r\r");
        result = ReplaceIgnoreCase(result, @"<( )*tr\b([^>])*>", "\r\r");
        result = ReplaceIgnoreCase(result, @"<( )*p\b([^>])*>", "\r\r");

        // Remove remaining tags like <a>, links, images, comments etc. -
        // anything that is enclosed inside < >.
        result = ReplaceIgnoreCase(result, @"<[^>]*>", string.Empty);

        // Replace special characters. This runs after tag stripping so that
        // decoded "<" and ">" are kept as text.
        result = ReplaceIgnoreCase(result, @"&nbsp;", " ");
        result = ReplaceIgnoreCase(result, @"&bull;", " * ");
        result = ReplaceIgnoreCase(result, @"&lsaquo;", "<");
        result = ReplaceIgnoreCase(result, @"&rsaquo;", ">");
        result = ReplaceIgnoreCase(result, @"&trade;", "(tm)");
        result = ReplaceIgnoreCase(result, @"&frasl;", "/");
        result = ReplaceIgnoreCase(result, @"&lt;", "<");
        result = ReplaceIgnoreCase(result, @"&gt;", ">");
        result = ReplaceIgnoreCase(result, @"&copy;", "(c)");
        result = ReplaceIgnoreCase(result, @"&reg;", "(r)");

        // Remove all other entities. More mappings can be added above, see
        // http://hotwired.lycos.com/webmonkey/reference/special_characters/
        result = ReplaceIgnoreCase(result, @"&(.{2,6});", string.Empty);

        // Make line breaking consistent.
        result = result.Replace("\n", "\r");

        // Remove whitespace sitting between breaks/tabs so that the runs
        // below become contiguous and can be collapsed.
        result = ReplaceIgnoreCase(result, "(\r)( )+(\r)", "\r\r");
        result = ReplaceIgnoreCase(result, "(\t)( )+(\t)", "\t\t");
        result = ReplaceIgnoreCase(result, "(\t)( )+(\r)", "\t\r");
        result = ReplaceIgnoreCase(result, "(\r)( )+(\t)", "\r\t");

        // Remove tabs that are alone between two line breaks.
        result = ReplaceIgnoreCase(result, "(\r)(\t)+(\r)", "\r\r");

        // Replace multiple tabs following a line break with just one tab.
        result = ReplaceIgnoreCase(result, "(\r)(\t)+", "\r\t");

        // Collapse more than 2 consecutive breaks to 2 and more than 4
        // consecutive tabs to 4. A single quantified pass handles every run
        // length; repeated literal replacement could leave a run of 3 breaks.
        result = ReplaceIgnoreCase(result, @"\r{3,}", "\r\r");
        result = ReplaceIgnoreCase(result, @"\t{5,}", "\t\t\t\t");

        return result;
    }
    catch (System.Text.RegularExpressions.RegexMatchTimeoutException ex)
    {
        // Best effort: if a process-wide regex timeout fires, hand back the
        // input unchanged instead of failing the caller, and leave a trace.
        System.Diagnostics.Trace.TraceError("StripHTML: regex match timed out: " + ex.Message);
        return source;
    }
}

/// <summary>
/// Runs a case-insensitive regex replacement over <paramref name="input"/>.
/// </summary>
private static string ReplaceIgnoreCase(string input, string pattern, string replacement)
{
    return System.Text.RegularExpressions.Regex.Replace(
        input,
        pattern,
        replacement,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值