//
// ------------
// Alternation of everything Html2Text strips or translates. Every alternative is a
// named group so that Html2Text_Match can tell which one fired.
//
// The fragments are concatenated without any padding: Html2Text runs this pattern
// without RegexOptions.IgnorePatternWhitespace, so a blank inside a fragment would
// have to be matched literally (and under that option '#' would start a comment).
//
// Order matters: the specific entities and the non-breaking space come before the
// catch-all "others" group, otherwise e.g. &#160; would be swallowed by it.
private static string html2TextPattern =
    @"(?<script><script[^>]*?>.*?</script>)" +      // <script> element including its body
    @"|(?<style><style[^>]*?>.*?</style>)" +        // <style> element (attributes allowed) including its body
    @"|(?<comment><!--.*?-->)" +                    // HTML comment
    @"|(?<html><[^>]+>)" +                          // any other HTML tag
    @"|(?<quot>&(quot|#34);)" +                     // entity: "
    @"|(?<amp>&(amp|#38);)" +                       // entity: &
    @"|(?<lt>&(lt|#60);)" +                         // entity: <
    @"|(?<gt>&(gt|#62);)" +                         // entity: >
    @"|(?<iexcl>&(iexcl|#161);)" +                  // entity: (char)161
    @"|(?<cent>&(cent|#162);)" +                    // entity: (char)162
    @"|(?<pound>&(pound|#163);)" +                  // entity: (char)163
    @"|(?<copy>&(copy|#169);)" +                    // entity: (char)169
    @"|(?<space>&(nbsp|#160);)" +                   // non-breaking space
    @"|(?<others>&#\d+;)";                          // any other numeric character reference
/// <summary>
/// Converts an HTML string to plain text by running <c>html2TextPattern</c> over it
/// and replacing every match with the text chosen by <c>Html2Text_Match</c>.
/// </summary>
/// <param name="html">The HTML string to convert. <c>null</c> or empty yields an empty string.</param>
/// <param name="keepFormat">
/// <c>true</c> to leave whitespace (including line breaks) untouched;
/// <c>false</c> to turn every whitespace character into a blank, trim the result
/// and collapse runs of blanks into one.
/// </param>
/// <returns>The converted text.</returns>
public static string Html2Text(string html, bool keepFormat)
{
    // Regex.Replace throws on a null input; there is nothing to convert anyway.
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    string pattern = html2TextPattern;
    if (!keepFormat)
    {
        // Extra alternative: any whitespace character (line breaks included) is
        // reported as the "control" group, which the evaluator turns into a blank.
        pattern += @"|(?<control>\s)";
    }

    // Singleline lets '.' cross line breaks, so multi-line script/style/comment
    // blocks are matched as a whole.
    RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled;
    string txt = Regex.Replace(html, pattern, new MatchEvaluator(Html2Text_Match), options);

    if (keepFormat)
    {
        return txt;
    }

    // Collapse consecutive blanks left behind by removed markup.
    return Regex.Replace(txt.Trim(), "[ ]+", " ", options);
}
/// <summary>
/// Match evaluator for <c>Html2Text</c>: picks the replacement text according to
/// which named group of the pattern matched.
/// </summary>
/// <param name="m">The match produced by the HTML-to-text pattern.</param>
/// <returns>
/// The decoded character for a recognised entity, a single blank for the
/// "space" and "control" groups, and an empty string for every other match
/// (so the matched text is removed).
/// </returns>
private static string Html2Text_Match(System.Text.RegularExpressions.Match m)
{
    if (m.Groups["quot"].Success)
    {
        return "\"";
    }
    if (m.Groups["amp"].Success)
    {
        return "&";
    }
    if (m.Groups["lt"].Success)
    {
        return "<";
    }
    if (m.Groups["gt"].Success)
    {
        return ">";
    }
    if (m.Groups["iexcl"].Success)
    {
        return "\u00A1"; // inverted exclamation mark, (char)161
    }
    if (m.Groups["cent"].Success)
    {
        return "\u00A2"; // cent sign, (char)162
    }
    if (m.Groups["pound"].Success)
    {
        return "\u00A3"; // pound sign, (char)163
    }
    if (m.Groups["copy"].Success)
    {
        return "(c)";    // ASCII stand-in for the copyright sign
    }
    if (m.Groups["space"].Success || m.Groups["control"].Success)
    {
        return " ";
    }

    // Anything else that matched is dropped from the output.
    return string.Empty;
}
// NOTE: a second, line-for-line identical copy of html2TextPattern, Html2Text and
// Html2Text_Match stood here. Declaring the same field and the same method
// signatures twice in one type does not compile (CS0102 / CS0111), so the duplicate
// was removed; the declarations earlier in this file are the only ones.