从网页上直接复制过来的新闻通常包含大量的 HTML 标记。下面两个函数用于消除这些标记,并可以有选择地保留其中一部分标记。
/// <summary>
/// Regular expression that finds the pieces of an HTML string which Html2Text replaces:
/// script/style/comment blocks, tags that are not in the keep list, and a few character entities.
/// Html2Text applies it with RegexOptions.IgnoreCase and RegexOptions.Singleline.
/// </summary>
/// <remarks>
/// The string pieces must not carry padding spaces: the pattern is not used with
/// RegexOptions.IgnorePatternWhitespace, so any space inside a piece is matched literally.
/// </remarks>
private static readonly string html2TextPattern =
    // Whole <script>, <style> (with or without attributes) and comment blocks, contents included.
    @"(?<script><script[^>]*?>.*?</script>)|(?<style><style[^>]*?>.*?</style>)|(?<comment><!--.*?-->)" +
    // Opening tags, except the kept prefixes <a, <p (bare or with attributes), <img, <br, <strong.
    // "</" is excluded here so closing tags are handled by the "end" group below.
    @"|(?<html>(?!(<a)|<p\s|(<p>)|(<img)|(<br)|(</)|(<strong))" +
    @"<[^>]+>)" +                                        // any other HTML tag
    @"|(?<quot>&(quot|#34);)" +                          // entity for "
    @"|(?<amp>&(amp|#38);)" +                            // entity for &
    @"|(?<end>(?!(</a)|(</strong)|(</p>))</[^>]+>)" +    // closing tags, except </a>, </strong>, </p>
    @"|(?<iexcl>&(iexcl|#161);)" +                       // entity for (char)161
    @"|(?<cent>&(cent|#162);)" +                         // entity for (char)162
    @"|(?<pound>&(pound|#163);)" +                       // entity for (char)163
    @"|(?<copy>&(copy|#169);)" +                         // entity for (char)169
    // NOTE(review): as written this matches "&123;" rather than the numeric reference form "&#123;";
    // confirm whether "&#(\d+);" was intended before changing it, since matches are replaced with "".
    @"|(?<others>&(\d+);)";                              // other numeric entities
/// <summary>
/// Removes HTML markup from a string while keeping the tags that html2TextPattern leaves alone.
/// </summary>
/// <remarks>
/// Works in two passes: first, paragraph breaks written as "line break + indentation" or as
/// "&lt;br&gt; followed by 2-4 spaces" are turned into &lt;P&gt;; then every match of
/// html2TextPattern is replaced with the string returned by Html2Text_Match.
/// </remarks>
/// <param name="html">HTML string to clean. Null or empty input yields an empty string.</param>
/// <returns>The cleaned text.</returns>
public static string Html2Text(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Paragraph-break detection, replaced by <P>:
    //   1. a line break followed by whitespace, when it comes after text rather than right after a tag;
    //      the look-behind keeps the preceding character instead of consuming it;
    //   2. <br> followed by 2-4 "&nbsp;" entities;
    //   3. <br> followed by 2-4 whitespace characters.
    // NOTE(review): the published pattern had lost its escapes ("s+", "( ){2,4}"); the line break in (1)
    // and the "&nbsp;" in (2) are reconstructed from the accompanying description - confirm against the
    // original source if it is available.
    string paragraphPattern =
        @"((?<=[^>\r\n])(\r\n|\r|\n)\s+)|(<br\s*/?>(&nbsp;){2,4})|(<br\s*/?>\s{2,4})";

    RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled;

    string txt = Regex.Replace(html, paragraphPattern, "<P>", options);
    return Regex.Replace(txt, html2TextPattern, new MatchEvaluator(Html2Text_Match), options);
}
/// <summary>
/// Match evaluator for the HTML-stripping pattern: returns the replacement text for one match.
/// </summary>
/// <param name="m">A match whose named groups (quot, amp, iexcl, cent, pound, copy) tell which
/// alternative matched.</param>
/// <returns>
/// The literal character for a recognised entity ("(c)" for the copyright entity);
/// an empty string for every other match, which removes the matched text.
/// </returns>
private static string Html2Text_Match(Match m)
{
    if (m.Groups["quot"].Success)
    {
        return "\"";
    }

    if (m.Groups["amp"].Success)
    {
        return "&";
    }

    if (m.Groups["iexcl"].Success)
    {
        return "¡";
    }

    if (m.Groups["cent"].Success)
    {
        return "¢";
    }

    if (m.Groups["pound"].Success)
    {
        return "£";
    }

    if (m.Groups["copy"].Success)
    {
        return "(c)";
    }

    // Everything else that matched is dropped.
    return string.Empty;
}
// NOTE(review): the lines below repeat the string pieces of the html2TextPattern initializer, but no
// declaration ("private static string html2TextPattern =") precedes them at this point - this looks
// like a duplicated paste; confirm and remove.
// NOTE(review): "<ps" and "(d+)" look like "<p\s" and "(\d+)" with the backslash lost, and the spaces
// inside each literal are matched literally unless IgnorePatternWhitespace is used - confirm.
@" (?<script><script[^>]*?>.*?</script>)|(?<style><style>.*?</style>)|(?<comment><!--.*?-->) " +
@" |(?<html>(?!(<a)|<ps|(<p>)|(<img)|(<br)|(</)|(<strong)) " + // HTML tag prefixes that are kept: <a>, <p>, <img>, <br>, <STRONG>
@" <[^>]+>) " + // HTML tag
@" |(?<quot>&(quot|#34);) " + // symbol: "
@" |(?<amp>&(amp|#38);) " + // symbol: &
@" |(?<end>(?!(</a)|(</strong)|(</p>))</[^>]+>) " + // HTML closing tags; </A>, </STRONG>, </P> are kept
@" |(?<iexcl>&(iexcl|#161);) " + // symbol: (char)161
@" |(?<cent>&(cent|#162);) " + // symbol: (char)162
@" |(?<pound>&(pound|#163);) " + // symbol: (char)163
@" |(?<copy>&(copy|#169);) " + // symbol: (char)169
@" |(?<others>&(d+);) " ; // symbol: others
/// <summary>
/// Removes HTML markup from a string while keeping the tags that html2TextPattern leaves alone.
/// </summary>
/// <remarks>
/// Works in two passes: first, paragraph breaks written as "line break + indentation" or as
/// "&lt;br&gt; followed by 2-4 spaces" are turned into &lt;P&gt;; then every match of
/// html2TextPattern is replaced with the string returned by Html2Text_Match.
/// </remarks>
/// <param name="html">HTML string to clean. Null or empty input yields an empty string.</param>
/// <returns>The cleaned text.</returns>
public static string Html2Text(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Paragraph-break detection, replaced by <P>:
    //   1. a line break followed by whitespace, when it comes after text rather than right after a tag;
    //      the look-behind keeps the preceding character instead of consuming it;
    //   2. <br> followed by 2-4 "&nbsp;" entities;
    //   3. <br> followed by 2-4 whitespace characters.
    // NOTE(review): the published pattern had lost its escapes ("s+", "( ){2,4}"); the line break in (1)
    // and the "&nbsp;" in (2) are reconstructed from the accompanying description - confirm against the
    // original source if it is available.
    string paragraphPattern =
        @"((?<=[^>\r\n])(\r\n|\r|\n)\s+)|(<br\s*/?>(&nbsp;){2,4})|(<br\s*/?>\s{2,4})";

    RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled;

    string txt = Regex.Replace(html, paragraphPattern, "<P>", options);
    return Regex.Replace(txt, html2TextPattern, new MatchEvaluator(Html2Text_Match), options);
}
/// <summary>
/// Match evaluator for the HTML-stripping pattern: returns the replacement text for one match.
/// </summary>
/// <param name="m">A match whose named groups (quot, amp, iexcl, cent, pound, copy) tell which
/// alternative matched.</param>
/// <returns>
/// The literal character for a recognised entity ("(c)" for the copyright entity);
/// an empty string for every other match, which removes the matched text.
/// </returns>
private static string Html2Text_Match(Match m)
{
    if (m.Groups["quot"].Success)
    {
        return "\"";
    }

    if (m.Groups["amp"].Success)
    {
        return "&";
    }

    if (m.Groups["iexcl"].Success)
    {
        return "¡";
    }

    if (m.Groups["cent"].Success)
    {
        return "¢";
    }

    if (m.Groups["pound"].Success)
    {
        return "£";
    }

    if (m.Groups["copy"].Success)
    {
        return "(c)";
    }

    // Everything else that matched is dropped.
    return string.Empty;
}
调用 Html2Text() 即可去掉 HTML 标记并返回处理后的文本,同时保留加粗、超链接、段落、图片和换行(<br>)标记,并将以“换行加空格”方式分段的位置替换成 <P>。