#region GetOnlyTextFromHtmlCode + RemoveHtmlChars + RemoveTagFromHtmlCode
/// <summary>
/// http://www.codeproject.com/script/Content/ViewAssociatedFile.aspx?rzp=%2FKB%2Fedit%2FZetaHtmlEditControl%2F%2FZetaHtmlEditControl-Source.zip&zep=Control%2FHtmlEditControl.cs&obid=43954&obtid=2&ovid=13
/// </summary>
/// <param name="htmlCode"></param>
/// <returns></returns>
private static string getOnlyTextFromHtmlCode(string htmlCode)
{
//<br>
htmlCode = htmlCode.Replace("\r\n", @" ");
htmlCode = htmlCode.Replace("\r", @" ");
htmlCode = htmlCode.Replace("\n", @" ");
htmlCode = htmlCode.Replace(@"</p>", Environment.NewLine + Environment.NewLine);
htmlCode = htmlCode.Replace(@"</P>", Environment.NewLine + Environment.NewLine);
//html comment
htmlCode = Regex.Replace(
htmlCode,
@"<!--.*?-->",
string.Empty,
RegexOptions.Singleline | RegexOptions.IgnoreCase);
//<p>
htmlCode = Regex.Replace(htmlCode,
@"<br[^>]*>",
Environment.NewLine,
RegexOptions.Singleline | RegexOptions.IgnoreCase);
//tags
htmlCode = removeTagFromHtmlCode(@"style", htmlCode);
htmlCode = removeTagFromHtmlCode(@"script", htmlCode);
//html
htmlCode = Regex.Replace(
htmlCode,
"<(.|\n)+?>",
string.Empty,
RegexOptions.Singleline | RegexOptions.IgnoreCase);
//umlaute
htmlCode = unescapeHtmlEntities(htmlCode);
//whitespaces
htmlCode = Regex.Replace(
htmlCode,
@" +",
@" ",
RegexOptions.Singleline | RegexOptions.IgnoreCase);
return htmlCode;
}
/// <summary>
/// http://dev.w3.org/html5/html-author/charref
/// </summary>
/// <param name="htmlCode"></param>
/// <returns></returns>
private static string unescapeHtmlEntities(string htmlCode)
{
htmlCode = htmlCode.Replace(@" ", @" ");
htmlCode = htmlCode.Replace(@"Ä", @"ä");
htmlCode = htmlCode.Replace(@"&absp;", @"");
htmlCode = htmlCode.Replace(@"&obsp;", @"");
htmlCode = htmlCode.Replace(@"&Obsp;", @"");
htmlCode = htmlCode.Replace(@"&ubsp;", @"");
htmlCode = htmlCode.Replace(@"&Ubsp;", @"");
htmlCode = htmlCode.Replace(@"ß", @"ß");
htmlCode = htmlCode.Replace(@"£", @"£");
htmlCode = htmlCode.Replace(@"§", @"§");
htmlCode = htmlCode.Replace(@"©", @"©");
htmlCode = htmlCode.Replace(@"®", @"®");
htmlCode = htmlCode.Replace(@"µ", @"µ");
htmlCode = htmlCode.Replace(@"¶", @"¶");
htmlCode = htmlCode.Replace(@"Ø", @"Ø");
htmlCode = htmlCode.Replace(@"ø", @"Ø");
htmlCode = htmlCode.Replace(@"÷", @"÷");
htmlCode = htmlCode.Replace(@"×", @"×");
return htmlCode;
}
private static string removeTagFromHtmlCode(
string tag,
string htmlCode)
{
return Regex.Replace(
htmlCode,
string.Format(@"<{0}.*?</{1}>", tag, tag),
string.Empty,
RegexOptions.Singleline | RegexOptions.IgnoreCase);
}
#endregion