using
System;
/// <summary>
/// Small console program that turns HTML markup into plain text by applying
/// a fixed sequence of regular-expression rewrites.
/// </summary>
public class Html2Text
{
    /// <summary>
    /// Program entry point. Prints a usage line when no argument is supplied;
    /// otherwise prints the plain-text form of a built-in sample document.
    /// </summary>
    /// <param name="args">Command-line arguments; only their count is inspected.</param>
    public static void Main(string[] args)
    {
        if (args.Length < 1)
        {
            Console.WriteLine(" usage: Htm2Txt xxxx.htm [target.txt] ");
            return;
        }

        // NOTE(review): the file names announced by the usage line are not read
        // or written here; a hard-coded sample is converted instead.
        Console.WriteLine(StripHTML(" <html><head><title>测试</title></head><body>哈哈</body></html> "));
    }

    /// <summary>
    /// Converts HTML markup to plain text. Table cells become tabs, line and
    /// paragraph tags become carriage returns ("\r" is the only line-break
    /// character produced), all other tags are dropped and character entities
    /// are decoded or removed.
    /// </summary>
    /// <param name="source">The HTML text to convert; may be null or empty.</param>
    /// <returns>
    /// The plain-text rendering, or <paramref name="source"/> itself when it is
    /// null or empty.
    /// </returns>
    private static string StripHTML(string source)
    {
        // A null input was the only realistic failure the former catch-all
        // handler could hide; handle it explicitly and keep returning the input.
        if (string.IsNullOrEmpty(source))
        {
            return source;
        }

        // Flatten source formatting: browsers render line breaks as a space,
        // ignore indentation tabs and collapse runs of spaces.
        string result = source.Replace("\r", " ");
        result = result.Replace("\n", " ");
        result = result.Replace("\t", string.Empty);
        result = System.Text.RegularExpressions.Regex.Replace(result, "( )+", " ");

        // Drop elements whose content must never reach the output.
        result = RemoveElement(result, "head");
        result = RemoveElement(result, "script");
        result = RemoveElement(result, "style");

        // Table cells start with a tab. The \b keeps a tag name from matching
        // longer names that merely begin with it.
        result = ReplaceIgnoreCase(result, @"<( )*td\b[^>]*>", "\t");

        // <br> and <li> start a new line (attributes and "<br/>" are accepted).
        result = ReplaceIgnoreCase(result, @"<( )*br\b[^>]*>", "\r");
        result = ReplaceIgnoreCase(result, @"<( )*li\b[^>]*>", "\r");

        // <div>, <tr> and <p> start a new paragraph (double line break).
        result = ReplaceIgnoreCase(result, @"<( )*div\b[^>]*>", "\r\r");
        result = ReplaceIgnoreCase(result, @"<( )*tr\b[^>]*>", "\r\r");
        result = ReplaceIgnoreCase(result, @"<( )*p\b[^>]*>", "\r\r");

        // Remove every remaining tag (links, images, comments, closing tags...).
        result = ReplaceIgnoreCase(result, @"<[^>]*>", string.Empty);

        // Decode entities in one pass so decoded text is never decoded again.
        result = System.Text.RegularExpressions.Regex.Replace(
            result,
            @"&(#?[a-z0-9]{2,31});",
            new System.Text.RegularExpressions.MatchEvaluator(DecodeEntity),
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);

        // Remove spaces caught between breaks/tabs; lookarounds let adjacent
        // occurrences share the same break or tab character.
        result = System.Text.RegularExpressions.Regex.Replace(result, @"(?<=[\r\t]) +(?=[\r\t])", string.Empty);
        // Tabs enclosed by two breaks are redundant.
        result = System.Text.RegularExpressions.Regex.Replace(result, @"(?<=\r)\t+(?=\r)", string.Empty);
        // Several tabs right after a break shrink to a single tab.
        result = System.Text.RegularExpressions.Regex.Replace(result, @"(?<=\r)\t+", "\t");

        // Cap runs: at most two consecutive breaks and four consecutive tabs.
        result = System.Text.RegularExpressions.Regex.Replace(result, @"\r{3,}", "\r\r");
        result = System.Text.RegularExpressions.Regex.Replace(result, @"\t{5,}", "\t\t\t\t");

        return result;
    }

    /// <summary>
    /// Removes every <paramref name="tagName"/> element together with its
    /// content. Matching is lazy, so text between two separate elements of the
    /// same kind is preserved. Callers must have replaced "\n" beforehand,
    /// because "." does not match it.
    /// </summary>
    private static string RemoveElement(string html, string tagName)
    {
        string pattern = @"<( )*" + tagName + @"\b[^>]*>.*?<( )*/( )*" + tagName + @"( )*>";
        return ReplaceIgnoreCase(html, pattern, string.Empty);
    }

    /// <summary>Case-insensitive regular-expression replace.</summary>
    private static string ReplaceIgnoreCase(string input, string pattern, string replacement)
    {
        return System.Text.RegularExpressions.Regex.Replace(
            input,
            pattern,
            replacement,
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    }

    /// <summary>
    /// Maps one matched character entity (group 1 holds its name without
    /// "&amp;" and ";") to its plain-text replacement. Entities that are not
    /// listed, including numeric ones, are removed.
    /// </summary>
    private static string DecodeEntity(System.Text.RegularExpressions.Match entity)
    {
        switch (entity.Groups[1].Value.ToLowerInvariant())
        {
            case "nbsp": return " ";
            case "bull": return " * ";
            case "lsaquo": return "<";
            case "rsaquo": return ">";
            case "trade": return "(tm)";
            case "frasl": return "/";
            case "lt": return "<";
            case "gt": return ">";
            case "copy": return "(c)";
            case "reg": return "(r)";
            case "amp": return "&";
            case "quot": return "\"";
            default: return string.Empty;
        }
    }
}