using System;
using System.Configuration;
using System.Data;
using System.Net;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
/// <summary>
/// Extracts the plain text from an HTML fragment: comments, scripts, styles and
/// tags are removed and character entities are decoded.
/// </summary>
public class HtmlExtract
{
    #region private attributes

    // Patterns are built once and shared; Regex instances are safe to reuse.
    // Singleline lets '.' cross line breaks so multi-line constructs are matched.
    private static readonly Regex s_commentRegex =
        new Regex(@"<!--.*?-->", RegexOptions.Singleline);

    private static readonly Regex s_styleRegex =
        new Regex(@"<style[^>]*?>.*?</style\s*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex s_scriptRegex =
        new Regex(@"<script[^>]*?>.*?</script\s*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex s_noScriptRegex =
        new Regex(@"<noscript[^>]*?>.*?</noscript\s*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Matches <br>, <BR>, <br/> and <br />.
    private static readonly Regex s_lineBreakRegex =
        new Regex(@"<br\s*/?>", RegexOptions.IgnoreCase);

    private static readonly Regex s_tagRegex =
        new Regex(@"<[\s\S]*?>");

    private readonly string _strHtml;

    #endregion

    #region public methods

    /// <summary>
    /// Creates an extractor for the given HTML.
    /// </summary>
    /// <param name="inStrHtml">The HTML to process; null is treated as an empty string.</param>
    public HtmlExtract(string inStrHtml)
    {
        _strHtml = inStrHtml ?? string.Empty;
    }

    /// <summary>
    /// Returns the text content of the HTML passed to the constructor.
    /// </summary>
    /// <returns>
    /// The text with comments, script/noscript/style blocks and tags removed,
    /// entities decoded, and leading/trailing whitespace trimmed.
    /// </returns>
    public string ExtractText()
    {
        string result = _strHtml;
        result = RemoveComment(result);
        result = RemoveScript(result);
        result = RemoveStyle(result);
        result = RemoveTags(result);
        return result.Trim();
    }

    #endregion

    #region private methods

    /// <summary>
    /// Removes HTML comments, including ones that contain hyphens or markup.
    /// </summary>
    private static string RemoveComment(string input)
    {
        return s_commentRegex.Replace(input, string.Empty);
    }

    /// <summary>
    /// Removes style elements together with their content.
    /// </summary>
    private static string RemoveStyle(string input)
    {
        return s_styleRegex.Replace(input, string.Empty);
    }

    /// <summary>
    /// Removes script and noscript elements together with their content.
    /// </summary>
    private static string RemoveScript(string input)
    {
        string result = s_scriptRegex.Replace(input, string.Empty);
        return s_noScriptRegex.Replace(result, string.Empty);
    }

    /// <summary>
    /// Converts line-break tags to CRLF, strips all remaining tags and decodes
    /// character entities.
    /// </summary>
    private static string RemoveTags(string input)
    {
        string result = s_lineBreakRegex.Replace(input, "\r\n");
        result = s_tagRegex.Replace(result, string.Empty);

        // Decode only after the tags are gone: escaped markup such as
        // "&lt;b&gt;" is text and must not be mistaken for a tag and removed.
        result = WebUtility.HtmlDecode(result);

        // "&nbsp;" decodes to U+00A0; normalise it to an ordinary space.
        return result.Replace('\u00A0', ' ');
    }

    #endregion
}
// Reposted from: https://blog.51cto.com/cnming/766895