几个关于 HTML 解析的 C# 类

命名空间里有两个类:1. HtmlUtil;2. Htmlpage。两者都是在别人写好的类的基础上修改而来,其中做了不少改动。一开始是用 HtmlUtil 解析网页,它使用正则表达式解析 HTML,但后来发现某些情况下解析效果不太好。之后在 SourceForge 上发现了 MIL.Html(即代码中引用的 MIL.Html 命名空间),拿过来用了一下,效果还不错。

  1 None.gif using  System;
  2 None.gif using  System.Collections.Generic;
  3 None.gif using  System.Text;
  4 None.gif using  System.Text.RegularExpressions;
  5 None.gif using  MIL.Html;
  6 None.gif
  7 None.gif namespace  Yuanso.Sitework.Crawler
  8 ExpandedBlockStart.gifContractedBlock.gif dot.gif {
  9InBlock.gif     public class HtmlUtil
 10ExpandedSubBlockStart.gifContractedSubBlock.gif     dot.gif{
 11ExpandedSubBlockStart.gifContractedSubBlock.gif         /**//// <summary>
 12InBlock.gif         /// Written:     [CHINA] Zhang Liu 
 13InBlock.gif         /// Date:        1,Jun,2006 
 14InBlock.gif         /// Version:     1.0
 15InBlock.gif         /// Support:     MYBASK <see cref="http://www.mybask.net"/>
 16InBlock.gif         /// Looking for latest version or similar implementation of this function, please visit: <seealso cref="http://www.mybask.net"/>
 17InBlock.gif         /// Summary:
 18InBlock.gif         /// Picking up text content from a html document. This function will remove:
 19InBlock.gif         /// 1. <%=%>
 20InBlock.gif         /// 2. script
 21InBlock.gif         /// 3. style
 22InBlock.gif         /// 4. html tags
 23InBlock.gif         /// 6. &nbsp; and others
 24InBlock.gif         /// 7. html comments
 25InBlock.gif         /// After all above removed, \r\n will be replaced by an empty character.
 26InBlock.gif         /// </summary>
 27InBlock.gif         /// <param name="strHtml">string:Waiting for striping html,javascript, style elements</param>
 28ExpandedSubBlockEnd.gif         /// <returns>string: Stripped text</returns>

 29InBlock.gif         public static string ExtractContent(string strHtml)
 30ExpandedSubBlockStart.gifContractedSubBlock.gif         dot.gif{
 31InBlock.gif             //All the regular expression for matching html, javascript, style elements and others.
 32ExpandedSubBlockStart.gifContractedSubBlock.gif             string[] aryRegex =dot.gif{@"<%=[\w\W]*?%>",    @"<script[\w\W]*?</script>",     @"<style[\w\W]*?</style>",   @"<[/]?[\w\W]*?>",   @"([\r\n])[\s]+",
 33InBlock.gif                                 @"&(nbsp|#160);",    @"&(iexcl|#161);",               @"&(cent|#162);",            @"&(pound|#163);",   @"&(copy|#169);",
 34ExpandedSubBlockEnd.gif                                 @"(\d+);",         @"-->",                          @"<!--.*\n"}
;
 35InBlock.gif             //Corresponding replacment to the regular expressions.
 36InBlock.gif             //string[] aryReplacment = { "", "", "", "", "", " ", "\xa1", "\xa2", "\xa3", "\xa9", "", "\r\n", "" };
 37ExpandedSubBlockStart.gifContractedSubBlock.gif             string[] aryReplacment = dot.gif""""""""""" """"""""""""""" };
 38InBlock.gif             string strStripped = strHtml;
 39InBlock.gif             //Loop to replacing.
 40InBlock.gif             for (int i = 0; i < aryRegex.Length; i++)
 41ExpandedSubBlockStart.gifContractedSubBlock.gif             dot.gif{
 42InBlock.gif                 Regex regex = new Regex(aryRegex[i], RegexOptions.IgnoreCase);
 43InBlock.gif                 strStripped = regex.Replace(strStripped, aryReplacment[i]);
 44ExpandedSubBlockEnd.gif             }

 45InBlock.gif             //Replace "\r\n" to an empty character.
 46InBlock.gif             strStripped.Replace("\r\n""");
 47InBlock.gif             strStripped.Replace("\t""");
 48InBlock.gif             //Return stripped string.
 49InBlock.gif             return strStripped;
 50ExpandedSubBlockEnd.gif         }

 51InBlock.gif         public static string ExtractTitle(string strHtml)
 52ExpandedSubBlockStart.gifContractedSubBlock.gif         dot.gif{
 53InBlock.gif
 54InBlock.gif             string title;
 55InBlock.gif             //string titleResult;
 56InBlock.gif             Match m;
 57InBlock.gif             string titlePatern = @"<title[^>]*?>.*?</title>";
 58InBlock.gif             Regex regex = new Regex(titlePatern, RegexOptions.IgnoreCase);
 59InBlock.gif             m = regex.Match(strHtml);
 60InBlock.gif             if (m.Success)
 61ExpandedSubBlockStart.gifContractedSubBlock.gif             dot.gif{
 62InBlock.gif                 title = m.Value.ToString();
 63InBlock.gif                 title = title.Replace("<title>""");
 64InBlock.gif                 title = title.Replace("</title>""");
 65ExpandedSubBlockEnd.gif             }

 66InBlock.gif             else title = "无标题";
 67InBlock.gif
 68InBlock.gif             return title;
 69ExpandedSubBlockEnd.gif         }

 70ExpandedSubBlockStart.gifContractedSubBlock.gif         /**//// <summary>
 71InBlock.gif         /// 此私有方法从一段HTML文本中提取出一定字数的纯文本
 72InBlock.gif         /// </summary>
 73InBlock.gif         /// <param name="instr">HTML代码</param>
 74InBlock.gif         /// <param name="firstN">提取从头数多少个字</param>
 75InBlock.gif         /// <param name="withLink">是否要链接里面的字</param>
 76ExpandedSubBlockEnd.gif         /// <returns>纯文本</returns>

 77InBlock.gif         public static string getFirstNchar(string instr, int firstN, bool withLink)
 78ExpandedSubBlockStart.gifContractedSubBlock.gif         dot.gif{
 79InBlock.gif             string strStripped;
 80InBlock.gif             strStripped = instr.Clone() as string;
 81InBlock.gif             strStripped = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(strStripped, "");
 82InBlock.gif             strStripped = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(strStripped, "");
 83InBlock.gif             strStripped = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(strStripped, "");
 84InBlock.gif             if (!withLink) strStripped = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(strStripped, "");
 85InBlock.gif             Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase);
 86InBlock.gif             strStripped = objReg.Replace(strStripped, "");
 87InBlock.gif             Regex objReg2 = new System.Text.RegularExpressions.Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);
 88InBlock.gif             strStripped = objReg2.Replace(strStripped, " ");
 89InBlock.gif             //return strStripped.Length > firstN ? strStripped.Substring(0, firstN) : strStripped;
 90InBlock.gif             return strStripped;
 91ExpandedSubBlockEnd.gif         }

 92InBlock.gif
 93InBlock.gif         public static string getTitle(string strHtml)
 94ExpandedSubBlockStart.gifContractedSubBlock.gif         dot.gif{
 95InBlock.gif             string title="";
 96InBlock.gif             Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
 97InBlock.gif             Match mc = reg.Match(strHtml);
 98InBlock.gif             if (mc.Success)
 99InBlock.gif                 title = mc.Groups["title"].Value.Trim();
100InBlock.gif
101InBlock.gif             return title;
102ExpandedSubBlockEnd.gif         }

103ExpandedSubBlockEnd.gif     }

104InBlock.gif     public class Htmlpage
105ExpandedSubBlockStart.gifContractedSubBlock.gif     dot.gif{
106InBlock.gif         public static string GetTitle(string strHtml)
107ExpandedSubBlockStart.gifContractedSubBlock.gif         dot.gif{
108InBlock.gif             MIL.Html.HtmlDocument documnet;
109InBlock.gif             HtmlParser parser = new HtmlDomainTreeParser();
110InBlock.gif             documnet = parser.Parse(strHtml);
111InBlock.gif             StringBuilder text = new StringBuilder("");
112InBlock.gif             foreach (HtmlNode node in documnet.Nodes.FindAllText(true))
113ExpandedSubBlockStart.gifContractedSubBlock.gif             dot.gif{
114InBlock.gif
115InBlock.gif                 HtmlText textNode;
116InBlock.gif                 textNode = (HtmlText)node;
117InBlock.gif                 if (!textNode.Text.Contains("\r"&& !textNode.Text.Contains("\n"))
118ExpandedSubBlockStart.gifContractedSubBlock.gif                 dot.gif{
119InBlock.gif                     text.Append(textNode.Text);
120InBlock.gif                     break;
121ExpandedSubBlockEnd.gif                 }

122InBlock.gif                
123ExpandedSubBlockEnd.gif             }

124InBlock.gif             return text.ToString();
125InBlock.gif
126ExpandedSubBlockEnd.gif         }

127InBlock.gif         public static string GetContent(string strHtml)
128ExpandedSubBlockStart.gifContractedSubBlock.gif         dot.gif{
129InBlock.gif             MIL.Html.HtmlDocument documnet;
130InBlock.gif             HtmlParser parser = new HtmlDomainTreeParser();
131InBlock.gif             documnet = parser.Parse(strHtml);
132InBlock.gif             StringBuilder text = new StringBuilder();
133InBlock.gif             foreach (HtmlNode node in documnet.Nodes.FindAllText(true))
134ExpandedSubBlockStart.gifContractedSubBlock.gif             dot.gif{
135InBlock.gif
136InBlock.gif                 HtmlText textNode;
137InBlock.gif                 textNode = (HtmlText)node;
138InBlock.gif                 if (textNode.Text.Contains("\r"|| textNode.Text.Contains("\n"))
139InBlock.gif                     continue;
140InBlock.gif                 else text.Append(textNode.Text);
141InBlock.gif
142ExpandedSubBlockEnd.gif             }

143InBlock.gif             return text.ToString();
144InBlock.gif
145ExpandedSubBlockEnd.gif         }

146ExpandedSubBlockEnd.gif     }

147InBlock.gif
148ExpandedBlockEnd.gif}

149 None.gif

转载于:https://www.cnblogs.com/jadepark/archive/2007/08/01/838907.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值