htmlParser.Net解析yahoo搜索结果

各位看官也许会问:why yahoo? why not use google's? 因为google的页面是用js下载搜索结果后再显示出来的,而我不会js(哭),所以选择了yahoo。

我们只想解析搜索结果,即搜索结果项的url、title、snippet三部分。为此自己写了一个OneAttributeTageFilter类,这个类有点像htmlParser提供的HasAttributeFilter,但它是为yahoo量身定做的。另外还写了Yahoo类,把解析过程包装了一下。源码如下。

/// <summary>
/// Node filter that accepts an opening tag with a given name whose given
/// attribute carries one of up to two expected values. All comparisons
/// ignore case.
/// </summary>
class OneAttributeTageFilter : NodeFilter
    {
        private readonly string m_tag;
        private readonly string m_attr;
        private readonly string m_value1;
        private readonly string m_value2;

        /// <summary>
        /// Creates a filter for tags named <paramref name="tag"/> whose attribute
        /// <paramref name="attr"/> equals <paramref name="value1"/> or
        /// <paramref name="value2"/>.
        /// </summary>
        /// <param name="tag">Tag name to match, e.g. "a" or "div".</param>
        /// <param name="attr">Name of the attribute whose value is inspected.</param>
        /// <param name="value1">First accepted attribute value.</param>
        /// <param name="value2">Second accepted attribute value, or null when only one value is accepted.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="tag"/> or <paramref name="attr"/> is null.
        /// </exception>
        public OneAttributeTageFilter(string tag, string attr, string value1, string value2)
        {
            // Fail at construction time instead of with a NullReferenceException
            // on the first call to Accept.
            if (tag == null)
            {
                throw new System.ArgumentNullException("tag");
            }
            if (attr == null)
            {
                throw new System.ArgumentNullException("attr");
            }

            this.m_tag = tag;
            this.m_attr = attr;
            this.m_value1 = value1;
            this.m_value2 = value2;
        }

        /// <summary>
        /// Decides whether <paramref name="node"/> passes the filter.
        /// </summary>
        /// <param name="node">Node produced by the parser.</param>
        /// <returns>
        /// true when the node is a non-end tag with the configured name and its
        /// configured attribute has a non-empty value equal to one of the
        /// accepted values; otherwise false.
        /// </returns>
        public bool Accept(INode node)
        {
            ITag tag = node as ITag;
            if (tag == null || tag.IsEndTag())
            {
                return false;
            }

            // Ordinal, case-insensitive comparison: tag names are protocol tokens,
            // so the result must not depend on the current culture or on how the
            // parser normalises the case of TagName.
            if (!string.Equals(tag.TagName, this.m_tag, System.StringComparison.OrdinalIgnoreCase))
            {
                return false;
            }

            TagAttribute attribute = tag.GetAttributeEx(this.m_attr);
            if (attribute == null || !attribute.Valued || attribute.Length == 0)
            {
                return false;
            }

            string attrValue = attribute.GetValue();
            if (string.Equals(this.m_value1, attrValue, System.StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }

            // A null second value means "no alternative", so it never matches.
            return this.m_value2 != null
                && string.Equals(this.m_value2, attrValue, System.StringComparison.OrdinalIgnoreCase);
        }
    }
/// <summary>
/// Helpers that download a Yahoo search result page and extract the result
/// URLs, titles and snippets from its HTML.
/// </summary>
static class Yahoo
    {
        /* Tag <a class="yschttl spt" href="xxxx" ...>xxxxx</a> contains the url and title.
         * The descriptors "a", "class" and "yschttl spt" are enough to locate the tag.
         * Since href is universally used for hyperlinks, it is not a parameter but a
         * literal inside getUrlsAndTitles.
         */
        private const string strUrlTag = "a";
        private const string strUrlTagAttr = "class";
        private const string strUrlTagValue = "yschttl spt";

        /* Tag <div class="sm-abs">xxxxx</div> or <div class="abstr">xxxxx</div> contains the snippet.
         * The descriptors "div", "class" and "sm-abs" / "abstr" are enough to locate the tag.
         * In fact, "abstr" is used more frequently.
         */
        private const string strSnpTag = "div";
        private const string strSnpAttr = "class";
        private const string strSnpValue1 = "abstr";
        private const string strSnpValue2 = "sm-abs";

        /// <summary>
        /// Downloads the page at <paramref name="strUrl"/> and appends the plain
        /// text of every snippet tag to <paramref name="sbSnippets"/>, one per line.
        /// </summary>
        /// <param name="strUrl">Address of the search result page.</param>
        /// <param name="sbSnippets">Receives one line per snippet found.</param>
        public static void getSnippets(string strUrl, StringBuilder sbSnippets)
        {
            OneAttributeTageFilter f = new OneAttributeTageFilter(Yahoo.strSnpTag, Yahoo.strSnpAttr, Yahoo.strSnpValue1, Yahoo.strSnpValue2);
            Yahoo.getSnippets(getNodes(strUrl, f), sbSnippets);
        }

        /// <summary>
        /// Downloads the page at <paramref name="strUrl"/> and appends the href and
        /// the plain text of every result link to <paramref name="sbUrls"/> and
        /// <paramref name="sbTitles"/> respectively, one per line.
        /// </summary>
        /// <param name="strUrl">Address of the search result page.</param>
        /// <param name="sbUrls">Receives one line per result link (its href value).</param>
        /// <param name="sbTitles">Receives one line per result link (its text).</param>
        public static void getUrlsAndTitles(string strUrl, StringBuilder sbUrls, StringBuilder sbTitles)
        {
            OneAttributeTageFilter f = new OneAttributeTageFilter(Yahoo.strUrlTag, Yahoo.strUrlTagAttr, Yahoo.strUrlTagValue, null);
            Yahoo.getUrlsAndTitles(getNodes(strUrl, f), sbUrls, sbTitles);
        }

        /// <summary>
        /// Downloads the HTML at <paramref name="strUrl"/> and returns the nodes the
        /// parser yields for <paramref name="filter"/>.
        /// </summary>
        private static NodeList getNodes(string strUrl, NodeFilter filter)
        {
            string strHtml;
            // WebClient is IDisposable; release it as soon as the download is done.
            using (System.Net.WebClient aWebClient = new System.Net.WebClient())
            {
                // NOTE(review): Encoding.Default depends on the machine's settings;
                // confirm it matches the charset the page is actually served in.
                aWebClient.Encoding = System.Text.Encoding.Default;
                strHtml = aWebClient.DownloadString(strUrl);
            }

            Lexer lexer = new Lexer(strHtml);
            Parser parser = new Parser(lexer);
            return parser.Parse(filter);
        }

        /// <summary>
        /// Appends the plain text of each node in <paramref name="snpNodes"/> to
        /// <paramref name="sbSnippets"/>, one per line.
        /// </summary>
        private static void getSnippets(NodeList snpNodes, StringBuilder sbSnippets)
        {
            ISimpleNodeIterator itr = snpNodes.Elements();
            while (itr.HasMoreNodes())
            {
                INode n = itr.NextNode();
                sbSnippets.AppendLine(n.ToPlainTextString());
            }
        }

        /// <summary>
        /// Appends, for each node in <paramref name="urlNodes"/>, its plain text to
        /// <paramref name="sbTitles"/> and its href value to <paramref name="sbUrls"/>.
        /// </summary>
        // Must be static: the enclosing class is static and the public overload
        // invokes this one as Yahoo.getUrlsAndTitles(...).
        private static void getUrlsAndTitles(NodeList urlNodes, StringBuilder sbUrls, StringBuilder sbTitles)
        {
            ISimpleNodeIterator itr = urlNodes.Elements();
            while (itr.HasMoreNodes())
            {
                INode n = itr.NextNode();
                sbTitles.AppendLine(n.ToPlainTextString());

                ITag tag = (ITag)n;
                TagAttribute attr = tag.GetAttributeEx("href");
                // A link without href still gets an (empty) line so that line i of
                // sbUrls keeps corresponding to line i of sbTitles.
                sbUrls.AppendLine(attr == null ? string.Empty : attr.GetValue());
            }
        }
    }

一切都在源码里了,我就不多说了。


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值