htmlParser.Net解析yahoo搜索结果

最新推荐文章于 2015-08-16 23:59:00 发布

zzxian

最新推荐文章于 2015-08-16 23:59:00 发布

阅读量706

点赞数

分类专栏： open source 文章标签： yahoo string class null hyperlink filter

本文链接：https://blog.csdn.net/zzxian/article/details/6748592

版权

open source 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

各位看官也许会问：why yahoo? why not google? 因为google的页面是先用js下载搜索结果、然后再显示出来的，而我不会js，所以选择了yahoo。

我们只想解析搜索结果，即每个搜索结果项的url、title、snippet三部分。为此自己写了一个OneAttributeTageFilter类，这个类有点像htmlParser提供的HasAttributeFilter，但它是为yahoo量身定做的。另外还写了一个Yahoo类，把解析过程包装了一下。源码如下。

/// <summary>
/// Node filter that accepts a start tag with a given tag name whose given
/// attribute has one of up to two expected values.
/// All comparisons are ordinal and case-insensitive.
/// </summary>
class OneAttributeTageFilter : NodeFilter
{
    private readonly string m_tag;
    private readonly string m_attr;
    private readonly string m_value1;
    private readonly string m_value2;

    /// <summary>
    /// Creates a filter for tags named <paramref name="tag"/> whose attribute
    /// <paramref name="attr"/> equals <paramref name="value1"/> or <paramref name="value2"/>.
    /// </summary>
    /// <param name="tag">Tag name to match, e.g. "a" or "div" (any letter case).</param>
    /// <param name="attr">Name of the attribute to inspect, e.g. "class".</param>
    /// <param name="value1">First accepted attribute value.</param>
    /// <param name="value2">Optional second accepted attribute value; pass null to match only <paramref name="value1"/>.</param>
    public OneAttributeTageFilter(string tag, string attr, string value1, string value2)
    {
        this.m_tag = tag;
        this.m_attr = attr;
        this.m_value1 = value1;
        this.m_value2 = value2;
    }

    /// <summary>
    /// Decides whether <paramref name="node"/> passes the filter.
    /// </summary>
    /// <param name="node">Node handed over by the parser.</param>
    /// <returns>
    /// true when the node is a non-end tag with the configured name and its
    /// configured attribute carries one of the accepted values; otherwise false.
    /// </returns>
    public bool Accept(INode node)
    {
        ITag tag = node as ITag;
        if (tag == null || tag.IsEndTag())
        {
            return false;
        }

        // Ordinal comparison instead of ToUpper(): culture-sensitive upper-casing
        // (e.g. Turkish 'i' -> 'İ') would make "div" fail to match "DIV".
        if (!string.Equals(tag.TagName, this.m_tag, System.StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }

        TagAttribute attribute = tag.GetAttributeEx(this.m_attr);
        if (attribute == null || !attribute.Valued || attribute.Length == 0)
        {
            return false;
        }

        string attrValue = attribute.GetValue();
        if (string.Equals(this.m_value1, attrValue, System.StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }

        // The second value is optional; a null m_value2 never matches anything.
        return this.m_value2 != null
            && string.Equals(this.m_value2, attrValue, System.StringComparison.OrdinalIgnoreCase);
    }
}

/// <summary>
/// Helpers that download a Yahoo search result page and extract the result
/// urls, titles and snippets from it with HtmlParser.Net.
/// </summary>
static class Yahoo
{
    /* TagNode <a class="yschttl spt" href="xxxx" .... >xxxxx</a> contains the url and title.
     * Descriptors "a", "class" and "yschttl spt" are able to locate the tag.
     * Since href is universally used for hyperlinks, it is not a parameter but a constant in the program.
     */
    private const string strUrlTag = "a";
    private const string strUrlTagAttr = "class";
    private const string strUrlTagValue = "yschttl spt";
    private const string strHrefAttr = "href";

    /* TagNode <div class="sm-abs">xxxxx</div> or <div class="abstr">xxxxx</div> contains the snippet.
     * Descriptors "div", "class", and "sm-abs" / "abstr" are able to locate this tag.
     * In fact, "abstr" is used more frequently.
     */
    private const string strSnpTag = "div";
    private const string strSnpAttr = "class";
    private const string strSnpValue1 = "abstr";
    private const string strSnpValue2 = "sm-abs";

    /// <summary>
    /// Downloads the page at <paramref name="strUrl"/> and appends the plain text
    /// of every snippet node to <paramref name="sbSnippets"/>, one per line.
    /// </summary>
    /// <param name="strUrl">Address of the search result page.</param>
    /// <param name="sbSnippets">Receives one line per snippet.</param>
    public static void getSnippets(string strUrl, StringBuilder sbSnippets)
    {
        OneAttributeTageFilter f = new OneAttributeTageFilter(Yahoo.strSnpTag, Yahoo.strSnpAttr, Yahoo.strSnpValue1, Yahoo.strSnpValue2);
        Yahoo.getSnippets(getNodes(strUrl, f), sbSnippets);
    }

    /// <summary>
    /// Downloads the page at <paramref name="strUrl"/> and appends, for every
    /// result link, its href to <paramref name="sbUrls"/> and its plain text to
    /// <paramref name="sbTitles"/>, one per line.
    /// </summary>
    /// <param name="strUrl">Address of the search result page.</param>
    /// <param name="sbUrls">Receives one line per result url.</param>
    /// <param name="sbTitles">Receives one line per result title.</param>
    public static void getUrlsAndTitles(string strUrl, StringBuilder sbUrls, StringBuilder sbTitles)
    {
        OneAttributeTageFilter f = new OneAttributeTageFilter(Yahoo.strUrlTag, Yahoo.strUrlTagAttr, Yahoo.strUrlTagValue, null);
        Yahoo.getUrlsAndTitles(getNodes(strUrl, f), sbUrls, sbTitles);
    }

    /// <summary>
    /// Downloads the page at <paramref name="strUrl"/> and returns the nodes
    /// that <paramref name="filter"/> accepts.
    /// </summary>
    private static NodeList getNodes(string strUrl, NodeFilter filter)
    {
        string strHtml;
        // WebClient is IDisposable; release it as soon as the download is done.
        using (System.Net.WebClient aWebClient = new System.Net.WebClient())
        {
            // NOTE(review): Encoding.Default is the machine's ANSI code page; if the
            // page is served as UTF-8 the text may be garbled - confirm and adjust.
            aWebClient.Encoding = System.Text.Encoding.Default;
            strHtml = aWebClient.DownloadString(strUrl);
        }

        Lexer lexer = new Lexer(strHtml);
        Parser parser = new Parser(lexer);
        return parser.Parse(filter);
    }

    /// <summary>
    /// Appends the plain text of each node in <paramref name="snpNodes"/> to
    /// <paramref name="sbSnippets"/>, one per line.
    /// </summary>
    private static void getSnippets(NodeList snpNodes, StringBuilder sbSnippets)
    {
        ISimpleNodeIterator itr = snpNodes.Elements();
        while (itr.HasMoreNodes())
        {
            INode n = itr.NextNode();
            sbSnippets.AppendLine(n.ToPlainTextString());
        }
    }

    /// <summary>
    /// Appends the plain text of each node in <paramref name="urlNodes"/> to
    /// <paramref name="sbTitles"/> and its href value to <paramref name="sbUrls"/>.
    /// Both builders always receive exactly one line per node, so line i of one
    /// corresponds to line i of the other.
    /// </summary>
    private static void getUrlsAndTitles(NodeList urlNodes, StringBuilder sbUrls, StringBuilder sbTitles)
    {
        ISimpleNodeIterator itr = urlNodes.Elements();
        while (itr.HasMoreNodes())
        {
            INode n = itr.NextNode();
            sbTitles.AppendLine(n.ToPlainTextString());

            // A link without an href yields an empty url line instead of a
            // NullReferenceException, keeping urls and titles aligned.
            ITag tag = n as ITag;
            TagAttribute attr = tag == null ? null : tag.GetAttributeEx(strHrefAttr);
            sbUrls.AppendLine(attr == null ? string.Empty : attr.GetValue());
        }
    }
}

一切都在源码里了，我就不多说了。