htmlParser.Net解析yahoo搜索结果2

最新推荐文章于 2017-03-01 16:57:14 发布

zzxian

最新推荐文章于 2017-03-01 16:57:14 发布

阅读量936

点赞数

分类专栏： open source .net 文章标签： yahoo string null class exception div

本文链接：https://blog.csdn.net/zzxian/article/details/6751706

版权

.net 同时被 2 个专栏收录

21 篇文章 0 订阅

订阅专栏

open source

2 篇文章 0 订阅

订阅专栏

昨天的代码很粗糙，而且存在功能错误。

回忆一下，我们通过一个url初始化了一个Parser。使用了两个NodeFilter类，期望调用Parser.Parse(nodeFilter)两次获得两个NodeList，再分别处理各自的INode。事与愿违，调用Parse方法后Parser的状态会改变，再次调用Parse得不到想要的结果了。

所以昨天的urlTitles和snippets只能二者得其一。考虑到Parser的空间消耗和Parse的时间消耗，将一个Parser克隆多份、再分别Parse多次是不可取的。于是今天重写了Yahoo类，使用Lexer完成我的工作。Lexer大概是htmlParser处理html最底层的组件了，它将html看成INode（即TagNode、TextNode、RemarkNode）组成的序列。使用Lexer可以灵活处理html，但这一层丧失了DOM树的概念，INode之间没有了隶属关系，只能用lexer.NextNode()逐个获得下一个INode。所以处理起来稍微麻烦一些，但是个人感觉利大于弊。

这里顺便提一下iNode.ToPlainTextString()，这个函数很重要，用于提取iNode及其子节点的纯文本（PlainTextString，以下简称PTS）。TagNode的PTS==""，TextNode的PTS==TextNode.ToHTML()，RemarkNode的PTS==RemarkNode.GetText()。例如形如 <b>this</b> is <i>text</i> 的片段，其PTS是由三个TextNode的PTS拼成的"this is text"。

Yahoo的snippet有点诡异，绝大部分返回结果的snippet是非空的，但存在为空的情况。代码中的flag就是为了处理这个诡异的现象的。好吧，上代码。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Filters;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Nodes;
using System.IO;


namespace htmlParser
{
    class Yahoo
    {
        // Tag name, attribute name and attribute value used by isUrlTilteTagNode()
        // to recognise the anchor whose href is the result URL and whose text is
        // the result title.
        private const string strUrlTag = "A";
        private const string strUrlTaglAttr = "class";
        private const string strUrlTagValue = "yschttl spt";


        /* A <div class="sm-abs">xxxxx</div> or <div class="abstr">xxxxx</div> tag contains the snippet.
         * The tag name "DIV", the attribute "class" and the value "sm-abs" / "abstr" locate this tag.
         * In practice "abstr" is the more frequently used of the two.
         */
        private const string strSnpTag = "DIV";
        private const string strSnpAttr = "class";
        private const string strSnpValue1 = "abstr";
        private const string strSnpValue2 = "sm-abs";


        // Search URL prefix; getResults() appends the search key directly after "p=".
        // NOTE(review): "n=100" presumably asks Yahoo for 100 results per page - confirm.
        private const string strSearchUrl = @"http://search.yahoo.com/search?n=100&ei=UTF-8&va_vt=any&vo_vt=any&ve_vt=any&vp_vt=any&vd=all&vst=0&vf=html&vm=p&fl=1&vl=lang_en&fr=yfp-t-701&fp_ip=cn&pstart=1&p=";
        // Query-string fragment appended after the key; getResults() follows it with the start offset.
        private const string strResultBase = "&b=";
        // Validated, trimmed search key; stays null when the constructor rejects its argument.
        private string strKey = null;
        private uint hundreds = 0; // how many pages (hundreds of results) the caller wants
        private uint done = 0; // how many pages (hundreds of results) have been processed so far
        // Lexer over the page being processed; null when no page is loaded or the download failed.
        private Lexer lexer = null;


        public Yahoo(string strKey, uint hundreds)
        {
            if (strKey == null || strKey.Length == 0)
                return;
            string s = strKey.Trim();
            if (s.Length == 0)
                return;
            foreach (char c in s)
            {
                if (!(c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ' '))
                    return;
            }
            this.strKey = s;
            this.hundreds = hundreds;
        }


        private void setLexer(string strUrl)
        {
            try
            {
                System.Net.WebClient aWebClient = new System.Net.WebClient();
                aWebClient.Encoding = System.Text.Encoding.UTF8;
                string strHtml = aWebClient.DownloadString(strUrl);
                StreamWriter sw = new StreamWriter("yahoo_1.txt", false);
                sw.Write(strHtml);
                sw.Flush();
                this.lexer = new Lexer(strHtml);
            }
            catch (Exception e)
            {
                this.lexer = null;
            }
        }


        private bool isUrlTilteTagNode(TagNode node)
        {
            TagAttribute attr;
            if (node.IsEndTag())
                return false;
            if (!node.TagName.Equals(Yahoo.strUrlTag))
                return false;
            attr = node.GetAttributeEx(Yahoo.strUrlTaglAttr);
            if (attr == null || !attr.Valued || attr.Length == 0)
                return false;
            if (string.Compare(Yahoo.strUrlTagValue, attr.GetValue(), true) != 0)
                return false;
            return true;
        }
        private bool isSnippetTagNode(TagNode node)
        {
            TagAttribute attr;
            if (node.IsEndTag())
                return false;
            if (!node.TagName.Equals(Yahoo.strSnpTag))
                return false;
            attr = node.GetAttributeEx(Yahoo.strSnpAttr);
            if (attr == null || !attr.Valued || attr.Length == 0)
                return false;
            string tagValue = attr.GetValue();
            if (string.Compare(Yahoo.strSnpValue1, tagValue, true) == 0 || string.Compare(Yahoo.strSnpValue2, tagValue, true) == 0)
                return true;
            return false;
        }


        // call it just after isUrlTitleTagNode() and before any calling of this.lexer.nextnode().
        private void getUrl(TagNode node, StringBuilder sbUrls)
        {
            TagAttribute attr = node.GetAttributeEx("href");
            string url = attr.GetValue();
            if (url.Contains(".search.yahoo.com/search/"))
                return;
            sbUrls.AppendLine(url);
        }


        // call it just after isUrlTitleTagNode() (or getUrl()) and before any calling of this.lexer.nextnode().
        private void getTitle(StringBuilder sbTitles)
        {
            StringBuilder sbTl = new StringBuilder();


            for (INode node = this.lexer.NextNode(); ; node = this.lexer.NextNode())
            {
                if (node is TagNode && ((TagNode)node).TagName.Equals(Yahoo.strUrlTag))
                    break;
                sbTl.Append(node.ToPlainTextString());
            }
            sbTitles.AppendLine(sbTl.ToString());
        }


        // call it just after isSnippetTagNode() and before any calling of this.lexer.nextnode().
        private void getSnippet(StringBuilder sbSnippets)
        {
            StringBuilder sbSnp = new StringBuilder();


            for (INode node = this.lexer.NextNode(); ; node = this.lexer.NextNode())
            {
                if (node is TagNode && ((TagNode)node).TagName.Equals(Yahoo.strSnpTag))
                    break;
                sbSnp.Append(node.ToPlainTextString());
            }
            sbSnippets.AppendLine(sbSnp.ToString());
        }


        public void getResults(StringBuilder sbUrls, StringBuilder sbTitles, StringBuilder sbSnippets)
        {
            if (this.strKey == null)
                return;


            uint count = 1;
            int flag = 0;
            string strBaseUrl = Yahoo.strSearchUrl + this.strKey + Yahoo.strResultBase;
            for (this.done = 0; this.done < this.hundreds; this.done++,this.lexer = null)
            {
                // in case of bad network
                while (this.lexer == null)
                {
                    string strUrl = strBaseUrl + count.ToString();
                    this.setLexer(strUrl);
                    count += 100;
                }
                for (INode node = this.lexer.NextNode(); node != null; node = this.lexer.NextNode())
                {
                    if (!(node is TagNode))
                        continue;
                    TagNode tn = (TagNode)node;
                    if (this.isUrlTilteTagNode(tn))
                    {
                        this.getUrl(tn, sbUrls);
                        this.getTitle(sbTitles);
                         flag++;
                        if (flag >1)
                        {
                            sbSnippets.AppendLine("NULL");
                            flag = 1;
                        }
                    }


                    if (this.isSnippetTagNode(tn))
                    {
                        this.getSnippet(sbSnippets);
                        flag--;
                    }
                }
            }
        }
    }
}