昨天的代码很粗糙,而且存在功能错误。
回忆一下,我们通过一个url初始化了一个Parser。使用了两个NodeFilter类,期望调用Parser.Parse(nodeFilter)两次获得两个NodeList,再分别处理各自的INode。事与愿违,调用Parse方法后Parser的状态会改变,再次调用Parse得不到想要的结果了。
所以昨天的urlTitles和snippets只能二得其一。考虑到Parser的空间消耗和Parse的时间消耗,将一个Parser克隆多份、多次Parse是不可取的。于是今天重写了Yahoo类,使用Lexer完成我的工作。Lexer大概是htmlParser处理html最底层的东东了,它将html看成INode,即TagNode, TextNode, RemarkNode的串。使用Lexer可以灵活处理html,但这一层丧失了DOM树的概念,INode之间没有了隶属关系,只能用lexer.NextNode()获得下一个INode。所以处理起来也稍微麻烦,但是个人感觉利大于弊。
这里附带提一下iNode.ToPlainTextString(),这个函数很重要,用于提取iNode及其子节点的PlainTextString(以下简称PTS)。TagNode的PTS==“”,TextNode的PTS==TextNode.ToHTML(),RemarkNode的PTS==RemarkNode.GetText()。例如&lt;b&gt;this is text&lt;/b&gt;的PTS是由三个PTS(起始标签的“”、文本的"this is text"、结束标签的“”)拼成的"this is text"。
Yahoo的snippet有点诡异,绝大部分返回结果的snippet是非空的,但存在为空的情况。代码中的flag就是为了处理这个诡异的现象的。好吧,上代码。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Filters;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Nodes;
using System.IO;
namespace htmlParser
{
/// <summary>
/// Downloads Yahoo web-search result pages for a keyword and extracts, for every
/// result, its URL, its title and its snippet by walking the page with a flat
/// HtmlParser Lexer (no DOM tree, only a stream of nodes).
/// </summary>
class Yahoo
{
    // The anchor tag with class "yschttl spt" carries both the URL (href) and the title (inner text).
    private const string strUrlTag = "A";
    private const string strUrlTagAttr = "class";
    private const string strUrlTagValue = "yschttl spt";

    /* TagNode <div class="sm-abs">xxxxx</div> or <div class="abstr">xxxxx</div> contains the snippet.
     * Descriptors "div", "class", and "sm-abs" / "abstr" are able to locate this tag.
     * In fact, "abstr" is used more frequently.
     */
    private const string strSnpTag = "DIV";
    private const string strSnpAttr = "class";
    private const string strSnpValue1 = "abstr";
    private const string strSnpValue2 = "sm-abs";

    private const string strSearchUrl = @"http://search.yahoo.com/search?n=100&ei=UTF-8&va_vt=any&vo_vt=any&ve_vt=any&vp_vt=any&vd=all&vst=0&vf=html&vm=p&fl=1&vl=lang_en&fr=yfp-t-701&fp_ip=cn&pstart=1&p=";
    private const string strResultBase = "&b=";

    // Links whose href contains this fragment are filtered out of the results.
    private const string strFilteredUrlPart = ".search.yahoo.com/search/";
    // Placeholder written to the snippet list for a result that has no snippet.
    private const string strNoSnippet = "NULL";
    // Every downloaded page is dumped here (overwritten each time).
    private const string strDumpFile = "yahoo_1.txt";
    // The "&b=" offset advances by this much per page (the query string asks for n=100).
    private const uint resultsPerPage = 100;
    // Upper bound on download attempts for a single page before giving up.
    private const int maxFetchAttempts = 5;

    private string strKey = null; // validated keyword; null means "invalid, do nothing"
    private uint hundreds = 0; // how many hundreds results wanted
    private uint done = 0; // already got 'done' hundreds results
    private Lexer lexer = null; // lexer over the page currently being scanned

    /// <summary>
    /// Creates a scraper for one keyword.
    /// </summary>
    /// <param name="strKey">
    /// Search keyword. After trimming it must be non-empty and consist only of
    /// ASCII letters, digits and spaces; otherwise the instance is left without a
    /// keyword and <see cref="getResults"/> does nothing.
    /// </param>
    /// <param name="hundreds">Number of result pages (hundreds of results) to fetch.</param>
    public Yahoo(string strKey, uint hundreds)
    {
        if (strKey == null)
            return;
        string s = strKey.Trim();
        if (s.Length == 0)
            return;
        foreach (char c in s)
        {
            bool allowed = (c >= '0' && c <= '9')
                || (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || c == ' ';
            if (!allowed)
                return;
        }
        this.strKey = s;
        this.hundreds = hundreds;
    }

    /// <summary>
    /// Downloads <paramref name="strUrl"/>, dumps the HTML to the dump file and
    /// points <c>this.lexer</c> at it. On any failure <c>this.lexer</c> is set to
    /// null so the caller can retry.
    /// </summary>
    /// <param name="strUrl">Address of the result page to download.</param>
    private void setLexer(string strUrl)
    {
        try
        {
            string strHtml;
            using (System.Net.WebClient aWebClient = new System.Net.WebClient())
            {
                aWebClient.Encoding = System.Text.Encoding.UTF8;
                strHtml = aWebClient.DownloadString(strUrl);
            }
            // Dispose the writer so the file handle is released before the next
            // page is dumped to the same file.
            using (StreamWriter sw = new StreamWriter(Yahoo.strDumpFile, false))
            {
                sw.Write(strHtml);
            }
            this.lexer = new Lexer(strHtml);
        }
        catch (Exception)
        {
            // Best effort: the caller treats a null lexer as "fetch failed".
            this.lexer = null;
        }
    }

    /// <summary>
    /// Returns the value of attribute <paramref name="strAttr"/> when
    /// <paramref name="node"/> is a start tag named <paramref name="strTag"/>
    /// that carries a non-empty value for it; otherwise null.
    /// </summary>
    private static string getStartTagAttrValue(TagNode node, string strTag, string strAttr)
    {
        if (node.IsEndTag())
            return null;
        if (!strTag.Equals(node.TagName))
            return null;
        TagAttribute attr = node.GetAttributeEx(strAttr);
        if (attr == null || !attr.Valued || attr.Length == 0)
            return null;
        return attr.GetValue();
    }

    /// <summary>
    /// True when <paramref name="node"/> is the start anchor that holds a
    /// result's URL and title.
    /// </summary>
    private bool isUrlTitleTagNode(TagNode node)
    {
        string value = Yahoo.getStartTagAttrValue(node, Yahoo.strUrlTag, Yahoo.strUrlTagAttr);
        return string.Equals(Yahoo.strUrlTagValue, value, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// True when <paramref name="node"/> is the start div that holds a result's snippet.
    /// </summary>
    private bool isSnippetTagNode(TagNode node)
    {
        string value = Yahoo.getStartTagAttrValue(node, Yahoo.strSnpTag, Yahoo.strSnpAttr);
        return string.Equals(Yahoo.strSnpValue1, value, StringComparison.OrdinalIgnoreCase)
            || string.Equals(Yahoo.strSnpValue2, value, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Extracts the href of a URL/title anchor.
    /// </summary>
    /// <returns>
    /// The href value, or null when the anchor has no usable href or the href
    /// contains the filtered URL fragment.
    /// </returns>
    private string getUrl(TagNode node)
    {
        TagAttribute attr = node.GetAttributeEx("href");
        if (attr == null)
            return null;
        string url = attr.GetValue();
        if (string.IsNullOrEmpty(url))
            return null;
        if (url.Contains(Yahoo.strFilteredUrlPart))
            return null;
        return url;
    }

    /// <summary>
    /// Consumes nodes from the lexer, concatenating their plain text, until a tag
    /// named <paramref name="strStopTag"/> (start or end) is consumed or the input
    /// ends. Call it right after the opening tag was returned by the lexer and
    /// before any other call to <c>this.lexer.NextNode()</c>.
    /// </summary>
    private string readTextUntilTag(string strStopTag)
    {
        StringBuilder sb = new StringBuilder();
        // NextNode() is treated as returning null at end of input, as in the
        // page-scanning loop of getResults; stopping there avoids dereferencing null.
        for (INode node = this.lexer.NextNode(); node != null; node = this.lexer.NextNode())
        {
            TagNode tag = node as TagNode;
            if (tag != null && strStopTag.Equals(tag.TagName))
                break;
            sb.Append(node.ToPlainTextString());
        }
        return sb.ToString();
    }

    /// <summary>
    /// Fetches the requested number of result pages and appends one line per
    /// result to each builder, so that line i of all three builders describes the
    /// same result. A result without a snippet gets the line "NULL" in
    /// <paramref name="sbSnippets"/>. Results whose link is filtered out are
    /// omitted from all three builders.
    /// </summary>
    /// <remarks>
    /// Does nothing when the keyword passed to the constructor was invalid. A page
    /// that cannot be downloaded is retried a limited number of times; if it still
    /// fails, fetching stops and the results collected so far are kept.
    /// </remarks>
    /// <param name="sbUrls">Receives one URL per line.</param>
    /// <param name="sbTitles">Receives one title per line.</param>
    /// <param name="sbSnippets">Receives one snippet (or "NULL") per line.</param>
    public void getResults(StringBuilder sbUrls, StringBuilder sbTitles, StringBuilder sbSnippets)
    {
        if (this.strKey == null)
            return;

        // True while the most recently recorded result still lacks its snippet line.
        bool awaitingSnippet = false;
        string strBaseUrl = Yahoo.strSearchUrl + Uri.EscapeDataString(this.strKey) + Yahoo.strResultBase;

        for (this.done = 0; this.done < this.hundreds; this.done++)
        {
            // The offset is derived from the page index, so a retry after a failed
            // download asks for the same page instead of skipping it.
            uint firstResult = 1 + this.done * Yahoo.resultsPerPage;
            string strUrl = strBaseUrl + firstResult.ToString();

            // in case of bad network
            this.lexer = null;
            for (int attempt = 0; attempt < Yahoo.maxFetchAttempts && this.lexer == null; attempt++)
                this.setLexer(strUrl);
            if (this.lexer == null)
                break;

            for (INode node = this.lexer.NextNode(); node != null; node = this.lexer.NextNode())
            {
                TagNode tn = node as TagNode;
                if (tn == null)
                    continue;

                if (this.isUrlTitleTagNode(tn))
                {
                    // A new result starts: the previous one never got a snippet.
                    if (awaitingSnippet)
                    {
                        sbSnippets.AppendLine(Yahoo.strNoSnippet);
                        awaitingSnippet = false;
                    }
                    string url = this.getUrl(tn);
                    // Always consume the title so the lexer moves past the anchor.
                    string title = this.readTextUntilTag(Yahoo.strUrlTag);
                    if (url == null)
                        continue; // filtered result: record nothing for it
                    sbUrls.AppendLine(url);
                    sbTitles.AppendLine(title);
                    awaitingSnippet = true;
                }
                else if (this.isSnippetTagNode(tn))
                {
                    string snippet = this.readTextUntilTag(Yahoo.strSnpTag);
                    // Only the first snippet after a recorded result belongs to it;
                    // anything else would break the line-by-line alignment.
                    if (awaitingSnippet)
                    {
                        sbSnippets.AppendLine(snippet);
                        awaitingSnippet = false;
                    }
                }
            }
        }
        this.lexer = null;

        // The very last result may have had no snippet.
        if (awaitingSnippet)
            sbSnippets.AppendLine(Yahoo.strNoSnippet);
    }
}
}