各位看官也许会问:为什么用 Yahoo,而不用 Google?因为 Google 的页面是用 JS 下载搜索结果后再显示出来的,而我不会 JS,所以选择了 Yahoo。
我们只想解析搜索结果,即每个搜索结果项的 url、title、snippet 三部分。为此自己写了一个 OneAttributeTageFilter 类,这个类有点像 htmlParser 提供的 HasAttributeFilter,但它是为 Yahoo 量身定做的。另外还写了 Yahoo 类,把解析过程包装了一下。源码如下。
/// <summary>
/// Node filter that accepts opening tags with a given tag name whose given
/// attribute is equal (ignoring case) to one of up to two expected values.
/// </summary>
class OneAttributeTageFilter : NodeFilter
{
    readonly string m_tag;
    readonly string m_attr;
    readonly string m_value1;
    readonly string m_value2;

    /// <summary>
    /// Creates a filter for one tag name / attribute name pair.
    /// </summary>
    /// <param name="tag">Tag name to match; compared ignoring case.</param>
    /// <param name="attr">Name of the attribute whose value is inspected.</param>
    /// <param name="value1">First accepted attribute value.</param>
    /// <param name="value2">Second accepted attribute value, or null when only one value is accepted.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="tag"/> or <paramref name="attr"/> is null.
    /// </exception>
    public OneAttributeTageFilter(string tag, string attr, string value1, string value2)
    {
        // Fail at construction time instead of with a NullReferenceException
        // on the first call to Accept.
        if (tag == null)
        {
            throw new System.ArgumentNullException("tag");
        }
        if (attr == null)
        {
            throw new System.ArgumentNullException("attr");
        }

        this.m_tag = tag;
        this.m_attr = attr;
        this.m_value1 = value1;
        this.m_value2 = value2;
    }

    /// <summary>
    /// Decides whether <paramref name="node"/> passes the filter.
    /// </summary>
    /// <param name="node">Node to test.</param>
    /// <returns>
    /// true when the node is a non-end tag with the configured tag name and its
    /// configured attribute matches one of the accepted values; otherwise false.
    /// </returns>
    public bool Accept(INode node)
    {
        ITag tag = node as ITag;
        if (tag == null || tag.IsEndTag())
        {
            return false;
        }

        // Ordinal comparison: tag names are markup identifiers, so the result
        // must not depend on the current culture (as ToUpper() would).
        if (!string.Equals(tag.TagName, this.m_tag, System.StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }

        TagAttribute attribute = tag.GetAttributeEx(this.m_attr);
        if (attribute == null || !attribute.Valued || attribute.Length == 0)
        {
            return false;
        }

        string attrValue = attribute.GetValue();
        return Matches(this.m_value1, attrValue) || Matches(this.m_value2, attrValue);
    }

    // True when an expected value is configured and equals the actual value, ignoring case.
    private static bool Matches(string expected, string actual)
    {
        return expected != null
            && string.Equals(expected, actual, System.StringComparison.OrdinalIgnoreCase);
    }
}
/// <summary>
/// Helpers that download a search result page and extract the result URLs,
/// titles and snippets from it using <c>OneAttributeTageFilter</c>.
/// </summary>
static class Yahoo
{
    /* TagNode <a class="yschttl spt" href="xxxx" .... >xxxxx</a> contains the url and title.
     * Descriptors "a", "class" and "yschttl spt" are able to locate the tag.
     * Since href is universally used for hyperlinks, it is not a parameter but a constant (strHrefAttr).
     */
    private const string strUrlTag = "a";
    private const string strUrlTagAttr = "class";
    private const string strUrlTagValue = "yschttl spt";
    private const string strHrefAttr = "href";

    /* TagNode <div class="sm-abs">xxxxx</div> or <div class="abstr">xxxxx</div> contains the snippet.
     * Descriptors "div", "class", and "sm-abs" / "abstr" are able to locate this tag.
     * In fact, "abstr" is used more frequently.
     */
    private const string strSnpTag = "div";
    private const string strSnpAttr = "class";
    private const string strSnpValue1 = "abstr";
    private const string strSnpValue2 = "sm-abs";

    /// <summary>
    /// Downloads the page at <paramref name="strUrl"/> and appends the plain text
    /// of every snippet node to <paramref name="sbSnippets"/>, one per line.
    /// </summary>
    /// <param name="strUrl">Address of the result page to download.</param>
    /// <param name="sbSnippets">Receives one line per snippet found.</param>
    public static void getSnippets(string strUrl, StringBuilder sbSnippets)
    {
        OneAttributeTageFilter f = new OneAttributeTageFilter(
            Yahoo.strSnpTag, Yahoo.strSnpAttr, Yahoo.strSnpValue1, Yahoo.strSnpValue2);
        Yahoo.getSnippets(getNodes(strUrl, f), sbSnippets);
    }

    /// <summary>
    /// Downloads the page at <paramref name="strUrl"/> and appends, for every
    /// result link, its href to <paramref name="sbUrls"/> and its plain text to
    /// <paramref name="sbTitles"/>, one per line and in the same order.
    /// </summary>
    /// <param name="strUrl">Address of the result page to download.</param>
    /// <param name="sbUrls">Receives one line per result URL.</param>
    /// <param name="sbTitles">Receives one line per result title.</param>
    public static void getUrlsAndTitles(string strUrl, StringBuilder sbUrls, StringBuilder sbTitles)
    {
        OneAttributeTageFilter f = new OneAttributeTageFilter(
            Yahoo.strUrlTag, Yahoo.strUrlTagAttr, Yahoo.strUrlTagValue, null);
        Yahoo.getUrlsAndTitles(getNodes(strUrl, f), sbUrls, sbTitles);
    }

    // Downloads strUrl as text, parses it and returns the nodes accepted by filter.
    private static NodeList getNodes(string strUrl, NodeFilter filter)
    {
        string strHtml;

        // WebClient is IDisposable; release it as soon as the download finishes.
        using (System.Net.WebClient aWebClient = new System.Net.WebClient())
        {
            // NOTE(review): Encoding.Default is machine-dependent; if the page is
            // served as UTF-8, non-ASCII text may be decoded incorrectly - confirm.
            aWebClient.Encoding = System.Text.Encoding.Default;
            strHtml = aWebClient.DownloadString(strUrl);
        }

        Lexer lexer = new Lexer(strHtml);
        Parser parser = new Parser(lexer);
        return parser.Parse(filter);
    }

    // Appends the plain text of each node in snpNodes to sbSnippets, one per line.
    private static void getSnippets(NodeList snpNodes, StringBuilder sbSnippets)
    {
        ISimpleNodeIterator itr = snpNodes.Elements();
        while (itr.HasMoreNodes())
        {
            INode n = itr.NextNode();
            sbSnippets.AppendLine(n.ToPlainTextString());
        }
    }

    // Appends the title (plain text) and URL (href value) of each node in urlNodes.
    // Must be static: a static class cannot declare instance members.
    private static void getUrlsAndTitles(NodeList urlNodes, StringBuilder sbUrls, StringBuilder sbTitles)
    {
        ISimpleNodeIterator itr = urlNodes.Elements();
        while (itr.HasMoreNodes())
        {
            INode n = itr.NextNode();

            ITag tag = n as ITag;
            TagAttribute attr = (tag == null) ? null : tag.GetAttributeEx(strHrefAttr);
            string url = (attr == null) ? null : attr.GetValue();

            sbTitles.AppendLine(n.ToPlainTextString());
            // A link without href still gets a (blank) URL line so that line i of
            // sbUrls always corresponds to line i of sbTitles.
            sbUrls.AppendLine(url ?? string.Empty);
        }
    }
}
一切都在源码里了,我就不多说了。