using
System;
using System.IO;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;
/// <summary>
/// Parses the HTML typed into <c>textBox1</c> and rebuilds <c>treeView1</c>
/// so that it shows the parsed tags underneath a single "root" node.
/// Does nothing when the text box is empty.
/// </summary>
/// <param name="sender">The control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    // The HTML could instead be loaded from a stream (e.g. a local file)
    // or from a URI (a web page), for example:
    //   byte[] htmlBytes = Encoding.UTF8.GetBytes(this.textBox1.Text);
    //   MemoryStream memStream = new MemoryStream(htmlBytes);
    //   InputStreamSource input = new InputStreamSource(memStream, "utf-8");
    //   Page page = new Page(input);
    //   Lexer lex = new Lexer(page);
    if (this.textBox1.Text.Length <= 0)
    {
        return;
    }

    // Here the HTML is read directly from the text box.
    Lexer lexer = new Lexer(this.textBox1.Text);
    Parser parser = new Parser(lexer);
    NodeList htmlNodes = parser.Parse(null);

    // Suspend repainting while the tree is rebuilt so the control is drawn
    // once at the end instead of once per inserted node.
    this.treeView1.BeginUpdate();
    try
    {
        this.treeView1.Nodes.Clear();

        // Nodes.Add returns the node it created, so there is no need to
        // fetch it back by index.
        TreeNode treeRoot = this.treeView1.Nodes.Add("root");

        for (int i = 0; i < htmlNodes.Count; i++)
        {
            this.RecursionHtmlNode(treeRoot, htmlNodes[i], false);
        }
    }
    finally
    {
        // Always resume painting, even if building the tree throws.
        this.treeView1.EndUpdate();
    }
}
/// <summary>
/// Adds a tree node for <paramref name="htmlNode"/> (when it is a tag that is
/// not an end tag) under <paramref name="treeNode"/>, then recurses into the
/// HTML node's children and, optionally, its following siblings.
/// </summary>
/// <param name="treeNode">Tree node that receives the new child node. Nothing happens when null.</param>
/// <param name="htmlNode">Parsed HTML node to display. Nothing happens when null.</param>
/// <param name="siblingRequired">
/// When true, every node reachable through <c>htmlNode.NextSibling</c> is also
/// added under <paramref name="treeNode"/>.
/// </param>
private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
{
    if (htmlNode == null || treeNode == null)
    {
        return;
    }

    // Children attach to the new tree node when one is created for this HTML
    // node; otherwise (non-tag nodes, end tags) they attach to the parent.
    TreeNode current = treeNode;

    // The current node.
    ITag tag = htmlNode as ITag;
    if (tag != null && !tag.IsEndTag())
    {
        string nodeString = tag.TagName;
        if (tag.Attributes != null && tag.Attributes.Count > 0)
        {
            // Attributes shown in the label, in display order.
            // NOTE(review): the upper-case keys come from the original code;
            // presumably the parser stores attribute names upper-cased - confirm.
            string[] shownAttributes = new string[] { "ID", "CLASS", "STYLE", "HREF" };
            foreach (string key in shownAttributes)
            {
                object value = tag.Attributes[key];
                if (value != null)
                {
                    // Produces e.g.:  { id="main" }
                    nodeString = nodeString + " { " + key.ToLowerInvariant() + "=\"" + value.ToString() + "\" }";
                }
            }
        }

        current = new TreeNode(nodeString);
        treeNode.Nodes.Add(current);
    }

    // The child nodes: start at the first child and let that call walk the
    // rest of the children through its sibling loop.
    if (htmlNode.Children != null && htmlNode.Children.Count > 0)
    {
        this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
    }

    // The sibling nodes: each one is processed with siblingRequired = false
    // because this loop already visits the whole sibling chain.
    if (siblingRequired)
    {
        INode sibling = htmlNode.NextSibling;
        while (sibling != null)
        {
            this.RecursionHtmlNode(treeNode, sibling, false);
            sibling = sibling.NextSibling;
        }
    }
}
using System.IO;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;
/// <summary>
/// Parses the HTML typed into <c>textBox1</c> and rebuilds <c>treeView1</c>
/// so that it shows the parsed tags underneath a single "root" node.
/// Does nothing when the text box is empty.
/// </summary>
/// <param name="sender">The control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    // The HTML could instead be loaded from a stream (e.g. a local file)
    // or from a URI (a web page), for example:
    //   byte[] htmlBytes = Encoding.UTF8.GetBytes(this.textBox1.Text);
    //   MemoryStream memStream = new MemoryStream(htmlBytes);
    //   InputStreamSource input = new InputStreamSource(memStream, "utf-8");
    //   Page page = new Page(input);
    //   Lexer lex = new Lexer(page);
    if (this.textBox1.Text.Length <= 0)
    {
        return;
    }

    // Here the HTML is read directly from the text box.
    Lexer lexer = new Lexer(this.textBox1.Text);
    Parser parser = new Parser(lexer);
    NodeList htmlNodes = parser.Parse(null);

    // Suspend repainting while the tree is rebuilt so the control is drawn
    // once at the end instead of once per inserted node.
    this.treeView1.BeginUpdate();
    try
    {
        this.treeView1.Nodes.Clear();

        // Nodes.Add returns the node it created, so there is no need to
        // fetch it back by index.
        TreeNode treeRoot = this.treeView1.Nodes.Add("root");

        for (int i = 0; i < htmlNodes.Count; i++)
        {
            this.RecursionHtmlNode(treeRoot, htmlNodes[i], false);
        }
    }
    finally
    {
        // Always resume painting, even if building the tree throws.
        this.treeView1.EndUpdate();
    }
}
/// <summary>
/// Adds a tree node for <paramref name="htmlNode"/> (when it is a tag that is
/// not an end tag) under <paramref name="treeNode"/>, then recurses into the
/// HTML node's children and, optionally, its following siblings.
/// </summary>
/// <param name="treeNode">Tree node that receives the new child node. Nothing happens when null.</param>
/// <param name="htmlNode">Parsed HTML node to display. Nothing happens when null.</param>
/// <param name="siblingRequired">
/// When true, every node reachable through <c>htmlNode.NextSibling</c> is also
/// added under <paramref name="treeNode"/>.
/// </param>
private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
{
    if (htmlNode == null || treeNode == null)
    {
        return;
    }

    // Children attach to the new tree node when one is created for this HTML
    // node; otherwise (non-tag nodes, end tags) they attach to the parent.
    TreeNode current = treeNode;

    // The current node.
    ITag tag = htmlNode as ITag;
    if (tag != null && !tag.IsEndTag())
    {
        string nodeString = tag.TagName;
        if (tag.Attributes != null && tag.Attributes.Count > 0)
        {
            // Attributes shown in the label, in display order.
            // NOTE(review): the upper-case keys come from the original code;
            // presumably the parser stores attribute names upper-cased - confirm.
            string[] shownAttributes = new string[] { "ID", "CLASS", "STYLE", "HREF" };
            foreach (string key in shownAttributes)
            {
                object value = tag.Attributes[key];
                if (value != null)
                {
                    // Produces e.g.:  { id="main" }
                    nodeString = nodeString + " { " + key.ToLowerInvariant() + "=\"" + value.ToString() + "\" }";
                }
            }
        }

        current = new TreeNode(nodeString);
        treeNode.Nodes.Add(current);
    }

    // The child nodes: start at the first child and let that call walk the
    // rest of the children through its sibling loop.
    if (htmlNode.Children != null && htmlNode.Children.Count > 0)
    {
        this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
    }

    // The sibling nodes: each one is processed with siblingRequired = false
    // because this loop already visits the whole sibling chain.
    if (siblingRequired)
    {
        INode sibling = htmlNode.NextSibling;
        while (sibling != null)
        {
            this.RecursionHtmlNode(treeNode, sibling, false);
            sibling = sibling.NextSibling;
        }
    }
}
Screenshot of the example:
The parser's fault tolerance is very good, as shown in the picture below (it could handle malformed markup more intelligently, but I think it is good enough for practical use):