HTMLParser.net使用详解

最新推荐文章于 2021-06-15 14:43:06 发布

weixin_33744854

最新推荐文章于 2021-06-15 14:43:06 发布

阅读量101

点赞数

原文链接：http://blog.51cto.com/2270991/479215

版权

using System;

using System.Collections.Generic;

using System.ComponentModel;

using System.Data;

using System.Drawing;

using System.Linq;

using System.Text;

using System.Windows.Forms;

using Winista.Text.HtmlParser;

using Winista.Text.HtmlParser.Lex;

using Winista.Text.HtmlParser.Nodes;

using Winista.Text.HtmlParser.Util;

using Winista.Text.HtmlParser.Visitors;

using Winista.Text.HtmlParser.Filters;

using Winista.Text.HtmlParser.Tags;

using Winista.Text.HtmlParser.Http;

using System.Threading;

using System.IO;

using System.Net;

namespace parsertitle

{

/// <summary>
/// Main window of the title-grabber sample. The user enters a URL in
/// <c>textBox1</c>, clicks the button, and the text of the page's
/// <c>TITLE</c> element is written to <c>textBox2</c>.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>
    /// Result of the most recent <see cref="downhtml_1"/> call: either the
    /// downloaded page source or the failure marker. It is overwritten on
    /// every call, so it never carries content over from an earlier download.
    /// </summary>
    private string htmlText = "";

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Downloads the page whose address is in <c>textBox1</c> and shows its
    /// title in <c>textBox2</c>. The download is synchronous, so the window
    /// does not respond until it finishes.
    /// </summary>
    /// <param name="sender">Control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        // Use the value returned by the download instead of reading the
        // shared field, so the data flow is explicit.
        string pageSource = downhtml_1(textBox1.Text);
        textBox2.Text = tohtml(pageSource);
    }

    /// <summary>
    /// Fetches the source of a web page (method one: <see cref="WebClient"/>).
    /// </summary>
    /// <param name="WebUrl">Address of the page to download.</param>
    /// <returns>
    /// The downloaded text, or the marker string "失败！" when the download
    /// failed or produced only whitespace. The same value is stored in
    /// <see cref="htmlText"/>.
    /// </returns>
    private string downhtml_1(string WebUrl)
    {
        // Reset first: without this, a failed download would leave the
        // previous page's source in the field and the caller would show the
        // old page's title for the new URL.
        htmlText = "";

        try
        {
            // WebClient is IDisposable; release it as soon as the download ends.
            using (WebClient myWebClient = new WebClient())
            {
                // Decode the response with the platform's default encoding.
                // NOTE(review): pages served in a different encoding may be
                // decoded incorrectly - confirm this is the intended choice.
                myWebClient.Encoding = System.Text.Encoding.Default;

                htmlText = myWebClient.DownloadString(WebUrl);
            }
        }
        catch (Exception ex)
        {
            // Best effort: report the problem to the user and fall through
            // to the failure marker below instead of crashing the UI.
            MessageBox.Show(ex.Message);
        }

        if (htmlText.Trim() == "")
        {
            htmlText = "失败！";
        }

        return htmlText;
    }

    /// <summary>
    /// Extracts the plain text of the first <c>TITLE</c> element found in an
    /// HTML document.
    /// </summary>
    /// <param name="str">HTML source to parse.</param>
    /// <returns>
    /// The text inside the first title element, or an empty string when no
    /// title node is found.
    /// </returns>
    private string tohtml(string str)
    {
        Lexer lexer1 = new Lexer(str);
        Parser parser1 = new Parser(lexer1);

        // TagNameFilter is a NodeFilter that accepts tags with the given name.
        NodeFilter filter_title1 = new TagNameFilter("TITLE");
        NodeList nodelistoftitle = parser1.ExtractAllNodesThatMatch(filter_title1);

        // NOTE(review): this relies on ElementAt(0) yielding null when the
        // list has no matches - confirm against the NodeList implementation.
        INode node_title1 = nodelistoftitle.ElementAt(0);
        if (node_title1 == null)
        {
            return "";
        }

        // Re-parse just the title element's HTML and let a text-extracting
        // visitor walk it, collecting the plain text without the tags.
        Lexer lexer2 = new Lexer(node_title1.ToHtml());
        Parser parser_title1 = new Parser(lexer2);
        TextExtractingVisitor title_visitor1 = new TextExtractingVisitor();
        parser_title1.VisitAllNodesWith(title_visitor1);

        return title_visitor1.ExtractedText.ToString();
    }

    /// <summary>
    /// Pre-fills the URL box with a sample address when the form loads.
    /// </summary>
    /// <param name="sender">Control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void Form1_Load(object sender, EventArgs e)
    {
        textBox1.Text = "http://www.sina.com";
    }
}

}

转载于:https://blog.51cto.com/2270991/479215

weixin_33744854

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。