C# 信息抓取二：HTMLParser.net 使用详解

最新推荐文章于 2021-06-15 14:43:06 发布

banny

最新推荐文章于 2021-06-15 14:43:06 发布

阅读量5.7k

点赞数 1

分类专栏： .NET/C# 文章标签： c# textbox string filter exception object

本文链接：https://blog.csdn.net/malimalihun/article/details/6128790

版权

.NET/C# 专栏收录该内容

33 篇文章 0 订阅

订阅专栏

第一步仍然是添加对 HTMLParser.net 程序集的引用，具体步骤上一篇文章已经说明，这里不再赘述。

代码：

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Nodes;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Visitors;
using Winista.Text.HtmlParser.Filters;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Http;
using System.Threading;
using System.IO;
using System.Net;

namespace parsertitle
{
    /// <summary>
    /// Demo form: downloads the page whose URL is typed into textBox1 and shows
    /// the text of its TITLE tag in textBox2, using HTMLParser.net.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>
        /// HTML of the most recent download attempt. Holds "失败！" when the last
        /// attempt failed or returned only whitespace.
        /// </summary>
        private string htmlText = "";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the page at the URL in textBox1 and writes its title into textBox2.
        /// The download is synchronous, so the handler does not return until it
        /// finishes or fails.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            // Use the returned HTML directly instead of re-reading the field.
            string html = downhtml_1(textBox1.Text);
            textBox2.Text = tohtml(html);
        }

        /// <summary>
        /// Downloads the source of a web page as a string (method one).
        /// </summary>
        /// <param name="WebUrl">Address of the page to download.</param>
        /// <returns>
        /// The downloaded text, or "失败！" when the download threw or produced only
        /// whitespace. The same value is stored in <c>htmlText</c>.
        /// </returns>
        private string downhtml_1(string WebUrl)
        {
            // Reset first: otherwise a failed download would leave the previous
            // page's HTML in the field and the caller would show a stale title.
            htmlText = "";

            try
            {
                // WebClient is IDisposable; dispose it as soon as the download is done.
                using (WebClient myWebClient = new WebClient())
                {
                    // Encoding used to decode the downloaded string.
                    // NOTE(review): Encoding.Default depends on the machine's settings;
                    // pages served in a different charset may decode incorrectly - confirm.
                    myWebClient.Encoding = System.Text.Encoding.Default;
                    htmlText = myWebClient.DownloadString(WebUrl);
                }
            }
            catch (Exception ex)
            {
                // Best-effort UI behaviour: report the problem and fall through to
                // the failure marker below.
                MessageBox.Show(ex.Message);
            }

            if (htmlText == null || htmlText.Trim() == "")
            {
                htmlText = "失败！";
            }

            return htmlText;
        }

        /// <summary>
        /// Extracts the plain text of the first TITLE tag found in an HTML string.
        /// </summary>
        /// <param name="str">HTML source to search.</param>
        /// <returns>The title text, or an empty string when no TITLE tag is found.</returns>
        private string tohtml(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return "";
            }

            Lexer lexer1 = new Lexer(str);
            Parser parser1 = new Parser(lexer1);

            // TagNameFilter is a NodeFilter that accepts tags with the given name.
            NodeFilter filter_title1 = new TagNameFilter("TITLE");
            NodeList nodelistoftitle = parser1.ExtractAllNodesThatMatch(filter_title1);

            // Do not index into an empty result; what ElementAt(0) does on an empty
            // list depends on the library's implementation.
            if (nodelistoftitle == null || nodelistoftitle.Count == 0)
            {
                return "";
            }

            INode node_title1 = nodelistoftitle.ElementAt(0);
            if (node_title1 == null)
            {
                return "";
            }

            // Re-parse just the title element's HTML and let a text-extracting
            // visitor walk its nodes to collect the plain text.
            string title1 = node_title1.ToHtml();
            Lexer lexer2 = new Lexer(title1);
            Parser parser_title1 = new Parser(lexer2);
            TextExtractingVisitor title_visitor1 = new TextExtractingVisitor();
            parser_title1.VisitAllNodesWith(title_visitor1);

            return title_visitor1.ExtractedText.ToString();
        }

        /// <summary>
        /// Pre-fills the URL box with a sample address when the form loads.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            textBox1.Text = "http://www.sina.com";
        }
    }
}