using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Nodes;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Visitors;
using Winista.Text.HtmlParser.Filters;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Http;
using System.Threading;
using System.IO;
using System.Net;
namespace parsertitle
{
public partial class Form1 : Form
{
    /// <summary>
    /// Result of the most recent <see cref="downhtml_1"/> call: the downloaded
    /// page source, or "失败!" when nothing usable was downloaded.
    /// </summary>
    string htmlText = "";

    /// <summary>
    /// Creates the form and runs the designer-generated component setup.
    /// </summary>
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Downloads the page whose address is typed into textBox1 and shows the
    /// text of its TITLE element in textBox2. The download runs synchronously
    /// inside the click handler, so expect a delay before the result appears.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        // Parse exactly what this download returned rather than re-reading the
        // shared field, so the data flow between the two steps is explicit.
        string pageSource = downhtml_1(textBox1.Text);
        textBox2.Text = tohtml(pageSource);
    }

    /// <summary>
    /// Downloads the resource at <paramref name="WebUrl"/> as a string and
    /// stores it in <see cref="htmlText"/>.
    /// </summary>
    /// <param name="WebUrl">Address of the page to download.</param>
    /// <returns>
    /// The downloaded text, or "失败!" when the download threw or produced only
    /// whitespace. Any exception is reported through a message box instead of
    /// being propagated.
    /// </returns>
    private string downhtml_1(string WebUrl)
    {
        // Forget the previous download first; otherwise a failed request would
        // leave the old page in htmlText and the old title would be shown again.
        htmlText = "";
        try
        {
            // WebClient is disposable; release it as soon as the download ends.
            using (WebClient myWebClient = new WebClient())
            {
                // NOTE(review): Encoding.Default is the machine's default code page;
                // pages served in a different charset will presumably decode
                // incorrectly - confirm against the sites this tool targets.
                myWebClient.Encoding = System.Text.Encoding.Default;
                htmlText = myWebClient.DownloadString(WebUrl);
            }
        }
        catch (Exception ex)
        {
            // Best-effort UI tool: report the problem and fall through to the
            // failure marker below instead of crashing the form.
            MessageBox.Show(ex.Message);
        }

        if (htmlText == null || htmlText.Trim() == "")
        {
            htmlText = "失败!";
        }
        return htmlText;
    }

    /// <summary>
    /// Extracts the plain text of the first TITLE tag found in
    /// <paramref name="str"/>.
    /// </summary>
    /// <param name="str">HTML source to search.</param>
    /// <returns>
    /// The text inside the first TITLE tag, or an empty string when the input
    /// is null/empty or contains no TITLE tag.
    /// </returns>
    private string tohtml(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return "";
        }

        Lexer lexer1 = new Lexer(str);
        Parser parser1 = new Parser(lexer1);

        // TagNameFilter accepts tags with the given name; it is used here
        // through the NodeFilter type.
        NodeFilter filter_title1 = new TagNameFilter("TITLE");
        NodeList nodelistoftitle = parser1.ExtractAllNodesThatMatch(filter_title1);

        // Check the size explicitly instead of relying on what ElementAt(0)
        // does for an empty list.
        if (nodelistoftitle == null || nodelistoftitle.Count == 0)
        {
            return "";
        }

        INode node_title1 = nodelistoftitle.ElementAt(0);
        if (node_title1 == null)
        {
            return "";
        }

        // Re-parse only the TITLE node's HTML and let the visitor walk every
        // node in it, collecting the plain text and leaving the markup behind.
        Lexer lexer2 = new Lexer(node_title1.ToHtml());
        Parser parser_title1 = new Parser(lexer2);
        TextExtractingVisitor title_visitor1 = new TextExtractingVisitor();
        parser_title1.VisitAllNodesWith(title_visitor1);
        return title_visitor1.ExtractedText.ToString();
    }

    /// <summary>
    /// Pre-fills textBox1 with a sample address when the form loads.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void Form1_Load(object sender, EventArgs e)
    {
        textBox1.Text = "http://www.sina.com";
    }
}
}
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading;
using System.Windows.Forms;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Filters;
using Winista.Text.HtmlParser.Http;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Nodes;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Visitors;
namespace parsertitle
{
public partial class Form1 : Form
{
    /// <summary>
    /// Result of the most recent <see cref="downhtml_1"/> call: the downloaded
    /// page source, or "失败!" when nothing usable was downloaded.
    /// </summary>
    string htmlText = "";

    /// <summary>
    /// Creates the form and runs the designer-generated component setup.
    /// </summary>
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Downloads the page whose address is typed into textBox1 and shows the
    /// text of its TITLE element in textBox2. The download runs synchronously
    /// inside the click handler, so expect a delay before the result appears.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        // Parse exactly what this download returned rather than re-reading the
        // shared field, so the data flow between the two steps is explicit.
        string pageSource = downhtml_1(textBox1.Text);
        textBox2.Text = tohtml(pageSource);
    }

    /// <summary>
    /// Downloads the resource at <paramref name="WebUrl"/> as a string and
    /// stores it in <see cref="htmlText"/>.
    /// </summary>
    /// <param name="WebUrl">Address of the page to download.</param>
    /// <returns>
    /// The downloaded text, or "失败!" when the download threw or produced only
    /// whitespace. Any exception is reported through a message box instead of
    /// being propagated.
    /// </returns>
    private string downhtml_1(string WebUrl)
    {
        // Forget the previous download first; otherwise a failed request would
        // leave the old page in htmlText and the old title would be shown again.
        htmlText = "";
        try
        {
            // WebClient is disposable; release it as soon as the download ends.
            using (WebClient myWebClient = new WebClient())
            {
                // NOTE(review): Encoding.Default is the machine's default code page;
                // pages served in a different charset will presumably decode
                // incorrectly - confirm against the sites this tool targets.
                myWebClient.Encoding = System.Text.Encoding.Default;
                htmlText = myWebClient.DownloadString(WebUrl);
            }
        }
        catch (Exception ex)
        {
            // Best-effort UI tool: report the problem and fall through to the
            // failure marker below instead of crashing the form.
            MessageBox.Show(ex.Message);
        }

        if (htmlText == null || htmlText.Trim() == "")
        {
            htmlText = "失败!";
        }
        return htmlText;
    }

    /// <summary>
    /// Extracts the plain text of the first TITLE tag found in
    /// <paramref name="str"/>.
    /// </summary>
    /// <param name="str">HTML source to search.</param>
    /// <returns>
    /// The text inside the first TITLE tag, or an empty string when the input
    /// is null/empty or contains no TITLE tag.
    /// </returns>
    private string tohtml(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return "";
        }

        Lexer lexer1 = new Lexer(str);
        Parser parser1 = new Parser(lexer1);

        // TagNameFilter accepts tags with the given name; it is used here
        // through the NodeFilter type.
        NodeFilter filter_title1 = new TagNameFilter("TITLE");
        NodeList nodelistoftitle = parser1.ExtractAllNodesThatMatch(filter_title1);

        // Check the size explicitly instead of relying on what ElementAt(0)
        // does for an empty list.
        if (nodelistoftitle == null || nodelistoftitle.Count == 0)
        {
            return "";
        }

        INode node_title1 = nodelistoftitle.ElementAt(0);
        if (node_title1 == null)
        {
            return "";
        }

        // Re-parse only the TITLE node's HTML and let the visitor walk every
        // node in it, collecting the plain text and leaving the markup behind.
        Lexer lexer2 = new Lexer(node_title1.ToHtml());
        Parser parser_title1 = new Parser(lexer2);
        TextExtractingVisitor title_visitor1 = new TextExtractingVisitor();
        parser_title1.VisitAllNodesWith(title_visitor1);
        return title_visitor1.ExtractedText.ToString();
    }

    /// <summary>
    /// Pre-fills textBox1 with a sample address when the form loads.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void Form1_Load(object sender, EventArgs e)
    {
        textBox1.Text = "http://www.sina.com";
    }
}
}
// Reposted from (转载于): https://blog.51cto.com/2270991/479215