因为前段时间对爬虫有些兴趣,所以研究了一下 HtmlAgilityPack.dll,而它是可以基于 XPath 来解析 HTML 的。
关于 XPath 的相关资料,可以查看这里:http://www.w3school.com.cn/xpath/index.asp
在网上找了半天,也没找到几个 XPath 工具。后来找到一份源代码,于是在它的基础上自己做了一个 XPath 工具。
如图所示,这里是通过 XPath 获取百度音乐的歌曲名。
源代码
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using HtmlAgilityPack;
using System.Threading;
using System.Text.RegularExpressions;
namespace XPathTools
{
/// <summary>
/// Main window of the XPath tool. It loads an HTML document from a local file or a URL,
/// lists an XPath expression for every element it can locate in that document, and lets
/// the user run XPath queries against it.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>
    /// Matches HTML comments and whole script elements. Both are removed before the tag scan
    /// so that markup-looking text inside them is not counted as real elements.
    /// </summary>
    private static readonly Regex IgnoredContentPattern =
        new Regex("<!--[\\s\\S]*?-->|<script[\\s\\S]*?</script>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>Matches one tag, i.e. everything from '&lt;' to the next '&gt;'.</summary>
    private static readonly Regex TagPattern = new Regex("<([^<>]*?)>", RegexOptions.Compiled);

    /// <summary>
    /// Extracts the element name of an opening tag. The name ends at ANY whitespace, not only
    /// at a space, so tags whose attributes are separated by tabs or newlines are handled.
    /// </summary>
    private static readonly Regex TagNamePattern = new Regex("^<(?<body>[A-Za-z][^\\s/>]*)", RegexOptions.Compiled);

    /// <summary>Extracts the name from an element-stack entry of the form "&lt;name&gt;".</summary>
    private static readonly Regex BracketedNamePattern = new Regex("<(?<body>[^>]*?)>", RegexOptions.Compiled);

    /// <summary>Elements that are neither listed nor counted (same set as the original tool).</summary>
    private static readonly HashSet<string> UntrackedTags =
        new HashSet<string>(StringComparer.Ordinal) { "link", "meta", "script", "img", "input", "form" };

    /// <summary>
    /// Remaining HTML void elements. They get an XPath entry but never stay on the element
    /// stack, because they have no closing tag that would pop them again.
    /// </summary>
    private static readonly HashSet<string> VoidTags =
        new HashSet<string>(StringComparer.Ordinal) { "area", "base", "br", "col", "embed", "hr", "param", "source", "track", "wbr" };

    /// <summary>Inner HTML of the first node per discovered XPath. Only touched on the UI thread.</summary>
    private readonly Dictionary<string, string> _innerHtmlByXPath = new Dictionary<string, string>();

    /// <summary>The document all XPath queries run against.</summary>
    private HtmlAgilityPack.HtmlDocument _document = new HtmlAgilityPack.HtmlDocument();

    /// <summary>
    /// Incremented on the UI thread whenever a new analysis starts. A worker whose captured
    /// value differs from this field is outdated and stops publishing results.
    /// </summary>
    private volatile int _analysisGeneration;

    /// <summary>Creates the form and wires the combo box selection handlers.</summary>
    public Form1()
    {
        InitializeComponent();
        comboBox1.SelectedIndexChanged += comboBox1_SelectedIndexChanged;
        comboBox2.SelectedIndexChanged += comboBox2_SelectedIndexChanged;
    }

    /// <summary>
    /// Runs <paramref name="xpath"/> against the loaded document and shows the inner HTML of the
    /// first matching node in textBox3.
    /// </summary>
    /// <param name="xpath">The XPath expression to evaluate.</param>
    /// <returns>True when a node was found and displayed; false when a message box was shown instead.</returns>
    private bool TryShowFirstMatch(string xpath)
    {
        try
        {
            HtmlNodeCollection nodes = _document.DocumentNode.SelectNodes(xpath);
            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (nodes == null || nodes.Count == 0)
            {
                MessageBox.Show("未找到匹配的节点:" + xpath);
                return false;
            }
            textBox3.Text = nodes[0].InnerHtml;
            return true;
        }
        catch (Exception ex)
        {
            MessageBox.Show("表达式有误" + ex.ToString());
            return false;
        }
    }

    /// <summary>Shows the first node matched by the expression selected in comboBox2.</summary>
    private void comboBox2_SelectedIndexChanged(object sender, EventArgs e)
    {
        TryShowFirstMatch(comboBox2.Text);
    }

    /// <summary>
    /// Shows the first node matched by the expression selected in comboBox1 and, on success,
    /// copies the expression to comboBox2.
    /// </summary>
    private void comboBox1_SelectedIndexChanged(object sender, EventArgs e)
    {
        if (TryShowFirstMatch(comboBox1.Text))
        {
            comboBox2.Text = comboBox1.Text;
        }
    }

    /// <summary>
    /// Lets the user pick a local file, loads it (decoded as GB2312 unless a byte order mark
    /// says otherwise) into textBox2 and starts the XPath analysis.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        string path;
        using (OpenFileDialog dialog = new OpenFileDialog())
        {
            dialog.Filter = "网页文件(*.html)|*.html;*.xml;*.htm;*.txt";
            dialog.Multiselect = false;
            if (dialog.ShowDialog() != DialogResult.OK || string.IsNullOrEmpty(dialog.FileName))
            {
                // Cancelled: keep whatever is currently loaded instead of wiping the view.
                return;
            }
            path = dialog.FileName;
        }

        string content;
        try
        {
            // ReadAllText opens the file read-only (it never creates it) and always closes it.
            content = File.ReadAllText(path, Encoding.GetEncoding("GB2312"));
        }
        catch (IOException ex)
        {
            MessageBox.Show(ex.Message);
            return;
        }
        catch (UnauthorizedAccessException ex)
        {
            MessageBox.Show(ex.Message);
            return;
        }

        textBox1.Text = path;
        textBox1.ReadOnly = true;
        textBox2.ReadOnly = true;
        textBox3.Text = string.Empty;
        textBox2.Text = content;

        StartAnalyse();
    }

    /// <summary>
    /// Re-parses the markup shown in textBox2 and starts a background scan that fills
    /// comboBox1 with one XPath per located element. Must be called on the UI thread.
    /// </summary>
    private void StartAnalyse()
    {
        // Invalidate a scan that may still be running before its inputs are replaced.
        int generation = ++_analysisGeneration;

        comboBox1.Items.Clear();
        comboBox2.Items.Clear();
        _innerHtmlByXPath.Clear();

        // Read the text here, on the UI thread; the worker must not touch the control.
        string html = textBox2.Text;
        _document.LoadHtml(html);

        Thread worker = new Thread(() => AnalyseMarkup(html, generation));
        // Background thread: a long scan must not keep the process alive after the window closes.
        worker.IsBackground = true;
        worker.Start();
    }

    /// <summary>Adds an XPath to comboBox1 and makes it the current text of both combo boxes.</summary>
    /// <param name="str">The XPath expression to add.</param>
    private void UIContorol(string str)
    {
        comboBox1.Items.Add(str);
        comboBox1.Text = str;
        // comboBox2 mirrors comboBox1; it can also be filled by the keyword search.
        comboBox2.Text = str;
    }

    /// <summary>
    /// Publishes one analysis result. Runs on the UI thread, so the combo boxes and the result
    /// dictionary are only ever modified there.
    /// </summary>
    /// <param name="xpath">XPath of the located element.</param>
    /// <param name="innerHtml">Inner HTML of the first node matched by <paramref name="xpath"/>.</param>
    /// <param name="generation">Generation of the scan that produced the result.</param>
    private void PublishXPath(string xpath, string innerHtml, int generation)
    {
        if (generation != _analysisGeneration)
        {
            return; // Result of an outdated scan.
        }
        UIContorol(xpath);
        _innerHtmlByXPath[xpath] = innerHtml;
    }

    /// <summary>
    /// Worker-thread body: scans the tags of <paramref name="html"/> in document order, keeps a
    /// stack of open elements to derive an indexed XPath for each opening tag, and publishes
    /// every path that the parsed document can resolve.
    /// </summary>
    /// <param name="html">Markup to scan (captured on the UI thread).</param>
    /// <param name="generation">Generation of this scan; the scan stops once it is outdated.</param>
    private void AnalyseMarkup(string html, int generation)
    {
        string markup = IgnoredContentPattern.Replace(html, "");

        // Number of same-named siblings seen so far, keyed by "parentPath/name".
        Dictionary<string, int> siblingCounts = new Dictionary<string, int>();
        List<string> openElements = new List<string>();
        openElements.Add(".");

        foreach (Match tagMatch in TagPattern.Matches(markup))
        {
            if (generation != _analysisGeneration)
            {
                return;
            }

            string tag = tagMatch.Value;
            if (tag.EndsWith("/ >", StringComparison.Ordinal)
                || tag.EndsWith("/>", StringComparison.Ordinal)
                || tag.StartsWith("<!", StringComparison.Ordinal))
            {
                continue; // Self-closing tag or declaration: never opens a level.
            }

            if (tag.StartsWith("</", StringComparison.Ordinal))
            {
                // Only pop when the closing tag matches the innermost open element.
                string closed = "<" + tag.Substring(2, tag.Length - 3).Trim() + ">";
                if (string.Equals(closed, openElements[openElements.Count - 1], StringComparison.OrdinalIgnoreCase))
                {
                    openElements.RemoveAt(openElements.Count - 1);
                }
                continue;
            }

            Match nameMatch = TagNamePattern.Match(tag);
            if (!nameMatch.Success)
            {
                continue; // Not an element ("<?xml ...?>", "<>", stray '<' in text, ...).
            }

            string name = nameMatch.Groups["body"].Value.ToLowerInvariant();
            if (UntrackedTags.Contains(name))
            {
                continue;
            }

            string parentPath = XpathRow(openElements, siblingCounts);
            string element = "<" + name + ">";
            AddRowNumber(parentPath, element, siblingCounts);
            openElements.Add(element);
            string xpath = XpathRow(openElements, siblingCounts);
            if (VoidTags.Contains(name))
            {
                openElements.RemoveAt(openElements.Count - 1);
            }

            string innerHtml;
            try
            {
                HtmlNodeCollection nodes = _document.DocumentNode.SelectNodes(xpath);
                if (nodes == null || nodes.Count == 0)
                {
                    // The parser's tree differs from this simple tag scan here; skip the path.
                    continue;
                }
                innerHtml = nodes[0].InnerHtml;
            }
            catch (Exception)
            {
                // Best effort: a path the XPath engine rejects (e.g. an undeclared namespace
                // prefix) is simply not listed. An unhandled exception would kill the process.
                continue;
            }

            try
            {
                Invoke(new Action<string, string, int>(PublishXPath), xpath, innerHtml, generation);
            }
            catch (InvalidOperationException)
            {
                // The form was closed or disposed while the scan was running.
                return;
            }
        }
    }

    /// <summary>
    /// Builds the indexed XPath for a stack of open elements, e.g. "./html[1]/body[1]/div[2]".
    /// </summary>
    /// <param name="strList">Element stack; entries are "." or "&lt;name&gt;".</param>
    /// <param name="dic">Sibling counters keyed by "parentPath/name".</param>
    /// <returns>
    /// The path without a trailing '/'. An entry that is not of the form "&lt;name&gt;" or has no
    /// counter in <paramref name="dic"/> is emitted verbatim.
    /// </returns>
    private string XpathRow(List<string> strList, Dictionary<string, int> dic)
    {
        StringBuilder path = new StringBuilder();
        foreach (string entry in strList)
        {
            string segment = entry;
            Match match = entry == null ? Match.Empty : BracketedNamePattern.Match(entry);
            if (match.Success)
            {
                string name = match.Groups["body"].Value;
                int index;
                // "path" already ends with '/', so this is the "parentPath/name" counter key.
                if (dic.TryGetValue(path.ToString() + name, out index))
                {
                    segment = name + "[" + index + "]";
                }
            }
            path.Append(segment).Append('/');
        }
        return path.ToString().TrimEnd('/');
    }

    /// <summary>Increments the sibling counter of <paramref name="NewNode"/> below <paramref name="strOldXpatch"/>.</summary>
    /// <param name="strOldXpatch">Indexed XPath of the parent element.</param>
    /// <param name="NewNode">The new element in the form "&lt;name&gt;".</param>
    /// <param name="dic">Sibling counters keyed by "parentPath/name".</param>
    /// <exception cref="ArgumentException"><paramref name="NewNode"/> is not of the form "&lt;name&gt;".</exception>
    private void AddRowNumber(string strOldXpatch, string NewNode, Dictionary<string, int> dic)
    {
        if (strOldXpatch == "")
        {
            dic["."] = 0;
            return;
        }

        Match match = BracketedNamePattern.Match(NewNode);
        if (!match.Success)
        {
            throw new ArgumentException("Expected an element of the form <name>.", "NewNode");
        }

        string key = strOldXpatch + "/" + match.Groups["body"].Value;
        int count;
        dic.TryGetValue(key, out count); // count stays 0 for a first sibling
        dic[key] = count + 1;
    }

    /// <summary>Intentionally empty; kept because the form designer may still reference it.</summary>
    private void button2_Click(object sender, EventArgs e)
    {
    }

    /// <summary>Intentionally empty; kept because the form designer may still reference it.</summary>
    private void Form1_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Keyword search: fills comboBox2 with every discovered XPath whose inner HTML contains the
    /// text of textBox5 (case-insensitive) and selects the last match.
    /// </summary>
    private void button3_Click(object sender, EventArgs e)
    {
        string keyword = textBox5.Text;
        string[] matches = _innerHtmlByXPath
            .Where(pair => pair.Value.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) >= 0)
            .Select(pair => pair.Key)
            .ToArray();

        comboBox2.Items.Clear();
        comboBox2.Items.AddRange(matches);
        if (matches.Length > 0)
        {
            comboBox2.Text = matches[matches.Length - 1];
        }
    }

    /// <summary>Downloads the page whose URL is in textBox1, shows its markup and starts the analysis.</summary>
    private void button2_Click_1(object sender, EventArgs e)
    {
        // TextBox.Text is never null, so test for an empty or blank address instead.
        string url = textBox1.Text;
        if (string.IsNullOrWhiteSpace(url))
        {
            MessageBox.Show("地址不能为空!");
            return;
        }

        HtmlAgilityPack.HtmlDocument loaded;
        try
        {
            loaded = new HtmlWeb().Load(url.Trim());
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.ToString());
            return;
        }

        // Replace the displayed content only once the download has succeeded.
        _document = loaded;
        textBox3.Text = string.Empty;
        textBox2.Text = _document.DocumentNode.InnerHtml;

        StartAnalyse();
    }

    /// <summary>Analyses the markup currently shown in textBox2.</summary>
    private void button4_Click(object sender, EventArgs e)
    {
        StartAnalyse();
    }

    /// <summary>Intentionally empty; kept because the form designer may still reference it.</summary>
    private void textBox2_TextChanged(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Runs the XPath expression typed into comboBox1 and copies the expression to comboBox2
    /// (also when the query fails, as before).
    /// </summary>
    private void OnXPath(object sender, EventArgs e)
    {
        TryShowFirstMatch(comboBox1.Text);
        comboBox2.Text = comboBox1.Text;
    }

    /// <summary>
    /// Collects an attribute value from the descendants of the node selected by comboBox2:
    /// textBox6 names the descendant element, textBox7 the attribute. The values are shown
    /// in a new <c>Form2</c> window.
    /// </summary>
    private void button6_Click(object sender, EventArgs e)
    {
        string strLabel = textBox6.Text;   // element name
        string strValue = textBox7.Text;   // attribute name
        string strXPath = comboBox2.Text;  // root of the search

        HtmlNode node;
        try
        {
            node = _document.DocumentNode.SelectSingleNode(strXPath);
        }
        catch (Exception ex)
        {
            // An invalid or empty expression throws here; report it instead of crashing.
            MessageBox.Show("表达式有误" + ex.ToString());
            return;
        }
        if (node == null)
        {
            return;
        }

        Form2 f2 = new Form2();
        try
        {
            // "descendant::" is relative to the selected node; "//name" would search the whole document.
            HtmlNodeCollection hrefs = node.SelectNodes("descendant::" + strLabel);
            if (hrefs != null)
            {
                foreach (HtmlNode href in hrefs)
                {
                    HtmlAttribute attribute = href.Attributes[strValue];
                    if (attribute == null)
                    {
                        continue;
                    }
                    f2.AddData2ListView(strLabel, strValue, attribute.Value);
                }
            }
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.ToString());
        }

        // As before, the result window is shown even when it stays empty.
        f2.Show();
    }
}
}
源代码下载地址:
http://download.csdn.net/detail/witch_soya/4978587