使用的外部dll
HtmlAgilityPack 目的,将html源码解析成xml格式方便使用
1.通过url获取网站源码,这里是一个比较简单的方法,不过方法不怪乎简单,能用就行
public string getWeb(string url, Encoding en)
{
CookieCollection cookies = new CookieCollection();//如何从response.Headers["Set-Cookie"];中获取并设置CookieCollection的代码略
HttpWebResponse response = HttpWebResponseUtility.CreateGetHttpResponse(url, null, null, cookies);//下面有HttpWebResponseUtility类的下载
Stream sm = response.GetResponseStream();
System.IO.StreamReader streamReader = new System.IO.StreamReader(sm, en);//Encoding.Default
//将流转换为字符串
string html = streamReader.ReadToEnd();
streamReader.Close();
string cookieString = response.Headers["Set-Cookie"];
return html;
}
2.获取源码之后通过HtmlAgilityPack解析源码
//规则类
public class TestWeb {
public string testMatch { set; get; }//匹配规则
public string test { set; get; }//返回值
}
解析源码,返回List数据
public List<TestWeb> getLegalPage(Category category)
{
TestWeb test = new TestWeb();
test.testMatch = "//tr[2]/td[2]";//
CreateHtml ch = new CreateHtml("", "");
HtmlDocument document = new HtmlDocument();
document.LoadHtml(ch.getWeb("url路劲", "url编码"));
HtmlNode rootNode = document.DocumentNode;
HtmlNodeCollection categoryNodeList = rootNode.SelectNodes("//html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/div[1]/table");//匹配所有的数据
HtmlNode temp = null;
List<TestWeb> list = new List<TestWeb>();
TestWeb test1 = new TestWeb();
foreach (HtmlNode categoryNode in categoryNodeList)
{
temp = HtmlNode.CreateNode(categoryNode.OuterHtml);
try
{
test1 = new TestWeb();
test1.ggr = temp.SelectSingleNode(test.testMatch).InnerText.Replace(" ", "");
list.Add(test1);
}
catch {
}
}
return list;
}
一个简单的使用url获取数据的方法就完了。
这里提供一个http请求辅助类HttpWebResponseUtility(转载自-----)
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using System.DirectoryServices.Protocols;
using System.ServiceModel.Security;
using System.Net;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;
namespace Common
{
/// <summary>
/// 有关HTTP请求的辅助类
/// </summary>
public class HttpWebResponseUtility
{
private static readonly string DefaultUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
/// <summary>
/// 创建GET方式的HTTP请求
/// </summary>
/// <param name="url">请求的URL</param>
/// <param name="timeout">请求的超时时间</param>
/// <param name="userAgent">请求的客户端浏览器信息,可以为空</param>
/// <param name="cookies">随同HTTP请求发送的Cookie信息,如果不需要身份验证可以为空</param>
/// <returns></returns>
public static HttpWebResponse CreateGetHttpResponse(string url, int? timeout, string userAgent, CookieCollection cookies)
{
if (string.IsNullOrEmpty(url))
{
throw new ArgumentNullException("url");
}
HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
request.Method = "GET";
request.UserAgent = DefaultUserAgent;
if (!string.IsNullOrEmpty(userAgent))
{
request.UserAgent = userAgent;
}
if (timeout.HasValue)
{
request.Timeout = timeout.Value;
}
if (cookies != null)
{
request.CookieContainer = new CookieContainer();
request.CookieContainer.Add(cookies);
}
return request.GetResponse() as HttpWebResponse;
}
/// <summary>
/// 创建POST方式的HTTP请求
/// </summary>
/// <param name="url">请求的URL</param>
/// <param name="parameters">随同请求POST的参数名称及参数值字典</param>
/// <param name="timeout">请求的超时时间</param>
/// <param name="userAgent">请求的客户端浏览器信息,可以为空</param>
/// <param name="requestEncoding">发送HTTP请求时所用的编码</param>
/// <param name="cookies">随同HTTP请求发送的Cookie信息,如果不需要身份验证可以为空</param>
/// <returns></returns>
public static HttpWebResponse CreatePostHttpResponse(string url, IDictionary<string, string> parameters, int? timeout, string userAgent, Encoding requestEncoding, CookieCollection cookies)
{
if (string.IsNullOrEmpty(url))
{
throw new ArgumentNullException("url");
}
if (requestEncoding == null)
{
throw new ArgumentNullException("requestEncoding");
}
HttpWebRequest request = null;
//如果是发送HTTPS请求
if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
{
ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
request = WebRequest.Create(url) as HttpWebRequest;
request.ProtocolVersion = HttpVersion.Version10;
}
else
{
request = WebRequest.Create(url) as HttpWebRequest;
}
request.Method = "POST";
request.ContentType = "application/x-www-form-urlencoded";
if (!string.IsNullOrEmpty(userAgent))
{
request.UserAgent = userAgent;
}
else
{
request.UserAgent = DefaultUserAgent;
}
if (timeout.HasValue)
{
request.Timeout = timeout.Value;
}
if (cookies != null)
{
request.CookieContainer = new CookieContainer();
request.CookieContainer.Add(cookies);
}
//如果需要POST数据
if (!(parameters == null || parameters.Count == 0))
{
StringBuilder buffer = new StringBuilder();
int i = 0;
foreach (string key in parameters.Keys)
{
if (i > 0)
{
buffer.AppendFormat("&{0}={1}", key, parameters[key]);
}
else
{
buffer.AppendFormat("{0}={1}", key, parameters[key]);
}
i++;
}
byte[] data = requestEncoding.GetBytes(buffer.ToString());
using (Stream stream = request.GetRequestStream())
{
stream.Write(data, 0, data.Length);
}
}
return request.GetResponse() as HttpWebResponse;
}
private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
{
return true; //总是接受
}
}
}