使用c#来进行html源码解析
本文介绍如何使用 C# 和 HtmlAgilityPack 包对 HTML 源码进行解析并提取信息。
HtmlAgilityPack安装
在 Visual Studio 菜单栏中依次点击 Tools -> NuGet Package Manager -> Manage NuGet Packages for Solution…,搜索 HtmlAgilityPack 并安装,此处不再赘述。
使用
不要忘了引用一下HtmlAgilityPack
using HtmlAgilityPack;
直接通过字符串来解析
/// <summary>
/// Parses an HTML string with HtmlAgilityPack and prints the extracted values to the console.
/// </summary>
/// <param name="htmlString">The HTML source to parse.</param>
/// <remarks>
/// The XPath expressions below are specific to one page layout and must be adapted per use case.
/// Nodes that are missing from the document are skipped instead of causing a NullReferenceException.
/// </remarks>
public void HtmlStringPattern(string htmlString)
{
    // Load the HTML string into a document.
    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(htmlString);

    // The extraction below is page-specific; adjust the XPath to the target markup.
    // SelectSingleNode returns null when nothing matches, so guard before dereferencing.
    HtmlNode version = htmlDocument.DocumentNode.SelectSingleNode("html/body//table[@class='basicinfo']/tr[5]/td[2]");
    if (version != null)
    {
        Console.WriteLine("version: " + version.InnerHtml.Trim());
    }

    string target_table = "html/body//table[@class='webgrid_table'][1]/tbody";
    List<string> result = new List<string>();

    // SelectNodes returns null (not an empty collection) when no row matches.
    HtmlNodeCollection rows = htmlDocument.DocumentNode.SelectNodes(target_table + "/tr");
    if (rows != null)
    {
        foreach (HtmlNode row in rows)
        {
            // Query relative to the row instead of re-evaluating the full XPath from the root each time.
            HtmlNode link = row.SelectSingleNode("td[2]/a");
            if (link != null)
            {
                // Rows without a link in the second cell (e.g. header rows) are skipped.
                result.Add(link.InnerHtml);
            }
        }
    }

    Console.WriteLine("version");
    foreach (string s in result)
    {
        Console.WriteLine(s);
    }
}
通过html文件
/// <summary>
/// Reads an HTML file from disk, strips its CRLF line breaks and loads it into an HtmlDocument.
/// </summary>
/// <param name="filePath">Path of the HTML file to read.</param>
public void HtmlFilePattern(string filePath)
{
    string htmlDoc;
    using (StreamReader sr = new StreamReader(filePath))
    {
        htmlDoc = sr.ReadToEnd();
    }

    // Use a regular (non-verbatim) literal: @"\r\n" would search for the four characters
    // backslash-r-backslash-n and never match an actual carriage-return/line-feed pair.
    htmlDoc = htmlDoc.Replace("\r\n", "");

    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(htmlDoc);
    // The extraction step is the same as in the string-based example.
    //....
    //
}
通过httpclient
/// <summary>
/// Downloads the page at the given URL and loads its HTML into an HtmlDocument.
/// </summary>
/// <param name="webURL">Address of the page to download.</param>
public void WebURLPattern(string webURL)
{
    // HttpHepler is a helper class that sends the HTTP request and returns the body as a string.
    string resultset = new HttpHepler().HttpClientRequest(webURL);

    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(resultset);
    // The extraction step is the same as in the string-based example.
    //....
    //
}
这里写了一个 HttpHepler 类来发送 HTTP 请求,代码如下:
/// <summary>
/// Helper that downloads the body of a URL as a string, either through HttpClient
/// or through the older HttpWebRequest API.
/// </summary>
public class HttpHepler
{
    public HttpHepler()
    {
    }

    /// <summary>
    /// Sends a GET request with HttpClient and returns the response body.
    /// </summary>
    /// <param name="webURL">Address to request.</param>
    /// <returns>The response body read as a string.</returns>
    public string HttpClientRequest(string webURL)
    {
        // NOTE(review): a new HttpClient per call can exhaust sockets under load; consider a
        // shared instance if callers do not rely on each call starting with a fresh client.
        // The calls block on .Result because this method is synchronous; failures surface
        // wrapped in AggregateException.
        using (HttpClient client = new HttpClient())
        using (HttpResponseMessage response = client.GetAsync(webURL).Result)
        using (HttpContent content = response.Content)
        {
            return content.ReadAsStringAsync().Result;
        }
    }

    /// <summary>
    /// Sends a GET request with HttpWebRequest and returns the response body.
    /// </summary>
    /// <param name="webURL">Address to request.</param>
    /// <returns>
    /// The response body decoded with the charset reported by the response (or the StreamReader
    /// default when none is reported); an empty string when the status code is not 200 OK.
    /// </returns>
    public string HttpWebRequest(string webURL)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webURL);
        request.CookieContainer = new CookieContainer();

        // 'using' releases the response and its stream on every path, including
        // non-OK statuses and exceptions thrown while reading.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return "";
            }

            using (Stream receiveStream = response.GetResponseStream())
            {
                // An empty charset would make Encoding.GetEncoding throw, so treat it like a missing one.
                string charset = response.CharacterSet;
                using (StreamReader readStream = string.IsNullOrEmpty(charset)
                    ? new StreamReader(receiveStream)
                    : new StreamReader(receiveStream, Encoding.GetEncoding(charset)))
                {
                    return readStream.ReadToEnd();
                }
            }
        }
    }
}