13.4. 去除HTML的标签tag:htmlRemoveTag
/*
* [Function]
* remove html tags and html comments, retain the text content
* [Input]
* html, with tag; may be null or empty
*
* [Output]
* pure content, no html tag
* empty string when the input is null or empty
*
* [Note]
* 1. comment nodes are removed before the text is collected, so comment
*    text never shows up in the result
* 2. NOTE(review): html entities (e.g. &amp;) are presumably left as-is by
*    InnerText -- confirm against the HtmlAgilityPack version in use
*/
public string htmlRemoveTag(string html)
{
    // nothing to parse: keep the original contract of returning an empty string
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(html);

    // 1. remove all comments
    //(1) get all comment nodes using XPATH; the result is checked for null
    //    because no collection is handed back when nothing matches
    HtmlNodeCollection commentNodeList = htmlDoc.DocumentNode.SelectNodes("//comment()");
    if (commentNodeList != null)
    {
        foreach (HtmlNode comment in commentNodeList)
        {
            //(2) remove comment node itself
            comment.ParentNode.RemoveChild(comment);
        }
    }

    // 2. get all content
    // a StringBuilder avoids re-copying the accumulated text for every node
    System.Text.StringBuilder filteredHtml = new System.Text.StringBuilder();
    foreach (var node in htmlDoc.DocumentNode.ChildNodes)
    {
        filteredHtml.Append(node.InnerText);
    }

    return filteredHtml.ToString();
}
例 13.4. htmlRemoveTag 的使用范例
HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(googleSearchRespHtml);
HtmlNodeCollection liNodeList = htmlDoc.DocumentNode.SelectNodes("//li[@class='g']");
foreach (HtmlNode liNode in liNodeList)
{
HtmlNode h3ANode = liNode.SelectSingleNode(".//h3[@class='r']/a");
if (h3ANode != null)
{
googleSearchResultItem singleResultItem = new googleSearchResultItem();
//string titleHtml = h3ANode.InnerHtml; //"Amritanandamayi Math to sponsor charity events - Times Of India"
string titleHtml = h3ANode.InnerText; //"Amritanandamayi Math to sponsor charity events - Times Of India"
string filteredTitle = crl.htmlRemoveTag(titleHtml);