// html标签及xpath处理相关方法
// 直接上代码
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Web;
using HtmlAgilityPack;
namespace TL.NewsGatheringService.Business.Implement
{
/// <summary>
/// 字符串处理类
/// </summary>
public class StrHelperUtil
{
/// <summary>
/// Strips HTML markup from a fragment and decodes a small set of character entities.
/// </summary>
/// <remarks>
/// Processing order: script elements (with their content) are removed, then every other tag,
/// then line breaks together with the whitespace that follows them, then comment leftovers.
/// After that the entities quot (removed), amp, lt, gt, nbsp, iexcl, cent, pound and copy are
/// handled, any remaining decimal numeric entity is removed, and finally every remaining
/// '&lt;' and '&gt;' character and every CR LF pair is deleted.
/// </remarks>
/// <param name="strHtml">HTML text to clean. <c>null</c> or empty input yields an empty string.</param>
/// <returns>The input with markup removed.</returns>
public static string FormatHtml(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // Remove scripts. Singleline lets '.' cross line breaks so multi-line script bodies are removed too.
    strHtml = Regex.Replace(strHtml, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    // Remove every remaining tag.
    strHtml = Regex.Replace(strHtml, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
    // Remove a line break together with the whitespace run that follows it.
    strHtml = Regex.Replace(strHtml, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
    // Remove comment leftovers that the tag pattern did not consume.
    strHtml = Regex.Replace(strHtml, @"-->", "", RegexOptions.IgnoreCase);
    strHtml = Regex.Replace(strHtml, @"<!--.*", "", RegexOptions.IgnoreCase);
    // Named / numeric entities. Note that the quote entity is dropped, not decoded.
    // NOTE(review): amp is decoded before lt/gt, so "&amp;lt;" is decoded twice — confirm this is acceptable.
    strHtml = Regex.Replace(strHtml, @"&(quot|#34);", "", RegexOptions.IgnoreCase);
    strHtml = Regex.Replace(strHtml, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    strHtml = Regex.Replace(strHtml, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    strHtml = Regex.Replace(strHtml, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    strHtml = Regex.Replace(strHtml, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    strHtml = Regex.Replace(strHtml, @"&(iexcl|#161);", "\u00a1", RegexOptions.IgnoreCase);
    strHtml = Regex.Replace(strHtml, @"&(cent|#162);", "\u00a2", RegexOptions.IgnoreCase);
    strHtml = Regex.Replace(strHtml, @"&(pound|#163);", "\u00a3", RegexOptions.IgnoreCase);
    strHtml = Regex.Replace(strHtml, @"&(copy|#169);", "\u00a9", RegexOptions.IgnoreCase);
    // Any other decimal numeric entity is discarded.
    strHtml = Regex.Replace(strHtml, @"&#(\d+);", "", RegexOptions.IgnoreCase);
    strHtml = Regex.Replace(strHtml, @"<img[^>]*>;", "", RegexOptions.IgnoreCase);
    // Delete stray angle brackets (including the ones produced by decoding lt/gt above).
    strHtml = strHtml.Replace("<", "");
    strHtml = strHtml.Replace(">", "");
    strHtml = strHtml.Replace("\r\n", "");
    return strHtml;
}
/// <summary>
/// Extracts the first http, https or ftp URL found in a piece of HTML.
/// </summary>
/// <param name="strHtml">Text to search. May be <c>null</c> or empty.</param>
/// <returns>
/// The first URL matched (matching is case-insensitive), or an empty string when the input
/// is <c>null</c>, empty, or contains no URL.
/// </returns>
public static string GetUrlFromHtml(string strHtml)
{
    // scheme://  (host name | dotted IPv4 address)  optional :port  optional /path
    // NOTE(review): inside the path class "/-~" is parsed as a character range ('/' through '~'),
    // so characters such as '<', '>', '?' and '=' are accepted in the path — confirm this is intended.
    const string urlPattern = @"((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-,]*)?";

    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // Only the first match is returned; joining several matches together would not be a usable URL.
    Match firstUrl = Regex.Match(strHtml, urlPattern, RegexOptions.IgnoreCase);
    return firstUrl.Success ? firstUrl.Value : string.Empty;
}
/// <summary>
/// Returns the text of every match of a regular expression, concatenated in match order.
/// </summary>
/// <param name="inputStr">Source string to search.</param>
/// <param name="strPattern">Regular expression, e.g. <c>\d{2,20}</c>. Matching is case-insensitive.</param>
/// <returns>
/// All matched substrings joined without a separator. An empty string is returned when nothing
/// matches, when either argument is <c>null</c>, or when the pattern is not a valid expression.
/// </returns>
public static string GetStrByRegx(string inputStr, string strPattern)
{
    if (inputStr == null || strPattern == null)
    {
        return string.Empty;
    }

    try
    {
        MatchCollection matches = Regex.Matches(inputStr, strPattern, RegexOptions.IgnoreCase);
        return string.Concat(matches.Cast<Match>().Select(m => m.Value));
    }
    catch (ArgumentException)
    {
        // The pattern could not be parsed. This helper is best-effort, so report "nothing matched"
        // instead of propagating; unrelated exception types are no longer swallowed.
        return string.Empty;
    }
}
/// <summary>
/// Collects the value of the named group <c>gname</c> from every match of a regular expression.
/// </summary>
/// <param name="inputStr">Source string to search.</param>
/// <param name="strPattern">
/// Regular expression containing a group named <c>gname</c>, e.g.
/// <c>&lt;a id="ctl00_M_dtgResumeList(?&lt;gname&gt;.*?).*&gt;</c>. Matching is case-insensitive.
/// </param>
/// <returns>
/// The trimmed group values in match order; values that are empty after trimming are left out.
/// </returns>
public static List<string> GetListStrByRegxGroup(string inputStr, string strPattern)
{
    return Regex.Matches(inputStr, strPattern, RegexOptions.IgnoreCase)
        .Cast<Match>()
        .Select(found => found.Groups["gname"].Value.Trim())
        .Where(captured => captured.Length > 0)
        .ToList();
}
/// <summary>
/// Selects nodes from an HTML string with an XPath expression and concatenates their inner HTML,
/// appending a separator after each node (including the last one).
/// </summary>
/// <param name="content">HTML text to parse.</param>
/// <param name="xpath">XPath expression selecting the nodes.</param>
/// <param name="separ">Separator appended after each node's inner HTML.</param>
/// <returns>The concatenated inner HTML, or an empty string when the node selection returns <c>null</c>.</returns>
public static string GetStrByXPath(string content, string xpath, string separ)
{
    var document = new HtmlDocument();
    document.LoadHtml(content);

    var selected = document.DocumentNode.SelectNodes(xpath);
    return selected == null
        ? ""
        : string.Concat(selected.Select(item => item.InnerHtml + separ));
}
/// <summary>
/// Selects nodes from an HTML string with an XPath expression and concatenates their inner HTML,
/// wrapping each node's inner HTML in a prefix and a suffix.
/// </summary>
/// <param name="content">HTML text to parse.</param>
/// <param name="xpath">XPath expression selecting the nodes.</param>
/// <param name="preSepar">Text placed before each node's inner HTML.</param>
/// <param name="lastSepar">Text placed after each node's inner HTML.</param>
/// <returns>The concatenated result, or an empty string when the node selection returns <c>null</c>.</returns>
public static string GetStrByXPath(string content, string xpath, string preSepar, string lastSepar)
{
    var html = new HtmlDocument();
    html.LoadHtml(content);

    var hits = html.DocumentNode.SelectNodes(xpath);
    var buffer = new System.Text.StringBuilder();
    if (hits != null)
    {
        foreach (var hit in hits)
        {
            buffer.Append(preSepar).Append(hit.InnerHtml).Append(lastSepar);
        }
    }

    return buffer.ToString();
}
/// <summary>
/// Selects nodes from an HTML string with an XPath expression and concatenates the value of one
/// attribute taken from each selected node.
/// </summary>
/// <param name="content">HTML text to parse.</param>
/// <param name="xpath">XPath expression selecting the nodes.</param>
/// <param name="attrName">Name of the attribute to read.</param>
/// <returns>
/// The attribute values joined without a separator. Nodes that do not carry the attribute are
/// skipped. An empty string is returned when the node selection returns <c>null</c>.
/// </returns>
public static string GetAttrValueByXPath(string content, string xpath, string attrName)
{
    var document = new HtmlDocument();
    document.LoadHtml(content);

    var selected = document.DocumentNode.SelectNodes(xpath);
    if (selected == null)
    {
        return "";
    }

    var values = new System.Text.StringBuilder();
    foreach (var node in selected)
    {
        // A selected node may lack the attribute; the lookup is then null and must not be dereferenced.
        var attribute = node.Attributes[attrName];
        if (attribute != null)
        {
            values.Append(attribute.Value);
        }
    }

    return values.ToString();
}
/// <summary>
/// Selects nodes from an HTML string with an XPath expression and concatenates the value of one
/// attribute taken from each selected node, wrapping every value in a prefix and a suffix.
/// </summary>
/// <param name="content">HTML text to parse.</param>
/// <param name="xpath">XPath expression selecting the nodes.</param>
/// <param name="attrName">Name of the attribute to read.</param>
/// <param name="preSepar">Text placed before each attribute value.</param>
/// <param name="lastSepar">Text placed after each attribute value.</param>
/// <returns>
/// The wrapped attribute values joined together. Nodes that do not carry the attribute are
/// skipped entirely (no prefix or suffix is emitted for them). An empty string is returned when
/// the node selection returns <c>null</c>.
/// </returns>
public static string GetAttrValueByXPath(string content, string xpath, string attrName, string preSepar, string lastSepar)
{
    var document = new HtmlDocument();
    document.LoadHtml(content);

    var selected = document.DocumentNode.SelectNodes(xpath);
    if (selected == null)
    {
        return "";
    }

    var values = new System.Text.StringBuilder();
    foreach (var node in selected)
    {
        // A selected node may lack the attribute; the lookup is then null and must not be dereferenced.
        var attribute = node.Attributes[attrName];
        if (attribute == null)
        {
            continue;
        }

        values.Append(preSepar).Append(attribute.Value).Append(lastSepar);
    }

    return values.ToString();
}
/// <summary>
/// Encodes every character of a string as "/u" followed by its code in lowercase hexadecimal
/// (no zero padding). All characters are encoded, not only Chinese ones.
/// </summary>
/// <param name="str">Text to encode.</param>
/// <returns>The encoded text, or an empty string when <paramref name="str"/> is <c>null</c> or empty.</returns>
public static string chinese_to_unicode(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return "";
    }

    var encoded = new System.Text.StringBuilder();
    foreach (char ch in str)
    {
        encoded.Append("/u").Append(((int)ch).ToString("x"));
    }

    return encoded.ToString();
}
/// <summary>
/// Decodes text made of "/u" + hexadecimal code sequences (the format produced by
/// <c>chinese_to_unicode</c>) back into characters.
/// </summary>
/// <param name="str">Encoded text. Any text before the first 'u' is ignored.</param>
/// <returns>
/// The decoded characters. An empty string is returned when <paramref name="str"/> is
/// <c>null</c> or empty, or when any segment is not a hexadecimal number in the range
/// 0..10FFFF, so an error message is never returned as if it were decoded text.
/// </returns>
public static string unicode_to_chinese(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    // Drop the slashes, then cut at every 'u'. Element 0 is whatever preceded the first 'u'.
    string[] segments = str.Replace("/", "").Split('u');
    var decoded = new System.Text.StringBuilder(segments.Length);

    for (int i = 1; i < segments.Length; i++)
    {
        int code;
        bool parsed = int.TryParse(
            segments[i],
            System.Globalization.NumberStyles.HexNumber,
            System.Globalization.CultureInfo.InvariantCulture,
            out code);

        if (!parsed || code < 0 || code > 0x10FFFF)
        {
            // Malformed or out-of-range segment: give up instead of emitting garbage.
            return string.Empty;
        }

        if (code <= 0xFFFF)
        {
            decoded.Append((char)code);
        }
        else
        {
            // Code points above the BMP need a surrogate pair; a plain char cast would truncate them.
            decoded.Append(char.ConvertFromUtf32(code));
        }
    }

    return decoded.ToString();
}
/// <summary>
/// Replaces JavaScript-style escapes (a backslash, 'u', then exactly four hexadecimal digits)
/// with the character they denote. The match is case-insensitive; all other text is kept as is.
/// </summary>
/// <param name="str">Text that may contain such escapes.</param>
/// <returns>
/// The text with escapes decoded, or an empty string when <paramref name="str"/> is <c>null</c>
/// or empty (consistent with the other converters in this class).
/// </returns>
public static string unicode_to_chinese_js(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    return Regex.Replace(
        str,
        @"(?i)\\u([0-9a-f]{4})",
        escape => ((char)Convert.ToInt32(escape.Groups[1].Value, 16)).ToString());
}
/// <summary>
/// Rewrites every character in the range U+4E00..U+9FA5 as a JavaScript-style escape
/// (backslash, 'u', lowercase hexadecimal code); all other characters are copied unchanged.
/// </summary>
/// <param name="str">Text to encode.</param>
/// <returns>The encoded text, or an empty string when <paramref name="str"/> is <c>null</c> or empty.</returns>
public static string chinese_to_unicode_js(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return "";
    }

    var escaped = new System.Text.StringBuilder(str.Length);
    foreach (char ch in str)
    {
        bool inRange = ch >= '\u4e00' && ch <= '\u9fa5';
        if (inRange)
        {
            escaped.Append("\\u").Append(((int)ch).ToString("x"));
        }
        else
        {
            escaped.Append(ch);
        }
    }

    return escaped.ToString();
}
}
}