using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace Public
{
public class HtmlHelper
{
/// <summary>
/// Collects the trimmed, non-empty values of every capture group (group 1 and up)
/// from all matches of <paramref name="rule"/> in <paramref name="strHTML"/>.
/// </summary>
/// <param name="strHTML">Source text to search.</param>
/// <param name="rule">Regular expression; placeholders such as #content# are expanded by CreatePattern first.</param>
/// <returns>
/// The captured values in match order (possibly an empty list when every group is blank);
/// <c>null</c> when either argument is null or empty, when nothing matches,
/// or when the expanded rule is not a valid regular expression.
/// </returns>
public static List<string> GetMatchString(string strHTML, string rule)
{
    if (string.IsNullOrEmpty(strHTML) || string.IsNullOrEmpty(rule))
    {
        return null;
    }
    try
    {
        Regex r = new Regex(CreatePattern(rule), RegexOptions.IgnoreCase);
        MatchCollection mc = r.Matches(strHTML);
        if (mc.Count == 0)
        {
            return null;
        }
        List<string> list = new List<string>();
        foreach (Match m in mc)
        {
            GroupCollection gc = m.Groups;
            // Group 0 is the whole match; only the explicit capture groups are collected.
            for (int i = 1; i < gc.Count; i++)
            {
                string value = gc[i].Value.Trim();
                if (value.Length > 0)
                {
                    list.Add(value);
                }
            }
        }
        return list;
    }
    catch (ArgumentException)
    {
        // Best effort: a rule that does not parse as a regular expression is treated
        // as "no result". Other, unexpected exceptions are no longer hidden.
        return null;
    }
}
/// <summary>
/// Tests whether the rule matches the text and reports the first non-blank capture group
/// of the first match.
/// </summary>
/// <param name="strHTML">Source text to search.</param>
/// <param name="rule">Regular expression; placeholders are expanded by CreatePattern first.</param>
/// <param name="groupValue">
/// Receives the trimmed value of the first capture group (index 1 and up) that is not blank,
/// or "" when there is no match or no such group.
/// </param>
/// <returns><c>true</c> when the rule matches; <c>false</c> otherwise or when an argument is null or empty.</returns>
public static bool Match(string strHTML, string rule, out string groupValue)
{
    groupValue = "";
    if (string.IsNullOrEmpty(strHTML) || string.IsNullOrEmpty(rule))
    {
        return false;
    }

    Match found = Regex.Match(strHTML, CreatePattern(rule), RegexOptions.IgnoreCase);
    if (!found.Success)
    {
        return false;
    }

    // Skip group 0 (the whole match) and stop at the first group with visible content.
    GroupCollection groups = found.Groups;
    int index = 1;
    while (index < groups.Count)
    {
        string candidate = groups[index].Value.Trim();
        if (candidate.Length > 0)
        {
            groupValue = candidate;
            break;
        }
        index++;
    }
    return true;
}
/// <summary>
/// Extracts a key/value pair from the first match of the rule, using the named
/// capture groups "key" and "value".
/// </summary>
/// <param name="strHTML">Source text to search.</param>
/// <param name="rule">Regular expression; placeholders are expanded by CreatePattern first.</param>
/// <param name="value">Receives the trimmed text of the group named "value", or "" when unavailable.</param>
/// <returns>
/// The trimmed text of the group named "key"; "" when an argument is null or empty,
/// when nothing matches, or when the rule defines no capture groups.
/// </returns>
public static string GetKeyValue(string strHTML, string rule, out string value)
{
    value = "";
    if (string.IsNullOrEmpty(strHTML) || string.IsNullOrEmpty(rule))
    {
        return "";
    }
    Regex r = new Regex(CreatePattern(rule), RegexOptions.IgnoreCase);
    Match m = r.Match(strHTML);
    if (!m.Success)
    {
        return "";
    }
    GroupCollection gc = m.Groups;
    if (gc.Count <= 1)
    {
        return "";
    }
    // The string indexer returns an unsuccessful group with an empty Value for a name
    // the pattern does not define, so no exception handling is needed here.
    value = gc["value"].Value.Trim();
    return gc["key"].Value.Trim();
}
/// <summary>
/// Splits a string at every match of a regular expression.
/// </summary>
/// <param name="str">Text to split.</param>
/// <param name="split">Regular expression used as the separator (not a literal string).</param>
/// <returns>The pieces of <paramref name="str"/>, or <c>null</c> when either argument is null.</returns>
public static string[] SplitString(string str, string split)
{
    if (str == null || split == null)
    {
        return null;
    }
    Regex separator = new Regex(split);
    return separator.Split(str);
}
/// <summary>
/// Tests whether the rule matches anywhere in the text (case-insensitive).
/// </summary>
/// <param name="strHTML">Source text to search.</param>
/// <param name="rule">
/// Regular expression; placeholders are expanded by CreatePattern first.
/// An empty rule yields an empty pattern, which matches any non-empty text.
/// </param>
/// <returns>
/// <c>true</c> when the rule matches; <c>false</c> when it does not, when
/// <paramref name="strHTML"/> is null or empty, or when <paramref name="rule"/> is null.
/// </returns>
public static bool IsMatch(string strHTML, string rule)
{
    // A null rule previously reached CreatePattern and failed with a NullReferenceException.
    if (string.IsNullOrEmpty(strHTML) || rule == null)
    {
        return false;
    }
    Regex r = new Regex(CreatePattern(rule), RegexOptions.IgnoreCase);
    return r.IsMatch(strHTML);
}
/// <summary>
/// Replaces every case-insensitive match of a regular expression in a string.
/// </summary>
/// <param name="orgStr">Text to process.</param>
/// <param name="rule">Regular expression to search for; used as given, without placeholder expansion.</param>
/// <param name="str">Replacement text; regex substitutions such as $1 are honoured.</param>
/// <returns>The text after replacement, or "" when <paramref name="orgStr"/> is null or empty.</returns>
public static string Replace(string orgStr, string rule, string str)
{
    if (string.IsNullOrEmpty(orgStr))
    {
        return "";
    }
    return Regex.Replace(orgStr, rule, str, RegexOptions.IgnoreCase);
}
/// <summary>
/// Expands the rule placeholders into regular-expression capture groups.
/// </summary>
/// <remarks>
/// #content# becomes a lazy "(.+?)", #url# becomes "([^&lt;&gt;"']+)", #parameter# becomes
/// "([^\}]+)". #title# and #keyword# become a group that captures up to, but not including,
/// the rule text that follows the placeholder.
/// </remarks>
/// <param name="rule">Rule text; must not be null. Leading and trailing whitespace is removed.</param>
/// <returns>The rule with every placeholder replaced.</returns>
private static string CreatePattern(string rule)
{
    rule = rule.Trim();
    // String.Replace leaves the text untouched when the placeholder is absent.
    rule = rule.Replace("#content#", "(.+?)");
    rule = rule.Replace("#url#", "([^<>\"\']+)");
    rule = ExpandBoundedPlaceholder(rule, "#title#");
    rule = ExpandBoundedPlaceholder(rule, "#keyword#");
    rule = rule.Replace("#parameter#", "([^\\}]+)");
    return rule;
}

/// <summary>
/// Replaces a placeholder with a group that stops before the rule text following it.
/// </summary>
/// <param name="rule">Rule text being expanded.</param>
/// <param name="placeholder">Placeholder token, e.g. "#title#".</param>
/// <returns>The rule with every occurrence of the placeholder replaced.</returns>
private static string ExpandBoundedPlaceholder(string rule, string placeholder)
{
    int first = rule.IndexOf(placeholder, StringComparison.Ordinal);
    if (first < 0)
    {
        return rule;
    }
    // The terminator is the text between the first occurrence and the next one
    // (or the end of the rule when the placeholder appears only once).
    int start = first + placeholder.Length;
    int next = rule.IndexOf(placeholder, start, StringComparison.Ordinal);
    string terminator = next < 0 ? rule.Substring(start) : rule.Substring(start, next - start);
    // With nothing after the placeholder the lookahead would be "(?!)", which never
    // matches; capture the rest of the line instead.
    string group = terminator.Length == 0
        ? "(.+)"
        : "((?:(?!" + terminator + ").)+)";
    return rule.Replace(placeholder, group);
}
/// <summary>
/// Percent-encodes a string using the named character encoding.
/// </summary>
/// <param name="str">Text to encode.</param>
/// <param name="encoding">
/// Encoding name. "utf-8" (any letter case) selects UTF8; every other value,
/// including null, selects GB2312.
/// </param>
/// <returns>The result of UTF8 or GB2312 for <paramref name="str"/>.</returns>
public static string GetEncoding(string str, string encoding)
{
    // Ordinal, case-insensitive comparison: no culture-dependent lowercasing and no
    // NullReferenceException when encoding is null.
    if (string.Equals(encoding, "utf-8", StringComparison.OrdinalIgnoreCase))
    {
        return UTF8(str);
    }
    return GB2312(str);
}
/// <summary>
/// Escapes reserved characters via ReplaceSpecialChar, then percent-encodes every
/// character that the GB2312 encoding represents with exactly two bytes.
/// </summary>
/// <param name="str">Text to encode; must not be null.</param>
/// <returns>The encoded text; all other characters are copied unchanged.</returns>
public static string GB2312(string str)
{
    string escaped = ReplaceSpecialChar(str);
    StringBuilder result = new StringBuilder();
    Encoding gb = Encoding.GetEncoding("GB2312");
    foreach (char ch in escaped)
    {
        byte[] bytes = gb.GetBytes(ch.ToString());
        if (bytes.Length != 2)
        {
            // Not a double-byte character: keep it as is.
            result.Append(ch);
            continue;
        }
        string lead = Convert.ToString(bytes[0], 16);
        string trail = Convert.ToString(bytes[1], 16);
        result.Append("%" + lead + "%" + trail);
    }
    return result.ToString();
}
/// <summary>
/// Escapes reserved characters via ReplaceSpecialChar, then percent-encodes every
/// character whose UTF-8 form is three or four bytes long (for example CJK characters).
/// </summary>
/// <remarks>
/// Characters that take one or two UTF-8 bytes are copied unchanged. Surrogate pairs are
/// encoded as one unit, so characters outside the Basic Multilingual Plane produce their
/// real four-byte sequence rather than two replacement characters.
/// </remarks>
/// <param name="str">Text to encode; must not be null.</param>
/// <returns>The encoded text, using lowercase hexadecimal digits.</returns>
public static string UTF8(string str)
{
    str = ReplaceSpecialChar(str);
    StringBuilder sb = new StringBuilder();
    Encoding en = Encoding.GetEncoding("UTF-8");
    for (int i = 0; i < str.Length; i++)
    {
        // Keep both halves of a surrogate pair together; encoding them separately
        // turns each half into U+FFFD.
        bool isPair = i + 1 < str.Length && char.IsSurrogatePair(str[i], str[i + 1]);
        string unit = str.Substring(i, isPair ? 2 : 1);
        byte[] byteCode = en.GetBytes(unit);
        if (byteCode.Length >= 3)
        {
            foreach (byte b in byteCode)
            {
                sb.Append('%').Append(b.ToString("x2"));
            }
        }
        else
        {
            sb.Append(unit);
        }
        if (isPair)
        {
            // The low surrogate has already been consumed.
            i++;
        }
    }
    return sb.ToString();
}
/// <summary>
/// Percent-escapes the characters '%', ' ', '&amp;', '?' and '='.
/// </summary>
/// <param name="str">Text to escape; must not be null.</param>
/// <returns>The text with each of those characters replaced by its %XX form.</returns>
public static string ReplaceChar(string str)
{
    StringBuilder escaped = new StringBuilder(str.Length);
    foreach (char ch in str)
    {
        switch (ch)
        {
            case '%':
                escaped.Append("%25");
                break;
            case ' ':
                escaped.Append("%20");
                break;
            case '&':
                escaped.Append("%26");
                break;
            case '?':
                escaped.Append("%3F");
                break;
            case '=':
                escaped.Append("%3D");
                break;
            default:
                escaped.Append(ch);
                break;
        }
    }
    return escaped.ToString();
}
/// <summary>
/// Percent-escapes the characters '%', ' ', '&amp;', '?', ':', '=', '/', '+' and '@'.
/// </summary>
/// <param name="str">Text to escape; must not be null.</param>
/// <returns>The text with each of those characters replaced by its %XX form.</returns>
public static string ReplaceSpecialChar(string str)
{
    StringBuilder escaped = new StringBuilder(str.Length);
    foreach (char ch in str)
    {
        switch (ch)
        {
            case '%':
                escaped.Append("%25");
                break;
            case ' ':
                escaped.Append("%20");
                break;
            case '&':
                escaped.Append("%26");
                break;
            case '?':
                escaped.Append("%3F");
                break;
            case ':':
                escaped.Append("%3A");
                break;
            case '=':
                escaped.Append("%3D");
                break;
            case '/':
                escaped.Append("%2F");
                break;
            case '+':
                escaped.Append("%2B");
                break;
            case '@':
                escaped.Append("%40");
                break;
            default:
                escaped.Append(ch);
                break;
        }
    }
    return escaped.ToString();
}
/// <summary>
/// Returns the content captured by the rule in the first match.
/// Per the original note, used for question/answer list areas, question titles and best answers.
/// </summary>
/// <param name="strHTML">Source text to search.</param>
/// <param name="rule">Regular expression; placeholders are expanded by CreatePattern first.</param>
/// <returns>
/// The trimmed value of the first non-empty capture group (index 1 and up) of the first match;
/// "" when an argument is null or empty, nothing matches, or every group is empty.
/// </returns>
public static string GetHtmlContent(string strHTML, string rule)
{
    if (string.IsNullOrEmpty(strHTML) || string.IsNullOrEmpty(rule))
    {
        return "";
    }

    Match hit = Regex.Match(strHTML, CreatePattern(rule), RegexOptions.IgnoreCase);
    if (!hit.Success)
    {
        return "";
    }

    GroupCollection groups = hit.Groups;
    for (int g = 1; g < groups.Count; g++)
    {
        string raw = groups[g].Value;
        // Emptiness is checked before trimming, so a whitespace-only group yields "".
        if (raw.Length > 0)
        {
            return raw.Trim();
        }
    }
    return "";
}
/// <summary>
/// Strips HTML markup from a string.
/// </summary>
/// <remarks>
/// Elements opened by a tag starting with "a", "script" or "style" are removed together
/// with their content; for all other tags only the tag itself is removed.
/// </remarks>
/// <param name="strHTML">HTML text to filter.</param>
/// <returns>The filtered text, or "" when <paramref name="strHTML"/> is null or empty.</returns>
public static string FilerHTML(string strHTML)
{
    if (string.IsNullOrEmpty(strHTML))
    {
        return "";
    }
    // First pass: drop whole elements including their inner text.
    string withoutBlocks = Regex.Replace(strHTML, "<(a|(?:script)|(?:style))[^>]*?>[\\d\\D]*?</\\1>", "", RegexOptions.IgnoreCase);
    // Second pass: drop any remaining tag.
    return Regex.Replace(withoutBlocks, "<[^<>]+?>", "", RegexOptions.IgnoreCase);
}
/// <summary>
/// Checks whether a string looks like a domain name: two or more dot-separated labels
/// made of word characters and hyphens.
/// </summary>
/// <param name="dm">Domain name to check; must not be null.</param>
/// <returns><c>true</c> when the whole string has that form.</returns>
public static bool ValidDomainName(string dm)
{
    const string pattern = "^[\\w-]+(\\.[\\w-]+)+$";
    return Regex.IsMatch(dm, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Checks whether a string is a landline number of the form
/// 3-5 digits, '-', 7-8 digits, optionally followed by '-' and 3-6 digits
/// (presumably area code, number and extension).
/// </summary>
/// <param name="phone">Phone number to check; must not be null.</param>
/// <returns><c>true</c> when the whole string has that form.</returns>
public static bool ValidPhone(string phone)
{
    const string pattern = "^[\\d]{3,5}-[\\d]{7,8}(-[\\d]{3,6})?$";
    return Regex.IsMatch(phone, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Checks whether a string is an 11-digit mobile number: '1', then a digit from 3 to 9,
/// then nine more digits.
/// </summary>
/// <param name="mobile">Mobile number to check; must not be null.</param>
/// <returns><c>true</c> when the whole string has that form.</returns>
public static bool ValidMobile(string mobile)
{
    const string pattern = "^1[3-9]\\d{9}$";
    return Regex.IsMatch(mobile, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Checks whether a string is a QQ number, i.e. consists of 5 to 11 digits.
/// </summary>
/// <param name="qq">QQ number to check; must not be null.</param>
/// <returns><c>true</c> when the whole string has that form.</returns>
public static bool ValidQQ(string qq)
{
    const string pattern = "^\\d{5,11}$";
    return Regex.IsMatch(qq, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Checks whether a string is an http or https URL: scheme, a host with at least one dot,
/// and an optional path/query made of word characters and - . / ? % &amp; =.
/// </summary>
/// <param name="qq">URL to check; must not be null (the parameter name is historical).</param>
/// <returns><c>true</c> when the whole string has that form.</returns>
public static bool ValidUrl(string qq)
{
    const string pattern = "^https?://([\\w-]+\\.)+[\\w-]+(/[\\w-.\\/?%&=]*)?$";
    return Regex.IsMatch(qq, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Converts HTML to UBB markup.
/// </summary>
/// <remarks>
/// img, a, b/u/i, h1-h6 and div elements are rewritten to [img], [url=], [b]/[u]/[i],
/// [h1]-[h6] and [CODE] tags in that order; whatever HTML remains is removed by FilerHTML.
/// </remarks>
/// <param name="html">HTML text to convert.</param>
/// <returns>The UBB text, or "" when <paramref name="html"/> is null or empty.</returns>
public static string HtmlToUBB(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    // Each row is { pattern, replacement }; the rows are applied in order.
    string[,] conversions =
    {
        { "<img.+?src=\\s*[\"']?([^\"' ]+)[\"']?[\\d\\D]*?>", "[img]$1[/img]" },
        { "<a.+?href=\\s*[\"']?([^\"']+)[\"' ]?[\\d\\D]*?>([\\d\\D]+?)</a>", "[url=$1]$2[/url]" },
        { "<([bui])>([\\d\\D]+?)</\\1>", "[$1]$2[/$1]" },
        { "<(h[1-6])[\\d\\D]+?>([\\d\\D]+?)</\\1>", "[$1]$2[/$1]" },
        { "<div[\\d\\D]+?>([\\d\\D]+?)</div>", "[CODE]$1[/CODE]" }
    };

    string ubb = html;
    for (int row = 0; row < conversions.GetLength(0); row++)
    {
        ubb = Regex.Replace(ubb, conversions[row, 0], conversions[row, 1], RegexOptions.IgnoreCase);
    }
    return FilerHTML(ubb);
}
/// <summary>
/// Extracts the main (registrable) domain from a host name: the last label before a run of
/// known suffixes (com, cn, org, net, edu, info, gov, cc, tv, hk, me) plus those suffixes,
/// e.g. "www.example.com.cn" gives "example.com.cn".
/// </summary>
/// <param name="domainName">Host name to analyse.</param>
/// <returns>
/// The main domain; the input itself when it does not end in a known suffix;
/// "" when the input is null or empty.
/// </returns>
public static string GetMainDomainName(string domainName)
{
    // The original used the non-short-circuit '|', which evaluated domainName.Length
    // even for null and threw a NullReferenceException.
    if (string.IsNullOrEmpty(domainName))
    {
        return "";
    }
    string mainDom = HtmlHelper.GetHtmlContent(domainName, "([^.]+(?:\\.(?:com|cn|org|net|edu|info|gov|cc|tv|hk|me))+$)");
    if (mainDom.Length == 0)
    {
        return domainName;
    }
    return mainDom;
}
}
}
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace Public
{
public class HtmlHelper
{
/// <summary>
/// Collects the trimmed, non-empty values of every capture group (group 1 and up)
/// from all matches of <paramref name="rule"/> in <paramref name="strHTML"/>.
/// </summary>
/// <param name="strHTML">Source text to search.</param>
/// <param name="rule">Regular expression; placeholders such as #content# are expanded by CreatePattern first.</param>
/// <returns>
/// The captured values in match order (possibly an empty list when every group is blank);
/// <c>null</c> when either argument is null or empty, when nothing matches,
/// or when the expanded rule is not a valid regular expression.
/// </returns>
public static List<string> GetMatchString(string strHTML, string rule)
{
    if (string.IsNullOrEmpty(strHTML) || string.IsNullOrEmpty(rule))
    {
        return null;
    }
    try
    {
        Regex r = new Regex(CreatePattern(rule), RegexOptions.IgnoreCase);
        MatchCollection mc = r.Matches(strHTML);
        if (mc.Count == 0)
        {
            return null;
        }
        List<string> list = new List<string>();
        foreach (Match m in mc)
        {
            GroupCollection gc = m.Groups;
            // Group 0 is the whole match; only the explicit capture groups are collected.
            for (int i = 1; i < gc.Count; i++)
            {
                string value = gc[i].Value.Trim();
                if (value.Length > 0)
                {
                    list.Add(value);
                }
            }
        }
        return list;
    }
    catch (ArgumentException)
    {
        // Best effort: a rule that does not parse as a regular expression is treated
        // as "no result". Other, unexpected exceptions are no longer hidden.
        return null;
    }
}
/// <summary>
/// Tests whether the rule matches the text and reports the first non-blank capture group
/// of the first match.
/// </summary>
/// <param name="strHTML">Source text to search.</param>
/// <param name="rule">Regular expression; placeholders are expanded by CreatePattern first.</param>
/// <param name="groupValue">
/// Receives the trimmed value of the first capture group (index 1 and up) that is not blank,
/// or "" when there is no match or no such group.
/// </param>
/// <returns><c>true</c> when the rule matches; <c>false</c> otherwise or when an argument is null or empty.</returns>
public static bool Match(string strHTML, string rule, out string groupValue)
{
    groupValue = "";
    if (string.IsNullOrEmpty(strHTML) || string.IsNullOrEmpty(rule))
    {
        return false;
    }

    Match found = Regex.Match(strHTML, CreatePattern(rule), RegexOptions.IgnoreCase);
    if (!found.Success)
    {
        return false;
    }

    // Skip group 0 (the whole match) and stop at the first group with visible content.
    GroupCollection groups = found.Groups;
    int index = 1;
    while (index < groups.Count)
    {
        string candidate = groups[index].Value.Trim();
        if (candidate.Length > 0)
        {
            groupValue = candidate;
            break;
        }
        index++;
    }
    return true;
}
/// <summary>
/// Extracts a key/value pair from the first match of the rule, using the named
/// capture groups "key" and "value".
/// </summary>
/// <param name="strHTML">Source text to search.</param>
/// <param name="rule">Regular expression; placeholders are expanded by CreatePattern first.</param>
/// <param name="value">Receives the trimmed text of the group named "value", or "" when unavailable.</param>
/// <returns>
/// The trimmed text of the group named "key"; "" when an argument is null or empty,
/// when nothing matches, or when the rule defines no capture groups.
/// </returns>
public static string GetKeyValue(string strHTML, string rule, out string value)
{
    value = "";
    if (string.IsNullOrEmpty(strHTML) || string.IsNullOrEmpty(rule))
    {
        return "";
    }
    Regex r = new Regex(CreatePattern(rule), RegexOptions.IgnoreCase);
    Match m = r.Match(strHTML);
    if (!m.Success)
    {
        return "";
    }
    GroupCollection gc = m.Groups;
    if (gc.Count <= 1)
    {
        return "";
    }
    // The string indexer returns an unsuccessful group with an empty Value for a name
    // the pattern does not define, so no exception handling is needed here.
    value = gc["value"].Value.Trim();
    return gc["key"].Value.Trim();
}
/// <summary>
/// Splits a string at every match of a regular expression.
/// </summary>
/// <param name="str">Text to split.</param>
/// <param name="split">Regular expression used as the separator (not a literal string).</param>
/// <returns>The pieces of <paramref name="str"/>, or <c>null</c> when either argument is null.</returns>
public static string[] SplitString(string str, string split)
{
    if (str == null || split == null)
    {
        return null;
    }
    Regex separator = new Regex(split);
    return separator.Split(str);
}
/// <summary>
/// Tests whether the rule matches anywhere in the text (case-insensitive).
/// </summary>
/// <param name="strHTML">Source text to search.</param>
/// <param name="rule">
/// Regular expression; placeholders are expanded by CreatePattern first.
/// An empty rule yields an empty pattern, which matches any non-empty text.
/// </param>
/// <returns>
/// <c>true</c> when the rule matches; <c>false</c> when it does not, when
/// <paramref name="strHTML"/> is null or empty, or when <paramref name="rule"/> is null.
/// </returns>
public static bool IsMatch(string strHTML, string rule)
{
    // A null rule previously reached CreatePattern and failed with a NullReferenceException.
    if (string.IsNullOrEmpty(strHTML) || rule == null)
    {
        return false;
    }
    Regex r = new Regex(CreatePattern(rule), RegexOptions.IgnoreCase);
    return r.IsMatch(strHTML);
}
/// <summary>
/// Replaces every case-insensitive match of a regular expression in a string.
/// </summary>
/// <param name="orgStr">Text to process.</param>
/// <param name="rule">Regular expression to search for; used as given, without placeholder expansion.</param>
/// <param name="str">Replacement text; regex substitutions such as $1 are honoured.</param>
/// <returns>The text after replacement, or "" when <paramref name="orgStr"/> is null or empty.</returns>
public static string Replace(string orgStr, string rule, string str)
{
    if (string.IsNullOrEmpty(orgStr))
    {
        return "";
    }
    return Regex.Replace(orgStr, rule, str, RegexOptions.IgnoreCase);
}
/// <summary>
/// Expands the rule placeholders into regular-expression capture groups.
/// </summary>
/// <remarks>
/// #content# becomes a lazy "(.+?)", #url# becomes "([^&lt;&gt;"']+)", #parameter# becomes
/// "([^\}]+)". #title# and #keyword# become a group that captures up to, but not including,
/// the rule text that follows the placeholder.
/// </remarks>
/// <param name="rule">Rule text; must not be null. Leading and trailing whitespace is removed.</param>
/// <returns>The rule with every placeholder replaced.</returns>
private static string CreatePattern(string rule)
{
    rule = rule.Trim();
    // String.Replace leaves the text untouched when the placeholder is absent.
    rule = rule.Replace("#content#", "(.+?)");
    rule = rule.Replace("#url#", "([^<>\"\']+)");
    rule = ExpandBoundedPlaceholder(rule, "#title#");
    rule = ExpandBoundedPlaceholder(rule, "#keyword#");
    rule = rule.Replace("#parameter#", "([^\\}]+)");
    return rule;
}

/// <summary>
/// Replaces a placeholder with a group that stops before the rule text following it.
/// </summary>
/// <param name="rule">Rule text being expanded.</param>
/// <param name="placeholder">Placeholder token, e.g. "#title#".</param>
/// <returns>The rule with every occurrence of the placeholder replaced.</returns>
private static string ExpandBoundedPlaceholder(string rule, string placeholder)
{
    int first = rule.IndexOf(placeholder, StringComparison.Ordinal);
    if (first < 0)
    {
        return rule;
    }
    // The terminator is the text between the first occurrence and the next one
    // (or the end of the rule when the placeholder appears only once).
    int start = first + placeholder.Length;
    int next = rule.IndexOf(placeholder, start, StringComparison.Ordinal);
    string terminator = next < 0 ? rule.Substring(start) : rule.Substring(start, next - start);
    // With nothing after the placeholder the lookahead would be "(?!)", which never
    // matches; capture the rest of the line instead.
    string group = terminator.Length == 0
        ? "(.+)"
        : "((?:(?!" + terminator + ").)+)";
    return rule.Replace(placeholder, group);
}
/// <summary>
/// Percent-encodes a string using the named character encoding.
/// </summary>
/// <param name="str">Text to encode.</param>
/// <param name="encoding">
/// Encoding name. "utf-8" (any letter case) selects UTF8; every other value,
/// including null, selects GB2312.
/// </param>
/// <returns>The result of UTF8 or GB2312 for <paramref name="str"/>.</returns>
public static string GetEncoding(string str, string encoding)
{
    // Ordinal, case-insensitive comparison: no culture-dependent lowercasing and no
    // NullReferenceException when encoding is null.
    if (string.Equals(encoding, "utf-8", StringComparison.OrdinalIgnoreCase))
    {
        return UTF8(str);
    }
    return GB2312(str);
}
/// <summary>
/// Escapes reserved characters via ReplaceSpecialChar, then percent-encodes every
/// character that the GB2312 encoding represents with exactly two bytes.
/// </summary>
/// <param name="str">Text to encode; must not be null.</param>
/// <returns>The encoded text; all other characters are copied unchanged.</returns>
public static string GB2312(string str)
{
    string escaped = ReplaceSpecialChar(str);
    StringBuilder result = new StringBuilder();
    Encoding gb = Encoding.GetEncoding("GB2312");
    foreach (char ch in escaped)
    {
        byte[] bytes = gb.GetBytes(ch.ToString());
        if (bytes.Length != 2)
        {
            // Not a double-byte character: keep it as is.
            result.Append(ch);
            continue;
        }
        string lead = Convert.ToString(bytes[0], 16);
        string trail = Convert.ToString(bytes[1], 16);
        result.Append("%" + lead + "%" + trail);
    }
    return result.ToString();
}
/// <summary>
/// Escapes reserved characters via ReplaceSpecialChar, then percent-encodes every
/// character whose UTF-8 form is three or four bytes long (for example CJK characters).
/// </summary>
/// <remarks>
/// Characters that take one or two UTF-8 bytes are copied unchanged. Surrogate pairs are
/// encoded as one unit, so characters outside the Basic Multilingual Plane produce their
/// real four-byte sequence rather than two replacement characters.
/// </remarks>
/// <param name="str">Text to encode; must not be null.</param>
/// <returns>The encoded text, using lowercase hexadecimal digits.</returns>
public static string UTF8(string str)
{
    str = ReplaceSpecialChar(str);
    StringBuilder sb = new StringBuilder();
    Encoding en = Encoding.GetEncoding("UTF-8");
    for (int i = 0; i < str.Length; i++)
    {
        // Keep both halves of a surrogate pair together; encoding them separately
        // turns each half into U+FFFD.
        bool isPair = i + 1 < str.Length && char.IsSurrogatePair(str[i], str[i + 1]);
        string unit = str.Substring(i, isPair ? 2 : 1);
        byte[] byteCode = en.GetBytes(unit);
        if (byteCode.Length >= 3)
        {
            foreach (byte b in byteCode)
            {
                sb.Append('%').Append(b.ToString("x2"));
            }
        }
        else
        {
            sb.Append(unit);
        }
        if (isPair)
        {
            // The low surrogate has already been consumed.
            i++;
        }
    }
    return sb.ToString();
}
/// <summary>
/// Percent-escapes the characters '%', ' ', '&amp;', '?' and '='.
/// </summary>
/// <param name="str">Text to escape; must not be null.</param>
/// <returns>The text with each of those characters replaced by its %XX form.</returns>
public static string ReplaceChar(string str)
{
    StringBuilder escaped = new StringBuilder(str.Length);
    foreach (char ch in str)
    {
        switch (ch)
        {
            case '%':
                escaped.Append("%25");
                break;
            case ' ':
                escaped.Append("%20");
                break;
            case '&':
                escaped.Append("%26");
                break;
            case '?':
                escaped.Append("%3F");
                break;
            case '=':
                escaped.Append("%3D");
                break;
            default:
                escaped.Append(ch);
                break;
        }
    }
    return escaped.ToString();
}
/// <summary>
/// Percent-escapes the characters '%', ' ', '&amp;', '?', ':', '=', '/', '+' and '@'.
/// </summary>
/// <param name="str">Text to escape; must not be null.</param>
/// <returns>The text with each of those characters replaced by its %XX form.</returns>
public static string ReplaceSpecialChar(string str)
{
    StringBuilder escaped = new StringBuilder(str.Length);
    foreach (char ch in str)
    {
        switch (ch)
        {
            case '%':
                escaped.Append("%25");
                break;
            case ' ':
                escaped.Append("%20");
                break;
            case '&':
                escaped.Append("%26");
                break;
            case '?':
                escaped.Append("%3F");
                break;
            case ':':
                escaped.Append("%3A");
                break;
            case '=':
                escaped.Append("%3D");
                break;
            case '/':
                escaped.Append("%2F");
                break;
            case '+':
                escaped.Append("%2B");
                break;
            case '@':
                escaped.Append("%40");
                break;
            default:
                escaped.Append(ch);
                break;
        }
    }
    return escaped.ToString();
}
/// <summary>
/// Returns the content captured by the rule in the first match.
/// Per the original note, used for question/answer list areas, question titles and best answers.
/// </summary>
/// <param name="strHTML">Source text to search.</param>
/// <param name="rule">Regular expression; placeholders are expanded by CreatePattern first.</param>
/// <returns>
/// The trimmed value of the first non-empty capture group (index 1 and up) of the first match;
/// "" when an argument is null or empty, nothing matches, or every group is empty.
/// </returns>
public static string GetHtmlContent(string strHTML, string rule)
{
    if (string.IsNullOrEmpty(strHTML) || string.IsNullOrEmpty(rule))
    {
        return "";
    }

    Match hit = Regex.Match(strHTML, CreatePattern(rule), RegexOptions.IgnoreCase);
    if (!hit.Success)
    {
        return "";
    }

    GroupCollection groups = hit.Groups;
    for (int g = 1; g < groups.Count; g++)
    {
        string raw = groups[g].Value;
        // Emptiness is checked before trimming, so a whitespace-only group yields "".
        if (raw.Length > 0)
        {
            return raw.Trim();
        }
    }
    return "";
}
/// <summary>
/// Strips HTML markup from a string.
/// </summary>
/// <remarks>
/// Elements opened by a tag starting with "a", "script" or "style" are removed together
/// with their content; for all other tags only the tag itself is removed.
/// </remarks>
/// <param name="strHTML">HTML text to filter.</param>
/// <returns>The filtered text, or "" when <paramref name="strHTML"/> is null or empty.</returns>
public static string FilerHTML(string strHTML)
{
    if (string.IsNullOrEmpty(strHTML))
    {
        return "";
    }
    // First pass: drop whole elements including their inner text.
    string withoutBlocks = Regex.Replace(strHTML, "<(a|(?:script)|(?:style))[^>]*?>[\\d\\D]*?</\\1>", "", RegexOptions.IgnoreCase);
    // Second pass: drop any remaining tag.
    return Regex.Replace(withoutBlocks, "<[^<>]+?>", "", RegexOptions.IgnoreCase);
}
/// <summary>
/// Checks whether a string looks like a domain name: two or more dot-separated labels
/// made of word characters and hyphens.
/// </summary>
/// <param name="dm">Domain name to check; must not be null.</param>
/// <returns><c>true</c> when the whole string has that form.</returns>
public static bool ValidDomainName(string dm)
{
    const string pattern = "^[\\w-]+(\\.[\\w-]+)+$";
    return Regex.IsMatch(dm, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Checks whether a string is a landline number of the form
/// 3-5 digits, '-', 7-8 digits, optionally followed by '-' and 3-6 digits
/// (presumably area code, number and extension).
/// </summary>
/// <param name="phone">Phone number to check; must not be null.</param>
/// <returns><c>true</c> when the whole string has that form.</returns>
public static bool ValidPhone(string phone)
{
    const string pattern = "^[\\d]{3,5}-[\\d]{7,8}(-[\\d]{3,6})?$";
    return Regex.IsMatch(phone, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Checks whether a string is an 11-digit mobile number: '1', then a digit from 3 to 9,
/// then nine more digits.
/// </summary>
/// <param name="mobile">Mobile number to check; must not be null.</param>
/// <returns><c>true</c> when the whole string has that form.</returns>
public static bool ValidMobile(string mobile)
{
    const string pattern = "^1[3-9]\\d{9}$";
    return Regex.IsMatch(mobile, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Checks whether a string is a QQ number, i.e. consists of 5 to 11 digits.
/// </summary>
/// <param name="qq">QQ number to check; must not be null.</param>
/// <returns><c>true</c> when the whole string has that form.</returns>
public static bool ValidQQ(string qq)
{
    const string pattern = "^\\d{5,11}$";
    return Regex.IsMatch(qq, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Checks whether a string is an http or https URL: scheme, a host with at least one dot,
/// and an optional path/query made of word characters and - . / ? % &amp; =.
/// </summary>
/// <param name="qq">URL to check; must not be null (the parameter name is historical).</param>
/// <returns><c>true</c> when the whole string has that form.</returns>
public static bool ValidUrl(string qq)
{
    const string pattern = "^https?://([\\w-]+\\.)+[\\w-]+(/[\\w-.\\/?%&=]*)?$";
    return Regex.IsMatch(qq, pattern, RegexOptions.IgnoreCase);
}
/// <summary>
/// Converts HTML to UBB markup.
/// </summary>
/// <remarks>
/// img, a, b/u/i, h1-h6 and div elements are rewritten to [img], [url=], [b]/[u]/[i],
/// [h1]-[h6] and [CODE] tags in that order; whatever HTML remains is removed by FilerHTML.
/// </remarks>
/// <param name="html">HTML text to convert.</param>
/// <returns>The UBB text, or "" when <paramref name="html"/> is null or empty.</returns>
public static string HtmlToUBB(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    // Each row is { pattern, replacement }; the rows are applied in order.
    string[,] conversions =
    {
        { "<img.+?src=\\s*[\"']?([^\"' ]+)[\"']?[\\d\\D]*?>", "[img]$1[/img]" },
        { "<a.+?href=\\s*[\"']?([^\"']+)[\"' ]?[\\d\\D]*?>([\\d\\D]+?)</a>", "[url=$1]$2[/url]" },
        { "<([bui])>([\\d\\D]+?)</\\1>", "[$1]$2[/$1]" },
        { "<(h[1-6])[\\d\\D]+?>([\\d\\D]+?)</\\1>", "[$1]$2[/$1]" },
        { "<div[\\d\\D]+?>([\\d\\D]+?)</div>", "[CODE]$1[/CODE]" }
    };

    string ubb = html;
    for (int row = 0; row < conversions.GetLength(0); row++)
    {
        ubb = Regex.Replace(ubb, conversions[row, 0], conversions[row, 1], RegexOptions.IgnoreCase);
    }
    return FilerHTML(ubb);
}
/// <summary>
/// Extracts the main (registrable) domain from a host name: the last label before a run of
/// known suffixes (com, cn, org, net, edu, info, gov, cc, tv, hk, me) plus those suffixes,
/// e.g. "www.example.com.cn" gives "example.com.cn".
/// </summary>
/// <param name="domainName">Host name to analyse.</param>
/// <returns>
/// The main domain; the input itself when it does not end in a known suffix;
/// "" when the input is null or empty.
/// </returns>
public static string GetMainDomainName(string domainName)
{
    // The original used the non-short-circuit '|', which evaluated domainName.Length
    // even for null and threw a NullReferenceException.
    if (string.IsNullOrEmpty(domainName))
    {
        return "";
    }
    string mainDom = HtmlHelper.GetHtmlContent(domainName, "([^.]+(?:\\.(?:com|cn|org|net|edu|info|gov|cc|tv|hk|me))+$)");
    if (mainDom.Length == 0)
    {
        return domainName;
    }
    return mainDom;
}
}
}