/// <summary>
/// DOM query helper whose selector syntax is loosely modelled on jQuery.
/// </summary>
/// <remarks>
/// A selector is a list of steps separated by spaces. Only the first step may be an
/// <c>#id</c> lookup; every other step is a tag name or a <c>.className</c>, optionally
/// followed by a filter function such as <c>:eq(0)</c>, <c>:first</c> or
/// <c>:contains(text)</c>. Because steps are split on spaces, a filter argument cannot
/// itself contain a space.
/// </remarks>
public class DomQuery
{
    // Splits one selector step into tokens (runs of word characters, '.', '-' or '|').
    // Token 0 is the target (tag or .class), token 1 the filter function, token 2 its argument.
    private static readonly Regex StepTokenizer = new Regex(@"([.|\-|\w]+)", RegexOptions.Singleline);

    // Separator between the steps of a selector.
    private static readonly char[] StepSeparator = new char[] { ' ' };

    // Whitespace characters that may separate names inside an HTML class attribute.
    private static readonly char[] ClassSeparators = new char[] { ' ', '\t', '\r', '\n', '\f' };

    // Tag names accepted by the "header" filter function.
    private static readonly string[] HeaderNames = new string[] { "h1", "h2", "h3", "h4", "h5", "h6" };

    /// <summary>
    /// Returns every node of the document matched by the selector.
    /// </summary>
    /// <param name="_HtmlDocument">Document to search.</param>
    /// <param name="selector">Space-separated selector steps, e.g. <c>#main div .title:first</c>.</param>
    /// <returns>
    /// The matched nodes; an empty list (never null) when nothing matches or when the
    /// selector contains no steps.
    /// </returns>
    /// <exception cref="ArgumentNullException">The document or the selector is null.</exception>
    /// <exception cref="NotSupportedException">A step uses an unknown filter function.</exception>
    /// <remarks>
    /// Each step searches the descendants of the nodes produced by the previous step.
    /// When the first step is not an <c>#id</c>, the search starts below the document's
    /// top-level elements, so those top-level elements themselves are never candidates.
    /// </remarks>
    public IList<HtmlNode> Get(HtmlDocument _HtmlDocument, string selector)
    {
        if (_HtmlDocument == null)
        {
            throw new ArgumentNullException("_HtmlDocument");
        }
        if (selector == null)
        {
            throw new ArgumentNullException("selector");
        }

        string[] steps = selector.Split(StepSeparator, StringSplitOptions.RemoveEmptyEntries);
        List<HtmlNode> current = new List<HtmlNode>();
        if (steps.Length == 0)
        {
            // Empty or whitespace-only selector: nothing to match.
            return current;
        }

        int nextStep;
        if (steps[0].StartsWith("#", StringComparison.Ordinal))
        {
            // An id lookup yields at most one starting node.
            HtmlNode byId = _HtmlDocument.GetElementbyId(steps[0].TrimStart('#'));
            if (byId != null)
            {
                current.Add(byId);
            }
            nextStep = 1;
        }
        else
        {
            current.AddRange(_HtmlDocument.DocumentNode.ChildNodes.Where(x => x.NodeType == HtmlNodeType.Element));
            nextStep = 0;
        }

        for (int i = nextStep; i < steps.Length; i++)
        {
            current = Get(current, steps[i]);
        }
        return current;
    }

    /// <summary>
    /// Finds the first node matched by the selector and returns its InnerHtml.
    /// </summary>
    /// <param name="_HtmlDocument">Document to search.</param>
    /// <param name="selector">Selector, see <see cref="Get(HtmlDocument, string)"/>.</param>
    /// <returns>The InnerHtml of the first match, or null when nothing matches.</returns>
    public string SingleGetInnerHtml(HtmlDocument _HtmlDocument, string selector)
    {
        HtmlNode first = SingleGet(_HtmlDocument, selector);
        return first == null ? null : first.InnerHtml;
    }

    /// <summary>
    /// Finds the first node matched by the selector and returns its trimmed InnerText.
    /// </summary>
    /// <param name="_HtmlDocument">Document to search.</param>
    /// <param name="selector">Selector, see <see cref="Get(HtmlDocument, string)"/>.</param>
    /// <returns>The trimmed InnerText of the first match, or null when nothing matches.</returns>
    public string SingleGetInnerText(HtmlDocument _HtmlDocument, string selector)
    {
        HtmlNode first = SingleGet(_HtmlDocument, selector);
        return first == null ? null : first.InnerText.Trim();
    }

    /// <summary>
    /// Finds the first node matched by the selector.
    /// </summary>
    /// <param name="_HtmlDocument">Document to search.</param>
    /// <param name="selector">Selector, see <see cref="Get(HtmlDocument, string)"/>.</param>
    /// <returns>The first match, or null when nothing matches.</returns>
    public HtmlNode SingleGet(HtmlDocument _HtmlDocument, string selector)
    {
        IList<HtmlNode> matches = Get(_HtmlDocument, selector);
        return matches.Count == 0 ? null : matches[0];
    }

    #region Attribute access
    /// <summary>
    /// Collects the value of one attribute from every node that carries it.
    /// </summary>
    /// <param name="_HtmlNodes">Nodes to inspect; null is treated as empty.</param>
    /// <param name="attr">Attribute name.</param>
    /// <returns>
    /// The attribute values in node order. Nodes without the attribute are skipped, so
    /// the array can be shorter than the input list.
    /// </returns>
    public string[] Attr(IList<HtmlNode> _HtmlNodes, string attr)
    {
        if (_HtmlNodes == null || _HtmlNodes.Count == 0)
        {
            return new string[0];
        }

        List<string> values = new List<string>(_HtmlNodes.Count);
        foreach (HtmlNode node in _HtmlNodes)
        {
            if (node == null)
            {
                continue;
            }
            // Look the attribute up once instead of once to test and once to read.
            var attribute = node.Attributes[attr];
            if (attribute != null)
            {
                values.Add(attribute.Value);
            }
        }
        return values.ToArray();
    }
    #endregion

    #region Selector step evaluation
    /// <summary>
    /// Applies a single selector step to a set of context nodes.
    /// </summary>
    /// <param name="_HtmlNodes">Context nodes; their descendants are searched.</param>
    /// <param name="Expression">One step, e.g. <c>div</c>, <c>.item</c> or <c>li:eq(2)</c>.</param>
    /// <returns>The matched nodes without duplicates.</returns>
    private List<HtmlNode> Get(List<HtmlNode> _HtmlNodes, string Expression)
    {
        string target = null;
        string fun = null;
        string keyword = null;
        int index = -1;

        MatchCollection tokens = StepTokenizer.Matches(Expression);
        if (tokens.Count > 0)
        {
            target = tokens[0].Value;
        }
        if (tokens.Count > 1)
        {
            fun = tokens[1].Value;
        }
        if (tokens.Count > 2)
        {
            // Keep the raw argument as the keyword even when it is numeric, so that
            // ":contains(123)" searches for "123" instead of passing a null keyword.
            keyword = tokens[2].Value;
            // int.TryParse leaves index at 0 when the argument is not a number.
            int.TryParse(keyword, out index);
        }

        if (string.IsNullOrEmpty(fun))
        {
            // No filter function: match the whole step against all descendants at once.
            return Expression.StartsWith(".", StringComparison.Ordinal)
                ? Class(_HtmlNodes, Expression).ToList()
                : NodeType(_HtmlNodes, Expression).ToList();
        }

        // With a filter function the filter is applied per context node, so positional
        // filters such as eq/first/last count within each context node's own matches.
        List<HtmlNode> result = new List<HtmlNode>();
        HashSet<HtmlNode> seen = new HashSet<HtmlNode>();
        bool byClass = target.StartsWith(".", StringComparison.Ordinal);
        foreach (HtmlNode context in _HtmlNodes)
        {
            if (context == null)
            {
                continue;
            }
            ParallelQuery<HtmlNode> candidates = byClass ? Class(context, target) : NodeType(context, target);
            foreach (HtmlNode match in FunAction(candidates, fun, index, keyword))
            {
                // Nested context nodes can reach the same node more than once.
                if (seen.Add(match))
                {
                    result.Add(match);
                }
            }
        }
        return result;
    }

    #region Filter functions
    /// <summary>
    /// Applies a filter function to the candidates found below one context node.
    /// </summary>
    /// <param name="v">Candidate nodes.</param>
    /// <param name="fun">Function name, compared case-insensitively.</param>
    /// <param name="index">Numeric argument used by eq, lt and gt.</param>
    /// <param name="keyword">Text argument used by contains.</param>
    /// <returns>The nodes selected by the function.</returns>
    /// <exception cref="NotSupportedException">The function name is not recognised.</exception>
    /// <exception cref="ArgumentNullException">contains was used without an argument.</exception>
    private IEnumerable<HtmlNode> FunAction(IEnumerable<HtmlNode> v, string fun, int index, string keyword)
    {
        // Enumerate the incoming query exactly once so every filter sees the same sequence.
        List<HtmlNode> nodes = v.ToList();

        switch (fun.ToLowerInvariant())
        {
            case "eq":
                return nodes.Where((node, position) => position == index);
            case "lt":
                return nodes.Where((node, position) => position < index);
            case "gt":
                return nodes.Where((node, position) => position > index);
            case "first":
                return nodes.Take(1);
            case "last":
                if (nodes.Count == 0)
                {
                    return nodes;
                }
                return new HtmlNode[] { nodes[nodes.Count - 1] };
            case "even":
                return nodes.Where((node, position) => position % 2 == 0);
            case "odd":
                return nodes.Where((node, position) => position % 2 == 1);
            case "next":
                // NOTE(review): NextSibling is returned as is, so it may be a text or comment
                // node rather than the next element. Nodes without a following sibling are dropped.
                return nodes.Select(node => node.NextSibling).Where(sibling => sibling != null);
            case "contains":
                if (keyword == null)
                {
                    throw new ArgumentNullException("keyword");
                }
                return nodes.Where(node => node.InnerHtml.Contains(keyword));
            case "empty":
                return nodes.Where(node => !node.HasChildNodes);
            case "header":
                // Selects heading elements among the descendants of the candidates.
                return FindChildNodes(nodes).Where(node => HeaderNames.Contains(node.OriginalName, StringComparer.OrdinalIgnoreCase));
            default:
                throw new NotSupportedException("函数不支持。");
        }
    }
    #endregion
    #endregion

    #region Match by class name
    /// <summary>
    /// Finds the descendants of one node that carry the given class.
    /// </summary>
    /// <param name="hn">Context node.</param>
    /// <param name="Expression">Class name, with or without the leading dot.</param>
    /// <returns>Matching descendants in document order.</returns>
    private ParallelQuery<HtmlNode> Class(HtmlNode hn, string Expression)
    {
        return Class(new HtmlNode[] { hn }, Expression);
    }

    /// <summary>
    /// Finds the descendants of the given nodes that carry the given class.
    /// </summary>
    /// <param name="_HtmlNodes">Context nodes.</param>
    /// <param name="Expression">Class name, with or without the leading dot.</param>
    /// <returns>Matching descendants in document order; names are compared case-insensitively.</returns>
    private ParallelQuery<HtmlNode> Class(IList<HtmlNode> _HtmlNodes, string Expression)
    {
        string className = Expression.TrimStart('.');
        // AsOrdered keeps document order, which first-match and positional filters rely on.
        return FindChildNodes(_HtmlNodes)
            .AsParallel()
            .AsOrdered()
            .Where(x => HasClass(x, className));
    }

    /// <summary>
    /// Tests whether the node's class attribute lists the given class name.
    /// </summary>
    private static bool HasClass(HtmlNode node, string className)
    {
        var classAttribute = node.Attributes["class"];
        if (classAttribute == null || classAttribute.Value == null)
        {
            return false;
        }
        return classAttribute.Value
            .Split(ClassSeparators, StringSplitOptions.RemoveEmptyEntries)
            .Contains(className, StringComparer.OrdinalIgnoreCase);
    }
    #endregion

    #region Match by tag name
    /// <summary>
    /// Finds the descendants of one node that have the given tag name.
    /// </summary>
    /// <param name="hn">Context node.</param>
    /// <param name="Expression">Tag name.</param>
    /// <returns>Matching descendants in document order.</returns>
    private ParallelQuery<HtmlNode> NodeType(HtmlNode hn, string Expression)
    {
        return NodeType(new HtmlNode[] { hn }, Expression);
    }

    /// <summary>
    /// Finds the descendants of the given nodes that have the given tag name.
    /// </summary>
    /// <param name="_HtmlNodes">Context nodes.</param>
    /// <param name="Expression">Tag name, compared case-insensitively.</param>
    /// <returns>Matching descendants in document order.</returns>
    private ParallelQuery<HtmlNode> NodeType(IList<HtmlNode> _HtmlNodes, string Expression)
    {
        // Ordinal comparison keeps tag matching independent of the machine's culture.
        return FindChildNodes(_HtmlNodes)
            .AsParallel()
            .AsOrdered()
            .Where(x => x.OriginalName.Equals(Expression, StringComparison.OrdinalIgnoreCase));
    }
    #endregion

    #region Descendant collection
    /// <summary>
    /// Collects every element below the given nodes.
    /// </summary>
    /// <param name="_HtmlNodes">Context nodes; the context nodes themselves are not included.</param>
    /// <returns>
    /// Descendant elements in depth-first order, each listed once even when the context
    /// nodes are nested inside one another.
    /// </returns>
    /// <exception cref="ArgumentNullException">The node list is null.</exception>
    private List<HtmlNode> FindChildNodes(IList<HtmlNode> _HtmlNodes)
    {
        if (_HtmlNodes == null)
        {
            throw new ArgumentNullException("_HtmlNodes");
        }

        List<HtmlNode> list = new List<HtmlNode>();
        HashSet<HtmlNode> seen = new HashSet<HtmlNode>();
        foreach (HtmlNode context in _HtmlNodes)
        {
            if (context != null)
            {
                FindChildNodesAction(context, list, seen);
            }
        }
        return list;
    }

    /// <summary>
    /// Appends the element descendants of <paramref name="hn"/> to <paramref name="list"/>.
    /// </summary>
    /// <param name="hn">Node whose children are walked.</param>
    /// <param name="list">Receives the descendants.</param>
    /// <param name="seen">Nodes already collected; their subtrees are not walked again.</param>
    private void FindChildNodesAction(HtmlNode hn, List<HtmlNode> list, HashSet<HtmlNode> seen)
    {
        foreach (HtmlNode child in hn.ChildNodes)
        {
            // Test the child, not the parent: text and comment nodes are never candidates.
            if (child.NodeType != HtmlNodeType.Element)
            {
                continue;
            }
            // A node that was already collected had its whole subtree collected with it.
            if (!seen.Add(child))
            {
                continue;
            }
            list.Add(child);
            FindChildNodesAction(child, list, seen);
        }
    }
    #endregion
}
用 DOM 实现文章采集 —— 通过类似 jQuery 的选择器语法采集指定对象的文本。
最新推荐文章于 2022-10-17 15:42:42 发布