仿chrome实现获取元素xpath

最新推荐文章于 2023-06-29 16:17:39 发布

LLittleKevin

最新推荐文章于 2023-06-29 16:17:39 发布

阅读量2k

点赞数

分类专栏：随手小记　文章标签：xpath chrome插件 爬虫

本文链接：https://blog.csdn.net/zhihang19941024/article/details/78112514

版权

随手小记专栏收录该内容

33 篇文章 0 订阅

订阅专栏

最近做的一个项目是自动化爬虫系统，其中包括一个前端 Chrome 插件，插件中仿照 Chrome 的方式实现了获取元素 XPath 的功能。

1.代码

/**
 * Decides whether a sibling belongs to the same "family" as the primary
 * element, i.e. whether it should be counted when numbering the primary
 * element among its siblings.
 *
 * Two elements share a family when their tag names are equal and either the
 * primary element has no id or both ids are equal.
 *
 * @param {Element} primaryEl Element whose position is being computed.
 * @param {Element} siblingEl Sibling element to compare against.
 * @return {boolean} True when the sibling counts as the same family.
 */
xh.elementsShareFamily = function(primaryEl, siblingEl) {
  if (primaryEl.tagName !== siblingEl.tagName) {
    return false;
  }
  // The class name is deliberately not compared; the disabled alternative was:
  //   (!p.className || p.className === s.className) &&
  if (!primaryEl.id) {
    // Without an id on the primary element, any same-tag sibling matches.
    return true;
  }
  return primaryEl.id === siblingEl.id;
};

/**
 * Computes the 1-based position of an element among the siblings that share
 * its family (as decided by xh.elementsShareFamily).
 *
 * @param {Element} el Element to locate among its siblings.
 * @return {number} 0 when no other sibling shares the element's family (no
 *     positional predicate is needed), 1 when the element is the first of
 *     several family members, otherwise its 1-based position.
 */
xh.getElementIndex = function(el) {
  // Only element nodes that share the family of `el` are counted; text and
  // comment nodes are skipped by the nodeType check.
  var isFamilyMember = function(candidate) {
    return candidate.nodeType === Node.ELEMENT_NODE &&
        xh.elementsShareFamily(el, candidate);
  };

  // Count family members that precede the element.
  var position = 1;
  var cursor = el.previousSibling;
  while (cursor) {
    if (isFamilyMember(cursor)) {
      position += 1;
    }
    cursor = cursor.previousSibling;
  }
  if (position > 1) {
    return position;
  }

  // Nothing precedes it: the element is either first of several (1) or the
  // only one of its family (0), depending on whether a later member exists.
  cursor = el.nextSibling;
  while (cursor) {
    if (isFamilyMember(cursor)) {
      return 1;
    }
    cursor = cursor.nextSibling;
  }
  return 0;
};

/**
 * Builds an absolute XPath query for an element, intended to mimic the path
 * Chrome produces for "Copy XPath".
 *
 * Starting at `el`, the function follows `parentNode` for as long as the
 * current node is an element and prepends one `/tag` step per node. A step
 * gets an `[@id=...]` predicate when the element has an id, and a positional
 * `[n]` predicate when xh.getElementIndex reports a position of 1 or more.
 *
 * @param {Element} el Element to describe. A falsy value or a non-element
 *     node yields an empty string.
 * @return {string} The generated query. When the target element itself is an
 *     `img`, the query ends in `/@src`.
 */
xh.makeQueryForElement = function(el) {
  // XPath 1.0 string literals have no escape sequences, so a value containing
  // a quote must be wrapped in the other quote style, or assembled with
  // concat() when it contains both kinds of quote.
  var toXPathLiteral = function(value) {
    var text = String(value);
    if (text.indexOf('\'') === -1) {
      return '\'' + text + '\'';
    }
    if (text.indexOf('"') === -1) {
      return '"' + text + '"';
    }
    return 'concat(\'' + text.split('\'').join('\', "\'", \'') + '\')';
  };

  var query = '';
  var node = el;
  for (; node && node.nodeType === Node.ELEMENT_NODE; node = node.parentNode) {
    var tag = node.tagName.toLowerCase();
    var component = tag;
    var index = xh.getElementIndex(node);
    if (node.id) {
      component += '[@id=' + toXPathLiteral(node.id) + ']';
    }
    // A class predicate is deliberately not emitted; the disabled
    // alternative was:
    //   else if (el.className) {
    //     component += '[@class=\'' + el.className + '\']';
    //   }
    if (index >= 1) {
      component += '[' + index + ']';
    }
    // If the last tag is an img, the user probably wants img/@src.
    if (query === '' && tag === 'img') {
      component += '/@src';
    }
    query = '/' + component + query;
  }
  return query;
};

调用时只需要调用 xh.makeQueryForElement 方法即可。代码中被注释掉的部分是把标签的 className 也作为 XPath 的一部分；由于 Chrome 获取的 XPath 不带 className 信息，所以把这部分注释掉了。

2.效果

可以看到，根列表下的 XPath 就是通过模拟 Chrome 的方式获取的，下面的内容则是通过该 XPath 解析出来的。

3.附录

xpath解析内容函数

/**
 * Evaluates an XPath expression against the current document and flattens
 * the result into a display string plus a result count.
 *
 * @param {string} query XPath expression to evaluate.
 * @return {Array} A two-element array `[text, count]`:
 *     - boolean result: `'1'` or `'0'`, count 1;
 *     - number or string result: its string form, count 1;
 *     - node-set result: the whitespace-normalised text of every node joined
 *       with `'[newline]'`, count = number of nodes (`'[NULL]'` and 0 when
 *       the set is empty);
 *     - `'[INVALID XPATH EXPRESSION]'` and 0 when evaluation throws;
 *     - `'[INTERNAL ERROR]'` and 0 for any other result type.
 */
xh.evaluateQuery = function(query) {
  var xpathResult = null;
  var str = '';
  var nodeCount = 0;

  try {
    xpathResult = document.evaluate(query, document, null,
                                    XPathResult.ANY_TYPE, null);
  } catch (e) {
    // Any exception from document.evaluate() is reported to the caller as an
    // invalid expression instead of being propagated.
    str = '[INVALID XPATH EXPRESSION]';
    nodeCount = 0;
  }

  if (!xpathResult) {
    return [str, nodeCount];
  }

  switch (xpathResult.resultType) {
    case XPathResult.BOOLEAN_TYPE:
      str = xpathResult.booleanValue ? '1' : '0';
      nodeCount = 1;
      break;

    case XPathResult.NUMBER_TYPE:
      str = xpathResult.numberValue.toString();
      nodeCount = 1;
      break;

    case XPathResult.STRING_TYPE:
      str = xpathResult.stringValue;
      nodeCount = 1;
      break;

    case XPathResult.UNORDERED_NODE_ITERATOR_TYPE:
      for (var node = xpathResult.iterateNext(); node;
           node = xpathResult.iterateNext()) {
        // NOTE(review): the separator is only added once some text has been
        // accumulated, so nodes with empty text at the start of the result
        // produce no separator - confirm this is intended.
        if (str !== '') {
          str += '[newline]';
        }
        // textContent is null for some node types (e.g. the Document node
        // selected by "/"); treat that as empty text instead of throwing.
        var text = node.textContent;
        if (text === null || text === undefined) {
          text = '';
        }
        // Collapse line breaks, tabs and runs of spaces into single spaces.
        str += String(text).replace(/[\r\n\t]/g, ' ').replace(/[ ]{2,}/g, ' ');
        nodeCount++;
      }
      if (nodeCount === 0) {
        str = '[NULL]';
      }
      break;

    default:
      // Since we pass XPathResult.ANY_TYPE to document.evaluate(), we should
      // never get back a result type not handled above.
      str = '[INTERNAL ERROR]';
      nodeCount = 0;
  }

  return [str, nodeCount];
};