php网页如何查看节点,解析HTML标签,并实现快速查找节点,获取节点信息

/**
 * @copyright 2014 kun
 * @license http://www.php.com/license/3_01.txt PHP License 3.01
 * @version 1.0
 * @link http://www.blogkun.com
 * @since 1.0
 */

class TagDomRoot
{
    // Root of a lightweight tag tree built from an HTML-like string.
    //
    // The constructor strips "noise" (doctype, comments, script/style blocks),
    // records the text content and creates one TagDomNode per top-level tag.
    // Lookup state ($TagSet, $FoundNode) and error state ($TagParseError,
    // $ErrorTag) are static, i.e. shared by every tree in the process; call
    // initProperty() before parsing an unrelated document.

    /** @var string Tag name; the root node uses the fixed name 'root'. */
    public $tag = 'root';

    /** @var string|null Text of this node with all markup removed. */
    public $plaintext;

    /** @var array Direct child nodes, in document order. */
    public $child = array();

    /** @var int Depth of this node; the root is level 0. */
    public $level = 0;

    /** @var bool True once any parse step has reported a problem. */
    public static $TagParseError = false;

    /** @var array Index used by the *Global finders: tag name => raw attribute string => nodes. */
    protected static $TagSet = array();

    /** @var array Accumulator for nodes matched by the search currently running. */
    protected static $FoundNode = array();

    /** @var array Markup fragments that could not be paired (stray closing / unclosed opening tags). */
    public static $ErrorTag = array();

    /**
     * Reset the shared static state.
     *
     * The original body assigned to local variables, so nothing was ever
     * reset; the static properties are now cleared explicitly.
     *
     * @access public
     *
     * @return null
     */
    public function initProperty()
    {
        self::$TagParseError = false;
        self::$TagSet = array();
        self::$FoundNode = array();
        self::$ErrorTag = array();
    }

    /**
     * Parse a markup string into a tree.
     *
     * @param string $str The tag string to be parsed.
     *
     * @access public
     */
    public function __construct($str)
    {
        $this->_removeNoise($str);
        if ($str === null) {
            // preg_replace() yields null when the regex engine fails.
            self::$TagParseError = true;
            return;
        }

        // Text content is whatever is left once every <...> run is dropped.
        $this->plaintext = preg_replace('~<[^>]*>~', '', $str);

        $tagCollect = array();
        $attrCollect = array();
        $innerContentCollect = array();
        if ($this->parseTag($str, $tagCollect, $attrCollect, $innerContentCollect) === false) {
            self::$TagParseError = true;
        }
        foreach ($tagCollect as $index => $tag) {
            $this->child[] = new TagDomNode(
                $tag,
                $this,
                $attrCollect[$index],
                $innerContentCollect[$index],
                $this->level + 1
            );
        }
    }

    /**
     * Split $str into its top-level tags.
     *
     * For every top-level tag one entry is appended to each of the three
     * output arrays (same index): tag name, raw attribute string and inner
     * content. Nested tags stay inside the inner content of their parent.
     * Stray closing tags and unclosed opening tags are recorded in
     * self::$ErrorTag.
     *
     * @param string $str                  Markup to scan.
     * @param array  &$tagCollect          Receives the tag names.
     * @param array  &$attrCollect         Receives the raw attribute strings ('' when none).
     * @param array  &$innerContentCollect Receives the inner content ('' for self-closing tags).
     *
     * @access protected
     *
     * @return boolean False when at least one tag could not be paired.
     */
    protected function parseTag($str, array &$tagCollect, array &$attrCollect, array &$innerContentCollect)
    {
        // Elements that never have a closing tag (original list plus the
        // remaining HTML void elements).
        $selfClosingTags = array(
            'img' => 1, 'br' => 1, 'input' => 1, 'meta' => 1, 'link' => 1,
            'hr' => 1, 'base' => 1, 'embed' => 1, 'spacer' => 1,
            'area' => 1, 'col' => 1, 'param' => 1, 'source' => 1,
            'track' => 1, 'wbr' => 1,
        );
        $str = (string) $str;
        $length = strlen($str);
        $error = false;
        $pos = 0;

        while ($pos < $length) {
            $l = strpos($str, '<', $pos);
            if ($l === false) {
                break; // no more markup
            }
            $r = strpos($str, '>', $l);
            if ($r === false) {
                break; // unterminated '<...': nothing left that can be a tag
            }

            if (substr($str, $l + 1, 1) === '/') {
                // Closing tag without a matching opening tag at this level: discard it.
                $error = true;
                self::$ErrorTag[] = substr($str, $l, $r - $l + 1);
                $pos = $r + 1;
                continue;
            }

            $tag = substr($str, $l + 1, $r - $l - 1);
            if ($tag === '' || !ctype_alpha($tag[0]) || strpos($tag, '<') !== false) {
                // A literal '<' in text, not a tag. Resume right behind it so
                // that a real tag starting before $r is still seen.
                $pos = $l + 1;
                continue;
            }

            $tag = preg_replace("~\n+~", ' ', $tag);
            $selfClosed = self::hasSelfClosingSlash($tag);
            if ($selfClosed) {
                $tag = rtrim(substr($tag, 0, -1));
            }

            $space = strpos($tag, ' ');
            if ($space !== false) {
                $attrCollect[] = substr($tag, $space + 1);
                $tag = substr($tag, 0, $space);
            } else {
                $attrCollect[] = '';
            }
            $tagCollect[] = $tag;

            if ($selfClosed || isset($selfClosingTags[strtolower($tag)])) {
                $innerContentCollect[] = '';
                $pos = $r + 1;
                continue;
            }

            $close = $this->locateClosingTag($str, $tag, $r + 1);
            if ($close === null) {
                // Opening tag that is never closed: the rest of the string
                // becomes its content and scanning stops.
                $innerContentCollect[] = (string) substr($str, $r + 1);
                $error = true;
                self::$ErrorTag[] = '<' . $tag . '>';
                break;
            }

            $innerContentCollect[] = (string) substr($str, $r + 1, $close[0] - $r - 1);
            $pos = $close[1];
        }

        return !$error;
    }

    /**
     * Tell whether the text between '<' and '>' ends in a self-closing slash.
     *
     * The slash only counts when it directly follows the tag name, a quote or
     * whitespace, so an unquoted value such as href=/path/ is not mistaken
     * for one.
     *
     * @param string $tagText Text between '<' and '>'.
     *
     * @return boolean
     */
    private static function hasSelfClosingSlash($tagText)
    {
        return preg_match('~^(?:[^\s/]+|.*["\'\s])/$~s', $tagText) === 1;
    }

    /**
     * Find the closing tag that pairs with an already consumed opening tag.
     *
     * Nested opening tags of the same name increase the depth, closing tags
     * decrease it; the closing tag that brings the depth back to zero wins.
     *
     * @param string $str  Markup being scanned.
     * @param string $tag  Tag name (matched case-sensitively).
     * @param int    $from Offset just behind the opening tag.
     *
     * @return array|null array(offset of '</tag', offset behind its '>') or null when unpaired.
     */
    private function locateClosingTag($str, $tag, $from)
    {
        $name = preg_quote($tag, '~');
        $pattern = '~<' . $name . '(?:\s[^>]*|/)?>|</' . $name . '\s*>~';
        $depth = 1;

        while (preg_match($pattern, $str, $m, PREG_OFFSET_CAPTURE, $from) === 1) {
            $text = $m[0][0];
            $at = $m[0][1];
            $from = $at + strlen($text);

            if ($text[1] !== '/') {
                // Nested opening tag; a self-closed one does not need a partner.
                if (!self::hasSelfClosingSlash(substr($text, 1, -1))) {
                    $depth++;
                }
                continue;
            }
            if (--$depth === 0) {
                return array($at, $from);
            }
        }

        return null;
    }

    /**
     * Remove markup that must not be parsed as tags: the doctype, comments
     * and script/style blocks.
     *
     * $str is modified in place and becomes null when the regex engine
     * reports an error, which the constructor checks for.
     *
     * @param string &$str The tag string to be parsed.
     *
     * @access private
     *
     * @return null
     */
    private function _removeNoise(&$str)
    {
        // NOTE(review): the original patterns were destroyed when the listing
        // was extracted; these are reconstructed from the method's purpose.
        $noise = array(
            '~<!DOCTYPE[^>]*>~i',
            '~<!--.*?-->~s',
            '~<script\b[^>]*>.*?</script\s*>~is',
            '~<style\b[^>]*>.*?</style\s*>~is',
        );
        // One call with a pattern list: a failure of any pattern yields null
        // instead of being masked by the following replacement.
        $str = preg_replace($noise, '', (string) $str);
    }

    /**
     * Split a selector such as 'div[class="a b"] span' into parallel lists.
     *
     * @param string $selectors     User's select condition.
     * @param array  &$selectorsTag  Receives the tag name of every level.
     * @param array  &$selectorsAttr Receives, per level, a map attribute name => wanted value.
     *
     * @access protected
     *
     * @return null
     */
    protected function parseSelectors($selectors, array &$selectorsTag, array &$selectorsAttr)
    {
        preg_match_all('~([\w\d]+)(\[[\w\d -="._/]+\])?~', $selectors, $levels);
        $selectorsTag = $levels[1];
        foreach ($levels[2] as $key => $bracket) {
            $selectorsAttr[$key] = array();
            if ($bracket === '') {
                continue;
            }
            // Hyphen placed last: "[\w\d-. _/]" is rejected by PCRE2 (PHP 7.3+)
            // as an invalid range.
            preg_match_all('~([\w-]+)="([\w. /-]+)"~', $bracket, $pairs);
            foreach ($pairs[1] as $index => $attr) {
                $selectorsAttr[$key][$attr] = $pairs[2][$index];
            }
        }
    }

    /**
     * Find every node chain below this node that matches the selector.
     *
     * Called by users with $selectors only. The method calls itself on the
     * children with $selectors === null and the already parsed lists; those
     * inner calls only collect into self::$FoundNode and return nothing.
     *
     * @param mixed $selectors     User's select condition, or null on recursion.
     * @param array $selectorsTag  Parsed tag names (recursion only).
     * @param array $selectorsAttr Parsed attribute maps (recursion only).
     *
     * @access public
     *
     * @return array|null Matched nodes for the outermost call, null otherwise.
     */
    public function find($selectors, $selectorsTag = array(), $selectorsAttr = array())
    {
        if ($selectors !== null) {
            $this->parseSelectors($selectors, $selectorsTag, $selectorsAttr);
        }
        if (!empty($selectorsTag)) {
            $this->seek($selectorsTag, $selectorsAttr);
            foreach ($this->child as $node) {
                $node->find(null, $selectorsTag, $selectorsAttr);
            }
        }
        if ($selectors !== null) {
            $res = self::$FoundNode;
            self::$FoundNode = array();
            return $res;
        }
        return null;
    }

    /**
     * Find nodes through the global tag index.
     *
     * The first selector level is resolved with findOneGlobal(); any further
     * levels are then matched downwards from those nodes with seek().
     *
     * @param string $selectors User's select condition.
     *
     * @access public
     *
     * @return array
     */
    public function findGlobal($selectors)
    {
        $selectors = trim($selectors);

        // First space that is not part of a [attr="a b"] block.
        $space = false;
        $inBracket = false;
        $length = strlen($selectors);
        for ($i = 0; $i < $length; $i++) {
            $char = $selectors[$i];
            if ($char === '[') {
                $inBracket = true;
            } elseif ($char === ']') {
                $inBracket = false;
            } elseif ($char === ' ' && !$inBracket) {
                $space = $i;
                break;
            }
        }

        if ($space === false) {
            return $this->findOneGlobal($selectors);
        }

        $selectorsAttr = array();
        $selectorsTag = array();
        $this->findOneGlobal(substr($selectors, 0, $space), false);
        $this->parseSelectors(substr($selectors, $space + 1), $selectorsTag, $selectorsAttr);
        if (!empty(self::$FoundNode) && !empty($selectorsTag)) {
            $nodes = self::$FoundNode;
            self::$FoundNode = array();
            foreach ($nodes as $node) {
                $node->seek($selectorsTag, $selectorsAttr);
            }
        }

        $res = self::$FoundNode;
        self::$FoundNode = array();
        return $res;
    }

    /**
     * Match the selector chain against the direct children of this node.
     *
     * A child matches a level when its tag equals the wanted tag and every
     * wanted attribute value occurs in the child's attribute as a
     * space-delimited token (or token sequence). Matches of the last level
     * are appended to self::$FoundNode; otherwise the remaining levels are
     * matched against the child's own children.
     *
     * @param array $selectorsTag  Tag names, one per level.
     * @param array $selectorsAttr Attribute maps, one per level.
     *
     * @access protected
     *
     * @return null
     */
    protected function seek($selectorsTag, $selectorsAttr)
    {
        if (empty($selectorsTag)) {
            return;
        }
        $wantedTag = $selectorsTag[0];
        $wantedAttr = isset($selectorsAttr[0]) ? $selectorsAttr[0] : array();
        $isLastLevel = count($selectorsTag) === 1;

        foreach ($this->child as $node) {
            if ($node->tag !== $wantedTag) {
                continue;
            }

            $isFind = true;
            foreach ($wantedAttr as $attrName => $value) {
                // Plain substring test on space-padded strings: the value is
                // compared literally instead of being injected into a regex.
                if (!isset($node->attr[$attrName])
                    || strpos(' ' . $node->attr[$attrName] . ' ', ' ' . $value . ' ') === false
                ) {
                    $isFind = false;
                    break;
                }
            }
            if (!$isFind) {
                continue;
            }

            if ($isLastLevel) {
                self::$FoundNode[] = $node;
            } else {
                $node->seek(
                    array_slice($selectorsTag, 1),
                    array_slice($selectorsAttr, 1)
                );
            }
        }
    }

    /**
     * Resolve a single-level selector through the global tag index.
     *
     * @param string $selector User's select condition, e.g. 'div[class="box"]'.
     * @param bool   $isReturn Whether to return (and clear) the collected
     *                         nodes; when false they stay in self::$FoundNode.
     *
     * @access public
     *
     * @return array|null Matched nodes when $isReturn is true, null otherwise.
     */
    public function findOneGlobal($selector, $isReturn = true)
    {
        if (preg_match('~([\w\d]+)(\[[\w\d -="._/]+\])?~', $selector, $matches) === 1) {
            $tag = $matches[1];
            $attr = array();
            if (isset($matches[2])) {
                // Hyphen placed last, see parseSelectors().
                preg_match_all('~([\w-]+)="([\w. /-]+)"~', $matches[2], $pairs);
                foreach ($pairs[1] as $key => $name) {
                    $attr[$name] = $pairs[2][$key];
                }
            }

            if (isset(self::$TagSet[$tag])) {
                foreach (self::$TagSet[$tag] as $attrValue => $nodeArray) {
                    $isFind = true;
                    foreach ($attr as $attrName => $value) {
                        // name="... value ..." inside the raw attribute string;
                        // [^"]* keeps the match within one attribute value.
                        $pattern = '~(?:^|\s)' . preg_quote($attrName, '~')
                            . '="(?:[^"]* )?' . preg_quote($value, '~') . '(?: [^"]*)?"~';
                        if (preg_match($pattern, (string) $attrValue) !== 1) {
                            $isFind = false;
                            break;
                        }
                    }
                    if ($isFind) {
                        foreach ($nodeArray as $node) {
                            self::$FoundNode[] = $node;
                        }
                    }
                }
            }
        }

        if ($isReturn) {
            $res = self::$FoundNode;
            self::$FoundNode = array();
            return $res;
        }
        return null;
    }
}

/**
 * TagDomNode
 *
 * One element of the tag tree: a tag name, its parsed attributes, its text
 * content and its child nodes.
 *
 * @uses TagDomRoot
 *
 * @category TagParse
 * @package TagParse
 * @author kun
 * @copyright 2014 kun
 * @license http://www.php.com/license/3_01.txt PHP License 3.01
 * @version 1.0
 * @link http://www.blogkun.com
 * @since 1.0
 */
class TagDomNode extends TagDomRoot
{
    /** @var array Attribute name => attribute value, as parsed by _parseAttr(). */
    public $attr = array();

    /** @var TagDomRoot|null The node this node is a child of. */
    public $parent = null;

    /**
     * Build the node and, recursively, all nodes contained in $innerContent.
     *
     * After its children have been built the node registers itself in the
     * shared index self::$TagSet under [tag name][raw attribute string].
     *
     * @param string     $tag          Tag name.
     * @param TagDomRoot $parent       Parent node.
     * @param string     $attr         Raw attribute string of the opening tag.
     * @param string     $innerContent Markup between the opening and closing tag.
     * @param int        $level        Depth of this node in the tree.
     *
     * @access public
     */
    public function __construct($tag, $parent, $attr, $innerContent, $level)
    {
        $this->tag = $tag;
        $this->parent = $parent;
        $this->_parseAttr($attr);
        $this->level = $level;

        $innerContent = (string) $innerContent;
        // Text content is the inner markup with every <...> run removed. The
        // previous prefix/matches/suffix concatenation duplicated text when
        // the content held a single tag or trailing text (e.g. "a<br>b").
        $this->plaintext = preg_replace('~<[^>]*>~', '', $innerContent);

        $tagCollect = array();
        $attrCollect = array();
        $innerContentCollect = array();
        if ($this->parseTag($innerContent, $tagCollect, $attrCollect, $innerContentCollect) === false) {
            self::$TagParseError = true;
        }
        foreach ($tagCollect as $index => $childTag) {
            $this->child[] = new TagDomNode(
                $childTag,
                $this,
                $attrCollect[$index],
                $innerContentCollect[$index],
                $this->level + 1
            );
        }

        // Objects are stored as handles, so no reference to $this is needed;
        // missing index levels are created on assignment.
        self::$TagSet[$this->tag][$attr][] = $this;
    }

    /**
     * Parse name="value" pairs of the raw attribute string into $this->attr.
     *
     * Only double-quoted values are recognised.
     *
     * @param string $str Attribute string.
     *
     * @access private
     *
     * @return null
     */
    private function _parseAttr($str)
    {
        preg_match_all('~(?<attrName>[\w-]+)="(?<attrValue>.*?)"~s', (string) $str, $matches);
        foreach ($matches['attrName'] as $key => $value) {
            $this->attr[$value] = $matches['attrValue'][$key];
        }
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值