<?php
// Source: htmlparsermodel, src/ParserDom.php (Gitee mirror: 极速下载/htmlparsermodel)

namespace HtmlParser;

/**

* Copyright (c) 2013, 俊杰Jerry

* All rights reserved.

*

* @description: HTML parser

* @author : 俊杰Jerry

* @date : 2013-6-10

*/

/**
 * Small selector-based query helper on top of PHP's DOM extension.
 *
 * An instance wraps one \DOMNode (a whole \DOMDocument when built from an
 * HTML string, or a single node when returned by find()).
 *
 * Selector syntax understood by parse_selector():
 *   - "tag", "#id", ".class", "tag#id", "tag.class"
 *   - "[attr]", "[!attr]" (attribute must be absent), "[attr=val]" with the
 *     operators =, !=, ^=, $= and *=
 *   - the pseudo attribute "plaintext" compares against the node's text
 *   - levels are separated by spaces or "/", alternatives by ","
 *   - "tbody" levels are skipped
 *
 * Each level of a selector must match the direct parent of the next level
 * (seek() walks parentNode one step per level).
 *
 * Usage:
 *   $dom  = new ParserDom('<div id="a"><p>hi</p></div>');
 *   $text = $dom->find('div#a p', 0)->getPlainText();
 */
class ParserDom {

    /**
     * Wrapped DOM node; NULL until a node or an HTML string has been loaded.
     *
     * @var \DOMNode|null
     */
    public $node;

    /**
     * Scratch list of matches collected by search() during one find() call.
     *
     * @var self[]
     */
    private $_lFind = [];

    /**
     * @param \DOMNode|string|null $node DOM node to wrap, or HTML source to parse;
     *                                   NULL creates an empty wrapper (see load()).
     * @throws \Exception when the HTML cannot be loaded
     */
    public function __construct($node = NULL) {
        if ($node !== NULL) {
            $this->load($node);
        }
    }

    /**
     * Load (or replace) the wrapped node. The object may be created without
     * HTML and loaded later, any number of times.
     *
     * @param \DOMNode|string|null $node DOM node to wrap, or HTML source to parse
     * @throws \Exception when the input is empty or the HTML cannot be loaded
     */
    public function load($node = NULL) {
        if ($node instanceof \DOMNode) {
            $this->node = $node;
            return;
        }
        // DOMDocument::loadHTML() rejects empty input with a warning (PHP 7)
        // or a ValueError (PHP 8); report it through the documented exception.
        if ($node === NULL || $node === '') {
            throw new \Exception('load html error');
        }
        $dom = new \DOMDocument();
        $dom->preserveWhiteSpace = FALSE;
        $dom->strictErrorChecking = FALSE;
        // "@" silences the markup warnings libxml raises for real-world HTML.
        if (!@$dom->loadHTML($node)) {
            throw new \Exception('load html error');
        }
        $this->node = $dom;
    }

    /**
     * Read-only magic properties: outertext, innertext, plaintext, href, src.
     *
     * @codeCoverageIgnore
     * @param string $name
     * @return mixed NULL for any other property name
     */
    public function __get($name) {
        switch ($name) {
            case 'outertext':
                return $this->outerHtml();
            case 'innertext':
                return $this->innerHtml();
            case 'plaintext':
                return $this->getPlainText();
            case 'href':
                return $this->getAttr("href");
            case 'src':
                return $this->getAttr("src");
            default:
                return NULL;
        }
    }

    /**
     * Depth-first search below the wrapped node.
     *
     * @param string   $selector
     * @param int|null $idx zero-based index of the match to return; NULL returns
     *                      every match; a negative value counts from the end
     * @return self|self[]|false a single wrapper when $idx is given (FALSE if
     *                           there is no such match), otherwise the list of
     *                           matches; FALSE when nothing is loaded or the
     *                           selector is empty
     * @throws \InvalidArgumentException for a bare "*" level without attribute
     */
    public function find($selector, $idx = NULL) {
        // empty() also covers the "nothing loaded yet" case ($this->node is NULL).
        if (empty($this->node->childNodes)) {
            return FALSE;
        }
        $selectors = $this->parse_selector($selector);
        if (count($selectors) === 0) {
            return FALSE;
        }

        // Start from a clean slate even if a previous call was interrupted.
        $this->_lFind = [];
        foreach ($selectors as $chain) {
            $level = count($chain);
            if ($level === 0) {
                $this->_lFind = [];
                return FALSE;
            }
            $this->search($this->node, $idx, $chain, $level);
        }
        $found = $this->_lFind;
        $this->_lFind = [];

        if ($idx === NULL) {
            return $found;
        }
        if ($idx < 0) {
            $idx = count($found) + $idx;
        }
        return isset($found[$idx]) ? $found[$idx] : FALSE;
    }

    /**
     * Text content of the wrapped node.
     *
     * @return string
     */
    public function getPlainText() {
        return $this->text($this->node);
    }

    /**
     * Serialized HTML of the wrapped node's children.
     *
     * @return string empty string when nothing is loaded
     */
    public function innerHtml() {
        if ($this->node === NULL) {
            return '';
        }
        // A DOMDocument has no ownerDocument; it serializes its own children.
        $doc = ($this->node instanceof \DOMDocument) ? $this->node : $this->node->ownerDocument;
        $innerHTML = '';
        foreach ($this->node->childNodes as $child) {
            $innerHTML .= $doc->saveHTML($child) ?: '';
        }
        return $innerHTML;
    }

    /**
     * Serialized HTML of the wrapped node itself.
     *
     * @return string|bool
     */
    public function outerHtml() {
        // importNode() cannot import a document node, so dump the document directly.
        if ($this->node instanceof \DOMDocument) {
            return $this->node->saveHTML();
        }
        $doc = new \DOMDocument();
        $doc->appendChild($doc->importNode($this->node, TRUE));
        return $doc->saveHTML($doc);
    }

    /**
     * Value of an attribute of the wrapped node.
     *
     * @param string $name
     * @return string|null NULL when the attribute is missing or the node cannot
     *                     carry attributes (document, text node, nothing loaded)
     */
    public function getAttr($name) {
        $attributes = ($this->node instanceof \DOMNode) ? $this->node->attributes : NULL;
        if ($attributes === NULL) {
            return NULL;
        }
        $oAttr = $attributes->getNamedItem($name);
        return ($oAttr !== NULL) ? $oAttr->nodeValue : NULL;
    }

    /**
     * Compare a node value against a selector value, case-insensitively.
     *
     * NOTE(review): both operands are lower-cased before use, including a
     * user-supplied regular expression for "*=" (so "\D" becomes "\d").
     *
     * @param string $exp     one of =, !=, ^=, $=, *=
     * @param string $pattern value taken from the selector
     * @param string $value   value taken from the node
     * @return boolean|number boolean for = and !=, preg_match() result for the
     *                        other operators, FALSE for an unknown operator
     */
    private function match($exp, $pattern, $value) {
        $pattern = strtolower($pattern);
        $value = strtolower($value);
        switch ($exp) {
            case '=':
                return ($value === $pattern);
            case '!=':
                return ($value !== $pattern);
            case '^=':
                return preg_match("/^" . preg_quote($pattern, '/') . "/", $value);
            case '$=':
                return preg_match("/" . preg_quote($pattern, '/') . "$/", $value);
            case '*=':
                // A pattern starting with "/" is used verbatim as a full regular
                // expression; anything else is wrapped (unescaped) in "/.../i".
                if ($pattern !== '' && $pattern[0] === '/') {
                    return preg_match($pattern, $value);
                }
                return preg_match("/" . $pattern . "/i", $value);
        }
        return FALSE;
    }

    /**
     * Split a selector string into alternatives (separated by ",") and levels.
     *
     * @param string $selector_string
     * @return array list of alternatives; each alternative is a list of levels
     *               [tag, key, val, exp, no_key]
     */
    private function parse_selector($selector_string) {
        $pattern = '/([\w\-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)["\']?(.*?)["\']?)?\])?([\/, ]+)/is';
        // The trailing space guarantees the last level ends with a separator.
        preg_match_all($pattern, trim($selector_string) . ' ', $matches, PREG_SET_ORDER);

        $selectors = [];
        $result = [];
        foreach ($matches as $m) {
            $whole = trim($m[0]);
            if ($whole === '' || $whole === '/' || $whole === '//') {
                continue;
            }
            if ($m[1] === 'tbody') {
                continue;
            }

            $tag = $m[1];
            $key = NULL;
            $val = NULL;
            $exp = '=';
            $no_key = FALSE;

            // Compare against '' instead of using empty(), so that "0" is kept
            // as a legitimate id / class / attribute value.
            if (isset($m[2]) && $m[2] !== '') {
                $key = 'id';
                $val = $m[2];
            }
            if (isset($m[3]) && $m[3] !== '') {
                $key = 'class';
                $val = $m[3];
            }
            if (isset($m[4]) && $m[4] !== '') {
                $key = $m[4];
            }
            if (isset($m[5]) && $m[5] !== '') {
                $exp = $m[5];
            }
            if (isset($m[6]) && $m[6] !== '') {
                $val = $m[6];
            }

            // Normalize case; a missing key becomes '' (strtolower(NULL) is
            // deprecated since PHP 8.1).
            $tag = strtolower($tag);
            $key = ($key === NULL) ? '' : strtolower($key);

            // "[!attr]" selects elements that do NOT have the attribute.
            if (isset($key[0]) && $key[0] === '!') {
                $key = substr($key, 1);
                $no_key = TRUE;
            }

            $result[] = [$tag, $key, $val, $exp, $no_key];

            // A comma closes the current alternative.
            if (trim($m[7]) === ',') {
                $selectors[] = $result;
                $result = [];
            }
        }
        if (count($result) > 0) {
            $selectors[] = $result;
        }
        return $selectors;
    }

    /**
     * Depth-first walk that appends every match to $this->_lFind.
     *
     * @param \DOMNode $search       node currently visited
     * @param int|null $idx          requested match index, or NULL for all
     * @param array    $selectors    levels of one selector alternative
     * @param int      $level        number of levels in $selectors
     * @param int      $search_level depth of $search below the starting node
     * @return bool TRUE as soon as the match with index $idx has been stored,
     *              which stops the walk early
     */
    private function search($search, $idx, $selectors, $level, $search_level = 0) {
        // A node can only match once it is at least as deep as the selector is long.
        if ($search_level >= $level) {
            $rs = $this->seek($search, $selectors, $level - 1);
            if ($rs !== FALSE) {
                $reachedIdx = ($idx !== NULL && $idx == count($this->_lFind));
                $this->_lFind[] = new self($rs);
                if ($reachedIdx) {
                    return TRUE;
                }
            }
        }
        if (!empty($search->childNodes)) {
            foreach ($search->childNodes as $child) {
                if ($this->search($child, $idx, $selectors, $level, $search_level + 1)) {
                    return TRUE;
                }
            }
        }
        return FALSE;
    }

    /**
     * Text content of a node.
     *
     * @param \DOMNode $node
     * @return string
     */
    private function text($node) {
        return $node->textContent;
    }

    /**
     * Check a node against selector level $current and, if it matches, its
     * parent against the preceding level, and so on up to level 0.
     *
     * @codeCoverageIgnore
     * @param \DOMNode $search
     * @param array    $selectors
     * @param int      $current index of the level to test against $search
     * @return boolean|\DOMNode $search when the whole chain matches, else FALSE
     * @throws \InvalidArgumentException for a "*" level without attribute
     */
    private function seek($search, $selectors, $current) {
        if (!($search instanceof \DOMElement)) {
            return FALSE;
        }
        list ($tag, $key, $val, $exp, $no_key) = $selectors[$current];

        // Library code must not terminate the host process; report the
        // unsupported selector to the caller instead.
        if ($tag === '*' && !$key) {
            throw new \InvalidArgumentException('tag为*时,key不能为空');
        }
        if ($tag && $tag !== '*' && $tag !== $search->tagName) {
            return FALSE;
        }

        if ($key) {
            if ($no_key) {
                if ($search->hasAttribute($key)) {
                    return FALSE;
                }
            } elseif ($key != "plaintext" && !$search->hasAttribute($key)) {
                return FALSE;
            }

            // Explicit comparisons keep "0" usable as a value to match.
            if ($val !== NULL && $val !== '' && $val !== '*') {
                if ($key == "plaintext") {
                    $nodeKeyValue = $this->text($search);
                } else {
                    $nodeKeyValue = $search->getAttribute($key);
                }
                $check = $this->match($exp, $val, $nodeKeyValue);
                // A class attribute may list several names; try each one.
                if (!$check && strcasecmp($key, 'class') === 0) {
                    foreach (explode(' ', $nodeKeyValue) as $k) {
                        if ($k !== '' && $this->match($exp, $val, $k)) {
                            $check = TRUE;
                            break;
                        }
                    }
                }
                if (!$check) {
                    return FALSE;
                }
            }
        }

        $current--;
        if ($current < 0) {
            return $search;
        }
        return $this->seek($this->getParent($search), $selectors, $current) ? $search : FALSE;
    }

    /**
     * Parent of a node.
     *
     * @param \DOMNode $node
     * @return \DOMNode|null
     */
    private function getParent($node) {
        return $node->parentNode;
    }

}

一键复制

编辑

Web IDE

原始数据

按行查看

历史

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值