2021年8月7日09:44:05
之前一直使用phpspider
官网:https://doc.phpspider.org/
但是官方对psr4,对php7 php8似乎没有升级的意思,用的比较多就是 selector 选择器
现在使用的laravel8 php8的框架,所以一直没有做更改,我其实比较不接受
composer require owner888/phpspider
提示一堆psr4不兼容问题,确实有点强迫症,我就直接把selector 直接单独提取出来,试了下竟然没问题,可以单独使用
经过测试php 7.1 7.2 7.3 8.0都是运行OK的,挺好的,可以自己去拿出来,这里贴出来一个,如果需要curl客户端可以使用 GuzzleHttp\Client; 特别是你在使用laravel本身都是集成的
注意看官方文档
<?php
namespace App\Utils;
use DOMDocument;
use DOMXpath;
use Exception;
class Selector
{
/**
* 版本号
* @var string
*/
const VERSION = '1.0.2';
public static $dom = null;
public static $dom_auth = '';
public static $xpath = null;
public static $error = null;
public static function select($html, $selector, $selector_type = 'xpath')
{
if (empty($html) || empty($selector)) {
return false;
}
$selector_type = strtolower($selector_type);
if ($selector_type == 'xpath') {
return self::_xpath_select($html, $selector);
} elseif ($selector_type == 'regex') {
return self::_regex_select($html, $selector);
} elseif ($selector_type == 'css') {
return self::_css_select($html, $selector);
}
}
public static function remove($html, $selector, $selector_type = 'xpath')
{
if (empty($html) || empty($selector)) {
return false;
}
$remove_html = "";
$selector_type = strtolower($selector_type);
if ($selector_type == 'xpath') {
$remove_html = self::_xpath_select($html, $selector, true);
} elseif ($selector_type == 'regex') {
$remove_html = self::_regex_select($html, $selector, true);
} elseif ($selector_type == 'css') {
$remove_html = self::_css_select($html, $selector, true);
}
$html = str_replace($remove_html, "", $html);
return $html;
}
/**
* xpath选择器
*
* @param mixed $html
* @param mixed $selector
* @return void
* @author seatle <seatle@foxmail.com>
* @created time :2016-10-26 12:53
*/
private static function _xpath_select($html, $selector, $remove = false)
{
if (!is_object(self::$dom)) {
self::$dom = new DOMDocument();
}
// 如果加载的不是之前的HTML内容,替换一下验证标识
if (self::$dom_auth != md5($html)) {
self::$dom_auth = md5($html);
@self::$dom->loadHTML('<?xml encoding="UTF-8">' . $html);
self::$xpath = new DOMXpath(self::$dom);
}
//libxml_use_internal_errors(true);
//self::$dom->loadHTML('<?xml encoding="UTF-8">'.$html);
//$errors = libxml_get_errors();
//if (!empty($errors))
//{
//print_r($errors);
//exit;
//}
$elements = @self::$xpath->query($selector);
if ($elements === false) {
self::$error = "the selector in the xpath(\"{$selector}\") syntax errors";
// 不应该返回false,因为isset(false)为true,更不能通过 !$values 去判断,因为!0为true,所以这里只能返回null
//return false;
return null;
}
$result = array();
if (!is_null($elements)) {
foreach ($elements as $element) {
// 如果是删除操作,取一整块代码
if ($remove) {
$content = self::$dom->saveXml($element);
} else {
$nodeName = $element->nodeName;
$nodeType = $element->nodeType; // 1.Element 2.Attribute 3.Text
//$nodeAttr = $element->getAttribute('src');
//$nodes = util::node_to_array(self::$dom, $element);
//echo $nodes['@src']."\n";
// 如果是img标签,直接取src值
if ($nodeType == 1 && in_array($nodeName, array('img'))) {
$content = $element->getAttribute('src');
} // 如果是标签属性,直接取节点值
elseif ($nodeType == 2 || $nodeType == 3 || $nodeType == 4) {
$content = $element->nodeValue;
} else {
// 保留nodeValue里的html符号,给children二次提取
$content = self::$dom->saveXml($element);
//$content = trim(self::$dom->saveHtml($element));
$content = preg_replace(array("#^<{$nodeName}.*>#isU", "#</{$nodeName}>$#isU"), array('', ''), $content);
}
}
$result[] = $content;
}
}
if (empty($result)) {
return null;
}
// 如果只有一个元素就直接返回string,否则返回数组
return count($result) > 1 ? $result : $result[0];
}
/**
* css选择器
*
* @param mixed $html
* @param mixed $selector
* @return void
* @author seatle <seatle@foxmail.com>
* @created time :2016-10-26 12:53
*/
private static function _css_select($html, $selector, $remove = false)
{
$selector = self::css_to_xpath($selector);
//echo $selector."\n";
//exit("\n");
return self::_xpath_select($html, $selector, $remove);
// 如果加载的不是之前的HTML内容,替换一下验证标识
//if (self::$dom_auth['css'] != md5($html))
//{
//self::$dom_auth['css'] = md5($html);
//phpQuery::loadDocumentHTML($html);
//}
//if ($remove)
//{
//return phpQuery::pq($selector)->remove();
//}
//else
//{
//return phpQuery::pq($selector)->html();
//}
}
/**
* 正则选择器
*
* @param mixed $html
* @param mixed $selector
* @return void
* @author seatle <seatle@foxmail.com>
* @created time :2016-10-26 12:53
*/
private static function _regex_select($html, $selector, $remove = false)
{
if (@preg_match_all($selector, $html, $out) === false) {
self::$error = "the selector in the regex(\"{$selector}\") syntax errors";
return null;
}
$count = count($out);
$result = array();
// 一个都没有匹配到
if ($count == 0) {
return null;
} // 只匹配一个,就是只有一个 ()
elseif ($count == 2) {
// 删除的话取匹配到的所有内容
if ($remove) {
$result = $out[0];
} else {
$result = $out[1];
}
} else {
for ($i = 1; $i < $count; $i++) {
// 如果只有一个元素,就直接返回好了
$result[] = count($out[$i]) > 1 ? $out[$i] : $out[$i][0];
}
}
if (empty($result)) {
return null;
}
return count($result) > 1 ? $result : $result[0];
}
public static function find_all($html, $selector)
{
}
public static function css_to_xpath($selectors)
{
$queries = self::parse_selector($selectors);
$delimiter_before = false;
$xquery = '';
foreach ($queries as $s) {
// TAG
$is_tag = preg_match('@^[\w|\||-]+$@', $s) || $s == '*';
if ($is_tag) {
$xquery .= $s;
} // ID
else if ($s[0] == '#') {
if ($delimiter_before) {
$xquery .= '*';
}
// ID用精确查询
$xquery .= "[@id='" . substr($s, 1) . "']";
} // CLASSES
else if ($s[0] == '.') {
if ($delimiter_before) {
$xquery .= '*';
}
// CLASS用模糊查询
$xquery .= "[contains(@class,'" . substr($s, 1) . "')]";
} // ATTRIBUTES
else if ($s[0] == '[') {
if ($delimiter_before) {
$xquery .= '*';
}
// strip side brackets
$attr = trim($s, '][');
// attr with specifed value
if (mb_strpos($s, '=')) {
$value = null;
list($attr, $value) = explode('=', $attr);
$value = trim($value, "'\"");
if (self::is_regexp($attr)) {
// cut regexp character
$attr = substr($attr, 0, -1);
$xquery .= "[@{$attr}]";
} else {
$xquery .= "[@{$attr}='{$value}']";
}
} // attr without specified value
else {
$xquery .= "[@{$attr}]";
}
} // ~ General Sibling Selector
else if ($s[0] == '~') {
} // + Adjacent sibling selectors
else if ($s[0] == '+') {
} // PSEUDO CLASSES
else if ($s[0] == ':') {
} // DIRECT DESCENDANDS
else if ($s == '>') {
$xquery .= '/';
$delimiter_before = 2;
} // ALL DESCENDANDS
else if ($s == ' ') {
$xquery .= '//';
$delimiter_before = 2;
} // ERRORS
else {
exit("Unrecognized token '$s'");
}
$delimiter_before = $delimiter_before === 2;
}
return $xquery;
}
/**
* @access private
*/
public static function parse_selector($query)
{
$query = trim(preg_replace('@\s+@', ' ', preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query)));
$queries = array();
if (!$query) {
return $queries;
}
$special_chars = array('>', ' ');
$special_chars_mapping = array();
$strlen = mb_strlen($query);
$class_chars = array('.', '-');
$pseudo_chars = array('-');
$tag_chars = array('*', '|', '-');
// split multibyte string
// http://code.google.com/p/phpquery/issues/detail?id=76
$_query = array();
for ($i = 0; $i < $strlen; $i++) {
$_query[] = mb_substr($query, $i, 1);
}
$query = $_query;
// it works, but i dont like it...
$i = 0;
while ($i < $strlen) {
$c = $query[$i];
$tmp = '';
// TAG
if (self::is_char($c) || in_array($c, $tag_chars)) {
while (isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $tag_chars))) {
$tmp .= $query[$i];
$i++;
}
$queries[] = $tmp;
} // IDs
else if ($c == '#') {
$i++;
while (isset($query[$i]) && (self::is_char($query[$i]) || $query[$i] == '-')) {
$tmp .= $query[$i];
$i++;
}
$queries[] = '#' . $tmp;
} // SPECIAL CHARS
else if (in_array($c, $special_chars)) {
$queries[] = $c;
$i++;
// MAPPED SPECIAL MULTICHARS
// } else if ( $c.$query[$i+1] == '//') {
// $return[] = ' ';
// $i = $i+2;
} // MAPPED SPECIAL CHARS
else if (isset($special_chars_mapping[$c])) {
$queries[] = $special_chars_mapping[$c];
$i++;
} // COMMA
else if ($c == ',') {
$i++;
while (isset($query[$i]) && $query[$i] == ' ') {
$i++;
}
} // CLASSES
else if ($c == '.') {
while (isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $class_chars))) {
$tmp .= $query[$i];
$i++;
}
$queries[] = $tmp;
} // ~ General Sibling Selector
else if ($c == '~') {
$space_allowed = true;
$tmp .= $query[$i++];
while (isset($query[$i])
&& (self::is_char($query[$i])
|| in_array($query[$i], $class_chars)
|| $query[$i] == '*'
|| ($query[$i] == ' ' && $space_allowed)
)) {
if ($query[$i] != ' ') {
$space_allowed = false;
}
$tmp .= $query[$i];
$i++;
}
$queries[] = $tmp;
} // + Adjacent sibling selectors
else if ($c == '+') {
$space_allowed = true;
$tmp .= $query[$i++];
while (isset($query[$i])
&& (self::is_char($query[$i])
|| in_array($query[$i], $class_chars)
|| $query[$i] == '*'
|| ($space_allowed && $query[$i] == ' ')
)) {
if ($query[$i] != ' ')
$space_allowed = false;
$tmp .= $query[$i];
$i++;
}
$queries[] = $tmp;
} // ATTRS
else if ($c == '[') {
$stack = 1;
$tmp .= $c;
while (isset($query[++$i])) {
$tmp .= $query[$i];
if ($query[$i] == '[') {
$stack++;
} else if ($query[$i] == ']') {
$stack--;
if (!$stack) {
break;
}
}
}
$queries[] = $tmp;
$i++;
} // PSEUDO CLASSES
else if ($c == ':') {
$stack = 1;
$tmp .= $query[$i++];
while (isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $pseudo_chars))) {
$tmp .= $query[$i];
$i++;
}
// with arguments ?
if (isset($query[$i]) && $query[$i] == '(') {
$tmp .= $query[$i];
$stack = 1;
while (isset($query[++$i])) {
$tmp .= $query[$i];
if ($query[$i] == '(') {
$stack++;
} else if ($query[$i] == ')') {
$stack--;
if (!$stack) {
break;
}
}
}
$queries[] = $tmp;
$i++;
} else {
$queries[] = $tmp;
}
} else {
$i++;
}
}
if (isset($queries[0])) {
if (isset($queries[0][0]) && $queries[0][0] == ':') {
array_unshift($queries, '*');
}
if ($queries[0] != '>') {
array_unshift($queries, ' ');
}
}
return $queries;
}
public static function is_char($char)
{
return preg_match('@\w@', $char);
}
/**
* 模糊匹配
* ^ 前缀字符串
* * 包含字符串
* $ 后缀字符串
* @access private
*/
protected static function is_regexp($pattern)
{
return in_array(
$pattern[mb_strlen($pattern) - 1],
array('^', '*', '$')
);
}
}
使用:
use GuzzleHttp\Client;
use App\Utils\Selector;
class SpiderService extends BaseService
{
/**
* @param int $page
* http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html
*/
public static function getSsq(int $page)
{
// phpinfo();
// die;
$client = new Client();
$body = $client->get('http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html');
$html = (string)$body->getBody();
//pp($data);
$data = Selector::select($html, "table");
pp($data);
}