phpspider 完整使用技巧 含代码

目标:抓取 http://www.cnbaowen.net/news/list-3720-1.html 页面右侧栏目中的文章,并保存到名为 6.1 的数据库的 spider_baowen 表中。

我用的windows系统

 

下载

1) https://github.com/owner888/phpspider

2) https://pan.baidu.com/s/10n9ZOUQBlrJzOQx0ShOmMQ    提取码:b2zc

创建数据库与相关表

 CREATE TABLE `spider_baowen` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(200) CHARACTER SET utf8mb4 DEFAULT NULL,
  `content` varchar(200) CHARACTER SET utf8mb4 DEFAULT NULL,
  `site_url` varchar(200) CHARACTER SET utf8mb4 DEFAULT '0' COMMENT '文章类型 1行业资讯 2技术资料',
  `site_id` int(5) DEFAULT NULL COMMENT '站点id',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;

将下面的 PHP 代码保存到 phpspider 的 demo 目录下,文件命名为 test_baowenwang.php:

<?php
// composer下载方式
// 先使用composer命令下载:
// composer require owner888/phpspider
// 引入加载器
//require './vendor/autoload.php';

// GitHub下载方式
require_once __DIR__ . '/../autoloader.php';

use phpspider\core\phpspider;
use phpspider\core\log;
use phpspider\core\selector;

/* Do NOT delete this comment */
/* 不要删除这段注释 */

// Crawler configuration handed to the phpspider constructor: target site,
// logging, database export, entry URLs, URL patterns and the fields to extract.
$configs = [
    'name' => '保温网',
    'domains' => [
        'www.cnbaowen.net',
    ],
    'log_file' => 'data/test_baowenwang.log',
    'log_type' => 'warn,debug',
    'tasknum' => 5,
    'max_depth' => 1,

    // Extracted rows are exported to the `spider_baowen` database table.
    'export' => [
        'type' => 'db',
        'table' => 'spider_baowen',
    ],

    // Connection settings for the database named "6.1".
    'db_config' => [
        'host' => '127.0.0.1',
        'port' => '3306',
        'user' => 'root',
        'pass' => 'root',
        'name' => '6.1',
    ],

    // Entry page of the crawl.
    'scan_urls' => [
        'http://www.cnbaowen.net/news/list-3720-1.html',
    ],

    // URL patterns. The dots are escaped so that "." only matches a literal
    // dot; unescaped, it would match any character (e.g. "show-1xhtml").
    'content_url_regexes' => [
        'http://www\.cnbaowen\.net/news/show-\d+\.html',
    ],
    'list_url_regexes' => [
        'http://www\.cnbaowen\.net/news/list-3720-\d+\.html',
    ],

    // Fields stored for every article. `title` and `content` are read from the
    // page via XPath; `site_url` and `site_id` have no selector and get their
    // values from the on_extract_field hook further down in this script.
    'fields' => [
        [
            'name' => 'title',
            'selector' => "//h1[@id='title']",
            'required' => true,
        ],
        [
            'name' => 'content',
            'selector' => "//div[@id='content']",
            'required' => true,
        ],
        [
            'name' => 'site_url',
        ],
        [
            'name' => 'site_id',
        ],
    ],
];

// Create the crawler from the $configs array defined above.
$spider = new phpspider($configs);


// on_start hook: queues pages 1 and 2 of list 3720 on the crawler it is given.
$spider->on_start = function ($spider) {
    foreach (range(1, 2) as $pageNo) {
        $listUrl = "http://www.cnbaowen.net/news/list-3720-{$pageNo}.html";
        $spider->add_url($listUrl);
    }
};
// on_list_page hook: collects article links from one list page.
//
// Only the right-hand column block of the page is searched, so links that
// appear elsewhere on the page are not queued. Every distinct article URL
// found there is added to the crawler's queue.
//
// $page    - page descriptor supplied by the crawler (not used here).
// $content - raw HTML of the list page.
// $spider  - the crawler instance; discovered URLs are queued via add_url().
//
// Always returns false, which tells the crawler not to discover further URLs
// from the current page on its own.
$spider->on_list_page = function ($page, $content, $spider) {
    // Alternative selector for the article links in the middle column:
    //   "//span[@class='f_r']"

    // Article links in the right-hand column.
    $sidebarHtml = selector::select($content, "//div[@class='box_body thumb']");

    // The selector can hand back several fragments as an array; join them so
    // a single regex pass covers all of them.
    if (is_array($sidebarHtml)) {
        $sidebarHtml = implode('', $sidebarHtml);
    }

    // Nothing usable matched (e.g. the layout changed): there are no links to queue.
    if (!is_string($sidebarHtml) || $sidebarHtml === '') {
        return false;
    }

    // Dots are escaped so that "." only matches a literal dot in the URL.
    $articleUrlPattern = '#http://www\.cnbaowen\.net/news/show-\d+\.html#';

    if (preg_match_all($articleUrlPattern, $sidebarHtml, $matches)) {
        foreach (array_unique($matches[0]) as $articleUrl) {
            $spider->add_url($articleUrl);
        }
    }

    // Tell the crawler not to discover further URLs from the current page.
    return false;
};


// on_content_page hook: does no processing of its own and always returns false.
// NOTE(review): presumably, as with the list-page hook, false stops the crawler
// from discovering further URLs on the page - confirm against the phpspider docs.
$spider->on_content_page = function ($page, $content, $phpspider) {
    $hookResult = false;

    return $hookResult;
};

// on_extract_field hook: post-processes one extracted field value.
//
// $fieldname - name of the field being processed (see 'fields' in $configs).
// $data      - value extracted for that field.
// $page      - page descriptor; $page['url'] is the URL of the current page.
//
// Returns, per field:
//   content  - the HTML with the right-floated <div> removed, every link
//              target replaced by '#', all <img> tags dropped, and the result
//              cut to at most 1000 characters;
//   site_url - the URL of the current page;
//   site_id  - the constant 1;
//   others   - $data unchanged.
$spider->on_extract_field = function ($fieldname, $data, $page) {
    if ($fieldname === 'site_url') {
        return $page['url'];
    }

    if ($fieldname === 'site_id') {
        return 1;
    }

    if ($fieldname !== 'content') {
        return $data;
    }

    // NOTE(review): the selector can return an array of fragments (the
    // list-page hook handles that case); whether field values can arrive as
    // arrays here is not visible in this script, so join defensively instead
    // of letting mb_substr() fail on a non-string.
    $html = is_array($data) ? implode('', $data) : (string) $data;

    // Cleanup passes, applied in order: pattern => replacement.
    $cleanupPasses = [
        '/<div style="float:right[\s\S]*?div>/' => '',
        '/<a .*?href="(.*?)".*?>/is' => "<a href='#'>",
        '/<img.*?>/is' => '',
    ];

    foreach ($cleanupPasses as $pattern => $replacement) {
        $cleaned = preg_replace($pattern, $replacement, $html);

        // preg_replace() returns null on a PCRE error (e.g. backtrack limit on
        // a very large page); keep the previous text rather than wiping a
        // required field.
        if ($cleaned !== null) {
            $html = $cleaned;
        }
    }

    // Cap the stored length at 1000 characters. The encoding is given
    // explicitly so the cut does not depend on mb_internal_encoding().
    // NOTE(review): the cut may land inside an HTML tag, and the `content`
    // column must be wide enough for 1000 characters.
    return mb_substr($html, 0, 1000, 'UTF-8');
};
$spider->start();

打开命令行,进入 phpspider 的 demo 目录(即 test_baowenwang.php 所在的目录):cd /d <demo 目录的完整路径>

php -f test_baowenwang.php

 

查看数据库表,完毕;

发布了67 篇原创文章 · 获赞 6 · 访问量 5万+
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 编程工作室 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览