我的php爬虫

一:通过curl抓取网站的产品列表url并入库

<?php

namespace App\Console\Commands\Cacuqecig;

use Illuminate\Console\Command;
use App\Models\Cacuqecig\Cacuqecig;

class Product extends Command {

    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'loadProductFromCacuqecig';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'loadProductFromCacuqecig';

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct() {
        parent::__construct();
    }

    /**
     * Crawl the paginated search listing and persist every product URL found.
     *
     * Pages 1..999 are requested in order; crawling stops at the first page
     * that yields no product entries (including a failed request, for which
     * get() returns an empty string).
     *
     * @return int 0 on completion. An integer is returned instead of `true`
     *             because a boolean can be coerced to exit status 1 by
     *             framework versions that cast the handler result to int.
     */
    public function handle()
    {
        // Loop-invariant: one <li skuid="..."> entry per product, followed by its link.
        $pattern = '/<li skuid="(?<sku>.*?)" data-pro="[0-9]">(.*?)<a href="(?<url>.*?)"\>/';

        for ($n = 1; $n < 1000; ++$n) {
            $url = "https://www.cacuqecig.com/search.html?page_size=48&p={$n}";
            $rtn = $this->get($url);
            preg_match_all($pattern, $rtn, $matches);

            if (empty($matches['sku'])) {
                break;
            }

            foreach ($matches['url'] as $productUrl) {
                $param = [
                    'url' => $productUrl,
                ];
                // Upsert keyed on the URL. The previous code called the static
                // `updated()` method, which registers a model-event listener
                // and never writes to the database.
                Cacuqecig::updateOrCreate(['url' => $productUrl], $param);
            }
        }

        return 0;
    }

    /**
     * Fetch a URL with cURL and return the body with all "\n" characters removed.
     *
     * @param string $url Absolute URL to request.
     *
     * @return string Response body without line feeds; an empty string when
     *                the transfer fails (the failure is written to the PHP
     *                error log).
     */
    public function get($url)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        // NOTE(review): peer verification is disabled, so the TLS certificate
        // is not validated. Kept as-is to preserve behaviour; consider enabling.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        $rtn = curl_exec($ch);
        $error = ($rtn === false) ? curl_error($ch) : '';
        // Release the handle on every path; the loop in handle() may create
        // up to 999 of them.
        curl_close($ch);

        if ($rtn === false) {
            error_log("Product::get failed for {$url}: {$error}");

            return '';
        }

        // Line feeds are stripped so the single-line regex in handle() can
        // match across what were separate lines of markup.
        return str_replace("\n", '', $rtn);
    }

}

二:数据库获取产品列表url,遍历详情,正则匹配所要数据,包括模拟用户登录

<?php

namespace App\Console\Commands\Cacuqecig;

use App\Exceptions\Handler;
use Illuminate\Console\Command;
use App\Console\Commands\Cacuqecig\Product;
use App\Models\Elegomall\Elegomall;
use App\Models\Cacuqecig\Cacuqecig;

class ProductInfo extends Command {

    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'loadProductInfoFromCacuqecig';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'loadProductInfoFromCacuqecig';

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct() {
        parent::__construct();
    }

    /**
     * Log in, then download every stored product page and record one row per
     * child product (SKU) with its title, image, price, stock and the stock
     * difference against the previous day's row.
     *
     * Product pages that contain no '"list-mods' section are skipped.
     *
     * @return mixed
     */
    public function handle()
    {
        $root = "https://www.cacuqecig.com";
        $cookie = base_path() . '/cookie_cacuqecig.txt';
        $this->login($cookie);

        // Loop-invariant dates: today's snapshot date and the previous day.
        $today = date_format(now(), 'Y-m-d');
        $yesterday = date('Y-m-d', strtotime($today) - 3600 * 24);

        // Loop-invariant patterns for the product-level fields.
        $title = '/<div class="pro_show_name">(\s*.*?)<\/div>/s';
        $img = '/<div class="list-img"><img.*? src=\"(.*?)\".*?\/><\/div>/s';
        $price = '/<span class="price".*?>(.*?)<\/span>/is';
        $patternAttr = '/<div class="pro-attr" title="(?<sku>.*?)">(?<attr>.*?)<span/';

        // Read the product list from the database and download each detail page.
        $products_url = Cacuqecig::select('url')->get()->toArray();
        foreach ($products_url as $product) {
            $url = "{$root}{$product['url']}";
            $rtn = $this->get($url, $cookie);

            preg_match_all($title, $rtn, $matchetitle);
            preg_match_all($img, $rtn, $matcheimg);
            preg_match_all($price, $rtn, $matcheprice);

            // Extract the child products' SKUs and attributes. Each piece
            // after the first split corresponds to one '"list-mods' section.
            $subs = explode('"list-mods', $rtn);
            if (empty($subs[1])) {
                continue;
            }
            unset($subs[0]);
            preg_match_all($patternAttr, $subs[1], $matcheAttrs);

            foreach ($matcheAttrs['sku'] as $k => $sku) {
                // The SKU comes from scraped markup, so escape it before
                // embedding it in a pattern; otherwise regex metacharacters in
                // a SKU would break or silently alter the match.
                $patternStock = '/<span class="liststock' . preg_quote($sku, '/') . '">(?<qty>.*?)<\/span>/';

                // Look up the child product's stock in each section.
                // NOTE(review): every section overwrites the previous value, so
                // only the last section's quantity (or 0 if it has no entry for
                // this SKU) is kept. Confirm whether a sum was intended.
                $qty = 0;
                foreach ($subs as $sub) {
                    preg_match($patternStock, $sub, $matcheStock);
                    $qty = $matcheStock['qty'] ?? 0;
                }

                $productTitle = $matchetitle[1][0] ?? 0;
                $param = [
                    'title'      => $productTitle,
                    'product'    => $productTitle,
                    'img'        => $matcheimg[1][0] ?? 0,
                    'sku'        => $sku,
                    'color'      => $matcheAttrs['attr'][$k] ?? '',
                    'origin'     => 'cacuqecig',
                    'types'      => $url,
                    'stock'      => $qty,
                    // Index 1: the second "price" span on the page is used.
                    'price'      => $matcheprice[1][1] ?? 0,
                    'created_at' => $today,
                ];

                // Previous day's row for the same title and colour, if any.
                $stockLast = Elegomall::select('title', 'color', 'stock')
                    ->where('title', $param['title'])
                    ->where('color', $param['color'])
                    ->where('created_at', $yesterday)
                    ->first();
                $last_day = empty($stockLast->stock) ? 0 : $stockLast->stock;

                // Today minus the previous day: a positive value is the
                // restocked quantity, a negative value is the quantity sold.
                $param['diff'] = $param['stock'] - $last_day;
                Elegomall::create($param);
            }
        }
    }

    /**
     * Submit the login form and store the session cookies in a cookie-jar file.
     *
     * @param string $cookie Path of the cookie-jar file to write.
     *
     * @return string The same cookie-jar path that was passed in.
     */
    public function login($cookie)
    {
        // NOTE(review): credentials are hard-coded in source. Consider moving
        // them to configuration; left unchanged here to preserve behaviour.
        $post = [
            'email' => '1279991307@qq.com',
            'password' => 'cy123456',
        ];

        $UserAgent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; .NET CLR 3.5.21022; .NET CLR 1.0.3705; .NET CLR 1.1.4322)';
        $headers = [
            "Content-type: application/x-www-form-urlencoded; charset=UTF-8",
        ];

        $curl = curl_init();

        // Login endpoint.
        curl_setopt($curl, CURLOPT_URL, "https://www.cacuqecig.com/user/loginAjax");
        // Do not include response headers in the returned body.
        curl_setopt($curl, CURLOPT_HEADER, false);
        // Return the response instead of printing it.
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        // NOTE(review): TLS peer verification is disabled.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
        // Cookies received are written to this file when the handle is closed.
        curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie);
        curl_setopt($curl, CURLOPT_POST, 1);
        curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($post));
        curl_setopt($curl, CURLOPT_USERAGENT, $UserAgent);
        curl_setopt($curl, CURLOPT_HTTPHEADER, $headers);

        if (curl_exec($curl) === false) {
            // Surface transport failures instead of silently continuing with
            // an unauthenticated session.
            error_log('ProductInfo::login request failed: ' . curl_error($curl));
        }
        curl_close($curl);

        return $cookie;
    }

    /**
     * Fetch a URL using the stored session cookies and return the body with
     * all "\n" characters removed.
     *
     * @param string $url    Absolute URL to request.
     * @param string $cookie Path of the cookie file to read.
     *
     * @return string Response body without line feeds; an empty string when
     *                the transfer fails (the failure is written to the PHP
     *                error log).
     */
    public function get($url, $cookie)
    {
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        // NOTE(review): TLS peer verification is disabled.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        // Send the cookies saved by login().
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie);
        $rtn = curl_exec($ch);
        $error = ($rtn === false) ? curl_error($ch) : '';
        // Release the handle on every path; one is created per product page.
        curl_close($ch);

        if ($rtn === false) {
            error_log("ProductInfo::get failed for {$url}: {$error}");

            return '';
        }

        return str_replace("\n", '', $rtn);
    }

}

三:配置定时任务

/**
 * Register the crawler commands with the scheduler.
 *
 * Every command listed below is registered with daily() and
 * withoutOverlapping(), in the order shown.
 *
 * @param Schedule $schedule Scheduler instance to register the commands on.
 *
 * @return void
 */
protected function schedule(Schedule $schedule)
{
    $crawlerCommands = [
        'loadProductFromElegomall',
        'loadProductFromCacuqecig',
        'loadProductInfoFromCacuqecig',
        'loadProductFromDemandvape',
        'loadProductInfoFromDemandvape',
    ];

    foreach ($crawlerCommands as $commandName) {
        $schedule->command($commandName)->daily()->withoutOverlapping();
    }
}

四:运行代码,获取数据

win: cmd命令行执行  php artisan xxxxxxxxxxxxxx

Linux: crontab -e 配置

 

五:数据展示

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值