PHP爬取网站内容

最新推荐文章于 2024-06-26 15:22:47 发布

2019ab

最新推荐文章于 2024-06-26 15:22:47 发布

阅读量279

点赞数

分类专栏： PHP 文章标签： php 爬虫

本文链接：https://blog.csdn.net/ab15176142633/article/details/119945985

版权

PHP 同时被 2 个专栏收录

44 篇文章 1 订阅

订阅专栏

ThinkPHP

17 篇文章 0 订阅

订阅专栏

最近公司需要把存放在阿里云对象存储（OSS）里的视频文件下载下来，一共是18套课程的视频源文件，而且这周就要。我想了一下，要是一个一个去找那可就麻烦了：一套课程有 n 个章节，每个章节又有 n 个视频文件。所以我下定决心写一个自动下载的程序。

废话不多说，先看效果

在这里插入图片描述

下面是代码

<?php 

/**
 * Minimal HTTP POST helpers, one per transport: cURL, stream wrappers and raw sockets.
 *
 * Every method returns the response as a string, or false when the request
 * could not be performed.
 */
class Request{

    /**
     * Sends a POST request through the cURL extension.
     *
     * @param string $url       Target URL.
     * @param mixed  $post_data Request body handed to CURLOPT_POSTFIELDS; skipped when it
     *                          loosely equals ''.
     * @param int    $timeout   Connection timeout in seconds. Only CURLOPT_CONNECTTIMEOUT is
     *                          set, so the transfer itself is not time-limited.
     * @return string|false Response body without headers, or false on failure.
     */
    public static function post($url, $post_data = '', $timeout = 5){//curl
        $ch = curl_init();
        if ($ch === false) {
            // No handle could be created; report failure the same way curl_exec() does.
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_POST, 1);
        if ($post_data != '') {
            curl_setopt($ch, CURLOPT_POSTFIELDS, $post_data);
        }
        // Return the body from curl_exec() instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_HEADER, false);

        $file_contents = curl_exec($ch);
        curl_close($ch);

        return $file_contents;
    }

    /**
     * Sends a form-encoded POST request through file_get_contents() and a stream context.
     *
     * @param string $url  Target URL.
     * @param array  $data Form fields, encoded with http_build_query().
     * @return string|false Response body, or false on failure.
     */
    public static function post2($url, $data=array()){//file_get_content
        $postdata = http_build_query($data);

        $opts = array(
            'http' => array(
                'method'  => 'POST',
                'header'  => 'Content-type: application/x-www-form-urlencoded',
                'content' => $postdata,
            ),
        );
        $context = stream_context_create($opts);

        return file_get_contents($url, false, $context);
    }

    /**
     * Sends a form-encoded POST request over a raw socket to port 80 of $host.
     *
     * The request is hand-built HTTP/1.1 with "Connection: close". The return value is
     * everything read from the socket, i.e. status line, headers and body, with no decoding.
     *
     * @param string $host   Host name; used for both the connection and the Host header.
     * @param string $path   Request path placed on the request line.
     * @param string $query  Already-encoded request body.
     * @param string $others Extra raw header lines; each must end with "\r\n".
     * @return string|false Raw response, or false when the connection cannot be opened.
     */
    public static function post3($host,$path,$query,$others=''){//fsocket
        $post  = "POST $path HTTP/1.1\r\nHost: $host\r\n";
        // "{$others}" replaces the "${others}" form, which is deprecated since PHP 8.2.
        $post .= "Content-type: application/x-www-form-urlencoded\r\n{$others}";
        $post .= "User-Agent: Mozilla 4.0\r\nContent-length: ";
        $post .= strlen($query)."\r\nConnection: close\r\n\r\n$query";

        $h = fsockopen($host, 80);
        if ($h === false) {
            // Without a handle, fwrite()/fread()/fclose() below would fail on false.
            return false;
        }

        fwrite($h, $post);

        // Read until the first empty or failed read, then stop.
        $r = '';
        while (true) {
            $b = fread($h, 8192);
            if ($b === false || $b === '') {
                break;
            }
            $r .= $b;
        }
        fclose($h);

        return $r;
    }

}

// Disable the execution time limit: downloading a whole course takes a long time.
ini_set('max_execution_time', '0');
// Raise the PHP memory limit.
ini_set('memory_limit', '4048M');
// Course ids to fetch, one per run: 21,30,132,9,77,128,129,133,130,134,7,16,135,29,31,92,146,147
// Id 77 failed on an earlier run.
$data = Request::post2('https://www.xxxx.cn/index/details_data',array('id'=>77));
echo '<pre>';
$data = json_decode($data,true);
if (!is_array($data)) {
    // The request failed or the response was not a JSON array/object; nothing to download.
    echo 'Failed to fetch or decode the course data.';
    $data = array();
}

foreach ($data as $course) {
    // Only array entries describe a course; other values in the response are skipped.
    if (!is_array($course)) {
        continue;
    }

    // Create the course folder. Directory names are converted to GBK as before.
    $courseDir = iconv("UTF-8", "GBK", "Public/".$course['name']);
    if ($courseDir === false || (!is_dir($courseDir) && !mkdir($courseDir, 0777, true))) {
        continue;
    }

    if (!isset($course['Catalogdata']) || !is_array($course['Catalogdata'])) {
        continue;
    }

    foreach ($course['Catalogdata'] as $section) {
        // Create the section folder inside the course folder.
        $dir = iconv("UTF-8", "GBK", "Public/".$course['name'].'/'.$section['id'].$section['name']);
        if ($dir === false || (!is_dir($dir) && !mkdir($dir, 0777, true))) {
            continue;
        }

        // Counter of chapter videos saved for this section, echoed as progress.
        $i = 0;

        // The section's own video, if any. URLs are prefixed with "http:", so they are
        // presumably protocol-relative ("//host/path") -- confirm against the API.
        // copy() streams to disk instead of holding the whole file in memory.
        if (!empty($section['video_url'])) {
            copy('http:'.$section['video_url'], $dir.'/'.$section['name'].'.mov');
        }

        // Chapter videos; "vedio" is the key name as used by the response.
        if (isset($section['chapter']) && is_array($section['chapter'])) {
            foreach ($section['chapter'] as $video) {
                // Each file is downloaded exactly once. copy() emits a warning on failure;
                // failed files are not counted.
                if (copy('http:'.$video['vedio'], $dir.'/'.$video['title'].'.mov')) {
                    echo ++$i;
                }
            }
        }
    }
}
?>

由于有些特别大的文件下载还有点问题，所以我没有循环遍历所有课程 id，而是每次只下载一套课程，方便下载一套、检查一套。

要是我一个一个手动下载，估计得需要两天多；写这个程序用了三个小时，下载文件大约两个小时，极大地缩短了工作时间，提高了工作效率。感谢大家观看，我们下次见。

2019ab

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
打赏
0
评论
PHP爬取网站内容

最近公司需要存在阿里云对象存储（oss）里的视频文件，而且需要18套课程的视频源文件，这周就要，我想了一下这要是一个一个找那可就麻烦了。要想想一套课程有n个章节，每个章节有n个视频文件。所以我下定决心要写一个自动下载的程序。废话不多说，先看效果下面是代码<?php class Request{ public static function post($url, $post_data = '', $timeout = 5){//curl $ch = curl_init();
复制链接

扫一扫