PHP纯手写正则爬取星座屋网站星座运势数据

星座屋(http://www.xzw.com/fortune/)运势界面:
clipboard.png

最终爬取数据结果展示在APP上的效果:
图片描述

图片描述

下面就是使用正则实现的代码,是自己一年多前花了半天时间写的。现在想来,如果使用Scrapy或者phpspider只用几行代码就搞定了,不用这么费力气了~

<?php
/**
* 星座运势
* author: pengfei
* http://www.xzw.com/fortune/aries/ 今日
* http://www.xzw.com/fortune/aries/1.html 明日
* http://www.xzw.com/fortune/aries/2.html 本周
* http://www.xzw.com/fortune/aries/3.html 本月
* http://www.xzw.com/fortune/aries/4.html 今年
* http://www.xzw.com/fortune/aries/5.html 爱情
*/
// Flag constant defined before the bootstrap is loaded; presumably checked by
// the included files to refuse direct access — TODO confirm in init.php.
define('IN_FTE', true);
// Load the project bootstrap located next to this script.
require(dirname(__FILE__) . '/includes/init.php');
// All date()/mktime() calls below use China Standard Time; the daily cache
// cut-off computed later in this script depends on it.
date_default_timezone_set('Asia/Shanghai');
// Encoder used for every response. The JSON class is not defined in this file
// (presumably provided via init.php — confirm).
$json  = new JSON();

/*
$constellation = array (
        '白羊座' => array('aries', '03/21-04/19'),
        '金牛座' => array('taurus', '04/20-05/20'),
        '双子座' => array('gemini', '05/21-06/21'),
        '巨蟹座' => array('cancer', '06/22-07/22'),
        '狮子座' => array('leo', '07/23-08/22'),
        '处女座' => array('virgo', '08/23-09/22'),
        '天秤座' => array('libra', '09/23-10/23'),
        '天蝎座' => array('scorpio', '10/24-11/22'),
        '射手座' => array('sagittarius', '11/23-12/21'),
        '魔羯座' => array('capricorn', '12/22-01/19'),
        '水瓶座' => array('aquarius', '01/20-02/18'),
        '双鱼座' => array('pisces', '02/19-03/20')
);*/

// ---------------------------------------------------------------------------
// Request parameters
//   xingzuo  : constellation slug, must be one of $all_xingzuo
//   category : 0 (or absent) = today, 1..5 = the numbered sub-pages listed in
//              the header comment of this file
// ---------------------------------------------------------------------------

// Only accept a plain string for the slug. An array-valued parameter
// (?xingzuo[]=x) would otherwise reach trim() and raise a TypeError on PHP 8.
$raw_xingzuo = isset($_REQUEST['xingzuo']) ? $_REQUEST['xingzuo'] : null;
$constellation = (is_string($raw_xingzuo) && !empty($raw_xingzuo)) ? trim($raw_xingzuo) : null;

// A missing/empty category means "today" and is normalised to 0 so it can be
// validated with a strict comparison. Non-scalar input is mapped to -1, which
// is not in the whitelist and is therefore rejected below.
$raw_category = isset($_REQUEST['category']) ? $_REQUEST['category'] : 0;
$category = is_scalar($raw_category) ? intval($raw_category) : -1;

$all_xingzuo = array(
    'aries',
    'taurus',
    'gemini',
    'cancer',
    'leo',
    'virgo',
    'libra',
    'scorpio',
    'sagittarius',
    'capricorn',
    'aquarius',
    'pisces'
    );
$all_category = array(0,1,2,3,4,5);

// Strict whitelist checks: both values end up in a remote URL and in a cache
// file name, so nothing outside these lists may pass.
if(!in_array($constellation, $all_xingzuo, true) || !in_array($category, $all_category, true)){
    exit('Params error');
}

// Category 0 maps to the constellation's index page, 1..5 to "<n>.html".
$domain = 'http://www.xzw.com/fortune/';
if($category){
    $apiUrl = $domain.$constellation.'/'.$category.'.html';
} else {
    $apiUrl = $domain.$constellation.'/';
}

// NOTE(review): the responses emitted later in this script are JSON-encoded
// while the declared type is text/html — confirm clients rely on this before
// changing it.
header("Content-type: text/html; charset=utf-8");
/**
 * Download one fortune page and extract its rating list and text sections.
 *
 * The page is read with file_get_contents(), converted from GBK to UTF-8 and
 * then picked apart with regular expressions:
 *   - div.c_main > dl > dd > ul  : "label:value" rows. Rows that contain an
 *     <em style="width:N..."> bar are reported as N/80 formatted with two
 *     decimals (80 is presumably the full bar width in px — confirm against
 *     the site's CSS); other rows report the raw text after the first colon.
 *   - div.c_cont                 : one entry per <p>, split into the heading
 *     text and the <span> body, with all tags and whitespace removed.
 *
 * @param string $url Page URL (anything file_get_contents() can open).
 * @return array Empty array when the download fails or neither section can be
 *               found, so callers can detect failure with empty(). Otherwise:
 *               'ul'   => list of array('label' => string, 'value' => string|null)
 *               'cont' => list of array('label' => string, 'value' => string|null)
 *                         (key present only when at least one paragraph matched)
 */
function getFortuneData($url){
    $html = file_get_contents($url);
    if ($html === false || $html === '') {
        return array();
    }
    $html = mb_convert_encoding($html, 'utf-8', 'gbk');

    // ---- rating list ------------------------------------------------------
    $ratings = array();
    if (preg_match('/<div class="c_main">(.*)<\/div>/ism', $html, $div_c_main)
        && preg_match('/<dl>(.*?)<\/dl>/ism', $div_c_main[1], $dl)
        && preg_match('/<dd>(.*?)<\/dd>/ism', $dl[1], $dd)
        && preg_match('/<ul>(.*?)<\/ul>/ism', $dd[1], $ul)) {

        // Turn every <label> into a split marker and drop the wrapper tags.
        $list = str_replace('<label>', '{label}', $ul[1]);
        $list = preg_replace('/<span[^>]*?>/ism', '', $list);
        $list = preg_replace('/<li[^>]*>/ism', '', $list);
        $list = preg_replace('/<\/label>/ism', '', $list);
        $list = preg_replace('/<\/li>/ism', '', $list);

        $rows = explode('{label}', $list);
        array_shift($rows); // text before the first label is not a row

        foreach ($rows as $row) {
            // Split on the first colon only, so a value that itself contains a
            // colon (or the "width:" inside the <em> style) stays intact.
            // NOTE(review): the separator is an ASCII colon; if the live page
            // uses a full-width colon the label will not split — confirm.
            $parts = explode(":", $row, 2);

            if (preg_match('/<em style="width:(\d{1,}).*">/ism', $row, $width)) {
                // Bar-style row: the value is derived from the bar width.
                $ratings[] = array(
                    'label' => preg_replace('/<em[^>]*?>/ism', '', $parts[0]),
                    'value' => sprintf('%0.2f', floatval($width[1]/80)),
                );
            } else {
                // Plain-text row.
                $ratings[] = array(
                    'label' => $parts[0],
                    'value' => isset($parts[1]) ? $parts[1] : null,
                );
            }
        }
    }

    // ---- text sections (c_cont) -------------------------------------------
    $sections = array();
    if (preg_match('/<div class="c_cont">(.*?)<\/div>/ism', $html, $cont)) {
        $text = preg_replace('/<strong[^>]*?>/ism', '', $cont[1]);
        // Keep paragraph/span boundaries as markers before stripping all tags.
        $text = str_replace('<span>', '{span}', $text);
        $text = str_replace('<p>', '{p}', $text);
        $text = preg_replace("'<[/!]*?[^<>]*?>'si","",$text);
        $text = preg_replace('/\s+/','',$text);

        $paragraphs = explode('{p}', $text);
        array_shift($paragraphs); // text before the first <p> is not a section

        foreach ($paragraphs as $paragraph) {
            $pieces = explode('{span}', $paragraph);
            $sections[] = array(
                'label' => $pieces[0],
                'value' => isset($pieces[1]) ? $pieces[1] : null,
            );
        }
    }

    // Nothing recognisable on the page (blocked request, layout change, ...):
    // report failure instead of returning an empty skeleton that would be
    // cached as if it were valid data.
    if (empty($ratings) && empty($sections)) {
        return array();
    }

    $fortune_data = array('ul' => $ratings);
    if (!empty($sections)) {
        $fortune_data['cont'] = $sections;
    }

    return $fortune_data;
}

// ---------------------------------------------------------------------------
// Serve the fortune data, using a per-page cache file that is valid for the
// calendar day on which it was written.
// ---------------------------------------------------------------------------
$write_result = ''; // cache write status; stays empty when nothing was written
$local_data = '';
$result = array();
$data = array();
$fileName = !empty($category) ? $constellation.'-'.$category.'.php' : $constellation.'.php';
$fortune_data_path = 'fortune_data/'.$fileName;

// The cache counts as fresh only if the file was modified today (00:00 local time or later).
$todayStart = mktime(0, 0, 0, date("m"), date("d"), date("Y"));

if (file_exists($fortune_data_path) && filemtime($fortune_data_path) >= $todayStart) {
    $local_data = @file_get_contents($fortune_data_path);
    if (!empty($local_data)) {
        // A truncated or corrupt file makes unserialize() return false; treat
        // that as a cache miss instead of sending "false" back as the payload.
        $cached = @unserialize($local_data);
        if (is_array($cached) && !empty($cached)) {
            $data = $cached;
        }
    }
}

if (empty($data)) {
    // No usable cache: fetch from the site.
    $data = getFortuneData($apiUrl);

    if (empty($data)) {
        $result['result'] = -1;
        $result['msg'] = '数据抓取失败!';
        $result['write_result'] = $write_result;
        $result['data'] = array();
        exit($json->encode($result));
    }

    // Write the cache only after the fetch is known to have produced data, so
    // a failed fetch can never overwrite the cache file.
    $write_result = write_fortune_cache($data, $fileName);
}

$result['result'] = 0;
$result['msg'] = 'success';
$result['write_result'] = $write_result;
$result['data'] = $data;
exit($json->encode($result));

/**
 * Serialize $data into ./fortune_data/<fileName>.
 *
 * The path is relative to the current working directory. Failure is reported
 * through the return value rather than by terminating the request, so the
 * caller can still deliver the data it has already fetched and include the
 * write status in its response.
 *
 * @param mixed  $data     Value to cache; stored with serialize().
 * @param string $fileName Cache file name inside the fortune_data directory.
 * @return string 'success' when the payload was written, 'fail' otherwise
 *                (including when the fortune_data directory is missing).
 */
function write_fortune_cache($data, $fileName){
    $cache_dir = './fortune_data';

    // Checking first avoids a PHP warning being printed into the response
    // when the directory has not been created.
    if (!is_dir($cache_dir)) {
        return 'fail';
    }

    // LOCK_EX takes an exclusive lock for the duration of the write so that
    // concurrent requests refreshing the same file do not interleave.
    $bytes = file_put_contents($cache_dir.'/'.$fileName, serialize($data), LOCK_EX);

    return $bytes ? 'success' : 'fail';
}
?>

End

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值