关闭

[置顶] php爬虫教程(四)抓取数据并进行处理

标签: php脚本爬虫
2068人阅读 评论(0) 收藏 举报
分类:

欢迎加入,新群号码:99640845


经过链接的分析,数据的分析,再加上规则的验证。

我们很容易地就拿到了打算抓取的数据,

so,我们就可以做我们想做的事情了。例如:

<?php
// Crawl a Qzone feed and tally who commented on it.
//
// For every comment found on the fetched feed pages, $arr (keyed by the
// commenter's nickname) receives:
//   qq       - the commenter's uin as reported by the feed
//   num      - how many comments by that nickname were seen
//   sex, age, birthday - copied from the profile endpoint's "data" object
//                        (null when the profile request fails)
// $arr is consumed by the reporting code that follows this loop.
header("Content-type:text/html;charset=utf8");
set_time_limit(0);
require('client.php');

/**
 * Unwrap a JSONP response ("callback({...});") and decode the JSON payload.
 *
 * The payload is taken as everything between the first "(" and the last ")",
 * instead of cutting a fixed number of bytes off each end.
 *
 * @param mixed $raw Response body as returned by the HTTP client.
 * @return array|null Decoded payload, or null when $raw is not a string,
 *                    has no parenthesised payload, or is not valid JSON.
 */
function qzone_decode_jsonp($raw)
{
    if (!is_string($raw)) {
        return null;
    }
    $open = strpos($raw, '(');
    $close = strrpos($raw, ')');
    if ($open === false || $close === false || $close <= $open) {
        return null;
    }
    $decoded = json_decode(substr($raw, $open + 1, $close - $open - 1), true);
    return is_array($decoded) ? $decoded : null;
}

$client = new client();
$base_url = "http://t.pp.cc/"; // NOTE(review): not referenced in this script; kept in case client.php reads it

// Session cookies copied by hand from a logged-in browser. They are
// credentials and expire; replace them before running the script.
$feed_cookie = 'pt2gguin=o0056707892; RK=MBl/Y/W2em; ptcz=3c94d72206e5c146a03701b2cd5baa2dbf898ced78a80ca14afcb1c4347815d3; pgv_pvid=9725655970; g_ut=2; 3g_guest_id=-9042816631926882304; o_cookie=56707892; pgv_pvi=1429736448; eas_sid=K1S4H5o7F6b68265o2T8t240H5; luin=o0056707892; lskey=00010000d8b324c3df16b631120077e9d27f35b7d564ebc529087b9dcbc2f7556d9126fe81efd33c2d046cfd; pgv_si=s9506151424; pgv_info=pgvReferrer=&ssid=s6703251255; ptisp=ctc; ptui_loginuin=; uin=; skey=@5ZzsPWzRc; verifysession=h01a106acab1cddfbb02999f5bd471c902ebe5ab556be3b40de657fe21ffea2f01c24e692c37c2bd63c; IED_LOG_INFO=uin*|nick*%u7B11%u7740%u770B%u4F60%u54ED%20|time*1461910804; qzone_check=56707892_1461913345; rv2=802C9F7A654B37CD767C9691A7A5A7BF7F09CAB51D6341AA0B; property20=41424F4482BCD05C0A25B282DF8B360B38C86FEB7860B26C51C256022F9C1879FF87187E60572F65; qqmusic_uin=0056707892; qqmusic_key=@5ZzsPWzRc; qqmusic_fromtag=6';
$profile_cookie = 'randomSeed=824410; QZ_FE_WEBP_SUPPORT=0; cpu_performance_v8=31; pt2gguin=; RK=MBl/Y/W2em; ptcz=3c94d72206e5c146a03701b2cd5baa2dbf898ced78a80ca14afcb1c4347815d3; pgv_pvid=9725655970; g_ut=2; 3g_guest_id=-9042816631926882304; o_cookie=; pgv_pvi=1429736448; eas_sid=K1S4H5o7F6b68265o2T8t240H5; luin=; lskey=00010000d8b324c3df16b631120077e9d27f35b7d564ebc529087b9dcbc2f7556d9126fe81efd33c2d046cfd; pgv_si=s9506151424; pgv_info=pgvReferrer=&ssid=s6703251255; ptisp=ctc; ptui_loginuin=675365043; uin=; skey=@5ZzsPWzRc; verifysession=h01a106acab1cddfbb02999f5bd471c902ebe5ab556be3b40de657fe21ffea2f01c24e692c37c2bd63c; IED_LOG_INFO=uin*675365043|nick*%u7B11%u7740%u770B%u4F60%u54ED%20|time*1461910804; zzpaneluin=; zzpanelkey=; p_skey=bAQZCU78gH4Qy0BSWeZ5pOsOdoKEnmVDRCdEi2HTIUY_; pt4_token=MNU3KRdqZCn9wQhASxnjt2lE*Ikt29Yf-6r8jHUPFMw_; p_uin=; qzone_check=56707892_1461913345; rv2=802C9F7A654B37CD767C9691A7A5A7BF7F09CAB51D6341AA0B; property20=41424F4482BCD05C0A25B282DF8B360B38C86FEB7860B26C51C256022F9C1879FF87187E60572F65; qqmusic_uin=; qqmusic_key=@5ZzsPWzRc; qqmusic_fromtag=6; __Q_w_s_hat_seed=1';

$page_count = 5;  // number of feed pages to fetch
$page_size = 20;  // feed entries requested per page (the "num" parameter)
$arr = array();   // nickname => collected stats, see header comment

for ($i = 0; $i < $page_count; $i++) {
    echo 'page:',$i,"\n";

    // The Cookie header is overwritten by the profile lookups below, so the
    // feed cookie has to be restored at the start of every page.
    $client->setHeader('Cookie', $feed_cookie);
    $client->setHeader('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8');
    $client->setHeader('Accept-Language', 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3');

    // "pos" is the offset of the first entry; derive it from the page index
    // so that each iteration really fetches a different page.
    $url = 'https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=1312024342&inCharset=utf-8&outCharset=utf-8&hostUin=1312024342&notice=0&sort=0&pos=' . ($i * $page_size) . '&num=' . $page_size . '&cgi_host=http%3A%2F%2Ftaotao.qq.com%2Fcgi-bin%2Femotion_cgi_msglist_v6&code_version=1&format=jsonp&need_private_comment=1&g_tk=978158941';
    $res = qzone_decode_jsonp($client->get($url));

    if (!empty($res['msglist']) && is_array($res['msglist'])) {
        foreach ($res['msglist'] as $v) {
            if (empty($v['commentlist']) || !is_array($v['commentlist'])) {
                continue;
            }
            foreach ($v['commentlist'] as $v2) {
                // A comment without a nickname or uin cannot be attributed.
                if (!isset($v2['name'], $v2['uin'])) {
                    continue;
                }
                $name = $v2['name'];

                if (isset($arr[$name])) {
                    // Known commenter: just bump the counter.
                    $arr[$name]['num'] = $arr[$name]['num'] + 1;
                    continue;
                }

                // First time this nickname is seen: look up the profile once.
                $client->setHeader('Cookie', $profile_cookie);
                $url2 = "http://base.s11.qzone.qq.com/cgi-bin/user/cgi_userinfo_get_all?uin=" . $v2['uin'] . "&vuin=56707892&fupdate=1&rd=0.3045121533378856&g_tk=1845089435";
                $res2 = qzone_decode_jsonp($client->get($url2));
                $data = (isset($res2['data']) && is_array($res2['data'])) ? $res2['data'] : array();

                $arr[$name] = array(
                    'qq'       => $v2['uin'],
                    'num'      => 1,
                    'sex'      => isset($data['sex']) ? $data['sex'] : null,
                    'age'      => isset($data['age']) ? $data['age'] : null,
                    'birthday' => isset($data['birthday']) ? $data['birthday'] : null,
                );
            }
        }
    }

    // Throttle between pages, including pages that came back empty.
    sleep(1);
}

// Print one line per commenter, most active first, followed by a summary.
// Expects $arr as built by the crawl loop: nickname => array with the keys
// 'qq', 'num', 'sex', 'age' and 'birthday'.
if (empty($arr)) die; // nothing was collected, so there is nothing to report

// Sort by comment count, highest first. uasort() keeps the nickname keys
// intact; array_multisort() would re-index numeric keys, so a nickname made
// only of digits would be replaced by 0, 1, 2, ... in the output.
uasort($arr, function ($a, $b) {
    if ($a['num'] == $b['num']) {
        return 0;
    }
    return ($a['num'] < $b['num']) ? 1 : -1;
});

$num = 0;   // entries whose 'sex' equals 2 (labelled "妹子" in the summary)
$num2 = 0;  // every other entry, including those with an unknown 'sex'
$num3 = 0;  // highest comment count seen
foreach ($arr as $k3 => $v3) {
    echo "昵称:",$k3,'账号:',$v3['qq'],'访问次数:',$v3['num'],'性别',$v3['sex'],'年龄',$v3['age'],'生日',$v3['birthday'],"\n";

    // Loose comparison on purpose: the decoded JSON may carry 2 or "2".
    if ($v3['sex'] == 2) {
        $num++;
    } else {
        $num2++;
    }

    if ($v3['num'] > $num3) {
        $num3 = $v3['num'];
    }
}
echo  "共有妹子:$num 人,其他:$num2 人,最高访问次数:$num3";


这是我之前写过的一个脚本,用来抓取某个QQ好友空间里所有点过赞、评过论的用户,也就是他的QQ好友 :)

并且对数据进行整理和分析,找出一些好玩的数据,例如:

//共有妹子:$num 人,其他:$num2 人,最高访问次数:$num3

这个脚本是半自动的,需要手动写入cookie来保持登录状态。

想写一个全自动的来着,实在是搞不懂tx的加密规则就放弃了(破涕为笑)


总结:至此,恭喜你已经学会抓取数据了,但是人的创造力是无限的。





2
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:33687次
    • 积分:821
    • 等级:
    • 排名:千里之外
    • 原创:48篇
    • 转载:3篇
    • 译文:0篇
    • 评论:22条
    最新评论