(可爱的刘看山用来做图标还是不错的!)
有没有发现知乎的Android平台客户端不支持看自己关注的收藏夹?每次都用电脑看感觉好麻烦啊,所以写了这个小软件:用PHP(暂时不会Python……)爬取知乎网站(“知乎 - 与世界分享你的知识、经验和见解”)来获取收藏夹里面的内容,并将结果以JSON数据返回:
{"URL":["http:\/\/www.zhihu.com\/collection\/19906419","http:\/\/www.zhihu.com\/collection\/29347192","http:\/\/www.zhihu.com\/collection\/34260000","http:\/\/www.zhihu.com\/collection\/21104418","http:\/\/www.zhihu.com\/collection\/30984146","http:\/\/www.zhihu.com\/collection\/42109172","http:\/\/www.zhihu.com\/collection\/19633165","http:\/\/www.zhihu.com\/collection\/26815754"],"TITLE":["\u62cd\u6848\u53eb\u7edd\u6216\u5927\u7b11","\u5982\u679c\u8fd8\u6709\u4ec0\u4e48\u80fd\u591f\u957f\u59ff\u52bf","\u4e00\u672c\u6b63\u7ecf\u5730\u626f\u6de1","\u7231\u60c5\u3002\u4ee5\u7231\u4e4b\u540d\u3002","\u8fd9\u624d\u662f\u771f\u6b63\u725b\u903c\u7684\u6545\u4e8b","\u5410\u69fd\u5410\u5f97\u6211\u90fd\u5c04\u4e86\uff01\u96c5\u881b\u8776\uff01","\u6253\u51fb\u9898\u4e3b","\u77e5\u5973\u4e4e"],"NUM":[6,0,0,0,0,0,0,0],"SMALLTITLE":["\u7531 \u4e8e\u6e90 \u521b\u5efa \u2022 1041 \u4e2a\u7b54\u6848","\u7531 \u6768\u6d0b \u521b\u5efa \u2022 255 \u4e2a\u7b54\u6848","\u7531 \u8c22\u676d \u521b\u5efa \u2022 678 \u4e2a\u7b54\u6848","\u7531 \u4e01\u5e7f\u6770 \u521b\u5efa \u2022 782 \u4e2a\u7b54\u6848","\u7531 \u6587\u897f \u521b\u5efa \u2022 41 \u4e2a\u7b54\u6848","\u7531 \u5218\u5947\u5947 \u521b\u5efa \u2022 368 \u4e2a\u7b54\u6848","\u7531 \u738b\u7d0d\u7c73 \u521b\u5efa \u2022 2982 \u4e2a\u7b54\u6848","\u7531 \u7ae5\u540d \u521b\u5efa \u2022 315 \u4e2a\u7b54\u6848"],"CREATOR_URL":["http:\/\/www.zhihu.com\/people\/yu_yuan","http:\/\/www.zhihu.com\/people\/yang-yang-32-76","http:\/\/www.zhihu.com\/people\/Tse_","http:\/\/www.zhihu.com\/people\/ding-yan-jie-92","http:\/\/www.zhihu.com\/people\/VinceJ","http:\/\/www.zhihu.com\/people\/6hikki","http:\/\/www.zhihu.com\/people\/nano.wang","http:\/\/www.zhihu.com\/people\/hmybz"],"ICON":"http:\/\/pic3.zhimg.com\/4b1255a7e_l.jpg"}
点击后就可以用WebView来访问手机网页版知乎了,这点确实比较偷懒,不过也是因为没有自己的服务器,所有都写爬虫的话我的SAE豆子估计很快就没了。
<?php
/**
 * Small cURL wrapper used to issue requests with a pre-obtained session cookie.
 *
 * Redirects are followed automatically and the response is returned as a
 * string instead of being printed.
 *
 * @param string|null       $url       URL to request; not set on the handle when null.
 * @param string            $post      Request method; "post" (any letter case) enables CURLOPT_POST.
 * @param array|null        $header    List of raw header lines ("Name: value"); skipped when null.
 * @param string|null       $cookie    Cookie header value; skipped when null.
 * @param array|string|null $data      Form fields as key => value pairs (URL-encoded here),
 *                                     or an already-encoded body string; skipped when null/empty.
 * @param bool              $retHeader Whether the response headers are included in the returned
 *                                     string (needed to read Set-Cookie headers).
 * @return string|bool The response (headers first when $retHeader is true), or false when
 *                     curl_exec() fails.
 */
function myCurl($url, $post, $header, $cookie, $data, $retHeader = false){
    $ch = curl_init();

    if ($url != null) {
        curl_setopt($ch, CURLOPT_URL, $url);
    }
    if ($header != null) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }
    if ($cookie != null) {
        curl_setopt($ch, CURLOPT_COOKIE, $cookie);
    }

    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_HEADER, $retHeader);

    if (is_string($post) && strtoupper($post) === 'POST') {
        curl_setopt($ch, CURLOPT_POST, true);
    }

    if ($data != null) {
        // Pairs must be joined with '&' and URL-encoded to form a valid
        // application/x-www-form-urlencoded body; http_build_query does both.
        // A string is assumed to be an already-encoded body and is sent as is.
        $postFields = is_array($data) ? http_build_query($data) : $data;
        curl_setopt($ch, CURLOPT_POSTFIELDS, $postFields);
    }

    $ret = curl_exec($ch);
    curl_close($ch);

    return $ret;
}
/**
 * Extract the header section from a raw HTTP response.
 *
 * The header section is everything before the first blank line (CRLF CRLF).
 * When the response contains several header blocks (e.g. one per followed
 * redirect), only the first block is returned. If no blank line is present,
 * the whole content is returned unchanged.
 *
 * @param string|null $content Raw response text including headers.
 * @return string|null The header text, or null when $content is empty.
 */
function getHeader($content){
    if (!$content) {
        return null;
    }

    // Limit the split to two pieces: only the first separator matters, and
    // taking element 0 avoids an undefined-offset notice when there is no body.
    $parts = explode("\r\n\r\n", $content, 2);

    return $parts[0];
}
/**
 * Collect the values of all Set-Cookie headers from a raw HTTP response.
 *
 * Each returned entry is the text following "Set-Cookie:" up to the end of
 * that header line, exactly as captured (no trimming is applied).
 *
 * @param string $content Raw response text including headers.
 * @return array|null List of Set-Cookie values, or null when none are found.
 */
function getCookie($content){
    $headerText = getHeader($content);

    preg_match_all("/set\-cookie:([^\n\r]*)/i", $headerText, $found, PREG_SET_ORDER);

    // Pull capture group 1 out of every match set.
    $values = array_column($found, 1);

    return $values === array() ? null : $values;
}
// Fetch the logged-in user's followed-collections page.
$url = "http://www.zhihu.com/collections";
// Placeholder: replace with the cookie string of a logged-in session.
$cookie = 'cookie字符串啦';
$header = array(
    'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36 LBBROWSER',
    'Accept-Language: zh-CN,zh;q=0.8'
);

$ret = myCurl($url, "get", $header, $cookie, null, false);
if (!is_string($ret)) {
    // Request failed; continue with an empty page so valid JSON is still emitted.
    $ret = '';
}

// Capture groups: 1 = collection path, 2 = collection title, 4 = number in the
// "zg-num" badge (optional), 5 = creator link, 6 = creator name, 7 = trailing
// span text (shown after the bullet in SMALLTITLE).
preg_match_all("/<h2\sclass=\"zm-item-title\">\n\n<a.+href=\"(.*)\"\s>(.*)<\/a>\n*(<span\sclass=\"zg-num\">(.*)<\/span>)*\n*<\/h2>\n.*\n.*<a.*href=\"(.*)\"\starget=\"_blank\".*>(.*)<\/a>.*\n.*>(.*)<\/span>/i", $ret, $array);

// The second "/people..." link on the page is used as the current user's profile path.
preg_match_all("/<a\shref=\"(\/people.*)\"/i", $ret, $mainPage);
$mainPage = isset($mainPage[1][1]) ? $mainPage[1][1] : null;

// Always emit every key, even when nothing matched, so consumers get a stable shape.
$data = array(
    "URL" => array(),
    "TITLE" => array(),
    "NUM" => array(),
    "SMALLTITLE" => array(),
    "CREATOR_URL" => array(),
    "ICON" => null,
);

$length = isset($array[1]) ? count($array[1]) : 0;
for ($i = 0; $i < $length; $i++) {
    $data["URL"][$i] = "http://www.zhihu.com".$array[1][$i];
    $data["TITLE"][$i] = $array[2][$i];
    // An absent badge yields an empty capture, which casts to 0.
    $data["NUM"][$i] = (int)$array[4][$i];
    $data["SMALLTITLE"][$i] = "由 ".$array[6][$i]." 创建 • ".$array[7][$i];
    $data["CREATOR_URL"][$i] = $array[5][$i];
}

// Fetch the profile page to obtain the user's avatar URL; skipped when the
// profile link could not be found (e.g. stale cookie or changed markup).
if ($mainPage !== null) {
    $url = 'http://www.zhihu.com'.$mainPage;
    $ret = myCurl($url, 'get', $header, $cookie, null, false);
    if (is_string($ret)) {
        preg_match_all("/src=\"(.*)\"\sclass=\"zm-profile-header-img/", $ret, $icon);
        if (isset($icon[1][0])) {
            $data["ICON"] = $icon[1][0];
        }
    }
}

$data = json_encode($data);
echo $data;
?>
这个就是PHP爬虫代码咯,很小的一个爬虫。虽然小,但是做出来的东西有意思啊。
cookie是登录后获取到的cookie,因为做着玩玩所以就没有做模拟登陆。
接下来准备写个Service服务定时刷新爬虫,收藏夹有动作了立即以通知的方式提醒我。