两个月前学习php curl时做的练习,今天周末整理了一下。
程序封装了四个类,主要使用了curl来抓取微博用户的个人信息页面以及关注的用户页面,然后通过分析页面结构使用正则表达式以及php的字符串函数截取所需的信息。
Curd类:用于进行数据库操作;
Personal类:用于获取关注的人;
Info_url类:用于获取用户信息的url;
User_info类:用于获取用户信息。
需要注意的是cookie和referer有一个有效期,过了一段时间需要更换。设置cookie和referer的地方需要自己去获取替换掉。
下面上码:
Curd类:用于进行数据库操作
<?php
//$curd = new Curd();
//$data = $curd->get_id();
//var_dump($data);
/**
 * Data-access helper for the `user` table of the `weibo` MySQL database.
 *
 * Written against the legacy mysql_* extension. Every public method obtains
 * its link through mysql_connect() and terminates the script with die() when
 * the connection or a query fails.
 */
class Curd{
    /**
     * Open a link to the local MySQL server without selecting a database.
     *
     * @return resource MySQL link identifier
     */
    private function connect(){
        $con = mysql_connect('127.0.0.1:3306','root','');
        if(!$con){
            die('counld not connect: ' .mysql_error());
        }
        return $con;
    }

    /**
     * Turn a value into an escaped, double-quoted SQL string literal.
     *
     * @param mixed    $value value to embed in a statement
     * @param resource $con   link whose character set drives the escaping
     * @return string
     */
    private function quote($value,$con){
        return '"'.mysql_real_escape_string($value,$con).'"';
    }

    /**
     * Connect to MySQL and switch to the `weibo` database.
     *
     * @return resource MySQL link identifier
     */
    function con_weibo(){
        $con = $this->connect();
        mysql_query('use weibo',$con);
        return $con;
    }

    /**
     * Read the crawl flag of one row.
     *
     * @param mixed $id value matched against the `id` column
     * @return array|false row containing `is_get`, or false when no row matches
     */
    function is_get($id){
        $con = $this->con_weibo();
        // NOTE(review): this filters on `id`, whereas set_get() and find_id()
        // filter on `user_id` - confirm which column is intended.
        $query = 'select is_get from user where id='.$this->quote($id,$con);
        $result = mysql_query($query,$con);
        if(!$result){
            die(mysql_error());
        }
        return mysql_fetch_array($result);
    }

    /**
     * Mark a user as crawled (is_get = 1).
     *
     * @param mixed $id value matched against the `user_id` column
     * @return bool true when the update statement ran
     */
    function set_get($id){
        $con = $this->con_weibo();
        $query = 'update user set is_get=1 where user_id='.$this->quote($id,$con);
        $result = mysql_query($query,$con);
        if(!$result){
            die(mysql_error());
        }
        return $result;
    }

    /**
     * List the user_id of every row that is not yet crawled and has a non-empty name.
     *
     * @return array list of user_id values
     */
    function get_ids(){
        $con = $this->con_weibo();
        $query = 'select user_id from user where is_get=0 and name!="";';
        $result = mysql_query($query,$con);
        // The failure check must come before the result is iterated.
        if(!$result){
            die(mysql_error());
        }
        $arr = array();
        while($row = mysql_fetch_array($result)){
            $arr[] = $row['user_id'];
        }
        return $arr;
    }

    /**
     * Look a user up by user_id.
     *
     * @param mixed $id value matched against the `user_id` column
     * @return array|false the matching row, or false when the user is unknown
     */
    function find_id($id){
        $con = $this->con_weibo();
        $query = 'select user_id from user where user_id='.$this->quote($id,$con);
        $result = mysql_query($query,$con);
        // The failure check must come before the result is fetched.
        if(!$result){
            die(mysql_error());
        }
        return mysql_fetch_array($result);
    }

    /**
     * Insert one scraped user with is_get = 0.
     *
     * The statement has no column list, so the values are positional: an
     * empty first value, then name, address, gender, url, timestamp, user_id
     * and the crawl flag. All values are escaped because they originate from
     * scraped pages and may contain quotes.
     *
     * @param string     $name    nickname
     * @param string     $addr    location
     * @param string|int $sexual  gender text (the caller may pass 0)
     * @param string     $url     profile url the data was read from
     * @param string     $user_id Weibo user id
     */
    function save_info($name,$addr,$sexual,$url,$user_id){
        $con = $this->con_weibo();
        // NOTE(review): the trailing "a" appends am/pm to a 24-hour time;
        // kept as-is - confirm the column type accepts this format.
        $date = date("Y-m-d H:i:sa");
        $is_get = 0;
        $values = array(
            '""',
            $this->quote($name,$con),
            $this->quote($addr,$con),
            $this->quote($sexual,$con),
            $this->quote($url,$con),
            $this->quote($date,$con),
            $this->quote($user_id,$con),
            $this->quote($is_get,$con),
        );
        $insert = mysql_query('insert into user values('.implode(',',$values).')',$con);
        if(!$insert){
            die(mysql_error());
        }
        mysql_close($con);
    }

    /**
     * Create the `weibo` database.
     *
     * Despite its name this issues `create database`, not `create table`.
     * It connects without selecting a database, since `weibo` may not exist yet.
     */
    function create_table(){
        $con = $this->connect();
        $create_db = mysql_query('create database weibo',$con);
        if($create_db){
            echo'create success';
        }else{
            die('counld not query: '.mysql_error());
        }
    }
}
Personal类:用于获取关注的人
<?php
/**
 * Walks the "following" list of a user and hands every account that is not
 * yet stored to the profile scraper.
 */
class Personal{
    /**
     * Crawl the follow-list pages of one user, then mark that user as crawled.
     *
     * @param $id        user id whose follow list is crawled
     * @param $personal  Personal object whose page() is called for each list page
     * @param $user_info object used to scrape each followed user's profile
     * @param $curd      database access object
     */
    function run($id,$personal,$user_info,$curd){
        // Requests pages 1 through 5.
        // NOTE(review): the original comment said Weibo exposes six pages to
        // non-owners and that six iterations are made - confirm the real limit.
        for($i=1;$i<6;$i++){
            // Url of one page of the user's follow list.
            $curl_url = 'http://weibo.com/p/100505'.$id.'/follow?page='.$i.'#Pl_Official_HisRelation__61';
            $personal->page($curl_url,$user_info,$curd);
        }
        // Flag the user as crawled.
        $curd->set_get($id);
    }

    /**
     * Download one follow-list page and scrape every listed user that is not
     * in the database yet.
     *
     * @param $curl_url  url of the follow-list page
     * @param $user_info object used to scrape each followed user's profile
     * @param $curd      database access object
     */
    function page($curl_url,$user_info,$curd){
        $ch = curl_init();
        curl_setopt_array($ch,array(
            // Referer and cookie must be copied from a logged-in browser
            // session (F12 on any friend's page); the cookie expires.
            CURLOPT_REFERER => '在浏览器登录微博获取',
            CURLOPT_COOKIE => '在浏览器登录微博获取',
            CURLOPT_URL => $curl_url,
            CURLOPT_RETURNTRANSFER => 1,
            // The original set 120 and then overrode it with 100; 100 is the effective value.
            CURLOPT_TIMEOUT => 100,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_HEADER => false,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_ENCODING => "gzip, deflate, sdch",
            CURLOPT_FOLLOWLOCATION => false,
        ));
        $res = curl_exec($ch);
        curl_close($ch);

        // Narrow the response to the part between the two markers. A failed
        // request or a missing start marker leaves nothing to scan; a missing
        // end marker scans from the start marker to the end of the response.
        $section = '';
        if($res !== false){
            $start = strpos($res,'userListBox');
            if($start !== false){
                $end = strpos($res,'pageList',$start);
                $section = ($end === false)
                    ? substr($res,$start)
                    : substr($res,$start,$end-$start);
            }
        }

        // Collect every "id=" followed by ten digits; group 1 is the bare id.
        $reg = preg_match_all('/id=([0-9]{10})/',$section,$match);
        echo 'reg:'.$reg.'</br>';
        $found = isset($match[1]) ? $match[1] : array();

        // array_unique drops repeated ids and keeps first-seen order.
        foreach(array_unique($found) as $user_id){
            $info_url = 'http://weibo.com/p/100505'.$user_id.'/info?mod=pedit_more';
            // Only scrape users that are not stored yet.
            if(!$curd->find_id($user_id)){
                $user_info->user($info_url,$user_id,$curd);
            }
        }
    }
}
Info_url类:用于获取用户信息的url
<?php
ini_set('max_execution_time', '0');// Set the execution time limit to zero (no limit)
/**
 * Resolves the numeric id used in a user's info url by downloading a page
 * and searching it for id-like digit runs.
 */
class Info_url{
    /**
     * Download a page with the given referer.
     *
     * @param string $url     page to fetch
     * @param string $referer value sent as the Referer header
     * @return string|false response body, or false when the request fails
     */
    function get_home($url,$referer){
        $ch = curl_init();
        curl_setopt_array($ch,array(
            CURLOPT_REFERER => $referer,
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER =>1,
            CURLOPT_HEADER => FALSE,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
            // Cookie copied from a logged-in browser session (F12 on any
            // friend's page); it expires and must be refreshed.
            CURLOPT_COOKIE =>'在浏览器登录微博获取',
            CURLOPT_SSL_VERIFYPEER =>FALSE,
            CURLOPT_ENCODING => 'gzip,deflate,sdch',
            CURLOPT_TIMEOUT => 120,
            CURLOPT_FOLLOWLOCATION =>FALSE,
        ));
        $res = curl_exec($ch);
        // Release the handle; the original returned without closing it.
        curl_close($ch);
        return $res;
    }

    /**
     * Find the id embedded in the page at $url.
     *
     * The first response is searched for "100505" followed by ten digits and
     * then for any sixteen-digit run. If neither is present the page is
     * fetched up to two more times and searched for any ten-digit run.
     *
     * @param string $url page to inspect
     * @return string|null the first matching digit run, or null when nothing matched
     */
    function get_url($url){
        // Each attempt has its own referer slot so they can be configured
        // independently; all must be taken from a logged-in browser session.
        $first_referer = '在浏览器登录微博获取';
        $retry_referers = array(
            '在浏览器登录微博获取',
            '在浏览器登录微博获取',
        );

        $res = $this->get_home($url,$first_referer);
        if(preg_match('/100505[0-9]{10}/',$res,$matchs)){
            return $matchs[0];
        }
        if(preg_match('/[0-9]{16}/',$res,$matchs)){
            return $matchs[0];
        }

        foreach($retry_referers as $referer){
            $res = $this->get_home($url,$referer);
            if(preg_match('/[0-9]{10}/',$res,$matchs)){
                return $matchs[0];
            }
        }

        // Nothing matched: return null explicitly instead of reading a
        // missing array offset.
        return null;
    }
}
User_info类:用于获取用户信息
<?php
/**
 * Scrapes nickname, location and gender from a user's info page and stores
 * them through the database object.
 */
class User_info{
    /**
     * Cut one field value out of the page.
     *
     * Takes the text between the two label markers, keeps what lies between
     * the first '>' and the first '<\/span><\/li>' inside it, drops the
     * leading '>' and strips any remaining tags. Missing markers are not
     * special-cased, exactly as in the original inline code.
     *
     * @param string $page         raw response body
     * @param string $start_marker label that opens the field
     * @param string $end_marker   label of the following field
     * @return string extracted text
     */
    private function extract_field($page,$start_marker,$end_marker){
        $from = strpos($page,$start_marker);
        $to = strpos($page,$end_marker);
        $segment = substr($page,$from,$to-$from);
        $open = strpos($segment,'>');
        $segment = substr($segment,$open,strpos($segment,'<\/span><\/li>')-$open);
        $segment = substr($segment,1,strlen($segment)-1);
        return strip_tags($segment);
    }

    /**
     * Download a user's info page, extract the profile fields, echo them and
     * save them.
     *
     * Some verified accounts use a different page layout, so their fields
     * cannot be extracted.
     *
     * @param string $url     info page url
     * @param string $user_id Weibo user id
     * @param        $curd    database access object providing save_info()
     */
    function user($url,$user_id,$curd){
        $ch = curl_init();
        curl_setopt_array($ch,array(
            // Referer and cookie must be copied from a logged-in browser
            // session (F12 on any friend's info page); the cookie expires.
            CURLOPT_REFERER => '在浏览器登录微博获取',
            CURLOPT_COOKIE => '在浏览器登录微博获取',
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => 1,
            // The original set 120 and then overrode it with 100; 100 is the effective value.
            CURLOPT_TIMEOUT => 100,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_HEADER => false,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_ENCODING => "gzip, deflate, sdch",
            CURLOPT_FOLLOWLOCATION => false,
        ));
        $res = curl_exec($ch);
        curl_close($ch);
        // NOTE(review): a failed request is not special-cased; the row is
        // still saved with whatever the extraction yields - confirm this is wanted.

        // The page lists nickname, location, gender and birthday in that
        // order; each field ends where the next label starts.
        $sexual = $this->extract_field($res,'性别','生日');
        echo '<td>性别:'.$sexual.'</td>';
        // An over-long value means the extraction went wrong; store 0 instead.
        $sexual = strlen($sexual)>10?0:$sexual;

        $addr = $this->extract_field($res,'所在地','性别');
        echo '<td>地址:'.$addr.'</td>';

        $name = $this->extract_field($res,'昵称','所在地');
        echo '<td>昵称:'.$name.'</td>';
        // Same guard for the nickname, with a placeholder text.
        $name = strlen($name)>50?"to long":$name;
        echo '<br>';

        // Persist the scraped profile.
        $curd->save_info($name,$addr,$sexual,$url,$user_id);
    }
}
执行程序
<?php
/**
* Created by PhpStorm.
* User: ROOT
* Date: 2016/11/27
* Time: 22:38
*/
// Remove the execution time limit for this run.
ini_set('max_execution_time', '0');

// Load the classes this entry point uses.
include('./Curd.php');
include('./User_info.php');
include('./Personal.php');

$curd = new Curd();
$user_info = new User_info();
$personal = new Personal();

// Fetch the users that have not been crawled yet and process them from the
// last returned id back to the first.
foreach (array_reverse($curd->get_ids()) as $id_se) {
    echo 'id:'.$id_se.'<br>';
    $personal->run($id_se,$personal,$user_info,$curd);
}
第一次执行程序需要在数据库插入一条数据,才可以让程序初始化。
程序运行输出: