<?php
// Bootstrap: define the phpBB entry-point flag, lift runtime limits and pull
// in the phpBB core plus the helper libraries.
// NOTE(review): IN_PHPBB is presumably checked by the included phpBB files - confirm.
define('IN_PHPBB', true);
set_time_limit(0); // Remove the script execution time limit
// Keep running even if the client disconnects.
ignore_user_abort(true);
//die('ok');
// Use the PHPBB_ROOT_PATH constant when it is defined, otherwise the current directory.
$phpbb_root_path = (defined('PHPBB_ROOT_PATH')) ? PHPBB_ROOT_PATH : './';
// Extension of this very file (the text after the last dot), reused for every include below.
$phpEx = substr(strrchr(__FILE__, '.'), 1);
include($phpbb_root_path . 'common.' . $phpEx);
include($phpbb_root_path . 'includes/functions_user.' . $phpEx);
include($phpbb_root_path . 'includes/functions_module.' . $phpEx);
include($phpbb_root_path . 'includes/functions_display.' . $phpEx);
include($phpbb_root_path . 'includes/functions_privmsgs.' . $phpEx);
// Scraper configuration: one entry per source, keyed by a short source name.
/**
* title_url        URL of the listing page that is scraped for topic links
* web_url          base URL of the scraped site, prepended to every captured href
* file_name        suffix appended to every topic URL
* pattern_title    regex applied to the listing page (group 1 = href, group 2 = title)
* pattern_content  regex applied to each topic page to extract the post body
* charset          character encoding of the scraped site
* forum_id         ID of the forum the scraped posts are added to
* post_approved    whether the post is already approved: 0 = no, 1 = yes
*
* The three dianping.com entries use identical patterns and differ only in
* title_url and forum_id.
**/
$data = array(
'kds' =>array(
'title_url' =>'http://club.pchome.net/forum_1_15.html',
'web_url' =>'http://club.pchome.net',
'file_name' =>'',
'pattern_title' =>'/\<li class=\"i3\"\>[\s]*\<a[\s\S]*?href\=\"(.*?)\"[\s\S]*?\>(.*?)\<\/a\>[\s\S]*?\<\/li\>/',
'pattern_content' =>'/\<div id=\"__Message_\d*\"\>(.*?)\<\/div>/',
'charset' =>'gbk',
'forum_id' =>26,
'post_approved' =>1
),
'cttj' =>array(
'title_url' =>'http://www.dianping.com/group/meishi123/',
'web_url' =>'http://www.dianping.com',
'file_name' =>'',
'pattern_title' =>'/\<a class=\"B con\".*?href=\"(.*?)\".*?\>(.*?)\<\/a\>/',
'pattern_content' =>'/\<div id=\"mainNoteInfo\" class=\"noteInfo\">(.*?)\<\/div\>/',
'charset' =>'UTF-8',
'forum_id' =>22,
'post_approved' =>1
),
'crwq' =>array(
'title_url' =>'http://www.dianping.com/group/kitchen',
'web_url' =>'http://www.dianping.com',
'file_name' =>'',
'pattern_title' =>'/\<a class=\"B con\".*?href=\"(.*?)\".*?\>(.*?)\<\/a\>/',
'pattern_content' =>'/\<div id=\"mainNoteInfo\" class=\"noteInfo\">(.*?)\<\/div\>/',
'charset' =>'UTF-8',
'forum_id' =>160,
'post_approved' =>1
),
'pcwh' =>array(
'title_url' =>'http://www.dianping.com/group/sh5757',
'web_url' =>'http://www.dianping.com',
'file_name' =>'',
'pattern_title' =>'/\<a class=\"B con\".*?href=\"(.*?)\".*?\>(.*?)\<\/a\>/',
'pattern_content' =>'/\<div id=\"mainNoteInfo\" class=\"noteInfo\">(.*?)\<\/div\>/',
'charset' =>'UTF-8',
'forum_id' =>21,
'post_approved' =>1
)
);
// Run the scraper once for every configured source, in declaration order.
foreach ($data as $source) {
    caiji(
        $source['title_url'],
        $source['web_url'],
        $source['file_name'],
        $source['pattern_title'],
        $source['pattern_content'],
        $source['charset'],
        $source['forum_id'],
        $source['post_approved']
    );
}
// The configuration is no longer needed once every source has been processed.
unset($data);
echo '<br/>meishi';
/**
 * Scrape one topic listing page and import every not-yet-known topic into phpBB.
 *
 * Downloads $title_url, extracts (href, title) pairs with $pattern_title,
 * fetches all linked pages in parallel through curl_multi, extracts each post
 * body with $pattern_content and writes one row to TOPICS_TABLE plus one row
 * to POSTS_TABLE for every page whose title is not already stored.
 *
 * Pages whose body cannot be extracted or whose text cannot be converted to
 * UTF-8 are skipped instead of being stored as empty posts.
 *
 * NOTE(review): rows are written directly; topic/forum counters and the
 * topic's first/last post columns are not touched here - confirm whether a
 * resync is run elsewhere.
 *
 * @param string $title_url       URL of the listing page.
 * @param string $web_url         Prefix prepended to every captured href.
 * @param string $file_name       Suffix appended to every captured href.
 * @param string $pattern_title   PCRE for the listing; group 1 = href, group 2 = title.
 * @param string $pattern_content PCRE matching the post body on a topic page.
 * @param string $charset         Encoding of the scraped site; text is converted to UTF-8.
 * @param int    $forum_id        Forum ID written to the topic and post rows.
 * @param int    $post_approved   Value written to the post_approved column.
 * @return void
 */
function caiji($title_url,$web_url,$file_name,$pattern_title,$pattern_content,$charset,$forum_id,$post_approved=0)
{
    global $db;

    // Fixed author data written into every imported row.
    $topic_poster = 2;
    $topic_first_poster_name = 'abc';
    $poster_id = 53;
    // Number of leading listing entries that are never imported.
    // NOTE(review): presumably pinned/announcement topics - confirm.
    $skip_first = 5;

    // Fetch the listing page; without it there is nothing to import.
    $listing = file_get_contents($title_url);
    if ($listing === false) {
        return;
    }
    // Collapse the page to one line so the patterns can span line breaks
    // without needing the "s" modifier.
    $listing = str_replace(array("\r", "\n"), '', $listing);

    // Bail out on a regex error (false) as well as on zero matches.
    if (!preg_match_all($pattern_title, $listing, $title)) {
        return;
    }
    $count = count($title[1]);
    if ($count <= $skip_first) {
        return;
    }

    // Queue one transfer per topic. $title_index remembers which listing
    // entry each handle belongs to, so that titles and page contents stay
    // paired (handle 0 belongs to listing entry $skip_first, not to entry 0).
    $mh = curl_multi_init();
    $handles = array();
    $title_index = array();
    for ($i = $skip_first; $i < $count; $i++) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $web_url . $title[1][$i] . $file_name);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_multi_add_handle($mh, $ch);
        $handles[] = $ch;
        $title_index[] = $i;
    }

    // Drive all transfers to completion, polling every 0.25 s.
    $running = null;
    do {
        curl_multi_exec($mh, $running);
        if ($running > 0) {
            usleep(250000);
        }
    } while ($running > 0);

    foreach ($handles as $n => $ch) {
        // Take the body and release the handle right away, so that every
        // "continue" below leaves no handle attached or open.
        $page = (string) curl_multi_getcontent($ch);
        curl_multi_remove_handle($mh, $ch);
        curl_close($ch);

        $page = str_replace(array("\r", "\n", "\t"), '', $page);
        if (!preg_match($pattern_content, $page, $content)) {
            // Transfer failed or the page layout did not match.
            continue;
        }

        // Keep only a small set of markup. strip_tags() also removes the
        // wrapping <div> that is part of the full match.
        $post_text = strip_tags($content[0], '<a><img><br><p>');

        // Convert to UTF-8 BEFORE escaping: escaping the raw multi-byte
        // source (e.g. GBK, where 0x5C can be a trail byte) corrupts the
        // text and can break out of the SQL string literal.
        $subject = iconv($charset, 'UTF-8', $title[2][$title_index[$n]]);
        $post_text = iconv($charset, 'UTF-8', $post_text);
        if ($subject === false || $post_text === false) {
            continue;
        }

        // Skip topics that were already imported (matched by exact title).
        $sql_check = 'SELECT topic_title FROM ' . TOPICS_TABLE . "
            WHERE topic_title = '" . $db->sql_escape($subject) . "'";
        $result = $db->sql_query($sql_check);
        $row = $db->sql_fetchrow($result);
        $db->sql_freeresult($result);
        if ($row && $row['topic_title'] != '') {
            continue;
        }

        $time = time();

        $topic_row = array(
            'topic_poster'              => $topic_poster,
            'topic_time'                => $time,
            'topic_last_view_time'      => $time,
            'forum_id'                  => (int) $forum_id,
            'icon_id'                   => 0,
            'topic_approved'            => 1,
            'topic_title'               => $subject,
            'topic_first_poster_name'   => $topic_first_poster_name,
            'topic_first_poster_colour' => 'AA0000',
            'topic_type'                => 0,
            'topic_time_limit'          => 0,
            'topic_attachment'          => 0,
        );
        $db->sql_query('INSERT INTO ' . TOPICS_TABLE . ' ' . $db->sql_build_array('INSERT', $topic_row));
        $topic_id = $db->sql_nextid();

        // NOTE(review): post_checksum and bbcode_uid are fixed placeholder
        // values carried over unchanged - confirm they are acceptable.
        $post_row = array(
            'forum_id'         => (int) $forum_id,
            'poster_id'        => $poster_id,
            'icon_id'          => 0,
            'poster_ip'        => '192.168.10.22',
            'post_time'        => $time,
            'post_approved'    => (int) $post_approved,
            'enable_bbcode'    => 1,
            'enable_smilies'   => 1,
            'enable_magic_url' => 1,
            'enable_sig'       => 1,
            'post_username'    => '',
            'post_subject'     => $subject,
            'post_text'        => $post_text,
            'post_checksum'    => '921d0e37730d722e4475373dcc96bb0d',
            'post_attachment'  => 0,
            'bbcode_bitfield'  => '',
            'bbcode_uid'       => '2vnw1nj7',
            'post_postcount'   => 1,
            'post_edit_locked' => 0,
            'topic_id'         => (int) $topic_id,
        );
        $db->sql_query('INSERT INTO ' . POSTS_TABLE . ' ' . $db->sql_build_array('INSERT', $post_row));

        // Progress marker: position of the imported page in the fetch queue.
        echo $n.'===';
    }

    curl_multi_close($mh);
}
echo "结束";
// Release the database connection, but only when one was actually set up.
if (isset($db) && $db) {
    $db->sql_close();
}
?>