php蜘蛛抓取器

最新推荐文章于 2024-08-29 14:28:28 发布

雨相也

最新推荐文章于 2024-08-29 14:28:28 发布

阅读量803

点赞数

文章标签： php include class div list file

http://www.oschina.net/code/snippet_199414_6892
 
 利用curl，正则表达式做的一个php蜘蛛抓取器 
   qiulingfeng1987 发布于 2011年11月01日 21时, 
   3评/242阅 ( 
   3人收藏, 
     收藏 ) 
  
 顶 
 0 
 踩 

   凤网fcms内容管理系统 
  
 get.php 抓取框架，对网页内容的分析处理并进行相关替换 
  
 std.php 通用正则 
  
 news_67_com.php 对http://news.67.com 的抓取分析器 
  
 先抓列表，再抓内容页。 
  
 还欠缺监控，统计，错误处理功能。个人觉得还是比较好玩。 
 
  标签： <无> 
 
代码片段(4)[文件] news_67_com.php ~ 4KB    下载(5) 
    <?php
    /**
     * Scrape rules for the "中国娱乐网" site (news.67.com article pages and the
     * www.67.com listing pages that link to them).
     *
     * This file only declares data. It defines five globals:
     *   $site          - rules for a single article page
     *   $map           - output field name => rule key inside $site
     *   $site_list     - rules for a listing page
     *   $list_map      - output field name => rule key inside $site_list
     *   $site_list_sub - empty array
     *
     * NOTE(review): the consumer (get.php) is not visible here, so the meaning
     * of the key suffixes is inferred from their shape only:
     *   *_arp / *_brp  - a pair array(patterns, replacements) of equal length,
     *                    shaped like the first two arguments of preg_replace();
     *                    presumably "after"/"before" replace passes - confirm.
     *   *_ap           - array(search, replace) shaped like str_replace()
     *                    arguments - confirm.
     */
    include_once dirname(__FILE__) . '/std.php';

    $site = array(
        'aname'    => '中国娱乐网',
        'domain'   => 'news.67.com',
        'dirname'  => '目录名称，用于匹配基于目录不同的正文',
        'gettype'  => 'default',
        // Captures everything between the "文章 begin" and "文章 end" HTML
        // comment markers (original note: "fetch the main file").
        'creg'     => '/(?si)<!--文章 begin-->(.*?)\<\!--文章 end-->/',
        'code'     => 'utf-8',
        'sub'      => '获取子目录正则',
        'content'  => 'tag1',
        'img_upload' => array('tag1' => ''),
        // Next page: captures the href of the "下一页>>" link.
        'reg_next' => '/(?is)<a target=\'_self\' href=\'([^\']*?)\'>下一页\&gt;\&gt;<\/a>/',
        // Captures the content attribute of <meta name="keywords">.
        'key0'     => '/(?is)<meta name="keywords" content="([^"]*?)".*?\/>/',
        'key0_ap'  => array(array(',', '|'), ' '),
        // Captures the text of an <h1> that contains no nested tags.
        // The class used to be [^<^>], which also rejected a literal "^" and
        // therefore failed on any title containing that character.
        'tag0'     => '/(?is)<h1>([^<>]*?)<\/h1>/',
        'tag0_arp' => array(
            array(
                '/(?is)\(组图\)/',
                '/(?is)\(图\)/',
                '/(?is)\(图\.\./',
                '/(?is)\(组图\.\./',
                '/(?is)\./',
                '/(?is)(《|》)/',
            ),
            array(
                '', '', '', '', '', '',
            )
        ),
        // Captures the article body: from the opening divContent <div> up to
        // the first <img> carrying class/style/src/alt/border in that order.
        'tag1'     => '/(?is)<div class="article" id="divContent">(.*?)<img class="[^"]*?" style="[^"]*?" src="[^"]*?" alt="[^"]*?" border="\d*" \/>/',
        'tag1_brp' => array(
            // Patterns; entry N here pairs with entry N of the array below.
            array(
                '/(?is)（.*?）/',
                '/(?is)\(.*?\)/',
                '/(?is)\s*<p align="center">.*?<img.*?src="([^"]*?)".*?>(.*?)<\/p>\s*/',
                '/(?is)\s*<p>\s*/',
                '/(?is)\s*<p align="center">\s*/',
                '/(?is)　/',
                '/(?is)<br \/>/',
                '/(?is)\s*<p align="left">\s*/',
                '/(?is)\s*<p class="f_center" align="center">\s*/',
                '/(?is)\s*<center>\s*/',
                '/(?is)\s*<\/center>\s*/',
                '/(?is)\s*<p class="f_center">\s*/',
            ),
            // Replacements, in the same order as the patterns above.
            array(
                '',                                                     // fullwidth-parenthesised text
                '',                                                     // ASCII-parenthesised text
                '<p style="text-align: center;"><img src="$1" /></p>',  // centred image paragraph
                '<p style="text-indent: 24px;">',                       // <p>
                '<p style="text-align: center;">',                      // <p align="center">
                '',                                                     // ideographic space
                '',                                                     // <br />
                '<p style="text-indent: 24px;">',                       // <p align="left">
                '<p style="text-align: center;">',                      // <p class="f_center" align="center">
                '<p style="text-align: center;">',                      // <center>
                '</p>',                                                 // </center>
                '<p style="text-indent: 24px;">',                       // <p class="f_center">
            ),
        ),
        'tag1_arp' => array(
            array(
                '/(?is)<p style="text-align: center;">&nbsp;<\/p>/',
                '/(?is)<strong><\/strong>/'
            ),
            array(
                '', ''
            ),
        ),
        'strip'    => array('tag1' => ''),
        // Captures the text following "导读：" inside the "daodu" <div>.
        'tag2'     => '/(?is)<div class="daodu">导读：\s*(.*?)\s*<\/div>/',
        'tag2_arp' => array(
            array(
                '/(?is)　/'
            ),
            array(
                ''
            ),
        ),
        // Captures the literal site name wherever it appears in the page.
        'tag3'     => '/(?is)(中国娱乐网)/',
        // Captures a "Y-m-d H:i:s"-shaped timestamp following "日期：".
        'tag4'     => '/(?is)<div class="artInfo"><span>日期：(\d+-\d+-\d+ \d+:\d+:\d+).*?<\/div>/',
    );

    // Output field name => key of the rule in $site.
    $map = array(
        'tag'      => 'key0',
        'title'    => 'tag0',
        'content'  => 'tag1',
        'summary'  => 'tag2',
        'source'   => 'tag3',
        'pub_date' => 'tag4',
    );

    $site_list = array(
        'aname'    => '中国娱乐网',
        'domain'   => 'www.67.com',
        'gettype'  => 'default',
        'creg'     => '/(?si)<div class="gallery_list">(.*?)<div class="nt_cl">/',
        'code'     => 'gbk',
        'reg_next' => '/(?si)<li class="next"><a href="([^"]+?)" target="_self">下一页<\/a><\/li>/',
        // Link: captures the article URL of each list entry.
        'tag0'     => '/(?is)<div style="height: 30px;">.*?<a target="_blank"\s*href=\'(\w+:\/\/news\.67\.com\/\w+\/\d+\/\d+\/\d+\/\d+\.\w+)\s*\' style="font-size: 14px;">[^<>]*?<\/a>.*?<\/div>/',
        // Title: same anchor as tag0, but captures the link text instead.
        'tag1'     => '/(?is)<div style="height: 30px;">.*?<a target="_blank"\s*href=\'\w+:\/\/news\.67\.com\/\w+\/\d+\/\d+\/\d+\/\d+\.\w+\s*\' style="font-size: 14px;">([^<>]*?)<\/a>.*?<\/div>/',
        'tag1_arp' => array(
            array(
                '/(?is)\(组图\)/',
                '/(?is)\(图\)/',
                '/(?is)\(图\.\./',
                '/(?is)\(组图\.\./',
                '/(?is)\./',
                '/(?is)(《|》)/',
            ),
            array(
                '', '', '', '', '', '',
            )
        ),
    );

    // Output field name => key of the rule in $site_list.
    $list_map = array(
        'url'      => 'tag0',
        'title'    => 'tag1',
    );

    $site_list_sub = array();
 
 [文件] get.php ~ 22KB    下载(4) 
[文件] std.php ~ 172B    下载(4) 
    <?php
    /**
     * Shared regular-expression fragments.
     *
     *   $std['url'] - an undelimited character-class fragment (no delimiters,
     *                 no anchors) meant to be embedded in a larger pattern.
     *   $std['img'] - a complete pattern; group 1 captures the src value of an
     *                 <img> tag, with or without surrounding quotes.
     *
     * The URL character class now also accepts ? = ~ + , and @. Without them a
     * URL with a query string could not be matched as a whole, and the img
     * pattern silently truncated the captured src at the "?". Every string the
     * old class matched is still matched, with the same capture.
     */

    // When this file is included from inside a function, `global` binds $std
    // to the global scope instead of the including function's local scope.
    global $std;
    $std = array(
        'url' => '[0-9a-zA-Z\.\:\-\/%_#;&?=~+,@]+',
        'img' => '/(?is)<img.*?src=(?:[\'"]?)([0-9a-zA-Z\.\:\-\/%_#;&?=~+,@]+)(?:[\'"]?).*?>/',
    );
 
[文件] test.php ~ 2KB    下载(5) 
 <?php
 /**
  * test.php
  *
  * Manual smoke test for the page parser: loads get.php from the directory in
  * $GLOBALS['g_dir_core'], loads the rule file tuku_ent_china_com.php from
  * DIR_HOST_TAG, fetches one remote article page, runs FcHtmlParse::parse()
  * on it and prints the result.
  *
  * NOTE(review): $site and $site_list are assumed to be defined by the
  * included rule file; neither is defined in this script.
  *
  * @author     xzfred <xzfred@gmail.com>
  * @copyright  2009 fengone.com
  * @created    2010-12-07 .
  * @version    $Id: php.php 3 2008-10-10 07:49:21Z fred $
  * SVNPath     $HeadURL: http://192.168.0.16/svn/vim/skeletons/php.php $
  */
 /*
 include_once "std.php";
 include_once "lady_163_com.php";
  */
 include_once $GLOBALS['g_dir_core'] . "get.php";

 //================================================================================
 include_once DIR_HOST_TAG . '/tuku_ent_china_com.php';
 $obj = new FcHtmlParse($site);
 $page_url = "http://tuku.ent.china.com/fun/html/2011-08-23/181703.xml";
 $page_html = file_get_contents($page_url);
 if ($page_html === false) {
     // file_get_contents() returns false on failure; stop here rather than
     // handing a boolean to the parser.
     exit("Failed to fetch {$page_url}\n");
 }
 $c = $obj->parse($page_html);
 echo "\n\n\n ===================\n";
 echo $c['field']['tag1'][0];
 echo "\n\n\n ===================\n";
 var_dump($c);

 // The script stops here, so the list test below does not run until this
 // exit() is removed.
 exit();
 // List test
 $obj = new FcHtmlParse($site_list);
 $page_url = "http://tuku.ent.china.com/fun/html/3569_1.html";
 $page_html = file_get_contents($page_url);
 if ($page_html === false) {
     exit("Failed to fetch {$page_url}\n");
 }
 $c = $obj->parse($page_html);
 var_dump($c);
 exit();
 /*

 $obj = new FcHtmlGet($site);
 $c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');
 var_dump($c);

 $obj = new FcHtmlGet($site);
 $c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');
 var_dump($c);

 $obj = new FcHtmlParse($site);
 $img_obj = new FcHtmlImgUpload($site);

 $data = file_get_contents("e:/b.html");
 $c = $obj->parse($data);
 $ic = $img_obj->upload($c['tag']['tag1'][0]);
 var_dump($ic);

 $data = file_get_contents("e:/a.html");
 $c = $obj->parse($data);
 $ic = $img_obj->upload($c['tag']['tag1'][0]);
 var_dump($ic);
  */

 //var_dump($c['tag']['tag1']);

雨相也

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
php蜘蛛抓取器

http://www.oschina.net/code/snippet_199414_6892利用curl，正则表达式做的一个php蜘蛛抓取器qiulingfeng1987 发布于 2011年11月01日 21时,3评/242阅 (3人收藏, 收藏 ) 顶0踩凤网fcms内容管理系统get.php 抓取框架，对网页内容的分析处理并进行相关替换
复制链接

扫一扫