php抓取网页上的指定内容

最新推荐文章于 2023-05-30 16:44:11 发布

hai7425

最新推荐文章于 2023-05-30 16:44:11 发布

阅读量5.3k

点赞数 1

分类专栏： php学习研究

本文链接：https://blog.csdn.net/hai7425/article/details/51462386

版权

php学习研究专栏收录该内容

107 篇文章 2 订阅

订阅专栏

<?php

//ignore_user_abort(true);
//set_time_limit(0);

/*
 * Fetches a news listing page, cuts out the block between
 * <ul class="newslist"> and <div id="right">, and prints, for every <li>
 * entry in it, three lines separated by <br>: the date, the title and the
 * absolute link of the entry.
 *
 * Pinyin variable names kept from the original script:
 *   xinwen = news, riqi = date, lianjie = link.
 *
 * NOTE(review): the extracted values come from a remote page and are echoed
 * as-is; escape them if this output is ever rendered in a trusted HTML page.
 */

// Set to true to dump the downloaded page and the list match while debugging.
$debug = false;

// Original note: fetch all government agencies and public institutions.
$url = "http://www.tongda2000.com/company/news.php";
// Host used to resolve root-relative links ("/path").
$site_root = "http://www.tongda2000.com";
// Directory of the listing page, used to resolve page-relative links ("file.php").
$base_dir = substr($url, 0, strrpos($url, "/") + 1);

$contents = file_get_contents($url);
if ($contents === false) {
    exit("Failed to fetch " . $url);
}

// Convert the page to UTF-8 to avoid garbled Chinese text. The conversion is
// only attempted when the bytes are not already valid UTF-8 (the empty
// pattern with the "u" modifier fails on invalid UTF-8), and the raw bytes
// are kept if iconv cannot convert them.
if (preg_match('//u', $contents) !== 1) {
    $converted = iconv("gb2312", "utf-8", $contents);
    if ($converted !== false) {
        $contents = $converted;
    }
}

if ($debug) {
    echo $contents;
}

// Greedy match from the list opening tag to the last <div id="right">.
$mode = "/<ul class=\"newslist\">(.*)<div id=\"right\">/is";
if (preg_match($mode, $contents, $matches) !== 1) {
    exit("News list not found in " . $url);
}

if ($debug) {
    print_r($matches);
}

$xinwen_str = $matches[1];

// Split on every opening <li ...> tag (attributes allowed). The first chunk
// is whatever precedes the first entry, so it is dropped.
$xinwen_arr1 = preg_split('/<li\b[^>]*>/i', $xinwen_str);
if ($xinwen_arr1 === false) {
    exit("Could not split the news list from " . $url);
}
array_shift($xinwen_arr1);

foreach ($xinwen_arr1 as $item) {
    // Keep only the text up to the closing </li> and remove line breaks.
    $item_parts = preg_split('/<\/li\s*>/i', $item, 2);
    $item = str_replace(array("\n", "\r"), "", $item_parts[0]);

    // Group 1: everything before the first anchor, group 2: the anchor text.
    // Entries without an anchor are skipped.
    if (preg_match('/^(.*?)<a\b[^>]*>(.*?)<\/a>/is', $item, $anchor) !== 1) {
        continue;
    }

    // The date is the first whitespace-delimited token before the anchor.
    $riqi_arr = preg_split('/\s+/', trim(strip_tags($anchor[1])));
    $riqi = $riqi_arr[0];
    echo $riqi."<br>";

    // The title is the anchor text with any nested markup removed.
    $title = trim(strip_tags($anchor[2]));
    echo $title."<br>";

    // The link is the anchor's href value (single or double quoted).
    $lianjie = "";
    if (preg_match('/<a\b[^>]*\bhref\s*=\s*(["\'])(.*?)\1/is', $item, $href) === 1) {
        $lianjie = trim($href[2]);
    }

    // Make the link absolute unless it already carries an http(s) scheme.
    if ($lianjie !== "" && preg_match('#^https?://#i', $lianjie) !== 1) {
        if (substr($lianjie, 0, 2) === "//") {
            // Protocol-relative link: reuse the scheme of the listing page.
            $lianjie = "http:" . $lianjie;
        } elseif ($lianjie[0] === "/") {
            $lianjie = $site_root . $lianjie;
        } else {
            $lianjie = $base_dir . $lianjie;
        }
    }
    echo $lianjie."<br>";
}