php 采集常用函数_PHP 采集常用函数整理

最新推荐文章于 2021-03-28 20:31:17 发布

薄荷味糖豆

最新推荐文章于 2021-03-28 20:31:17 发布

阅读量120

点赞数

文章标签： php 采集常用函数

本文链接：https://blog.csdn.net/weixin_32101377/article/details/115100790

版权

/*
 * Alternative sample, intentionally disabled: fetch a page directly and
 * convert it from GBK to UTF-8 before matching.
 *
 * $content = file_get_contents($url);
 * $content = iconv("GBK","UTF-8",$content);
 * print_r($match);
 */

// Usage example: download a page, isolate one container, then list the links inside it.
$url = "http://www.phpchina.com/";

$body = fopen_url($url);

// NOTE(review): the opening/closing container markup that originally surrounded
// (.*?) in this pattern was stripped when the listing was extracted from HTML.
// As written it only captures text between blank lines; restore the intended
// container tags before relying on the result.
preg_match_all('|

(.*?)

|is',$body,$match);

// Use the first captured container, or an empty string when nothing was captured,
// so the link extraction below never reads an undefined offset.
$container = isset($match[1][0]) ? $match[1][0] : '';

// Capture href, title and link text of every <a ... href="..." title="..."> anchor.
preg_match_all('|<a[^<>]*href="([^"]+)"\s*title="([^"]+)"[^<>]*>([^<>]+)</a>|i', $container, $matches);

print_r($matches); // alternatively: print_r(match_links($container));

/**
 * Fetch the contents of a remote file.
 *
 * The first available transport is used, in this order: file_get_contents(),
 * an fopen() stream (only when allow_url_fopen is enabled), then cURL.
 *
 * @param string $url HTTP address of the file
 * @return string|false The raw content; false when file_get_contents() or cURL
 *                      fails; an empty string when no transport is available.
 */
function fopen_url($url)
{
    if (function_exists('file_get_contents')) {
        // Errors are suppressed on purpose: failure is reported through the return value.
        $file_content = @file_get_contents($url);
    } elseif (ini_get('allow_url_fopen') && ($file = @fopen($url, 'rb'))) {
        // Start from an empty string so the first append does not hit an undefined variable.
        $file_content = '';
        $i = 0;
        // Read at most 1000 chunks of 4096 bytes.
        while (!feof($file) && $i++ < 1000) {
            // Content is appended unmodified so every transport returns the same bytes.
            $file_content .= fread($file, 4096);
        }
        fclose($file);
    } elseif (function_exists('curl_init')) {
        $curl_handle = curl_init();
        curl_setopt($curl_handle, CURLOPT_URL, $url);
        curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 2);
        curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl_handle, CURLOPT_FAILONERROR, 1);
        curl_setopt($curl_handle, CURLOPT_USERAGENT, 'Trackback Spam Check'); // user agent sent with the request
        $file_content = curl_exec($curl_handle);
        curl_close($curl_handle);
    } else {
        $file_content = '';
    }

    return $file_content;
}

/**
 * Extract links from an HTML fragment.
 *
 * Example of the returned structure:
 *
 *   Array
 *   (
 *       [link] => Array
 *           (
 *               [0] => http://www.phpchina.com/?action-viewnews-itemid-37454
 *               [1] => http://www.phpchina.com/?action-viewnews-itemid-37465
 *           )
 *       [content] => Array
 *           (
 *               [0] => 微软最顶级平台技术会议PDC10隆重
 *               [1] => 解读微软PDC10要点看云到端的战略...
 *           )
 *       [all] => Array
 *           (
 *               [0] => (complete markup of the first matched anchor)
 *               [1] => (complete markup of the second matched anchor)
 *           )
 *   )
 *
 * NOTE(review): the function header and the beginning of the pattern were lost
 * when this listing was extracted from HTML; they are reconstructed from the
 * surviving pattern tail and from the match_links() call mentioned in the
 * sample script. Confirm against the original source if available.
 *
 * @param string $document HTML to scan
 * @return array Keys 'link' (href values), 'content' (anchor text) and 'all'
 *               (whole matched anchors). Empty values are skipped, and quoted
 *               hrefs are listed before unquoted ones.
 */
function match_links($document)
{
    // Always return the three keys, even when the document contains no anchors.
    $match = array('link' => array(), 'content' => array(), 'all' => array());

    // Group 1: optional quote, group 2: quoted href, group 3: unquoted href, group 4: anchor text.
    $pattern = '~<\s*a\s.*?href\s*=\s*(["\'])?(?(1)(.*?)\1|([^\s>]+))[^>]*>?(.*?)</a>~isx';
    if (!preg_match_all($pattern, $document, $links)) {
        return $match;
    }

    // Capture group => result key, visited in the same order as the original loops.
    $targets = array(2 => 'link', 3 => 'link', 4 => 'content', 0 => 'all');
    foreach ($targets as $group => $key) {
        // foreach replaces each(), which was removed in PHP 8.
        foreach ($links[$group] as $val) {
            if (!empty($val)) {
                $match[$key][] = $val;
            }
        }
    }

    return $match;
}

// ####################### Read a file and return it as one string #######################

/**
 * Read a file (or URL) and return its whole content as a single string.
 *
 * The script terminates with an error message when the file cannot be opened.
 *
 * @param string $url Path or URL to read
 * @return string File content; an empty string for an empty file
 */
function openfile($url)
{
    // Read once; file() returns false on failure and an empty array for an empty file.
    $lines = file($url);

    if ($lines === false) {
        die("文件打开失败!");
    }

    // file() keeps the line endings, so joining with '' restores the original content.
    return implode('', $lines);
}

// ####################### Cut a substring out of a string #######################

/**
 * Return the text found after the first occurrence of $start, up to the next
 * occurrence of $end (or of $start, whichever comes first).
 *
 * @param string $start Opening marker
 * @param string $end   Closing marker
 * @param string $file  Text to search
 * @return string The extracted text, or an empty string when a marker is empty
 *                or $start does not occur in $file
 */
function cut($start,$end,$file){
    // explode() rejects an empty separator, so treat empty markers as "nothing to cut".
    if ((string) $start === '' || (string) $end === '') {
        return '';
    }

    $pieces = explode($start, $file);
    if (!isset($pieces[1])) {
        // Start marker not present.
        return '';
    }

    $inner = explode($end, $pieces[1]);

    return $inner[0];
}

// ####################### Remove junk code #######################

/**
 * Remove the span delimited by $start and $end (markers included) from $content.
 *
 * Only the delimited span is removed, together with any identical copies of it;
 * text that merely equals the span's inner content elsewhere in the document is
 * left alone. Adjacent "$start$end" pairs are removed as well. When the closing
 * marker is missing the content is returned unchanged.
 *
 * @param string $start   Opening marker
 * @param string $end     Closing marker
 * @param string $content Text to clean
 * @return string Cleaned text
 */
function del($start,$end,$content){
    $from = ((string) $start === '') ? false : strpos($content, $start);

    if ($from !== false && (string) $end !== '') {
        // Look for the closing marker only after the opening one.
        $to = strpos($content, $end, $from + strlen($start));
        if ($to !== false) {
            $segment = substr($content, $from, $to + strlen($end) - $from);
            $content = str_replace($segment, "", $content);
        }
    }

    // Drop empty marker pairs.
    $content = str_replace($start.$end, "", $content);

    return $content;
}

// ####################### Extract the domain name #######################

/**
 * Reduce a URL to its host name without a "www." prefix.
 *
 * Input that contains no http:// or https:// URL is returned as is, apart from
 * the "www." removal.
 *
 * @param string $url URL to analyse
 * @return string Host name, e.g. "phpchina.com" for "http://www.phpchina.com/x"
 */
function getname($url)
{
    // Keep only what sits between the scheme and the first following slash.
    $referer = preg_replace('/https?:\/\/([^\/]+).*/i', '\\1', $url);

    // Strip "www." only where it starts a label, so hosts such as "awww.com" survive.
    $referer = preg_replace('/\bwww\./', '', $referer);

    return $referer;
}

// ####################### Remove HTML table code #######################

/**
 * Remove <table> elements, including their content, from HTML.
 *
 * NOTE(review): the original pattern was lost when this listing was extracted
 * from HTML; it is reconstructed here following the shape of the other cls*()
 * helpers in this file (remove the element together with its content).
 * Confirm that this is the intended behavior.
 *
 * Nested tables are not balanced: matching stops at the first closing tag.
 *
 * @param string $content HTML to clean
 * @return string|null Cleaned HTML, or null when the regex engine reports an error
 */
function clstable($content)
{
    $clscontent = preg_replace('/<table\b[^>]*>.*?<\/table>/si', "", $content);

    return $clscontent;
}

// ####################### Remove HTML script code #######################

/**
 * Remove <script> elements, including their content, from HTML.
 *
 * @param string $content HTML to clean
 * @return string|null Cleaned HTML, or null when the regex engine reports an error
 */
function clsscript($content)
{
    // The opening-tag part of this pattern was stripped from the listing; restored here.
    $clscontent = preg_replace('/<script\b[^>]*>.*?<\/script>/si', "", $content);

    return $clscontent;
}

// ####################### Remove HTML div code #######################

/**
 * Remove <div> elements, including their content, from HTML.
 *
 * Nested divs are not balanced: matching stops at the first closing tag, so an
 * outer closing tag can be left behind.
 *
 * @param string $content HTML to clean
 * @return string|null Cleaned HTML, or null when the regex engine reports an error
 */
function clsdiv($content)
{
    // The opening-tag part of this pattern was stripped from the listing; restored here.
    $clscontent = preg_replace('/<div\b[^>]*>.*?<\/div>/si', "", $content);

    return $clscontent;
}

// ####################### Remove HTML iframe code #######################

/**
 * Remove <iframe> elements, including their content, from HTML.
 *
 * @param string $content HTML to clean
 * @return string|null Cleaned HTML, or null when the regex engine reports an error
 */
function clsifr($content)
{
    // The opening-tag part of this pattern was stripped from the listing; restored here.
    $clscontent = preg_replace('/<IFRAME\b[^>]*>.*?<\/IFRAME>/si', "", $content);

    return $clscontent;
}

// ####################### Remove HTML tr and td code #######################

/**
 * Remove table rows and cells from HTML.
 *
 * Complete <td>…</td> and <tr>…</tr> pairs are removed together with their
 * content; any opening or closing tr/td tags left over afterwards are removed
 * on their own.
 *
 * @param string $content HTML to clean
 * @return string|null Cleaned HTML, or null when the regex engine reports an error
 */
function clstrtd($content)
{
    // Patterns are applied one after another, in this order. Their tag names were
    // stripped from the listing and are restored here.
    $patterns = array(
        '/<td\b[^>]*>.*?<\/td>/si', // complete cells
        '/<tr\b[^>]*>.*?<\/tr>/si', // complete rows
        '/<tr\b[^>]*>/si',          // stray opening tags
        '/<td\b[^>]*>/si',
        '/<\/tr>/si',               // stray closing tags
        '/<\/td>/si',
    );

    $clscontent = preg_replace($patterns, "", $content);

    return $clscontent;
}

// ####################### Remove HTML hyperlinks #######################

/**
 * Remove <a> elements, including their link text, from HTML.
 *
 * @param string $content HTML to clean
 * @return string|null Cleaned HTML, or null when the regex engine reports an error
 */
function clsa($content)
{
    // \b keeps the pattern from starting at other tags beginning with "a" (e.g. <abbr>).
    $clscontent = preg_replace('/<a\b[^>]*>.*?<\/a>/si', "", $content);

    return $clscontent;
}

// ####################### Remove all HTML code #######################

/**
 * Convert HTML to plain text.
 *
 * Script blocks and tags are removed, whitespace following a line break is
 * collapsed, and a fixed set of named/numeric entities is decoded.
 *
 * NOTE(review): the script/tag patterns and the "<" replacement were stripped
 * when this listing was extracted from HTML; they are restored here from the
 * surviving comments and the 13-pattern / 12-replacement mismatch. Confirm
 * against the original source if available.
 *
 * NOTE(review): entities above 127 are emitted as single bytes via chr(), as in
 * the original; that is not valid UTF-8. Confirm the expected output encoding.
 *
 * @param string $content HTML to convert
 * @return string|null Plain text, or null when the regex engine reports an error
 */
function clearhtml($content)
{
    // Applied sequentially: each pattern works on the result of the previous one.
    $search = array(
        '~<script[^>]*?>.*?</script>~si', // strip javascript
        '~<[/!]*?[^<>]*?>~si',            // strip HTML tags
        '~([\r\n])\s+~',                  // collapse whitespace after a line break
        '~&(quot|#34);~i',                // decode HTML entities
        '~&(amp|#38);~i',
        '~&(lt|#60);~i',
        '~&(gt|#62);~i',
        '~&(nbsp|#160);~i',
        '~&(iexcl|#161);~i',
        '~&(cent|#162);~i',
        '~&(pound|#163);~i',
        '~&(copy|#169);~i',
    );

    $replace = array(
        "",
        "",
        "\\1",
        "\"",
        "&",
        "<",
        ">",
        " ",
        chr(161),
        chr(162),
        chr(163),
        chr(169),
    );

    $text = preg_replace($search, $replace, $content);

    // Remaining numeric entities. A callback replaces the /e modifier, which
    // evaluated "chr(\1)" as PHP code and no longer exists in PHP 7+.
    $text = preg_replace_callback(
        '~&#(\d+);~',
        function ($m) {
            // chr() works modulo 256; masking keeps that result without passing out-of-range values.
            return chr(((int) $m[1]) & 0xFF);
        },
        $text
    );

    return $text;
}

// ####################### Write a cache file #######################

/**
 * Write data to ./<cachedir>/<cachename>.php, creating the directory if needed.
 *
 * Prints an error message and terminates the script when the file cannot be
 * opened for writing.
 *
 * @param string $cachedir  Directory name, relative to the current working directory
 * @param string $cachename File name without the ".php" extension
 * @param string $cachedata Content to write
 * @return void
 */
function writetocache($cachedir,$cachename, $cachedata = '') {
    $cachedir = './'.$cachedir.'/';
    $cachefile = $cachedir.$cachename.'.php';

    if (!is_dir($cachedir)) {
        // Recursive, so nested cache directories are created in one call.
        @mkdir($cachedir, 0777, true);
    }

    $fp = @fopen($cachefile, 'wb');
    if ($fp) {
        @fwrite($fp, $cachedata);
        @fclose($fp);
        // NOTE(review): 0777 leaves the generated PHP file world-writable; kept for
        // compatibility with existing deployments, tighten if the setup allows.
        @chmod($cachefile, 0777);
    } else {
        echo 'Can not write to cache files, please check directory ./cache/ .';
        exit;
    }
}

// ####################### Collect the HTML links found in a file #######################

/**
 * Collect the first capture group of every match of $re in $ufile, apply a
 * string replacement to each hit, and return the distinct results.
 *
 * @param string       $re    Regular expression with at least one capture group
 * @param string       $ufile Text to scan
 * @param string|array $rep1  Search value(s) passed to str_replace()
 * @param string|array $rep2  Replacement value(s) passed to str_replace()
 * @return array Distinct rewritten links, reindexed from 0; empty when nothing matches
 */
function geturl($re,$ufile,$rep1,$rep2){
    preg_match_all($re, $ufile, $out, PREG_PATTERN_ORDER);

    // Start from an empty list so "no match" yields an empty array.
    $outs = array();
    if (isset($out[1])) {
        foreach ($out[1] as $hit) {
            $outs[] = str_replace($rep1, $rep2, $hit);
        }
    }

    // Merge identical links and reindex (same result as resetar()).
    return array_values(array_unique($outs));
}

// ####################### Cut a substring out of a text stream #######################

// This file declares cut() more than once; redeclaring a function is a fatal
// error in PHP, so this copy is only defined when no cut() exists yet.
if (!function_exists('cut')) {
    /**
     * Return the text found after the first occurrence of $start, up to the next
     * occurrence of $end (or of $start, whichever comes first).
     *
     * @param string $start Opening marker
     * @param string $end   Closing marker
     * @param string $file  Text to search
     * @return string The extracted text, or an empty string when a marker is
     *                empty or $start does not occur in $file
     */
    function cut($start,$end,$file){
        // explode() rejects an empty separator, so treat empty markers as "nothing to cut".
        if ((string) $start === '' || (string) $end === '') {
            return '';
        }

        $pieces = explode($start, $file);
        if (!isset($pieces[1])) {
            // Start marker not present.
            return '';
        }

        $inner = explode($end, $pieces[1]);

        return $inner[0];
    }
}

// ####################### Remove junk code #######################

// This file declares del() more than once; redeclaring a function is a fatal
// error in PHP, so this copy is only defined when no del() exists yet.
if (!function_exists('del')) {
    /**
     * Remove the span delimited by $start and $end (markers included) from $content.
     *
     * Only the delimited span is removed, together with any identical copies of
     * it; text that merely equals the span's inner content elsewhere in the
     * document is left alone. Adjacent "$start$end" pairs are removed as well.
     * When the closing marker is missing the content is returned unchanged.
     *
     * @param string $start   Opening marker
     * @param string $end     Closing marker
     * @param string $content Text to clean
     * @return string Cleaned text
     */
    function del($start,$end,$content){
        $from = ((string) $start === '') ? false : strpos($content, $start);

        if ($from !== false && (string) $end !== '') {
            // Look for the closing marker only after the opening one.
            $to = strpos($content, $end, $from + strlen($start));
            if ($to !== false) {
                $segment = substr($content, $from, $to + strlen($end) - $from);
                $content = str_replace($segment, "", $content);
            }
        }

        // Drop empty marker pairs.
        $content = str_replace($start.$end, "", $content);

        return $content;
    }
}

// ####################### Remove duplicate values from an array and reindex it #######################

/**
 * Return the distinct values of $outs, in first-seen order, with keys 0..n-1.
 *
 * @param array $outs Values that may contain duplicates
 * @return array Distinct values, reindexed from 0
 */
function resetar($outs){
    // array_unique() keeps the first occurrence of each value (with its old key);
    // array_values() then discards those keys and renumbers from 0.
    $distinct = array_unique($outs);

    return array_values($distinct);
}

薄荷味糖豆

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
php 采集常用函数_PHP 采集常用函数整理

/*$content = file_get_contents($url);$content = iconv("GBK","UTF-8",$content);print_r($match);*/$url = "http://www.phpchina.com/";$body = fopen_url($url);preg_match_all('|(.*?)|is',$body,$match);preg_...
复制链接

扫一扫