PHP 模拟抓取页面的一些函数整理

curl 模拟提交post数据

/**
 * Submit form fields to a URL with a cURL POST request and return the response.
 *
 * Example payload:
 *   $postData = array(
 *       'key' => 'value',
 *   );
 *
 * Only the connection phase is bounded (5 seconds); no overall transfer
 * timeout is configured.
 *
 * @param string $url      Address to submit to.
 * @param array  $postData Flat map of field name => scalar value.
 * @return string|false Response body (headers excluded), or false when $url
 *                      is empty, the handle cannot be created, or the
 *                      transfer fails.
 */
function curlPost($url,$postData=array()) {
	// Nothing can be requested without a target address.
	if(empty($url)) return false;

	// Build the "name=value&name=value" body. Both the name and the value are
	// URL-encoded so that a key containing "&", "=" or spaces cannot corrupt
	// the field boundaries.
	$pairs = array();
	foreach ($postData as $k=>$v){
		$pairs[] = urlencode($k)."=".urlencode($v);
	}
	$body = implode("&", $pairs);

	$ch = curl_init();
	if ($ch === false) return false;

	$timeout = 5;
	// Return the response from curl_exec() instead of printing it.
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);

	curl_setopt($ch, CURLOPT_POST, 1);
	// Keep response headers out of the returned content.
	curl_setopt($ch, CURLOPT_HEADER, 0);
	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_POSTFIELDS, $body);

	return curl_exec($ch);
}

 

CURL 抓取页面

/**
 * Fetch the content found at a URL using cURL.
 *
 * Only the connection phase is bounded (5 seconds); no overall transfer
 * timeout is configured.
 *
 * @param string $url Address of the page to fetch.
 * @return string|false Response body, or false when $url is empty, the
 *                      handle cannot be created, or the transfer fails.
 */
function getcontents($url) {
	// An empty URL can never succeed, so fail before creating a handle.
	if (empty($url)) return false;

	$ch = curl_init();
	if ($ch === false) return false;

	$timeout = 5;
	curl_setopt($ch, CURLOPT_URL, $url);
	// Return the response from curl_exec() instead of printing it.
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);

	return curl_exec($ch);
}

 

正则抓取 Table 内容

 

/**
 * Convert HTML table markup into a two-dimensional array of cell texts.
 *
 * Every tag is stripped and all spaces are removed, so each cell ends up as
 * plain text without spaces. Only rows closed by </tr> and cells closed by
 * </td> are collected.
 *
 * @param string $table Fetched page content containing the table markup.
 * @return array List of rows, each a list of cell strings; an empty array
 *               when no closed row is present.
 */
function get_td_array($table) {
	// Drop the opening table/row/cell tags together with their attributes.
	$table = preg_replace("'<table[^>]*?>'si", "", $table);
	$table = preg_replace("'<tr[^>]*?>'si", "", $table);
	$table = preg_replace("'<td[^>]*?>'si", "", $table);
	// Turn closing row/cell tags into placeholder markers. Matched without
	// regard to case, like the opening tags above, so upper-case markup such
	// as </TR> is not swallowed by the generic tag removal below.
	$table = str_ireplace("</tr>", "{tr}", $table);
	$table = str_ireplace("</td>", "{td}", $table);

	// Strip every remaining tag.
	$table = preg_replace("'<[\/\!]*?[^<>]*?>'si", "", $table);

	// Remove line breaks that are followed by further whitespace.
	$table = preg_replace("'([\r\n])[\s]+'", "", $table);
	// Remove spaces, including non-breaking spaces in entity and UTF-8 form.
	$table = str_replace(array(" ", "&nbsp;", "\xC2\xA0"), "", $table);

	// Start from an empty list so input without rows yields array(), not an
	// undefined variable.
	$td_array = array();

	$rows = explode('{tr}', $table);
	// Whatever follows the last row marker is not a row.
	array_pop($rows);
	foreach ($rows as $tr) {
		$td = explode('{td}', $tr);
		// Likewise, text after the last cell marker is not a cell.
		array_pop($td);
		$td_array[] = $td;
	}
	return $td_array;
}

 

转载于:https://www.cnblogs.com/threemore/p/3979693.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值