Step 1:发送请求
// Download the proxy-list page with a browser-like User-Agent; the body ends up in $content.
// NOTE(review): $url and the Referer value are empty in this snippet - fill in the listing
// page address before running, otherwise the request fails and $content stays empty.
$url = '';
$header = array(
    // Kept on ONE line: a raw line break inside a header value produces a malformed request.
    'User-Agent:a9694ebf4d02ef427830292349e3172c/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Referer: ',
);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
// Bound the request so an unresponsive server cannot hang the script indefinitely.
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
$content = curl_exec($ch);
if ($content === false) {
    // curl_exec() returns false on failure; record why and fall back to an empty
    // string so the parsing step always receives a string.
    error_log('Proxy list download failed: ' . curl_error($ch));
    $content = '';
}
curl_close($ch);
Step 2:解析页面内容
/**
 * Extract proxy records from the HTML of a proxy-list table.
 *
 * Every <tr> after the first one (treated as the header row) is split into its
 * <td> cells. Cell 0 is taken as the IP, cell 1 as the port and cell 5 as the
 * type. Cell contents are stripped of inner markup and surrounding whitespace.
 * Rows without a non-empty IP and port are skipped; a missing type becomes ''.
 *
 * @param mixed $html Page markup. Anything other than a non-empty string
 *                    (e.g. false from a failed download) yields no records.
 * @return array List of array('ip' => string, 'port' => string, 'type' => string).
 */
function parse_proxy_table_rows($html)
{
    $rows = array();
    if (!is_string($html) || $html === '') {
        return $rows;
    }

    // preg_match_all() returns 0 for "no match" and false on a PCRE error.
    if (!preg_match_all('/<tr[^>]*>(.*?)<\/tr>/si', $html, $matches)) {
        return $rows;
    }

    foreach ($matches[1] as $k => $v) {
        // The first row of the table is the header, not a proxy entry.
        if ($k == 0) {
            continue;
        }
        if (!preg_match_all('/<td[^>]*>(.*?)<\/td>/si', $v, $_match)) {
            continue;
        }

        // Cells may wrap the value in tags or padding; keep only the text.
        $cells = array_map('trim', array_map('strip_tags', $_match[1]));

        // An entry is only usable with both an IP and a port.
        if (count($cells) < 2 || $cells[0] === '' || $cells[1] === '') {
            continue;
        }

        $rows[] = array(
            'ip' => $cells[0],
            'port' => $cells[1],
            'type' => isset($cells[5]) ? $cells[5] : '',
        );
    }

    return $rows;
}

// $content is the downloaded page body; it may be false or unset if the download step failed.
$data = parse_proxy_table_rows(isset($content) ? $content : '');
Step 3:验证代理ip可用性
/**
 * Check whether an HTTP proxy can successfully fetch a test URL.
 *
 * The proxy counts as usable only when the transfer completes and the
 * response status is 200.
 *
 * @param string     $ip      Proxy host or IP address.
 * @param int|string $port    Proxy port; must be within 1-65535.
 * @param string     $url     URL requested through the proxy. Defaults to ''
 *                            (as in the original snippet); an empty URL cannot
 *                            be fetched, so the result is then always false.
 * @param int        $timeout Seconds allowed for connecting and for the whole
 *                            transfer, so a dead proxy cannot stall the caller.
 * @return bool True when the request through the proxy returned HTTP 200.
 */
function check_proxy($ip, $port, $url = '', $timeout = 10)
{
    $ip = trim((string) $ip);
    $port = (int) $port;

    // Reject input that can never produce a successful request, without touching the network.
    if ($ip === '' || $port < 1 || $port > 65535 || $url === '') {
        return false;
    }

    $header = array(
        'User-Agent:a9694ebf4d02ef427830292349e3172c/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer: ',
    );

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_PROXYAUTH, CURLAUTH_BASIC);
    curl_setopt($ch, CURLOPT_PROXY, $ip);
    curl_setopt($ch, CURLOPT_PROXYPORT, $port);
    // The option expects one of the CURLPROXY_* constants, not a string.
    curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

    // Only success/failure matters here; the response body itself is not used.
    $completed = curl_exec($ch) !== false;
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    return $completed && $httpCode === 200;
}
Step 4:保存结果到本地文件
// Append each proxy that passes check_proxy() to the output file,
// one "ip:port" entry per CRLF-terminated line.
$file = 'proxy.txt';
foreach ($data as $v) {
    // Skip entries whose check failed; only working proxies are recorded.
    if (!check_proxy($v['ip'], $v['port'])) {
        continue;
    }
    file_put_contents($file, "{$v['ip']}:{$v['port']}\r\n", FILE_APPEND);
}
参考
使用PHP实现随机获取代理IP的爬虫