说明文档地址:http://open.iciba.com/?c=wiki&t=cc
先用php封一个curl爬取页面的方法:
/**
 * Fetch a URL with an HTTP GET request through cURL and return the response body.
 *
 * @param string $url      Address to request.
 * @param int    $httpCode Output parameter (passed by reference): receives the HTTP
 *                         status code cURL reports for the transfer. Callers that do
 *                         not need it can simply omit the argument.
 * @return mixed Response body as a string, or false when curl_exec() fails.
 */
function curl_get($url, &$httpCode = 0) {
    // Initialise the cURL session.
    $ch = curl_init();
    // Address to fetch.
    curl_setopt($ch, CURLOPT_URL, $url);
    // Return the fetched content from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Original author's note: switch this to true when deploying on Linux.
    // NOTE(review): false disables TLS certificate verification, which leaves
    // https requests open to man-in-the-middle attacks; enable it in production.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    // Give up if the connection cannot be established within 10 seconds.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    // Run the request and capture the body.
    $file_contents = curl_exec($ch);
    // Hand the status code back to the caller; this only works because the
    // parameter is a reference (by value the assignment was silently discarded).
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
    return $file_contents;
}
再封装一个可以通过音频链接读取音频并保存到本地的方法:
/**
 * Copy an audio file from a URL (or any path fopen() can read) into a directory,
 * storing it under a freshly generated unique ".mp3" name.
 *
 * @param string $type     Directory the file is written into; it is created when missing.
 * @param string $file_url Location of the audio to read.
 * @return string Path of the saved file, in the form "<type>/<uniqid>.mp3".
 * @throws InvalidArgumentException When $file_url is empty.
 * @throws RuntimeException When the directory cannot be created, the source or
 *                          target cannot be opened, or the copy fails.
 */
function save_music($type,$file_url)
{
    if (!is_string($file_url) || $file_url === '') {
        throw new InvalidArgumentException('save_music: the audio URL is empty');
    }

    // Make sure the target directory exists; the second is_dir() covers the
    // case where another process created it between the check and mkdir().
    $dir = (string) $type;
    if ($dir !== '' && !is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        throw new RuntimeException("save_music: cannot create directory {$dir}");
    }

    $file_name = $type."/".uniqid().".mp3";

    // Binary mode: audio data must not go through newline translation.
    $source = fopen($file_url, "rb");
    if ($source === false) {
        // Without this check feof(false) would spin forever (PHP 7) or raise a TypeError (PHP 8).
        throw new RuntimeException("save_music: cannot open source {$file_url}");
    }

    $target = fopen($file_name, "wb");
    if ($target === false) {
        fclose($source);
        throw new RuntimeException("save_music: cannot open target {$file_name}");
    }

    // Stream the data across instead of buffering the whole file in memory.
    $copied = stream_copy_to_stream($source, $target);
    fclose($source);
    fclose($target);

    if ($copied === false) {
        // Do not leave a truncated file behind.
        unlink($file_name);
        throw new RuntimeException("save_music: copy failed for {$file_url}");
    }

    return $file_name;
}
然后调用接口获取信息,通过正则提取音标和音频链接,把结果存入数据库,并将音频保存到本地服务器:
// Collect phonetic symbols and pronunciation audio for dictionary rows and
// write them back to the `dictionary` table.
set_time_limit(0);
$mysql = new db();
$rows = $mysql->sql_select('SELECT * FROM dictionary WHERE id > 7703');

// Captures, in order: first <ps> (stored as the "en" symbol), first <pron> (its
// audio link), second <ps> (stored as the "us" symbol), second <pron> (its audio link).
$pattern = '/[a-zA-Z]+\s.*\s.*\s.*\s\<ps\>(.*)\s\<pron\>(.*)\s\<ps\>(.*)\s\<pron\>(.*)\s/';

// Strip tags and every whitespace character from a captured fragment.
$clean = function ($fragment) {
    return (string) preg_replace("/[\r\n\s]/", "", strip_tags($fragment));
};
// Same cleanup, plus replace the ASCII apostrophe with the stress mark "ˈ".
$symbol = function ($fragment) use ($clean) {
    return (string) preg_replace("/[']/", "ˈ", $clean($fragment));
};

foreach ($rows as $row) {
    // URL-encode the word so spaces, apostrophes, etc. do not corrupt the query string.
    $url = sprintf(
        "http://dict-co.iciba.com/api/dictionary.php?w=%s&key=your_key",
        rawurlencode($row['word'])
    );

    // Fetch the API response; skip the word when nothing usable came back.
    $response = curl_get($url);
    if (!is_string($response) || $response === '') {
        continue;
    }

    // Check the match result BEFORE touching the capture groups.
    if (preg_match($pattern, $response, $matches) !== 1) {
        continue;
    }

    $en_symbol = $symbol($matches[1]);
    if ($en_symbol === '') {
        continue;
    }
    $en_symbol_mp3 = $clean($matches[2]);
    $us_symbol     = $symbol($matches[3]);
    $us_symbol_mp3 = $clean($matches[4]);

    // Without both audio links there is nothing to download for this word.
    if ($en_symbol_mp3 === '' || $us_symbol_mp3 === '') {
        continue;
    }

    // Save both audio files. If saving raises an exception, skip this word so
    // that a single bad download does not abort the whole batch.
    try {
        $url1 = save_music($row['type'], $en_symbol_mp3);
        $url2 = save_music($row['type'], $us_symbol_mp3);
    } catch (Exception $e) {
        error_log(sprintf('audio download failed for id %s: %s', $row['id'], $e->getMessage()));
        continue;
    }

    // Update the row through the project's db wrapper.
    $mysql->update('dictionary',[
        'phonetic_symbol_en' => $en_symbol,
        'phonetic_symbol_us' => $us_symbol,
        'pronunciation_en' => $url1,
        'pronunciation_us' => $url2,
    ],['id' => $row['id']]);
}
以上就是全流程