php全词查询,PHP制作百度词典查词采集器

9ca2d6b6a00bce61b7b521ce09cc8822.png

百度dict 采集样本

写的采集百度dict词典翻译后的所有结果数据,当然附带了13.5w单词库和采集简单的案例,这里我把写出的主要类dict.class.php放出来,项目地址http://github.com/widuu/baidu_dict,有需要的直接fork就可以了~么么哒,这东西用的人很少,所以有用的兄弟拿走了哈~

/**

* dict.class.php 采集百度词典翻译内容

*

* @copyright (C) 2014 widuu

* @license http://www.widuu.com

* @lastmodify 2014-2-15

*/

// Declare the response as UTF-8 encoded HTML; this runs as soon as the file is loaded.
header("content-type:text/html;charset=utf8");

class Dict{

// Word currently being looked up; assigned by content() before the parsers run.
private $word;

// Number of entries to show; getPhrase() uses it to cap the phrase list.

private static $num = 10;

// Nothing to initialise: the word is passed to content(), not to the constructor.
public function __construct(){}

/**
 * Look up a word and gather every section scraped from its Baidu dictionary page.
 *
 * @param string $word English word to look up
 *
 * @return array associative array with the keys
 *   "symbol"  => phonetic symbols
 *   "pro"     => pronunciation
 *   "example" => example sentences
 *   "explain" => concise definitions
 *   "synonym" => synonyms and antonyms
 *   "phrase"  => phrases
 */
public function content($word){
    $this->word = $word;

    // Array elements are evaluated top to bottom, so the section parsers
    // still run in the order symbol, pro, example, explain, synonym, phrase.
    return array(
        "symbol"  => $this->Pronounced(),  // phonetic symbols
        "pro"     => $this->getSay(),      // pronunciation
        "example" => $this->getExample(),  // example sentences
        "explain" => $this->getExplain(),  // concise definitions
        "synonym" => $this->getSynonym(),  // synonyms and antonyms
        "phrase"  => $this->getPhrase()    // phrases
    );
}

/**
 * Download the Baidu dictionary result page for the current word.
 *
 * The body of the last successful download is remembered, so the section
 * parsers, which each call this method for the same word, share one HTTP
 * request instead of issuing one apiece.
 *
 * On a transfer error the cURL error text is echoed (prefixed with "Errno")
 * and the value returned by curl_exec() is passed through unchanged.
 *
 * @return string|false page body, or false when the transfer failed
 */
private function getContent(){
    // Single-entry memo: only the most recent word is kept, so memory stays
    // flat when a long word list is crawled.
    static $cachedWord = null;
    static $cachedBody = null;

    if ($cachedBody !== null && $cachedWord === $this->word) {
        return $cachedBody;
    }

    $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";

    // Encode the word so spaces, '&', '#' and similar characters cannot
    // corrupt the query string.
    $url = "http://dict.baidu.com/s?wd=".urlencode($this->word);

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_HTTPGET, 1);
    curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $result = curl_exec($ch);

    // The error must be read from the handle that performed the transfer;
    // the original queried an undefined $curl variable.
    if (curl_errno($ch)) {
        echo 'Errno'.curl_error($ch);
    }

    curl_close($ch);

    // Failed transfers are not remembered, so the next call retries.
    if ($result !== false) {
        $cachedWord = $this->word;
        $cachedBody = $result;
    }

    return $result;
}

/**
 * Pull the first two captures that follow `"EN-US">` out of the result page.
 *
 * content() stores this method's result under its "symbol" key
 * (phonetic symbols).
 *
 * @return array array('en' => first capture, 'us' => second capture);
 *               per the original note these are the British and American
 *               forms, in that order
 */

private function Pronounced(){

$data = $this -> getContent();

// NOTE(review): the pattern literal ends in `\/Ui`, i.e. its last slash is
// escaped and no closing delimiter is visible. The tail of the expression
// looks lost - confirm against the upstream source before relying on it.
preg_match_all("/\"EN\-US\"\>(.*)\/Ui",$data,$pronounced);

return array(

// Read without checking that two matches exist.
'en' => $pronounced[1][0],

'us' => $pronounced[1][1]

);

}

/**
 * Collect the pronunciation links from the result page.
 *
 * Every `url="..."` attribute value in the page is captured and the first
 * two are returned. Per the original note these are the British and
 * American pronunciations, in that order (presumably audio URLs - confirm).
 *
 * @return array array('en' => string|null, 'us' => string|null); an entry
 *               is null when the page holds fewer matches than expected
 */
private function getSay(){
    // getContent() yields false on a failed transfer; treat that as an
    // empty page so the pattern simply finds nothing.
    $page = (string) $this->getContent();

    preg_match_all("/url=\"(.*)\"/Ui", $page, $found);

    $urls = isset($found[1]) ? $found[1] : array();

    // Guard each slot so a page with zero or one match no longer raises
    // undefined-offset warnings.
    return array(
        'en' => isset($urls[0]) ? $urls[0] : null,
        'us' => isset($urls[1]) ? $urls[1] : null
    );
}

/**
 * Extract the example sentences embedded in the page's `example_data`
 * JavaScript variable.
 *
 * The literal is split on "[[[" into groups and on "[[" into rows. Within a
 * row, every `["token",` fragment is captured and the tokens are joined with
 * single spaces to form one sentence. Sentences are then paired in order of
 * appearance (presumably a sentence followed by its translation - confirm).
 *
 * @return array list of arrays holding up to two sentences each; empty when
 *               the page contains no example data
 */
private function getExample(){
    // getContent() yields false on a failed transfer; treat it as an empty page.
    $page = (string) $this->getContent();

    preg_match_all("/var example_data = (.*)\]\;/Us", $page, $example);

    if (empty($example[1])) {
        return array();
    }

    // Normalise the opening brackets so the first group splits like the rest.
    $payload = "[[[".ltrim($example[1][0], "[");

    // Sentences are accumulated in an array rather than joined with "@" and
    // split again, so a sentence that itself contains "@" stays intact.
    $sentences = array();

    foreach (explode("[[[", $payload) as $group) {
        foreach (explode("[[", "[[".$group) as $row) {
            preg_match_all("/\[\"(.*)\",/Us", "[".$row, $match);

            if (!empty($match[1])) {
                // Separator first: the (array, separator) order was removed in PHP 8.0.
                $sentences[] = implode(" ", $match[1]);
            }
        }
    }

    return array_chunk($sentences, 2);
}

/**
 * Extract the concise definitions from the `id="en-simple-means"` section
 * of the result page.
 *
 * @return array array(
 *   "x" => map built from the "adj" captures (keys) to the "name" captures
 *          (values); the original note labels this "part of speech",
 *   "b" => map built from the "tag" captures (keys) to the tag-stripped
 *          "word" captures (values); the original note labels this "affiliated"
 * )
 */

private function getExplain(){

$data = $this -> getContent();

// Isolate the definitions section; only the first match is used below.
preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\

/Us",$data,$explain);

$r_data = $explain[1][0];

// NOTE(review): the loops below read named groups "adj"/"name" and
// "tag"/"word", but the two patterns as shown contain `(?P` with no group
// name, so the expressions look damaged - confirm against the upstream source.
preg_match_all("/\

\(?P.*)\\(?P.*)\\/Us", $r_data, $a_data);

preg_match_all("/\(?P[^\>]+)\:\(?P.*)\\/Us", $r_data, $b_data);

$result = array();

// Pair each "adj" capture with the "name" capture at the same index.
foreach ($a_data["adj"] as $key => $value) {

$result[$value] = $a_data["name"][$key];

}

$word_b = array();

// Pair each "tag" capture with its "word" capture, dropping any markup.
foreach ($b_data["tag"] as $key => $value) {

$word_b[$value] = strip_tags($b_data["word"][$key]);

}

$result_data = array("x" => $result,"b" => $word_b);

return $result_data;

}

/**
 * Extract the synonym / antonym table from the `id="en-syn-ant"` section of
 * the result page.
 *
 * @return array nested array indexed by chunk position, then by the "adj"
 *               capture, then by the "title" capture, holding the cleaned
 *               "value" text; the original note describes it as
 *               array(0 => synonyms, 1 => antonyms), usually multi-dimensional
 */

private function getSynonym(){

$data = $this -> getContent();

// Isolate the synonym/antonym section; only the first match is used below.
preg_match_all("/id=\"en\-syn\-ant\"\>(.*)

/Us",$data,$synonym);

$content = $synonym[1][0];

// NOTE(review): the delimiter literal is empty as shown, which explode()
// does not accept; the original separator text appears to be missing - confirm.
$data1 = explode("", $content);

$result = array();

$data2 = array();

// First pass: for every chunk collect the "adj" and "content" captures.
foreach ($data1 as $key => $value) {

// NOTE(review): "adj" and "content" are read as named groups, but no group
// names are visible in this pattern - it looks damaged; confirm.
preg_match_all("/\(?P.*)\&nbsp\;\\\

\ (?.*)\/Us", $value, $r_data);

$data2[$key]["adj"] = $r_data["adj"];

$data2[$key]["content"] = $r_data["content"];

}

// Second pass: break each non-empty "content" capture into title/value pairs.
foreach ($data2 as $key => $value) {

foreach ($value["content"] as $k => $v) {

if(!empty($v)){

// NOTE(review): "title" and "value" are read as named groups, but no group
// names are visible in this pattern either - confirm.
preg_match_all("/\

\

(?P

.*)\(?P.*)\/Us", $v, $v_data);

foreach ($v_data['title'] as $m => $d) {

// $data is reused here as a scratch variable; the page body it held is no
// longer needed at this point.
$data = strip_tags(preg_replace("<>"," ", $v_data["value"][$m]));

// Index: chunk position -> "adj" capture at the same index -> title.
$result[$key][$value["adj"][$k]][$d] = $data;

}

}

}

}

return $result;

}

/**
 * Extract phrases from the `id="en-phrase"` section of the result page,
 * limited to the first self::$num pieces.
 *
 * @return array map from a phrase key (tags and spaces removed) to either a
 *               single tag-stripped string or a nested array when the piece
 *               carries additional entries; the original note describes it
 *               as a one- or multi-dimensional key => value array
 */

private function getPhrase(){

$num = self::$num;

$data = $this -> getContent();

// Isolate the phrase section; only the first match is used below.
preg_match_all("/id=\"en\-phrase\"\>(.*)\

/Us",$data,$phrase);

// NOTE(review): the delimiter literal is empty as shown, which explode()
// does not accept; the original separator text appears to be missing - confirm.
$data = explode("",$phrase[1][0]);

// Keep at most $num pieces.
$data1 = array_slice($data,0,$num);

$result = array();

foreach ($data1 as $key => $value) {

// Split the piece on a blank line (two consecutive newlines).
$data2 = explode("

", $value);

$n = count($data2);

if($n<=3){

// Simple piece: part 0 is the key, part 1 the value.
$result[str_replace(" ","",strip_tags($data2[0]))] = strip_tags($data2[1]);

}else{

// Longer piece: drop the last part, then remove every part whose value
// equals one of the first two (array_diff compares by value), and pair up
// what remains.
$data3 = array_slice($data2,0,$n-1);

$data4 = array_slice($data2,0,2);

$res = array_diff($data3,$data4);

$data5 = array_chunk($res,2);

$key_value = trim(str_replace(" ","",strip_tags($data4[0])));

$result[$key_value] = strip_tags($data4[1]);

// The inner loop reuses the names $key and $value of the outer loop.
foreach ($data5 as $key => $value) {

foreach ($value as $k => $v) {

$value[$k] = strip_tags($v);

}

// Wrap the value stored so far together with the current stripped pair.
$array = array($result[$key_value],$value);

// The key was assigned just above, so this check holds on every pass.
if (array_key_exists($key_value, $result)){

$result[$key_value] = $array;

}

}

}

}

return $result;

}

/**
 * Convert an array into its escaped var_export() string form.
 *
 * @param array $data       value to convert
 * @param bool  $isformdata when truthy (the default, 1) the value is first
 *                          passed through new_stripslashes(); pass 0 to skip
 *                          that step
 *
 * @return string the addslashes()-escaped export, or '' when $data loosely
 *                equals the empty string
 */
private function array2string($data, $isformdata = 1) {
    // Loose comparison on purpose: it mirrors the original emptiness check.
    if ($data == '') {
        return '';
    }

    $source = $isformdata ? $this->new_stripslashes($data) : $data;
    $exported = var_export($source, TRUE);

    return addslashes($exported);
}

/**
 * Apply stripslashes() to a string, or recursively to every element of an
 * array.
 *
 * @param mixed $string string or (nested) array to process
 *
 * @return mixed the processed string, or the array with its keys unchanged
 *               and every leaf processed
 */
private function new_stripslashes($string) {
    if (is_array($string)) {
        // array_map() keeps the original keys when given a single array.
        return array_map(array($this, 'new_stripslashes'), $string);
    }

    return stripslashes($string);
}

}

// $dict = new Dict();

// $result = $dict->content("express");

以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值