php 多音字,PHP多音字拼音处理方案

很多时候我们会遇到这样的需求:在类似 suggest(搜索建议)的功能中,需要把输入的汉字转为拼音,或者根据拼音来产生热门的关键词。

热门关键词在这里我们暂时不做讨论,先来说一下拼音处理的手法。拼音处理必须依赖一个拼音库(词典),下面我会给出拼音库的下载地址,供大家下载。

这个拼音处理类存在的问题是效率过低,对性能有要求时建议改写成 PHP 扩展。在下一期中我们将讲解如何使用 PHP 扩展的方式来实现。

拼音库的下载地址:http://www.mdbg.net/chindict/chindict.php?page=cedict

下面是使用演示:

// Convert a Chinese sentence to tone-marked pinyin and print it.
// Plain ASCII quotes are required: typographic quotes are not string delimiters in PHP.
echo Pinyin::trans('带着希望去旅行,比到达终点更美好'), "\n";

//output: “dài zhe xī wàng qù lǔ xíng bǐ dào dá zhōng diǎn gèng měi hǎo”*

下面是实现的代码:

/**
 * Converts Chinese text to pinyin using a CC-CEDICT dictionary file.
 *
 * The dictionary is parsed once, sorted so that longer words are replaced
 * before shorter ones, and cached as a PHP file under ./cache/.
 *
 * Usage: Pinyin::trans('你好'); // "nǐ hǎo"
 */
class Pinyin
{
    /**
     * Path of the CC-CEDICT dictionary file (assigned in the constructor).
     *
     * @var string
     */
    protected $dictionary;

    /**
     * Conversion settings shared by every call.
     *
     * - delimiter: string placed between syllables in the result.
     * - accent:    true for tone marks ("hǎo"), false for plain letters ("hao").
     *
     * @var array
     */
    protected static $setting = array(
        'delimiter' => ' ',
        'accent'    => true,
    );

    /**
     * Lazily created shared instance used by the static API.
     *
     * @var Pinyin
     */
    protected static $instance;

    /**
     * Set the dictionary path and make sure enough memory is available.
     */
    public function __construct()
    {
        // The parsed dictionary is large, so at least 160M is requested.
        // Only ever raise the limit: never lower a higher or unlimited (-1) one.
        $limit = trim((string) ini_get('memory_limit'));
        $bytes = (int) $limit;
        switch (strtoupper(substr($limit, -1))) {
            case 'G':
                $bytes *= 1024;
                // fall through
            case 'M':
                $bytes *= 1024;
                // fall through
            case 'K':
                $bytes *= 1024;
        }
        if ($bytes >= 0 && $bytes < 160 * 1024 * 1024) {
            ini_set('memory_limit', '160M');
        }

        $this->dictionary = __DIR__ . '/cedict/cedict_ts.u8';
    }

    /**
     * Merge the given options into the shared settings.
     *
     * @param array $setting options to override ('delimiter', 'accent').
     */
    public static function set(array $setting = array())
    {
        self::$setting = array_merge(self::$setting, $setting);
    }

    /**
     * Return the shared Pinyin instance, creating it on first use.
     *
     * @return Pinyin
     */
    public static function getInstance()
    {
        if (is_null(self::$instance)) {
            self::$instance = new self;
        }

        return self::$instance;
    }

    /**
     * Convert Chinese text to pinyin.
     *
     * @param string $string  source string.
     * @param array  $setting options for this call. NOTE: they are merged into
     *                        the shared static settings and therefore remain in
     *                        effect for later calls too.
     *
     * @return string
     */
    public static function trans($string, array $setting = array())
    {
        $instance = self::getInstance();

        // merge setting
        empty($setting) || self::set($setting);

        // The dictionary is only needed when there is Chinese text to replace.
        if ($instance->containsChinese($string)) {
            foreach ($instance->loadDictionary() as $line) {
                $replaced = 0;
                $string = str_replace($line['simplified'], "{$line['pinyin_marks']} ", $string, $replaced);

                // The text can only have run out of Chinese characters if this
                // entry actually replaced something, so skip the regex otherwise.
                if ($replaced > 0 && !$instance->containsChinese($string)) {
                    break;
                }
            }
        }

        // Turn tone numbers into tone marks, or drop them entirely.
        $string = strtolower($string);
        if (self::$setting['accent']) {
            $string = $instance->pinyin_addaccents($string);
        } else {
            $string = $instance->removeTone($string);
        }

        // clean the string
        $string = $instance->removeUnwantedCharacters($string);

        // add delimiter
        $string = $instance->addDelimiter($string);

        return $instance->escape($string);
    }

    /**
     * Load the parsed dictionary, from the cache file when possible.
     *
     * @return array list of array('simplified' => ..., 'pinyin_marks' => ...)
     */
    protected function loadDictionary()
    {
        $cacheFilename = $this->getCacheFilename($this->dictionary);

        if (file_exists($cacheFilename)) {
            $cached = $this->loadFromCache($cacheFilename);

            // An empty or corrupt cache file is ignored and rebuilt below.
            if (is_array($cached) && !empty($cached)) {
                return $cached;
            }
        }

        // parse and cache
        $parsedDictionary = $this->parseDictionary($this->dictionary);
        $this->cache($cacheFilename, $parsedDictionary);

        return $parsedDictionary;
    }

    /**
     * Build the cache file name for a dictionary, creating ./cache/ if needed.
     *
     * @param string $dictionary dictionary path.
     *
     * @return string
     */
    protected function getCacheFilename($dictionary)
    {
        $directory = __DIR__ . '/cache/';

        is_dir($directory) || mkdir($directory, 0755, true);

        return $directory . md5($dictionary);
    }

    /**
     * Parse a CC-CEDICT file into a PHP array, longest words first.
     *
     * @param string $dictionary path of dictionary file.
     *
     * @return array
     *
     * @throws \RuntimeException when the dictionary file cannot be read.
     */
    protected function parseDictionary($dictionary)
    {
        $lines = file($dictionary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

        if ($lines === false) {
            // Failing loudly keeps loadDictionary() from caching an empty dictionary.
            throw new \RuntimeException("Unable to read pinyin dictionary: {$dictionary}");
        }

        // Line format: "traditional simplified [pin1 yin1] /translation/.../"
        $regex   = '#(.*?) (.*?) \[(.*?)\] \/(.*)\/#';
        $content = array();

        foreach ($lines as $entry) {
            // Lines starting with "#" are comments.
            if (strpos($entry, '#') === 0) {
                continue;
            }

            // Skip lines that do not look like a dictionary entry.
            if (!preg_match($regex, $entry, $matches) || $matches[2] === '') {
                continue;
            }

            $content[] = array(
                'simplified'   => $matches[2],
                // Holds the numbered pinyin ("ni3 hao3"); the tone marks are
                // applied later by pinyin_addaccents().
                'pinyin_marks' => $matches[3],
            );
        }

        // Sort by simplified string length, longest first, so multi-character
        // words are replaced before the single characters they contain.
        usort($content, function ($a, $b) {
            return mb_strlen($b['simplified'], 'UTF-8') - mb_strlen($a['simplified'], 'UTF-8');
        });

        return $content;
    }

    /**
     * Load the dictionary from a cache file written by cache().
     *
     * @param string $dictionary cached file name
     *
     * @return array
     */
    protected function loadFromCache($dictionary)
    {
        return include $dictionary;
    }

    /**
     * Write the parsed dictionary to a PHP file that returns the array.
     *
     * @param string $filename filename.
     * @param array  $array    parsed dictionary.
     *
     * @return void
     */
    protected function cache($filename, $array)
    {
        file_put_contents($filename, "<?php\nreturn " . var_export($array, true) . ";\n");
    }

    /*
     * NOTE(review): the bodies of containsChinese(), removeUnwantedCharacters(),
     * addDelimiter(), escape() and the head of removeTone() were missing from
     * the published listing. They are reconstructed here from the way trans()
     * uses them; review before relying on their exact behavior.
     */

    /**
     * Tell whether the string still contains Chinese (Han) characters.
     *
     * @param string $string string to check.
     *
     * @return bool
     */
    protected function containsChinese($string)
    {
        return preg_match('/\p{Han}/u', $string) === 1;
    }

    /**
     * Keep only letters, digits, pinyin vowels and spaces; collapse whitespace.
     *
     * @param string $string
     *
     * @return string
     */
    protected function removeUnwantedCharacters($string)
    {
        $allowed = ' a-zA-Z0-9üÜ'
            . 'āēīōūǖĀĒĪŌŪǕ'
            . 'áéíóúǘÁÉÍÓÚǗ'
            . 'ǎěǐǒǔǚǍĚǏǑǓǙ'
            . 'àèìòùǜÀÈÌÒÙǛ';

        $string = preg_replace("/[^{$allowed}]/u", '', $string);

        return trim(preg_replace('/\s+/', ' ', $string));
    }

    /**
     * Replace the spaces between syllables with the configured delimiter.
     *
     * @param string $string
     *
     * @return string
     */
    protected function addDelimiter($string)
    {
        return str_replace(' ', self::$setting['delimiter'], $string);
    }

    /**
     * Backslash-escape quotes and control characters in the result.
     *
     * @param string $value the string to be escaped.
     *
     * @return string
     */
    protected function escape($value)
    {
        $search  = array("\\", "\x00", "\n", "\r", "'", '"', "\x1a");
        $replace = array("\\\\", "\\0", "\\n", "\\r", "\\'", '\\"', "\\Z");

        return str_replace($search, $replace, $value);
    }

    /**
     * Strip tone information: "u:" becomes "u" and every digit is removed.
     *
     * @param string $string
     *
     * @return string
     */
    protected function removeTone($string)
    {
        $replacement = array(
            '/u:/' => 'u',
            '/\d/' => '',
        );

        return preg_replace(array_keys($replacement), $replacement, $string);
    }

    /**
     * Credits for these 2 functions go to Bouke Versteegh, who shared these
     * at http://stackoverflow.com/questions/1598856/convert-numbered-to-accentuated-pinyin
     *
     * @param string $string The pinyin string with tone numbers, i.e. "ni3 hao3"
     *
     * @return string The formatted string with tone marks, i.e. "nǐ hǎo"
     */
    protected function pinyin_addaccents($string)
    {
        // Normalise "u:" to "ü" BEFORE placing tone marks. Doing it afterwards
        // left the mark on a plain "u" ("lu:3" became "lǔ" instead of "lǚ").
        $string = str_replace(array('u:', 'U:'), array('ü', 'Ü'), $string);

        // Find words with a number behind them, and replace with callback fn.
        // The "u" modifier makes the character class match whole UTF-8 letters.
        $result = preg_replace_callback(
            '~([a-zA-ZüÜ]+\:?)(\d)~u',
            array($this, 'pinyin_addaccents_cb'),
            $string
        );

        // preg_replace_callback() returns null on error (e.g. invalid UTF-8).
        return $result === null ? $string : $result;
    }

    /**
     * Helper callback: turn one matched "syllable + tone digit" into a marked syllable.
     *
     * @param array $match array(full match, syllable, tone digit).
     *
     * @return string
     */
    protected function pinyin_addaccents_cb($match)
    {
        static $accentmap = null;

        if ($accentmap === null) {
            // Where to place the accent marks
            $stars =
                'a* e* i* o* u* ü* ' .
                'A* E* I* O* U* Ü* ' .
                'a*i a*o e*i ia* ia*o ie* io* iu* ' .
                'A*I A*O E*I IA* IA*O IE* IO* IU* ' .
                'o*u ua* ua*i ue* ui* uo* üe* ' .
                'O*U UA* UA*I UE* UI* UO* ÜE*';
            $nostars =
                'a e i o u ü ' .
                'A E I O U Ü ' .
                'ai ao ei ia iao ie io iu ' .
                'AI AO EI IA IAO IE IO IU ' .
                'ou ua uai ue ui uo üe ' .
                'OU UA UAI UE UI UO ÜE';

            // Build an array like array('a' => 'a*') and store statically
            $accentmap = array_combine(explode(' ', $nostars), explode(' ', $stars));
        }

        static $vowels = array('a*', 'e*', 'i*', 'o*', 'u*', 'ü*', 'A*', 'E*', 'I*', 'O*', 'U*', 'Ü*');

        static $pinyin = array(
            1 => array('ā', 'ē', 'ī', 'ō', 'ū', 'ǖ', 'Ā', 'Ē', 'Ī', 'Ō', 'Ū', 'Ǖ'),
            2 => array('á', 'é', 'í', 'ó', 'ú', 'ǘ', 'Á', 'É', 'Í', 'Ó', 'Ú', 'Ǘ'),
            3 => array('ǎ', 'ě', 'ǐ', 'ǒ', 'ǔ', 'ǚ', 'Ǎ', 'Ě', 'Ǐ', 'Ǒ', 'Ǔ', 'Ǚ'),
            4 => array('à', 'è', 'ì', 'ò', 'ù', 'ǜ', 'À', 'È', 'Ì', 'Ò', 'Ù', 'Ǜ'),
            5 => array('a', 'e', 'i', 'o', 'u', 'ü', 'A', 'E', 'I', 'O', 'U', 'Ü'),
        );

        list(, $word, $tone) = $match;

        // Digits that are not a tone (0, 6-9) are treated as the neutral tone
        // instead of hitting an undefined index.
        if (!isset($pinyin[$tone])) {
            $tone = 5;
        }

        // Add star to vowelcluster
        $word = strtr($word, $accentmap);

        // Replace starred letter with accented
        return str_replace($vowels, $pinyin[$tone], $word);
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值