php 分词技术,php实现的中文分词类完整实例

本文实例讲述了php实现的中文分词类。分享给大家供大家参考,具体如下:

/**
 * Dictionary-based Chinese word segmenter (reverse maximum matching).
 *
 * The text is processed byte-wise: every byte with a value >= 129 is treated
 * as the first byte of a two-byte character, everything else as single-byte
 * text. NOTE(review): this matches double-byte encodings such as GBK/GB2312;
 * the input and the dictionary presumably must share that encoding - confirm
 * before feeding UTF-8 text.
 *
 * Words of two to four double-byte characters are looked up in the loaded
 * dictionary, scanning each line from its end towards its start and always
 * preferring the longest match.
 */
class Segmentation
{
    /**
     * Character class (PCRE syntax, without delimiters) of the ASCII
     * punctuation, tab and form feed that gets split off single-byte text
     * when the 'segment_english' option is on.
     */
    const PUNCT_CLASS = '[!"#$%&\'()*+,\\-.\\/:;<=>?@\\[\\\\\\]^_`{|}~\\t\\f]';

    /** Longest dictionary word, counted in double-byte characters. */
    const MAX_WORD_CHARS = 4;

    /**
     * Behaviour switches:
     *  - 'lowercase':       lower-case single-byte text in the output.
     *  - 'segment_english': surround punctuation inside single-byte text
     *                       with spaces.
     */
    public $options = array(
        'lowercase'       => true,
        'segment_english' => false,
    );

    /** Name read from the dictionary header, 'Unknown' if it has none. */
    public $dict_name = 'Unknown';

    /** Dictionary as a set: word => 1. */
    public $dict_words = array();

    /**
     * Turns lower-casing of single-byte text on or off.
     *
     * @param mixed $value Any value; its truthiness is what is stored.
     * @return bool Always TRUE.
     */
    public function setLowercase($value)
    {
        $this->options['lowercase'] = (bool) $value;

        return true;
    }

    /**
     * Turns punctuation splitting inside single-byte text on or off.
     *
     * @param mixed $value Any value; its truthiness is what is stored.
     * @return bool Always TRUE.
     */
    public function setSegmentEnglish($value)
    {
        $this->options['segment_english'] = (bool) $value;

        return true;
    }

    /**
     * Loads a dictionary file and adds its words to the word set.
     *
     * Expected layout: a first line "DICT_WORD_W" optionally followed by a
     * tab and the dictionary name, then one word per line.
     *
     * @param string $dict_file Path of the dictionary file.
     * @return bool TRUE on success; FALSE when the file is missing, cannot be
     *              opened, is empty, or its header type is not "DICT_WORD_W".
     */
    public function load($dict_file)
    {
        if (!file_exists($dict_file)) {
            return false;
        }

        $fp = fopen($dict_file, 'r');
        if ($fp === false) {
            return false;
        }

        $header = fgets($fp, 1024);
        if ($header === false) {
            fclose($fp);

            return false;
        }

        // "TYPE" or "TYPE<TAB>NAME"; a missing name falls back to 'Unknown'
        // instead of reading an undefined list() offset.
        $fields    = explode("\t", trim($header));
        $dict_type = $fields[0];
        $dict_name = isset($fields[1]) ? $fields[1] : 'Unknown';

        // The name is recorded even when the type check below rejects the file.
        $this->dict_name = $dict_name;

        if ($dict_type !== 'DICT_WORD_W') {
            fclose($fp);

            return false;
        }

        // Reading until fgets() itself reports the end avoids the bogus
        // empty-string entry a feof()-driven loop can add after the last line.
        while (($line = fgets($fp, 32)) !== false) {
            $word = rtrim($line);
            if ($word !== '') {
                $this->dict_words[$word] = 1;
            }
        }

        fclose($fp);

        return true;
    }

    /**
     * Returns the name of the most recently read dictionary header.
     *
     * @return string
     */
    public function getDictName()
    {
        return $this->dict_name;
    }

    /**
     * Segments a (possibly multi-line) string.
     *
     * @param string $str Text to segment; lines are separated by "\n".
     * @return string|false Segmented text, or FALSE when no dictionary words
     *                      are loaded.
     */
    public function segmentString($str)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }

        return $this->_segmentLines(explode("\n", $str));
    }

    /**
     * Segments the contents of a file.
     *
     * @param string $filename Path passed to file().
     * @return string|false Segmented text, or FALSE when no dictionary words
     *                      are loaded or the file cannot be read.
     */
    public function segmentFile($filename)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }

        $lines = file($filename);
        if ($lines === false) {
            return false;
        }

        return $this->_segmentLines($lines);
    }

    /**
     * Segments every line and joins the results.
     *
     * Each output line ends with " \n"; runs of spaces are then collapsed to
     * a single space.
     *
     * @param array $lines Lines of text; trailing whitespace is stripped.
     * @return string
     */
    public function _segmentLines($lines)
    {
        $segmented = '';
        foreach ($lines as $line) {
            $segmented .= $this->_segmentLine(rtrim($line)) . " \n";
        }

        // Built with str_repeat() so the two-space needle cannot be silently
        // collapsed by an editor or a copy/paste through HTML.
        $double_space = str_repeat(' ', 2);
        while (strpos($segmented, $double_space) !== false) {
            $segmented = str_replace($double_space, ' ', $segmented);
        }

        return $segmented;
    }

    /**
     * Segments a single line.
     *
     * @param string $str One line without its line terminator.
     * @return string The pieces, each preceded by one space ('' for an empty
     *                line).
     */
    public function _segmentLine($str)
    {
        $tokens = $this->_tokenize($str);
        $pieces = array(); // collected from the end of the line backwards

        $pos = count($tokens);
        while ($pos > 0) {
            $token = $tokens[$pos - 1];

            if (is_array($token)) {
                // A run of single-byte text is emitted as one piece.
                $pieces[] = $this->_formatSingleByteRun($token[0]);
                $pos--;
                continue;
            }

            $matched = $this->_longestWordEndingAt($tokens, $pos);
            if ($matched > 0) {
                $pieces[] = implode('', array_slice($tokens, $pos - $matched, $matched));
                $pos -= $matched;
            } else {
                // No dictionary word ends here: emit the lone character.
                $pieces[] = $token;
                $pos--;
            }
        }

        if (count($pieces) === 0) {
            return '';
        }

        return ' ' . implode(' ', array_reverse($pieces));
    }

    /**
     * Splits a line into tokens: a plain string for each two-byte character,
     * and a one-element array wrapping each run of single-byte text.
     *
     * @param string $str
     * @return array
     */
    private function _tokenize($str)
    {
        $tokens = array();
        $length = strlen($str);
        if ($length === 0) {
            return $tokens;
        }

        // Padding so that a dangling lead byte at the very end still has a
        // second byte to pair with; $length deliberately excludes it.
        if (ord($str[$length - 1]) >= 129) {
            $str .= ' ';
        }

        $i = 0;
        while ($i < $length) {
            if (ord($str[$i]) >= 129) {
                $tokens[] = $str[$i] . $str[$i + 1];
                $i += 2;
                continue;
            }

            $start = $i;
            while ($i < $length && ord($str[$i]) < 129) {
                $i++;
            }
            $tokens[] = array(substr($str, $start, $i - $start));
        }

        return $tokens;
    }

    /**
     * Applies the 'segment_english' and 'lowercase' options to a run of
     * single-byte text.
     *
     * @param string $text
     * @return string
     */
    private function _formatSingleByteRun($text)
    {
        if ($this->options['segment_english']) {
            $punct = self::PUNCT_CLASS;
            // First isolate each run of punctuation, then split neighbouring
            // punctuation characters from one another.
            $text = preg_replace('/(' . $punct . '+)/', ' $1 ', $text);
            $text = preg_replace('/(' . $punct . ')(' . $punct . ')/', ' $1 $2 ', $text);
        }

        if ($this->options['lowercase']) {
            $text = strtolower($text);
        }

        return $text;
    }

    /**
     * Finds the longest dictionary word (2..MAX_WORD_CHARS characters) that
     * ends with the token just before $pos.
     *
     * @param array $tokens Token list from _tokenize().
     * @param int   $pos    One past the index of the word's last character.
     * @return int Number of tokens in the match, or 0 when there is none.
     */
    private function _longestWordEndingAt($tokens, $pos)
    {
        $limit      = min($pos, self::MAX_WORD_CHARS);
        $candidates = array(0 => '');

        for ($len = 1; $len <= $limit; $len++) {
            $piece = $tokens[$pos - $len];
            if (is_array($piece)) {
                // A word cannot span single-byte text; stopping here also
                // avoids concatenating an array into the candidate string.
                break;
            }
            $candidates[$len] = $piece . $candidates[$len - 1];
        }

        for ($len = count($candidates) - 1; $len > 1; $len--) {
            if (isset($this->dict_words[$candidates[$len]])) {
                return $len;
            }
        }

        return 0;
    }
}

?>

希望本文所述对大家PHP程序设计有所帮助。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值