<?php
/**
 * Dictionary-based word segmenter for mixed Chinese / Latin / numeric text,
 * using reverse maximum matching (RMM): the text is scanned from its end and
 * the longest dictionary word ending at the current position is peeled off.
 *
 * Algorithm reference: https://blog.csdn.net/xqhadoop/article/details/60757242
 */
class Seg
{
    // Character classes used when merging runs of single characters.
    const TYPE_OTHER = 0;  // anything that is neither a letter nor numeric
    const TYPE_LETTER = 1; // upper-case ASCII letter A-Z
    const TYPE_NUMBER = 2; // digit, or one of + - * / . %

    // Upper-cased dictionary words, keyed as passed to set_dict().
    private $dict = [];

    // Upper-cased word => true; gives exact (byte-for-byte) O(1) lookups.
    private $dictIndex = [];

    // Length in characters of the longest dictionary word.
    private $maxWordLen = 0;

    /**
     * Load the dictionary.
     *
     * Words are upper-cased with strtoupper() so that they can be compared
     * with the upper-cased input text.
     *
     * @param array $vDict dictionary words
     */
    public function set_dict($vDict)
    {
        $words = [];
        $index = [];
        $maxLen = 0;
        foreach ($vDict as $key => $word) {
            $upper = strtoupper($word);
            $words[$key] = $upper;
            $index[$upper] = true;
            $maxLen = max($maxLen, mb_strlen($upper));
        }
        $this->dict = $words;
        $this->dictIndex = $index;
        $this->maxWordLen = $maxLen;
    }

    /**
     * Segment a string into tokens.
     *
     * Pass 1 splits the upper-cased text into dictionary words and single
     * characters (RMM). Pass 2 merges consecutive single characters of the
     * same class (letters, numeric characters, other) into one token and
     * drops ASCII spaces, which only act as separators.
     *
     * NOTE: terminates the script via exit() when no dictionary is loaded.
     *
     * @param string $vStr text to segment
     * @return array list of upper-cased tokens in text order, keyed 0..n-1
     */
    public function rmmseg($vStr = '')
    {
        if ('' === $vStr) {
            return [];
        }
        if (empty($this->dict)) {
            exit('词典为空');
        }

        // Upper-case the text so it matches the upper-cased dictionary.
        $str = strtoupper($vStr);

        // Pass 1: peel tokens off the end of the text. $remaining is the
        // length of the still-unsegmented prefix; it shrinks by at least one
        // character per iteration, so the loop always terminates.
        $tokens = [];
        $remaining = mb_strlen($str);
        while ($remaining > 0) {
            // Fallback: take the last character on its own.
            $start = $remaining - 1;
            // Candidates longer than the longest dictionary word can never
            // match, so begin at that length and shrink (longest match wins).
            $first = max(0, $remaining - $this->maxWordLen);
            for ($i = $first; $i < $remaining - 1; $i++) {
                if (isset($this->dictIndex[mb_substr($str, $i, $remaining - $i)])) {
                    $start = $i;
                    break;
                }
            }
            $tokens[] = mb_substr($str, $start, $remaining - $start);
            $remaining = $start;
        }

        // Pass 2: walk the tokens in text order and merge single characters.
        $merged = [];
        $run = '';       // pending run of same-class single characters
        $runType = null; // class of the pending run, null when there is none
        foreach (array_reverse($tokens) as $word) {
            if (1 !== mb_strlen($word)) {
                // Multi-character dictionary word: flush the run, keep the word.
                if ('' !== $run) {
                    $merged[] = $run;
                    $run = '';
                }
                $runType = null;
                $merged[] = $word;
                continue;
            }
            if (' ' === $word) {
                // A space ends the current run and is never emitted itself.
                if ('' !== $run) {
                    $merged[] = $run;
                    $run = '';
                }
                $runType = null;
                continue;
            }
            $type = $this->char_type($word);
            if ($type === $runType) {
                $run .= $word;
                continue;
            }
            // Class changed: emit the previous run and start a new one.
            if ('' !== $run) {
                $merged[] = $run;
            }
            $run = $word;
            $runType = $type;
        }
        if ('' !== $run) {
            $merged[] = $run;
        }

        return $merged;
    }

    /**
     * Classify a single (upper-cased) character for run merging.
     *
     * @param string $char one character
     * @return int one of the TYPE_* constants
     */
    private function char_type($char)
    {
        if ($char >= 'A' && $char <= 'Z') {
            return self::TYPE_LETTER;
        }
        // Arithmetic signs, the decimal point and the percent sign are
        // grouped with digits so that e.g. "24.89" or "0.5%" stay together.
        if (is_numeric($char) || in_array($char, ['+', '-', '*', '/', '.', '%'], true)) {
            return self::TYPE_NUMBER;
        }
        return self::TYPE_OTHER;
    }
}
//------------------------------------------------------------------
// Demo: segment a sample string and render every token as a highlighted span.
$seg = new Seg();

// Sample dictionary.
$dict = [
    '中华', '广大', '人民', '共和国',
    '电阻', '电阻值', '贴片', '电压',
    '精度', 'RC', '功率', 'RES',
    'OHM', '0603', '贴片电阻',
];

// Sample text, assembled from three fragments.
$str = '贴片电阻Res0603889电阻值24.89kohm,电压 25V 功率1/8w放'
    . 'RC0603FR-0722kL,4.22k精度0.5%,99 88方式'
    . '中华人民共和国广大';

$seg->set_dict($dict);
$res = $seg->rmmseg($str);

// Original string, then the heading for the result.
echo '原字符串=', $str, '<br>';
echo '<br/>分词结果=';

// Inline stylesheet for the highlighted tokens.
echo implode("\n", [
    '<style>.C_HIGHLIGHT{',
    'background:#ff0; border:1px solid orange;',
    'padding:1px 3px; margin-left:1px ;margin-top:2px;',
    'display:inline-block}',
    '</style>',
]);

// One span per token, each followed by a space.
foreach ($res as $word) {
    echo "<span class='C_HIGHLIGHT'>", $word, '</span> ';
}
// Reference: https://blog.csdn.net/xqhadoop/article/details/60757242