功能: 对英文段落进行分句.
原理: 根据特殊标点符号(. ? ! 等)进行切分, 并考虑特殊情况, 比如网址或小数中包含的点号(.)不应被当作句子结尾.
代码:
<?php
// Placeholder registry shared by the helpers below (used for English
// sentence splitting). $special[0] collects the protected original
// fragments and $special[1] the placeholder tokens standing in for them;
// entries at the same index belong together.
$special = array();

/**
 * Split an English paragraph into sentences.
 *
 * The text is cut at '.', '?' and '!'. Dots that are not sentence
 * terminators (inside "www.xxx.com"-style hosts, in ".com" / ".cn" / ".org"
 * suffixes and inside dotted numbers such as "3.14" or "1.2.3") are first
 * swapped for placeholder tokens, and swapped back after the split.
 *
 * The terminating punctuation and any whitespace following it are not part
 * of the returned sentences, and empty pieces (e.g. after a trailing '.',
 * or between the dots of "...") are dropped.
 *
 * @param string $s Paragraph to split.
 * @return array List of sentences; empty when $s has no sentence text.
 */
function englishCut($s)
{
    global $special;

    // Start from a clean registry on every call.
    $special[0] = array();
    $special[1] = array();

    // Protect fragments whose dots must not end a sentence.
    $s = special_replace('/www\.[\w]+\.(com|cn|org)/i', $s);
    $s = special_replace('/\.(com|cn|org)/i', $s);
    // Match the whole dotted number so every dot in "1.2.3" is protected,
    // not only the first one.
    $s = special_replace('/[0-9]+(?:\.[0-9]+)+/', $s);

    // Cut at each terminator, swallowing all whitespace after it so the
    // following sentence does not start with blanks. PREG_SPLIT_NO_EMPTY
    // removes the empty trailing piece produced by a final terminator.
    $sentences = preg_split('/[\?\.\!]\s*/', trim($s), -1, PREG_SPLIT_NO_EMPTY);

    // Put the protected fragments back into every sentence.
    foreach ($sentences as $index => $sentence) {
        $sentences[$index] = special_revert($sentence);
    }

    return $sentences;
}

/**
 * Replace every match of $pattern in $str with a placeholder token and
 * record the (original, token) pair in the global $special registry.
 *
 * The token is "|" . md5(match) . "|", so it contains no '.', '?' or '!'.
 * Only the matched occurrences are replaced; other text that merely looks
 * like a match is left untouched.
 *
 * @param string $pattern PCRE pattern including delimiters.
 * @param string $str     Text to protect.
 * @return string Text with matches replaced; $str unchanged if the regex
 *                engine reports an error.
 */
function special_replace($pattern, $str)
{
    $protected = preg_replace_callback(
        $pattern,
        function ($match) {
            global $special;

            $token = '|' . md5($match[0]) . '|';
            $special[0][] = $match[0];
            $special[1][] = $token;

            return $token;
        },
        $str
    );

    // preg_replace_callback() returns null on a PCRE error.
    return $protected === null ? $str : $protected;
}

/**
 * Restore the fragments previously hidden by special_replace().
 *
 * @param string $str Text possibly containing placeholder tokens.
 * @return string Text with every known token replaced by its original.
 */
function special_revert($str)
{
    global $special;

    // Nothing registered yet: nothing to restore.
    if (empty($special[0]) || empty($special[1])) {
        return $str;
    }

    // Undo the newest placeholders first, so that a protected fragment
    // which itself contains an older placeholder is fully restored.
    return str_replace(
        array_reverse($special[1]),
        array_reverse($special[0]),
        $str
    );
}

// Usage example:
// $a = englishCut($str);
参考资料:
http://topic.csdn.net/u/20081102/23/6ffe7f9e-c322-4f08-ace5-317496acff54.html