准备：
1、prototype.js（http://prototype.conio.net/），用于通过 Ajax 加载词库文件。
2、汉字词库：http://cn.minidx.com/index.php?option=com_docman&task=doc_download&gid=47 （下载后保存为 js/dict.txt，即脚本中请求的路径）。
----------------------------------------------------------------------------------------------------------------------------------------------
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>中英文混合分词</title>
<script language="javascript" src="prototype.js"></script>
<script language="javascript">
// Raw dictionary text (one entry per line). It stays empty until the Ajax
// request below succeeds, so segmentation run before that treats every
// non-ASCII character as its own token.
var dict = "";
// Not referenced anywhere in the visible script; kept in case other code reads it.
var lastword = "";
// Load the dictionary via Ajax.
// - method 'get': Prototype defaults to POST, which is inappropriate for
//   fetching a static text file.
// - onSuccess instead of onComplete: onComplete also fires for failed
//   requests, which would store an error page as the dictionary.
new Ajax.Request('js/dict.txt', {
  method: 'get',
  onSuccess: function (response) {
    dict = String(response.responseText);
  }
});
// Tokens produced by the most recent segmentation run.
var rs = [];
/**
 * Finds the longest dictionary entry that occurs in `text` exactly at `pos`.
 *
 * @param {string} text     Text being segmented.
 * @param {number} pos      Index in `text` where the entry must begin.
 * @param {string} haystack Dictionary text wrapped in a leading and trailing
 *                          '\n' so the first and last entries are delimited
 *                          like every other line.
 * @returns {string} The longest matching entry, or '' when none matches.
 */
function longestDictWordAt(text, pos, haystack) {
  // Only lines that begin with the current character can match, so jump
  // straight to them instead of visiting every line.
  var needle = '\n' + text.charAt(pos);
  var best = '';
  var lineEnd = 0;
  var lineStart = haystack.indexOf(needle, lineEnd);
  var entry;

  while (lineStart !== -1) {
    lineEnd = haystack.indexOf('\n', lineStart + 1);
    if (lineEnd === -1) {
      break; // cannot happen with a wrapped haystack; defensive only
    }
    // Stripping whitespace also drops the '\r' of CRLF line endings.
    entry = haystack.substring(lineStart + 1, lineEnd).replace(/\s/g, '');
    // The entry must sit at `pos`; matching it anywhere else in the text
    // would emit tokens out of order.
    if (entry.length > best.length && text.substr(pos, entry.length) === entry) {
      best = entry;
    }
    lineStart = haystack.indexOf(needle, lineEnd);
  }
  return best;
}

/**
 * Splits mixed Chinese/English text into tokens (forward maximum matching).
 *
 * Runs of ASCII word characters (\w) become one token each and the whitespace
 * after them is skipped. Any other character starts a dictionary lookup: the
 * longest entry found at that position becomes the token; if no entry matches,
 * the single character is the token. Whitespace-only tokens are dropped.
 *
 * @param {string} text         Text to segment.
 * @param {string} [dictionary] Newline-separated word list (CRLF or LF).
 *                              Defaults to the global `dict`.
 * @param {Array}  [output]     Array that receives the tokens in order.
 *                              Defaults to the global `rs`.
 * @returns {boolean} Always true once the whole text has been consumed.
 */
function divide(text, dictionary, output) {
  var input = (text === null || text === undefined) ? '' : String(text);
  var tokens = (output === undefined) ? rs : output;
  var wordChar = /\w/;
  var blank = /\s/;
  var haystack = null; // built lazily: pure-ASCII input never needs it
  var pos = 0;
  var stop;
  var token;

  // Iterative scan: one recursive call per token would overflow the call
  // stack on long input.
  while (pos < input.length) {
    if (wordChar.test(input.charAt(pos))) {
      // English (ASCII) word: take the whole \w run.
      stop = pos + 1;
      while (stop < input.length && wordChar.test(input.charAt(stop))) {
        stop += 1;
      }
      tokens.push(input.substring(pos, stop));
      // Skip the whitespace that separates it from the next token.
      while (stop < input.length && blank.test(input.charAt(stop))) {
        stop += 1;
      }
      pos = stop;
      continue;
    }

    if (haystack === null) {
      haystack = '\n' + ((dictionary === undefined) ? dict : dictionary) + '\n';
    }
    // Longest dictionary word at this position; a character the dictionary
    // does not know becomes a token by itself.
    token = longestDictWordAt(input, pos, haystack) || input.charAt(pos);
    pos += token.length;
    if (token.replace(/\s/g, '') !== '') {
      tokens.push(token);
    }
  }
  return true;
}
/**
 * Event handler: segments the contents of the #word input and shows the
 * tokens, comma-separated, in #dividresult.
 *
 * Resets the global `rs` before calling divide(), which appends to it.
 */
function dodivde() {
  var text = $('word').value;
  var target = $('dividresult');

  rs = [];
  divide(text);

  // The tokens come from user input, so insert them as a text node rather
  // than through innerHTML, which would parse characters such as '<' and '&'
  // as markup. join(',') makes the former implicit array-to-string coercion
  // explicit and yields the same text.
  while (target.firstChild) {
    target.removeChild(target.firstChild);
  }
  target.appendChild(document.createTextNode(rs.join(',')));
}
</script></head>
<body>
<input type="text" name="word" id="word" value="我Welcome欢迎to Mozilla Firefox Help" onblur="dodivde();" /><input name="do" type="button" value="DO IT" onclick="dodivde();" />
<span id="dividresult"></span>
</body>
</html>
-------------------------------------------------------------------------------------------------------------------
关键是词库。