通过读取txt格式的词库,写入memcache缓存。比较简单,基本不存在分词的问题:
前端如下:
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<script type="text/javascript">
/**
 * Opens the given URL in a fixed-size popup window with browser chrome disabled.
 *
 * @param {string} newWindow URL to load in the popup.
 */
function openWindow(newWindow)
{
    // Window features must be comma-separated name=value pairs; unknown names
    // (the former "toolbal"/"resizeable") and a ";" separator are ignored or
    // mis-parsed by browsers.
    var features = [
        "height=460px",
        "width=740px",
        "top=100px",
        "left=350px",
        "toolbar=no",
        "menubar=no",
        "scrollbars=no",
        "resizable=no",
        "location=no",
        "status=no"
    ].join(", ");

    // NOTE(review): the window name below looks mis-encoded (probably a Chinese
    // title that was saved in the wrong charset) — kept as-is; confirm the intended name.
    window.open(newWindow, "՚Пөѯ", features);
}
// Register a handler instead of calling openWindow() right away: the original
// assigned the call's return value (undefined) and spelled the property with
// non-ASCII look-alike letters, so no load handler was ever installed.
window.onload = function ()
{
    openWindow("view.php");
};
</script>
</head>
<body>
</body>
</html>
弹出的窗口如下:
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<script type="text/javascript">
/**
 * Creates an HTTP request object for the current browser.
 *
 * Prefers the standard XMLHttpRequest and falls back to the ActiveX object
 * only when the native constructor is missing.
 *
 * @returns {Object|null} a request object, or null when the browser offers
 *     neither transport (callers test the result for truthiness).
 */
function getXMLHttpRequest()
{
    if (typeof XMLHttpRequest !== "undefined")
    {
        return new XMLHttpRequest();
    }
    if (window.ActiveXObject)
    {
        try
        {
            return new ActiveXObject("Microsoft.XMLHttp");
        }
        catch (e)
        {
            // ActiveX is present but instantiation failed; report "unsupported".
            return null;
        }
    }
    return null;
}
/**
 * Sends the text typed into #enWord to /ciba/process.php and shows the
 * response in #chWord as "<word>: <response>".
 *
 * The input box is cleared as soon as the request is prepared. Nothing is
 * displayed unless the request completes with HTTP status 200.
 */
function query()
{
    var input = $('enWord');
    var en = input.value;

    // Encode the word so characters such as "&", "#", "+" or non-ASCII text
    // cannot corrupt the query string. The random value defeats GET caching.
    var url = "/ciba/process.php"
        + "?enword=" + encodeURIComponent(en)
        + "&rand=" + Math.random();

    input.value = "";

    // Local variable: the original leaked the request object into a global,
    // so overlapping queries could overwrite each other's state.
    var xmlhttp = getXMLHttpRequest();
    if (!xmlhttp)
    {
        return;
    }

    xmlhttp.open("get", url, true);
    xmlhttp.onreadystatechange = function ()
    {
        if (xmlhttp.readyState == 4 && xmlhttp.status == 200)
        {
            var output = $("chWord");
            // Insert the result as a text node rather than through innerHTML so
            // that neither the typed word nor the server reply is parsed as markup.
            while (output.firstChild)
            {
                output.removeChild(output.firstChild);
            }
            output.appendChild(document.createTextNode(en + ": " + xmlhttp.responseText));
        }
    };
    xmlhttp.send(null);
}
/**
 * Key handler: runs a query when the Enter key (keyCode 13) is pressed.
 *
 * @param {Object} event key event carrying a keyCode property.
 * @returns {boolean|undefined} false after a query was started, otherwise undefined.
 */
function sendQuery(event)
{
    var ENTER_KEY = 13;

    // Any key other than Enter is ignored.
    if (event.keyCode != ENTER_KEY)
    {
        return;
    }

    query();
    return false;
}
/**
 * Click handler for the #enWord box: removes the initial prompt text so the
 * user can start typing.
 *
 * Text the user has already typed is left alone; the original cleared the box
 * on every click, which erased input whenever the user clicked to move the cursor.
 *
 * @returns {boolean} always false.
 */
function clearContent()
{
    var input = $("enWord");

    // The prompt is surrounded by line breaks in the markup, so compare the
    // whitespace-stripped value (regex instead of trim() for old browsers).
    var current = input.value.replace(/^\s+|\s+$/g, "");
    if (current === "请输入词条")
    {
        input.value = "";
    }
    return false;
}
/**
 * Shorthand for looking up a DOM element by its id attribute.
 *
 * @param {string} id element id.
 * @returns {Object|null} whatever document.getElementById returns for that id.
 */
function $(id)
{
    var element = document.getElementById(id);
    return element;
}
</script>
<style rel="stylesheet" type="text/css">
#enWord{border:1px solid blue;
margin-top:20px;
}
#sendButton{position:relative;
bottom:12px;}
#chWord{position:relative;
left:10px;
top:50px;
font-size:20px;
color:red;}
</style>
</head>
<body>
<img src="logo.jpg" /><br />
<textarea id="enWord" cols="90" rows="2" onkeydown="sendQuery(event);" onclick="clearContent();">
请输入词条
</textarea>
<input type="button" id="sendButton" value="查询" onclick="query();" /><br />
<div id="chWord"></div>
</body>
</html>
【控制器部分】
<?php
// Lookup endpoint: reads the "enword" query parameter, fetches its translation
// through MemStore and echoes it back as plain text. Every request is also
// appended to aword.txt as a simple log line.

// "text/plain" is the registered media type; "plain/text" is not a valid one.
header("content-type: text/plain; charset=utf-8");
require_once "storeWord.php";

// is_string() rejects array input such as ?enword[]=x, which would otherwise
// reach trim() inside MemStore::filterWord().
if(!empty($_GET['enword']) && is_string($_GET['enword']))
{
    $en=$_GET['enword'];
    $mem=new MemStore();
    // getWord() terminates the script with a message when the entry is missing.
    $ch=$mem->getWord($en);
    // Log the normalized key actually used for the lookup, not the raw input.
    $en=$mem->filterWord($en);
    $res="<res><en>$en</en><ch>$ch</ch></res>";
    file_put_contents('aword.txt', $res."\r\n",FILE_APPEND);
    //ob_clean();
    echo $ch;
}
else
{
    file_put_contents('aword.txt', "receive NON data \r\n",FILE_APPEND);
}
【后台】
<?php
require_once('parseWord.php');
/**
 * Dictionary store backed by a Memcache server on 127.0.0.1:11211.
 *
 * Entries are keyed by the English word (normalized through filterWord() for
 * every operation except the bulk import) and hold the translation as value.
 * Failures terminate the script via die(), which is the error style used
 * throughout this class.
 */
class MemStore
{
    /** Memcache connection; created in the constructor. */
    private $mem=null;
    /** Pattern for a leading run of ASCII letters; not referenced by any method of this class. */
    private $pat='#^[a-zA-Z]+\b#i';

    /**
     * Connects to the local Memcache server or terminates the script.
     */
    public function __construct()
    {
        $this->mem=new Memcache();
        $this->mem->connect("127.0.0.1", 11211) or die("connect memcached failed!!!<br />");
    }

    /**
     * Closes the connection, if one was ever created.
     */
    public function __destruct()
    {
        if($this->mem!==null)
        {
            $this->mem->close();
        }
    }

    /**
     * Imports every entry of a dictionary file into the cache.
     *
     * Uses Memcache::add(), so the import stops with die() on the first entry
     * that cannot be added (including a key that already exists).
     *
     * @param string $dic path of the dictionary text file, parsed by the Word class.
     */
    public function addWord($dic)
    {
        $word=new Word($dic);
        $word->readWord();
        $result=$word->getWord();
        foreach($result as $en => $ch)
        {
            // Expiry is given as an absolute Unix timestamp, ten days ahead.
            $this->mem->add($en, $ch, MEMCACHE_COMPRESSED, time()+10*24*3600) or die("添加词条失败". __LINE__ ."<br />");
        }
    }

    /**
     * Overwrites the translation of an existing entry.
     *
     * @param string $en English word (normalized with filterWord()).
     * @param mixed  $ch new translation.
     */
    public function setWord($en,$ch)
    {
        $key=$this->filterWord($en);
        // Only check that the entry exists. The original stored the fetched
        // value back into $en and then wrote under that value instead of the word.
        if($this->mem->get($key)===false)
        {
            die("找不到词条 $key");
        }
        $this->mem->set($key, $ch, MEMCACHE_COMPRESSED, time()+31*24*3600) or die("添加词条{$key}失败");
    }

    /**
     * Returns the translation stored for a word, or terminates the script
     * when there is none.
     *
     * @param string $en English word (normalized with filterWord()).
     * @return mixed stored translation.
     */
    public function getWord($en)
    {
        $key=$this->filterWord($en);
        $ch=$this->mem->get($key);
        // Compare against false explicitly so that a stored falsy value such
        // as "0" or "" is not mistaken for a missing entry.
        if($ch===false)
        {
            die("找不到词条 $key");
        }
        return $ch;
    }

    /**
     * Replaces the translation of an existing entry via Memcache::replace().
     *
     * @param string $en English word (normalized with filterWord()).
     * @param mixed  $ch new translation.
     */
    public function replaceWord($en,$ch)
    {
        $key=$this->filterWord($en);
        // Same fix as setWord(): keep the key, do not replace it with the old value.
        if($this->mem->get($key)===false)
        {
            die("找不到词条 $key");
        }
        $this->mem->replace($key, $ch, MEMCACHE_COMPRESSED, time()+31*24*3600) or die("替换词条{$key}失败");
    }

    /**
     * Deletes an entry or terminates the script when deletion fails.
     *
     * @param string $en English word (normalized with filterWord()).
     */
    public function deleteWord($en)
    {
        $key=$this->filterWord($en);
        $this->mem->delete($key,0) or die("删除词条{$key}失败");
    }

    /**
     * Normalizes user input into a cache key.
     *
     * Input containing Chinese characters, whitespace or listed punctuation is
     * reduced to its first run of ASCII letters, lower-cased and cut to 20
     * bytes. When such input contains no usable letters a single space is
     * returned (previously two of the three branches returned null instead).
     * Clean input longer than 20 bytes is cut and lower-cased; clean short
     * input is returned unchanged.
     *
     * NOTE(review): the final branch does not lower-case, unlike every other
     * branch — confirm whether that is intentional.
     *
     * @param string $en raw input.
     * @return string normalized key, or " " when nothing usable was found.
     */
    public function filterWord($en)
    {
        $en=trim($en);
        // Chinese characters or , ) . ( present: take the first letter run
        // that is delimited by word boundaries.
        if(preg_match('#[\x{4e00}-\x{9fa5},\)\.\(]+#u', $en))
        {
            return $this->extractEnglish('#\b[a-z]+\b#i', $en);
        }
        // Phrase containing whitespace: keep only the first letter run.
        if(preg_match('#\s+#', $en))
        {
            return $this->extractEnglish('#[a-z]+#i', $en);
        }
        // Input containing one of the listed illegal characters.
        if(preg_match('#[—_\+\?\*\^\$\#\%\&\/\\,\.!@=\`\'\"\"""]#',$en))
        {
            return $this->extractEnglish('#[a-z]+#i', $en);
        }
        if(strlen($en)>20)
        {
            return strtolower(substr($en, 0,20));
        }
        return $en;
    }

    /**
     * Returns the first match of $pattern in $en, lower-cased and limited to
     * 20 bytes, or " " when the pattern does not match.
     */
    private function extractEnglish($pattern, $en)
    {
        if(!preg_match($pattern, $en, $res))
        {
            return " ";
        }
        $found=$res[0];
        if(strlen($found)>20)
        {
            $found=substr($found, 0,20);
        }
        return strtolower($found);
    }

    /**
     * Removes every entry from the cache server.
     */
    public function flushAll()
    {
        $this->mem->flush();
    }

    /**
     * Returns the current time in seconds with microsecond precision.
     *
     * The original tested for a non-existent function "micro_time" and
     * therefore always fell back to whole-second time().
     *
     * @return float Unix timestamp.
     */
    public function getTime()
    {
        return microtime(true);
    }
}
//$mem=new MemStore();
//$mem->addWord('ciba.txt');
//$mem->flushAll();
//$mem->replaceWord('abandon', 100000000);
//$mem->deleteWord('abandon');
//echo $mem->getWord('_*&^%abandon^%$#');
//echo "ok";
?>
【解析库】
<?php
/**
 * Parses a dictionary text file into an array of English word => Chinese
 * translation, one entry per non-empty line.
 */
class Word
{
    /** Matches the first run of word characters on a line (the English headword). */
    private $query_en='#\w+\b#i';
    /** Matches text starting at the first Chinese character, continuing over Chinese, word characters, spaces and , ) . ( */
    private $query_ch='#[\x{4e00}-\x{9fa5}][\x{4e00}-\x{9fa5},\)\.\( \w]*#u';
    /** Parsed entries, filled by readWord(). */
    private $arr_word=array();
    /** Only referenced by a disabled debugging limit in the original code. */
    private $recycle_num=100;
    /** Read handle of the dictionary file. */
    private $fp=null;

    /**
     * Opens the dictionary file for reading or terminates the script.
     *
     * @param string $fileName path of the dictionary file.
     */
    public function __construct($fileName)
    {
        $this->fp=fopen($fileName,'r') or die('打开ciba失败');
    }

    /**
     * Reads the file line by line and collects the entries.
     *
     * Blank lines are skipped. Lines for which the English or the Chinese part
     * cannot be extracted are skipped as well (the parse methods still print
     * their failure message); previously such lines were stored under an
     * empty key or with a null value.
     */
    public function readWord()
    {
        // Loop on the fgets() result itself so the final false at EOF is never processed.
        while(($line=fgets($this->fp))!==false)
        {
            $line=trim($line);
            if($line==='')
            {
                continue;
            }
            $en=$this->parseEn($line);
            $ch=$this->parseCh($line);
            if($en===null || $ch===null)
            {
                continue;
            }
            $this->arr_word[$en]=$ch;
        }
    }

    /**
     * Extracts the English headword from a line.
     *
     * @param string $word dictionary line (taken by reference, not modified).
     * @return string|null the match, or null after printing a message when nothing matches.
     */
    public function parseEn(&$word)
    {
        if(preg_match($this->query_en, $word, $en))
        {
            return $en[0];
        }
        echo "match english word failed<br />";
        return null;
    }

    /**
     * Extracts the Chinese translation from a line.
     *
     * @param string $word dictionary line (taken by reference, not modified).
     * @return string|null the match, or null after printing a message when nothing matches.
     */
    public function parseCh(&$word)
    {
        if(preg_match($this->query_ch, $word, $ch))
        {
            return $ch[0];
        }
        echo "match chinese failed<br />";
        return null;
    }

    /**
     * Returns the entries collected so far.
     *
     * @return array English word => Chinese translation.
     */
    public function getWord()
    {
        return $this->arr_word;
    }

    /**
     * Closes the file handle if it is still an open resource.
     */
    public function __destruct()
    {
        if(is_resource($this->fp))
        {
            fclose($this->fp);
        }
    }
}
//$word=new Word('ciba.txt');
//$word->readWord();
//echo "<pre>";
//print_r($word->getWord());
//echo "</pre>"; */
?>