《-----------------------------前端代码---------------------------------》
进入默认页面,会弹出一个适当大小的窗口:
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<script type="text/javascript">
/**
 * Open a fixed-size popup window without browser chrome.
 *
 * @param {string} newWindow URL to load; the same string is reused as the
 *                           window name.
 */
function openWindow(newWindow)
{
    // The feature keys "toolbar" and "resizable" were misspelled in the
    // original string, and a stray ";" stood in for a "," separator.
    var features = [
        "height=460",
        "width=760",
        "top=100",
        "left=350",
        "toolbar=no",
        "menubar=no",
        "scrollbars=no",
        "resizable=no",
        "location=no",
        "status=no"
    ].join(", ");
    window.open(newWindow, newWindow, features);
}
// Register a real load handler. The original assigned the *result* of an
// immediate openWindow() call to a property whose name was spelled with
// Greek omicron characters, so no onload handler was ever installed.
window.onload = function ()
{
    openWindow("view.html");
};
</script>
</head>
<body></body>
</html>
弹出的窗口如下:
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<script type="text/javascript">
/**
 * Create the request object used for AJAX calls.
 *
 * @returns {object} an ActiveX "Microsoft.XMLHttp" object when the browser
 *                   exposes window.ActiveXObject, otherwise a native
 *                   XMLHttpRequest.
 */
function getXMLHttpRequest()
{
    return window.ActiveXObject
        ? new ActiveXObject("Microsoft.XMLHttp")
        : new XMLHttpRequest();
}
/**
 * Send the AJAX lookup for the word currently in #enWord and render the
 * answer into #chWord as "<word>: <server response>".
 *
 * The input box is cleared as soon as the request is prepared.
 */
function query()
{
    var input = $('enWord');
    var en = input.value;
    var url = "/ciba/niujin/process.php";
    // Encode the word so characters such as "&", "#" or "+" cannot corrupt
    // the query string; "rand" only defeats caching of the GET request.
    var data = "?enword=" + encodeURIComponent(en) + "&rand=" + Math.random();
    input.value = "";

    // Keep the request object local instead of leaking an implicit global.
    var xmlhttp = getXMLHttpRequest();
    if (!xmlhttp)
    {
        return;
    }

    xmlhttp.open("get", url + data, true);
    xmlhttp.onreadystatechange = function ()
    {
        if (xmlhttp.readyState == 4 && xmlhttp.status == 200)
        {
            var target = $("chWord");
            target.innerHTML = "";
            // The typed word is inserted as text, never parsed as HTML.
            target.appendChild(document.createTextNode(en + ": "));
            // The server response is still inserted as HTML.
            target.insertAdjacentHTML("beforeend", xmlhttp.responseText);
        }
    };
    xmlhttp.send(null);
}
/**
 * Key handler: run the lookup when the Enter key (keyCode 13) is pressed.
 *
 * @param {object} event key event carrying a keyCode property
 * @returns {boolean|undefined} false after triggering a query, otherwise
 *                              undefined
 */
function sendQuery(event)
{
    var isEnter = (event.keyCode == 13);
    if (!isEnter)
    {
        return;
    }
    query();
    return false;
}
/**
 * Click handler for the input box: remove the placeholder text.
 *
 * Only the untouched default value is cleared; previously every click wiped
 * the field, so clicking to correct a typed word erased it.
 *
 * @returns {boolean} always false
 */
function clearContent()
{
    var input = $("enWord");
    if (input.value === input.defaultValue)
    {
        input.value = "";
    }
    return false;
}
/**
 * Shorthand for document.getElementById.
 *
 * @param {string} id element id
 * @returns {*} whatever document.getElementById returns for that id
 */
function $(id)
{
    var element = document.getElementById(id);
    return element;
}
</script>
<style rel="stylesheet" type="text/css">
#enWord{
width:650px;
height:40px;
border:2px solid gray;
margin-top:10px;
}
#sendButton{
width:65px;
height:35px;
}
#chWord{position:relative;
left:10px;
top:50px;
width:600px;
font-size:20px;
color:red;}
</style>
</head>
<body>
<img src="logo.jpg" /><br />
<input type="text" value="请输入词条" id="enWord" onkeydown="sendQuery(event);" onclick="clearContent();" />
<input type="button" id="sendButton" value="查询" onclick="query();" /><br />
<div id="chWord"></div>
</body>
</html>
<-------------------------------处理过程如下------------------------------------------------->
<?php
// AJAX endpoint: looks up the word passed as ?enword=... and echoes its
// stored translation. Every request is also appended to aword.txt.
require_once "storeWord.class.php";
require_once "filterWord.tool.php";
// "plain/text" is not a valid MIME type; the correct spelling is text/plain.
header("content-type: text/plain; charset=utf-8");
// Accept only a non-empty string, so a request such as ?enword[]=x is logged
// as "no data" instead of reaching trim() with an array.
$rawWord = isset($_GET['enword']) ? $_GET['enword'] : null;
if (is_string($rawWord) && !empty($rawWord))
{
    // Normalise the input before it is used as a lookup key.
    $en = formatWord($rawWord);
    $redis = new StoreWord();
    $ch = $redis->getWord($en);
    $res = "<res><en>$en</en><ch>$ch</ch></res>";
    file_put_contents('aword.txt', $res."\r\n", FILE_APPEND);
    //ob_clean();
    echo $ch;
}
else
{
    file_put_contents('aword.txt', "receive NON data \r\n", FILE_APPEND);
}
------------------------------格式化输入的工具函数如下:
<?php
/**
 * Normalise a user-supplied lookup term.
 *
 * Rules, checked in this order after trimming:
 *  1. The input contains CJK characters (U+4E00-U+9FA5) or one of , ) . ( :
 *     return the first whole run of ASCII letters.
 *  2. The input contains whitespace: return the first run of ASCII letters.
 *  3. The input contains one of the listed punctuation characters: return
 *     the first run of ASCII letters.
 *  4. The input is longer than 20 bytes: return its first 20 bytes.
 *  5. Otherwise the trimmed input is returned unchanged.
 *
 * Results of rules 1-4 are lower-cased and cut to 20 bytes; rule 5 keeps the
 * original case. When rules 1-3 find no letters the result is "" (rules 2
 * and 3 previously fell off the end of the function and returned null).
 *
 * @param string $en raw search term
 * @return string normalised term, possibly empty
 */
function formatWord($en)
{
    $en = trim($en);

    // First match of $pattern, capped at 20 bytes and lower-cased, or "".
    $firstWord = function ($pattern) use ($en) {
        if (!preg_match($pattern, $en, $found)) {
            return "";
        }
        return strtolower(substr($found[0], 0, 20));
    };

    // Chinese text or , ) . ( present: keep only the embedded English word.
    if (preg_match('#[\x{4e00}-\x{9fa5},\)\.\(]+#u', $en)) {
        return $firstWord('#\b[a-z]+\b#i');
    }

    // Several tokens separated by whitespace: keep the first one.
    if (preg_match('#\s+#', $en)) {
        return $firstWord('#[a-z]+#i');
    }

    // Illegal punctuation present: keep the first run of letters.
    if (preg_match('#[—_\+\?\*\^\$\#\%\&\/\\,\.!@=\`\'\"\"""]#', $en)) {
        return $firstWord('#[a-z]+#i');
    }

    // Over-long input.
    if (strlen($en) > 20) {
        return strtolower(substr($en, 0, 20));
    }

    return $en;
}
<------------------------------------后端实现如下---------------------------------------------->
1. 依次读取给定目录下的词条
2. 对词条进行分词和格式化
3. 按照从A-Z的分类,依次写入redis缓存服务器的【a-z】:OFX哈希表,所以总共有26个哈希表
4. 对A-Z目录下的词条进行写入
【读取目录】
<?php
/**
 * Recursively collects the .txt files below a directory and groups them by
 * the first character of their file name (lower-cased).
 */
class Dir
{
    /** @var array group character => list of file paths converted to UTF-8 */
    private $fileList = array();

    /**
     * @param string $path root directory to scan
     */
    public function __construct($path)
    {
        $this->readFileList($path);
    }

    /**
     * Scan $path and all of its sub-directories, recording every file whose
     * extension is "txt" (case-insensitive). Nothing is recorded when the
     * directory cannot be opened.
     *
     * @param string $path directory to scan
     */
    public function readFileList($path)
    {
        $path = $this->transPathSep($path);
        $encode = mb_detect_encoding($path, array('GB2312','GBK','UTF-8','BIG5','LATIN1'));
        // The original author notes that reading the path fails with UTF-8,
        // so it is converted to GB2312 first.
        // NOTE(review): presumably specific to Windows with a Chinese code
        // page - confirm before running elsewhere.
        $path = mb_convert_encoding($path, 'GB2312', $encode);

        $fd = opendir($path);
        if ($fd === false) {
            // Nothing to close when the directory could not be opened.
            return;
        }

        // Compare against false explicitly: an entry named "0" is falsy and
        // used to end the loop early.
        while (($fileName = readdir($fd)) !== false) {
            // Skip the current and the parent directory.
            if ($fileName === '.' || $fileName === '..') {
                continue;
            }
            $fullPath = $path . '/' . $fileName;
            if (is_file($fullPath)) {
                // PATHINFO_EXTENSION yields "" for extensionless files
                // instead of an undefined array index.
                if (strtolower(pathinfo($fullPath, PATHINFO_EXTENSION)) === 'txt') {
                    // The path is GB2312 at this point, so that is the
                    // source encoding for the conversion back to UTF-8
                    // (previously the encoding detected before the
                    // conversion was used).
                    $temp = mb_convert_encoding($fullPath, 'UTF-8', 'GB2312');
                    $groupName = $this->groupFile($temp);
                    $this->fileList[$groupName][] = $temp;
                }
            } elseif (is_dir($fullPath)) {
                // Descend into sub-directories.
                $this->readFileList($fullPath);
            }
        }
        closedir($fd);
    }

    /**
     * @return array group character => list of file paths
     */
    public function getFileList()
    {
        return $this->fileList;
    }

    /**
     * Derive the group (e.g. a-z) of a file: the lower-cased first character
     * after the last "/" of the path.
     *
     * @param string $filename file path using "/" separators
     * @return string single-character group name
     */
    private function groupFile($filename)
    {
        $pos = strripos($filename, '/');
        return strtolower(substr($filename, $pos + 1, 1));
    }

    /**
     * Replace the Windows path separator "\" with "/".
     *
     * The platform is taken from DIRECTORY_SEPARATOR. The earlier version
     * parsed $_SERVER["SERVER_SOFTWARE"], returned null for anything other
     * than "win32" and aborted when no parenthesised part was present.
     *
     * @param string $path path as supplied by the caller
     * @return string path with "/" separators on Windows, unchanged elsewhere
     */
    private function transPathSep($path)
    {
        if (DIRECTORY_SEPARATOR === '\\') {
            return str_replace("\\", "/", $path);
        }
        return $path;
    }
}
/* $dir=new Dir('E:\CodeEdit\php\ciba\TXT格式的牛津电子词典\牛津电子词典');
$list=$dir->getFileList();
echo "<pre>";
print_r($list);
echo "</pre>"; */
?>
【分词】
<?php
require_once "formatWord.class.php";
require_once "formatTrans.class.php";
/**
 * Parses one dictionary text file into an array of headword => translation.
 *
 * The file content is split at phonetic transcriptions ("/.../" at the start
 * of a line). After the split, the first unit holds the first headword, the
 * last unit holds the last translation, and every unit in between holds the
 * previous headword's translation followed by the next headword on its own
 * line.
 */
class Oxford
{
    private $OfileName = null;      // file path, converted to GB2312
    private $ODicString = '';       // whole file content as UTF-8
    private $ODicUnit = array();    // content split at the phonetics
    private $ODicWord = array();    // extracted headwords
    private $ODicTrans = array();   // extracted translations
    private $ODicEncoding = null;   // encoding detected from the file head
    private $word = null;           // FormatWord helper
    private $trans = null;          // FormatTrans helper

    /**
     * @param string $fname path of the dictionary file
     */
    public function __construct($fname)
    {
        $encode = mb_detect_encoding($fname, array('GB2312','GBK','UTF-8','BIG5','LATIN1'));
        $this->OfileName = mb_convert_encoding(trim($fname), 'GB2312', $encode);
        $this->word = new FormatWord();
        $this->trans = new FormatTrans();
        // Only the first 64 bytes are sampled to guess the file encoding.
        $temp = file_get_contents($this->OfileName, false, null, 0, 64);
        $this->ODicEncoding = mb_detect_encoding($temp, array('GB2312','GBK','UTF-8','BIG5','LATIN1'));
    }

    /**
     * Read the whole file into $ODicString, converted to UTF-8.
     * Terminates the script when the file is missing or unreadable.
     */
    protected function readDicFromFile()
    {
        if (!file_exists($this->OfileName)) {
            die('文件不存在'.__LINE__);
        }
        if (!is_readable($this->OfileName)) {
            die('文件不可读'.__LINE__);
        }
        $content = file_get_contents($this->OfileName);
        if ($content === false) {
            die('打开文件失败'.__LINE__);
        }
        // Assign rather than append, so a second oxf() call does not double
        // the content.
        $this->ODicString = $content;
        // Convert only when an encoding was detected and it is not already
        // UTF-8; a failed detection (false) is not passed on as an encoding.
        if ($this->ODicEncoding && strtoupper($this->ODicEncoding) != 'UTF-8') {
            $this->ODicString = mb_convert_encoding($this->ODicString, 'UTF-8', $this->ODicEncoding);
        }
    }

    /**
     * Split the content at the phonetic transcriptions and store the pieces
     * in $ODicUnit.
     *
     * @param string $pattern regular expression used as the separator
     */
    protected function splitWithVoice($pattern='#(?<=\r\n)/[^/]+?/(?=\s)#ui')
    {
        $this->ODicUnit = preg_split($pattern, $this->ODicString);
    }

    /**
     * Extract the headwords.
     *
     * The first unit is the first headword. In every unit from the second to
     * the second-to-last, the headword is the final line, separated from the
     * preceding translation by CRLF. Terminates the script when a unit has
     * no such line.
     */
    protected function grepWord()
    {
        // First headword.
        $this->ODicWord[] = trim($this->ODicUnit[0]);
        $pat = '#\r\n(.*)(?:\r\n)$#i';
        $len = count($this->ODicUnit);
        for ($i = 1; $i < $len - 1; $i++) {
            if (preg_match($pat, $this->ODicUnit[$i], $match)) {
                $this->ODicWord[] = $match[1];
            } else {
                die('匹配单词失败'.__LINE__."<br />");
            }
        }
    }

    /**
     * Extract the translations.
     *
     * In each middle unit the last occurrence of the unit's headword marks
     * the end of the previous headword's translation; the last unit is the
     * final translation as a whole.
     */
    protected function grepTrans()
    {
        $len = count($this->ODicUnit);
        for ($i = 1; $i < $len - 1; $i++) {
            $pos = strripos($this->ODicUnit[$i], $this->ODicWord[$i]);
            // Offset 0 is a valid match; only false means "not found".
            if ($pos !== false) {
                array_push($this->ODicTrans, $this->trans->getTrans(substr($this->ODicUnit[$i], 0, $pos)));
            } else {
                die("此方法不可行".__LINE__."<br />");
            }
        }
        array_push($this->ODicTrans, $this->trans->getTrans($this->ODicUnit[$len-1]));
    }

    /**
     * Run the whole pipeline.
     *
     * @return array headword => formatted translation
     */
    public function oxf()
    {
        // Start from a clean state so repeated calls give the same result.
        $this->ODicWord = array();
        $this->ODicTrans = array();

        $this->readDicFromFile();
        $this->splitWithVoice();
        $this->grepWord();
        $this->grepTrans();

        $oxfWord = array();
        $len = count($this->ODicWord);
        for ($i = 0; $i < $len; $i++) {
            $oxfWord[$this->ODicWord[$i]] = $this->ODicTrans[$i];
        }
        return $oxfWord;
    }
}//class Oxford
/* $oxf=new Oxford(' E:/CodeEdit/php/ciba/TXT格式的牛津电子词典/牛津电子词典/G/G-b,c,d,e.txt ');
$res=$oxf->oxf();
//$result=$oxf->returnWord();
echo "<pre>";
print_r($res);
echo "</pre>"; */
?>
【格式化词条】
<?php
/**
 * Helpers that reduce a raw dictionary headword line to a single word made
 * of ASCII letters.
 */
class FormatWord
{
    /**
     * Shared logic of the comma, space and quote filters.
     *
     * Splits $str at the first $delimiter. The right part starts $skip bytes
     * after the delimiter position. The right part is returned when the left
     * part is a single byte and shorter than the right part; otherwise the
     * left part is returned. When the delimiter is missing or at offset 0,
     * $str is returned unchanged.
     *
     * @param string $str       text to split
     * @param string $delimiter delimiter to look for
     * @param int    $skip      bytes to skip from the delimiter position
     * @return string
     */
    private function pickAroundDelimiter($str, $delimiter, $skip)
    {
        $pos = strpos($str, $delimiter);
        if (!$pos) {
            return $str;
        }
        $leftWord = trim(substr($str, 0, $pos));
        $rightWord = trim(substr($str, $pos + $skip));
        if (strlen($leftWord) == 1 && strlen($leftWord) < strlen($rightWord)) {
            return $rightWord;
        }
        return $leftWord;
    }

    // Comma case: the right part starts two bytes after the comma position.
    private function filterComma($str)
    {
        return $this->pickAroundDelimiter($str, ',', 2);
    }

    /**
     * Bracket case: choose between the text before "(" and the text inside
     * the brackets. The bracketed text wins when the leading text is a
     * single byte and the bracketed text is longer.
     */
    private function filterBrackets($str)
    {
        $pos01 = strpos($str, '(');
        if (!$pos01) {
            return $str;
        }
        $pos02 = strpos($str, ')');
        if (!$pos02) {
            return $str;
        }
        $leftWord = substr($str, 0, $pos01);
        $rightWord = substr($str, $pos01 + 1, $pos02 - $pos01 - 1);
        if (strlen($leftWord) == 1 && strlen($rightWord) > strlen($leftWord)) {
            return $rightWord;
        }
        return $leftWord;
    }

    // Space case.
    private function filterEmpty($str)
    {
        return $this->pickAroundDelimiter($str, ' ', 1);
    }

    // Single-quote case.
    private function filterQuote($str)
    {
        return $this->pickAroundDelimiter($str, '\'', 1);
    }

    /**
     * Return the first run of ASCII letters in $str.
     * Terminates the script when $str contains none.
     *
     * @param string $str
     * @return string
     */
    public function filterWord($str)
    {
        $pat = '#[a-z]+#ui';
        if (preg_match($pat, $str, $match)) {
            return trim($match[0]);
        }
        die("无法提取单词");
    }

    /**
     * Return the byte of $str at the offset equal to the length of its first
     * run of letters.
     *
     * NOTE(review): this is the character right after the word only when
     * $str starts with that word - confirm callers guarantee this.
     *
     * @param string $str
     * @return string
     */
    function detectDelimit($str)
    {
        $word = $this->filterWord($str);
        return substr($str, strlen($word), 1);
    }

    /**
     * Pick the filter that matches the delimiter found by detectDelimit()
     * and apply it; without a known delimiter the first run of letters is
     * returned.
     *
     * @param string $str
     * @return string
     */
    public function getCommonWord($str)
    {
        $delimit = $this->detectDelimit($str);
        if ($delimit === ',') {
            return $this->filterComma($str);
        }
        if ($delimit === '(') {
            return $this->filterBrackets($str);
        }
        if ($delimit === ' ') {
            return $this->filterEmpty($str);
        }
        if ($delimit === '\'') {
            return $this->filterQuote($str);
        }
        return $this->filterWord($str);
    }

    /**
     * Reduce $str to one word: apply getCommonWord() and return the first
     * run of ASCII letters of its result. Terminates the script when no
     * letters remain.
     *
     * @param string $str
     * @return string
     */
    public function getWord($str)
    {
        $word = $this->getCommonWord($str);
        $pat = '#[a-z]+#ui';
        if (preg_match($pat, $word, $match)) {
            return $match[0];
        }
        die("获取最终的单词失败".__LINE__."<br />");
    }

    /**
     * Return the trimmed text in front of the first "(".
     *
     * When $str contains no "(", the whole trimmed string is returned
     * (previously the result was an empty string in that case).
     *
     * @param string $str
     * @return string
     */
    public function grepWordFromContent($str)
    {
        $pos = strpos($str, '(');
        if ($pos === false) {
            return trim($str);
        }
        return trim(substr($str, 0, $pos));
    }

    /**
     * Extract the word from a text that may span several CRLF-separated
     * lines: the last line is used when a CRLF is present, otherwise the
     * whole text.
     *
     * @param string $str
     * @return string|null
     */
    public function getFirstWord($str)
    {
        if (!preg_match('#(\r\n)#mi', $str, $arr)) {
            return $this->getWord($str);
        }
        if (preg_match('#\r\n(.*)$#i', $str, $res)) {
            return $this->getWord($res[1]);
        }
        return null;
    }
}
/* $str=<<<str
agr(o)-\r\ncomb form 构词成分 of soil 泥土的; 土壤的: agriculture * agronomy.\r\nanthrop(o)-\r\ncomb form 构词成分 of human beings 人; 人类: anthropomorphic\r\nc/o
str;
$word=new FormatWord();
echo $word->getFirstWord($str); */
?>
【格式化翻译】
<?php
/**
 * Turns a translation containing numbered senses into an HTML list.
 */
class FormatTrans
{
    /**
     * Format a translation.
     *
     * A sense marker is a 1-2 digit number with whitespace on both sides.
     * When markers are present, each sense is rendered as
     * "<b>N</b>. text<br />" with N being the number found in the text. Any
     * non-blank text before the first marker is emitted first, unnumbered
     * and followed by "<br />". Previously that leading text was labelled
     * "1", which shifted every following label by one and discarded the real
     * numbers.
     *
     * Text without markers is returned unchanged.
     *
     * @param string $str raw translation text
     * @return string formatted translation
     */
    public function getTrans($str)
    {
        $pat = '#\s(\d{1,2})\s#';
        // With PREG_SPLIT_DELIM_CAPTURE the result alternates:
        // text, number, text, number, text, ...
        $parts = preg_split($pat, $str, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($parts === false || count($parts) < 3) {
            return $str;
        }

        $trans = '';
        if (trim($parts[0]) !== '') {
            $trans .= $parts[0]."<br />";
        }
        $len = count($parts);
        for ($i = 1; $i + 1 < $len; $i += 2) {
            $trans .= "<b>".$parts[$i]."</b>".'. '.$parts[$i + 1]."<br />";
        }
        return $trans;
    }
}
【redis服务器的相关设置】
<?php
/**
 * Thin wrapper around the Redis hashes that hold the dictionary.
 *
 * Each word zone has its own hash named "<zone>:OFX"; the hash fields are
 * the words and the values are their translations.
 */
class StoreWord
{
    private $redis = null;

    /**
     * Connect to the Redis server on 127.0.0.1:6379 and authenticate.
     * Terminates the script when either step reports failure, instead of
     * carrying on with an unusable connection.
     */
    public function __construct()
    {
        $this->redis = new Redis();
        if ($this->redis->connect('127.0.0.1', 6379) === false) {
            die('连接redis服务器失败');
        }
        if ($this->redis->auth('caifangjie') === false) {
            die('redis服务器认证失败');
        }
    }

    /**
     * Store words in the hash of one zone. Fields that already exist are
     * left untouched (hSetNx).
     *
     * @param string $wordZone zone name used as the hash prefix
     * @param array  $word     word => translation
     */
    public function setWord($wordZone, $word)
    {
        $hName = $wordZone.':OFX';
        foreach ($word as $key => $value) {
            $this->redis->hSetNx($hName, $key, $value);
        }
    }

    /**
     * Look up one word. The zone is the lower-cased first letter of $key.
     * Terminates the script (printing a message) when $key does not start
     * with an ASCII letter or the word is not stored.
     *
     * @param string $key word to look up
     * @return string stored translation
     */
    public function getWord($key)
    {
        // A null key from the caller is treated as an empty string.
        $key = (string) $key;
        if (!preg_match('#^[a-z]#i', $key, $word)) {
            die('匹配单词分组失败,并返回');
        }
        $hName = strtolower($word[0]).':OFX';
        // hGet returns false for a missing field, so a single round trip
        // replaces the previous hExists + hGet pair.
        $value = $this->redis->hGet($hName, $key);
        if ($value === false) {
            die("找不到你需要的单词<br >");
        }
        return $value;
    }

    /**
     * @param string $wordZone zone name
     * @return array all words (hash fields) stored for the zone
     */
    public function getAllWord($wordZone)
    {
        $hName = $wordZone.':OFX';
        return $this->redis->hKeys($hName);
    }

    /**
     * @param string $wordZone zone name
     * @return int number of words stored for the zone
     */
    public function getNumWord($wordZone)
    {
        $hName = $wordZone.':OFX';
        return $this->redis->hLen($hName);
    }
}
/*
$redis=new StoreWord();
$redis->setWord('a', array('all'=>'全部,所有', 'about'=>'关于','above'=>'上面,上部'));
echo $redis->getWord('a', 'about'); */
?>
【依次写入redis】
<?php
require_once "readDic.class.php";
require_once "readDir.class.php";
require_once "storeWord.class.php";
/**
 * Import driver: reads every dictionary file below a directory, parses it
 * and writes the result into Redis, printing progress as it goes.
 *
 * Constructing a Server runs the whole import and then ends the script.
 */
class Server
{
    private $redis = null;

    /**
     * @param string $dir root directory of the dictionary files
     */
    public function __construct($dir)
    {
        $this->redis = new StoreWord();
        $this->parseDic($dir);
    }

    /**
     * @param string $path directory to scan
     * @return array word zone => list of file paths
     */
    public function getDir($path)
    {
        $dir = new Dir($path);
        return $dir->getFileList();
    }

    /**
     * Parse every file of every word zone and store the words. Ends the
     * script with a final message once all zones are done.
     *
     * @param string $dir root directory of the dictionary files
     */
    public function parseDic($dir)
    {
        $zones = $this->getDir($dir);
        foreach ($zones as $wordZone => $files) {
            foreach ($files as $dicPath) {
                $oxf = new Oxford($dicPath);
                $this->redis->setWord($wordZone, $oxf->oxf());
                echo '单词库入库: '. $dicPath. "<br />";
                $this->flushProgress();
                // Pause kept from the original implementation.
                // NOTE(review): presumably only there to make the progress
                // readable - confirm it is still wanted.
                sleep(2);
            }
            echo '<font color="red">'.$wordZone.'分类存储完毕</font><br />';
        }
        die('<font color="blue">所有分类存储完毕</font><br />');
    }

    /**
     * Push the output produced so far to the client.
     *
     * The earlier code opened a new output buffer on every iteration and
     * never closed it, so from the second file on the progress lines only
     * moved into an outer buffer instead of being sent.
     */
    private function flushProgress()
    {
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    }
}
set_time_limit(1000);
$ser=new Server('E:\CodeEdit\php\ciba\TXT格式的牛津电子词典\牛津电子词典');
?>
牛津词典的TXT词库链接如下: 点击打开链接
源代码如下:点击打开链接