PHP实现牛津英汉词典

最新推荐文章于 2024-04-18 08:57:17 发布

错_对

最新推荐文章于 2024-04-18 08:57:17 发布

阅读量3k

点赞数

分类专栏： php 文章标签： php redis 牛津词典

本文链接：https://blog.csdn.net/Free_Program_1314/article/details/41018875

版权

php 专栏收录该内容

24 篇文章 0 订阅

订阅专栏

《-----------------------------前端代码---------------------------------》

进入默认页面，会弹出一个适当大小的窗口：

<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<script type="text/javascript">
// Open the given page in a fixed-size popup window.
// newWindow: URL of the page to open; it is also used as the popup's window name.
function openWindow(newWindow)
{
	// Feature list: comma-separated, no unit suffixes. The original string misspelled
	// "toolbar" and "resizable" and used ';' as a separator after "top".
	var features = "height=460,width=760,top=100,left=350,toolbar=no,menubar=no,scrollbars=no,resizable=no,location=no,status=no";
	window.open(newWindow, newWindow, features);
}

// Assign a handler function (not the result of calling openWindow) so the popup is
// opened once the page has finished loading.
// NOTE(review): browsers commonly block popups that are not triggered by a user gesture.
window.onload = function ()
{
	openWindow("view.html");
};
</script>
</head>
<body></body>
</html>

弹出的窗口如下：

<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />

<script type="text/javascript">

// Create an XHR object.
// Returns a native XMLHttpRequest when available, otherwise the legacy ActiveX
// implementation, or null when the browser offers neither (so callers can test
// the result instead of hitting a ReferenceError).
function getXMLHttpRequest()
{
	// Prefer the standard object; ActiveX is only a fallback for very old IE.
	if (typeof XMLHttpRequest !== "undefined")
	{
		return new XMLHttpRequest();
	}
	if (window.ActiveXObject)
	{
		return new ActiveXObject("Microsoft.XMLHttp");
	}
	return null;
}
// Send the AJAX lookup for the word currently typed in the #enWord input and
// show "<word>: <server response>" inside #chWord.
function query()
{
	var input = $('enWord');
	var en = input.value;
	// encodeURIComponent keeps '&', '#', '+', spaces and non-ASCII characters from
	// corrupting the query string; "rand" defeats response caching.
	var url = "/ciba/niujin/process.php"
		+ "?enword=" + encodeURIComponent(en)
		+ "&rand=" + Math.random();
	input.value = "";

	// Local variable: the original leaked xmlhttp as an implicit global.
	var xmlhttp = getXMLHttpRequest();
	if (!xmlhttp)
	{
		return;
	}

	xmlhttp.open("get", url, true);
	xmlhttp.onreadystatechange = function ()
	{
		// readyState 4 = request finished; only a 200 response is displayed.
		if (xmlhttp.readyState == 4 && xmlhttp.status == 200)
		{
			var out = $("chWord");
			// The server response is inserted as markup ...
			out.innerHTML = xmlhttp.responseText;
			// ... but the user's own text is inserted as a text node so that typed
			// markup is never interpreted as HTML.
			out.insertBefore(document.createTextNode(en + ": "), out.firstChild);
		}
	};
	xmlhttp.send(null);
}
// Key handler: when the pressed key is Enter (key code 13), run the lookup and
// return false; for any other key do nothing and return undefined.
function sendQuery(event)
{
	var ENTER_KEY = 13;
	if (event.keyCode != ENTER_KEY)
	{
		return;
	}
	query();
	return false;
}
// Clear the input box when it still shows its initial prompt text.
// Text the user has already typed is left alone, so clicking inside the field to
// move the caret no longer wipes a half-typed word. Always returns false.
function clearContent()
{
	var input = $("enWord");
	// defaultValue reflects the element's initial value attribute (the prompt).
	if (input.value === input.defaultValue)
	{
		input.value = "";
	}
	return false;
}
// Shorthand for document.getElementById: returns the element with the given id.
function $(id)
{
	var element = document.getElementById(id);
	return element;
}
</script>

<style type="text/css">
/* Word input box */
#enWord {
	width: 650px;
	height: 40px;
	border: 2px solid gray;
	margin-top: 10px;
}

/* "Query" button */
#sendButton {
	width: 65px;
	height: 35px;
}

/* Result area, offset from its normal position */
#chWord {
	position: relative;
	left: 10px;
	top: 50px;
	width: 600px;
	font-size: 20px;
	color: red;
}
</style>
</head>

<body>

<img src="logo.jpg" /><br />
<!-- Event attribute names must be plain ASCII (onkeydown / onclick); the published
     listing contained look-alike Greek letters, so no handler was ever bound. -->
<input type="text"  value="请输入词条" id="enWord"  onkeydown="sendQuery(event);" onclick="clearContent();" />
<input type="button" id="sendButton" value="查询" onclick="query();" /><br />

<!-- Lookup result is written here by query() -->
<div id="chWord"></div>
</body>
</html>

<-------------------------------处理过程如下------------------------------------------------->

<?php
require_once "storeWord.class.php";
require_once "filterWord.tool.php";
// The body sent back is an HTML fragment that the page inserts as markup, so it is
// declared as text/html ("plain/text" is not a valid media type).
header("content-type: text/html; charset=utf-8");

// Only a non-empty string is a usable query; an array-valued parameter
// (?enword[]=x) would otherwise be handed to trim() inside formatWord().
if(!empty($_GET['enword']) && is_string($_GET['enword']))
{
	$en=formatWord($_GET['enword']);// normalise the input
	$redis=new StoreWord();
	$ch=$redis->getWord($en);
	// Append the lookup and its result to the local log file.
	$res="<res><en>$en</en><ch>$ch</ch></res>";
	file_put_contents('aword.txt', $res."\r\n",FILE_APPEND);
	//ob_clean();
	echo $ch;
}
else
{
	// No usable query parameter: only record the event.
	file_put_contents('aword.txt', "receive NON data \r\n",FILE_APPEND);
}

------------------------------格式化输入的工具函数如下：

<?php
/**
 * Normalise a user-supplied query into a dictionary key.
 *
 * - If the input contains Chinese characters, whitespace or punctuation, the first
 *   run of ASCII letters is extracted.
 * - Otherwise the trimmed input itself is used.
 * The result is always lower-cased and cut to at most 20 bytes.
 *
 * @param string $en raw query text
 * @return string the normalised word, or "" when no letters could be extracted
 */
function formatWord($en)
{
	$maxLen=20;
	$en=trim($en);

	if(preg_match('#[\x{4e00}-\x{9fa5}，\)\.\(]+#u', $en))
	{
		// Chinese text or brackets/full stop present: take the first whole ASCII word.
		$wordPattern='#\b[a-z]+\b#i';
	}
	else if(preg_match('#\s+#', $en)
		|| preg_match('#[—_\+\?\*\^\$\#\%\&\/\\,\.!@=\`\'\"\"""]#',$en))
	{
		// Inner whitespace or other symbols present: take the first run of letters.
		$wordPattern='#[a-z]+#i';
	}
	else
	{
		// Clean input: only cap the length. Lower-cased like every other branch
		// (the original returned short inputs with their case untouched).
		return strtolower(substr($en, 0, $maxLen));
	}

	if(preg_match($wordPattern, $en, $res))
	{
		return strtolower(substr($res[0], 0, $maxLen));
	}
	// Nothing usable found. The original fell off the end of the function (null)
	// in two of the three branches; always report "" instead.
	return "";
}

<------------------------------------后端实现如下---------------------------------------------->

1. 依次读取给定目录下的词条

2. 对词条进行分词和格式化

3. 按照从A-Z的分类，依次写入redis缓存服务器的【a-z】:OFX哈希表，所以总共有26个哈希表

4. 对A-Z目录下的词条进行写入

【读取目录】

<?php
/**
 * Recursively collects the .txt files below a directory and groups them by the
 * lower-cased first character of each file name.
 */
class Dir
{
	// Group key (first character of the file name) => list of file paths in UTF-8.
	private $fileList=array();

	/**
	 * @param string $path root directory to scan
	 */
	public function __construct($path)
	{
		$this->readFileList($path);
	}

	/**
	 * Scan $path and all of its sub-directories, adding every .txt file found
	 * to the grouped file list.
	 *
	 * @param string $path directory to scan
	 */
	function readFileList($path)
	{
		$path=$this->transPathSep($path);
		// The path is converted to GB2312 before it is used for filesystem calls
		// (the original author notes that opening UTF-8 paths failed for them).
		$encode=mb_detect_encoding($path, array('GB2312','GBK','UTF-8','BIG5','LATIN1'));
		if($encode!==false)
		{
			$path=mb_convert_encoding($path, 'GB2312', $encode);
		}

		$fd=opendir($path);
		if($fd===false)
		{
			// Nothing to scan (and no handle to close).
			return;
		}

		// Compare against false explicitly: an entry named "0" is falsy and would
		// otherwise end the loop early.
		while(($fileName=readdir($fd))!==false)
		{
			// Skip the current and parent directory entries.
			if($fileName=="." || $fileName=="..")
			{
				continue;
			}

			$fullPath=$path.'/'.$fileName;
			if(is_file($fullPath))
			{
				// PATHINFO_EXTENSION yields "" for files without an extension
				// instead of an undefined array index.
				$extName=pathinfo($fullPath, PATHINFO_EXTENSION);
				if(strtolower($extName)=='txt')
				{
					// $fullPath is in the GB2312/GBK family at this point, so that is
					// the source encoding for the conversion back to UTF-8 (the original
					// reused the encoding detected for the *input* path).
					$temp=mb_convert_encoding($fullPath, 'UTF-8', 'GBK');
					$groupName=$this->groupFile($temp);
					$this->fileList[$groupName][]=$temp;
				}
			}
			else if(is_dir($fullPath))
			{
				// Descend into sub-directories.
				$this->readFileList($fullPath);
			}
		}
		closedir($fd);
	}

	/**
	 * @return array group key => list of file paths collected so far
	 */
	public function getFileList()
	{
		return $this->fileList;
	}

	// Derive the group key: the lower-cased first character of the file name
	// (the character right after the last '/').
	private function groupFile($filename)
	{
		$pos=strrpos($filename, '/');
		$word=strtolower(substr($filename, $pos+1, 1));
		return $word;
	}

	// Convert Windows '\' separators to '/'. On other platforms the path is
	// returned unchanged (the original returned nothing there, and aborted when
	// $_SERVER["SERVER_SOFTWARE"] carried no "(...)" part, e.g. on the CLI).
	private function transPathSep($path)
	{
		if(DIRECTORY_SEPARATOR==='\\')
		{
			return str_replace("\\","/",$path);
		}
		return $path;
	}

}

/* $dir=new Dir('E:\CodeEdit\php\ciba\TXT格式的牛津电子词典\牛津电子词典');
$list=$dir->getFileList();
echo "<pre>";
print_r($list); 
echo "</pre>";  */

?>

【分词】

<?php
require_once "formatWord.class.php";
require_once "formatTrans.class.php";
/**
 * Parses one plain-text dictionary file into a headword => translation map.
 *
 * The file is split at phonetic transcriptions ("/.../" at the start of a line).
 * Each resulting unit then holds the translation of the previous headword followed
 * by the next headword on its own last line.
 */
class Oxford
{
	private $OfileName=null;      // file path, converted to GB2312 for filesystem calls
	private $ODicString='';       // whole file content, converted to UTF-8
	private $ODicUnit=array();    // pieces of the content between phonetic transcriptions
	private $ODicWord=array();    // extracted headwords
	private $ODicTrans=array();   // formatted translations, parallel to $ODicWord
	private $ODicEncoding=null;   // encoding detected from the start of the file

	private $word=null;           // FormatWord helper (created here; not used by the methods below)
	private $trans=null;          // FormatTrans helper used to format each translation


	/**
	 * @param string $fname path of the dictionary file
	 */
	public function __construct($fname)
	{
		$encode=mb_detect_encoding($fname, array('GB2312','GBK','UTF-8','BIG5','LATIN1'));
		$this->OfileName=mb_convert_encoding(trim($fname), 'GB2312', $encode);

		$this->word=new FormatWord();
		$this->trans=new FormatTrans();

		// Sniff the content encoding from the first 64 bytes. An unreadable file is
		// skipped here so that readDicFromFile() reports it with its own message.
		// NOTE(review): a 64-byte sample may end inside a multi-byte character.
		if(is_readable($this->OfileName))
		{
			$temp=file_get_contents($this->OfileName, false, null, 0, 64);
			if($temp!==false)
			{
				$this->ODicEncoding=mb_detect_encoding($temp, array('GB2312','GBK','UTF-8','BIG5','LATIN1'));
			}
		}

	}

	// Read the whole file into $ODicString and convert it to UTF-8 when needed.
	// Terminates the script when the file is missing, unreadable or cannot be opened.
	protected function readDicFromFile()
	{
		if(!file_exists($this->OfileName))
		{
			die('文件不存在'.__LINE__);
		}
		if(!is_readable($this->OfileName))
		{
			die('文件不可读'.__LINE__);
		}
		$fp=fopen($this->OfileName, 'r') or die('打开文件失败'.__LINE__);

		// Start from an empty buffer so that a second oxf() call does not append
		// the file to the content read by the first one.
		$content=stream_get_contents($fp);
		fclose($fp);
		$this->ODicString=($content===false) ? '' : $content;

		if($this->ODicEncoding && strtoupper($this->ODicEncoding)!='UTF-8')
		{
			$this->ODicString=mb_convert_encoding($this->ODicString, 'UTF-8', $this->ODicEncoding);
		}

	}

	// Split the content at phonetic transcriptions and store the pieces in $ODicUnit.
	// The default pattern matches "/.../" that directly follows a CRLF and is
	// followed by whitespace.
	protected  function splitWithVoice($pattern='#(?<=\r\n)/[^/]+?/(?=\s)#ui')
	{
		$this->ODicUnit=preg_split($pattern, $this->ODicString);
	}

	// Extract the headwords.
	// After the split, the first unit holds the first headword and the last unit
	// holds only the last translation. Every unit in between holds the previous
	// word's translation followed by the next headword on its own final line
	// (separated by CRLF), which is what the pattern below picks out.
	protected  function grepWord()
	{
		$this->ODicWord=array();
		// First headword.
		$this->ODicWord[]=trim($this->ODicUnit[0]);

		$pat='#\r\n(.*)(?:\r\n)$#i'; // last line of the unit = next headword
		$len=count($this->ODicUnit);

		for($i=1; $i<$len-1; $i++)
		{
			if(preg_match($pat, $this->ODicUnit[$i], $match))
			{
				$this->ODicWord[]=$match[1];
			}
			else
			{
				die('匹配单词失败'.__LINE__."<br />");
			}
		}
	}

	// Extract the translations.
	// In each middle unit the last occurrence of the unit's headword is located;
	// everything before it is the translation of the previous headword. The last
	// unit is entirely the translation of the last headword.
	protected  function grepTrans()
	{
		$this->ODicTrans=array();
		$len=count($this->ODicUnit);
		for($i=1; $i<$len-1; $i++)
		{
			$pos=strripos($this->ODicUnit[$i], $this->ODicWord[$i]);
			// Offset 0 is a valid hit, so test against false rather than truthiness.
			if($pos!==false)
			{
				array_push($this->ODicTrans,$this->trans->getTrans(substr($this->ODicUnit[$i], 0, $pos)));
			}
			else
			{
				die("此方法不可行".__LINE__."<br />");
			}

		}
		array_push($this->ODicTrans,$this->trans->getTrans($this->ODicUnit[$len-1]));
	}

	/**
	 * Run the whole pipeline for the file.
	 *
	 * @return array headword => formatted translation
	 */
	public function oxf()
	{
		$this->readDicFromFile();
		$this->splitWithVoice();
		$this->grepWord();
		$this->grepTrans();


		$len=count($this->ODicWord);
		$oxfWord=array();
		for($i=0; $i<$len; $i++)
		{
			$oxfWord[$this->ODicWord[$i]] = $this->ODicTrans[$i];
		}
		return $oxfWord;

	}

}//class Oxford

/* $oxf=new Oxford(' E:/CodeEdit/php/ciba/TXT格式的牛津电子词典/牛津电子词典/G/G-b,c,d,e.txt  ');
$res=$oxf->oxf();
//$result=$oxf->returnWord();
 echo "<pre>";
print_r($res);
echo "</pre>";  */
?>

【格式化词条】

<?php  
/**
 * Extracts a single headword from a raw dictionary entry line such as
 * "a, an", "agr(o)-" or "bar baz".
 */
class FormatWord
{
	// Shared rule of all filters: when the left part is a single character and the
	// right part is longer, the right part is the word; otherwise the left part is.
	private function chooseSide($leftWord, $rightWord)
	{
		if(strlen($leftWord)==1 && strlen($leftWord)<strlen($rightWord))
		{
			return $rightWord;
		}
		return $leftWord;
	}

	// Split $str at the first $delimiter and pick a side with chooseSide().
	// A missing delimiter, or one at offset 0, leaves the string untouched.
	private function splitOn($str, $delimiter)
	{
		$pos=strpos($str, $delimiter);
		if($pos===false || $pos===0)
		{
			return $str;
		}
		$leftWord=trim(substr($str, 0, $pos));
		$rightWord=trim((string)substr($str, $pos+1));
		return $this->chooseSide($leftWord, $rightWord);
	}

	// Case 1: the word is followed by a comma.
	// The right part starts right after the comma and is trimmed (the original
	// skipped two characters, which cut a letter off in "a,an").
	private function filterComma($str)
	{
		return $this->splitOn($str, ',');
	}

	// Case 2: the word is followed by an opening bracket.
	// The candidates are the text before "(" and the text between "(" and ")".
	private function filterBrackets($str)
	{
		$pos01=strpos($str,'(');
		$pos02=strpos($str, ')');
		// Both brackets must exist and neither may sit at offset 0.
		if(!$pos01 || !$pos02)
		{
			return $str;
		}
		$leftWord=substr($str, 0, $pos01);
		$rightWord=substr($str, $pos01+1, $pos02-$pos01-1);
		return $this->chooseSide($leftWord, $rightWord);
	}

	// Case 3: the word is followed by a space.
	private function filterEmpty($str)
	{
		return $this->splitOn($str, ' ');
	}

	// Case 4: the word is followed by an apostrophe.
	private function filterQuote($str)
	{
		return $this->splitOn($str, '\'');
	}

	// Return the first run of ASCII letters in $str; terminates the script when
	// there is none.
	public function filterWord($str)
	{
		$pat='#[a-z]+#ui';
		if(preg_match($pat, $str, $match))
		{
			return trim($match[0]);
		}
		else
		{
			die("无法提取单词");
		}
	}

	// Return the single character that directly follows the first run of letters.
	// The real offset of the match is used, so leading non-letter characters no
	// longer shift the result (the original assumed the word starts at offset 0).
	function detectDelimit($str)
	{
		if(!preg_match('#[a-z]+#ui', $str, $match, PREG_OFFSET_CAPTURE))
		{
			die("无法提取单词");
		}
		$end=$match[0][1]+strlen($match[0][0]);
		return substr($str, $end, 1);
	}

	// Pick the filter that matches the character following the first word.
	public function getCommonWord($str)
	{
		$delimit=$this->detectDelimit($str);
		if($delimit===',')
		{
			return $this->filterComma($str);
		}
		if($delimit==='(')
		{
			return $this->filterBrackets($str);
		}
		if($delimit===' ')
		{
			return $this->filterEmpty($str);
		}
		if($delimit==='\'')
		{
			return $this->filterQuote($str);
		}
		return $this->filterWord($str);
	}

	// Return the final headword: the first run of letters of getCommonWord()'s
	// result. Terminates the script when that result has no letters.
	public function getWord($str)
	{
		$word=$this->getCommonWord($str);
		$pat='#[a-z]+#ui';
		if(preg_match($pat, $word, $match))
		{
			return $match[0];
		}
		else
		{
			die("获取最终的单词失败".__LINE__."<br />");
		}
	}

	// Return the trimmed text in front of the first "(". Without a bracket the
	// whole trimmed string is returned (the original returned "" in that case).
	public function grepWordFromContent($str)
	{
		$pos=strpos($str, '(');
		if($pos===false)
		{
			return trim($str);
		}
		return trim(substr($str, 0, $pos));
	}

	// Extract the headword from the last CRLF-separated line of $str, or from
	// $str itself when no such line can be isolated.
	public function getFirstWord($str)
	{
		if(preg_match('#\r\n(.*)$#i',$str, $res))
		{
			return $this->getWord($res[1]);
		}
		return $this->getWord($str);
	}
}
/* $str=<<<str
agr(o)-\r\ncomb form 构词成分 of soil 泥土的; 土壤的: agriculture * agronomy.\r\nanthrop(o)-\r\ncomb form 构词成分 of human beings 人; 人类: anthropomorphic\r\nc/o
str;

$word=new FormatWord();
echo $word->getFirstWord($str); */

?>

【格式化翻译】

<?php
/**
 * Turns the raw translation text of one entry into simple HTML.
 */
class FormatTrans
{
	/**
	 * Break the text at sense numbers (a 1-2 digit number surrounded by whitespace)
	 * and emit one "<b>N</b>.   text<br />" line per sense.
	 *
	 * Text in front of the first number (for example the part of speech) is kept as
	 * an unnumbered first line, and each sense keeps the number found in the text.
	 * The original labelled that leading text "1." and shifted every sense by one.
	 *
	 * @param string $str raw translation text
	 * @return string HTML, or $str unchanged when it contains no sense numbers
	 */
	public function getTrans($str)
	{
		$pat='#\s(\d{1,2})\s#';
		// With DELIM_CAPTURE the result alternates: text, number, text, number, text ...
		$parts=preg_split($pat, $str, -1, PREG_SPLIT_DELIM_CAPTURE);
		if($parts===false || count($parts)<3)
		{
			return $str;
		}

		$trans='';
		if(trim($parts[0])!=='')
		{
			$trans .=$parts[0]."<br />";
		}
		$len=count($parts);
		for($i=1; $i+1<$len; $i+=2)
		{
			$trans .="<b>".$parts[$i]."</b>".'.   '.$parts[$i+1]."<br />";
		}
		return $trans;
	}
}

【redis服务器的相关设置】

<?php
/**
 * Thin wrapper around the Redis hashes that hold the dictionary.
 * Each group of words lives in the hash "<group>:OFX", field = word, value = translation.
 */
class StoreWord
{
	private $redis=null;

	// Connect to the local Redis server and authenticate.
	// Terminates the script when the server cannot be reached or rejects the password.
	// NOTE(review): host, port and password are hard-coded; consider moving them to configuration.
	public function __construct()
	{
		$this->redis=new Redis();
		try
		{
			$ready=$this->redis->connect('127.0.0.1', 6379)
				&& $this->redis->auth('caifangjie');
		}
		catch (RedisException $e)
		{
			// Some client versions throw instead of returning false.
			$ready=false;
		}
		if(!$ready)
		{
			die('无法连接Redis服务器');
		}
	}

	/**
	 * Store a batch of words in the hash of one group. Existing fields are kept
	 * (hSetNx only writes fields that do not exist yet).
	 *
	 * @param string $wordZone group name used as hash prefix
	 * @param array  $word     word => translation
	 */
	public function setWord($wordZone,$word)
	{
		$hName=$wordZone.':OFX';
		foreach ($word as $key => $value)
		{
			$this->redis->hSetNx($hName, $key, $value);
		}
	}

	/**
	 * Look up one word. The group is the lower-cased first letter of $key.
	 * Terminates the script with a message when $key does not start with a letter
	 * or when the word is not stored.
	 *
	 * @param string $key word to look up
	 * @return string stored translation
	 */
	public function getWord($key)
	{
		$wordZone=null;
		if (preg_match('#^[a-z]#i', $key, $word))
		{
			$wordZone=strtolower($word[0]);
		}
		else
		{
			die('匹配单词分组失败，并返回');
		}

		$hName=$wordZone.':OFX';
		// A single hGet both tests for the field and fetches it; it yields false
		// when the field (or the hash) does not exist.
		$value=$this->redis->hGet($hName, $key);
		if($value===false)
		{
			die("找不到你需要的单词<br >");
		}
		return $value;
	}

	// Return all words (hash fields) stored for one group.
	public function getAllWord($wordZone)
	{
		$hName=$wordZone.':OFX';
		return $this->redis->hKeys($hName);
	}

	// Return the number of words stored for one group.
	public function getNumWord($wordZone)
	{
		$hName=$wordZone.':OFX';
		return $this->redis->hLen($hName);
	}
}
/* 
$redis=new StoreWord();
$redis->setWord('a', array('all'=>'全部，所有', 'about'=>'关于','above'=>'上面，上部'));
echo $redis->getWord('a', 'about'); */
?>

【依次写入redis】

<?php
require_once "readDic.class.php";
require_once "readDir.class.php";
require_once "storeWord.class.php";

/**
 * Imports every dictionary file below a directory into Redis, printing progress
 * as it goes. Constructing the object runs the whole import.
 */
class Server
{
	private $redis=null;

	/**
	 * @param string $dir root directory of the dictionary files
	 */
	public function __construct($dir)
	{
		$this->redis=new StoreWord();
		$this->parseDic($dir);
	}

	/**
	 * @param string $path directory to scan
	 * @return array group name => list of dictionary file paths
	 */
	public function getDir($path)
	{
		 $dir=new Dir($path);
		 return $dir->getFileList();
	}

	/**
	 * Parse each dictionary file and store its words under the file's group.
	 * Ends the script with a final message once every group has been stored.
	 *
	 * @param string $dir root directory of the dictionary files
	 */
	public function parseDic($dir)
	{
		$path=$this->getDir($dir);
		foreach ($path as $wordZone => $files)
		{
				foreach ($files as $dicPath)
				{
					$oxf=new Oxford($dicPath);
					$res=$oxf->oxf();
					$this->redis->setWord($wordZone, $res);
					$this->report('单词库入库:  '. $dicPath. "<br />");
					// Pause between files, as in the original import loop.
					sleep(2);
				}
				$this->report('<font color="red">'.$wordZone.'分类存储完毕</font><br />');

		}
		die('<font color="blue">所有分类存储完毕</font><br />');
	}

	// Print one progress line and push it to the client immediately.
	// The original called ob_start() on every iteration without ever closing the
	// buffer, so after the first file each line was only flushed into the buffer
	// opened by the previous iteration and never reached the browser.
	private function report($html)
	{
		echo $html;
		if(ob_get_level()>0)
		{
			ob_flush();
		}
		flush();
	}

}
// Raise the execution time limit to 1000 seconds before starting the import.
set_time_limit(1000);
// Constructing Server runs the whole import for the given dictionary directory.
$ser=new Server('E:\CodeEdit\php\ciba\TXT格式的牛津电子词典\牛津电子词典');

?>

牛津词典的TXT词库链接如下：点击打开链接

源代码如下：点击打开链接