不得不感叹用DOM直接解析HTML DOM树的灵活和强大,因为常见的基本HTML元素就那么几种,再配合ID属性或者CLASS属性之类的就可以定位到需要的内容。
在解析HTML文件时,完全可以从正则中脱离出来,毕竟HTML文件中存在大量相似的模式,而且用DOM写出来的代码功能看上去比较显而易见。当然正则是非常强大的,应用的领域也更广。
代码如下:
<?php
// Report only fatal and parse errors, which hides the warnings raised while loading a page that contains JS
error_reporting(E_ERROR | E_PARSE);
/**
 * Extracts the sections of a dictionary result page from its HTML DOM.
 *
 * The constructor loads the markup, walks every <div> and, depending on the
 * div's class attribute, copies the text of its list items into one of the
 * per-section arrays below. The public get*Meaning() methods expose those
 * arrays; each of them returns null when its section was not found.
 */
class DomTree
{
    // DOM document handle
    private $doc=null;
    // Basic meanings
    private $basic_meaning=array();
    // English-Chinese bilingual definitions
    private $en_or_ch=array();
    // English-English definitions
    private $en_to_en=array();
    // Example sentences
    private $example=array();
    // Common sentence patterns
    private $sentences=array();
    // Glossary / collocations
    private $glossary=array();
    // Classic quotations
    private $auth=array();
    // Common usage mistakes
    private $use_in_wrong = array();
    // Near-synonyms
    private $approximate_words = array();
    // Encyclopedia explanation
    private $baike_trans = array();

    /**
     * Loads the markup and fills every section array.
     *
     * @param string $source An http:// or ftp:// URL, the path of an existing
     *                       file, or a non-empty string of raw HTML.
     *
     * Terminates the script with die() when $source is not a string, is an
     * empty string, or names a remote resource that cannot be read.
     */
    public function __construct($source)
    {
        $this->doc = new DomDocument();
        if(!is_string($source))
        {
            die("不支持的资源类型");
        }
        // The URL test has to come before the generic string handling:
        // a URL is a string too, so testing is_string() first would feed
        // the URL text itself to the HTML parser.
        if(preg_match('#^(http|ftp)://#i', $source))
        {
            $html = file_get_contents($source);
            if($html === false || $html === '')
            {
                die("无法读取远程资源");
            }
            $this->doc->loadHTML($html);
        }
        else if(is_file($source))
        {
            // is_file() already guarantees the file exists.
            $this->doc->loadHTMLFile($source);
        }
        else
        {
            // Anything else is treated as raw HTML. A path to a missing file
            // cannot be told apart from markup and also ends up here.
            empty($source)?die("传入的字符串不能为空"):$this->doc->loadHTML($source);
        }

        // Maps the (trimmed) class attribute of a div to the method that
        // extracts that section.
        $handlers = array(
            "basic clearfix" => "getBasicMeans",
            "layout dual"    => "getEnOrCh",
            "layout en"      => "getEnToEn",
            "layout sort"    => "getExample",
            "layout patt"    => "normalSentence",
            "layout coll"    => "getGlossary",
            "layout auth"    => "getAuth",
            "layout comn"    => "useInWrong",
            "layout nfw"     => "getApproximateWords",
            "layout baike"   => "getBaike",
        );
        foreach($this->doc->getElementsByTagName("div") as $div)
        {
            if(!$div->hasAttribute("class"))
            {
                continue;
            }
            $class_name = trim($div->getAttribute("class"));
            if(isset($handlers[$class_name]))
            {
                $this->{$handlers[$class_name]}($div);
            }
        }
    }

    // Collects the <strong> texts of every <li> that has no style attribute.
    private function getBasicMeans($basic_div)
    {
        foreach($basic_div->getElementsByTagName("li") as $li)
        {
            if($li->hasAttribute("style"))
            {
                continue;
            }
            foreach($li->getElementsByTagName("strong") as $strong)
            {
                $this->basic_meaning[]=$strong->nodeValue;
            }
        }
    }

    // Collects the raw text of every <li> (bilingual definitions).
    private function getEnOrCh($div_elem)
    {
        foreach($div_elem->getElementsByTagName("li") as $li)
        {
            $this->en_or_ch[]=$li->nodeValue;
        }
    }

    // Collects the whitespace-collapsed text of every <li> (English-English definitions).
    private function getEnToEn($div_elem)
    {
        foreach($div_elem->getElementsByTagName("li") as $li)
        {
            $this->en_to_en[]= $this->strip_Empty($li->nodeValue);
        }
    }

    // Collapses every run of two or more whitespace characters into a single
    // space. Returns null when $string is not a string.
    private function strip_Empty($string)
    {
        if(!is_string($string))
        {
            return null;
        }
        return preg_replace('#\s{2,}#', ' ', $string);
    }

    // Returns the text of every <li> found inside every <$list_tag> below
    // $container, in document order. When $normalize is true each text is
    // passed through strip_Empty().
    private function collectListItems($container, $list_tag, $normalize)
    {
        $texts = array();
        foreach($container->getElementsByTagName($list_tag) as $list)
        {
            foreach($list->getElementsByTagName("li") as $li)
            {
                $texts[] = $normalize ? $this->strip_Empty($li->nodeValue) : $li->nodeValue;
            }
        }
        return $texts;
    }

    // Example sentences: <ol> / <li>, whitespace collapsed.
    private function getExample($div_elem)
    {
        $this->example = array_merge($this->example, $this->collectListItems($div_elem, "ol", true));
    }

    // Common sentence patterns: <ol> / <li>, whitespace collapsed.
    private function normalSentence($div_elem)
    {
        $this->sentences = array_merge($this->sentences, $this->collectListItems($div_elem, "ol", true));
    }

    // Glossary: <ul> / <li>, whitespace collapsed.
    private function getGlossary($div_elem)
    {
        $this->glossary = array_merge($this->glossary, $this->collectListItems($div_elem, "ul", true));
    }

    // Quotations: <ul> / <li>, whitespace collapsed.
    private function getAuth($div_elem)
    {
        $this->auth = array_merge($this->auth, $this->collectListItems($div_elem, "ul", true));
    }

    // Common usage mistakes: <ol> / <li>, whitespace collapsed.
    private function useInWrong($div_elem)
    {
        $this->use_in_wrong = array_merge($this->use_in_wrong, $this->collectListItems($div_elem, "ol", true));
    }

    // Near-synonyms: the text of every <a> inside <ul> / <li>.
    private function getApproximateWords($div_elem)
    {
        foreach($div_elem->getElementsByTagName("ul") as $ul)
        {
            foreach($ul->getElementsByTagName("li") as $li)
            {
                foreach($li->getElementsByTagName("a") as $anchor)
                {
                    $this->approximate_words[]=$anchor->nodeValue;
                }
            }
        }
    }

    // Encyclopedia explanation: <ul> / <li>, text kept as-is.
    private function getBaike($div_elem)
    {
        $this->baike_trans = array_merge($this->baike_trans, $this->collectListItems($div_elem, "ul", false));
    }

    // Shared by the public getters: a section that was never filled is
    // reported as null rather than as an empty array.
    private function nonEmptyOrNull(array $items)
    {
        return empty($items) ? null : $items;
    }

    // API: basic meanings, or null when none were found
    public function getBasicMeaning()
    {
        return $this->nonEmptyOrNull($this->basic_meaning);
    }

    // API: English-Chinese bilingual definitions, or null when none were found
    public function getEnOrChMeaning()
    {
        return $this->nonEmptyOrNull($this->en_or_ch);
    }

    // API: English-English definitions, or null when none were found
    public function getEnToEnMeaning()
    {
        return $this->nonEmptyOrNull($this->en_to_en);
    }

    // API: example sentences, or null when none were found
    public function getExampleMeaning()
    {
        return $this->nonEmptyOrNull($this->example);
    }

    // API: common sentence patterns, or null when none were found
    public function getNormalSentenceMeaning()
    {
        return $this->nonEmptyOrNull($this->sentences);
    }

    // API: glossary entries, or null when none were found
    public function getGlossaryMeaning()
    {
        return $this->nonEmptyOrNull($this->glossary);
    }

    // API: quotations, or null when none were found
    public function getAuthMeaning()
    {
        return $this->nonEmptyOrNull($this->auth);
    }

    // API: common usage mistakes, or null when none were found
    public function getUseInWrongMeaning()
    {
        return $this->nonEmptyOrNull($this->use_in_wrong);
    }

    // API: near-synonyms, or null when none were found
    public function getApproximateWordsMeaning()
    {
        return $this->nonEmptyOrNull($this->approximate_words);
    }

    // API: encyclopedia explanation, or null when none was found
    public function getBaikeMeaning()
    {
        return $this->nonEmptyOrNull($this->baike_trans);
    }

    // Returns every section keyed by name; a missing section has the value null.
    public function getAllMeaning()
    {
        return array(
            'basic_meaning'     => $this->getBasicMeaning(),
            'en_or_ch'          => $this->getEnOrChMeaning(),
            'en_to_en'          => $this->getEnToEnMeaning(),
            'example'           => $this->getExampleMeaning(),
            'normal_sentence'   => $this->getNormalSentenceMeaning(),
            'glossary_sentence' => $this->getGlossaryMeaning(),
            'auth_sentence'     => $this->getAuthMeaning(),
            'wrong_use'         => $this->getUseInWrongMeaning(),
            'approximate_words' => $this->getApproximateWordsMeaning(),
            'baike_meaning'     => $this->getBaikeMeaning(),
        );
    }
}
// Parse the saved page and collect every extracted section.
$dom = new DomTree("./com.html");
$trans = $dom->getAllMeaning();
// Emit the opening <pre> tag followed by the dumped array, so the nesting stays readable in a browser.
echo "<pre>", print_r($trans, true);
?>
结果如下:
Array
(
[basic_meaning] => Array
(
[0] => 单词;消息;话语;诺言
[1] => 用词语表达
)
[en_or_ch] => Array
(
[0] => [C] 字,词 the smallest unit of spoken language which has meaning and can stand alone
[1] => [C] (说的)话,话语,言语 anything said; remark or statement
[2] => [S] 消息,信息; 谣言 piece of news; message; rumour
[3] => [S] 口令,号令; 命令 spoken command or signal
[4] => [S] 诺言,保证 a promise
[5] => vt. 用词语表达; 选用 express (sth) in particular words; phrase sth
)
[en_to_en] => Array
(
[0] => a unit of language that native speakers can identify; "words are the blocks from which sentences are made" "he hardly said ten words all morning"
[1] => a brief statement; "he didn't say a word about it"
[2] => information about recent and important events; "they awaited news of the outcome"
[3] => a verbal command for action; "when I give the word, charge!"
[4] => an exchange of views on some topic; "we had a good discussion" "we had a word or two about it"
[5] => a promise; "he gave his word"
[6] => a word is a string of bits stored in computer memory; "large computers use words up to 64 bits long"
[7] => the divine word of God; the second person in the Trinity (incarnate in Jesus)
[8] => a secret word or phrase known only to a restricted group; "he forgot the password"
[9] => the sacred writings of the Christian religions; "he went to carry the Word to the heathen"
[10] => put into words or an expression; "He formulated his concerns to the board of trustees"
)
[example] => Array
(
[0] => Could we have a word before you go to the meeting? 你去开会之前,咱们能私下说句话吗?
[1] => My friend sent word that he was well. 我朋友捎来口信说他很好。
)
[normal_sentence] => Array
(
[0] => What does this word mean? 这个词是什么意思?
[1] => I couldn't look up the spelling of the word, as I hadn't a dictionary at hand. 我没法查这个词的拼写,因为我手边没有词典。
[2] => Many English words are derived from Latin. 许多英文单词源于拉丁文。
[3] => All the words beside the central idea should be crossed out. 凡偏离中心思想的词语都应通通删掉。
[4] => The editor eliminated slang words from the essay. 编辑将俚语从这篇文章中剔除。
[5] => These words can't be staled by repetition. 这些词语不会因为经常使用而变成陈词滥调。
[6] => He gave me his visiting card, with a few words in pencil. 他把他的名片给我,上面有几个铅笔字。
[7] => I don't believe a word of his story. 他说的这件事我一句话都不相信。
[8] => At the press conference, the reporters copied down every word spoken by the prime minister. 在新闻发布会上,记者们逐字记下了首相的讲话。
[9] => Tell me what happened in your words. 用你自己的话把发生的事告诉我。
[10] => Deeds are better than words when people are in need of help. 当别人需要帮助时,行动胜于语言。
[11] => I would like a word with you. 我想和你谈谈。
[12] => After a word with the colonel he went away . 他和上校简单谈过之后就走了。
[13] => There's been no word from her for weeks. 已经有好几个星期没有她的音信了。
[14] => Word came that I was needed at home. 有信儿来说家里需要我。
[15] => Word has come that meeting will be held on Tuesday. 通知已到,星期二开会。
[16] => Word is that the election will be held in June. 有消息说选举将在六月份举行。
[17] => Word is that he's left the country. 据说他已经离开这个国家了。
[18] => Word got round that he had resigned. 谣传他已辞职。
[19] => Stay hidden until I give the word. 我不下令就藏着别动。
[20] => Their word is law. 他们的命令必须服从。
[21] => He gave the word and they let him in. 他说出了口令,他们让他进去了。
[22] => The word now is “freedom”. 现在的口号是“自由”。
[23] => I give you my word I'll go. 我向你保证,我会去的。
[24] => Stand by your word. 要守信用。
[25] => Hear The Word of God . 听宣讲《圣经》。
[26] => Be careful how you word your answer. 回答时要斟酌字句。
[27] => She worded the explanation well. 她的解释措辞得体。
[28] => The advice wasn't very tactfully worded. 这份通知措辞不太得体。
[29] => The suggestion might be worded more politely. 那项建议的措辞可以更婉转些。
[30] => This is a carefully worded contract. 这是一份措辞严谨的合同。
)
[glossary_sentence] => Array
(
[0] => address a few words 讲几句话
[1] => await word from sb 等待某人的消息
[2] => break one's words 食言
[3] => breathe a word 走漏消息
[4] => bring word 带来消息
[5] => choose a word 选择词
[6] => coin a word 杜撰一个词
[7] => cook up words 造新词
[8] => cross out a word 划掉一个词
[9] => cut out many words 删掉许多词
[10] => digest a word 消化一个词
[11] => doubt sb's words 怀疑某人的话
[12] => drink in all the words 吸收所有的词语
[13] => eat one's words 收回前言,认错,道歉
[14] => exchange angry words 发生口角
[15] => find words 找出言语(来表达)
[16] => gain the good word of 博得…的赞扬
[17] => get word 得到消息
[18] => get a word 插嘴
[19] => give one's word 保证,允许
[20] => give the word 发出命令
[21] => have words together 争吵
[22] => have words with sb 与某人吵嘴
[23] => have a word with sb 同某人谈一谈
[24] => hunt up a word 查一个词
[25] => keep one's word 信守诺言
[26] => leave word 留言
[27] => leave out a word 省略一个词,丢掉一个词
[28] => look up a word (在词典里)查一个词
[29] => memorize words 记单词
[30] => play on words 玩弄字眼
[31] => pronounce a word 读一个词
[32] => put in words for 为…说几句话
[33] => put the words into sb's mouth 教某人怎么讲
[34] => quote a word 引用一个词
[35] => receive word of 收到…消息
[36] => regret one's words 为说过的话而后悔
[37] => respect one's word 遵守自己许下的诺言
[38] => say a word 说句话,进一步,走漏消息
[39] => say a few words 说几句话
[40] => say a good word for sb 为某人说好话
[41] => send sb a word 给某人捎个信儿
[42] => spell a word 拼写一个词
[43] => stress the word 重读那个词
[44] => take back one's word 收回自己的话
[45] => take sb's word for it 相信了某人的话
[46] => understand a word 理解某个词的意思
[47] => use words 用词
[48] => waste one's words 白费口舌
[49] => weigh words 斟酌词句
[50] => write a word 写一个词
[51] => advance word 事先传出的消息
[52] => angry words 气话
[53] => beautiful words 优美的言辞
[54] => big words 大话
[55] => borrowed word 外来词
[56] => broken words 断断续续的话
[57] => burning words 热情洋溢的话
[58] => choice words 精选的词句
[59] => colorful words 丰富的言辞
[60] => cross words 气话
[61] => empty words 空洞的话,无意义的话
[62] => everyday word 日常用语
[63] => farewell words 送别词
[64] => fighting words 容易引起争论的话,挑战性的话
[65] => foreign word 外来词
[66] => hard words 愤怒的话,激烈的话
[67] => heated word 激烈的言词,争吵时使用的话
[68] => high words 愤怒的话,激烈的话
[69] => hollow words 虚假的言语
[70] => honeyed words 甜言蜜语
[71] => hot words 激烈的言词,争吵时使用的话
[72] => household word 家喻户晓的词
[73] => irresponsible words 不负责任的话
[74] => key words 关键的字眼
[75] => last words 临终遗言
[76] => living words 现代语
[77] => meaningful words 意味深长的言语
[78] => meaningless words 无意义的话
[79] => misspelled word 拼错的词
[80] => native word 本国词,本地词
[81] => pleasant words 动听的语言
[82] => regional word 方言
[83] => scientific word 科学用语
[84] => semi-technical words 半科技词
[85] => sharp words 愤怒的话,激烈的话
[86] => simple word 简单的词
[87] => sincere words 真诚的话
[88] => small word 小词
[89] => spoken words 口头语
[90] => suggestive words 含蓄的话
[91] => sweet words 甜言蜜语
[92] => tearful parting words 伤感的离别之言
[93] => the latest word 最新消息,最后消息
[94] => uncleanly words 下流话
[95] => unfamiliar word 生词
[96] => unusual word 冷僻词
[97] => warm words 忿怒的话,激烈的话
[98] => written words 书面语
[99] => wrong words 错词
[100] => dictionary word 词典里出现的词
[101] => English words 英语单词
[102] => law word 法律用语
[103] => newspaper word 新闻用语
[104] => slang word 俚语
[105] => at a word 立即,立刻
[106] => in a word 简言之,总之
[107] => in one's own words 用自己的话说
[108] => in other words 换言之
[109] => upon my word 的确,真的
[110] => without a word 一声没吭
[111] => word in heavy type 黑体字
[112] => words in season 时宜的话
[113] => words of comfort 安慰的话
[114] => words of command 命令
[115] => words of complaint 怨言
[116] => the W- of God 圣经
[117] => words of praise 表扬的话
[118] => word of six letters 六个字母的词
[119] => words of thanks 感谢的话
[120] => word the explanation 解释
[121] => word accurately 准确地用言语表达
[122] => word crudely 简单地用词语〔语言〕表达
[123] => word felicitously 恰当地用言语表达
[124] => word intelligibly 清楚地用语言表达
[125] => word positively 明确地用词语表达
[126] => word vaguely 含糊地表达
[127] => word well 措辞得体
)
[auth_sentence] => Array
(
[0] => Rome shall perishswrite that word In the blood that she has spilt. 出自:W. Cowper
[1] => We have striven..to draw some word from her; but she..answers nothing. 出自:G. P. R. James
[2] => To use his own words, he was in a cleft stick. 出自:H. Conway
[3] => Actions speak louder than words. 出自:Proverb
[4] => He words me, girls, he words me, that I should not Be noble to myself. 出自:Anthony Cleopatra,Shakespeare
)
[wrong_use] => Array
(
[0] => 我要跟他说句话。 误 I should like to have word with him. 正 I should like to have a word with him.
[1] => 他们听到消息说足球比赛将在今晚电视实况转播。 误 They had a word that the football match would be televised live this evening. 正 They had word that the football match would be televised live this evening. 析 have word是“听到消息〔新闻〕”的意思,“说句话”是have a word。
[2] => 对逐词背课文,我感到厌倦。 误 I was tired of reciting the texts word after word. 正 I was tired of reciting the texts word for word. 析 “一字不变地,逐字(背诵或翻译)”是word for word,不是word after word。
[3] => 我说了什么错话吗? 误 Have I said any wrong words? 正 Have I said anything wrong? 析 误句语法上没有错,但不符合英语习惯。
[4] => 他不遵守诺言。 误 He broke his words. 正 He broke his word. 析 break one's word意为“不遵守诺言”, word在此短语中不用复数形式。
[5] => 我刚得知他到达的消息。 误 I have just received the word of his arrival. 正 I have just received word of his arrival.
[6] => 有消息传来说我们的篮球队赢了这场比赛。 误 The word came that our basketball team had won the match. 正 Word came that our basketball team had won the match. 析 作“消息”“信息”解时, word前不加冠词。
[7] => 他大约是30年前开始当教师的,换句话说,他当教师已经有30年了。 误 He began to work as a teacher some thirty years ago, in another word, he has been a teacher for thirty years. 正 He began to work as a teacher some thirty years ago, in other words, he has been a teacher for thirty years. 析 in other words是固定短语,意为“换句话说”。
[8] => 他带信给我说怀特先生不久将动身去美国。 误 He carried me words that Mr.White would soon leave for America. 正 He carried me word that Mr. White would soon leave for America. 析 word作“消息”“信”解时,是不可数名词,其后不可加s。
[9] => 今晨我们争吵了。 误 We had a word this morning. 正 We had words this morning.
[10] => 他们曾为鸡毛蒜皮的小事同邻居吵过嘴。 误 They had word with their neighbour over some trifles. 正 They had words with their neighbours over some trifles. 析 表示“同某人发生口角”时,用have words with sb, words用复数形式。
[11] => 他说的大话使我们都感到惊讶。 误 His big word surprised us all. 正 His big words surprised us all.
[12] => 我们绝不收回前言。 误 We should on no account eat our word. 正 We should on no account eat our words. 析 习语big words, eat one's words中, words词尾的s不可省。
)
[approximate_words] => Array
(
[0] => account
[1] => advice
[2] => chat
[3] => communication
[4] => declaration
[5] => edict
[6] => expression
[7] => message
[8] => notice
[9] => order
[10] => password
[11] => promise
[12] => remark
[13] => term
[14] => couch
[15] => explain
[16] => express
[17] => phrase
[18] => put
[19] => say
[20] => write
)
[baike_meaning] => Array
(
[0] => word:Microsoft Word,属于办公软件,人们日常生活都有可能接触到他,对他并不陌生。 简介 wordMicrosoft Word是微软公司的一个文字处理器应用程序。它最初是由Richard Bro…
)
)