<?php
/// 如何利用栈将html解析成节点树
/// 首先html是由一个个节点组成,最大的节点为<html></html>节点 她有两个子节点<head></head> 和<body></body>
/// 首先我们将<html>压入栈中 再将<head>压入栈中 遇到</head>出栈 <body>压入栈中 遇到</body> 出栈 最终<html>
/// 出栈 可以看出 出栈的一定是栈顶元素的子节点
/// 本程序假定html完全规范
/// todo 解决标签未闭合问题
/// todo 根据标签名访问
/// 根据id访问
/// 根据class访问
/// innerHtml 和 innerText
// Driver script: download one page, split it into tags, and rebuild the element
// tree with a stack of currently open elements.
//
// Output files (all relative to the working directory):
//   tree_file     - the indented tree written by make_tree(); truncated here first
//   string_tagss  - append-only log of every start/void tag prefixed by its depth

// Truncate (or create) the tree file so this run starts from an empty file.
$tree_file = fopen("tree_file","w");
if($tree_file !== false)
{
    fclose($tree_file);
}
$siteurl="http://www.cnblogs.com/poissonnotes/archive/2010/05/28/1745996.html";
$str=get_content_by_url($siteurl);
$pattern="/<.*?>/";
$deep=0;
// Stays true while the tag stream is balanced; cleared on a stray end tag or
// when start tags are still open after the last tag has been processed.
$global_false=true ;
// Finished nodes in completion order; "ChildId" values are indices into this list.
$result=array();
// Index the next finished node will receive inside $result.
$queue_id=0;
if(preg_match_all($pattern,$str,$matches))
{
    $count=count($matches[0]);
    $matches=$matches[0];
    // Stack of elements whose start tag was seen but whose end tag was not yet.
    $queue=array();
    $error = fopen("string_tagss","a");
    for($i=0;$i<$count;$i++)
    {
        $temptag = $matches[$i];
        $node=array();
        $node["Id"]=$i;               // position of the tag in the match list
        $node["ChildId"]=array();
        $node["html_tag"]=$temptag;

        // Void (single) tag: it is complete immediately and becomes a child of
        // the element on top of the stack, if there is one.
        if(is_single_tag($temptag))
        {
            if($error !== false)
            {
                fwrite($error,make_string($deep)."\t".$temptag."\n");
            }
            if(!empty($queue))
            {
                $queue[count($queue)-1]["ChildId"][]=$queue_id;
            }
            $queue_id++;
            $result[]=$node;
            continue ;
        }

        if(is_end_tag($temptag))
        {
            // End tag with nothing open: the input is not balanced. Skip it
            // instead of popping null and writing to $queue[-1].
            if(empty($queue))
            {
                $global_false=false;
                continue ;
            }
            $temp_node = array_pop($queue);
            // Leaving the element, so following tags are one level shallower.
            $deep--;
            // The popped element is a child of the new stack top; the root
            // element has no parent left on the stack.
            if(!empty($queue))
            {
                $queue[count($queue)-1]["ChildId"][]=$queue_id ;
            }
            $queue_id++;
            $result[]=$temp_node;
        }
        else
        {
            // Start tag: log it at the current depth, then open a new level.
            if($error !== false)
            {
                fwrite($error,make_string($deep)."\t".$temptag."\n");
            }
            $queue[] = $node;
            $deep++;
        }
    }
    if($error !== false)
    {
        fclose($error);
    }
    // Start tags that were never closed are not part of $result.
    if(!empty($queue))
    {
        $global_false=false;
    }
    if($global_false)
    {
        echo "ok\n" ;
    }
    else
    {
        echo "error\n" ;
    }
    // Render the relationship tree (end tags are not included), e.g.
    // <html>
    // --<head>
    // ----............
    // --<body>
    // ----............
    // The node completed last is used as the root.
    $deep = 0;
    if(!empty($result))
    {
        make_tree($result,$result[count($result)-1],$deep);
    }
}
/**
 * Tells whether a raw tag is a void ("single") element, i.e. one that never
 * has a matching end tag, or a <!DOCTYPE> declaration.
 *
 * The tag name must start the tag and must be complete, so "<colgroup>" is not
 * mistaken for "<col>" and a name that only occurs inside an attribute value
 * does not count. Matching is case-insensitive.
 *
 * @param string $test_tag One raw tag such as '<img src="a.png">'.
 * @return bool True when the tag is a void element or a doctype.
 */
function is_single_tag($test_tag)
{
    // The pattern never changes, so build it once and reuse it on later calls.
    static $pattern = null;
    if($pattern === null)
    {
        $single_tags=array(
            "meta","img","link","input","!DOCTYPE","area","base","basefont","embed","hr","br",
            "col","param","source","track","wbr","frame","keygen",
        );
        $quoted_names=array();
        foreach($single_tags as $single_tag)
        {
            $quoted_names[]=preg_quote($single_tag,"/");
        }
        // (?![\w:-]) requires the name to end here (no longer tag name follows).
        $pattern="/^\s*<(?:".implode("|",$quoted_names).")(?![\w:-])/i";
    }
    return preg_match($pattern,$test_tag) === 1;
}
/**
 * Tells whether a raw tag is an end tag such as "</div>".
 *
 * The "</" must open the tag; a start tag that merely contains "</...>" inside
 * an attribute value is not an end tag.
 *
 * @param string $test_tag One raw tag.
 * @return bool True when the tag starts with "</" and is closed by ">".
 */
function is_end_tag($test_tag)
{
    return preg_match("/^\s*<\/.*?>/",$test_tag) === 1;
}
/**
 * Tells whether the stack of open elements contains a start tag that matches
 * the given end tag. The stack is searched from the top (last entry) downwards.
 *
 * Tags are compared case-insensitively after reducing the start tag to its
 * bare name ("<div class=x>" becomes "<div>") and removing "/" and whitespace
 * from the end tag ("</div >" becomes "<div>").
 *
 * @param array  $queue   Stack of nodes; each node carries its raw tag under "html_tag".
 * @param string $temptag Raw end tag, e.g. "</div>".
 * @return bool True when a matching start tag is still open.
 */
function is_exist_start_tag($queue,$temptag)
{
    // The end tag does not change during the search, so normalise it once.
    $end_string=str_replace("/","",$temptag);
    $end_string=preg_replace("/\s*/","",$end_string);
    $end_string=strtolower($end_string);

    foreach(array_reverse($queue) as $open_node)
    {
        // Drop everything from the first whitespace up to the closing ">".
        $start_string = preg_replace("/(<)(.*?)\s.*?(>)/",'$1$2$3',$open_node["html_tag"]);
        if(strtolower($start_string) === $end_string)
        {
            return true;
        }
    }
    return false ;
}
/**
 * Tells whether a start tag and an end tag belong together (same tag name,
 * compared case-insensitively).
 *
 * @param string $start_tag Raw start tag, attributes allowed, e.g. '<div class="a">'.
 * @param string $temptag   Raw end tag, e.g. "</div>".
 * @return bool True when both reduce to the same bare tag.
 */
function is_pear_tag($start_tag,$temptag)
{
    // "<div class=a>" -> "<div>": cut from the first whitespace up to ">".
    $opening = strtolower(preg_replace("/(<)(.*?)\s.*?(>)/",'$1$2$3',$start_tag));
    // "</div >" -> "<div>": remove the slash, then all whitespace.
    $closing = strtolower(preg_replace("/\s*/","",str_replace("/","",$temptag)));
    return $opening == $closing;
}
/**
 * Builds the depth prefix used in the tag log: one "-" per nesting level.
 *
 * @param int $deep Nesting depth; zero or a negative value yields an empty string.
 * @return string A run of "-" characters.
 */
function make_string($deep)
{
    if($deep <= 0)
    {
        return "";
    }
    // ceil() keeps the count equal to the number of iterations of a loop
    // running while the counter is below $deep.
    return str_repeat("-",(int) ceil($deep));
}
/**
 * Placeholder for a nesting check between the previous tag and the current one.
 * Nothing is implemented yet: both arguments are ignored and the result is
 * always false.
 *
 * Rules noted for a future implementation:
 *   - <td> does not nest <td> or <tr>
 *   - <tr> does not nest <tr> or <table>
 *   - <table> nests <tr> but no other tag
 *   - <li> does not nest <li>
 *   - <ul> does not nest <ul>
 *   - <ol> does not nest <ol>
 *   - <head> cannot nest <body>
 *
 * @param string $last_tag Previous tag (currently unused).
 * @param string $temptag  Current tag (currently unused).
 * @return bool Always false.
 */
function judge_tag($last_tag,$temptag)
{
    return false;
}
/**
 * Downloads a page and reduces it to markup that the tag scanner can handle.
 *
 * Steps: fetch with cURL, turn line breaks into a single space, then remove
 * conditional comments, ordinary comments, <script> blocks and <style> blocks.
 * The cleaned markup is also dumped to the file "baidu" in the working
 * directory.
 *
 * @param string $site_url Address of the page to download.
 * @return string Cleaned markup; an empty string when the download fails.
 */
function get_content_by_url($site_url)
{
    $ch=curl_init($site_url);
    if($ch === false)
    {
        return "";
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_ENCODING, "gzip");
    $content=curl_exec($ch);
    // curl_exec() returns false on failure; continue with an empty document.
    if(!is_string($content))
    {
        $content="";
    }

    // Replace line breaks with a space instead of deleting them: a tag written
    // as "<a\nhref=...>" must not be fused into "<ahref=...>".
    $content = preg_replace("/[\r\n]+/"," ",$content);

    // Order matters: conditional comments go first, then remaining comments,
    // then script and style blocks.
    $strip_patterns=array(
        "/<!--\[if.*?<!\[endif\]-->/is",
        "/<!--.*?-->/is",
        "/<script(.*?)<\/script>/is",
        "/<style(.*?)<\/style>/is",
    );
    foreach($strip_patterns as $strip_pattern)
    {
        $stripped = preg_replace($strip_pattern,"",$content);
        // preg_replace() returns null on a PCRE error; keep the previous
        // content rather than losing the whole document.
        if($stripped !== null)
        {
            $content = $stripped;
        }
    }

    // Dump of the cleaned markup.
    $errores = fopen("baidu","w");
    if($errores !== false)
    {
        fwrite($errores,$content) ;
        fclose($errores);
    }
    return $content ;
}
/**
 * Prints a node and all of its descendants, one tag per line, indented by two
 * dashes per level. Every line is echoed and also appended to "tree_file".
 *
 * @param array $result Flat list of all nodes; "ChildId" values index into it.
 * @param array $node   Node to print; uses its "html_tag" and "ChildId" entries.
 * @param int   $deep   Depth of $node; decides the width of the prefix.
 * @return void
 */
function make_tree($result,$node,$deep)
{
    // Prefix that makes the nesting level visible.
    $indent = $deep > 0 ? str_repeat("--",(int) ceil($deep)) : "";
    $line = $indent.$node["html_tag"]."\n";

    echo $line;
    $tree_file = fopen("tree_file","a");
    fwrite($tree_file,$line);
    fclose($tree_file);

    // Leaf node: nothing below it to print.
    if(empty($node["ChildId"]))
    {
        return ;
    }

    $child_depth = $deep + 1;
    foreach($node["ChildId"] as $child_id)
    {
        make_tree($result,$result[$child_id],$child_depth) ; // recurse into the child
    }
}
?>
<?php // Reposted from: https://my.oschina.net/qidis/blog/535004