一、前言
在使用 PHP curl 抓取网页内容时,常常需要解析出网页内的 meta 信息;一般情况下,只会用到 meta 中的 content-type(或 charset)、keywords 和 description。
二、实现
写了一个函数,用来提取meta中的键值对,代码如下:
/**
 * Extract key/value pairs from the <meta> tags of an HTML document.
 *
 * Mapping rules, applied per tag:
 *  - exactly one attribute (e.g. <meta charset=UTF-8>):
 *    attribute name (as written) => attribute value;
 *  - a "content" attribute plus an identifying attribute (name, http-equiv,
 *    property or itemprop), in any order:
 *    identifying attribute's value => content value;
 *  - any other tag with two or more attributes:
 *    first attribute's value => second attribute's value (legacy positional
 *    behaviour, kept for backward compatibility).
 *
 * Quoted values keep their embedded whitespace, with one exception kept from
 * the previous implementation: ";" followed by whitespace is collapsed to ";"
 * (so a content-type comes out as "text/html;charset=utf-8").
 * When several tags produce the same key, the last one wins.
 *
 * Limitation: a literal ">" inside an attribute value ends the tag early.
 *
 * @param string|null $html Raw HTML markup.
 * @return array|null Map of meta key => value (possibly empty); NULL when
 *                    $html is NULL or an empty string, or when tag matching fails.
 */
function getMetaOfWeb($html=NULL)
{
    if (is_null($html) || strlen($html) == 0) {
        return NULL;
    }

    // Group 1 holds everything between "<meta" and the closing ">".
    $tagPattern = '/<meta\s+([^>]*)>/is';
    // Group 1 = attribute name, group 2 = attribute value. The branch-reset
    // group (?|...) makes every alternative store the value in group 2:
    // double-quoted, single-quoted, or unquoted (tolerating a stray quote).
    $attrPattern = '/([\w\-:.]+)\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|["\']?(\S+?)["\']?(?=\s|\z))/s';
    // Attributes whose value names the meta entry, in lookup priority order.
    $idAttrs = array('name', 'http-equiv', 'property', 'itemprop');

    if (preg_match_all($tagPattern, $html, $tags) === false) {
        return NULL;
    }

    $ret = array();
    foreach ($tags[1] as $attrs) {
        // Drop the "/" of a self-closing tag first, then surrounding blanks.
        $attrs = trim(rtrim($attrs, '/'));

        // Collapse ";" + whitespace into ";" (legacy normalisation).
        $normalized = preg_replace('/;\s+/s', ';', $attrs);
        if ($normalized !== null) {
            $attrs = $normalized;
        }

        if (!preg_match_all($attrPattern, $attrs, $pairs, PREG_SET_ORDER)) {
            continue; // no key=value attributes in this tag
        }

        $values = array();   // attribute values in document order
        $byName = array();   // lower-cased attribute name => value (first occurrence wins)
        foreach ($pairs as $pair) {
            $attrValue = isset($pair[2]) ? $pair[2] : '';
            $values[] = $attrValue;
            $attrName = strtolower($pair[1]);
            if (!array_key_exists($attrName, $byName)) {
                $byName[$attrName] = $attrValue;
            }
        }

        // A single statement inside the tag, e.g. <meta charset=UTF-8>.
        if (count($pairs) === 1) {
            $ret[$pairs[0][1]] = $values[0];
            continue;
        }

        // name/content style: pair by attribute role, not by position.
        if (array_key_exists('content', $byName)) {
            foreach ($idAttrs as $idAttr) {
                if (array_key_exists($idAttr, $byName)) {
                    $ret[$byName[$idAttr]] = $byName['content'];
                    continue 2;
                }
            }
        }

        // Unrecognised attribute combination: legacy positional mapping.
        $ret[$values[0]] = $values[1];
    }

    return $ret;
}