由于工作的原因,最近需要生成网站的sitemap.xml,谷歌、百度了很多地方,并没有发现合适可用的代码,三思之后还是决定自己写吧!虽然可能写得有所缺陷,但毕竟是认认真真写的,希望对后来者有所帮助......
1、为什么要自己写脚本生成sitemap.xml?
很多人会说,网上有现成的工具,扫一下就可以了,没有必要自己写。是的,的确是这样的。但是假设我们的网站经常更新,那么是不是每次都要手动更新sitemap呢?我很懒,那么,有没有更好的方案呢?肯定是有的:可以起一个定时任务,每天晚上更新一次,此时脚本就有用武之地了。
2、文档目录:配置文件 - config/config.ini.php;sitemap主文件 - SiteMap.class.php
3、主文件代码
<?php
/**
 * Command-line crawler that walks a web site starting from its configured
 * DOMAIN_NAME and writes the pages it finds to a sitemap.xml file.
 *
 * @category sitemap
 * @author   zero
 * @version  1.0
 */

namespace Maweibinguo\SiteMap;

class SiteMap
{
    const SCHEMA = 'http://www.sitemaps.org/schemas/sitemap/0.9';

    /**
     * Every URL that has been scheduled for crawling so far.
     *
     * @var array
     * @access public
     */
    public $webUrlList = array();

    /**
     * Collected sitemap entries; each entry has the keys
     * Url, Title, Priority and ChangeFreq.
     *
     * @var array
     * @access public
     */
    public $siteMapList = array();

    /**
     * @var bool
     * @access public
     */
    public $isUseCookie = false;

    /**
     * @var string
     * @access public
     */
    public $cookieFilePath = '';

    /**
     * @var \XMLWriter
     * @access private
     */
    private $_xmlWriter = '';

    /**
     * Variables defined by the config file, loaded on first use.
     *
     * @var array|null
     * @access private
     */
    private $_config = null;

    /**
     * Create the XML writer and make sure the script runs from the CLI.
     *
     * @access public
     */
    public function __construct()
    {
        $this->_xmlWriter = new \XMLWriter();
        $this->_enviromentTest();
    }

    /**
     * Abort unless the script is executed by the command-line SAPI.
     *
     * @access private
     */
    private function _enviromentTest()
    {
        $sapiType = \php_sapi_name();
        if (strtolower($sapiType) != 'cli') {
            echo ' The Script Must Run In Command Lines ', "\r\n";
            exit();
        }
    }

    /**
     * Return the value of one variable defined in config/config.ini.php.
     *
     * The config file is read once and cached, because this method is
     * called for every crawled link. The script exits when the config file
     * does not exist.
     *
     * @param string $configName name of the config variable (without "$")
     * @return mixed the configured value, or null when it is not defined
     * @access public
     */
    public function loadConfig($configName)
    {
        if ($this->_config === null) {
            $configPath = __DIR__ . '/config/config.ini.php';
            if (!file_exists($configPath)) {
                echo "Can not find config file", "\r\n";
                exit();
            }
            $this->_config = self::_readConfigFile($configPath);
        }

        return array_key_exists($configName, $this->_config)
            ? $this->_config[$configName]
            : null;
    }

    /**
     * Include the config file (path passed as first argument) in an empty
     * scope and return every variable it defines.
     *
     * The path is read through func_get_arg() instead of a named parameter
     * so that the returned array contains only the config file's variables.
     *
     * @return array
     * @access private
     */
    private static function _readConfigFile()
    {
        require func_get_arg(0);

        return get_defined_vars();
    }

    /**
     * Write the given entries to the file configured as SITEMAPPATH.
     *
     * The script exits when the list is empty or the target is not writable.
     *
     * @param array $siteMapList entries with the keys Url, Title, ChangeFreq, Priority
     * @return bool true once the document has been written
     * @access public
     */
    public function generateSiteMapXml($siteMapList)
    {
        if (!is_array($siteMapList) || count($siteMapList) <= 0) {
            echo 'The SiteMap Cotent Is Empty', "\r\n";
            exit();
        }

        $siteMapPath = $this->loadConfig('SITEMAPPATH');
        if (!file_exists($siteMapPath)) {
            /* create the file natively instead of shelling out with an unescaped path */
            touch($siteMapPath);
        }
        if (!is_writable($siteMapPath) || !$this->_xmlWriter->openURI($siteMapPath)) {
            echo 'Is Not Writeable', "\r\n";
            exit();
        }

        $writer = $this->_xmlWriter;
        $writer->startDocument('1.0', 'UTF-8');
        $writer->setIndent(true);
        $writer->startElement('urlset');
        $writer->writeAttribute('xmlns', self::SCHEMA);

        foreach ($siteMapList as $siteMapItem) {
            $changefreq = !empty($siteMapItem['ChangeFreq']) ? $siteMapItem['ChangeFreq'] : 'Daily';
            $priority = (isset($siteMapItem['Priority']) && is_numeric($siteMapItem['Priority']))
                ? $siteMapItem['Priority']
                : 0.5;

            $writer->startElement('url');
            $writer->writeElement('loc', isset($siteMapItem['Url']) ? $siteMapItem['Url'] : '');
            /* NOTE(review): <title> is not an element of the sitemap 0.9 protocol; kept for compatibility */
            $writer->writeElement('title', isset($siteMapItem['Title']) ? $siteMapItem['Title'] : '');
            /* the protocol only accepts lower-case changefreq values */
            $writer->writeElement('changefreq', strtolower($changefreq));
            $writer->writeElement('priority', (string) $priority);
            $writer->endElement();
        }

        $writer->endElement();
        $writer->endDocument();
        /* make sure everything reaches the file before the script continues */
        $writer->flush();

        return true;
    }

    /**
     * Fetch a URL with cURL and return the response body.
     *
     * @param string $url absolute URL to request
     * @return string|false the body, or false for an invalid URL, missing
     *                      timeout config, transfer error or non-200 status
     * @access public
     */
    public function sendRequest($url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }

        $connectTimeOut = $this->loadConfig('CURLOPT_CONNECTTIMEOUT');
        $timeOut = $this->loadConfig('CURLOPT_TIMEOUT');
        if (!is_numeric($connectTimeOut) || !is_numeric($timeOut)) {
            return false;
        }

        $handle = curl_init();
        if ($handle === false) {
            return false;
        }

        curl_setopt($handle, CURLOPT_URL, $url);
        curl_setopt($handle, CURLOPT_HEADER, false);
        curl_setopt($handle, CURLOPT_AUTOREFERER, true);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, $connectTimeOut);
        curl_setopt($handle, CURLOPT_TIMEOUT, $timeOut);
        curl_setopt($handle, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)");
        $headersItem = array(
            'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection: Keep-Alive',
        );
        curl_setopt($handle, CURLOPT_HTTPHEADER, $headersItem);
        curl_setopt($handle, CURLOPT_FOLLOWLOCATION, 1);

        $cookieList = $this->loadConfig('COOKIELIST');
        if (is_array($cookieList) && !empty($cookieList['IsUseCookie']) && !empty($cookieList['CookiePath'])) {
            $cookieFilePath = $cookieList['CookiePath'];
            if (!file_exists($cookieFilePath)) {
                touch($cookieFilePath);
            }
            curl_setopt($handle, CURLOPT_COOKIEFILE, $cookieFilePath);
            curl_setopt($handle, CURLOPT_COOKIEJAR, $cookieFilePath);
        }

        $responseData = curl_exec($handle);
        $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        if ($responseData === false || (int) $httpCode !== 200) {
            return false;
        }

        return $responseData;
    }

    /**
     * Crawl one page: record its sitemap entry, then recurse into every
     * same-site link that has not been scheduled yet and does not contain
     * one of the SKIP_URLLIST keywords.
     *
     * @param string $url absolute URL of the page to crawl
     * @access public
     */
    public function generateSiteMapList($url)
    {
        /* remember the page itself so links pointing back to it are not crawled twice */
        if (!in_array($url, $this->webUrlList, true)) {
            $this->webUrlList[] = $url;
        }

        $content = $this->sendRequest($url);
        if ($content === false) {
            return;
        }

        $tagsList = $this->_parseContent($content, $url);
        $urlItem = (isset($tagsList['UrlItem']) && is_array($tagsList['UrlItem'])) ? $tagsList['UrlItem'] : array();
        $title = isset($tagsList['Title']) ? $tagsList['Title'] : '';

        $siteMapItem = array(
            'Url' => trim($url),
            'Title' => trim($title),
        );
        $siteMapItem['Priority'] = $this->_calculatePriority($siteMapItem['Url']);
        $siteMapItem['ChangeFreq'] = $this->_calculateChangefreq($siteMapItem['Url']);
        $this->siteMapList[] = $siteMapItem;

        $skipUrlList = $this->loadConfig('SKIP_URLLIST');
        if (!is_array($skipUrlList)) {
            $skipUrlList = array();
        }

        foreach ($urlItem as $nextUrl) {
            if (in_array($nextUrl, $this->webUrlList, true)) {
                continue;
            }
            foreach ($skipUrlList as $keyWords) {
                if ($keyWords !== '' && stripos($nextUrl, $keyWords) !== false) {
                    continue 2;
                }
            }
            $this->webUrlList[] = $nextUrl;
            echo $nextUrl, "\r\n";
            $this->generateSiteMapList($nextUrl);
        }
    }

    /**
     * Return the sitemap entries collected so far.
     *
     * @access public
     * @return array $siteMapList
     */
    public function getSiteMapList()
    {
        return $this->siteMapList;
    }

    /**
     * Look up the priority for a URL in PRIORITYLIST (first keyword contained
     * in the URL wins).
     *
     * @param string $targetUrl
     * @return float|string the configured priority, 0.5 when nothing matches
     * @access private
     */
    private function _calculatePriority($targetUrl)
    {
        return $this->_lookupByKeyword($targetUrl, 'PRIORITYLIST', 0.5);
    }

    /**
     * Look up the change frequency for a URL in CHANGEFREQLIST (first keyword
     * contained in the URL wins).
     *
     * @param string $targetUrl
     * @return string the configured frequency, 'Daily' when nothing matches
     * @access private
     */
    private function _calculateChangefreq($targetUrl)
    {
        return $this->_lookupByKeyword($targetUrl, 'CHANGEFREQLIST', 'Daily');
    }

    /**
     * Return the value of the first entry of a keyword => value config map
     * whose keyword occurs (case-insensitively) in the URL.
     *
     * @param string $targetUrl
     * @param string $configName name of the config map
     * @param mixed  $default    returned for invalid URLs or when nothing matches
     * @return mixed
     * @access private
     */
    private function _lookupByKeyword($targetUrl, $configName, $default)
    {
        if (!filter_var($targetUrl, FILTER_VALIDATE_URL)) {
            return $default;
        }

        $keywordMap = $this->loadConfig($configName);
        if (!is_array($keywordMap)) {
            return $default;
        }

        foreach ($keywordMap as $keyword => $value) {
            if (stripos($targetUrl, (string) $keyword) !== false) {
                return $value;
            }
        }

        return $default;
    }

    /**
     * Turn an href value found on $originUrl into an absolute URL.
     *
     * @param string $url       raw href value
     * @param string $originUrl absolute URL of the page the href was found on
     * @access private
     * @return string absolute URL, or '' when the href cannot be crawled
     */
    private function _formatUrl($url, $originUrl)
    {
        if (empty($url) || empty($originUrl)) {
            return '';
        }

        $formatUrl = trim(trim($url), "'\"");

        /* a fragment never reaches the server, so drop it to avoid duplicate pages */
        $hashPos = strpos($formatUrl, '#');
        if ($hashPos !== false) {
            $formatUrl = (string) substr($formatUrl, 0, $hashPos);
        }
        $formatUrl = trim($formatUrl);

        if ($formatUrl === '' || $formatUrl === '/' || $formatUrl === '\\' || $formatUrl === 'javascript') {
            return '';
        }

        /* already absolute: the scheme has to be at the start, not anywhere in the string */
        if (preg_match('#^https?://#i', $formatUrl)) {
            return $formatUrl;
        }

        /* any other scheme (javascript:, mailto:, tel:, ...) cannot be crawled */
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $formatUrl)) {
            return '';
        }

        /* protocol-relative link: reuse the scheme of the page it was found on */
        if (strpos($formatUrl, '//') === 0) {
            $scheme = parse_url($originUrl, PHP_URL_SCHEME);

            return ($scheme ? $scheme : 'http') . ':' . $formatUrl;
        }

        /* root-relative link: append to the configured domain, with or without trailing slash */
        if ($formatUrl[0] === '/') {
            $domainName = (string) $this->loadConfig('DOMAIN_NAME');

            return rtrim($domainName, '/') . '/' . trim($formatUrl, '/');
        }

        return $this->_baseDirectory($originUrl) . $formatUrl;
    }

    /**
     * Return the directory part (with trailing slash) of an absolute URL,
     * ignoring its query string and fragment.
     *
     * @param string $originUrl
     * @return string
     * @access private
     */
    private function _baseDirectory($originUrl)
    {
        $withoutQuery = preg_replace('/[?#].*$/s', '', $originUrl);

        $schemeEnd = strpos($withoutQuery, '://');
        $pathStart = strpos($withoutQuery, '/', $schemeEnd === false ? 0 : $schemeEnd + 3);
        if ($pathStart === false) {
            /* no path at all, e.g. "http://example.com" */
            return $withoutQuery . '/';
        }

        return substr($withoutQuery, 0, strrpos($withoutQuery, '/') + 1);
    }

    /**
     * Tell whether a URL belongs to the configured site, i.e. starts with
     * DOMAIN_NAME (case-insensitive).
     *
     * @param string $url
     * @return bool
     * @access private
     */
    private function _checkDomain($url)
    {
        $domainName = (string) $this->loadConfig('DOMAIN_NAME');
        if (empty($url) || $domainName === '') {
            return false;
        }

        return strncasecmp($url, $domainName, strlen($domainName)) === 0;
    }

    /**
     * Extract the same-site links and the title from an HTML document.
     *
     * NOTE(review): both regular expressions were lost when this listing was
     * published (HTML-like text was stripped); they are reconstructed here
     * from the surrounding code and should be confirmed against real pages.
     *
     * @param string $content   HTML source of the page
     * @param string $originUrl absolute URL the content was fetched from
     * @return array array('UrlItem' => list of absolute URLs, 'Title' => string)
     * @access public
     */
    public function _parseContent($content, $originUrl)
    {
        $tagsList = array('UrlItem' => array(), 'Title' => '');
        if (empty($content) || empty($originUrl)) {
            return $tagsList;
        }

        /* get the href attribute of <a> tags: double-quoted, single-quoted or bare */
        $regStrForTagA = '#<a\s+(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))#is';
        if (preg_match_all($regStrForTagA, $content, $matches, PREG_SET_ORDER)) {
            $found = array();
            foreach ($matches as $match) {
                /* trailing unmatched groups are omitted, so the last element is the href value */
                $rawUrl = count($match) > 1 ? (string) end($match) : '';
                /* attribute values are HTML-encoded, e.g. "&amp;" in query strings */
                $rawUrl = html_entity_decode($rawUrl, ENT_QUOTES, 'UTF-8');

                $formatUrl = $this->_formatUrl($rawUrl, $originUrl);
                if (empty($formatUrl) || $this->_checkDomain($formatUrl) === false) {
                    continue;
                }
                $found[$formatUrl] = true;
            }
            $tagsList['UrlItem'] = array_map('strval', array_keys($found));
        }

        /* get the title tag content; "s" lets a title span several lines */
        $regStrForTitle = '#<title[^>]*>(.*?)</title>#is';
        if (preg_match($regStrForTitle, $content, $matches)) {
            $tagsList['Title'] = $matches[1];
        }

        return $tagsList;
    }
}

/* here is a example */
$startTime = microtime(true);
echo "/***********************************************************************/", "\r\n";
echo "/* start to run {$startTime} */", "\r\n";
echo "/***********************************************************************/", "\r\n\r\n";

$siteMap = new SiteMap();
$domain = $siteMap->loadConfig('DOMAIN_NAME');
$siteMap->generateSiteMapList($domain);
$siteMapList = $siteMap->getSiteMapList();
$siteMap->generateSiteMapXml($siteMapList);

$endTime = microtime(true);
$takeTime = $endTime - $startTime;
echo "/***********************************************************************/", "\r\n";
echo "/* Had Done, \t it total take {$takeTime} */", "\r\n";
echo "/***********************************************************************/", "\r\n";
?>
4、配置文件代码
<?php
/*
 * Settings read by SiteMap::loadConfig(). Every statement sits on its own
 * line: when the listing is collapsed onto a single line, the first "//"
 * comment swallows all following assignments.
 */

// cURL connect timeout, in seconds
$CURLOPT_CONNECTTIMEOUT = 5;

// cURL total request timeout, in seconds
$CURLOPT_TIMEOUT = 10;

// Start URL of the site to crawl; must be filled in, including the scheme
// and a trailing slash, e.g. 'http://www.example.com/'
$DOMAIN_NAME = '';

// Links containing one of these keywords are skipped
$SKIP_URLLIST = array(
    'addtocart',
);

// Cookie handling for the crawler
$COOKIELIST = array(
    'IsUseCookie' => true,
    'CookiePath' => '/tmp/sitemapcookie',
);

// Where the generated sitemap file is written
$SITEMAPPATH = './sitemap.xml';

// priority by keyword contained in the URL (first match wins)
$PRIORITYLIST = array(
    'product' => '0.8',
    'device' => '0.6',
    'intelligent' => '0.4',
    'course' => '0.2',
);

// changefreq by keyword contained in the URL (first match wins);
// the sitemap protocol only accepts these lower-case values
$CHANGEFREQLIST = array(
    'product' => 'always',
    'device' => 'hourly',
    'intelligent' => 'daily',
    'course' => 'weekly',
    'login' => 'monthly',
    'about' => 'yearly',
);