读RSS的PHP

原创 2006年05月17日 08:12:00

PHP解析类(保存为 kxparse.php,供下面的脚本通过 include_once 引用)

<?php
/*Khalid XML files parser :: class kxparse, Started in March 2002 by Khalid Al-kary*/
/**
 * Khalid XML files parser (Kxparse): a small string-scanning XML reader/editor.
 *
 * Elements are addressed with two parallel colon-separated paths: a tag-name
 * path ("rss:channel:title") and an index path ("1:1:1"). Indexes are 1-based;
 * "-1" selects the last occurrence and "?" asks for the number of occurrences
 * at that level instead of selecting one.
 *
 * Known limitations of the string-scanning approach (unchanged from the
 * original design):
 *  - tag names that themselves contain ":" (namespaced tags) cannot be
 *    addressed, because ":" is the path separator;
 *  - an element that contains a nested element of the same name is closed at
 *    the first matching closing tag;
 *  - a ">" inside an attribute value is mistaken for the end of the tag.
 */
class kxparse
{
    /** @var string Document text, with the CDATA start/end markers removed. */
    public $xml = "";

    /** @var int Byte offset where the current working window starts. */
    public $cursor = 0;

    /** @var int Byte offset just past the end of the current working window. */
    public $cursor2 = 0;

    /** @var string|null Upper-cased encoding name, filled in by getencoding(). */
    public $encoding;

    /**
     * Loads a document into the parser.
     *
     * @param string $xmlfile Path or URL accepted by fopen().
     * @throws RuntimeException When the source cannot be opened or read. The
     *         original looped forever on feof() in that case.
     */
    public function __construct($xmlfile)
    {
        $file = fopen($xmlfile, "r");
        if ($file === false) {
            throw new RuntimeException("kxparse: unable to open '" . $xmlfile . "'");
        }

        $contents = stream_get_contents($file);
        fclose($file);
        if ($contents === false) {
            throw new RuntimeException("kxparse: unable to read '" . $xmlfile . "'");
        }

        // Drop only the CDATA markers so their content is read like ordinary
        // element text by the string-based lookups below.
        $this->xml = str_replace(array("<![CDATA[", "]]>"), "", $contents);

        // The working window initially spans the whole document.
        $this->cursor = 0;
        $this->cursor2 = strlen($this->xml);
    }

    /**
     * Legacy PHP 4 style entry point, kept so existing explicit calls such as
     * parent::kxparse($file) keep working. It is declared after __construct()
     * so that __construct() is the constructor on every PHP version.
     *
     * @param string $xmlfile Path or URL accepted by fopen().
     */
    public function kxparse($xmlfile)
    {
        $this->__construct($xmlfile);
    }

    /**
     * Reads the encoding declared in the XML prolog.
     *
     * @return string Upper-cased encoding name, or "UTF-8" when none is declared.
     */
    public function getencoding()
    {
        $rx = '/<\?xml[^>]*?\bencoding\s*=\s*["\']([^"\']+)["\']/i';
        if (preg_match($rx, $this->xml, $m)) {
            $this->encoding = strtoupper($m[1]);
        } else {
            $this->encoding = "UTF-8";
        }
        return $this->encoding;
    }

    /**
     * Finds every opening tag named $tname inside the current window
     * (cursor inclusive, cursor2 exclusive).
     *
     * Only whole names match: the character after the name must be whitespace,
     * ">" or "/", so looking for "item" does not count "<items>".
     *
     * @param string $tname Plain tag name (no colons, no angle brackets).
     * @return array Index 0 is a placeholder (0) because occurrence indexes
     *         start at 1; index N is the absolute byte offset of the N-th match.
     */
    public function track_tag_cursors($tname)
    {
        $tag_poses = array(0);

        $tname = (string) $tname;
        if ($tname === "") {
            return $tag_poses;
        }

        $needle = "<" . $tname;
        $needle_len = strlen($needle);

        // Clamp the window to the document so strpos() never gets a bad offset.
        $limit = min((int) $this->cursor2, strlen($this->xml));
        $offset = max(0, (int) $this->cursor);

        while ($offset < $limit) {
            $pos = strpos($this->xml, $needle, $offset);
            if ($pos === false || $pos >= $limit) {
                break;
            }

            $next = $pos + $needle_len;
            if ($next < $limit && strpos(" \t\r\n>/", $this->xml[$next]) !== false) {
                $tag_poses[] = $pos;
            }

            $offset = $pos + 1;
        }

        return $tag_poses;
    }

    /**
     * Turns a tag's markup into plain text: strips all tags, then decodes
     * entities with htmldecode().
     *
     * @param string $tname Markup of the element (opening tag through closing tag).
     * @return string
     */
    public function get_tag_text_internal($tname)
    {
        return $this->htmldecode(strip_tags((string) $tname));
    }

    /**
     * Extracts an attribute value from the opening tag of $tag.
     *
     * Both double- and single-quoted values are recognised. The value is
     * returned exactly as written in the document (no entity decoding).
     *
     * @param string $tag  Element markup starting at "<".
     * @param string $attr Attribute name.
     * @return string|false The value, or FALSE when the attribute is absent.
     */
    public function get_attribute_internal($tag, $attr)
    {
        if (!is_string($tag) || $tag === "") {
            return FALSE;
        }

        // Restrict the search to the opening tag.
        $gt = strpos($tag, ">");
        $open = ($gt === false) ? $tag : substr($tag, 0, $gt + 1);

        $found = $this->find_attribute($open, $attr);
        return ($found === false) ? FALSE : $found["value"];
    }

    /**
     * Decodes the three basic entities, then maps the library's own "&ltt;" /
     * "&gtt;" sequences to literal "&lt;" / "&gt;".
     *
     * @param string $text
     * @return string
     */
    public function htmldecode($text)
    {
        $text = str_replace("&lt;", "<", $text);
        $text = str_replace("&gt;", ">", $text);
        $text = str_replace("&amp;", "&", $text);
        $text = str_replace("&ltt;", "&lt;", $text);
        $text = str_replace("&gtt;", "&gt;", $text);
        return $text;
    }

    /**
     * Writes the current document text to $file, overwriting it.
     *
     * @param string $file Destination path.
     * @return bool TRUE when the whole document was written, FALSE otherwise.
     */
    public function save($file)
    {
        $written = file_put_contents($file, $this->xml);
        return $written !== false && $written === strlen($this->xml);
    }

    /**
     * Returns the markup of the addressed element, from its opening tag
     * through its closing tag.
     *
     * @param string $tname  Colon-separated tag-name path.
     * @param string $tindex Colon-separated index path.
     * @return string|false The markup, or FALSE when the element does not exist.
     */
    public function get_tag_in_tree($tname, $tindex)
    {
        if ($this->get_work_space($tname, $tindex) === false) {
            return FALSE;
        }
        return substr($this->xml, $this->cursor, $this->cursor2 - $this->cursor);
    }

    /**
     * Returns the decoded text content of the addressed element.
     *
     * @param string $tname  Colon-separated tag-name path.
     * @param string $tindex Colon-separated index path.
     * @return string|false The text, or FALSE when the element does not exist.
     */
    public function get_tag_text($tname, $tindex)
    {
        $mytag = $this->get_tag_in_tree($tname, $tindex);
        if ($mytag === false) {
            return FALSE;
        }
        return $this->get_tag_text_internal($mytag);
    }

    /**
     * Counts occurrences of a tag. Put "?" at the level to count, e.g.
     * count_tag("rss:item", "1:?").
     *
     * @param string $tname  Colon-separated tag-name path.
     * @param string $tindex Colon-separated index path containing "?".
     * @return int|bool The count; without a "?" this is get_work_space()'s
     *         TRUE/FALSE result.
     */
    public function count_tag($tname, $tindex)
    {
        return $this->get_work_space($tname, $tindex);
    }

    /**
     * Returns an attribute value of the addressed element.
     *
     * @param string $tname    Colon-separated tag-name path.
     * @param string $tindex   Colon-separated index path.
     * @param string $attrname Attribute name.
     * @return string|false The value, or FALSE when element or attribute is missing.
     */
    public function get_attribute($tname, $tindex, $attrname)
    {
        $mytag = $this->get_tag_in_tree($tname, $tindex);
        if ($mytag === false) {
            return FALSE;
        }
        return $this->get_attribute_internal($mytag, $attrname);
    }

    /**
     * Moves cursor/cursor2 onto the addressed element by walking the path one
     * level at a time, narrowing the window at each step.
     *
     * @param string $tname  Colon-separated tag-name path.
     * @param string $tindex Colon-separated index path ("-1" = last, "?" = count).
     * @return int|bool The occurrence count when a "?" is reached; TRUE when the
     *         element was located; FALSE when the paths differ in depth or the
     *         element does not exist (the window is then left empty).
     */
    public function get_work_space($tname, $tindex)
    {
        $search_text_array = explode(":", (string) $tname);
        $search_array = explode(":", (string) $tindex);

        if (count($search_text_array) !== count($search_array)) {
            return false;
        }

        // Start from the whole document to erase any former work.
        $this->cursor = 0;
        $this->cursor2 = strlen($this->xml);

        foreach ($search_text_array as $level => $name) {
            $positions = $this->track_tag_cursors($name);
            $found = count($positions) - 1;
            $wanted = $search_array[$level];

            if ($wanted === "?") {
                return $found;
            }

            $index = (int) $wanted;
            if ($index === -1) {
                $index = $found;
            }
            if ($index < 1 || $index > $found) {
                return $this->fail_work_space();
            }

            $start = $positions[$index];
            $gt = strpos($this->xml, ">", $start);
            if ($gt === false) {
                return $this->fail_work_space();
            }

            if ($this->xml[$gt - 1] === "/") {
                // Self-closing element: the window is the tag itself.
                $end = $gt + 1;
            } else {
                $closing = "</" . $name . ">";
                $close_pos = strpos($this->xml, $closing, $gt);
                if ($close_pos === false) {
                    return $this->fail_work_space();
                }
                $end = $close_pos + strlen($closing);
            }

            $this->cursor = $start;
            $this->cursor2 = $end;
        }

        return true;
    }

    /**
     * Appends an empty <$ntname></$ntname> element as the last child of the
     * addressed parent.
     *
     * @param string $tname  Colon-separated tag-name path of the parent.
     * @param string $tindex Colon-separated index path of the parent.
     * @param string $ntname Name of the new element.
     * @return bool FALSE when the parent is missing or self-closing.
     */
    public function create_tag($tname, $tindex, $ntname)
    {
        if ($this->get_work_space($tname, $tindex) !== true) {
            return FALSE;
        }

        $insert_at = $this->closing_tag_start($tname);
        if ($insert_at === false) {
            return FALSE;
        }

        // Length 0 turns the replacement into an insertion.
        $this->xml = substr_replace($this->xml, "<" . $ntname . "></" . $ntname . ">", $insert_at, 0);
        return TRUE;
    }

    /**
     * Sets an attribute on the addressed element, replacing the existing value
     * or adding the attribute right after the tag name.
     *
     * $value is written verbatim; the caller must escape quotes and "&".
     *
     * @param string $tname  Colon-separated tag-name path.
     * @param string $tindex Colon-separated index path.
     * @param string $attr   Attribute name.
     * @param string $value  New attribute value.
     * @return bool FALSE when the element does not exist.
     */
    public function set_attribute($tname, $tindex, $attr, $value)
    {
        if ($this->get_work_space($tname, $tindex) !== true) {
            return FALSE;
        }

        $gt = strpos($this->xml, ">", $this->cursor);
        if ($gt === false) {
            return FALSE;
        }
        $work = substr($this->xml, $this->cursor, $gt - $this->cursor + 1);

        $found = $this->find_attribute($work, $attr);
        if ($found !== false) {
            // Overwrite only the characters between the existing quotes.
            $this->xml = substr_replace(
                $this->xml,
                $value,
                $this->cursor + $found["offset"],
                strlen($found["value"])
            );
            return TRUE;
        }

        // Insert directly after the tag name; this also keeps "<tag/>" valid.
        if (!preg_match('/^<[^\s\/>]+/', $work, $m)) {
            return FALSE;
        }
        $newattr = " " . $attr . "=\"" . $value . "\"";
        $this->xml = substr_replace($this->xml, $newattr, $this->cursor + strlen($m[0]), 0);
        return TRUE;
    }

    /**
     * Replaces everything between the opening and closing tag of the
     * addressed element with $text (written verbatim).
     *
     * @param string $tname  Colon-separated tag-name path.
     * @param string $tindex Colon-separated index path.
     * @param string $text   New content.
     * @return bool FALSE when the element is missing or self-closing.
     */
    public function set_tag_text($tname, $tindex, $text)
    {
        if ($this->get_work_space($tname, $tindex) !== true) {
            return FALSE;
        }

        $replace_end = $this->closing_tag_start($tname);
        if ($replace_end === false) {
            return FALSE;
        }

        $replace_start = strpos($this->xml, ">", $this->cursor) + 1;
        $this->xml = substr_replace($this->xml, $text, $replace_start, $replace_end - $replace_start);
        return TRUE;
    }

    /**
     * Removes the addressed element together with its content.
     *
     * @param string $tname  Colon-separated tag-name path.
     * @param string $tindex Colon-separated index path.
     * @return bool FALSE when the element does not exist.
     */
    public function remove_tag($tname, $tindex)
    {
        if ($this->get_work_space($tname, $tindex) !== true) {
            return FALSE;
        }

        $this->xml = substr_replace($this->xml, "", $this->cursor, $this->cursor2 - $this->cursor);
        return TRUE;
    }

    /**
     * Empties the working window after a failed lookup so later substr()
     * calls cannot return unrelated parts of the document.
     *
     * @return bool Always FALSE, for use in "return $this->fail_work_space();".
     */
    private function fail_work_space()
    {
        $this->cursor = 0;
        $this->cursor2 = 0;
        return false;
    }

    /**
     * Locates attribute $attr inside an opening tag.
     *
     * @param string $open_tag Opening tag markup.
     * @param string $attr     Attribute name.
     * @return array|false array("value" => string, "offset" => int byte offset
     *         of the value inside $open_tag), or FALSE when absent.
     */
    private function find_attribute($open_tag, $attr)
    {
        $attr = (string) $attr;
        if ($attr === "") {
            return false;
        }

        $pattern = '/\s' . preg_quote($attr, '/') . '\s*=\s*(["\'])(.*?)\1/s';
        if (!preg_match($pattern, $open_tag, $m, PREG_OFFSET_CAPTURE)) {
            return false;
        }

        return array("value" => $m[2][0], "offset" => $m[2][1]);
    }

    /**
     * Returns the offset of the closing tag that ends the current window.
     *
     * @param string $tname Colon-separated tag-name path; only the last name is used.
     * @return int|false FALSE when the window does not end with that closing
     *         tag (for example a self-closing element).
     */
    private function closing_tag_start($tname)
    {
        $names = explode(":", (string) $tname);
        $closing = "</" . $names[count($names) - 1] . ">";

        $pos = $this->cursor2 - strlen($closing);
        if ($pos <= $this->cursor || substr($this->xml, $pos, strlen($closing)) !== $closing) {
            return false;
        }
        return $pos;
    }
}
?>

解析RSS并存入数据库

<?PHP
include_once "kxparse.php";

// Imports every <item> of one RSS feed into the `rssnews` table, one row per
// item, repeating the channel-level fields on each row.

// Load the feed into the parser: constructor(string file).
$xmlread = new kxparse("http://rss.xinhuanet.com/rss/native.xml"); // URL of the feed to parse

// Database and table names are lower-case to match the published schema
// (`rss`.`rssnews`); MySQL names are case-sensitive on case-sensitive
// file systems.
// NOTE(review): the connection charset is left at its default, as in the
// original script; the reader script must use the same charset.
$db = mysqli_connect("localhost", "root", "", "rss");
if (!$db) {
    exit("Database connection failed: " . mysqli_connect_error());
}

// The declared encoding is a property of the document, so read it once.
$encoding = $xmlread->getencoding();

// Normalises one parsed field to a UTF-8 string. A missing tag becomes "".
// When iconv() cannot convert the text, the raw text is kept.
$to_utf8 = function ($text) use ($encoding) {
    $text = (string) $text;
    if ($encoding === "UTF-8") {
        return $text;
    }
    $converted = iconv($encoding, "UTF-8", $text);
    return ($converted === false) ? $text : $converted;
};

$channeltitle = $to_utf8($xmlread->get_tag_text("rss:channel:title", "1:1:1"));
$channellink = $to_utf8($xmlread->get_tag_text("rss:channel:link", "1:1:1"));
$channeldescription = $to_utf8($xmlread->get_tag_text("rss:channel:description", "1:1:1"));
$channelpubdate = $to_utf8($xmlread->get_tag_text("rss:channel:pubDate", "1:1:1"));

// Feed text is untrusted input: it is sent as bound parameters and never
// concatenated into the SQL string.
$stmt = mysqli_prepare(
    $db,
    "INSERT INTO rssnews (channeltitle,
         channellink,
         channeldescription,
         channelpubdate,
         itemtitle,
         itemlink,
         itemauthor,
         itemguid,
         itemcategory,
         itempubdate,
         itemcomments,
         itemdescription)
   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
);
if (!$stmt) {
    exit("Could not prepare INSERT: " . mysqli_error($db));
}

// Parameters are bound by reference, so assigning the variables inside the
// loop is enough to change what the next execute() sends.
$itemtitle = $itemlink = $itemauthor = $itemguid = "";
$itemcategory = $itempubdate = $itemcomments = $itemdescription = "";
mysqli_stmt_bind_param(
    $stmt,
    "ssssssssssss",
    $channeltitle,
    $channellink,
    $channeldescription,
    $channelpubdate,
    $itemtitle,
    $itemlink,
    $itemauthor,
    $itemguid,
    $itemcategory,
    $itempubdate,
    $itemcomments,
    $itemdescription
);

// Count the items once instead of re-scanning the document on every pass.
$itemcount = (int) $xmlread->count_tag("rss:item", "1:?");

for ($i = 1; $i <= $itemcount; $i++) {
    $itemtitle = $to_utf8($xmlread->get_tag_text("rss:item:title", "1:$i:1"));
    $itemlink = $to_utf8($xmlread->get_tag_text("rss:item:link", "1:$i:1"));
    $itemauthor = $to_utf8($xmlread->get_tag_text("rss:item:author", "1:$i:1"));
    $itemguid = $to_utf8($xmlread->get_tag_text("rss:item:guid", "1:$i:1"));
    $itemcategory = $to_utf8($xmlread->get_tag_text("rss:item:category", "1:$i:1"));
    $itempubdate = $to_utf8($xmlread->get_tag_text("rss:item:pubDate", "1:$i:1"));
    $itemcomments = $to_utf8($xmlread->get_tag_text("rss:item:comments", "1:$i:1"));
    $itemdescription = $to_utf8($xmlread->get_tag_text("rss:item:description", "1:$i:1"));

    if (!mysqli_stmt_execute($stmt)) {
        // Keep importing the remaining items; just record the failure.
        error_log("rssnews insert failed for item $i: " . mysqli_stmt_error($stmt));
    }
}

mysqli_stmt_close($stmt);
mysqli_close($db);

?>

从数据库里读出内容

<?PHP
// Prints every stored RSS row as GBK-encoded HTML, one field per line.

// Database and table names are lower-case to match the published schema
// (`rss`.`rssnews`); MySQL names are case-sensitive on case-sensitive
// file systems.
// NOTE(review): the connection charset is left at its default, as in the
// original script; it must match the charset used when the rows were written.
$db = mysqli_connect("localhost", "root", "", "rss");
if (!$db) {
    exit("Database connection failed: " . mysqli_connect_error());
}

$sql = "SELECT * from rssnews";

$result = mysqli_query($db, $sql);
if (!$result) {
    exit("Query failed: " . mysqli_error($db));
}

// Columns in display order; the last one ends the record with extra spacing.
$columns = array(
    "channeltitle",
    "channellink",
    "channeldescription",
    "channelpubdate",
    "itemtitle",
    "itemlink",
    "itemauthor",
    "itemguid",
    "itemcategory",
    "itempubdate",
    "itemcomments",
    "itemdescription",
);
$last = count($columns) - 1;

while ($row = mysqli_fetch_assoc($result)) {
    foreach ($columns as $n => $column) {
        // The stored text came from a remote feed, so escape it before it
        // reaches the page. Escaping happens while the text is still UTF-8;
        // the escapes are plain ASCII and survive the GBK conversion.
        $safe = htmlspecialchars((string) $row[$column], ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
        echo iconv("UTF-8", "GBK", $safe) . ($n === $last ? "<br><br><br>" : "<br>");
    }
}

mysqli_free_result($result);
mysqli_close($db);
?>

数据库结构

--
-- 数据库: `rss`
--

-- --------------------------------------------------------

--
-- 表的结构 `rssnews`
--

-- One row per RSS item; the channel* columns repeat the feed-level values on
-- every row written by the import script.
CREATE TABLE `rssnews` (
  -- Surrogate key so individual rows can be addressed and ordered.
  `id` int unsigned NOT NULL auto_increment,
  `channeltitle` varchar(500) collate utf8_bin default NULL,
  `channellink` varchar(500) collate utf8_bin default NULL,
  -- Descriptions are free text and routinely exceed 500 characters, so they
  -- are stored as TEXT instead of being truncated or rejected.
  `channeldescription` text collate utf8_bin,
  `channelpubdate` varchar(500) collate utf8_bin default NULL,
  `itemtitle` varchar(500) collate utf8_bin default NULL,
  `itemlink` varchar(500) collate utf8_bin default NULL,
  `itemauthor` varchar(500) collate utf8_bin default NULL,
  `itemguid` varchar(500) collate utf8_bin default NULL,
  `itemcategory` varchar(500) collate utf8_bin default NULL,
  `itempubdate` varchar(500) collate utf8_bin default NULL,
  `itemcomments` varchar(500) collate utf8_bin default NULL,
  `itemdescription` text collate utf8_bin,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;

PHP进行RSS订阅

PHP进行RSS订阅 现在有很多的rss订阅,我们直接可以订阅到邮箱。既然学了PHP,那么有没有一种方法可以直接将rss的新闻信息显示在自己的网页上呢?有的,必须的,下面就是这个rss脚本: ...
  • amberom
  • amberom
  • 2015年01月12日 10:36
  • 831

php编写RSS源

记编写rss源 点点细雨     2013年11月26日星期二   为了提高搜索引擎的收录速度,今天开始编写rss源来增加对搜索引擎的友好。 废话就不多打了,毕竟我打字速度也不快(O(∩_∩)...
  • diandianxiyu
  • diandianxiyu
  • 2013年11月26日 14:36
  • 2774

网络爬虫,使用NodeJs抓取RSS新闻

提供RSS服务的站点超级多,百度、网易、新浪、虎嗅网 等等站点,基于java c++ php的rss抓取网上很多,今天说说NodeJs抓取RSS信息, 使用NodeJs做网络爬虫,抓取RSS新闻。各...
  • a442180673
  • a442180673
  • 2014年02月20日 17:06
  • 10740

使用wordpress快速搭建podcast资源发布平台

播客是iPod+broadcasting,是数字广播技术的一种,基本上是每个IOS设备都会默认安装。博客是2005年前后的产品,在当时推广的并不算太成功,最近又被抄火了。笔者查阅了很多资料,发现以前曾...
  • lq8841149
  • lq8841149
  • 2015年11月14日 17:38
  • 651

为什么需要RSS及如何使用

http://www.douban.com/note/203836734/ 如果你对此真的没兴趣,我真心建议你只看倒数第二段。 电梯:如果你只是需要看如何使用RSS可从【第四个标题】后开始看。...
  • hshl1214
  • hshl1214
  • 2016年01月26日 15:38
  • 2195

RSS是什么,RSS怎么玩,RSS原理是什么

http://www.cjjjs.com/paper/gzsh/201622721397372.aspx **********************************************...
  • zhao1949
  • zhao1949
  • 2016年10月13日 13:31
  • 587

网卡多队列技术与RSS功能介绍

多队列网卡是一种技术,最初是用来解决网络IO QoS (quality of service)问题的,后来随着网络IO的带宽的不断提升,单核CPU不能完全处满足网卡的需求,通过多队列网卡驱动的支持,将...
  • baidu_24553027
  • baidu_24553027
  • 2017年02月08日 15:44
  • 4429

利用UT的RSS第一时间自动下载TTG种子(转载)

转自:http://leo.eool.net/blog/archives/50 现有些人总是能在第一时间下到新发布的种子,哪怕是凌晨三点发布的。 你好奇他是怎么做到的吗? 其实很简单,...
  • u010794523
  • u010794523
  • 2013年11月29日 17:21
  • 3313

各大网站RSS订阅源地址

十大最值得订阅的中文RSS源 1、FT中文网  http://feeds.feedburner.com/ftchina 2、果壳网 http://www.guokr.com/rss...
  • Techzero
  • Techzero
  • 2013年06月03日 18:27
  • 15033

Java生成RSS-XML文件

详细看代码,部分涉及到隐私的就给删了,但是不影响功能,里面的日期等格式化,不知道的可以看我前面的博客   /** * author:humf */ import java.text.Si...
  • qq_22260641
  • qq_22260641
  • 2017年03月28日 17:26
  • 680
内容举报
返回顶部
收藏助手
不良信息举报
您举报文章:读RSS的PHP
举报原因:
原因补充:

(最多只允许输入30个字)