php DOM 解析

DOM    Document Object Model

    预定义常量

    下列常量由此扩展定义,且仅在此扩展编译入 PHP 或在运行时动态载入时可用。
    XML constants Constant     Value     Description
    XML_ELEMENT_NODE (integer)     1     Node is a DOMElement
    XML_ATTRIBUTE_NODE (integer)     2     Node is a DOMAttr
    XML_TEXT_NODE (integer)     3     Node is a DOMText
    XML_CDATA_SECTION_NODE (integer)     4     Node is a DOMCharacterData
    XML_ENTITY_REF_NODE (integer)     5     Node is a DOMEntityReference
    XML_ENTITY_NODE (integer)     6     Node is a DOMEntity
    XML_PI_NODE (integer)     7     Node is a DOMProcessingInstruction
    XML_COMMENT_NODE (integer)     8     Node is a DOMComment
    XML_DOCUMENT_NODE (integer)     9     Node is a DOMDocument
    XML_DOCUMENT_TYPE_NODE (integer)     10     Node is a DOMDocumentType
    XML_DOCUMENT_FRAG_NODE (integer)     11     Node is a DOMDocumentFragment
    XML_NOTATION_NODE (integer)     12     Node is a DOMNotation
    XML_HTML_DOCUMENT_NODE (integer)     13     
    XML_DTD_NODE (integer)     14     
    XML_ELEMENT_DECL_NODE (integer)     15     
    XML_ATTRIBUTE_DECL_NODE (integer)     16     
    XML_ENTITY_DECL_NODE (integer)     17     
    XML_NAMESPACE_DECL_NODE (integer)     18     
    XML_ATTRIBUTE_CDATA (integer)     1     
    XML_ATTRIBUTE_ID (integer)     2     
    XML_ATTRIBUTE_IDREF (integer)     3     
    XML_ATTRIBUTE_IDREFS (integer)     4     
    XML_ATTRIBUTE_ENTITY (integer)     5     
    XML_ATTRIBUTE_NMTOKEN (integer)     7     
    XML_ATTRIBUTE_NMTOKENS (integer)     8     
    XML_ATTRIBUTE_ENUMERATION (integer)     9     
    XML_ATTRIBUTE_NOTATION (integer)     10     
    DOMException constants Constant     Value     Description
    DOM_PHP_ERR (integer)     0     Error code not part of the DOM specification. Meant for PHP errors.
    DOM_INDEX_SIZE_ERR (integer)     1     If index or size is negative, or greater than the allowed value.
    DOMSTRING_SIZE_ERR (integer)     2     If the specified range of text does not fit into a DOMString.
    DOM_HIERARCHY_REQUEST_ERR (integer)     3     If any node is inserted somewhere it doesn't belong
    DOM_WRONG_DOCUMENT_ERR (integer)     4     If a node is used in a different document than the one that created it.
    DOM_INVALID_CHARACTER_ERR (integer)     5     If an invalid or illegal character is specified, such as in a name.
    DOM_NO_DATA_ALLOWED_ERR (integer)     6     If data is specified for a node which does not support data.
    DOM_NO_MODIFICATION_ALLOWED_ERR (integer)     7     If an attempt is made to modify an object where modifications are not allowed.
    DOM_NOT_FOUND_ERR (integer)     8     If an attempt is made to reference a node in a context where it does not exist.
    DOM_NOT_SUPPORTED_ERR (integer)     9     If the implementation does not support the requested type of object or operation.
    DOM_INUSE_ATTRIBUTE_ERR (integer)     10     If an attempt is made to add an attribute that is already in use elsewhere.
    DOM_INVALID_STATE_ERR (integer)     11     If an attempt is made to use an object that is not, or is no longer, usable.
    DOM_SYNTAX_ERR (integer)     12     If an invalid or illegal string is specified.
    DOM_INVALID_MODIFICATION_ERR (integer)     13     If an attempt is made to modify the type of the underlying object.
    DOM_NAMESPACE_ERR (integer)     14     If an attempt is made to create or change an object in a way which is incorrect with regard to namespaces.
    DOM_INVALID_ACCESS_ERR (integer)     15     If a parameter or an operation is not supported by the underlying object.
    DOM_VALIDATION_ERR (integer)     16     If a call to a method such as insertBefore or removeChild would make the Node invalid with respect to "partial validity", this exception would be raised and the operation would not be done.
    
    
DOMNode {
    /* 属性 */
    public readonly string $nodeName ;              //Returns the most accurate name for the current node type
    public string $nodeValue ;                      //The value of this node, depending on its type
    public readonly int $nodeType ;                 //Gets the type of the node. One of the predefined XML_xxx_NODE constants
    public readonly DOMNode $parentNode ;           //The parent of this node
    public readonly DOMNodeList $childNodes ;       //A DOMNodeList that contains all children of this node. If there are no children,
                                                            //this is an   empty DOMNodeList.
    public readonly DOMNode $firstChild ;           //The first child of this node. If there is no such node, this returns NULL.
    public readonly DOMNode $lastChild ;            //The last child of this node. If there is no such node, this returns NULL.
    public readonly DOMNode $previousSibling ;      //The node immediately preceding this node. If there is no such node, this returns NULL.
    public readonly DOMNode $nextSibling ;          //The node immediately following this node. If there is no such node, this returns NULL.
    public readonly DOMNamedNodeMap $attributes ;   //A DOMNamedNodeMap containing the attributes of this node (if it is a DOMElement)
                                                            //or NULL otherwise.
    public readonly DOMDocument $ownerDocument ;    //The DOMDocument object associated with this node.
    public readonly string $namespaceURI ;          //The namespace URI of this node, or NULL if it is unspecified.
    public string $prefix ;                         //The namespace prefix of this node, or NULL if it is unspecified.
    public readonly string $localName ;             //Returns the local part of the qualified name of this node.
    public readonly string $baseURI ;               //The absolute base URI of this node or NULL if the implementation wasn't able to
                                                             //obtain an absolute URI.
    public readonly string $textContent ;           //This attribute returns the text content of this node and its descendants.
    
    /* 方法 */
    public DOMNode appendChild ( DOMNode $newnode )      Adds new child at the end of the children
    public string C14N ([ bool $exclusive [, bool $with_comments [, array $xpath [, array $ns_prefixes ]]]] )   Canonicalize nodes to a string
    public int C14NFile ( string $uri [, bool $exclusive [, bool $with_comments [, array $xpath [, array $ns_prefixes ]]]] )    Canonicalize nodes to a file
    public DOMNode cloneNode ([ bool $deep=False ] )       Clones a node
    public int getLineNo ( void )                Get line number for a node
    public string getNodePath ( void )          Get an XPath for a node
    public bool hasAttributes ( void )          Checks if node has attributes
    public bool hasChildNodes ( void )           Checks if node has children
    public DOMNode insertBefore ( DOMNode $newnode [, DOMNode $refnode ] )       Adds a new child before a reference node
    public bool isDefaultNamespace ( string $namespaceURI )              Checks if the specified namespaceURI is the default namespace or not
    public bool isSameNode ( DOMNode $node )                         Indicates if two nodes are the same node
    public bool isSupported ( string $feature , string $version )        Checks if feature is supported for specified version
    public string lookupNamespaceURI ( string $prefix )                 Gets the namespace URI of the node based on the prefix
    public string lookupPrefix ( string $namespaceURI )                  Gets the namespace prefix of the node based on the namespace URI
    public void normalize ( void )                                      Normalizes the node
    public DOMNode removeChild ( DOMNode $oldnode )                      Removes child from list of children
    public DOMNode replaceChild ( DOMNode $newnode , DOMNode $oldnode )     Replaces a child
}


DOMNamedNodeMap implements Traversable {
    /* 属性 */
    readonly public int $length ;       The number of nodes in the map. The range of valid child node indices is 0 to length - 1 inclusive.
    /* 方法 */
    DOMNode getNamedItem ( string $name )       Retrieves a node specified by name
    DOMNode getNamedItemNS ( string $namespaceURI , string $localName )     Retrieves a node specified by local name and namespace URI
    DOMNode item ( int $index )     Retrieves a node specified by index
}


DOMNotation extends DOMNode {
    /* 属性 */
    readonly public string $publicId ;
    readonly public string $systemId ;
}


DOMProcessingInstruction extends DOMNode {
        /* 属性 */
        readonly public string $target ;
        public string $data ;
        /* 方法 */
        public __construct ( string $name [, string $value ] )
 }

DOMAttr 表示DOMElement对象属性.
DOMAttr extends DOMNode {
    /* 属性 */
    public readonly string $name ;      //属性名
    public readonly DOMElement $ownerElement ;      //包含该属性的元素
    public readonly bool $schemaTypeInfo ;          //目前尚未实现,始终为NULL
    public readonly bool $specified ;               //目前尚未实现,始终为NULL
    public string $value ;                          //属性值
    /* 方法 */
    public __construct ( string $name [, string $value ] )
    public bool isId ( void )
    
}

DOMNodeList implements Traversable {
    /* 属性 */
    readonly public int $length ;       The number of nodes in the list. The range of valid child node indices is 0 to length - 1 inclusive.
    /* 方法 */
    DOMNode item ( int $index )        Retrieves a node specified by index
}


DOMXPath {
    /* 属性 */
    public DOMDocument $document ;
    /* 方法 */
    public __construct ( DOMDocument $doc )
    public mixed evaluate ( string $expression [, DOMNode $contextnode [, bool $registerNodeNS = true ]] )      Evaluates the given XPath expression and returns a typed result if possible
    public DOMNodeList query ( string $expression [, DOMNode $contextnode [, bool $registerNodeNS = true ]] )       Evaluates the given XPath expression
    public bool registerNamespace ( string $prefix , string $namespaceURI )     Registers the namespace with the DOMXPath object
    public void registerPhpFunctions ([ mixed $restrict ] )     Register PHP functions as XPath functions
}
 
DOMElement extends DOMNode {
    /* 属性 */
    readonly public bool $schemaTypeInfo ;      Not implemented yet, always returns NULL
    readonly public string $tagName ;           The element name
    /* 方法 */    
    public __construct ( string $name [, string $value [, string $namespaceURI ]] )  Creates a new DOMElement object   
    public string getAttribute ( string $name )             
    public DOMAttr getAttributeNode ( string $name )        
    public DOMAttr getAttributeNodeNS ( string $namespaceURI , string $localName )
    public string getAttributeNS ( string $namespaceURI , string $localName )
    public DOMNodeList getElementsByTagName ( string $name )
    public DOMNodeList getElementsByTagNameNS ( string $namespaceURI , string $localName )
    public bool hasAttribute ( string $name )
    public bool hasAttributeNS ( string $namespaceURI , string $localName )
    public bool removeAttribute ( string $name )
    public bool removeAttributeNode ( DOMAttr $oldnode )
    public bool removeAttributeNS ( string $namespaceURI , string $localName )
    public DOMAttr setAttribute ( string $name , string $value )
    public DOMAttr setAttributeNode ( DOMAttr $attr )
    public DOMAttr setAttributeNodeNS ( DOMAttr $attr )
    public void setAttributeNS ( string $namespaceURI , string $qualifiedName , string $value )
    public void setIdAttribute ( string $name , bool $isId )
    public void setIdAttributeNode ( DOMAttr $attr , bool $isId )
    public void setIdAttributeNS ( string $namespaceURI , string $localName , bool $isId )
}

DOMCharacterData        Represents nodes with character data. No nodes directly correspond to this class, but other nodes do inherit from it.
DOMCharacterData extends DOMNode {      
        
    /* 属性 */
    public string $data ;       The contents of the node.
    readonly public int $length ;       The length of the contents.
    /* 方法 */
    void appendData ( string $data )        Append the string to the end of the character data of the node
    void deleteData ( int $offset , int $count )        Remove a range of characters from the node
    void insertData ( int $offset , string $data )      Insert a string at the specified 16-bit unit offset
    void replaceData ( int $offset , int $count , string $data )        Replace a substring within the DOMCharacterData node
    string substringData ( int $offset , int $count )       Extracts a range of data from the node
}


DOMText         The DOMText class inherits from DOMCharacterData and represents the textual content of a DOMElement or DOMAttr.
DOMText extends DOMCharacterData {
    /* 属性 */
    readonly public string $wholeText ;     Holds all the text of logically-adjacent (not separated by Element, Comment or Processing Instruction) Text nodes.
    
    /* 方法 */
    public __construct ([ string $value ] )     Creates a new DOMText object
    public bool isWhitespaceInElementContent ( void )        Indicates whether this text node contains whitespace
    public DOMText splitText ( int $offset )        Breaks this node into two nodes at the specified offset
}


DOMCdataSection     The DOMCdataSection inherits from DOMText for textual representation of CData constructs.
DOMCdataSection extends DOMText {
    /* 方法 */
    public __construct ( string $value )
}

DOMComment  Represents comment nodes, characters delimited by <!-- and -->.
DOMComment extends DOMCharacterData {
    /* 方法 */
    public __construct ([ string $value ] )
}




DOMDocumentType     Each DOMDocument has a doctype attribute whose value is either NULL or a DOMDocumentType object.
DOMDocumentType extends DOMNode {
    /* 属性 */
    readonly public string $publicId ;      The public identifier of the external subset.
    readonly public string $systemId ;      The system identifier of the external subset. This may be an absolute URI or not.
    readonly public string $name ;          The name of DTD; i.e., the name immediately following the DOCTYPE keyword.
    readonly public DOMNamedNodeMap $entities ;     A DOMNamedNodeMap containing the general entities, both external and internal, declared in the DTD.
    readonly public DOMNamedNodeMap $notations ;        A DOMNamedNodeMap containing the notations declared in the DTD.
    readonly public string $internalSubset ;        The internal subset as a string, or null if there is none. This does not contain the delimiting square brackets.
}


DOMDocumentFragment extends DOMNode {
    public bool appendXML ( string $data )      Append raw XML data
}


DOMEntity  This interface represents a known entity, either parsed or unparsed, in an XML document.
DOMEntity extends DOMNode {
    /* 属性 */
    readonly public string $publicId ;      The public identifier associated with the entity if specified, and NULL otherwise.
    readonly public string $systemId ;      The system identifier associated with the entity if specified, and NULL otherwise. This may be an absolute URI or not.
    readonly public string $notationName ;      For unparsed entities, the name of the notation for the entity. For parsed entities, this is NULL.
    public string $actualEncoding ;         An attribute specifying the encoding used for this entity at the time of parsing, when it is an external parsed entity. This is NULL if it is an entity from the internal subset or if it is not known.
    readonly public string $encoding ;      An attribute specifying, as part of the text declaration, the encoding of this entity, when it is an external parsed entity. This is NULL otherwise.
    readonly public string $version ;       An attribute specifying, as part of the text declaration, the version number of this entity, when it is an external parsed entity. This is NULL otherwise.
}

DOMEntityReference extends DOMNode {
    public __construct ( string $name )
}

Exception {
    /* 属性 */
    protected string $message ;     异常消息内容
    protected int $code ;           异常代码
    protected string $file ;        抛出异常的文件名
    protected int $line ;           抛出异常在该文件中的行号
    /* 方法 */
    public __construct ([ string $message = "" [, int $code = 0 [, Exception $previous = NULL ]]] )
    final public string getMessage ( void )         获取异常消息内容
    final public Exception getPrevious ( void )     返回异常链中的前一个异常
    final public int getCode ( void )               获取异常代码
    final public string getFile ( void )            获取发生异常的程序文件名称
    final public int getLine ( void )               获取发生异常的代码在文件中的行号
    final public array getTrace ( void )            获取异常追踪信息
    final public string getTraceAsString ( void )        获取字符串类型的异常追踪信息
    public string __toString ( void )               将异常对象转换为字符串
    final private void __clone ( void )             异常克隆
}
DOMException    DOM operations raise exceptions under particular circumstances, i.e., when an operation is impossible to perform for logical reasons.
DOMException extends Exception {
    /* 属性 */
    readonly public int $code ;     An integer indicating the type of error generated
}



DOMImplementation   The DOMImplementation interface provides a number of methods for performing operations that are independent of any particular instance of the document object model.
DOMImplementation {
    /* 方法 */
    __construct ( void )
    public DOMDocument createDocument ([ string $namespaceURI = NULL [, string $qualifiedName = NULL [, DOMDocumentType $doctype = NULL ]]] )       Creates a DOMDocument object of the specified type with its document element
    public DOMDocumentType createDocumentType ([ string $qualifiedName = NULL [, string $publicId = NULL [, string $systemId = NULL ]]] )           Creates an empty DOMDocumentType object
    public bool hasFeature ( string $feature , string $version )        Test if the DOM implementation implements a specific feature
}



DOMDocument extends DOMNode {
    /* 属性 */
    readonly public string $actualEncoding ;        Deprecated. Actual encoding of the document, is a readonly equivalent to encoding.
    readonly public DOMConfiguration $config ;      Deprecated. Configuration used when DOMDocument::normalizeDocument() is invoked.
    readonly public DOMDocumentType $doctype ;      The Document Type Declaration associated with this document.
    readonly public DOMElement $documentElement ;       This is a convenience attribute that allows direct access to the child node that is the document element of the document.
    public string $documentURI ;                The location of the document or NULL if undefined.
    public string $encoding ;       Encoding of the document, as specified by the XML declaration. This attribute is not present in the final DOM Level 3 specification, but is the only way of manipulating XML document encoding in this implementation.
    public bool $formatOutput ;     Nicely formats output with indentation and extra space.
    readonly public DOMImplementation $implementation ;         The DOMImplementation object that handles this document.
    public bool $preserveWhiteSpace = true ;        Do not remove redundant white space. Default to TRUE.
    public bool $recover ;                  Proprietary. Enables recovery mode, i.e. trying to parse non-well formed documents. This attribute is not part of the DOM specification and is specific to libxml.
    public bool $resolveExternals ;     Set it to TRUE to load external entities from a doctype declaration. This is useful for including character entities in your XML document.
    public bool $standalone ;           Deprecated. Whether or not the document is standalone, as specified by the XML declaration, corresponds to xmlStandalone.
    public bool $strictErrorChecking = true ;       Throws DOMException on errors. Default to TRUE.
    public bool $substituteEntities ;           Proprietary. Whether or not to substitute entities. This attribute is not part of the DOM specification and is specific to libxml.
    public bool $validateOnParse = false ;      Loads and validates against the DTD. Default to FALSE.
    public string $version ;                Deprecated. Version of XML, corresponds to xmlVersion.
    readonly public string $xmlEncoding ;       An attribute specifying, as part of the XML declaration, the encoding of this document. This is NULL when unspecified or when it is not known, such as when the Document was created in memory.
    public bool $xmlStandalone ;        An attribute specifying, as part of the XML declaration, whether this document is standalone. This is FALSE when unspecified.
    public string $xmlVersion ;     An attribute specifying, as part of the XML declaration, the version number of this document. If there is no declaration and if this document supports the "XML" feature, the value is "1.0".
    
    /* 方法 */
    public __construct ([ string $version [, string $encoding ]] )
    public DOMAttr createAttribute ( string $name )
    public DOMAttr createAttributeNS ( string $namespaceURI , string $qualifiedName )       Create new attribute node with an associated namespace
    public DOMCDATASection createCDATASection ( string $data )
    public DOMComment createComment ( string $data )
    public DOMDocumentFragment createDocumentFragment ( void )
    public DOMElement createElement ( string $name [, string $value ] )
    public DOMElement createElementNS ( string $namespaceURI , string $qualifiedName [, string $value ] )
    public DOMEntityReference createEntityReference ( string $name )
    public DOMProcessingInstruction createProcessingInstruction ( string $target [, string $data ] )
    public DOMText createTextNode ( string $content )
    public DOMElement getElementById ( string $elementId )      Searches for an element with a certain id
    public DOMNodeList getElementsByTagName ( string $name )        Searches for all elements with given local tag name
    public DOMNodeList getElementsByTagNameNS ( string $namespaceURI , string $localName )      Searches for all elements with given tag name in specified namespace
    public DOMNode importNode ( DOMNode $importedNode [, bool $deep ] )     Import node into current document
    public mixed load ( string $filename [, int $options = 0 ] )             Load XML from a file
    public bool loadHTML ( string $source [, int $options = 0 ] )       Load HTML from a string
    public bool loadHTMLFile ( string $filename [, int $options = 0 ] )     Load HTML from a file
    public mixed loadXML ( string $source [, int $options = 0 ] )            Load XML from a string

    public void normalizeDocument ( void )                          Normalizes the document
    public bool registerNodeClass ( string $baseclass , string $extendedclass )     Register extended class used to create base node type
    public bool relaxNGValidate ( string $filename )                    Performs relaxNG validation on the document
    public bool relaxNGValidateSource ( string $source )                Performs relaxNG validation on the document
    public int save ( string $filename [, int $options ] )       Dumps the internal XML tree back into a file
    public string saveHTML ([ DOMNode $node = NULL ] )      Dumps the internal document into a string using HTML formatting
    public int saveHTMLFile ( string $filename )            Dumps the internal document into a file using HTML formatting
    public string saveXML ([ DOMNode $node [, int $options ]] )     Dumps the internal XML tree back into a string

    public bool schemaValidate ( string $filename [, int $flags ] )     Validates a document based on a schema
    public bool schemaValidateSource ( string $source [, int $flags ] )     Validates a document based on a schema
    public bool validate ( void )                   Validates the document based on its DTD
    public int xinclude ([ int $options ] )         Substitutes XIncludes in a DOMDocument Object

}



<?php
// URL of the Douban movie category page that the script requests via getPage().
$url = 'http://movie.douban.com/category/' ;
/**
 * Download a page over HTTP with cURL and hand back the response body.
 *
 * The request is sent with browser-like headers, a Googlebot user agent
 * string and a Baidu referer, accepts gzip/deflate encoded responses and
 * is limited by a timeout of 10.
 *
 * @param string $url Address of the page to request.
 * @return string|bool The result of curl_exec(): the body as a string
 *                     (CURLOPT_RETURNTRANSFER is enabled), or false when
 *                     the transfer failed.
 */
function getPage($url)
{
    $requestHeaders = [
        "Accept: text/xml,application/xml,application/xhtml+xml,"
            . "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Cache-Control: max-age=0",
        "Connection: keep-alive",
        "Keep-Alive: 300",
        "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Language: en-us,en;q=0.5",
        "Pragma: ", // browsers keep this blank.
    ];

    // Listed in the order in which the options are applied to the handle.
    $settings = [
        CURLOPT_URL            => $url,
        CURLOPT_USERAGENT      => 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        CURLOPT_HTTPHEADER     => $requestHeaders,
        CURLOPT_REFERER        => 'http://www.baidu.com',
        CURLOPT_ENCODING       => 'gzip,deflate',
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_RETURNTRANSFER => 1, // return the body instead of printing it
        CURLOPT_TIMEOUT        => 10,
    ];

    $handle = curl_init();
    foreach ($settings as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}

// Download the page; getPage() returns curl_exec()'s result, i.e. false when the transfer failed.
$text = getPage($url);

// Scraped HTML is rarely well-formed: buffer libxml's parse errors rather than
// letting every one of them surface as a PHP warning.
$previousErrorMode = libxml_use_internal_errors(true);

$xml = new DOMDocument();
// Parse the markup that was just fetched (it used to be discarded); fall back
// to the local snapshot only when the download produced nothing.
$loaded = (is_string($text) && $text !== '')
    ? $xml->loadHTML($text)
    : $xml->loadHTMLFile("test.html");

// Drop the buffered parse errors and restore the previous libxml error mode.
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);

if ($loaded === false) {
    echo "Unable to load an HTML document to parse\n";
} else {
    $xml->encoding = 'UTF-8';
    $xpath = new DOMXPath($xml);

    // Select every <ul> directly inside the <div id="type"> found anywhere under <body>.
    $elements = $xpath->query("/html/body//div[@id='type']/ul");
    var_dump($elements);

    // query() yields false for an expression it cannot evaluate; iterate only over a real node list.
    if ($elements !== false) {
        foreach ($elements as $e) {
            // Dump the value of each child node of the matched list.
            foreach ($e->childNodes as $es) {
                var_dump($es->nodeValue);
            }
        }
    }
}
?>


没有更多推荐了,返回首页