利用PHP获取一个页面上的链接信息-CSDN博客

开发中我们经常需要获取某个页面或一段内容中的链接信息，下面分享一个我写的函数，希望能帮到大家。

函数功能：

1、获取一段内容中链接信息；

2、获取一个URL中链接信息；

3、剔除锚链等无效的链接

4、获取当前域下的链接信息

5、获取他域下的链接信息

6、保留链接的文本信息

代码：

 
    
         /**
          * Collect the links (<a href="...">text</a>) found in an HTML fragment or on a web page.
          *
          * When $html consists of a single http(s) URL, the page is downloaded with
          * file_get_contents() and its markup is scanned; otherwise $html itself is scanned.
          * Only anchors whose content is plain text are recognised (anchors wrapping other
          * tags, such as images, are skipped). The href may use double or single quotes.
          * Both the href and the link text are trimmed.
          *
          * @param string $html           Markup to scan, or the URL of a page to download and scan.
          * @param bool   $isExclude      Drop placeholder hrefs ("", "#", "javascript:;",
          *                               "javascript:void(0);", "javascript:void(0)"). Default true.
          * @param bool   $isKeepLinkText true: return a map of link text => href (links sharing the
          *                               same text collapse into one entry, the last kept one wins);
          *                               false: return a plain list of hrefs. Default true.
          * @param string $linkType       'all'   = every link (default);
          *                               'inner' = relative links and links whose host equals
          *                                         $_SERVER['SERVER_NAME'];
          *                               'out'   = links whose host differs from $_SERVER['SERVER_NAME'].
          *                               Links with a scheme but no host (mailto:, javascript:, ...)
          *                               and unparseable hrefs belong to neither 'inner' nor 'out'.
          *
          * @return array|false The links (possibly an empty array), or false when $html is empty
          *                     or the remote page could not be downloaded.
          */
         function getLinks($html, $isExclude = true, $isKeepLinkText = true, $linkType = 'all')
         {
             if (empty($html)) {
                 return false;
             }

             // Downloading a slow remote page must not trip the script time limit.
             set_time_limit(0);

             // Placeholder hrefs that do not point anywhere (anchors, no-op scripts).
             $removes = array('', '#', 'javascript:;', 'javascript:void(0);', 'javascript:void(0)');

             // Treat the input as a URL only when the whole string is one http(s) URL;
             // content that merely begins with "http" is scanned as markup.
             $source = trim($html);
             if (preg_match('#^https?://\S+$#i', $source)) {
                 $html = file_get_contents($source);
                 if ($html === false) {
                     return false;
                 }
             }

             // Group 1: href value (the (?| ... ) branch reset lets both quote styles share group 1).
             // Group 2: link text (plain text only, no nested tags).
             // "<a\s" avoids matching <abbr>/<area>, and [^>] keeps the match inside one tag.
             $pattern = '/<a\s(?:[^>]*?\s)?href\s*=\s*(?|"([^"]*)"|\'([^\']*)\')[^>]*>([^<]*)<\/a>/i';
             if (!preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
                 return array(); // no anchors found (or the regex engine gave up)
             }

             // SERVER_NAME is not defined under CLI; an empty name makes every absolute link external.
             $serverName = isset($_SERVER['SERVER_NAME']) ? strtolower($_SERVER['SERVER_NAME']) : '';

             $links = array();
             foreach ($matches as $match) {
                 $href = trim($match[1]);
                 $text = trim($match[2]);

                 if ($isExclude && in_array($href, $removes, true)) {
                     continue;
                 }

                 if ($linkType === 'inner' || $linkType === 'out') {
                     $parts = parse_url($href);
                     if (!is_array($parts)) {
                         continue; // unparseable href: cannot be classified
                     }
                     $host = isset($parts['host']) ? strtolower($parts['host']) : '';
                     if ($host === '') {
                         // No host: a relative reference is internal; mailto:/javascript: etc. are neither.
                         $isInner = !isset($parts['scheme']);
                         $isOuter = false;
                     } else {
                         $isInner = ($host === $serverName);
                         $isOuter = !$isInner;
                     }
                     if ($linkType === 'inner' && !$isInner) {
                         continue;
                     }
                     if ($linkType === 'out' && !$isOuter) {
                         continue;
                     }
                 }

                 if ($isKeepLinkText) {
                     $links[$text] = $href;
                 } else {
                     $links[] = $href;
                 }
             }

             return $links;
         }
        
 
  

使用方法：

 
         // Example 1: pass a page URL and use the default options.
         $pageUrl = "http://www.sina.com.cn";
         $links = getLinks($pageUrl);

         // Example 2: same page with explicit options, in parameter order:
         // $isExclude = 1, $isKeepLinkText = 0, $linkType = "out".
         $links = getLinks($pageUrl, 1, 0, "out");

         // Example 3: pass the content to scan directly instead of a URL.
         $content = "这里是一段要提取链接信息的内容";
         $links = getLinks($content);

特别说明：

1、上面的函数用到了file_get_contents ，获取内容可能会失败，你可以自行改成curl；

2、提取链接用了正则，效率可能低。

当然，你也可以使用下面的函数：当要提取的内容是网址时，它不使用正则，而是用 DOM 来提取链接信息：

代码：

 
    
         /**
          * Collect the links found on a web page (via DOM) or in an HTML fragment (via regex).
          *
          * When $html consists of a single http(s) URL, the page is downloaded with
          * file_get_contents() and parsed with DOMDocument/DOMXPath, so every <a> element in
          * the body is seen, including anchors that wrap other tags. Otherwise $html itself is
          * scanned with a regular expression, which only recognises anchors whose content is
          * plain text; the href may use double or single quotes. Both the href and the link
          * text are trimmed.
          *
          * @param string $html           Markup to scan, or the URL of a page to download and scan.
          * @param bool   $isExclude      Drop placeholder hrefs ("", "#", "javascript:;",
          *                               "javascript:void(0);", "javascript:void(0)"). Default true.
          * @param bool   $isKeepLinkText true: return a map of link text => href (links sharing the
          *                               same text collapse into one entry, the last kept one wins);
          *                               false: return a plain list of hrefs. Default true.
          * @param string $linkType       'all'   = every link (default);
          *                               'inner' = relative links and links whose host equals
          *                                         $_SERVER['SERVER_NAME'];
          *                               'out'   = links whose host differs from $_SERVER['SERVER_NAME'].
          *                               Links with a scheme but no host (mailto:, javascript:, ...)
          *                               and unparseable hrefs belong to neither 'inner' nor 'out'.
          *
          * @return array|false The links (possibly an empty array), or false when $html is empty
          *                     or the remote page could not be downloaded.
          */
         function getLinks($html, $isExclude = true, $isKeepLinkText = true, $linkType = 'all')
         {
             if (empty($html)) {
                 return false;
             }

             // Downloading a slow remote page must not trip the script time limit.
             set_time_limit(0);

             // Placeholder hrefs that do not point anywhere (anchors, no-op scripts).
             $removes = array('', '#', 'javascript:;', 'javascript:void(0);', 'javascript:void(0)');

             // Treat the input as a URL only when the whole string is one http(s) URL;
             // content that merely begins with "http" is scanned as markup.
             $source = trim($html);
             $isLink = (bool) preg_match('#^https?://\S+$#i', $source);

             // Each candidate is array(href, text); filtering happens once, further down.
             $candidates = array();

             if ($isLink) {
                 $html = file_get_contents($source);
                 if ($html === false) {
                     return false;
                 }
                 if (trim($html) === '') {
                     return array(); // loadHTML() rejects empty input, and there is nothing to scan
                 }

                 // Real-world markup is rarely valid; collect parser warnings instead of emitting them.
                 $previous = libxml_use_internal_errors(true);
                 $dom = new DOMDocument();
                 $loaded = $dom->loadHTML($html);
                 libxml_clear_errors();
                 libxml_use_internal_errors($previous);
                 if (!$loaded) {
                     return array();
                 }

                 $xpath = new DOMXPath($dom);
                 foreach ($xpath->evaluate('/html/body//a') as $anchor) {
                     $candidates[] = array(
                         trim($anchor->getAttribute('href')),
                         trim($anchor->textContent),
                     );
                 }
             } else {
                 // Group 1: href value (the (?| ... ) branch reset lets both quote styles share group 1).
                 // Group 2: link text (plain text only, no nested tags).
                 // "<a\s" avoids matching <abbr>/<area>, and [^>] keeps the match inside one tag.
                 $pattern = '/<a\s(?:[^>]*?\s)?href\s*=\s*(?|"([^"]*)"|\'([^\']*)\')[^>]*>([^<]*)<\/a>/i';
                 if (preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
                     foreach ($matches as $match) {
                         $candidates[] = array(trim($match[1]), trim($match[2]));
                     }
                 }
             }

             // SERVER_NAME is not defined under CLI; an empty name makes every absolute link external.
             $serverName = isset($_SERVER['SERVER_NAME']) ? strtolower($_SERVER['SERVER_NAME']) : '';

             $links = array();
             foreach ($candidates as $candidate) {
                 $href = $candidate[0];
                 $text = $candidate[1];

                 if ($isExclude && in_array($href, $removes, true)) {
                     continue;
                 }

                 if ($linkType === 'inner' || $linkType === 'out') {
                     $parts = parse_url($href);
                     if (!is_array($parts)) {
                         continue; // unparseable href: cannot be classified
                     }
                     $host = isset($parts['host']) ? strtolower($parts['host']) : '';
                     if ($host === '') {
                         // No host: a relative reference is internal; mailto:/javascript: etc. are neither.
                         $isInner = !isset($parts['scheme']);
                         $isOuter = false;
                     } else {
                         $isInner = ($host === $serverName);
                         $isOuter = !$isInner;
                     }
                     if ($linkType === 'inner' && !$isInner) {
                         continue;
                     }
                     if ($linkType === 'out' && !$isOuter) {
                         continue;
                     }
                 }

                 if ($isKeepLinkText) {
                     $links[$text] = $href;
                 } else {
                     $links[] = $href;
                 }
             }

             return $links;
         }