A DEMO script on how to use CPAN module HTML::LinkExtor

最新推荐文章于 2021-05-13 22:41:25 发布

ThinkHY

最新推荐文章于 2021-05-13 22:41:25 发布

阅读量856

点赞数

分类专栏：代码之美 Perl WEB技术大型机文章标签： html module callback url parsing thread

本文链接：https://blog.csdn.net/ThinkHY/article/details/7685902

版权

Perl 同时被 3 个专栏收录

44 篇文章 0 订阅

订阅专栏

代码之美

43 篇文章 0 订阅

订阅专栏

WEB技术

20 篇文章 0 订阅

订阅专栏

Last week I got a large book, "Perl Cookbook". The book mentions a useful module, HTML::LinkExtor, which seems handy. I wanted to crawl some docs from the MVS-OE archive web page, so I wrote a small script that demonstrates how to use the module.

 
 
  
  use LWP::Simple;
  use HTML::LinkExtor;
  use URI::URL;

  # Emit UTF-8 on standard output.
  binmode STDOUT, ':utf8';

  # Index page the crawl starts from, and the base URL handed to
  # extract_link when processing each page linked from that index.
  my $url  = "http://www2.marist.edu/htbin/wlvindex?mvs-oe";
  my $base = "http://www2.marist.edu/htbin";

  # Level 1: collect the href of every <a> on the index page.
  # An empty base is passed here; extract_link decides what to use instead.
  my $ref_links = extract_link($url, "", "a", "href");

  for my $sub_url (@{$ref_links}) {
      print "Parsing sub url: ", $sub_url, "\n";

      # Level 2: collect the <a href> links on each page found above,
      # resolved against $base.
      my $thread_links = extract_link($sub_url, $base, "a", "href");

      for my $thread_url (@{$thread_links}) {
          print "GET\n";
          print $thread_url, "\n";

          # Fetch the document; the returned content is not kept.
          get($thread_url);
      }
  }
 
 
 
 
  
  

 
 
 
 
  
  

 
 
 
 
  
  # extract_link($url, $base, $mytag, $attr_name)
  #
  # Fetch $url and collect, for every link-bearing <$mytag> element that
  # HTML::LinkExtor reports, the value of its $attr_name attribute.
  #
  #   $url       - address of the document to download and scan
  #   $base      - base URL used to make the links absolute; when false
  #                (e.g. ""), the base of the HTTP response is used instead
  #   $mytag     - lower-case tag name to keep, such as "a" or "img"
  #   $attr_name - attribute holding the link, such as "href" or "src"
  #
  # Returns a reference to an array of absolute URLs (URI::URL objects).
  # If the request fails a warning is printed and whatever was collected
  # (normally nothing) is returned.
  sub extract_link
  {
      my ($url, $base, $mytag, $attr_name) = @_;

      # Make sure the classes used below are loaded even if the caller
      # only pulled in LWP::Simple.
      require LWP::UserAgent;
      require HTTP::Request;

      # Drop one trailing slash from the caller-supplied base.
      $base = '' unless defined $base;
      $base =~ s{/$}{};

      my $ua = LWP::UserAgent->new;

      # Collector for the raw (possibly relative) link values.  The
      # callback must be an anonymous closure: a named sub nested here
      # would stay bound to the variables of the first call only.
      my @links;
      my $collect = sub {
          my ($tag, %attr) = @_;
          return if $tag ne $mytag;        # not the tag we were asked for
          my $link = $attr{$attr_name};
          push @links, $link if defined $link;
          return;
      };

      my $parser = HTML::LinkExtor->new($collect);

      # Request the document and feed each chunk to the parser as it arrives.
      my $res = $ua->request(
          HTTP::Request->new(GET => $url),
          sub { $parser->parse($_[0]) },
      );
      $parser->eof;                        # flush anything still buffered

      # request() always returns a response object, so check its status.
      warn "GET $url failed: ", $res->status_line, "\n"
          unless $res->is_success;

      # Expand every collected link to an absolute URL.
      $base ||= $res->base;
      my @absolute = map { url($_, $base)->abs } @links;

      return \@absolute;
  }

delete this gist