一个perl写的web采集程序

#!/usr/bin/perl

use warnings;
use strict;

#################### Modules ######################

use LWP::UserAgent;
use HTTP::Cookies;

use Win32::OLE qw(in with);
use Win32::OLE::Const 'Microsoft Excel';

################### variables #####################

my $base_url = 'http://www.xxxxxx.com';

my (%primary_catalog_link,%slave_catalog_link,%product_link);

# match target:
my $primary_re = qr/<strong><a href=/"(//product//cn//p//c(?:[0-9]{1,2}))///" class=blue_sell2_1 title=/".+/">(.+)<//a>( <//strong>)?<br>/s*$/;

# match target:
my $slave_re = qr/<td width=/"30/%/" bgcolor=/"/#FFFFFF/"><a href=/"(.+)/"/s{1,2}class=/"blue/">(.+)<//a><//td>/s*$/;

# match target:
my $product_re = qr/<li><a href=/'(//product//cn//pc//p--.*/.html)/'/s+class=blue/starget=_blank><h>(.+)<//h><//a><//li>/s+<//td>/s*$/;

# match target:
my $nextpage_re = qr/<a href=/"(index[2-9]/.html)/">下一页<//a>/;

# match target:
my $chinese_name_re = qr/<td width="28%" align="center" class="td1"><strong>中文名称<//strong>:<//td>/r/n/s*<td width="72%" class=tt1><strong>(.+)<//strong><//td>/;

# match target:
my $english_name_re = qr/<td align="center" bgcolor="/#FFFFFF" class="td1"><strong>英文名称<//strong>:<//td>/r/n/s*<td class=tt1>(.+)<//td>/;

# match target:
my $chinese_alias_re = qr/<td align="center" class="td1"><strong>中文别名<//strong>:<//td>/r/n/s*<td class=tt1>(.+)<//td>/;

# match target: 
my $CAS_RN_re = qr/<td align="center" class="td1"><strong>CAS RN.<//strong>:<//td>/r/n/s*<td class=tt1>(.+)<//td>/;

# match target: 
my $moscular_re = qr/<td align="center" class="td1"><strong>分 子 式<//strong>:<//td>/r/n/s*<td class=tt1>(.+)<//td>/;

# match target:
my $phyical_property_re = qr/<td colspan="2" style="padding:6"><strong><font color="#545454">物化性质<//font><//strong>: &nbsp;<br>/r/n/s*<div style="margin-left:13">(.+(/r/n.*)?)<BR><//div><//td>/;

# match target:
my $usage_re = qr/<td colspan="2" style="padding:6"><strong><font color="#545454">用  途<//font><//strong>:<div style="margin-left:13">(.+(/r/n.*)?)<//div><//td>/;

# match target:
my $up_element_re = qr/<td height="22" bgcolor="DEEEF4" class="td6"><div align="center">.*上游.*<//div><//td>/r/n/s*<//tr>/r/n/s*<tr>/r/n/s*<td bgcolor="#FFFFFF" class=grey_border style="padding:6">/s+([^</n]+(/r/n.*)?)/;

# match target:
my $down_element_re = qr/<td height="22" bgcolor="DEEEF4" class="td6"><div align="center">.*下游.*<//div><//td>/r/n/s*<//tr>/r/n/s*<tr>/r/n/s*<td bgcolor="#FFFFFF" class=grey_border style="padding:6">/s([^/s][^</n]+(/r/n.*)?)/;

################## initialize #####################

# create user agent
my $ua = new LWP::UserAgent;

# specifies the name
$ua->agent("Knuth/1.0");

# create temporary cookie
my $cookie_jar = HTTP::Cookies->new(
             file => "./lwp_cookies.dat",
             autosave => 1,
          );
$ua->cookie_jar($cookie_jar);

open(LOGFILE, ">LOG.txt") or die "Can't open file LOG.txt : $!/n";

open(MAINCATALOG, ">maincatalog.txt") or die "Can't open file maincatalog.txt : $!/n";

open(SUBCATALOG, ">subcatalog.txt") or die "Can't open file subcatalog.txt : $!/n";

open(PRODUCT, ">product.txt") or die "Can't open file product.txt : $!/n";

# initialize excel controller
$Win32::OLE::Warn = 3;                                # die on errors...

# get already active Excel application or open new
my $Excel = Win32::OLE->GetActiveObject('Excel.Application')
    || Win32::OLE->new('Excel.Application', 'Quit'); 

# open Excel file
#my $Book = $Excel->Workbooks->Open("chem_info.xls");

################## sub routines ###################

sub ECHO_AND_LOG{
 my $loginfo = shift;
 print LOGFILE "$loginfo/n";
 print "$loginfo/n";
};

sub trim {
 my @out = @_;
 for (@out) {
  s/^/s+//;
  s//s+$//;
 }
 return wantarray ? @out : $out[0];
};

sub doGet{
 
 # Create a request
 my $req = HTTP::Request->new(GET => shift);
  $req->header('Accept' => 'text/html');

 return doRetrive($req); 
};

sub doPost{
 
 my ($url,$content)= @_;
  
 # Create a request
  my $req = HTTP::Request->new(POST => $url);
  $req->content_type('application/x-www-form-urlencoded');
  $req->header('Accept' => 'text/html');
  $req->content($content);

  return doRetrive($req);
};

sub doRetrive{
 
 my $interval = 1;
 my $req = shift;
 
 while (1){
  my $res = $ua->request($req);
 
  #check the outcome
  if ($res->is_success){
   return $res->as_string;
  }else{
   SWITCH: for ($res->status_line) {
    
    /^404/ && do {
     ECHO_AND_LOG "[Error] : ".$req->as_string.$res->status_line."/n";
     return undef;
    };
    
    /^500/ && do {
     $interval *= 4;
     if ($interval < 64) {
      ECHO_AND_LOG "[Warning] : ".$req->as_string.$res->status_line."/n";
      sleep $interval;
      last;
     }else{
      ECHO_AND_LOG "[Error] : ".$req->as_string.$res->status_line."/n";
      return undef;
     }
    };
    
    ECHO_AND_LOG "[Error] : ".$req->as_string.$res->status_line."/n"; #default
   }
  }
 } 
};

sub deal_page{
 
 my ($reffunc,$value,$page) = @_;
 
 if (my $third_page = doGet($value."/".$page)) {
  foreach (split //n/, $third_page){
   $reffunc->($_);
   deal_page($reffunc,$value,$1) if (/$nextpage_re/);
  }
 }
}

##################### Main ########################

# get primary catalog information
my $main_page = doGet($base_url."/product/");
die "Can't open $base_url web page!/n" unless defined $main_page;

ECHO_AND_LOG "***************************************************";
ECHO_AND_LOG "              PRIMARY CATALOG BEGIN                ";
ECHO_AND_LOG "***************************************************";

foreach (split //n/, $main_page){ 
 if (/$primary_re/){
  $primary_catalog_link{$2} = $base_url.$1;
  ECHO_AND_LOG "$2/t = $base_url.$1";
 }
}

ECHO_AND_LOG "***************************************************";
ECHO_AND_LOG "                SLAVE CATALOG BEGIN                ";
ECHO_AND_LOG "***************************************************";

# get slave catalog information
while (my ($key, $value) = each %primary_catalog_link){
 if (my $second_page = doGet($value)) {
  foreach (split //n/, $second_page){
   if (/$slave_re/){
    $slave_catalog_link{$2} = $value."/".$1;
    ECHO_AND_LOG "$2/t = $value/$1";
    print MAINCATALOG "$2/t$key/n";
   }
  }
 }
}

close(MAINCATALOG) or die "Can't close file maincatalog.txt : $!/n";

ECHO_AND_LOG "***************************************************";
ECHO_AND_LOG "                PRODUCT LINK BEGIN                 ";
ECHO_AND_LOG "***************************************************";

# get product link information
while (my ($key, $value) = each %slave_catalog_link){
 
 my $reffunc = sub{
  if (/$product_re/){
   $product_link{$2} = $base_url.$1;
   ECHO_AND_LOG "$2/t = $base_url$1";
   print SUBCATALOG "$2/t$key/n";
  }
 };
 
 my $first_page = "";
 deal_page($reffunc, $value, $first_page);
}

close(SUBCATALOG) or die "Can't close file subcatalog.txt : $!/n";

ECHO_AND_LOG "***************************************************";
ECHO_AND_LOG "                PRODUCT INFO BEGIN                 ";
ECHO_AND_LOG "***************************************************";

# get product detail information
while (my ($key, $value) = each %product_link){
 if (my $product_info = doGet($value)) {
  
  my ($chinese_name,$english_name,$chinese_alias,$CAS_RN,$moscular,$phyical_property,$usage,$up_element,$down_element);

  # get chinese name
  foreach (split /<table>/, $product_info){
   SWITCH : {
    /$chinese_name_re/  && do{$chinese_name = trim($1); };
    /$english_name_re/  && do{$english_name = trim($1); };
    /$chinese_alias_re/ && do{$chinese_alias = trim($1);};
    /$CAS_RN_re/        && do{$CAS_RN = trim($1);       };
    /$moscular_re/      && do{$moscular = trim($1);     };
    
    /$usage_re/         && do{
     $usage = trim($1);       
     $usage =~ s//r/n/ /g;
    };
    
    /$up_element_re/    && do{
     $up_element = trim($1);  
     $up_element =~ s//r/n/ /g;
    };
    
    /$down_element_re/  && do{
     $down_element = trim($1);
     $down_element =~ s//r/n/ /g;
    };
    
    /$phyical_property_re/ && do{
     $phyical_property = trim($1);
     $phyical_property =~ s/<BR>/ /g;
    };
   }
  }
  
  ECHO_AND_LOG "$key >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>";
  ECHO_AND_LOG "URL/t/t: $value";
  $chinese_name     && ECHO_AND_LOG "CHINESE_NAME/t: $chinese_name";
  $english_name     && ECHO_AND_LOG "ENGLISH_NANEM/t: $english_name";
  $chinese_alias    && ECHO_AND_LOG "CHINESE_ALIAS/t: $chinese_alias";
  $CAS_RN           && ECHO_AND_LOG "CAS_RN/t: $CAS_RN";
  $moscular         && ECHO_AND_LOG "MOSCULAR/t: $moscular";
  $usage            && ECHO_AND_LOG "USAGE/t: $usage";
  $phyical_property && ECHO_AND_LOG "物化性质/t: $phyical_property";
  $up_element       && ECHO_AND_LOG "上游产品/t: $up_element";
  $down_element     && ECHO_AND_LOG "下游产品/t: $down_element";
  ECHO_AND_LOG "$key <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<";
  
  print PRODUCT ($chinese_name ? $chinese_name : "")."/t";
  print PRODUCT ($english_name ? $english_name : "")."/t";
  print PRODUCT ($chinese_alias ? $chinese_alias : "")."/t";
  print PRODUCT ($CAS_RN ? $CAS_RN : "")."/t";
  print PRODUCT ($moscular ? $moscular : "")."/t";
  print PRODUCT ($usage ? $usage : "")."/t";
  print PRODUCT ($phyical_property ? $phyical_property : "")."/t";
  print PRODUCT ($up_element ? $up_element : "")."/t";
  print PRODUCT ($down_element ? $down_element : "")."/n";
  
 }
}

close(LOGFILE) or die "Can't close file LOG.txt : $!/n";

close(PRODUCT) or die "Can't close file product.txt : $!/n";

#$Book->Close or die "Can't close excel file : $!/n";

1;

__DATA__

__END__

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值