#!/usr/bin/perl
use warnings;
use strict;
#################### Modules ######################
use LWP::UserAgent;
use HTTP::Cookies;
use Win32::OLE qw(in with);
use Win32::OLE::Const 'Microsoft Excel';
################### variables #####################
my $base_url = 'http://www.xxxxxx.com';
my (%primary_catalog_link,%slave_catalog_link,%product_link);
# match target:
my $primary_re = qr/<strong><a href=/"(//product//cn//p//c(?:[0-9]{1,2}))///" class=blue_sell2_1 title=/".+/">(.+)<//a>( <//strong>)?<br>/s*$/;
# match target:
my $slave_re = qr/<td width=/"30/%/" bgcolor=/"/#FFFFFF/"><a href=/"(.+)/"/s{1,2}class=/"blue/">(.+)<//a><//td>/s*$/;
# match target:
my $product_re = qr/<li><a href=/'(//product//cn//pc//p--.*/.html)/'/s+class=blue/starget=_blank><h>(.+)<//h><//a><//li>/s+<//td>/s*$/;
# match target:
my $nextpage_re = qr/<a href=/"(index[2-9]/.html)/">下一页<//a>/;
# match target:
my $chinese_name_re = qr/<td width="28%" align="center" class="td1"><strong>中文名称<//strong>:<//td>/r/n/s*<td width="72%" class=tt1><strong>(.+)<//strong><//td>/;
# match target:
my $english_name_re = qr/<td align="center" bgcolor="/#FFFFFF" class="td1"><strong>英文名称<//strong>:<//td>/r/n/s*<td class=tt1>(.+)<//td>/;
# match target:
my $chinese_alias_re = qr/<td align="center" class="td1"><strong>中文别名<//strong>:<//td>/r/n/s*<td class=tt1>(.+)<//td>/;
# match target:
my $CAS_RN_re = qr/<td align="center" class="td1"><strong>CAS RN.<//strong>:<//td>/r/n/s*<td class=tt1>(.+)<//td>/;
# match target:
my $moscular_re = qr/<td align="center" class="td1"><strong>分 子 式<//strong>:<//td>/r/n/s*<td class=tt1>(.+)<//td>/;
# match target:
my $phyical_property_re = qr/<td colspan="2" style="padding:6"><strong><font color="#545454">物化性质<//font><//strong>: <br>/r/n/s*<div style="margin-left:13">(.+(/r/n.*)?)<BR><//div><//td>/;
# match target:
my $usage_re = qr/<td colspan="2" style="padding:6"><strong><font color="#545454">用 途<//font><//strong>:<div style="margin-left:13">(.+(/r/n.*)?)<//div><//td>/;
# match target:
my $up_element_re = qr/<td height="22" bgcolor="DEEEF4" class="td6"><div align="center">.*上游.*<//div><//td>/r/n/s*<//tr>/r/n/s*<tr>/r/n/s*<td bgcolor="#FFFFFF" class=grey_border style="padding:6">/s+([^</n]+(/r/n.*)?)/;
# match target:
my $down_element_re = qr/<td height="22" bgcolor="DEEEF4" class="td6"><div align="center">.*下游.*<//div><//td>/r/n/s*<//tr>/r/n/s*<tr>/r/n/s*<td bgcolor="#FFFFFF" class=grey_border style="padding:6">/s([^/s][^</n]+(/r/n.*)?)/;
################## initialize #####################
# create user agent
my $ua = new LWP::UserAgent;
# specifies the name
$ua->agent("Knuth/1.0");
# create temporary cookie
my $cookie_jar = HTTP::Cookies->new(
file => "./lwp_cookies.dat",
autosave => 1,
);
$ua->cookie_jar($cookie_jar);
open(LOGFILE, ">LOG.txt") or die "Can't open file LOG.txt : $!/n";
open(MAINCATALOG, ">maincatalog.txt") or die "Can't open file maincatalog.txt : $!/n";
open(SUBCATALOG, ">subcatalog.txt") or die "Can't open file subcatalog.txt : $!/n";
open(PRODUCT, ">product.txt") or die "Can't open file product.txt : $!/n";
# initialize excel controller
$Win32::OLE::Warn = 3; # die on errors...
# get already active Excel application or open new
my $Excel = Win32::OLE->GetActiveObject('Excel.Application')
|| Win32::OLE->new('Excel.Application', 'Quit');
# open Excel file
#my $Book = $Excel->Workbooks->Open("chem_info.xls");
################## sub routines ###################
sub ECHO_AND_LOG{
my $loginfo = shift;
print LOGFILE "$loginfo/n";
print "$loginfo/n";
};
sub trim {
my @out = @_;
for (@out) {
s/^/s+//;
s//s+$//;
}
return wantarray ? @out : $out[0];
};
sub doGet{
# Create a request
my $req = HTTP::Request->new(GET => shift);
$req->header('Accept' => 'text/html');
return doRetrive($req);
};
sub doPost{
my ($url,$content)= @_;
# Create a request
my $req = HTTP::Request->new(POST => $url);
$req->content_type('application/x-www-form-urlencoded');
$req->header('Accept' => 'text/html');
$req->content($content);
return doRetrive($req);
};
sub doRetrive{
my $interval = 1;
my $req = shift;
while (1){
my $res = $ua->request($req);
#check the outcome
if ($res->is_success){
return $res->as_string;
}else{
SWITCH: for ($res->status_line) {
/^404/ && do {
ECHO_AND_LOG "[Error] : ".$req->as_string.$res->status_line."/n";
return undef;
};
/^500/ && do {
$interval *= 4;
if ($interval < 64) {
ECHO_AND_LOG "[Warning] : ".$req->as_string.$res->status_line."/n";
sleep $interval;
last;
}else{
ECHO_AND_LOG "[Error] : ".$req->as_string.$res->status_line."/n";
return undef;
}
};
ECHO_AND_LOG "[Error] : ".$req->as_string.$res->status_line."/n"; #default
}
}
}
};
sub deal_page{
my ($reffunc,$value,$page) = @_;
if (my $third_page = doGet($value."/".$page)) {
foreach (split //n/, $third_page){
$reffunc->($_);
deal_page($reffunc,$value,$1) if (/$nextpage_re/);
}
}
}
##################### Main ########################
# get primary catalog information
my $main_page = doGet($base_url."/product/");
die "Can't open $base_url web page!/n" unless defined $main_page;
ECHO_AND_LOG "***************************************************";
ECHO_AND_LOG " PRIMARY CATALOG BEGIN ";
ECHO_AND_LOG "***************************************************";
foreach (split //n/, $main_page){
if (/$primary_re/){
$primary_catalog_link{$2} = $base_url.$1;
ECHO_AND_LOG "$2/t = $base_url.$1";
}
}
ECHO_AND_LOG "***************************************************";
ECHO_AND_LOG " SLAVE CATALOG BEGIN ";
ECHO_AND_LOG "***************************************************";
# get slave catalog information
while (my ($key, $value) = each %primary_catalog_link){
if (my $second_page = doGet($value)) {
foreach (split //n/, $second_page){
if (/$slave_re/){
$slave_catalog_link{$2} = $value."/".$1;
ECHO_AND_LOG "$2/t = $value/$1";
print MAINCATALOG "$2/t$key/n";
}
}
}
}
close(MAINCATALOG) or die "Can't close file maincatalog.txt : $!/n";
ECHO_AND_LOG "***************************************************";
ECHO_AND_LOG " PRODUCT LINK BEGIN ";
ECHO_AND_LOG "***************************************************";
# get product link information
while (my ($key, $value) = each %slave_catalog_link){
my $reffunc = sub{
if (/$product_re/){
$product_link{$2} = $base_url.$1;
ECHO_AND_LOG "$2/t = $base_url$1";
print SUBCATALOG "$2/t$key/n";
}
};
my $first_page = "";
deal_page($reffunc, $value, $first_page);
}
close(SUBCATALOG) or die "Can't close file subcatalog.txt : $!/n";
ECHO_AND_LOG "***************************************************";
ECHO_AND_LOG " PRODUCT INFO BEGIN ";
ECHO_AND_LOG "***************************************************";
# get product detail information
while (my ($key, $value) = each %product_link){
if (my $product_info = doGet($value)) {
my ($chinese_name,$english_name,$chinese_alias,$CAS_RN,$moscular,$phyical_property,$usage,$up_element,$down_element);
# get chinese name
foreach (split /<table>/, $product_info){
SWITCH : {
/$chinese_name_re/ && do{$chinese_name = trim($1); };
/$english_name_re/ && do{$english_name = trim($1); };
/$chinese_alias_re/ && do{$chinese_alias = trim($1);};
/$CAS_RN_re/ && do{$CAS_RN = trim($1); };
/$moscular_re/ && do{$moscular = trim($1); };
/$usage_re/ && do{
$usage = trim($1);
$usage =~ s//r/n/ /g;
};
/$up_element_re/ && do{
$up_element = trim($1);
$up_element =~ s//r/n/ /g;
};
/$down_element_re/ && do{
$down_element = trim($1);
$down_element =~ s//r/n/ /g;
};
/$phyical_property_re/ && do{
$phyical_property = trim($1);
$phyical_property =~ s/<BR>/ /g;
};
}
}
ECHO_AND_LOG "$key >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>";
ECHO_AND_LOG "URL/t/t: $value";
$chinese_name && ECHO_AND_LOG "CHINESE_NAME/t: $chinese_name";
$english_name && ECHO_AND_LOG "ENGLISH_NANEM/t: $english_name";
$chinese_alias && ECHO_AND_LOG "CHINESE_ALIAS/t: $chinese_alias";
$CAS_RN && ECHO_AND_LOG "CAS_RN/t: $CAS_RN";
$moscular && ECHO_AND_LOG "MOSCULAR/t: $moscular";
$usage && ECHO_AND_LOG "USAGE/t: $usage";
$phyical_property && ECHO_AND_LOG "物化性质/t: $phyical_property";
$up_element && ECHO_AND_LOG "上游产品/t: $up_element";
$down_element && ECHO_AND_LOG "下游产品/t: $down_element";
ECHO_AND_LOG "$key <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<";
print PRODUCT ($chinese_name ? $chinese_name : "")."/t";
print PRODUCT ($english_name ? $english_name : "")."/t";
print PRODUCT ($chinese_alias ? $chinese_alias : "")."/t";
print PRODUCT ($CAS_RN ? $CAS_RN : "")."/t";
print PRODUCT ($moscular ? $moscular : "")."/t";
print PRODUCT ($usage ? $usage : "")."/t";
print PRODUCT ($phyical_property ? $phyical_property : "")."/t";
print PRODUCT ($up_element ? $up_element : "")."/t";
print PRODUCT ($down_element ? $down_element : "")."/n";
}
}
close(LOGFILE) or die "Can't close file LOG.txt : $!/n";
close(PRODUCT) or die "Can't close file product.txt : $!/n";
#$Book->Close or die "Can't close excel file : $!/n";
1;
__DATA__
__END__