#!/usr/bin/perl -w
# Author: luyi
use strict;
# Fetch the agent-list page with wget, extract the cell text of the first HTML
# <table> found in it, and write the rows to $result as tab-separated lines.
#
# The parser is line oriented: it expects each <tr> tag and each <td>...</td>
# cell to sit on its own line of the downloaded page.
my $src_data="/export/remotedata/wget_html_data.txt"; # raw HTML fetched by wget is stored here
my $result="/export/remotedata/get_data_173.txt"; # final extracted data is written here
my $info="/tmp/wget_data_173.info"; # log file (wget also appends to it via -a)
my $address="http://api.173.com:8080/public/agentList";

# Open the log before anything can fail, so error messages really get recorded.
open(my $info_fh, '>>', $info) or die "can't open $info $!";

my $cmd=qx(which wget);
chomp($cmd);
if ($cmd eq '') {
    # Without this check an empty command would be executed below.
    print "Can't find wget in PATH!\n";
    print {$info_fh} "Can't find wget in PATH!\n";
    close($info_fh);
    exit 1;
}

# Remove leftovers of a previous run so a failed run never leaves stale data.
unlink $src_data if (-f $src_data);
unlink $result if (-f $result);

# List-form system() runs wget directly, without passing through a shell.
my $status = system($cmd, $address, '-a', $info, '-O', $src_data);
if ($status != 0) {
    print {$info_fh} "wget failed (system() returned $status)\n";
}

if (! -s $src_data) {
    print "Get html file : $src_data error!\n";
    print {$info_fh} "Get html file : $src_data error!\n";
    close($info_fh);
    exit 1;
}

open(my $src_fh, '<', $src_data) or die "can't open $src_data $!";
open(my $result_fh, '>', $result) or die "can't open $result $!";

my $in_table = 0; # becomes 1 once the opening <table> tag has been seen
my @rows;         # one array reference per <tr>, holding that row's cell texts

while (my $line = <$src_fh>) {
    chomp($line);
    if ($line =~ m{<table[^>]*>}) {
        $in_table = 1;
    }
    elsif ($line =~ m{</table>}) {
        last; # only the first table is processed
    }
    next if !$in_table;

    if ($line =~ m{<tr[^>]*>}) {
        push @rows, []; # a new table row starts here
    }
    elsif ($line =~ m{<td\b[^>]*>(.*)</td>}) {
        # Empty cells are kept so the remaining columns stay aligned.
        push @rows, [] if !@rows; # tolerate a cell that precedes any <tr>
        push @{ $rows[-1] }, $1;
    }
}

# Emit the rows exactly as they were delimited by <tr> tags. Deriving a column
# count from (number of <td>) / (number of <tr>) breaks on rows without <td>
# cells (e.g. header rows) and divides by zero when the page has no <tr>.
my $written = 0; # number of rows written to the result file
for my $row (@rows) {
    next if !@{$row}; # rows without <td> cells carry no data
    print {$result_fh} join("\t", @{$row}), "\n";
    $written += 1;
}

close($src_fh);
close($result_fh) or die "can't close $result $!";

if ($written == 0) {
    print "No table data found in $src_data\n";
    print {$info_fh} "No table data found in $src_data\n";
    close($info_fh);
    exit 1;
}
close($info_fh);
# Author: luyi
use strict;
# Fetch the agent-list page with wget, extract the cell text of the first HTML
# <table> found in it, and write the rows to $result as tab-separated lines.
#
# The parser is line oriented: it expects each <tr> tag and each <td>...</td>
# cell to sit on its own line of the downloaded page.
my $src_data="/export/remotedata/wget_html_data.txt"; # raw HTML fetched by wget is stored here
my $result="/export/remotedata/get_data_173.txt"; # final extracted data is written here
my $info="/tmp/wget_data_173.info"; # log file (wget also appends to it via -a)
my $address="http://api.173.com:8080/public/agentList";

# Open the log before anything can fail, so error messages really get recorded.
open(my $info_fh, '>>', $info) or die "can't open $info $!";

my $cmd=qx(which wget);
chomp($cmd);
if ($cmd eq '') {
    # Without this check an empty command would be executed below.
    print "Can't find wget in PATH!\n";
    print {$info_fh} "Can't find wget in PATH!\n";
    close($info_fh);
    exit 1;
}

# Remove leftovers of a previous run so a failed run never leaves stale data.
unlink $src_data if (-f $src_data);
unlink $result if (-f $result);

# List-form system() runs wget directly, without passing through a shell.
my $status = system($cmd, $address, '-a', $info, '-O', $src_data);
if ($status != 0) {
    print {$info_fh} "wget failed (system() returned $status)\n";
}

if (! -s $src_data) {
    print "Get html file : $src_data error!\n";
    print {$info_fh} "Get html file : $src_data error!\n";
    close($info_fh);
    exit 1;
}

open(my $src_fh, '<', $src_data) or die "can't open $src_data $!";
open(my $result_fh, '>', $result) or die "can't open $result $!";

my $in_table = 0; # becomes 1 once the opening <table> tag has been seen
my @rows;         # one array reference per <tr>, holding that row's cell texts

while (my $line = <$src_fh>) {
    chomp($line);
    if ($line =~ m{<table[^>]*>}) {
        $in_table = 1;
    }
    elsif ($line =~ m{</table>}) {
        last; # only the first table is processed
    }
    next if !$in_table;

    if ($line =~ m{<tr[^>]*>}) {
        push @rows, []; # a new table row starts here
    }
    elsif ($line =~ m{<td\b[^>]*>(.*)</td>}) {
        # Empty cells are kept so the remaining columns stay aligned.
        push @rows, [] if !@rows; # tolerate a cell that precedes any <tr>
        push @{ $rows[-1] }, $1;
    }
}

# Emit the rows exactly as they were delimited by <tr> tags. Deriving a column
# count from (number of <td>) / (number of <tr>) breaks on rows without <td>
# cells (e.g. header rows) and divides by zero when the page has no <tr>.
my $written = 0; # number of rows written to the result file
for my $row (@rows) {
    next if !@{$row}; # rows without <td> cells carry no data
    print {$result_fh} join("\t", @{$row}), "\n";
    $written += 1;
}

close($src_fh);
close($result_fh) or die "can't close $result $!";

if ($written == 0) {
    print "No table data found in $src_data\n";
    print {$info_fh} "No table data found in $src_data\n";
    close($info_fh);
    exit 1;
}
close($info_fh);