Compare array

最新推荐文章于 2024-03-11 09:58:12 发布

cuanyangpa3685

最新推荐文章于 2024-03-11 09:58:12 发布

阅读量219

点赞数

文章标签：大数据数据结构与算法

　前段时间写的upgrade，现在碰到了问题，在处理key.pl的环节上，大数据量时候效率很低，以前设计的时候没有考虑到在这个环节上能有大数据量，所以写的算法是很简单的顺序遍历，赋值也是直接赋值．结果２２００条record就要花一小时才跑完，my god.

　接到user的反映后，知道这个问题得重写．趁着五一假期研究一番，现采用二叉树算法，采用引用替代直接赋值，将循环用子函数独立出来，尽量减少遍历的次数．优化后２２００同样的数据，只要１０秒就ＯＫ了．

　　　检查出一个 bug：split 在拆分字符串时，会把结尾的空字段直接丢弃。解决方法：先在字符串末尾追加一个哨兵值，split 之后再把这个值去掉（也可以给 split 传入负数的 limit 参数来保留结尾的空字段，例如 split /\|/, $a, -1）。例如：

[@more@]

# Demonstrates the sentinel workaround: split() drops trailing empty fields,
# so a dummy field is appended first and discarded after splitting.
my $a = "1|2|||aa|";

$a = $a."|1";

# The pipe must be escaped: an unescaped | is an (empty) alternation and
# would split the string into single characters instead of fields.
my @a = split /\|/,$a;

# Drop the sentinel; the trailing empty field of the original string survives.
@a = @a[ 0..($#a - 1) ];

下面是原代码：

#!/usr/local/bin/perl
#
# System :
# Program ID :
# Description :
#
# Create By :
# Creation Date : 11-June-2008

use warnings;
use strict;
use File::Spec;
use POSIX;

# ---------------------------------------------------------------------------
# Main program.
#
# Positional command-line arguments:
#   0  only_before   input file; its rows are reported under "Before upgrade"
#   1  only_after    input file; its rows are reported under "After upgrade"
#   2  diff_file     output report (before / after / difference sections)
#   3  key_file      lines of the form  table:id:split_tag:col,col,...
#   4  compare_file  lines of the form  table:<ignored>:col,col,...
#   5  table_name    prefix used to select lines from key_file/compare_file
#   6  temp_path     accepted but not used by this script
#   7  only_be_file  output file receiving only the "Before upgrade" section
# ---------------------------------------------------------------------------
die "Usage: $0 only_before only_after diff_file key_file compare_file "
  . "table_name temp_path only_be_file\n"
    unless @ARGV >= 8;

my $only_before = $ARGV[0];
my $only_after = $ARGV[1];
my $diff_file = $ARGV[2];
my $key_file = $ARGV[3];
my $compare_file = $ARGV[4];
my $table_name = $ARGV[5];
my $temp_path = $ARGV[6];
my $only_be_file = $ARGV[7];

my $time=strftime("%Y%m%d%H%M",localtime);
print "Start running at $time\n";

my (@key,@compare,%compare);
my $split_tag;

# Slot of @key that holds the key columns when the table has no split column
# (split_tag "N"); sub key() looks the default key set up under the same index.
my $default_key_id = 999999;

# --- key file: which columns form the record key, per key-set id -----------
open(my $fh, '<', $key_file) or die "Can't open $key_file:$!\n";
while (my $line = <$fh>) {
    chomp $line;
    # \Q..\E: the table name is a literal prefix, not a regex.
    next unless $line =~ /^\Q$table_name\E/i;

    my (undef, $id, $tag, $key_cols) = split /:/, $line;
    next unless defined $tag;    # malformed line: keep the previous tag

    $split_tag = $tag;
    $id = $default_key_id if $tag eq "N";
    $key[$id] = [ defined $key_cols ? split(/,/, $key_cols) : () ];
}
close($fh);

# --- compare file: ordered list of column names of the table ---------------
open($fh, '<', $compare_file) or die "Can't open $compare_file:$!\n";
while (my $line = <$fh>) {
    chomp $line;
    next unless $line =~ /^\Q$table_name\E/i;

    my $columns = (split /:/, $line)[2];
    @compare = defined $columns ? split(/,/, $columns) : ();
}
close($fh);

# array to hash: upper-cased column name -> column index
for my $idx (0 .. $#compare) {
    $compare{ uc $compare[$idx] } = $idx;
}

open($fh, '<', $only_after) or die "Can't open $only_after:$!\n";
my @only_after = <$fh>;
close($fh);

open($fh, '<', $only_before) or die "Can't open $only_before:$!\n";
my @only_before = <$fh>;
close($fh);

# split_tag is a 1-based column number, or "N" when there is no split column.
# Only do arithmetic on a numeric tag; -1 means "no split column".
my $split_col = ( defined $split_tag && $split_tag =~ /^\d+$/ )
              ? $split_tag - 1
              : -1;

# The helpers take references; passing the arrays/hash directly would flatten
# them into one argument list.
my $before = key(\@only_before, \@key, \%compare, $split_col);
my $after  = key(\@only_after,  \@key, \%compare, $split_col);
my ($before2, $after2, $diff) = diff($before, $after);
$diff = col(\@compare, $diff);

# Start both reports from scratch.
unlink $diff_file;
unlink $only_be_file;
open($fh, '>>', $diff_file) or die "Can't open $diff_file:$!\n";
open(my $fh2, '>>', $only_be_file) or die "Can't open $only_be_file:$!\n";

# Writes one "===title===" section to every given handle. Column 0 of a row
# is the record key and is not printed; every other column is followed by "|".
my $write_section = sub {
    my ($title, $rows, @handles) = @_;
    return unless @{$rows};

    my $total = scalar @{$rows};
    for my $out (@handles) {
        print {$out} "===$title===\n";
        print {$out} "Total : $total\n";
        for my $row (@{$rows}) {
            print {$out} "$row->[$_]|" for 1 .. $#{$row};
            print {$out} "\n";
        }
        print {$out} "\n";
    }
    return;
};

# The "before" section goes to both reports, the "after" section only to the
# main one.
$write_section->("Before upgrade", $before2, $fh, $fh2);
close($fh2) or die "Can't close $only_be_file:$!\n";

$write_section->("After upgrade", $after2, $fh);

# Difference rows have no key column, so printing starts at column 0; each
# row is preceded by its 1-based number (also echoed to STDOUT).
if ( @{$diff} ) {
    my $total = scalar @{$diff};
    print {$fh} "===difference===\n";
    print {$fh} "Total : $total\n";
    for my $row_no (0 .. $#{$diff}) {
        print {$fh} $row_no + 1, "\n";
        print "[D] ", $row_no + 1, "\n";
        print {$fh} "$diff->[$row_no][$_]|" for 0 .. $#{ $diff->[$row_no] };
        print {$fh} "\n";
    }
    print {$fh} "\n";
}
close($fh) or die "Can't close $diff_file:$!\n";

# diff(\@before_rows, \@after_rows)
#
# Pairs up rows of the two tables by their key (element 0 of each row) and
# reports what differs.
#
# Each row is an array ref: [ key, chunk, chunk, ... ] where every chunk is a
# "|"-joined group of column values (the layout produced by sub key()).
#
# Returns three array refs:
#   1. rows of @before_rows whose key has no partner in @after_rows
#   2. rows of @after_rows whose key has no partner in @before_rows
#   3. one row per matched pair, holding the chunks only (no key column);
#      inside a chunk every field that differs is written "diff{old,new}".
#
# Rows keep their input order. When a key occurs several times, the n-th
# "before" row with that key is paired with the n-th "after" row.
# The input arrays and rows are not modified.
sub diff{
    my $before = shift;
    my $after = shift;

    # Ignore anything that is not a row (e.g. undefined slots).
    my @before = grep { ref $_ eq 'ARRAY' } @{ $before || [] };
    my @after  = grep { ref $_ eq 'ARRAY' } @{ $after  || [] };

    # Index the "after" rows by key so each "before" row finds its partner
    # with one hash lookup instead of a scan. Removing rows from the arrays
    # while looping over them by index made the loops skip rows.
    my %queue_for;
    for my $idx (0 .. $#after) {
        my $row_key = defined $after[$idx][0] ? $after[$idx][0] : '';
        push @{ $queue_for{$row_key} }, $idx;
    }

    my (@only_before, @matched_after, @diff);
    for my $row (@before) {
        my $row_key = defined $row->[0] ? $row->[0] : '';
        my $queue   = $queue_for{$row_key};
        unless ( $queue && @{$queue} ) {
            push @only_before, $row;
            next;
        }

        my $peer_idx = shift @{$queue};
        $matched_after[$peer_idx] = 1;
        my $peer = $after[$peer_idx];

        # Walk the longer of the two rows so surplus chunks on either side
        # are reported as differences too.
        my $last = $#{$row} > $#{$peer} ? $#{$row} : $#{$peer};
        push @diff, [ map { _diff_chunk($row->[$_], $peer->[$_]) } 1 .. $last ];
    }
    my @only_after = @after[ grep { !$matched_after[$_] } 0 .. $#after ];

    # Trace output of the difference rows on STDOUT.
    for my $cc (0..$#diff){
        print "[DDE] ",$cc+1,"\n";
        for my $dd (0..$#{$diff[$cc]}){
            print "$diff[$cc][$dd]";
        }
        print "\n";
    }

    # References, so the caller can unpack three separate lists.
    return (\@only_before, \@only_after, \@diff);
}

# Compares two "|"-joined chunks field by field. Returns the old chunk when
# both are equal, otherwise the chunk with each differing field replaced by
# "diff{old,new}". A missing chunk or field counts as the empty string.
sub _diff_chunk {
    my ($old, $new) = @_;
    $old = '' unless defined $old;
    $new = '' unless defined $new;
    return $old if $old eq $new;

    # Limit -1 keeps trailing empty fields, which split() otherwise drops.
    my @old = split /\|/, $old, -1;
    my @new = split /\|/, $new, -1;
    my $last = $#old > $#new ? $#old : $#new;

    my @out;
    for my $i (0 .. $last) {
        my $o = defined $old[$i] ? $old[$i] : '';
        my $n = defined $new[$i] ? $new[$i] : '';
        push @out, $o eq $n ? $o : 'diff{' . $o . ',' . $n . '}';
    }
    return join '|', @out;
}

# key(\@lines, \@key_sets, \%col_index, $split_col)
#
# Turns raw "|"-separated records into rows ready for sub diff().
#
#   \@lines      record lines (a trailing newline is removed); blank lines
#                are skipped
#   \@key_sets   $key_sets[$id] is an array ref of the column names that form
#                the record key for key-set $id; slot 999999 is the default
#   \%col_index  upper-cased column name => 0-based column index
#   $split_col   0-based index of the column whose value selects the key set;
#                anything that is not a non-negative integer (undef, -1)
#                selects the default key set 999999
#
# Returns an array ref with one row per non-blank line:
#   [ key, chunk1, chunk2, ... ]
# where key is the "|"-joined values of the key columns and each chunk is the
# "|"-joined values of five consecutive columns. A record whose column count
# is a multiple of five ends with one extra empty chunk; sub col() groups the
# column names the same way.
sub key
{
    my $file = shift;
    my $key = shift;
    my $compar = shift;
    my $split_col = shift;

    my $use_split = defined $split_col && $split_col =~ /^\d+$/;

    my @change;

    for my $raw ( @{ $file || [] } ) {
        next unless defined $raw;
        my $line = $raw;
        next if $line =~ /^$/;
        chomp $line;

        # Limit -1 keeps trailing empty fields, which split() otherwise drops.
        my @temp = split /\|/, $line, -1;

        my $id = $use_split ? $temp[$split_col] : 999999;

        # Read-only lookup: an unknown key-set id gives an empty key rather
        # than growing the caller's array.
        my $key_set = ( defined $id && ref $key eq 'ARRAY' ) ? $key->[$id] : undef;

        my @key_values;
        for my $name ( ref $key_set eq 'ARRAY' ? @{$key_set} : () ) {
            my $col = $compar->{ uc $name };
            my $val = defined $col ? $temp[$col] : undef;
            # Unknown column or short record: contribute an empty value.
            push @key_values, defined $val ? $val : '';
        }
        my @row = ( join "|", @key_values );

        # Group the columns five at a time.
        my $start = 0;
        while (1) {
            my $end = $start + 4;
            if ( $end > $#temp ) {
                push @row, join( "|", @temp[ $start .. $#temp ] );
                last;
            }
            push @row, join( "|", @temp[ $start .. $end ] );
            $start = $end + 1;
        }

        # push (not $change[$num]) so skipped blank lines leave no holes.
        push @change, \@row;
    }
    return \@change;
}

# col(\@column_names, \@diff_rows)
#
# Labels the differences produced by sub diff() with their column names.
#
#   \@column_names  the table's column names in record order
#   \@diff_rows     rows of "|"-joined chunks; a differing field has the form
#                   "diff{old,new}"
#
# The column names are grouped five per chunk, exactly as sub key() groups the
# record values, so field i of chunk n belongs to column name i of name-chunk
# n. Every "diff{" marker is replaced by "<column name>{"; when no name is
# known for a position only the "diff" prefix is removed.
#
# The rows are modified in place; the same reference is returned.
sub col
{
    my $compare = shift;
    my $diff = shift;

    my @names = @{ $compare || [] };
    my @compa;

    # Group the column names five at a time (same layout as in sub key()).
    my $start = 0;
    while (1) {
        my $end = $start + 4;
        if ( $end > $#names ) {
            push @compa, join( "|", @names[ $start .. $#names ] );
            last;
        }
        push @compa, join( "|", @names[ $start .. $end ] );
        $start = $end + 1;
    }

    for my $row ( @{ $diff || [] } ) {
        next unless ref $row eq 'ARRAY';
        for my $element ( 0 .. $#{$row} ) {
            next unless defined $row->[$element]
                     && $row->[$element] =~ /diff\{/i;

            # Limit -1 keeps trailing empty fields so positions stay aligned.
            my @fields = split /\|/, $row->[$element], -1;
            my @col_names = defined $compa[$element]
                          ? split( /\|/, $compa[$element], -1 )
                          : ();

            # Rename the marker field by field so every difference gets the
            # name belonging to its own position.
            for my $i ( 0 .. $#fields ) {
                next unless $fields[$i] =~ /diff\{/i;
                my $name = defined $col_names[$i] ? $col_names[$i] : '';
                $fields[$i] =~ s/diff\{/$name . '{'/ie;
            }
            $row->[$element] = join "|", @fields;

            # Any marker still left has no column name: drop the prefix.
            $row->[$element] =~ s/diff\{/\{/ig;
        }
    }

    return $diff;
}

来自 “ ITPUB博客 ” ，链接：http://blog.itpub.net/640706/viewspace-1043926/，如需转载，请注明出处，否则将追究法律责任。

转载于:http://blog.itpub.net/640706/viewspace-1043926/

cuanyangpa3685

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Compare array

　前段时间写的upgrade，现在碰到了问题，在处理key.pl的环节上，大数据量时候效率很低，以前设计的时候没有考虑到在这个环节上能有大数据量，所以写的算法是很简单的顺序遍历，赋值也是直接赋值．结果２２００条record就要花...
复制链接

扫一扫