#!/usr/bin/perl -w
use strict;
use warnings;
use lib '/home/yanch/perl5/lib/perl5';
use lib '/home/yanch/perl5/lib/share/perl5';
use Getopt::Long;
use Bio::SeqIO;
use Cwd;
use File::Basename;
use subs::parallel;
my %hash;
my ($raw_reads,$raw_bases);
my ($trim_reads,$trim_bases);
my $name = shift;
my $path = shift;
my $lib = uc(shift);
my $qual = shift;
my $len = shift;
my $head = shift;
my $raw_path = getcwd;
my $trim_path = "$raw_path/trim";
my $fq_path = "$raw_path/fq";
unless (-e $trim_path && -e $fq_path){
system("mkdir -p $trim_path") ;
system("mkdir -p $fq_path") ;
}
raw_to_fq($name,$path,$lib);
sub raw_to_fq{
my ($name,$path,$lib) = @_;
system ("/mnt/lustre/users/qiangang/pipeline/utils/fq_zcat.py -i $path -o $fq_path -s $name -L $lib ") == 0 or die "Error occur in rawdata transfer to fastq $?\n";
}
open FILE, ">> $trim_path/cleanFq.list" or die $!;
if( lc($lib) eq "pe"){
my $rd1 = "$fq_path/$name\.1.fq";
my $rd2 = "$fq_path/$name\.2.fq";
pe_pipeline($name,$rd1,$rd2,$qual);
print FILE "$name\tPE\t$trim_path/$name/$name\_trim1.fq\t$trim_path/$name/$name\_trim2.fq\t$trim_path/$name/$name\_trim_unpaired.fq\n";
}elsif( lc($lib) eq "se"){
my $rd1 = "$fq_path/$name\.1.fq";
se_pipeline($name,$rd1,$qual);
print FILE "$name\tSE\t$trim_path/$name/$name\_trim_adpter.fq.trimmed.single\n";
}
sub pe_pipeline {
my ($name,$rd1,$rd2,$qual) = @_;
print "head:$head","\n";
open my $out ,">>",$trim_path."/raw_and_trimed.stat" or die "Can't open file to output\n";
if($head == 1){
print $out "Sample_ID\tRawReads_Num\tRawBase_Num\tRaw_Q20\(%\)\tGC(%)\tN(%)\tTrimReads_Num\tTrimBase_Num\tTrim_Q20(%)\tClean(%)\tAdpter(%)\n";
}
############raw reads bases count ########################
#parallelize_sub('bases_count_pe');
my ($raw_reads,$raw_bases) = bases_count_pe($rd1,$rd2);
print "raw_reads:",$raw_reads,"\t",$raw_bases,"\n";
##################### Q20 or Q30 ####################################
parallelize_sub('calcu_quality_pe');
my ($q_raw) = calcu_quality_pe($rd1,$rd2,$qual);
print "q_raw:$q_raw";
######################## Duplication ################################
parallelize_sub('duplicate');
duplicate ($rd1,$rd2);
######################GC contain ################################
#parallelize_sub('calcu_GC_pe');
my ($GC_pert,$Ns_pert) = calcu_GC_pe($rd1,$rd2);
plot_stat($rd1,$rd2);
###################trim fastq and caculte ###########################
my ($flag,$trim1,$trim2,$adpter_num) = trim_fq($name,$rd1,$rd2);
while($flag){
($trim_reads,$trim_bases)= bases_count_pe($trim1,$trim2);
my ($q_trim) = calcu_quality_pe($trim1,$trim2,$qual);
my $clean = sprintf("%.2f",$trim_reads/$raw_reads*100);
my $adpter= sprintf("%.2f",2*$adpter_num/$raw_reads)*100;
print $out "$name\t$raw_reads\t$raw_bases\t$q_raw\t$GC_pert\t$Ns_pert\t$trim_reads\t$trim_bases\t$q_trim\t$clean\t$adpter\n";
$flag = 0;
}
}
sub se_pipeline {
my ($name,$rd1,$qual) = @_;
open my $out ,">>","raw_and_trimed_se.stat" or die "Can't open file to output\n";
if($head == 1){
print $out "Sample_ID\tRawReads_Num\tRawBase_Num\tRaw_Q20(%)\tGC(%)\tN(%)\tTrimReads_Num\tTrimBase_Num\tTrim_Q20(%)\tClean(%)\tAdpter(%)\n";
}
############raw reads bases count ########################
($raw_reads,$raw_bases) = bases_count_se($rd1);
##################### Q20 or Q30 ####################################
my ($q_raw) = calcu_quality_se($rd1,$qual);
######################## Duplication ################################
#duplicate ($rd1,$rd2);
######################GC contain ################################
my ($GC_pert,$Ns_pert) = calcu_GC_se($rd1);
#plot_stat($rd1,$rd2);
###################trim fastq and caculte ###########################
my ($flag,$trim1,$adpter_num) = trim_fq($name,$rd1);
if($flag){
($trim_reads,$trim_bases)= bases_count_se($trim1);
my ($q_trim) = calcu_quality_se($trim1,$qual);
my $clean = sprintf("%.2f",$trim_reads/$raw_reads*100);
my $adpter= sprintf("%.2f",($adpter_num/$raw_reads*100));
print $out "$name\t$raw_reads\t$raw_bases\t$q_raw\t$GC_pert\t$Ns_pert\t$trim_reads\t$trim_bases\t$q_trim\t$clean\t$adpter\n";
}
}
#########################subs###########################
sub bases_count_pe{
my ($rd1,$rd2) = @_;
my $i;
my $bases;
foreach my $file ($rd1,$rd2){
my $in = Bio::SeqIO -> new( -file => $file,
-format => "Fastq",
);
while( my $seq = $in -> next_seq){
$i++;
my $s = $seq -> seq;
$bases += length $s;
}
}
return ($i,$bases);
}
sub bases_count_se{
my ($rd1) = @_;
my $i;
my $bases;
# print "rd1\t$rd1","\n";
# print "rd2\t$rd2","\n";
my $in = Bio::SeqIO -> new( -file => $rd1,
-format => "Fastq",
);
while( my $seq = $in -> next_seq()){
$i ++;
$bases += length $seq;
}
return ($i,$bases);
}
sub calcu_quality_pe{
my $result1;
my $result2;
my ($q_raw1,$q_raw2);
my $pert;
my ($rd1,$rd2,$score) = @_;
$result1 = `FastqTotalHighQualityBase.jar -i $rd1 -q $score ` ;
$result2 = `FastqTotalHighQualityBase.jar -i $rd2 -q $score ` ;
chomp $result1;
chomp $result2;
($q_raw1) = (split"\t",$result1)[3];
($q_raw2) = (split"\t",$result2)[3];
print $q_raw1,"\t",$q_raw2,"\n";
$pert = ($q_raw1 + $q_raw2)/2 ;
return $pert;
}
sub calcu_quality_se {
my $result1;
my $result2;
my ($q_raw1,$q_raw2);
my $pert;
my ($rd1,$score) = @_;
$result1 = `FastqTotalHighQualityBase.jar -i $rd1 -q $score ` ;
chomp $result1;
($q_raw1) = (split"\t",$result1)[3];
return $q_raw1;
}
sub calcu_GC_pe{
my ($rd1,$rd2) = @_;
my ($line,$g,$c,$n,$Ns,$gc,$total,$gc_pert,$Ns_pert);
my ($file1,$subpath) = fileparse($rd1);
my ($file2) = fileparse($rd2);
my $out1 = "$fq_path/$file1\.stat";
my $out2 = "$fq_path/$file2\.stat";
system "fastx_quality_stats -Q 33 -i $rd1 -o $out1";
system "fastx_quality_stats -Q 33 -i $rd2 -o $out2";
foreach my $key ($out1,$out2){
open IN ,"<", $key or die "Test Can't open $key $!";
while(<IN>){
chomp;
unless(/column/){
($line,$g,$c,$n) = (split"\t",$_)[1,13,14,16];
$total += $line;
$gc += $g+$c;
$Ns += $n;
}
}
($gc_pert) = sprintf("%.2f",$gc/$total*100);
($Ns_pert) = sprintf("%.2f",$Ns/$total*100);
return ($gc_pert,$Ns_pert);
}
}
sub calcu_GC_se{
my ($rd1) = @_;
my ($line,$g,$c,$n,$Ns,$gc,$total,$gc_pert,$Ns_pert);
my ($file1,$subpath) = fileparse($rd1);
my $out1 = "$fq_path/$file1\.stat";
system "fastx_quality_stats -Q 33 -i $rd1 -o $out1";
foreach my $key ($out1){
open IN ,"<", $key or die "Test Can't open $key $!";
while(<IN>){
chomp;
unless(/column/){
($line,$g,$c,$n) = (split"\t",$_)[1,13,14,16];
$total += $line;
$gc += $g+$c;
$Ns += $n;
}
}
($gc_pert) = sprintf("%.2f",$gc/$total*100);
($Ns_pert) = sprintf("%.2f",$Ns/$total*100);
}
return ($gc_pert,$Ns_pert);
}
sub plot_stat {
my ($rd1,$rd2) = @_;
my ($file1,$subpath) = fileparse($rd1);
my ($file2) = fileparse($rd2);
chdir "$fq_path";
open FA, ">> draw.sh" or die $!;
if($rd1 eq $rd2)
{
my $out1 = "$file1.stat";
print FA "/mnt/lustre/users/sunyong/bin/fqcheck/fqcheck_distribute.pl $out1 -size $len\n";
}else{
my $out1 = "$file1.stat";
my $out2 = "$file2.stat";
print FA "/mnt/lustre/users/sunyong/bin/fqcheck/fqcheck_distribute.pl $out1 $out2 -size $len\n";
}
chdir "$raw_path";
}
sub duplicate {
my (@file) = @_;
foreach my $file (@file){
my %hash;
my $dup = 0;
my $total;
open my $in,"<",$file or die "Can't open $file : $!\n";
open my $out,">>","$fq_path/$name\_dup_stat.txt" or die "Can't open file to output\n";
while(<$in>)
{
my $line = <$in>;
chomp($line);
$hash{$line} ++;
<$in>;
<$in>;
$total ++;
}
foreach my $k(keys %hash)
{
if($hash{$k} > 1)
{
$dup += $hash{$k} - 1;
}
}
my $pert = sprintf("%.2f%%",$dup/$total*100);
my ($id,$dir) = fileparse($file);
print $out "$id\t$pert\n";
}
}
sub trim_fq{
my $flag = 0;
my ($i,$rd1,$rd2) = @_;
chdir "$trim_path";
`mkdir $i`;
chdir "$i" ;
if($rd2){
`SeqPrep -f $rd1 -r $rd2 -1 $i\_clip_1.fq -2 $i\_clip_2.fq -3 $i\_discard_1.fq -4 $i\_discard_2.fq -q 20 -L 20 -B AGATCGGAAGAGCGTCGTGT -A AGATCGGAAGAGCACACGTC 2> $i\_trim_adpter.txt `;
`sickle pe -f $i\_clip_1.fq -r $i\_clip_2.fq -t sanger -l 25 -n -o $i\_trim1.fq -p $i\_trim2.fq -s $i\_trim_unpaired.fq >$i\_trim_sickle.log ` ;
$flag = 1;
my $adpter_num;
open ADPTER,"<","$i\_trim_adpter.txt" or die "can't open $!";
while(<ADPTER>){
chomp;
if(/Pairs With Adapters:\s+(\d+)/){
$adpter_num = $1;
print $adpter_num,"\n";
}
}
return ($flag,"$i\_trim1.fq","$i\_trim2.fq",$adpter_num);
# `echo -e "$PWD/$i\_trim1.fq\t$PWD/$i\_trim2.fq\t$PWD/$i\_trim_unpaired.fq" >> trim_fq.list;
}else{
my $adpter_num;
`fastx_clipper -a AGATCGGAAGAGCACACGTC -Q 33 -l 25 -v -i $rd1 -o $i\_trim_adpter.fq >$i\_trim_adpter.txt`;
`DynamicTrim.pl $i\_trim_adpter.fq -h 20 -bwa -sanger`;
`LengthSort.pl $i\_trim_adpter.fq.trimmed -l 25`;
# `sickle se -f $i\_clip_1.fq -t sanger -l 25 -n -s $i\_trim_unpaired.fq >$i\_trim_sickle.log `;
open ADPTER,"<","$i\_trim_adpter.txt" or die "can't open $!";
while(<ADPTER>){
chomp;
if(/(\d+)\s+adpter-only/){
$adpter_num = $1;
}
}
$flag = 1;
return ($flag,"$trim_path/$i/$i\_trim_adpter.fq.trimmed.single",$adpter_num);
}
chdir "$raw_path";
}
test
最新推荐文章于 2021-05-13 21:26:16 发布