#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;

# Help text, emitted through die whenever the command line is unusable.
# $0 interpolates to the name this script was invoked as.  The text spells
# out two facts the rest of the script relies on: the input is read through
# "gzip -dc", and ".fq.gz" is appended to whatever is given with -o.
my $usage = qq{
This script takes a gzip-compressed input FASTQ file (formatted with four lines per sequence) and removes duplicate entries based on ID.
Usage:
perl $0 -i <path for gzip-compressed input file> -o <output file name; ".fq.gz" is appended>
\n};

my $infile;     # -i: path of the gzip-compressed FASTQ file to read
my $outfile;    # -o: output name; the script writes to "$outfile.fq.gz"

# Unknown or malformed options make GetOptions return false.
GetOptions(
    'i=s' => \$infile,
    'o=s' => \$outfile,
) or die $usage;

# Both options are mandatory.
die $usage unless defined $infile && defined $outfile;
# ---------------------------------------------------------------------------
# Main pass: stream the gzip-compressed FASTQ named by $infile, keep the
# first record seen for every distinct header line, and write the surviving
# records, gzip-compressed, to "$outfile.fq.gz".
#
# Compression and decompression are delegated to the external "gzip"
# program, which must be available on PATH.
# ---------------------------------------------------------------------------
my $out_path = "$outfile.fq.gz";

# A pipe open succeeds whenever the fork succeeds, so an unreadable input
# would otherwise only surface as a gzip complaint on STDERR.
-r $infile or die "Unable to open $infile \n";

# List-form pipe open: gzip is exec'd directly, without a shell, so spaces
# or shell metacharacters in the file name are passed through untouched.
# "--" stops gzip from treating a leading "-" in the name as an option.
open(my $IN, '-|', 'gzip', '-dc', '--', $infile)
    or die "Unable to open $infile \n";

# Open the destination in this process so that an unwritable path is
# reported here, then hand the handle to a forked gzip as its STDOUT.
open(my $out_file, '>', $out_path)
    or die "Unable to open file for output. \n";

# Forking pipe open: in the parent $OUT feeds the child's STDIN; in the
# child the call returns 0.
my $gzip_pid = open(my $OUT, '|-');
die "Unable to open file for output. \n" unless defined $gzip_pid;
if ($gzip_pid == 0) {
    open(STDOUT, '>&', $out_file)
        or die "Unable to redirect gzip output to $out_path: $!\n";
    exec('gzip')
        or die "Unable to run gzip: $!\n";
}
close $out_file;    # the parent no longer needs its copy of the handle

my %id_hash;                   # header lines already written
my $unique_seq_counter = 0;    # number of records written
my $fastq_line_count   = 0;    # 1-based line number within the input
my $print_flag         = 0;    # true while the current record is being kept

while (my $line = <$IN>) {
    $fastq_line_count++;

    # Every fourth line, starting with the first, is a record header.  The
    # complete header line is the de-duplication key; the decision made
    # here applies to the three lines that follow it.
    if ($fastq_line_count % 4 == 1) {
        if (exists $id_hash{$line}) {
            $print_flag = 0;
        }
        else {
            $id_hash{$line} = 1;
            $unique_seq_counter++;
            $print_flag = 1;
        }
    }

    next unless $print_flag;
    print {$OUT} $line
        or die "Unable to write to $out_path: $!\n";
}

if ($fastq_line_count % 4 != 0) {
    warn "Warning: $infile has $fastq_line_count lines, which is not a "
       . "multiple of four; the input may be truncated or malformed.\n";
}

# close() on a pipe waits for the child and returns false when it exited
# with a non-zero status, so decompression or compression failures are
# reported instead of passing silently.
close $IN
    or die "gzip failed while reading $infile (wait status $?)\n";
close $OUT
    or die "gzip failed while writing $out_path (wait status $?)\n";

print "\nThere are $unique_seq_counter unique sequences in the output file: $out_path\n";
exit;

# Nothing below this marker is compiled; free-form notes may follow it.
__END__
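# 1. The input is a gzip-compressed FASTQ file (.gz); the output is the de-duplicated FASTQ, also gzip-compressed (fastq.gz).
# 2. Usage:
#    perl fastq-rmove-duplicate.pl -i <fastq.gz to de-duplicate> -o <de-duplicated fastq.gz>
# The author of this post has started a new public account; please scan the QR code to follow it. Many thanks.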