从NCBI基因组数据中获得cds，pep和geneID对应表

最新推荐文章于 2025-03-19 19:02:33 发布

xflee0608

最新推荐文章于 2025-03-19 19:02:33 发布

阅读量1.5w

点赞数 3

分类专栏： perl 文章标签： perl

本文链接：https://blog.csdn.net/hugolee123/article/details/38441607

版权

perl 专栏收录该内容

14 篇文章

订阅专栏

在做基因组相关分析时，我们常常需要从基因组中提取cds，并翻译成相应的pep序列。此脚本，以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件，快速获得cds序列，pep序列，RNA,Protein和gene的对应关系表等相关文件。

A perl script which deals with ncbi raw data,and from which get cds ,pep and gene,mRNA and protein ID list.

使用方法如下：

perl $0 gff_filegenomes_fa_file file_prefix

#! /usr/bin/perl -w

=head1 #######################################
	
=head1 name
	grep_cds_pep_from_ncbi_genomes_datas.pl

=head1 description
	deal with ncbi raw data,and from which get cds ,pep and gene,mRNA and protein ID list.

=head1 example
	perl  $0 ref_ConCri1.0_top_level.gff3.gz ccr_ref_ConCri1.0_chrUn.fa.gz  mole
	perl  $0 gff_file genomes_fa_file  file_prefix

=head1 author
	original from Xiangfeng Li,xflee0608@163.com
		##2014-4-19/21##

=head1 #######################################
=cut

use strict;
die `pod2text $0` unless @ARGV==3;
my ($gff,$fa,$prefix)=@ARGV;

##deal gff file##
$gff=~/gz$/ ? (open GFF,"gzip -cd $gff|"||die):(open GFF,$gff||die);
my (%mrna,%cds,%pep);
while(<GFF>){
	chomp;
	next if(/^#/);
	my @p=split/\t/,$_;
	my @q=split/;/,$p[8];
	my ($rna,$pep,$nt,$gene);
	my $chr=$p[0];
	if($p[2] eq "mRNA"){
		($rna=$q[0])=~s/ID=//;
		($nt=$q[1])=~s/Name=//;
		($gene=$q[-3])=~s/gene=//;
		$mrna{$chr}{$rna}{strand}=$p[6];
		$cds{$rna}=[$gene,$nt];
	}
        if($p[2] eq "CDS"){
                ($pep=$q[1])=~s/Name=//;
                ($rna=$q[2])=~s/Parent=//;
                push @{$mrna{$chr}{$rna}{nt}},[$p[3],$p[4]];
                $pep{$rna}=$pep;
        }
}
close GFF;

##get id list##
my %anno;
my $id_gene_cds_pep=$prefix."_id_gene_cds_pep.lst";
open ID, ">",$id_gene_cds_pep||die;
foreach my $i(sort keys %pep){
	if($cds{$i}){
		my $out=join "\t",$i,$cds{$i}[0],$cds{$i}[1],$pep{$i};
		print ID $out,"\n";
		($anno{$i}=$out)=~s/\t/\|/g;
	}
}
warn "create file:$id_gene_cds_pep\n";
close ID;

##deal fa file##
my %max;
$fa=~/gz$/ ? (open FA,"gzip -cd $fa|"||die):(open FA,$fa||die);
my $raw_cds=$prefix."_raw_cds";
open CDS1,">$raw_cds"||die;
my ($start,$end);
$/=">";<FA>;$/="\n";
while(<FA>){
        my $name=$1 if(/(\S+)/);
	my $info=(split/\|/,$name)[-1];
        $/=">";
        my $seq=<FA>;
        $/="\n";
        $seq=~s/>|\n+//g;
	my $scaf=$mrna{$info};
	foreach my $k(sort keys %$scaf){
		next if(! exists $scaf->{$k}{nt});
		my @p=@{$scaf->{$k}{nt}};
		my $strand=$$scaf{$k}{strand};
		my $get;
		@p=sort{$a->[0]<=>$b->[0]}@p;
		my $loc1=$p[0][0];
		my $loc2=$p[-1][1];
		my ($get_len,$add,$out,$gene);
		if(exists $anno{$k}){
			$add=$anno{$k}; 
			$gene=(split/\|/,$add)[1];
		}else{next;}
		foreach(@p){
			($start,$end)=@$_[0,1];
			$get.=uc(substr($seq,$start-1,$end-$start+1));
		}
		if($strand eq "+"){
			$get_len=length$get;
			$get=~s/([A-Z]{50})/$1\n/g;
			chop($get) unless($get_len%50);
			$out=">$add LOC=$info :$loc1:$loc2:+ length=$get_len\n$get\n";
			push @{$max{$gene}},[$get_len,$out];
			print CDS1 $out;
		}
		if($strand eq "-"){
			$get=&reverse_complement($get);
			$get_len=length$get;
			$get=~s/([A-Z]{50})/$1\n/g;
			chop($get) unless($get_len%50);
			$out=">$add LOC=$info :$loc1:$loc2:- length=$get_len\n$get\n";
			push @{$max{$gene}},[$get_len,$out];
			print CDS1 $out;
		}
	}
}
warn "create file:$raw_cds\n";
close FA;
close CDS1;

##get max transcript##
my $filter_cds=$prefix."_filter_cds";
open CDS2,">$filter_cds"||die;
my @a;
foreach my $j(keys %max){
	my @trans=@{$max{$j}};
	@trans=sort{$a->[0] cmp $b->[0]}@trans;
	push @a,$trans[-1][1];
}
my @a_new;
foreach(@a){
	my $r=$1 if(/^>rna(\d+)/);
	push @a_new,[$r,$_];
}
my @cds_sort=map{$_->[1]}
		sort{$a->[0] <=> $b->[0]}@a_new;
print CDS2 $_ for@cds_sort;
close CDS2;
warn "create file:$filter_cds\n";

##get raw pep sequences##
my $raw_pep=$prefix."_raw_pep";
open PEP1,">",$raw_pep||die;
my @raw_pep=&cds2pep($raw_cds);
print PEP1 $_ for@raw_pep;
close PEP1;
warn "create file:$raw_pep\n";

##get filter pep sequences##
my $filter_pep=$prefix."_filter_pep";
open PEP2,">$filter_pep"||die;
my @filter_pep=&cds2pep($filter_cds);
print PEP2 $_ for@filter_pep;
close PEP2;
warn "create file:$filter_pep\n";

##add label for cds and pep of filter##
my $label=uc($prefix);
open IN1,$filter_cds||die;
my $filter_cds_label=$prefix."_filter_cds_label";
open OUT1,">",$filter_cds_label||die;
while(<IN1>){
	chomp;
	if(/^>/){
		my @a=split/\|/,$_,2;
		my $name=$a[0]."_$label";
		print OUT1"$name |$a[1]\n";
	}else{print OUT1 "$_\n";}
}
close IN1;
close OUT1;
warn "create file:$filter_cds_label\n";

open IN2,$filter_pep||die;
my $filter_pep_label=$prefix."_filter_pep_label";
open OUT2,">",$filter_pep_label||die;
while(<IN2>){
        chomp;
        if(/^>/){
                my @a=split/\|/,$_,2;
                my $name=$a[0]."_$label";
                print OUT2 "$name |$a[1]\n";
        }else{print OUT2 "$_\n";}
}
close IN2;
close OUT2;
warn "create file:$filter_pep_label\n";

##timing##
my $time=times;
my $time_out=sprintf "%.2f",$time/60;
print "##########\nElapsed Time :$time_out mins\n##########\n";

##subroutine##
sub reverse_complement{
	my ($seq)=shift;
	$seq=reverse$seq;
	$seq=~tr/AaGgCcTt/TtCcGgAa/;
	return $seq;
}

##subroutine##
sub cds2pep{
	my $file=shift;
	my %code = (
                        "standard" =>
                                {
                                'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',                               # Alanine
                                'TGC' => 'C', 'TGT' => 'C',                                                           # Cysteine
                                'GAC' => 'D', 'GAT' => 'D',                                                           # Aspartic Aci
                                'GAA' => 'E', 'GAG' => 'E',                                                           # Glutamic Aci
                                'TTC' => 'F', 'TTT' => 'F',                                                           # Phenylalanin
                                'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',                               # Glycine
                                'CAC' => 'H', 'CAT' => 'H',                                                           # Histidine
                                'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I',                                             # Isoleucine
                                'AAA' => 'K', 'AAG' => 'K',                                                           # Lysine
                                'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L', 'TTA' => 'L', 'TTG' => 'L',   # Leucine
                                'ATG' => 'M',                                                                         # Methionine
                                'AAC' => 'N', 'AAT' => 'N',                                                           # Asparagine
                                'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',                               # Proline
                                'CAA' => 'Q', 'CAG' => 'Q',                                                           # Glutamine
                                'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R', 'AGA' => 'R', 'AGG' => 'R',   # Arginine
                                'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S', 'AGC' => 'S', 'AGT' => 'S',   # Serine
                                'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T',                               # Threonine
                                'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V',                               # Valine
                                'TGG' => 'W',                                                                         # Tryptophan
                                'TAC' => 'Y', 'TAT' => 'Y',                                                           # Tyrosine
                                'TAA' => 'U', 'TAG' => 'U', 'TGA' => 'U'                                              # Stop
                                }
                        ## more translate table could be added here in future
                        ## more translate table could be added here in future
                        ## more translate table could be added here in future
        );
	open IN,$file||die;
	$/=">";<IN>;$/="\n";
	my @results_set;
	while(<IN>){
		my $info=$_;
        	my @a=split/\s+/,$info;
        	$/=">";my $seq=<IN>;$/="\n";
        	$seq=~s/\n|>//g;
        	my $len=length$seq;
        	my $info_out=join " ",@a[0..($#a-1)];
        	my ($pep_out,$triplet);
        	for(my $i=0;$i<$len;$i+=3){
                	$triplet=substr($seq,$i,3);
                	next if(length$triplet!=3);
                	if(exists $code{standard}{$triplet}){
                        	$pep_out.=$code{standard}{$triplet};
                	}else{$pep_out.="X"}
        	}
        	$pep_out=~s/U$// if($pep_out=~/U$/);
        	my $pep_len=length$pep_out;
        	$pep_out=~s/([A-Z]{50})/$1\n/g;
        	chop($pep_out) unless($pep_len%50);
        	my $results= ">$info_out length=$pep_len\n$pep_out\n";
		push @results_set,$results;
	}
	return @results_set;
}

__END__