bulk RNA-Seq (4)合并表达矩阵

欢迎关注bioinfor 生信云!有一起想做公众号的朋友欢迎联系我!

上一步定量,我们是对每一个样本进行的,现在需要将它合并到一个矩阵。这里通过一个脚本merge_to_matrix.pl 来合并。

脚本

#!/usr/bin/env perl

use strict;
use warnings;
use FindBin;
use File::Basename;
use Getopt::Long qw(:config no_ignore_case bundling pass_through);

my $usage = <<__EOUSAGE__;

############################################################
#
# Usage:  $0 --est_method <method>  sample1.results sample2.results ...
#
#      or  $0 --est_method <method> --quant_files file.listing_target_files.txt
#
#      Note, if only a single input file is given, it's expected to contain the paths to all the target abundance estimation files.
#
# Required:
#            
#  --est_method <string>           featureCounts|RSEM|eXpress|kallisto|salmon  (needs to know what format to expect)
#
# Options:
#
#  --cross_sample_norm <string>         TMM|UpperQuartile|none   (default: TMM)
#
#  --name_sample_by_basedir             name sample column by dirname instead of filename
#      --basedir_index <int>            default(-2)
#
#  --out_prefix <string>                default: 'matrix'
#
#  --quant_files <string>              file containing a list of all the target files.
#
############################################################


__EOUSAGE__

    ;


my $help_flag;
my $est_method;
my $val_type;
my $cross_sample_norm = "TMM";
my $name_sample_by_basedir = 0;
my $out_prefix = "matrix";
my $basedir_index = -2;
my $quant_files = "";

&GetOptions('help|h' => \$help_flag,
            'est_method=s' => \$est_method,
            
            'cross_sample_norm=s' => \$cross_sample_norm,
            'name_sample_by_basedir' => \$name_sample_by_basedir,
            'out_prefix=s' => \$out_prefix,
            'basedir_index=i' => \$basedir_index,
            
            'quant_files=s' => \$quant_files,
            );


unless ($est_method && (@ARGV || $quant_files)) {
    die $usage;
}

unless ($est_method =~ /^(featureCounts|RSEM|eXpress|kallisto|salmon)/i) {
    die "Error, dont recognize --est_method $est_method ";
}
unless ($cross_sample_norm =~ /^(TMM|UpperQuartile|none)$/i) {
    die "Error, dont recognize --cross_sample_norm $cross_sample_norm ";
}

my @files;

if ($quant_files) {
    # allow for a file listing the various files.
    @files = `cat $quant_files`;
    chomp @files;
}
elsif (@ARGV) {
    @files = @ARGV;
}
else {
    die $usage;
}



=data_formats

## featureCounts

0	gene_id
1	counts
2	fpkm
3	tpm
## RSEM:

0       transcript_id
1       gene_id
2       length
3       effective_length
4       expected_count
5       TPM
6       FPKM
7       IsoPct


## eXpress v1.5:

1       target_id
2       length
3       eff_length
4       tot_counts
5       uniq_counts
6       est_counts
7       eff_counts
8       ambig_distr_alpha
9       ambig_distr_beta
10      fpkm
11      fpkm_conf_low
12      fpkm_conf_high
13      solvable
14      tpm


## kallisto:
0       target_id
1       length
2       eff_length
3       est_counts
4       tpm


## salmon:
0       Name
1       Length
2       EffectiveLength
3       TPM
4       NumReads

=cut
    
    ;

my ($acc_field, $counts_field, $fpkm_field, $tpm_field);

if ($est_method =~ /^rsem$/i) {
    $acc_field = 0;
    $counts_field = "expected_count";
    $fpkm_field = "FPKM";
    $tpm_field = "TPM";
}
elsif ($est_method =~ /^express$/i) {  # as of v1.5
    $acc_field = "target_id";
    $counts_field = "eff_counts";
    $fpkm_field = "fpkm";
    $tpm_field = "tpm";
}
elsif ($est_method =~ /^kallisto$/i) {
    $acc_field = "target_id";
    $counts_field = "est_counts";
    $fpkm_field = "tpm";
    $tpm_field = "tpm";
}
elsif ($est_method =~ /^salmon/) {
    $acc_field = "Name";
    $counts_field = "NumReads";
    $fpkm_field = "TPM";
    $tpm_field = "TPM";
}
elsif ($est_method =~ /^featureCounts/) {
    $acc_field = "gene_id";
    $counts_field = "counts";
    $fpkm_field = "fpkm";
    $tpm_field = "tpm";
}
else {
    die "Error, dont recognize --est_method [$est_method] ";
}

main: {
    
    my %data;
    
    foreach my $file (@files) {
        print STDERR "-reading file: $file\n";
        open (my $fh, $file) or die "Error, cannot open file $file";
        my $header = <$fh>; 
        chomp $header;
        my %fields = &parse_field_positions($header);
        #use Data::Dumper; print STDERR Dumper(\%fields);
        while (<$fh>) {
            chomp;
            
            my @x = split(/\t/);
            my $acc = $x[ $fields{$acc_field} ];
            my $count = $x[ $fields{$counts_field} ];
            my $fpkm = $x[ $fields{$fpkm_field} ];
            my $tpm = $x[ $fields{$tpm_field} ];

            $data{$acc}->{$file}->{count} = $count;
            $data{$acc}->{$file}->{FPKM} = $fpkm;
            $data{$acc}->{$file}->{TPM} = $tpm;
        }
        close $fh;
    }
    
    my @filenames = @files;
    foreach my $file (@filenames) {
        if ($name_sample_by_basedir) {
            my @path = split(m|/|, $file);
            $file = $path[$basedir_index];
        }
        else {
            $file = basename($file);
        }
    }
    print STDERR "\n\n* Outputting combined matrix.\n\n";
    
    my $counts_matrix_file = "$out_prefix.counts.matrix";
    my $TPM_matrix_file = "$out_prefix.TPM.not_cross_norm";
    open (my $ofh_counts, ">$counts_matrix_file") or die "Error, cannot write file $counts_matrix_file";
    open (my $ofh_TPM, ">$TPM_matrix_file") or die "Error, cannot write file $TPM_matrix_file";

    { # check to see if they're unique
        my %filename_map = map { + $_ => 1 } @filenames;
        if (scalar keys %filename_map != scalar @filenames) {
            die "Error, the column headings: @filenames are not unique.  Should you consider using the --name_sample_by_basedir parameter?";
        }
    }

脚本用法

perl merge_to_matrix.pl \
--est_method featureCounts \#定量使用的软件
--quant_files genes.quant_files.txt \#定量的样本
--out_prefix genes #输出文件前缀

输入:每个样本的定量结果
输出:
reads count 矩阵: genes.counts.matrix #原始矩阵
表达量矩阵:genes.TMM.EXPR.matrix #标准化后的矩阵

想要获取脚本联系小编哟!

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
以下是将cpm表达矩阵和分组信息分别放在2个不同的txt文件中,使用Mfuzz包分析表达趋势的R语言代码示例。 首先,我们需要加载两个txt文件,一个包含cpm表达矩阵,另一个包含分组信息。这里假设cpm矩阵文件为`cpm_matrix.txt`,分组信息文件为`group_info.txt`。 ```R # 加载Mfuzz包 library(Mfuzz) # 加载cpm表达矩阵 cpm_matrix <- read.table("cpm_matrix.txt", header = TRUE, row.names = 1, sep = "\t") # 加载分组信息 group_info <- read.table("group_info.txt", header = TRUE, row.names = 1, sep = "\t") # 将cpm表达矩阵转置为Mfuzz所需的格式 mfuzz_input <- t(cpm_matrix) # 运行Mfuzz进行聚类分析 mfuzz_obj <- mfuzz(mfuzz_input, c = 2, m = 1.5) # 生成聚类热图 pdf("mfuzz_heatmap.pdf") mfuzz.plot(mfuzz_obj, main = "Mfuzz聚类热图") dev.off() # 生成聚类指标 pdf("mfuzz_membership.pdf") plotmfuzz(mfuzz_obj, mfrow = c(2,2)) dev.off() # 进行表达趋势分析 trend_genes <- apply(mfuzz_obj$membership, 1, function(x) max(x) > 0.8 & min(x) < 0.2) trend_res <- trendInTime(mfuzz_input[trend_genes, ], group_info, plot=TRUE) ``` 在上面的代码中,我们先分别加载cpm表达矩阵和分组信息。然后,将cpm表达矩阵转置为Mfuzz所需的格式,并运行Mfuzz进行聚类分析。接着,我们生成聚类热图和聚类指标。最后,我们使用`trendInTime`函数进行表达趋势分析,`trend_genes`是一个逻辑向量,代表具有表达趋势的基因。`trend_res`是表达趋势分析的结果,可以使用`plotTrends`函数进行可视化。 需要注意的是,以上代码仅为示例,实际上cpm表达矩阵和分组信息的加载和Mfuzz分析可能涉及到更多的步骤和参数调整,具体情况需根据实际需求进行调整。 希望这些信息对您有所帮助。如果您有其他问题或需要更多帮助,请随时提出。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Bioinfor 生信云

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值