我们有一个如下格式的文件:第一行为列名;第1到第4列是所有样本共用的信息;从第5列到最后一列,每一列对应一个样本的某个变量的数据。假如我们要把该文件按样本拆分成多个文件,使每个文件由第1到第4列加上该样本自己的那一列组成,该如何处理呢?
举个栗子:
chr start end allele D190531P1-180244WS-1_L3.bam D190531P1-180244WS-F-1_L3.bam D190531P1-180244WS-M-1_L3.bam D190531P1-180245WS_L3.bam D190531P1-180246WS_L3.bam DYDFA-381-1.bam
1 819917 819917 A/G 1 1 1 1 1 0
1 861630 861630 G/A 1 0.6 0.4 1 0.4 0
1 863124 863124 G/T 1 1 0.462366 1 0.358025 0
1 863138 863138 A/C 0 0 0.557895 0 0.648352 0
1 866319 866319 G/A 1 1 1 1 0.979167 1
1 866511 866511 C/CCCCT 1 0.44186 0.462963 0.980392 0.534483 0.735294
1 866519 866519 T/C 0 0 0 0 0 0.225
1 866523 866523 T/C 0 0 0 0 0 0.216216
1 871334 871334 G/T 1 0.54386 0.525 1 0.538462 1
经过拆分后可变为以下六个文件(每个文件以对应样本的列名命名,并把 .bam 后缀换成 .bed;下面按样本顺序依次列出各文件的内容,每个文件都以表头行开始):
chr start end allele D190531P1-180244WS-1_L3.bam
1 819917 819917 A/G 1
1 861630 861630 G/A 1
1 863124 863124 G/T 1
1 863138 863138 A/C 0
1 866319 866319 G/A 1
1 866511 866511 C/CCCCT 1
1 866519 866519 T/C 0
1 866523 866523 T/C 0
1 871334 871334 G/T 1
chr start end allele D190531P1-180244WS-F-1_L3.bam
1 819917 819917 A/G 1
1 861630 861630 G/A 0.6
1 863124 863124 G/T 1
1 863138 863138 A/C 0
1 866319 866319 G/A 1
1 866511 866511 C/CCCCT 0.44186
1 866519 866519 T/C 0
1 866523 866523 T/C 0
1 871334 871334 G/T 0.54386
chr start end allele D190531P1-180244WS-M-1_L3.bam
1 819917 819917 A/G 1
1 861630 861630 G/A 0.4
1 863124 863124 G/T 0.462366
1 863138 863138 A/C 0.557895
1 866319 866319 G/A 1
1 866511 866511 C/CCCCT 0.462963
1 866519 866519 T/C 0
1 866523 866523 T/C 0
1 871334 871334 G/T 0.525
chr start end allele D190531P1-180245WS_L3.bam
1 819917 819917 A/G 1
1 861630 861630 G/A 1
1 863124 863124 G/T 1
1 863138 863138 A/C 0
1 866319 866319 G/A 1
1 866511 866511 C/CCCCT 0.980392
1 866519 866519 T/C 0
1 866523 866523 T/C 0
1 871334 871334 G/T 1
chr start end allele D190531P1-180246WS_L3.bam
1 819917 819917 A/G 1
1 861630 861630 G/A 0.4
1 863124 863124 G/T 0.358025
1 863138 863138 A/C 0.648352
1 866319 866319 G/A 0.979167
1 866511 866511 C/CCCCT 0.534483
1 866519 866519 T/C 0
1 866523 866523 T/C 0
1 871334 871334 G/T 0.538462
chr start end allele DYDFA-381-1.bam
1 819917 819917 A/G 0
1 861630 861630 G/A 0
1 863124 863124 G/T 0
1 863138 863138 A/C 0
1 866319 866319 G/A 1
1 866511 866511 C/CCCCT 0.735294
1 866519 866519 T/C 0.225
1 866523 866523 T/C 0.216216
1 871334 871334 G/T 1
具体命令如下:
# Split a tab-separated table into one file per sample column.
#
# Columns 1-4 are shared; every column from 5 onward belongs to one sample.
# For each sample column a file is written containing columns 1-4 plus that
# column (header line included). The output file name is the column's header
# with a trailing ".bam" replaced by ".bed"; headers without that suffix are
# used unchanged.
#
# Arguments: $1 - input table (default: ddd.tsv)
# Outputs:   one file per sample column in the current directory
# Returns:   awk's exit status (non-zero if the input cannot be read)
split_columns_by_sample() {
  local table=${1:-ddd.tsv}

  # One awk pass writes all outputs directly, so sample names are never
  # interpolated into shell code or used as a printf format string.
  # NOTE: every output file stays open until awk exits; with a very large
  # number of sample columns an awk other than gawk may hit the
  # open-file limit.
  awk -F'\t' '
    BEGIN { OFS = "\t" }
    NR == 1 {
      n_cols = NF
      for (i = 5; i <= n_cols; i++) {
        out[i] = $i
        # Anchored, escaped dot: only a literal ".bam" suffix is replaced.
        sub(/\.bam$/, ".bed", out[i])
      }
    }
    {
      for (i = 5; i <= n_cols; i++)
        print $1, $2, $3, $4, $i > out[i]
    }
  ' "$table"
}

split_columns_by_sample ddd.tsv
类似地,对于一个多样本vcf文件,我们要拆分成多个单个样本的vcf文件,则可这样处理:
# Split a multi-sample VCF into one single-sample VCF per sample.
#
# Each output file "<sample>.vcf" contains the leading "##" meta lines,
# followed by the "#CHROM" header and the data lines reduced to the nine
# fixed columns plus that sample's column.
#
# Arguments: $1 - input VCF (default: All.vcf)
# Outputs:   "<sample>.vcf" in the current directory for every sample column
# Returns:   non-zero if the input cannot be read or if the first line that
#            is not a "##" meta line is not the "#CHROM" header
split_vcf_by_sample() {
  local vcf=${1:-All.vcf}

  # One awk pass writes all outputs directly, so sample names are never
  # interpolated into shell code or used as a printf format string.
  # NOTE: every output file stays open until awk exits; with a very large
  # number of samples an awk other than gawk may hit the open-file limit.
  awk -F'\t' '
    BEGIN { OFS = "\t" }

    # Buffer the meta lines until the sample names are known.
    !have_header && /^##/ { meta[++n_meta] = $0; next }

    # First non-meta line: must be the column header naming the samples.
    !have_header {
      if ($0 !~ /^#CHROM/) exit        # reported in END
      have_header = 1
      n_cols = NF
      for (i = 10; i <= n_cols; i++) {
        out[i] = $i ".vcf"
        for (j = 1; j <= n_meta; j++) print meta[j] > out[i]
      }
      # No "next": the header line itself is emitted by the rule below.
    }

    # "##" lines after the header are dropped from the outputs.
    /^##/ { next }

    {
      fixed = $1
      for (k = 2; k <= 9; k++) fixed = fixed OFS $k
      for (i = 10; i <= n_cols; i++) print fixed, $i > out[i]
    }

    END {
      if (!have_header) {
        print "error: no #CHROM header line found before data" > "/dev/stderr"
        exit 1
      }
    }
  ' "$vcf"
}

split_vcf_by_sample All.vcf
更多讨论,欢迎关注公众号