从多物种vcf文件中提取genotype
#合并多个文件(默认换行符)
find ./ -name "chr??.vcf" |xargs sed 'a\' >275gene_1045parent.vcf
#提取genotyping
awk '{for(a=1;a<6;a++) printf("%s\t",$a);for (i=10;i<=NF;i++) printf("%-4s\t",substr($i,1,3)); printf"\n"}' 258gene_1045parent.vcf >258gene_1045parent_genotype.vcf
#匹配genotype进行替换
sed -e 's/0[/||]0/WT/g' -e 's/1[/||]1/MU/g' -e 's/0[/||]1/HE/g' -e 's/.[/||]./??/g' 258gene_1045parent_genotype.vcf >258gene_1045parent_symbol.vcf
#通过vim给文件加上表头(物种名)
产生随机数
for i in {1..10};do echo $i;done
统计染色体长度
脚本路径: /data8/home/dlguo/script/pyscript/cal_chrom_length.py
#!/usr/bin/python
#coding=utf-8
#传递命令行参数
import sys
#从命令行获取文件名称
f_fasta = sys.argv[1]
#打开文件
f = open(f_fasta)
#计算染色体长度
lines = f.readlines()
chr = ""
chr_len = ""
print("CHR\tSTART\tEND")
for line in lines:
#去掉行尾的换行符
line = line.strip()
if (line.startswith(">")):
if not chr=="":
print(chr + "\t0\t" + str(chr_len))
chr = line.lstrip(">")
chr_len = 0
else:
chr_len += len(line)
print(chr + "\t0\t" + str(chr_len))
#计算fasta格式的每条序列的长度
python /data8/home/dlguo/script/pyscript/cal_chrom_length.py /data6/home/zqiang/old_data/db/IRGSP-1.0_genome.fasta >/data8/home/dlguo/WAH/IRGSP-1.0_genome_chr.len
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#将基因位置列表写入字典列表
import os
def txt_read(files):
txt_list = []
fopen = open(files)
for line in fopen.readlines():
line = str(line).replace("\n","")
dict_temp = {}
dict_temp[line.split(' ',1)[0]] = line.split(' ',1)[1]
txt_list.append(dict_temp)
return(txt_list)
fopen.close()
txt_list = txt_read("202103_important_gene_position_valid")
def make_script(total_vcf):
filename = "Extract_varition_info.sh"
os.mknod(filename)
with open(filename,'w') as f:
f.write("#!/bin/bash\n")
for i in txt_list:
for k,v in i.items():
with open(filename,'a') as f:
f.write("awk '{if ($1==" + str(k) + "&& $2==" + str(v) + ")print $0' /data8/home/dlguo/WAH/F2_pop/total_filter_final_symbol.vcf >>Extract_SNP.vcf\n")