python 处理snp的vcf文件,统计snp在基因的intron、exon还是上游、下游还是不在基因及基因附近

1 篇文章 0 订阅
1 篇文章 0 订阅

1、c处理vcf文件,初步统计snp位置
位置信息有5种

down_stream是gene下游2k以内,up_stream 是gene上游2k以内,gene_exon是snp在外显子内,gene_intron是gene在内含子内,uninfluncial是gene不在上诉所有区域

SnpType.py
内容:

#这个脚本是用来看snp哪些是在基因中(分为exon和intron),哪些是在基因的上游、下游
#最终输出结果为:scaffold position 位置 geneid
#输入文件1:snpeff结果文件(/public/home/wangwen_lab/zhangjiexiong/Ageing/SNP/snpeff/snpeffC_F/02.runC/mor_imp_strictestfilter_Snp_AncRef_selected_anno4.vcf)
#输入文件2:gff3文件(/public/home/wangwen_lab/zhangjiexiong/Ageing/SNP/genome.gff3)

import sys,re
file1 = sys.argv[1]
file2 = sys.argv[2]
file3 = sys.argv[3]
f1 = open(file1,'r')
f2 = open(file2,'r')
f3 = open(file3,'w')
flag = 'fuck'
f1dick = {}
arr1 = []
for i in f1:
        if re.match("#",i):
                continue
        a = i.split("\t")
        if flag != a[0] and flag is not 'fuck':
                f1dick[flag] = arr1
                flag = a[0]
                arr1 = []
        #       print(a[0])
                arr1.append(a[1])
        elif flag is 'fuck':
                flag = a[0]
        else:
                arr1.append(a[1])
f1dick[flag] = arr1
#for i in f1dick:
#        print(i,f1dick[i])

sc = "fuck"
f2dick1 = {}
f2dick2 = {}
f2arr1 = []
for i in f2:
        if re.match("M",i) == None:
                f2dick1[geneid] = f2arr1
                continue
        a = i.split("\t")
        if a[2] == 'gene':
                f2arr1 = []
                f2arr1.append(a[3:5])
                if sc != a[0]:
                        f2dick2[sc] = f2dick1
                        f2dick1 = {}
                        sc = a[0]
                geneid = a[8].split(";")[0].split("=")[1]
        if a[2] == 'exon':
                f2arr1.append(a[3:5])
f2dick1[geneid] = f2arr1
f2dick2[sc] = f2dick1
del f2dick2['fuck']
f2.close()
for i in f1dick.keys():
        for j in f1dick[i]:#取到scaffold内的所有snp位点
                try:
                        taq = 0
                        for k in f2dick2[i].keys():#对gff3文件中所有的scaffold为i的gene进行遍历
                                a = f2dick2[i][k]
                                amax = max(a[0])
                                amin = min(a[0])
                                flag = 0
                                if int(amin) <= int(j) <= int(amax):#判断在gene中后判断在exon还是在intron
                                        for u in a[1:]:
                                                if  int(min(u))<int(j)<int(max(u)):
                                                        flag = "gene_exon"
                                        if flag != "gene_exon":
                                                flag = "gene_intron"
                                elif int(min(a[0]))-2000 <= int(j) <=int(min(a[0])):#判断是不是在gene下游
                                        flag = 'down_stream'
                                elif int(max(a[0]))+2000 >= int(j) >= int(max(a[0])):#判断是不是gene上游
                                        flag = "up_stream"
                                else:
                                        pass
                                if flag != 0:
                                        f3.write(i+"\t"+j+"\t"+k+"\t"+flag+"\t"+max(a[0])+"\t"+min(a[0])+"\n")
                                        taq = 1
                        if taq == 0:
                                f3.write(i+"\t"+j+"\t"+"\t"+"Uninfluncial"+"\n")

                except KeyError:
                        print("KeyError",sys.exc_info())
f3.close()
                        

输入文件为vcf和gff3
vcf内容如下:

##contig=<ID=MI_M04M24_Scaffold001_2344856_2346921_target_region_2345149_2346921,length=2066>
##contig=<ID=MI_M04M24_Scaffold018_52187_53324_target_region_52187_53324,length=1138>
##contig=<ID=MI_M04M24_Scaffold037_441019_444340_target_region_441019_444340,length=3322>
##contig=<ID=MI_M04M24_Scaffold041_338612_340340_target_region_338612_340099,length=1729>
##contig=<ID=MI_M04M24_Scaffold041_340322_342654_target_region_340322_342654,length=2333>
##contig=<ID=MI_M04M24_Scaffold114_32039_35224_target_region_32039_35224,length=3186>
##reference=file:///public/home/wangwen_lab/zhangjiexiong/Ageing/SNP/genome.fasta
##source=SelectVariants
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  Mor_importuna
Morimp01GS001   1387    .       C       T       969.80  PASS    AC=2;AF=1.00;AN=2;BaseQRankSum=1.56;ClippingRankSum=-4.910e-01;DP=28;ExcessHet=
Morimp01GS001   1418    .       C       T       1427.78 PASS    AC=2;AF=1.00;AN=2;BaseQRankSum=-1.073e+00;ClippingRankSum=-2.400e-02;DP=43;Exce
Morimp01GS001   8939    .       A       C       282.77  PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=1.67;ClippingRankSum=-5.330e-01;DP=13;ExcessHet
Morimp01GS001   9251    .       C       T       1538.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=0.460;ClippingRankSum=0.631;DP=88;ExcessHet=3.0
Morimp01GS001   12787   .       T       G       2514.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=2.25;ClippingRankSum=0.383;DP=109;ExcessHet=3.0
Morimp01GS001   13324   .       C       G       2177.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=2.17;ClippingRankSum=1.50;DP=88;ExcessHet=3.010
Morimp01GS001   13446   .       G       C       1158.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=0.994;ClippingRankSum=-1.990e-01;DP=58;ExcessHe
Morimp01GS001   13866   .       A       C       2370.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=-1.094e+00;ClippingRankSum=-6.080e-01;DP=100;Ex
Morimp01GS001   14181   .       T       C       1455.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=0.710;ClippingRankSum=-6.500e-01;DP=71;ExcessHe
Morimp01GS001   14320   .       A       G       1473.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=1.42;ClippingRankSum=-8.810e-01;DP=64;ExcessHet
Morimp01GS001   14344   .       C       T       1471.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=1.48;ClippingRankSum=-1.880e-01;DP=69;ExcessHet
Morimp01GS001   14606   .       T       G       2329.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=-1.280e-01;ClippingRankSum=-1.583e+00;DP=94;Exc
Morimp01GS001   14869   .       A       G       2209.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=-4.340e-01;ClippingRankSum=-2.854e+00;DP=99;Exc
Morimp01GS001   14897   .       C       T       2048.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=0.603;ClippingRankSum=1.03;DP=88;ExcessHet=3.01
Morimp01GS001   15056   .       A       T       1930.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=0.870;ClippingRankSum=-3.810e-01;DP=88;ExcessHe
Morimp01GS001   15384   .       A       G       1690.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=-1.241e+00;ClippingRankSum=0.431;DP=86;ExcessHe
Morimp01GS001   15564   .       C       T       2023.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=0.198;ClippingRankSum=0.014;DP=85;ExcessHet=3.0
Morimp01GS001   15729   .       C       T       2458.77 PASS    AC=1;AF=0.500;AN=2;BaseQRankSum=

gff3如下:

Morimp01GS001   AUGUSTUS        gene    16797   18718   0.81    +       .       ID=MIM04M26Gene00001;Name=MIM04M26Gene00001
Morimp01GS001   AUGUSTUS        mRNA    16797   18718   0.81    +       .       ID=MIM04M26Gene00001.t1;Parent=MIM04M26Gene00001
Morimp01GS001   AUGUSTUS        exon    16797   16949   .       +       .       ID=MIM04M26Gene00001.t1.exon1;Parent=MIM04M26Gene00001.t1
Morimp01GS001   AUGUSTUS        CDS     16797   16949   0.82    +       0       ID=MIM04M26Gene00001.t1.CDS1;Parent=MIM04M26Gene00001.t1
Morimp01GS001   AUGUSTUS        exon    17024   17219   .       +       .       ID=MIM04M26Gene00001.t1.exon2;Parent=MIM04M26Gene00001.t1
Morimp01GS001   AUGUSTUS        CDS     17024   17219   0.99    +       0       ID=MIM04M26Gene00001.t1.CDS2;Parent=MIM04M26Gene00001.t1
Morimp01GS001   AUGUSTUS        exon    17298   18718   .       +       .       ID=MIM04M26Gene00001.t1.exon3;Parent=MIM04M26Gene00001.t1
Morimp01GS001   AUGUSTUS        CDS     17298   18718   0.98    +       2       ID=MIM04M26Gene00001.t1.CDS3;Parent=MIM04M26Gene00001.t1

Morimp01GS001   AUGUSTUS        gene    45970   47158   0.55    -       .       ID=MIM04M26Gene00002;Name=MIM04M26Gene00002
Morimp01GS001   AUGUSTUS        mRNA    45970   47158   0.55    -       .       ID=MIM04M26Gene00002.t1;Parent=MIM04M26Gene00002
Morimp01GS001   AUGUSTUS        exon    47073   47158   .       -       .       ID=MIM04M26Gene00002.t1.exon1;Parent=MIM04M26Gene00002.t1
Morimp01GS001   AUGUSTUS        CDS     47073   47158   0.6     -       0       ID=MIM04M26Gene00002.t1.CDS1;Parent=MIM04M26Gene00002.t1
Morimp01GS001   AUGUSTUS        exon    46887   46999   .       -       .       ID=MIM04M26Gene00002.t1.exon2;Parent=MIM04M26Gene00002.t1
Morimp01GS001   AUGUSTUS        CDS     46887   46999   0.92    -       1       ID=MIM04M26Gene00002.t1.CDS2;Parent=MIM04M26Gene00002.t1
Morimp01GS001   AUGUSTUS        exon    46644   46784   .       -       .       ID=MIM04M26Gene00002.t1.exon3;Parent=MIM04M26Gene00002.t1
Morimp01GS001   AUGUSTUS        CDS     46644   46784   0.91    -       2       ID=MIM04M26Gene00002.t1.CDS3;Parent=MIM04M26Gene00002.t1
Morimp01GS001   AUGUSTUS        exon    45970   46559   .       -       .       ID=MIM04M26Gene00002.t1.exon4;Parent=MIM04M26Gene00002.t1
Morimp01GS001   AUGUSTUS        CDS     45970   46559   0.92    -       2       ID=MIM04M26Gene00002.t1.CDS4;Parent=MIM04M26Gene00002.t1

输入结果为c.txt
对结果再进行排序

sort -k1 -k2 -V c.txt > snp_type_F.list

snp_type_F.list结果如下:

Morimp01GS001   1418            Uninfluncial
Morimp01GS001   1476            Uninfluncial
Morimp01GS001   8939            Uninfluncial
Morimp01GS001   9251            Uninfluncial
Morimp01GS001   12787           Uninfluncial
Morimp01GS001   13324           Uninfluncial
Morimp01GS001   13446           Uninfluncial
Morimp01GS001   13866           Uninfluncial
Morimp01GS001   14181           Uninfluncial
Morimp01GS001   14320           Uninfluncial
Morimp01GS001   14344           Uninfluncial
Morimp01GS001   14606           Uninfluncial
Morimp01GS001   14869   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   14897   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   15056   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   15384   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   15564   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   15729   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   15747   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   15943   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   15981   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   16351   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   16374   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   16376   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   16472   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   16496   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   16640   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   16761   MIM04M26Gene00001       down_stream     18718   16797
Morimp01GS001   17077   MIM04M26Gene00001       gene_exon       18718   16797
Morimp01GS001   17340   MIM04M26Gene00001       gene_exon       18718   16797
Morimp01GS001   17470   MIM04M26Gene00001       gene_exon       18718   16797
Morimp01GS001   17478   MIM04M26Gene00001       gene_exon       18718   16797
Morimp01GS001   17963   MIM04M26Gene00001       gene_exon       18718   16797
Morimp01GS001   18077   MIM04M26Gene00001       gene_exon       18718   16797
Morimp01GS001   18153   MIM04M26Gene00001       gene_exon       18718   16797
Morimp01GS001   18448   MIM04M26Gene00001       gene_exon       18718   16797
Morimp01GS001   18449   MIM04M26Gene00001       gene_exon       18718   16797
Morimp01GS001   18798   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   18802   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   18970   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   19137   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   19292   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   19367   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   19394   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   19401   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   19446   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   19785   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   19832   MIM04M26Gene00001       up_stream       18718   16797
Morimp01GS001   19885   MIM04M26Gene00001       up_stream       18718   16797

次文件position存在重复(eg: 既在A gene内,又在B gene的上游)
处理一下
display_SnpType.py
内容如下:

#这个是用来处理SnpType.py
import sys,re,math
file1 = sys.argv[1]
file2 = sys.argv[2]
f1 = open(file1,'r')
f2 = open(file2,'w')
la = [0,0]
a = []
fuck  = []
arr =[]
arrline = []
for i in f1:
        if re.search("Uninfluncial",i):
                f2.write(i)
        else:
                line = i.strip()
                a = line.split("\t")
                if a[1] == la[1]:
                        arr.append(la[4])
                        arr.append(la[5])
                        arrline.append(la)
                else:
                        if la[1] == 0:
                                la = a
                                continue
                        arrline.append(la)
                        arr.append(la[4])
                        arr.append(la[5])
                        for j in arr:
                                c = abs(int(j)-int(la[1]))
                                fuck.append(c)
                        t = min(fuck)
                        zjx = fuck.index(t)
                        x = zjx//2
                        fuck = []
                        for zz in arrline[x]:
                                f2.write(zz+"\t")
                        f2.write("\n")
                        arrline = []
                        arr = []
                la = a
arrline.append(la)
arr.append(la[4])
arr.append(la[5])
for j in arr:
        c = abs(int(j)-int(la[1]))
        fuck.append(c)
t = min(fuck)
zjx = fuck.index(t)
x = zjx//2
fuck = []
for zz in arrline[x]:
        f2.write(zz+"\t")
f2.write("\n")
arrline = []
arr = []
f1.close()
f2.close()
python display_SnpType.py snp_type_F.list snp_type_F_clear.list

结果和处理之前差不多,就是根据距离筛选了与SNP的position关联最强的gene,一个position对应一个gene

  • 3
    点赞
  • 15
    收藏
    觉得还不错? 一键收藏
  • 8
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 8
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值