#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import logging
import os
import sys
import time
import vcf
import pandas as pd
import numpy as np
from collections import defaultdict
import multiprocessing
__author__ = 'Guang Hu'
__date__ = '2024/05/30'
def calculate_PIC(p): # 此处输入的的是snp位点在群体等位基因的频率集合(. 0 1 ... 所占的比率),剔除缺失等位位点。注:vcftools没有剔除,实际需要剔除
p = np.array(p)
pic = 1 - np.sum(p**2) - np.sum(2 * np.triu(np.outer(p**2, p**2), k=1)) # p**2 equal np.square(p)
return pic
def vcfStats(queue, shared_dict):
while 1:
NR, record = queue.get()
if NR == None:
return
if not record.is_snp: sys.exit('This script only can be used for allelic SNPs!')
aaf = record.aaf
aaf.append(1-sum(record.aaf))
MAF = sorted(aaf, reverse=True)[1]
PIC = calculate_PIC(aaf)
H = -sum(np.array(aaf) * np.log(aaf)) # 计算香农信息指数
shared_dict[NR] = [record.CHROM,record.POS,record.REF,str(record.ALT[0]),record.var_type.upper(),record.var_subtype,MAF,PIC,H,1 - record.call_rate,record.num_called,record.num_het,record.num_hom_alt,record.num_hom_ref,record.heterozygosity,record.nucl_diversity]
return shared_dict
def readerVcf(vcfFile, queues, processor_count):
vcfReader = vcf.Reader(filename=vcfFile)
NR = 1
while True:
for p in range(processor_count):
try:
line = next(vcfReader)
queues[p].put((NR, line))
NR += 1
except StopIteration:
for i in range(processor_count):
queues[i].put((None,None))
return
def writerStat(shared_dict, outFile):
fhout = open(outFile,'w')
fhout.write('Chr\tPos\tRef\tAlt\tVar_type\tVar_subtype\tMAF\tPIC\tH\tMissing\tNum_called\tNum_het\tNum_hom_alt\tNum_hom_ref\tHeterozygosity\tNucl_diversity\n')
# 写入最终统计文件
for k in range(len(shared_dict)):
fhout.write('\t'.join(map(str, shared_dict[k+1])) + '\n')
fhout.close()
def main():
parser = argparse.ArgumentParser(description='Perform basic statistics on allelic SNPs from the VCF file.')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument("-i", "--input", type=str, required=True, help="Input VCF file")
parser.add_argument("-p", "--jobs", type=str, required=True, help="number of process")
parser.add_argument("-o", "--output", type=str, required=True, help="Output file name prefix")
parser.add_argument("--log", type=str, default="run.log", help="output log file")
args = parser.parse_args()
# Setup logging and script timing
logging.basicConfig(filename=args.log, filemode='w', format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.DEBUG)
logging.info('----- Start')
logging.info("# {} #".format(os.path.basename(__file__)))
timeStart = time.time()
manager = multiprocessing.Manager()
shared_dict = manager.dict()
processor_count = int(args.jobs)
queues = [multiprocessing.Queue() for i in range(processor_count)] # maxsize设置100会卡死,不设置会更好
reader = multiprocessing.Process(target=readerVcf, args=(args.input, queues, processor_count))
processors = [multiprocessing.Process(target=vcfStats, args=(queue, shared_dict)) for queue in queues]
reader.start()
for processor in processors:
processor.start()
for processor in processors:
processor.join()
reader.join()
writerStat(shared_dict, args.output)
timeEnd = time.time()
# End of run
logging.info('----- Finish')
logging.info("Elapsed time: {}".format(timeEnd - timeStart))
if __name__ == '__main__':
main()
snp标记计算PIC,香农信息指数等值
于 2024-06-12 16:07:29 首次发布