cnvpytor 简介
CNV分析里比较常用到的CNVnator,该工具的高精度和高感度受到广泛的欢迎和应用。CNVpytor就是基于该工具开发的,使用Python内核,强化了CNVnator的可视化,模块化,功能和计算能力。根据RD(Read Depth)来检测CNV(copy number variations)和CNA(copy number alterations),是CNVnator的进化版。
cnvpytor github
https://github.com/abyzovlab/CNVpytor
conda安装
conda install cnvpytor -y
使用示例
sh cnvpytor_calling.sh sample001 /public/analysis/result/
Calling CNV Shell程序代码
cnvpytor_calling.sh
#!/usr/bin/bash
# 样本名称
sample=$1
# 输出结果路径
output_dir=$2
# 去除PCR重复的bam文件位于输出结果目录的下方
# 去除PCR重复可使用Samtools工具
rmdup_bam_path=${output_dir}/${sample}.rmdup.bam
# hg19参考基因组路径
hg19_reference=/reference/hg19/hg19.fa
# cnvpytor路径
# which cnvpytor获取
cnvpytor=/public/software/anaconda3/envs/WGS/bin/cnvpytor
# 检测窗口大小1000Kb
detect_bin=1000000
# 检测是否存在bam文件的index
if [ ! -f "$rmdup_bam_path.idx" ];then
samtools index $rmdup_bam_path
fi
# cnvpytor calling cnv 参考github
$cnvpytor -root ${output_dir}${sample}/${sample}.pytor -rd $rmdup_bam_path
$cnvpytor -root ${output_dir}${sample}/${sample}.pytor -his $detect_bin
$cnvpytor -root ${output_dir}/${sample}/${sample}.pytor -stat $detect_bin
$cnvpytor -root ${output_dir}${sample}/${sample}.pytor -partition $detect_bin
$cnvpytor -root ${output_dir}${sample}/${sample}.pytor -call $detect_bin > ${output_dir}${sample}/${sample}.pytor.txt
sample.pytor.txt
对CNV进行质控
cd ${output_dir}$
$cnvpytor -root ${sample}.pytor -view $detect_bin << EOF
print calls
# 质量值过滤
set Q0_range 0 0.5
set size_range $detect_bin inf
print calls
# 显著性过滤
set p_range 0 0.01
# 输出为tsv和vcf文件
set print_filename ${sample}.pytor.filter.tsv
print calls
set print_filename ${sample}.pytor.filter.vcf
print calls
EOF
sample.pytor.filter.tsv
对CNV结果进行合并
python cnvpytor_merge.py --o ${output_dir}${sample}/${sample}
合并程序python代码
cnvpytor_merge.py
import pandas as pd
import optparse
import os
# 获取所在染色体带
def get_cytoband(pos_start, pos_end, chrom):
# cytoBand.txt文件内容参考本人生信技能30
cytoband_file = "./cytoBand.txt"
try:
df = pd.read_csv(cytoband_file, header=None, sep='\t')
df.columns = ['chr', 'start', 'end', 'cyto', 'gene']
df_chr = df[df['chr'] == chrom].reset_index()
list_ctyo_start = []
list_ctyo_end = []
for idx, row in df_chr.iterrows():
cyto_start = int(row['start'])
cyto_end = int(row['end'])
if int(pos_start) >= cyto_start:
list_ctyo_start.append(row['cyto'])
if int(pos_end) <= cyto_end:
list_ctyo_end.append(row['cyto'])
if idx == (len(df_chr) - 1 ):
if len(list_ctyo_start) == 0:
list_ctyo_start.append(row['cyto'])
if len(list_ctyo_end) == 0:
list_ctyo_end.append(row['cyto'])
res_cyto_start = list_ctyo_start[len(list_ctyo_start) - 1]
res_cyto_end = list_ctyo_end[0]
if res_cyto_start == res_cyto_end:
cyto = res_cyto_start
else:
if 'p' in res_cyto_start and 'p' in res_cyto_end:
cyto = res_cyto_start + res_cyto_end
elif 'q' in res_cyto_start and 'q' in res_cyto_end:
cyto = res_cyto_start + res_cyto_end
else:
cyto = res_cyto_start + res_cyto_end
except:
cyto = ""
return cyto
def merge_results(input_file, output_file):
df = pd.read_csv(input_file, sep='\t', header=None, names=["sample", "type", "dup/del", "chrom", "start",
"end", "size", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "value"])
df_merge = pd.DataFrame()
for idx, row in df.iterrows():
df_merge.at[idx, 'sample'] = row['sample']
type = "dup" if row['dup/del'] == "duplication" else "del"
df_merge.at[idx, 'type'] = type
df_merge.at[idx, 'chrom'] = row['chrom']
pos_cyto = get_cytoband(pos_start=row['start'], pos_end=row['end'], chrom=row['chrom'])
df_merge.at[idx, 'cyto'] = pos_cyto
region = f"{int(row['start'])}-{int(row['end'])}"
df_merge.at[idx, 'region'] = region
size = f"{float(row['size'] / 1000000)}M"
df_merge.at[idx, 'size'] = size
copy_num = 1 if type == 'dup' else 2
df_merge.at[idx, 'result'] = f"{type}({row['chrom']})({pos_cyto})({size}) {row['chrom']}:g.{region}{type}"
df_merge.to_csv(output_file, sep='\t', index=False)
if __name__ == '__main__':
parse = optparse.OptionParser(usage='"%prog"', version="%prog V1.0")
parse.add_option("--o", "--output-path", dest="output_path", type=str, help="the output path of file.")
options, args = parse.parse_args()
result_merge_path = options.output_path + '.pytor.merge.tsv'
if os.path.exists(result_path):
merge_results(result_path, result_merge_path)