单倍型Haploview分析

0.关注SNP的筛选

import pandas as pd


def extract_snps_from_vcf(input_file, output_file, target_snp, interval):
    # Read the VCF file
    df = pd.read_csv(input_file, sep='\t', comment='#', header=None,
                     names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] + [f'SAMPLE_{i}' for
                                                                                                       i in
                                                                                                       range(1000)])

    # Find the target SNP position and chromosome
    target_row = df[df['ID'] == target_snp]
    if target_row.empty:
        print(f"Target SNP {target_snp} not found in the file.")
        return
    target_position = target_row['POS'].values[0]
    target_chrom = target_row['CHROM'].values[0]

    # Calculate the interval range (910K * 2)
    interval *= 4
    start_position = max(0, target_position - interval)
    end_position = target_position + interval

    # Extract SNPs within the interval and matching the chromosome
    result = df[(df['CHROM'] == target_chrom) & (df['POS'] >= start_position) & (df['POS'] <= end_position)]

    # Read the original file to get the header
    with open(input_file, 'r') as f:
        header = []
        for line in f:
            if line.startswith('#'):
                header.append(line)
            else:
                break

    # Write the header and the filtered data to the new VCF file
    with open(output_file, 'w') as f:
        f.writelines(header)

    result.to_csv(output_file, sep='\t', index=False, header=False, mode='a')

    print(
        f"Extracted {len(result)} SNPs within the {interval} bp interval of {target_snp} on chromosome {target_chrom}.")


# Usage
input_file = 'clean.vcf'
output_file = 'extracted_snps.vcf'
target_snp = '1A_360259436'
interval = 910000  # 910K bp, which will be doubled in the function

extract_snps_from_vcf(input_file, output_file, target_snp, interval)

 自己筛选出vcf中所关注的连锁区块位点

1.路径下vcf文件转换——>使用plink

 plink --vcf vcf_block.vcf --recode --out output_file --allow-extra-chr

2.数据处理(1)

ped中的表型数据默认为-9,需要改为0。这个是必须的,否则就报错

# ped中的表型数据默认为-9,需要改为0。这个是必须的,否则就报错
f1 = open("output_file.ped", "r").read()
f2 = open("zc.ped", "w")
f2.write(f1.replace("-9", "0").replace("BC-", "BC_"))  # 替换掉文件中会出现异常的下划线和表型数据默认的-9
f2.close()

3.数据处理(2)

筛选关注SNP的位点并处理map文件(只取2、4)两列

# 将map的第二列和第四列提取出来,保存为a1.info文件。
import pandas as pd

f_map = pd.read_csv("output_file.map", sep="\t")
f_map.iloc[:, [1, 3]].to_csv("zc2.map", index=False, sep="\t")

4.jar包路径下启用jar包

https://www.java.com/zh-CN/download/

java -jar Haploview.jar

5. 传入处理好的文件,结果1

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值