单倍型Haploview分析

zky___

已于 2024-12-09 11:47:32 修改

阅读量470

点赞数 3

文章标签： java 服务器前端

于 2024-09-24 15:26:14 首次发布

本文链接：https://blog.csdn.net/m0_69569041/article/details/142482950

版权

0.关注SNP的筛选

import pandas as pd


def extract_snps_from_vcf(input_file, output_file, target_snp, interval):
    # Read the VCF file
    df = pd.read_csv(input_file, sep='\t', comment='#', header=None,
                     names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] + [f'SAMPLE_{i}' for
                                                                                                       i in
                                                                                                       range(1000)])

    # Find the target SNP position and chromosome
    target_row = df[df['ID'] == target_snp]
    if target_row.empty:
        print(f"Target SNP {target_snp} not found in the file.")
        return
    target_position = target_row['POS'].values[0]
    target_chrom = target_row['CHROM'].values[0]

    # Calculate the interval range (910K * 2)
    interval *= 4
    start_position = max(0, target_position - interval)
    end_position = target_position + interval

    # Extract SNPs within the interval and matching the chromosome
    result = df[(df['CHROM'] == target_chrom) & (df['POS'] >= start_position) & (df['POS'] <= end_position)]

    # Read the original file to get the header
    with open(input_file, 'r') as f:
        header = []
        for line in f:
            if line.startswith('#'):
                header.append(line)
            else:
                break

    # Write the header and the filtered data to the new VCF file
    with open(output_file, 'w') as f:
        f.writelines(header)

    result.to_csv(output_file, sep='\t', index=False, header=False, mode='a')

    print(
        f"Extracted {len(result)} SNPs within the {interval} bp interval of {target_snp} on chromosome {target_chrom}.")


# Usage
input_file = 'clean.vcf'
output_file = 'extracted_snps.vcf'
target_snp = '1A_360259436'
interval = 910000  # 910K bp, which will be doubled in the function

extract_snps_from_vcf(input_file, output_file, target_snp, interval)

自己筛选出vcf中所关注的连锁区块位点

1.路径下vcf文件转换——>使用plink

 plink --vcf vcf_block.vcf --recode --out output_file --allow-extra-chr

2.数据处理（1）

ped中的表型数据默认为-9，需要改为0。这个是必须的，否则就报错

# ped中的表型数据默认为-9，需要改为0。这个是必须的，否则就报错
f1 = open("output_file.ped", "r").read()
f2 = open("zc.ped", "w")
f2.write(f1.replace("-9", "0").replace("BC-", "BC_"))  # 替换掉文件中会出现异常的下划线和表型数据默认的-9
f2.close()

3.数据处理（2）

筛选关注SNP的位点并处理map文件（只取2、4)两列

# 将map的第二列和第四列提取出来，保存为a1.info文件。
import pandas as pd

f_map = pd.read_csv("output_file.map", sep="\t")
f_map.iloc[:, [1, 3]].to_csv("zc2.map", index=False, sep="\t")