0.关注SNP的筛选
import pandas as pd
def extract_snps_from_vcf(input_file, output_file, target_snp, interval):
# Read the VCF file
df = pd.read_csv(input_file, sep='\t', comment='#', header=None,
names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] + [f'SAMPLE_{i}' for
i in
range(1000)])
# Find the target SNP position and chromosome
target_row = df[df['ID'] == target_snp]
if target_row.empty:
print(f"Target SNP {target_snp} not found in the file.")
return
target_position = target_row['POS'].values[0]
target_chrom = target_row['CHROM'].values[0]
# Calculate the interval range (910K * 2)
interval *= 4
start_position = max(0, target_position - interval)
end_position = target_position + interval
# Extract SNPs within the interval and matching the chromosome
result = df[(df['CHROM'] == target_chrom) & (df['POS'] >= start_position) & (df['POS'] <= end_position)]
# Read the original file to get the header
with open(input_file, 'r') as f:
header = []
for line in f:
if line.startswith('#'):
header.append(line)
else:
break
# Write the header and the filtered data to the new VCF file
with open(output_file, 'w') as f:
f.writelines(header)
result.to_csv(output_file, sep='\t', index=False, header=False, mode='a')
print(
f"Extracted {len(result)} SNPs within the {interval} bp interval of {target_snp} on chromosome {target_chrom}.")
# Usage
input_file = 'clean.vcf'
output_file = 'extracted_snps.vcf'
target_snp = '1A_360259436'
interval = 910000 # 910K bp, which will be doubled in the function
extract_snps_from_vcf(input_file, output_file, target_snp, interval)
自己筛选出vcf中所关注的连锁区块位点
1.路径下vcf文件转换——>使用plink
plink --vcf vcf_block.vcf --recode --out output_file --allow-extra-chr
2.数据处理(1)
ped中的表型数据默认为-9,需要改为0。这个是必须的,否则就报错
# ped中的表型数据默认为-9,需要改为0。这个是必须的,否则就报错
f1 = open("output_file.ped", "r").read()
f2 = open("zc.ped", "w")
f2.write(f1.replace("-9", "0").replace("BC-", "BC_")) # 替换掉文件中会出现异常的下划线和表型数据默认的-9
f2.close()
3.数据处理(2)
筛选关注SNP的位点并处理map文件(只取2、4)两列
# 将map的第二列和第四列提取出来,保存为a1.info文件。
import pandas as pd
f_map = pd.read_csv("output_file.map", sep="\t")
f_map.iloc[:, [1, 3]].to_csv("zc2.map", index=False, sep="\t")
4.jar包路径下启用jar包
https://www.java.com/zh-CN/download/
java -jar Haploview.jar
5. 传入处理好的文件,结果1