群体结构PCA分析+LD分析

zky___

已于 2024-09-25 10:22:51 修改

阅读量98

点赞数 2

文章标签： python

于 2024-09-21 13:17:16 首次发布

本文链接：https://blog.csdn.net/m0_69569041/article/details/142415274

版权

1.PCA——使用plink（linux）

如果需要加入参考基因组的绘制

需要修改vcf文件

import os

def modify_vcf(input_vcf, output_vcf):
    with open(input_vcf, 'r') as infile, open(output_vcf, 'w') as outfile:
        for line in infile:
            if line.startswith('#'):
                # Write header lines as is, including the added ALT_SAMPLE
                if line.startswith('#CHROM'):
                    outfile.write(line.strip() + '\tREF_SAMPLE\n')
                else:
                    outfile.write(line)
            else:
                fields = line.strip().split('\t')
                chrom, pos, id, ref, alt = fields[:5]
                format_field = fields[8]

                # Create ALT genotype
                alt_genotype = '0/0'  # Homozygous for ALT allele

                # Write the modified line
                outfile.write(line.strip() + f'\t{alt_genotype}\n')

def main():
    # Set input and output paths using Windows path format
    input_vcf = r"E:\pyProjects\zc\2024年8月23日\ZC-wanglin分析\00.filter\clean.vcf"
    output_dir = r"E:\pyProjects\zc\ZC_pca\pca2024_9_6"
    modified_vcf = os.path.join(output_dir, "modified_clean.vcf")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Modify VCF file
    print("Modifying VCF file...")
    modify_vcf(input_vcf, modified_vcf)
    print("VCF modification complete. Output file:", modified_vcf)


if __name__ == "__main__":
    main()

使用plink进行pca分析

PLINK v1.90b7.4 64-bit (18 Aug 2024)
Options in effect:
  --allow-extra-chr
  --out /mnt/e/pyProjects/zc/ZC_pca/pca2024_9_6/plink_pca
  --pca 10
  --set-missing-var-ids @:#
  --vcf /mnt/e/pyProjects/zc/2024年8月23日/ZC_pca/pca2024_9_6/modified_clean.vcf
  --vcf-half-call missing

Hostname: zky
Working directory: /mnt/c/Users/Administrator
Start time: Fri Sep  6 20:59:12 2024

要使用的生成文件plink_pca.eigenval E:\pyProjects\zc\ZC_pca\pca2024_9_6\plink_pca.eigenvec

准备map文件

3.PCA

import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import numpy as np
import pandas as pd
from matplotlib.patches import Ellipse


def confidence_ellipse(x, y, ax, n_std=2.0, facecolor='none', **kwargs):
    if x.size != y.size:
        raise ValueError("x 和 y 必须具有相同的大小")
    cov = np.cov(x, y)
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2,
                      facecolor=facecolor, **kwargs)
    scale_x = np.sqrt(cov[0, 0]) * n_std
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_x, mean_y = np.mean(x), np.mean(y)
    transf = transforms.Affine2D().rotate_deg(45).scale(scale_x, scale_y).translate(mean_x, mean_y)
    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)


# 读取数据
eigenvalues = np.loadtxt('E:/pyProjects/zc/ZC_pca/pca2024_9_6/plink_pca_output.eigenval')
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)

data = pd.read_csv('E:/pyProjects/zc/ZC_pca/pca2024_9_6/plink_pca_output.eigenvec', sep=r'\s+', header=None)
data.columns = ['FID', 'IID'] + [f'PC{i}' for i in range(1, len(data.columns) - 1)]

mapping = pd.read_csv('map.csv', header=None, names=['IID', 'Name', 'Group'])
merged_data = pd.merge(data, mapping, on='IID', how='left')

# 提取前两个主成分
X_pca = merged_data[['PC1', 'PC2']].values

# 颜色映射
color_map = {'loc1': ('red', 'Sichuan local wheat'),
             'loc2': ('blue', 'Sichuan bred varieties'),
             'loc3': ('green', 'varieties from other province')}
colors = [color_map[group][0] for group in merged_data['Group']]

# 绘图
fig, ax = plt.subplots(figsize=(12, 10))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=colors, alpha=0.6)
ax.set_xlabel(f'PC1 ({explained_variance_ratio[0]:.2%})')
ax.set_ylabel(f'PC2 ({explained_variance_ratio[1]:.2%})')
plt.title('PCA Projection of Wheat Varieties with Genetic Subgroup Ellipses')

for group, (color, label) in color_map.items():
    group_data = X_pca[merged_data['Group'] == group]
    if len(group_data) > 1:
        confidence_ellipse(group_data[:, 0], group_data[:, 1], ax, n_std=2.0,
                           edgecolor=color, linestyle='--', linewidth=2)

for color, label in color_map.values():
    ax.scatter([], [], c=color, label=label)

# 标注特定点（中国春）
chinese_spring = data[data['IID'] == 'BC-REF']
if not chinese_spring.empty:
    cs_x, cs_y = chinese_spring['PC1'].values[0], chinese_spring['PC2'].values[0]
    ax.scatter(cs_x, cs_y, c='yellow', s=100, edgecolor='black', linewidth=2, zorder=5)
    ax.annotate('ZGC2.1', (cs_x, cs_y), xytext=(5, 5), textcoords='offset points',
                fontsize=12, fontweight='bold', color='black',
                bbox=dict(facecolor='white', edgecolor='none', alpha=0.7),
                arrowprops=dict(arrowstyle="->", color='black'))

ax.legend()

plt.savefig('E:/pyProjects/zc/ZC_pca/pca2024_9_6/pca_2d_visualization_with_ellipses_and_chinese_spring.png', dpi=600,
            bbox_inches='tight')
plt.close()

print("PCA 可视化完成。生成了包含中国春标注的 PCA 可视化图。")

4.PCA（标准化）

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.transforms as transforms
from matplotlib.patches import Ellipse


def confidence_ellipse(x, y, ax, n_std=2.0, facecolor='none', **kwargs):
    """
    Create a plot of the covariance confidence ellipse of `x` and `y`
    """
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])

    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0),
                      width=ell_radius_x * 2,
                      height=ell_radius_y * 2,
                      facecolor=facecolor,
                      **kwargs)

    scale_x = np.sqrt(cov[0, 0]) * n_std
    scale_y = np.sqrt(cov[1, 1]) * n_std

    mean_x = np.mean(x)
    mean_y = np.mean(y)

    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)


# 读取eigenval文件并计算解释方差比例
eigenvalues = np.loadtxt('plink_pca.eigenval')
total_variance = np.sum(eigenvalues)
explained_variance_ratio = eigenvalues / total_variance

# 读取PCA结果
data = pd.read_csv('plink_pca.eigenvec', sep='\s+', header=None)
data.columns = ['FID', 'IID'] + [f'PC{i}' for i in range(1, len(data.columns) - 1)]

# 读取映射文件
mapping = pd.read_csv('map.csv', header=None, names=['IID', 'Name', 'Group'])

# 合并PCA数据和映射数据
merged_data = pd.merge(data, mapping, on='IID', how='left')

# 提取所有主成分
X = merged_data[[f'PC{i}' for i in range(1, len(data.columns) - 1)]].values

# 标准化数据
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 进行PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# 创建颜色映射
color_map = {'loc1': ('red', 'Sichuan local wheat'),
             'loc2': ('blue', 'Sichuan bred varieties'),
             'loc3': ('green', 'varieties from other province')}

colors = [color_map[group][0] for group in merged_data['Group']]

# 绘制2D投影图（PC1 vs PC2）
fig, ax = plt.subplots(figsize=(12, 10))

# 计算PC1和PC2的最小值和最大值
x_min, x_max = X_pca[:, 0].min(), X_pca[:, 0].max()
y_min, y_max = X_pca[:, 1].min(), X_pca[:, 1].max()

# 稍微扩大范围，确保所有点和椭圆都能显示
x_range = x_max - x_min
y_range = y_max - y_min
x_min -= 0.1 * x_range
x_max += 0.1 * x_range
y_min -= 0.1 * y_range
y_max += 0.1 * y_range

scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=colors, alpha=0.6)
ax.set_xlabel(f'PC1 ({explained_variance_ratio[0]:.2%})')
ax.set_ylabel(f'PC2 ({explained_variance_ratio[1]:.2%})')
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
plt.title('Standardized PCA Plot of Wheat Genetic Diversity with Equal Axial Scaling')

# 为每个组添加置信椭圆
for group, (color, label) in color_map.items():
    group_data = X_pca[merged_data['Group'] == group]
    if len(group_data) > 1:  # 确保有足够的点来计算椭圆
        confidence_ellipse(group_data[:, 0], group_data[:, 1], ax, n_std=2.0,
                           edgecolor=color, linestyle='--', linewidth=2)

# 添加图例
for (color, label) in color_map.values():
    ax.scatter([], [], c=color, label=label)
ax.legend()

plt.savefig('pca_2d_visualization_real_pca_with_ellipses.png', dpi=300, bbox_inches='tight')
plt.close()

print("PCA可视化完成。生成了带有颜色映射和真实PCA置信椭圆的2D投影图。")

5.PCA条形图

import matplotlib.pyplot as plt
import numpy as np

# 读取eigenvalues
eigenvalues = np.loadtxt('plink_pca.eigenval')

# 计算解释方差比例
total_variance = np.sum(eigenvalues)
explained_variance_ratio = eigenvalues / total_variance

# 计算累积解释方差
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

# 创建图表
fig, ax = plt.subplots(figsize=(10, 6))

# 绘制条形图
ax.bar(range(1, len(eigenvalues) + 1), explained_variance_ratio, alpha=0.6, color='b',
       label='Individual explained variance')

# 绘制累积方差线
ax.plot(range(1, len(eigenvalues) + 1), explained_variance_ratio, color='r', marker='o', linestyle='-', linewidth=2,
        markersize=4, label='Explained variance')

# 设置y轴
ax.set_ylabel('Proportion of variance explained')
ax.set_xlabel('Principal components')

# 设置刻度
ax.set_xticks(range(1, len(eigenvalues) + 1))
ax.set_xticklabels(range(1, len(eigenvalues) + 1))

# 添加图例
ax.legend(loc='upper right')

# 设置标题
plt.title('Scree Plot of PCA')

# 调整布局并保存图片
plt.tight_layout()
plt.savefig('pca_scree_plot.png', dpi=300, bbox_inches='tight')

print("Scree plot has been saved as 'pca_scree_plot.png'")