记录一下最近的数据分析过程:
假如我有一个Dataframe,有两列['Drug', 'Gene'],我想构造一个矩阵,行名为Drug,列名为Gene,值为0或者1,其中0表示药物的靶点是该基因,0表示不是靶点。
(1)导入需要的包
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import ks_2samp
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import epps_singleton_2samp
import matplotlib.pyplot as plt
import seaborn as sns
(2) 生成假数据:sub_df1和sub_df2,分别表示Drug-Gene,和Drug-Metabolites
# 设置随机种子保证可复现性
np.random.seed(42)
num_rows = 20 # 数据行数
drugs = ['Drug_A', 'Drug_B', 'Drug_C', 'Drug_D', 'Drug_E'] # 药物列表
genes = ['Gene_1', 'Gene_2', 'Gene_3', 'Gene_4', 'Gene_5', 'Gene_6'] # 基因列表
metabolites = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6'] # 代谢物列表
# 生成随机数据
data1 = {
'Drug': np.random.choice(drugs, size=num_rows), # 随机选择药物
'Gene': np.random.choice(genes, size=num_rows) # 随机选择基因
}
data2 = {
'Drug': np.random.choice(drugs, size=num_rows), # 随机选择药物
'Metabolites': np.random.choice(metabolites, size=num_rows), # 随机选择代谢物
'Z.Score': np.round(np.random.normal(loc=0, scale=1.5, size=num_rows), 2) # 生成Z分数(均值为0,标准差1.5)
}
# 创建DataFrame
sub_df1 = pd.DataFrame(data1)
sub_df2 = pd.DataFrame(data2)
(2)使用 pd.crosstab 得到矩阵 drug_gene_matrix,drug_meta_matrix
drug_gene_matrix = pd.crosstab(index=sub_df1['Drug'], columns=sub_df1['Gene']).clip(upper=1)
drug_meta_matrix = pd.crosstab(
index=sub_df2['Drug'],
columns=sub_df2['Metabolites'],
values=sub_df2['Z.Score'],
aggfunc='first' # 或 'mean'
).fillna(0)
(3)计算 Cosine Similarity
#### 计算药物与药物的余弦相似度
drugs = drug_gene_matrix.index.to_list()
cosine_sim1 = cosine_similarity(drug_gene_matrix.values)
drug_similarity_df1 = pd.DataFrame(cosine_sim1, index=drugs, columns=drugs)
drugs = drug_meta_matrix.index.to_list()
cosine_sim2 = cosine_similarity(drug_meta_matrix.values)
drug_similarity_df2 = pd.DataFrame(cosine_sim2, index=drugs, columns=drugs)
(4)找到 drug_similarity_df1 的下三角中大于0的位置,然后提取drug_similarity_df2中对应位置的值并统计零和非零的数量,还要统计drug_similarity_df2的下三角中剩下的值中零和非零的数量
thred = 0
drug_similarity_df2 = drug_similarity_df2.abs()
# Step1: 找到第一个矩阵中 > thred 且非对角线且是下三角的位置
lower_tri_mask = np.tril(np.ones_like(drug_similarity_df1, dtype=bool), k=-1) # 下三角(不包括对角线)
mask = lower_tri_mask & (drug_similarity_df1 > thred)
# Step2: 提取第二个矩阵中对应位置的值
mapped_values = drug_similarity_df2.where(mask).stack()
# (a) 统计零和非零的数量
zero_count = (mapped_values == 0).sum()
non_zero_count = len(mapped_values) - zero_count
non_zero_values = mapped_values[mapped_values != 0]
# (b) 提取第二个表中下三角的值
lower_tri_mask2 = np.tril(np.ones_like(drug_similarity_df2, dtype=bool), k=-1)
df2_non_diagonal_lower = drug_similarity_df2.where(lower_tri_mask2).stack()
# (c) 统计第二个表中下三角的零值和非零值
total_zeros = (df2_non_diagonal_lower == 0).sum()
total_non_zero = (df2_non_diagonal_lower != 0).sum()
total_non_zero_value = df2_non_diagonal_lower[df2_non_diagonal_lower != 0]
# (d) 计算比例
zero_ratio = (zero_count / total_zeros) * 100
nonzero_ratio = (non_zero_count / total_non_zero) * 100
small_nonzero_zero_ratio = (non_zero_count / zero_count) * 100 if zero_count > 0 else np.nan
large_nonzero_zero_ratio = (total_non_zero / total_zeros) * 100
# (e) 找出剩余的非零值(total_non_zero_value 中但不在 non_zero_values 中的部分)
remaining_mask = ~total_non_zero_value.index.isin(non_zero_values.index)
remaining_values = total_non_zero_value[remaining_mask]
remaining_count = len(remaining_values)
remaining_ratio = (remaining_count / total_non_zero) * 100 if total_non_zero > 0 else np.nan
# 打印结果
print("-----------------------------")
print(f'阈值: {thred}')
print(f"在第二个矩阵的下三角部分中:")
print(f"- 零值数量:{zero_count}")
print(f"- 非零值数量:{non_zero_count}\n")
print(f"- 全部零的个数: {total_zeros}")
print(f"- 非对角线且下三角的非零个数: {total_non_zero}")
print(f"- 零值比例 {zero_count}/{total_zeros}: {zero_ratio:.4f}%")
print(f"- 非零比例 {non_zero_count}/{total_non_zero}: {nonzero_ratio:.4f}%")
print(f"- 子集里面 非零值/零值的比例 {non_zero_count}/{zero_count}: {small_nonzero_zero_ratio:.4f}%")
print(f"- 全集里面去除对角线元素之后 非零值/零值的比例 {total_non_zero}/{total_zeros}: {large_nonzero_zero_ratio:.4f}%")
print("\n-----------------------------")
print("非零值的详细统计信息:")
print(non_zero_values.describe())
print("\n-----------------------------")
print("剩余非零值的详细统计信息(在第二个矩阵但不在第一个矩阵符合条件的部分):")
print(remaining_values.describe())
(5)对non_zero_values 和 remaining_values 进行统计检验
# Kolmogorov-Smirnov Test (KS检验)
statistic, ks_p_value = ks_2samp(non_zero_values, remaining_values)
print(f"KS统计量: {statistic:.4f}, p值: {ks_p_value:.4f}")
# Welch's T-Test (T检验,假设方差不齐)
statistic, tt_p_value = ttest_ind(non_zero_values, remaining_values, equal_var=False)
print(f"T检验统计量: {statistic:.4f}, p值: {tt_p_value:.4f}")
# Mann-Whitney U Test (非参数检验)
statistic, mw_p_value = mannwhitneyu(non_zero_values, remaining_values)
print(f"Mann-Whitney U统计量: {statistic:.4f}, p值: {mw_p_value:.4f}")
# Epps-Singleton Test (ES检验)
statistic, es_p_value = epps_singleton_2samp(non_zero_values, remaining_values)
print(f"E-S统计量: {statistic:.4f}, p值: {es_p_value:.4f}")
(6)对non_zero_values 和 remaining_values 绘制箱型图
# 将两组数据合并为一个DataFrame(两列)
df_combined = pd.DataFrame({
"Subset Non-zero": non_zero_values,
"Remainset Non-zero": remaining_values})
# 绘制箱线图(单图双箱)
plt.figure(figsize=(3, 3))
df_combined.plot.box(
patch_artist=True, # 填充箱体颜色
showfliers=False, # 不显示异常值
boxprops=dict(facecolor='lightblue'), # 箱体颜色
medianprops=dict(color='red'), # 中位数线颜色
vert=True # 竖向箱线图(默认)
)
plt.title("Comparison of Non-zero Values", fontsize=12)
plt.ylabel("Value")
plt.xticks(rotation=0) # 保持X轴标签水平
# plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
(7)对non_zero_values 和 remaining_values 绘制密度图
df_combined = pd.DataFrame({
"Subset Non-zero": non_zero_values,
"Allset Non-zero": remaining_values})
#### 绘制密度图
plt.figure(figsize=(8, 5)) # 设置画布大小
sns.kdeplot(
data=df_combined,
x="Subset Non-zero",
label="Subset Drug-Drug Pairs",
fill=True,
alpha=0.5,
color="#008792"
)
sns.kdeplot(
data=df_combined,
x="Allset Non-zero",
label="Remaining Drug-Drug Pairs",
fill=True,
alpha=0.5,
color="#f3715c"
)
# plt.title("Density Plot of Cosine Similarity Values", fontsize=15, weight='bold') # 标题
plt.xlabel("Cosine similarity", fontsize=15) # x轴标签
plt.ylabel("Density", fontsize=15) # y轴标签
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# 在右下角添加p值
plt.text(
0.95, 0.05, # x, y坐标(基于0-1的相对坐标,右下角是(0.95,0.05))
f"p_value = {ks_p_value:.4f}",
ha='right', va='bottom', # 右对齐、底部对齐
transform=plt.gca().transAxes, # 使用相对坐标
fontsize=12,
bbox=dict(facecolor='white', alpha=0.7, edgecolor='none') # 白底半透明框
)
plt.legend() # 显示图例
plt.tight_layout()
# plt.grid(True, linestyle="--", alpha=0.5) # 添加网格线
plt.show()
(8)对non_zero_values 和 remaining_values 绘制折线图
###### 绘制折现图
# non_zero_values = [...] # 子集非零值数组
# remaining_values = [...] # 全集非零值数组
# 创建阈值范围
thresholds = np.arange(0.1, 0.9, 0.1) # [0.0, 0.1, 0.2,..., 0.9]
# 计算各阈值下的比例
def calculate_proportions(values):
total = len(values)
return [np.sum(values > t) / total for t in thresholds]
subset_props = calculate_proportions(non_zero_values)
allset_props = calculate_proportions(remaining_values)
# 创建DataFrame
df_plot = pd.DataFrame({
'Threshold': thresholds,
'Subset Proportion': subset_props,
'Allset Proportion': allset_props
})
# 绘制折线图
plt.figure(figsize=(10, 6), dpi=100)
plt.plot(df_plot['Threshold'], df_plot['Subset Proportion'],
marker='o', label='Subset (Drug-Metab)')
plt.plot(df_plot['Threshold'], df_plot['Allset Proportion'],
marker='s', label='Allset (Background)')
# 图表美化
plt.title('Subset vs Background',
fontsize=14, pad=20)
plt.xlabel('Value Threshold', fontsize=12)
plt.ylabel('Proportion of Values > Threshold', fontsize=12)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0)) # 转换为百分比格式
plt.xticks(thresholds)
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend(fontsize=12)
# 添加数据标签
for i, t in enumerate(thresholds):
plt.text(t, subset_props[i]+0.02, f'{subset_props[i]:.1%}',
ha='center', fontsize=9)
plt.text(t, allset_props[i]-0.03, f'{allset_props[i]:.1%}',
ha='center', fontsize=9)
plt.tight_layout()
plt.show()
(9)对non_zero_values 和 remaining_values 绘制柱状图
##### 绘制柱状图
# non_zero_values = [...] # 药物-代谢物关联值(子集)
# remaining_values = [...] # 背景关联值(全集)
# 创建阈值范围
thresholds = np.array([0.1, 0.2, 0.3]) # [0.0, 0.1, 0.2,..., 0.9]
labels = ['> 0.1', '> 0.2', '> 0.3']
# 计算各阈值下的比例
def calculate_proportions(values):
total = len(values)
return [(np.sum(values > t) / total)*100 for t in thresholds]
subset_props = calculate_proportions(non_zero_values)
allset_props = calculate_proportions(remaining_values)
# 创建DataFrame
df_plot = pd.DataFrame({
'Threshold': thresholds,
'Subset': subset_props,
'Background': allset_props
}).set_index('Threshold')
# df_plot.to_excel('percentage.xlsx', index=True)
# 绘制柱状图
plt.figure(figsize=(12, 12))
bars = df_plot.plot(kind='bar', width=0.8, color=['#1f77b4', '#ff7f0e']) # 自定义颜色
x = np.arange(len(df_plot))
# 添加数值标签(百分比格式)
for container in bars.containers:
bars.bar_label(container,
fmt='%.2f', # 显示为百分比,保留1位小数
label_type='edge',
padding=2,
fontsize=12)
# plt.title('Subset vs Background', fontsize=14)
plt.xlabel('Cosine similarity', fontsize=14)
plt.ylabel('Proportion (%)', fontsize=14)
# plt.xticks(rotation=0) # 横轴标签不旋转
plt.xticks(x, labels, fontsize=12, rotation=0)
plt.yticks(fontsize=12)
plt.legend(fontsize=12) # 图例放在右侧
# plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
(10) 保存文件
# 定义输出文件路径
output_path = "Matrix.xlsx"
# 使用 ExcelWriter 写入多 Sheet
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
drug_gene_matrix.to_excel(writer, sheet_name="Drug-Gene", index=True) # Sheet1
drug_meta_matrix.to_excel(writer, sheet_name="Drug-Metabolites", index=True) # Sheet2
drug_similarity_df1.to_excel(writer, sheet_name="Drug-Gene(Similarity)", index=True) # Sheet3
drug_similarity_df2.to_excel(writer, sheet_name="Drug-Metabolites(Similarity)", index=True) # Sheet4
print(f"数据已保存到 {output_path}")