import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Configure matplotlib in one step: a font fallback list for Chinese labels,
# a plain hyphen for the minus sign, and the on-screen figure DPI.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Arial Unicode MS', 'DejaVu Sans'],
    'axes.unicode_minus': False,
    'figure.dpi': 100,
})
# Load the preprocessed dataset and print a quick overview of it.
df = pd.read_excel('嵌套数据处理.xlsx')
print(f"数据形状: {df.shape}")
print("\n数据基本信息:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
def _decile_edges_and_labels(values, prefix, suffix=''):
    """Compute decile bin edges and matching labels for a numeric column.

    Args:
        values: Column to bin; it must provide ``.quantile`` (it is called
            with DataFrame columns below).
        prefix: Text placed before the 1-based group number in each label.
        suffix: Text appended after the upper bound in each label.

    Returns:
        A ``(quantiles, edges, labels)`` tuple: the raw 0%..100% quantiles,
        the sorted unique edges derived from them, and one label per
        consecutive pair of edges.
    """
    quantiles = values.quantile([step / 10 for step in range(11)])
    # Repeated values in the data can make neighbouring deciles identical;
    # pd.cut rejects duplicated edges, so collapse them before binning.
    edges = np.unique(quantiles.values)
    labels = [
        f'{prefix}{number}: {low:.1f}-{high:.1f}{suffix}'
        for number, (low, high) in enumerate(zip(edges[:-1], edges[1:]), start=1)
    ]
    return quantiles, edges, labels


# Split BMI into decile-based groups (0%, 10%, ..., 100% quantile edges).
bmi_quantiles, bmi_bins, bmi_labels = _decile_edges_and_labels(df['孕妇BMI'], 'BMI组')
print(f"\nBMI十分位数分组边界: {bmi_quantiles.values}")
df['BMI分组'] = pd.cut(df['孕妇BMI'], bins=bmi_bins, labels=bmi_labels, include_lowest=True)

# Split gestational week at testing into decile-based groups the same way.
week_quantiles, week_bins, week_labels = _decile_edges_and_labels(df['检测孕周'], '孕周组', '周')
print(f"检测孕周十分位数分组边界: {week_quantiles.values}")
df['孕周分组'] = pd.cut(df['检测孕周'], bins=week_bins, labels=week_labels, include_lowest=True)

# Report how many rows fall into each group, in group order.
print("\nBMI分组统计:")
bmi_counts = df['BMI分组'].value_counts().sort_index()
print(bmi_counts)
print("\n孕周分组统计:")
week_counts = df['孕周分组'].value_counts().sort_index()
print(week_counts)
# Figure 1: one scatter panel per BMI group showing Y-chromosome concentration
# against gestational week, with a straight-line fit drawn over each panel.
fig1, axes1 = plt.subplots(2, 5, figsize=(20, 10))  # canvas reduced from 25x15 to 20x10
axes1 = axes1.flatten()

# Keep only the groups present in the data, ordered by the number in the label.
bmi_groups = sorted(
    df['BMI分组'].dropna().unique(),
    key=lambda label: int(label.split('组')[1].split(':')[0]),
)

# Values shared by every panel: one colour per group and common axis limits.
palette = plt.cm.Spectral_r(np.linspace(0, 1, len(bmi_groups)))
week_limits = (df['检测孕周'].min() - 0.5, df['检测孕周'].max() + 0.5)
conc_limits = (df['Y染色体浓度'].min() - 0.005, df['Y染色体浓度'].max() + 0.005)

for panel_idx, (ax, bmi_group) in enumerate(zip(axes1, bmi_groups)):
    panel_rows = df[df['BMI分组'] == bmi_group]
    if len(panel_rows) == 0:
        continue

    ax.scatter(panel_rows['检测孕周'], panel_rows['Y染色体浓度'],
               alpha=0.7, s=25, c=[palette[panel_idx]], edgecolors='none')  # smaller markers

    # A line fit and a correlation need at least two complete (x, y) pairs.
    complete_rows = panel_rows[['检测孕周', 'Y染色体浓度']].dropna()
    if len(complete_rows) > 1:
        fitted_line = np.poly1d(
            np.polyfit(complete_rows['检测孕周'], complete_rows['Y染色体浓度'], 1)
        )
        week_span = np.linspace(complete_rows['检测孕周'].min(),
                                complete_rows['检测孕周'].max(), 100)
        ax.plot(week_span, fitted_line(week_span), "red", linewidth=1.5, alpha=0.8)

        pearson_r = complete_rows['检测孕周'].corr(complete_rows['Y染色体浓度'])
        ax.text(0.05, 0.95, f'r = {pearson_r:.3f}\nn = {len(panel_rows)}',
                transform=ax.transAxes, fontsize=10,
                bbox=dict(boxstyle="round,pad=0.4", facecolor="white",
                          alpha=0.9, edgecolor='lightgray'))

    ax.set_xlabel('检测孕周', fontsize=11)
    ax.set_ylabel('Y染色体浓度', fontsize=11)
    ax.set_title(f'{bmi_group}', fontsize=12, pad=15)
    ax.grid(True, alpha=0.3, linestyle='-', linewidth=0.5)
    ax.tick_params(axis='both', which='major', labelsize=9)  # smaller tick labels
    ax.set_xlim(*week_limits)
    ax.set_ylim(*conc_limits)

# Hide any panels that have no group assigned to them.
for unused_ax in axes1[len(bmi_groups):]:
    unused_ax.set_visible(False)

plt.suptitle('不同BMI分组下 Y染色体浓度与检测孕周的关系', fontsize=14, fontweight='bold', y=0.96)
plt.subplots_adjust(
    left=0.06,
    right=0.96,
    bottom=0.12,  # bottom margin
    top=0.90,     # leave room for the suptitle
    hspace=0.5,   # vertical gap between rows
    wspace=0.35,  # horizontal gap between columns
)
plt.savefig('不同BMI十分位数分组_孕周与Y染色体浓度.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()
def _group_correlations(data, group_col, groups, labels, bins, x_col, group_type, relation):
    """Correlate ``x_col`` with Y-chromosome concentration inside each group.

    Args:
        data: DataFrame holding ``group_col``, ``x_col`` and 'Y染色体浓度'.
        group_col: Name of the column containing the group labels.
        groups: Group labels to evaluate, in output order.
        labels: Full ordered label list that was used for binning; a label's
            position selects its lower/upper edge in ``bins``.
        bins: Bin edges aligned with ``labels`` (one more edge than labels).
        x_col: Column correlated against 'Y染色体浓度'.
        group_type: Value stored in the '分组类型' field of each row.
        relation: Value stored in the '变量关系' field of each row.

    Returns:
        A list with one dict per group that has at least two complete
        (x, y) pairs; groups with fewer pairs are skipped.
    """
    rows = []
    for group in groups:
        members = data[data[group_col] == group]
        pairs = members[[x_col, 'Y染色体浓度']].dropna()
        if len(pairs) < 2:
            continue
        # Look the label up in the full label list, not in the list of
        # non-empty groups: if some bin is empty the two orderings diverge
        # and the reported number and bounds would belong to another bin.
        position = labels.index(group)
        rows.append({
            '分组类型': group_type,
            '分组名称': group,
            '分组编号': f'组{position + 1}',
            '样本数量': len(members),
            '相关系数': pairs[x_col].corr(pairs['Y染色体浓度']),
            '变量关系': relation,
            '分组下界': bins[position],
            '分组上界': bins[position + 1],
        })
    return rows


# week_groups is read below and in the final summary but is not assigned
# anywhere earlier in this script; build it here as the gestational-week
# labels that actually occur in the data, kept in label order.
present_week_labels = set(df['孕周分组'].dropna().unique())
week_groups = [label for label in week_labels if label in present_week_labels]

# Within BMI groups: gestational week vs Y-chromosome concentration.
correlation_results = _group_correlations(
    df, 'BMI分组', bmi_groups, bmi_labels, bmi_bins,
    '检测孕周', 'BMI分组', '孕周 vs Y染色体浓度',
)
# Within gestational-week groups: BMI vs Y-chromosome concentration.
correlation_results += _group_correlations(
    df, '孕周分组', week_groups, week_labels, week_bins,
    '孕妇BMI', '孕周分组', 'BMI vs Y染色体浓度',
)

# Fix the column list so the frame keeps its schema even when no group
# produced a row (an empty list alone would give a frame with no columns).
correlation_df = pd.DataFrame(
    correlation_results,
    columns=['分组类型', '分组名称', '分组编号', '样本数量',
             '相关系数', '变量关系', '分组下界', '分组上界'],
)
print("\n=== 各分组相关系数 ===")
print(correlation_df.round(3))
# Persist the table next to the figures.
correlation_df.to_excel('十分位数分组相关性分析结果.xlsx', index=False)
def _plot_correlation_bars(ax, corr_rows, cmap, xlabel, title):
    """Draw one bar per row of ``corr_rows`` with its coefficient printed on it.

    Args:
        ax: Matplotlib axes to draw on.
        corr_rows: Rows of the correlation table for one grouping type; the
            '相关系数' and '分组编号' columns are read.
        cmap: Colormap callable used to colour the bars from 0 to 1.
        xlabel: X-axis label.
        title: Panel title.
    """
    positions = list(range(len(corr_rows)))
    coefficients = corr_rows['相关系数']
    bars = ax.bar(positions, coefficients,
                  color=cmap(np.linspace(0, 1, len(corr_rows))),
                  alpha=0.8, edgecolor='gray', linewidth=0.5)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('相关系数', fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.set_xticks(positions)
    # Label each bar with the group number stored in the table; numbering the
    # bars by position would mislabel them whenever a group was skipped.
    ax.set_xticklabels(corr_rows['分组编号'].tolist(), rotation=45, fontsize=11)
    ax.grid(True, alpha=0.3, axis='y')
    # Print the value just beyond the end of each bar (above positive bars,
    # below negative ones).
    for bar, corr in zip(bars, coefficients):
        height = bar.get_height()
        points_up = height >= 0
        ax.text(bar.get_x() + bar.get_width() / 2,
                height + 0.01 * (1 if points_up else -1),
                f'{corr:.3f}', ha='center',
                va='bottom' if points_up else 'top',
                fontsize=10, fontweight='bold')


# Bar charts of the per-group correlation coefficients (two panels side by side).
fig2, (ax_bmi, ax_week) = plt.subplots(1, 2, figsize=(15, 8))

# Left panel: correlation within each BMI group.
bmi_corr = correlation_df[correlation_df['分组类型'] == 'BMI分组']
_plot_correlation_bars(ax_bmi, bmi_corr, plt.cm.Spectral_r,
                       'BMI分组', 'BMI分组: 孕周 vs Y染色体浓度')

# Right panel: correlation within each gestational-week group.
week_corr = correlation_df[correlation_df['分组类型'] == '孕周分组']
_plot_correlation_bars(ax_week, week_corr, plt.cm.magma,
                       '孕周分组', '孕周分组: BMI vs Y染色体浓度')

fig2.tight_layout()
fig2.savefig('十分位数分组相关系数可视化.png', dpi=300, bbox_inches='tight')
plt.show()
# Final summary: completion notice, total row count and number of groups.
print(
    "\n分析完成!所有图形和结果已保存。",
    f"总样本量: {len(df)}",
    f"BMI分组数量: {len(bmi_groups)}",
    f"孕周分组数量: {len(week_groups)}",
    sep="\n",
)
# Note pasted with the script (translated): "It raised an error, please fix it."