The fill_value parameter of pandas.Series.add

This article explains how pandas aligns indexes and how missing values can be filled during arithmetic. An example shows how, when two Series share only some index labels, the `add()` method with the `fill_value` parameter combines the data; positions that are NaN in both Series stay NaN. This is very handy when combining data sets.

fill_value: None or a float; fills NaN values before the computation. If an element is NaN in both Series, the result is still NaN.

One important feature of pandas is index alignment: before an operation, each Series is extended with the labels it is missing from the other, and those new positions are filled with NaN.
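For contrast, here is a minimal sketch of what alignment alone does: with plain `+` (no fill_value), any label that is missing or NaN on either side produces NaN in the result. The two Series are the same ones used in the example below.

import pandas as pd
import numpy as np

a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'])
b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e'])
a + b  # same as a.add(b) without fill_value
------
a    2.0
b    NaN
c    NaN
d    NaN
e    NaN
dtype: float64
======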

Example

a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'])
b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e'])
a.add(b, fill_value=0)
------
a    2.0
b    1.0
c    1.0
d    1.0
e    NaN
dtype: float64
======
# The filling happens in two steps. First, index alignment extends
# both Series to the union of the two indexes:
a = pd.Series([1, 1, 1, np.nan, np.nan], index=['a', 'b', 'c', 'd', 'e'])
b = pd.Series([1, np.nan, np.nan, 1, np.nan], index=['a', 'b', 'c', 'd', 'e'])
# Labels b, c, d have a value in exactly one of the two Series, so the NaN
# on the other side is filled with 0; label e is NaN in both Series, so it
# is left alone. After filling, the Series are effectively:
a = pd.Series([1, 1, 1, 0, np.nan], index=['a', 'b', 'c', 'd', 'e'])
b = pd.Series([1, 0, 0, 1, np.nan], index=['a', 'b', 'c', 'd', 'e'])
a.add(b, fill_value=0)
------
a    2.0
b    1.0
c    1.0
d    1.0
e    NaN
dtype: float64
======
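fill_value is not specific to add(): the other flexible arithmetic methods (sub(), mul(), div(), and so on) accept the same parameter. A quick sketch with mul(), using the same two Series as above and the multiplicative identity 1 as the fill value:

import pandas as pd
import numpy as np

a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'])
b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e'])
a.mul(b, fill_value=1)  # 1, not 0, is the neutral fill value for multiplication
------
a    1.0
b    1.0
c    1.0
d    1.0
e    NaN
dtype: float64
======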