# Sample DataFrame used by the examples that follow: 4 rows, 6 columns
# holding integer, string, float and boolean values.
df = pd.DataFrame(
    {
        'id': [1, 2, 3, 4],
        'age': [20, 26, 27, 28],
        'sex': ['female', 'female', 'male', 'male'],
        'group': ['ctrl', 'treat1', 'treat2', 'treat3'],
        'UA': [350.1, 360.2, 370.3, 380.4],
        'T2DM': [False, True, True, True],
    }
)
# df示例如下:
# id age sex group UA T2DM
# 0 1 20 female ctrl 350.1 False
# 1 2 26 female treat1 360.2 True
# 2 3 27 male treat2 370.3 True
# 3 4 28 male treat3 380.4 True
选择前几行或后几行:
"""前几行"""
df.head(2) # 方法1
df[:2] # 方法2
df[0:2] # 方法3
# 以上三种方法的输出一样:
# id age sex group UA T2DM
# 0 1 20 female ctrl 350.1 False
# 1 2 26 female treat1 360.2 True
"""后几行"""
df.tail(2)
# 输出:
# id age sex group UA T2DM
# 2 3 27 male treat2 370.3 True
# 3 4 28 male treat3 380.4 True
根据单列筛选行:
1、连续变量
"""与1个数值比较"""
df[df['age']>27]
# 输出:
# id age sex group UA T2DM
# 3 4 28 male treat3 380.4 True
df[ df['age'] < df['age'].mean() ]
# 输出:
# id age sex group UA T2DM
# 0 1 20 female ctrl 350.1 False
"""与2个数值比较"""
df[df.apply(lambda x: 20<= x['age'] <27, axis=1)]
df[(df['age']>=20) & (df['age']<27)]
# 两种方法的输出一样;前者写法更简洁,但后者是向量化运算,数据量较大时速度明显更快:
# id age sex group UA T2DM
# 0 1 20 female ctrl 350.1 False
# 1 2 26 female treat1 360.2 True
2、分类变量
"""精确匹配单个元素"""
df[df['sex'] == 'male']
# 输出:
# id age sex group UA T2DM
# 2 3 27 male treat2 370.3 True
# 3 4 28 male treat3 380.4 True
"""精确匹配多个元素"""
df[df['group'].isin(['ctrl', 'treat1'])]
# 输出:
# id age sex group UA T2DM
# 0 1 20 female ctrl 350.1 False
# 1 2 26 female treat1 360.2 True
"""模糊匹配"""
df[df['group'].str.contains('treat', regex=True)]
# 输出:
# id age sex group UA T2DM
# 1 2 26 female treat1 360.2 True
# 2 3 27 male treat2 370.3 True
# 3 4 28 male treat3 380.4 True
根据多列筛选行:
# Keep the rows that satisfy both column conditions. The enclosing
# brackets already let the expression span several lines, so no
# backslash continuation is required.
df[
    (df['age'] >= 26)
    & df['group'].isin(['ctrl', 'treat3'])
]
# 输出:
# id age sex group UA T2DM
# 3 4 28 male treat3 380.4 True
# Row-wise alternative: with axis=1 the lambda receives one row at a
# time and returns True for the rows to keep.
df[
    df.apply(
        lambda row: row['UA'] > 360 and row['sex'] == 'female',
        axis=1,
    )
]
# 输出:
# id age sex group UA T2DM
# 1 2 26 female treat1 360.2 True