import pandas as pd
import numpy as np
#创建含有缺失值的DataFrame
df=pd.DataFrame(np.random.randn(10,6))
df.iloc[:4,1]=None
df.iloc[:2,4:6]=None
df.iloc[6,3:5]=None
df.iloc[8,0:2]=Noneprint(df)0123450-0.338736 NaN -0.265401-0.335392 NaN NaN
10.735121 NaN 0.0394840.684837 NaN NaN
2-0.774130 NaN -0.4214041.4723101.207911-0.2377533-0.152982 NaN 0.6357840.476728-1.6155270.64005140.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.4213366-0.3323470.181860-0.166790 NaN NaN -0.42095070.553072-1.324093-0.361992-0.710529-0.080789-1.0347878 NaN NaN -1.206436-0.0418141.106454-0.0368219-0.7028480.1415010.4024180.8141630.2446780.072794
判断缺失值
#元素级别的判断,把对应的所有元素的位置都列出来,元素为空或者NA就显示True,否则就是False
result=df.isnull()print(result)0123450FalseTrueFalseFalseTrueTrue1FalseTrueFalseFalseTrueTrue2FalseTrueFalseFalseFalseFalse3FalseTrueFalseFalseFalseFalse4FalseFalseFalseFalseFalseFalse5FalseFalseFalseFalseFalseFalse6FalseFalseFalseTrueTrueFalse7FalseFalseFalseFalseFalseFalse8TrueTrueFalseFalseFalseFalse9FalseFalseFalseFalseFalseFalse#列级别的判断,只要该列有为空或者NA的元素,就为True,否则Flase
result=df.isnull().any()print(result)0True1True2False3True4True5True
dtype:bool#只显示存在缺失值的行列,清楚的确定缺失值的位置#用drop_duplicates()去掉重复的行
result=df[df.isnull().values==True].drop_duplicates()print(result)0123450-0.338736 NaN -0.265401-0.335392 NaN NaN
10.735121 NaN 0.0394840.684837 NaN NaN
2-0.774130 NaN -0.4214041.4723101.207911-0.2377533-0.152982 NaN 0.6357840.476728-1.6155270.6400516-0.3323470.18186-0.166790 NaN NaN -0.4209508 NaN NaN -1.206436-0.0418141.106454-0.036821#获得为空或者NA的列索引
result=df.columns[df.isnull().any()==True]print(result)
Int64Index([0,1,3,4,5], dtype='int64')#获得每列为空的数据的个数
num=df.isnull().sum()print(num)011520314352
dtype: int64
#获得每行为空的数据的个数
num=df.isnull().sum(axis=1)print(num)03132131405062708290
dtype: int64
滤除缺失值
#使用默认参数,删除行,只要有空值就会删除,不改变源数据print(df.dropna())print('源数据不变')print(df)01234540.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.42133670.553072-1.324093-0.361992-0.710529-0.080789-1.0347879-0.7028480.1415010.4024180.8141630.2446780.072794
源数据不变
0123450-0.338736 NaN -0.265401-0.335392 NaN NaN
10.735121 NaN 0.0394840.684837 NaN NaN
2-0.774130 NaN -0.4214041.4723101.207911-0.2377533-0.152982 NaN 0.6357840.476728-1.6155270.64005140.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.4213366-0.3323470.181860-0.166790 NaN NaN -0.42095070.553072-1.324093-0.361992-0.710529-0.080789-1.0347878 NaN NaN -1.206436-0.0418141.106454-0.0368219-0.7028480.1415010.4024180.8141630.2446780.072794#使用axis删除有空值的列print(df.dropna(axis=1))20-0.26540110.0394842-0.42140430.6357844-1.0385955-0.2832586-0.1667907-0.3619928-1.20643690.402418#所有值为缺失值才删除print(df.dropna(how='all'))0123450-0.338736 NaN -0.265401-0.335392 NaN NaN
10.735121 NaN 0.0394840.684837 NaN NaN
2-0.774130 NaN -0.4214041.4723101.207911-0.2377533-0.152982 NaN 0.6357840.476728-1.6155270.64005140.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.4213366-0.3323470.181860-0.166790 NaN NaN -0.42095070.553072-1.324093-0.361992-0.710529-0.080789-1.0347878 NaN NaN -1.206436-0.0418141.106454-0.0368219-0.7028480.1415010.4024180.8141630.2446780.072794#至少有四个非空值才保留print(df.dropna(thresh=4))0123452-0.774130 NaN -0.4214041.4723101.207911-0.2377533-0.152982 NaN 0.6357840.476728-1.6155270.64005140.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.4213366-0.3323470.181860-0.166790 NaN NaN -0.42095070.553072-1.324093-0.361992-0.710529-0.080789-1.0347878 NaN NaN -1.206436-0.0418141.106454-0.0368219-0.7028480.1415010.4024180.8141630.2446780.072794#删除这个subset中含有缺失值的行print(df.dropna(subset=[2,4]))0123452-0.774130 NaN -0.4214041.4723101.207911-0.2377533-0.152982 NaN 0.6357840.476728-1.6155270.64005140.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.42133670.553072-1.324093-0.361992-0.710529-0.080789-1.0347878 NaN NaN -1.206436-0.0418141.106454-0.0368219-0.7028480.1415010.4024180.8141630.2446780.072794#删除这个subset中含有缺失值的列print(df.dropna(subset=[2,4],axis=1))023450-0.338736-0.265401-0.335392 NaN NaN
10.7351210.0394840.684837 NaN NaN
2-0.774130-0.4214041.4723101.207911-0.2377533-0.1529820.6357840.476728-1.6155270.64005140.132076-1.038595-0.517508-0.497453-0.6501245-1.756492-0.283258-1.969033-0.856556-0.4213366-0.332347-0.166790 NaN NaN -0.42095070.553072-0.361992-0.710529-0.080789-1.0347878 NaN -1.206436-0.0418141.106454-0.0368219-0.7028480.4024180.8141630.2446780.072794
填充缺失值
#将缺失值替换为常量0print(df.fillna(0))0123450-0.3387360.000000-0.265401-0.3353920.0000000.00000010.7351210.0000000.0394840.6848370.0000000.0000002-0.7741300.000000-0.4214041.4723101.207911-0.2377533-0.1529820.0000000.6357840.476728-1.6155270.64005140.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.4213366-0.3323470.181860-0.1667900.0000000.000000-0.42095070.553072-1.324093-0.361992-0.710529-0.080789-1.03478780.0000000.000000-1.206436-0.0418141.106454-0.0368219-0.7028480.1415010.4024180.8141630.2446780.072794#横向用缺失值前面的值替代缺失值print(df.fillna(axis=1,method='ffill'))0123450-0.338736-0.338736-0.265401-0.335392-0.335392-0.33539210.7351210.7351210.0394840.6848370.6848370.6848372-0.774130-0.774130-0.4214041.4723101.207911-0.2377533-0.152982-0.1529820.6357840.476728-1.6155270.64005140.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.4213366-0.3323470.181860-0.166790-0.166790-0.166790-0.42095070.553072-1.324093-0.361992-0.710529-0.080789-1.0347878 NaN NaN -1.206436-0.0418141.106454-0.0368219-0.7028480.1415010.4024180.8141630.2446780.072794#纵向用缺失值上面的值替代缺失值print(df.fillna(axis=0,method='ffill'))0123450-0.338736 NaN -0.265401-0.335392 NaN NaN
10.735121 NaN 0.0394840.684837 NaN NaN
2-0.774130 NaN -0.4214041.4723101.207911-0.2377533-0.152982 NaN 0.6357840.476728-1.6155270.64005140.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.4213366-0.3323470.181860-0.166790-1.969033-0.856556-0.42095070.553072-1.324093-0.361992-0.710529-0.080789-1.03478780.553072-1.324093-1.206436-0.0418141.106454-0.0368219-0.7028480.1415010.4024180.8141630.2446780.072794#不同的列用不同的值填充
info={0:1,1:2,2:3,3:4,4:5,5:6}print(df.fillna(value=info))0123450-0.3387362.000000-0.265401-0.3353925.0000006.00000010.7351212.0000000.0394840.6848375.0000006.0000002-0.7741302.000000-0.4214041.4723101.207911-0.2377533-0.1529822.0000000.6357840.476728-1.6155270.64005140.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.4213366-0.3323470.181860-0.1667904.0000005.000000-0.42095070.553072-1.324093-0.361992-0.710529-0.080789-1.03478781.0000002.000000-1.206436-0.0418141.106454-0.0368219-0.7028480.1415010.4024180.8141630.2446780.072794#对每列出现的替换值有次数限制print(df.fillna(value=info,limit=1))0123450-0.3387362.000000-0.265401-0.3353925.0000006.00000010.735121 NaN 0.0394840.684837 NaN NaN
2-0.774130 NaN -0.4214041.4723101.207911-0.2377533-0.152982 NaN 0.6357840.476728-1.6155270.64005140.132076-0.986871-1.038595-0.517508-0.497453-0.6501245-1.7564921.740063-0.283258-1.969033-0.856556-0.4213366-0.3323470.181860-0.1667904.000000 NaN -0.42095070.553072-1.324093-0.361992-0.710529-0.080789-1.03478781.000000 NaN -1.206436-0.0418141.106454-0.0368219-0.7028480.1415010.4024180.8141630.2446780.072794
处理缺失值import pandas as pdimport numpy as np#创建含有缺失值的DataFramedf=pd.DataFrame(np.random.randn(10,6))df.iloc[:4,1]=Nonedf.iloc[:2,4:6]=Nonedf.iloc[6,3:5]=Nonedf.iloc[8,0:2]=Noneprint(df) ...