2-3 处理缺失数据

2.3 处理缺失数据

与本节相关的视频课程:处理缺失数据

检查缺失数据

基础知识

def foo(): pass
f = foo()
print(f)
None
type(f)
NoneType
None + 2
TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'
import numpy as np
np.nan + 2
nan
type(np.nan)
float
import pandas as pd
s = pd.Series([1, 2, None, np.nan])    # ①
s
0    1.0
1    2.0
2    NaN
3    NaN
dtype: float64
s.sum()
3.0
s.isna()
0    False
1    False
2     True
3     True
dtype: bool
df = pd.DataFrame({"one":[1, 2, np.nan], "two":[np.nan, 3, 4]})
df.isna()
onetwo
0FalseTrue
1FalseFalse
2TrueFalse

项目案例

hitters = pd.read_csv("/home/aistudio/data/data20507/Hitters.csv")
hitters.isna().any()
AtBat        False
Hits         False
HmRun        False
Runs         False
RBI          False
Walks        False
Years        False
CAtBat       False
CHits        False
CHmRun       False
CRuns        False
CRBI         False
CWalks       False
League       False
Division     False
PutOuts      False
Assists      False
Errors       False
Salary        True
NewLeague    False
dtype: bool
(hitters.shape[0] - hitters.count()) / hitters.shape[0]
AtBat        0.00000
Hits         0.00000
HmRun        0.00000
Runs         0.00000
RBI          0.00000
Walks        0.00000
Years        0.00000
CAtBat       0.00000
CHits        0.00000
CHmRun       0.00000
CRuns        0.00000
CRBI         0.00000
CWalks       0.00000
League       0.00000
Division     0.00000
PutOuts      0.00000
Assists      0.00000
Errors       0.00000
Salary       0.18323
NewLeague    0.00000
dtype: float64
hitters.shape
(322, 20)
hitters.count()
AtBat        322
Hits         322
HmRun        322
Runs         322
RBI          322
Walks        322
Years        322
CAtBat       322
CHits        322
CHmRun       322
CRuns        322
CRBI         322
CWalks       322
League       322
Division     322
PutOuts      322
Assists      322
Errors       322
Salary       263
NewLeague    322
dtype: int64
(df.shape[1] - df.T.count()) / df.shape[1] 
0    0.5
1    0.0
2    0.5
dtype: float64
df.dropna()
onetwo
12.03.0
df = pd.concat([df, pd.DataFrame({"one": [np.nan], "two": [np.nan], "three": [np.nan]})], 
               ignore_index=True, sort=False)    # 重新构建一个含有缺失值的DataFrame对象
df
onetwothree
01.0NaNNaN
12.03.0NaN
2NaN4.0NaN
3NaNNaNNaN
df.dropna(axis=0, how='all')    # how声明删除条件
onetwothree
01.0NaNNaN
12.03.0NaN
2NaN4.0NaN
df.dropna(thresh=2)    # 非缺失值小于2的删除
onetwothree
12.03.0NaN
new_hitters = hitters.dropna()
new_hitters.isna().any()
AtBat        False
Hits         False
HmRun        False
Runs         False
RBI          False
Walks        False
Years        False
CAtBat       False
CHits        False
CHmRun       False
CRuns        False
CRBI         False
CWalks       False
League       False
Division     False
PutOuts      False
Assists      False
Errors       False
Salary       False
NewLeague    False
dtype: bool

动手练习

eles = pd.read_csv("/home/aistudio/data/data20507/elements.csv")
eles.isna().any()
atomic number               False
symbol                      False
name                        False
atomic mass                 False
CPK                         False
electronic configuration    False
electronegativity            True
atomic radius                True
ion radius                   True
van der Waals radius         True
IE-1                         True
EA                           True
standard state               True
bonding type                 True
melting point                True
boiling point                True
density                      True
metal                       False
year discovered             False
group                       False
period                      False
dtype: bool
(eles.shape[0] - eles.count()) / eles.shape[0]
atomic number               0.000000
symbol                      0.000000
name                        0.000000
atomic mass                 0.000000
CPK                         0.000000
electronic configuration    0.000000
electronegativity           0.177966
atomic radius               0.398305
ion radius                  0.220339
van der Waals radius        0.677966
IE-1                        0.135593
EA                          0.279661
standard state              0.161017
bonding type                0.169492
melting point               0.144068
boiling point               0.203390
density                     0.186441
metal                       0.000000
year discovered             0.000000
group                       0.000000
period                      0.000000
dtype: float64
eles_nona = eles.dropna()
eles_nona.isna().any()
atomic number               False
symbol                      False
name                        False
atomic mass                 False
CPK                         False
electronic configuration    False
electronegativity           False
atomic radius               False
ion radius                  False
van der Waals radius        False
IE-1                        False
EA                          False
standard state              False
bonding type                False
melting point               False
boiling point               False
density                     False
metal                       False
year discovered             False
group                       False
period                      False
dtype: bool

2.3.2 用指定值填补缺失数据

基础知识

df = pd.DataFrame({"one":[10, 11, 12], 'two':[np.nan, 21, 22], "three":[30, np.nan, 33]})
df
onetwothree
010NaN30.0
11121.0NaN
21222.033.0
df = pd.DataFrame({'ColA':[1, np.nan, np.nan, 4, 5, 6, 7], 'ColB':[1, 1, 1, 1, 2, 2, 2]})
df['ColA'].fillna(method='ffill')
0    1.0
1    1.0
2    1.0
3    4.0
4    5.0
5    6.0
6    7.0
Name: ColA, dtype: float64
df['ColA'].fillna(method='bfill')
0    1.0
1    4.0
2    4.0
3    4.0
4    5.0
5    6.0
6    7.0
Name: ColA, dtype: float64

项目案例

persons = pd.read_csv("/home/aistudio/data/data20507/Person.csv")    # 为了适应平台要求,数据的名称与教材中的稍有差异
pdf = persons.sample(20)    # ①
pdf['Height-na'] = np.where(pdf['Height'] % 5 == 0, np.nan, pdf['Height'])    # ②
pdf
GenderHeightWeightIndexHeight-na
64Male1751355NaN
225Female1551445NaN
484Female1881154188.0
293Female165834NaN
102Male1611555161.0
282Female147945147.0
139Male1591245159.0
66Female172964172.0
365Male141805141.0
397Male1691365169.0
18Male1441455144.0
172Male1671515167.0
443Male1521465152.0
358Female180581NaN
447Female1761214176.0
251Male1401435NaN
360Female193611193.0
346Female191682191.0
5Male1891043189.0
294Female1681435168.0
pdf['Height-na'].fillna(pdf['Height-na'].mean(), inplace=True)   
pdf
GenderHeightWeightIndexHeight-na
64Male1751355167.8
225Female1551445167.8
484Female1881154188.0
293Female165834167.8
102Male1611555161.0
282Female147945147.0
139Male1591245159.0
66Female172964172.0
365Male141805141.0
397Male1691365169.0
18Male1441455144.0
172Male1671515167.0
443Male1521465152.0
358Female180581167.8
447Female1761214176.0
251Male1401435167.8
360Female193611193.0
346Female191682191.0
5Male1891043189.0
294Female1681435168.0
pdf['Height'].describe()
count     20.000000
mean     166.600000
std       16.740748
min      140.000000
25%      154.250000
50%      167.500000
75%      177.000000
max      193.000000
Name: Height, dtype: float64
pdf['Height-na'].describe()
count     20.000000
mean     167.800000
std       14.882699
min      141.000000
25%      160.500000
50%      167.800000
75%      173.000000
max      193.000000
Name: Height-na, dtype: float64

扩展研究

pdf2 = persons.sample(20)
pdf2['Height-na'] = np.where(pdf2['Height'] % 5 == 0, np.nan, pdf2['Height'])    # 制造缺失值

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')    # ③
col_values = imp_mean.fit_transform(pdf2['Height-na'].values.reshape((-1, 1)))    # ④
col_values
array([[169.        ],
       [188.        ],
       [178.33333333],
       [178.33333333],
       [166.        ],
       [193.        ],
       [178.33333333],
       [178.        ],
       [142.        ],
       [178.33333333],
       [197.        ],
       [178.33333333],
       [186.        ],
       [171.        ],
       [178.        ],
       [176.        ],
       [183.        ],
       [184.        ],
       [188.        ],
       [176.        ]])
df = pd.DataFrame({"name": ["Google", "Huawei", "Facebook", "Alibaba"], 
                   "price": [100, -1, -1, 90]
                  })
df
nameprice
0Google100
1Huawei-1
2Facebook-1
3Alibaba90
imp = SimpleImputer(missing_values=-1, strategy='constant', fill_value=110)    # ⑤
imp.fit_transform(df['price'].values.reshape((-1, 1)))
array([[100],
       [110],
       [110],
       [ 90]])

2.3.3 根据规律填补缺失值

df = pd.DataFrame({"one":np.random.randint(1, 100, 10), 
                   "two": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
                  "three":[5, 9, 13, np.nan, 21, np.nan, 29, 33, 37, 41]})
df
onetwothree
03825.0
16949.0
286613.0
3798NaN
4731021.0
59012NaN
6771429.0
7311633.0
8661837.0
9222041.0
from sklearn.linear_model import LinearRegression    # ⑥

df_train = df.dropna()    #训练集
df_test = df[df['three'].isnull()]    #测试集

regr = LinearRegression()
regr.fit(df_train['two'].values.reshape(-1, 1), df_train['three'].values.reshape(-1, 1))    # ⑦
df_three_pred = regr.predict(df_test['two'].values.reshape(-1, 1))    # ⑧

# 将所得数值填补到原数据集中
df.loc[(df.three.isnull()), 'three'] = df_three_pred
df
onetwothree
03825.0
16949.0
286613.0
379817.0
4731021.0
5901225.0
6771429.0
7311633.0
8661837.0
9222041.0

项目案例

import pandas as pd

train_data = pd.read_csv("/home/aistudio/data/data20507/train.csv")
train_data.info()    # ⑨
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
train_data.isna().any()
PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool
df = train_data[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]    #可能跟年龄有关的特征
known_age = df[df['Age'].notnull()].values
unknown_age = df[df['Age'].isnull()].values

y = known_age[:, 0]
X = known_age[:, 1:]

from sklearn.ensemble import RandomForestRegressor    # ⑩
rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)    # ○11
rfr.fit(X, y)    

pred_age = rfr.predict(unknown_age[:, 1:])    # ○13
pred_age.mean()
29.438010170664793
train_data.loc[(train_data.Age.isnull()), 'Age'] = pred_age
train_data.isna().any()
PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool
!mkdir /home/aistudio/external-libraries
!pip install seaborn -t /home/aistudio/external-libraries
import sys
sys.path.append('/home/aistudio/external-libraries')
%matplotlib inline
import seaborn as sns
sns.distplot(y)

png

sns.distplot(train_data['Age'])

png

df_mean = df['Age'].fillna(df['Age'].mean())
sns.distplot(df_mean)

png

扩展研究

!pip install missingpy -t /home/aistudio/external-libraries
from sklearn.datasets import load_iris    # 引入鸢尾花数据集
import numpy as np

iris = load_iris()
X = iris.data
# 制造含有缺失值的数据集
rng = np.random.RandomState(0)
X_missing = X.copy()
mask = np.abs(X[:, 2] - rng.normal(loc=5.5, scale=0.7, size=X.shape[0])) < 0.6
X_missing[mask, 3] = np.nan    # X_missing是包含了缺失值的数据集

from missingpy import KNNImputer    # 引入KNN填充缺失值的模型
imputer = KNNImputer(n_neighbors=3, weights="uniform")
X_imputed = imputer.fit_transform(X_missing)
/home/aistudio/external-libraries/missingpy/utils.py:124: RuntimeWarning: invalid value encountered in sqrt
  return distances if squared else np.sqrt(distances, out=distances)
sns.distplot(X.reshape((-1, 1)))

png

sns.distplot(X_imputed.reshape((-1, 1)))    # 填补缺失数据后的分布

png

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

绿洲213

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值