Data Cleansing and Normalizing

Data Cleaning

Cleaning Rows with NaNs

import pandas as pd
df = pd.read_csv('NaNDataset.csv')
df.isnull().sum()

A    0
B    2
C    0
dtype: int64
print(df)
    A     B   C
0   1   2.0   3
1   4   NaN   6
2   7   NaN   9
3  10  11.0  12
4  13  14.0  15
5  16  17.0  18
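
If NaNDataset.csv is not at hand, an equivalent DataFrame can be built in memory; a minimal sketch using the values from the printout above:

import pandas as pd
import numpy as np

# rebuild the same six-row dataset (values copied from the output above)
df = pd.DataFrame({
    'A': [1, 4, 7, 10, 13, 16],
    'B': [2.0, np.nan, np.nan, 11.0, 14.0, 17.0],
    'C': [3, 6, 9, 12, 15, 18]
})
print(df.isnull().sum())          # B should report 2 missing values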

Replacing NaN with the Mean of the Column

# replace all the NaNs in column B with the average of column B
df.B = df.B.fillna(df.B.mean())
print(df)

    A     B   C
0   1   2.0   3
1   4  11.0   6
2   7  11.0   9
3  10  11.0  12
4  13  14.0  15
5  16  17.0  18
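
The same idea extends beyond a single column; a minimal sketch (not part of the original listing) that fills every numeric column with its own mean in one call:

# fill each numeric column's NaNs with that column's mean
df = df.fillna(df.mean(numeric_only=True))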

Removing Rows with NaN

df = pd.read_csv('NaNDataset.csv')
df = df.dropna()                             # drop all rows with NaN
print(df)

    A     B   C
0   1   2.0   3
3  10  11.0  12
4  13  14.0  15
5  16  17.0  18
df = df.reset_index(drop=True)               # reset the index
print(df)

    A     B   C
0   1   2.0   3
1  10  11.0  12
2  13  14.0  15
3  16  17.0  18
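
dropna() also accepts a few options worth knowing; an illustrative sketch (these calls are not from the original listing):

df = pd.read_csv('NaNDataset.csv')
print(df.dropna(subset=['B']))               # drop a row only when column B is NaN
print(df.dropna(thresh=3))                   # keep rows with at least 3 non-NaN values
print(df.dropna().reset_index(drop=True))    # drop and renumber in one chain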

Removing Duplicate Rows

import pandas as pd
df = pd.read_csv('DuplicateRows.csv')
print(df.duplicated(keep=False))

0    False
1     True
2     True
3    False
4    False
5     True
6     True
7    False
8    False
dtype: bool
print(df.duplicated(keep="first"))

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
8    False
dtype: bool
print(df[df.duplicated(keep=False)])

    A   B   C
1   4   5   6
2   4   5   6
5  10  11  12
6  10  11  12
df.drop_duplicates(keep='first', inplace=True)  # remove duplicates and keep the first
print(df)

    A   B   C
0   1   2   3
1   4   5   6
3   7   8   9
4   7  18   9
5  10  11  12
7  13  14  15
8  16  17  18
df.drop_duplicates(subset=['A', 'C'], keep='last',
                   inplace=True)             # remove all duplicates in columns
                                             # A and C and keep the last
print(df)

    A   B   C
0   1   2   3
1   4   5   6
4   7  18   9
5  10  11  12
7  13  14  15
8  16  17  18
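
To discard every member of a duplicated group rather than keeping one copy, keep=False works with drop_duplicates() as well; a small sketch:

df = pd.read_csv('DuplicateRows.csv')
print(df.drop_duplicates(keep=False))        # rows 1, 2, 5, and 6 would all be removed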

Normalizing Columns

import pandas as pd
from sklearn import preprocessing

df = pd.read_csv('NormalizeColumns.csv')
print(df)

x = df.values.astype(float)

min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled, columns=df.columns)
print(df)

      A   B   C
0  1000   2   3
1   400   5   6
2   700   6   9
3   100  11  12
4  1300  14  15
5  1600  17  18
     A         B    C
0  0.6  0.000000  0.0
1  0.2  0.200000  0.2
2  0.4  0.266667  0.4
3  0.0  0.600000  0.6
4  0.8  0.800000  0.8
5  1.0  1.000000  1.0
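
MinMaxScaler rescales each column to (x - min) / (max - min); a pandas-only sketch of the same computation, which should reproduce the numbers above:

df = pd.read_csv('NormalizeColumns.csv')
df_scaled = (df - df.min()) / (df.max() - df.min())   # column-wise (x - min) / (max - min)
print(df_scaled)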
from sklearn.preprocessing import Normalizer
df = pd.read_csv('NormalizeColumns.csv')
print(df)
      A   B   C
0  1000   2   3
1   400   5   6
2   700   6   9
3   100  11  12
4  1300  14  15
5  1600  17  18
normalizer = Normalizer()
x_scaled = normalizer.fit_transform(df.values.astype('float'))
df = pd.DataFrame(x_scaled, columns=df.columns)
print(df)
          A         B         C
0  0.999994  0.002000  0.003000
1  0.999809  0.012498  0.014997
2  0.999881  0.008570  0.012856
3  0.987008  0.108571  0.118441
4  0.999875  0.010768  0.011537
5  0.999880  0.010624  0.011249
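
Note that Normalizer works row by row: each sample is divided by its own L2 norm, which is why column A dominates every row above. A small NumPy sketch of the same computation:

import numpy as np
import pandas as pd

df = pd.read_csv('NormalizeColumns.csv')
x = df.values.astype(float)
l2 = np.linalg.norm(x, axis=1, keepdims=True)      # length of each row vector
print(pd.DataFrame(x / l2, columns=df.columns))    # matches the Normalizer output above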

Removing Outliers

Tukey Fences

import numpy as np

def outliers_iqr(data):
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((data > upper_bound) | (data < lower_bound))

import pandas as pd
df = pd.read_csv("http://www.mosaic-web.org/go/datasets/galton.csv")
print(df.head())

  family  father  mother sex  height  nkids
0      1    78.5    67.0   M    73.2      4
1      1    78.5    67.0   F    69.2      4
2      1    78.5    67.0   F    69.0      4
3      1    78.5    67.0   F    69.0      4
4      2    75.5    66.5   M    73.5      4
outliers_iqr(df['height'])
(array([288]),)
print("Outliers using outliers_iqr()")
print("=============================")
for i in outliers_iqr(df.height)[0]:
    print(df[i:i+1])

Outliers using outliers_iqr()
=============================
    family  father  mother sex  height  nkids
288     72    70.0    65.0   M    79.0      7
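
The function above only reports the positions of the outliers; a hedged sketch of actually dropping them from the DataFrame:

outlier_positions = outliers_iqr(df['height'])[0]    # positional indices, e.g. [288]
df_clean = df.drop(df.index[outlier_positions])      # drop those rows by label
print(len(df), '->', len(df_clean))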

Z-Score

def outliers_z_score(data):
    threshold = 3
    mean = np.mean(data)
    std = np.std(data)
    z_scores = [(y - mean) / std for y in data]
    return np.where(np.abs(z_scores) > threshold)

print("Outliers using outliers_z_score()")
print("=================================")
for i in outliers_z_score(df.height)[0]:
    print(df[i:i+1])
print()

Outliers using outliers_z_score()
=================================
    family  father  mother sex  height  nkids
125     35    71.0    69.0   M    78.0      5
    family  father  mother sex  height  nkids
288     72    70.0    65.0   M    79.0      7
    family  father  mother sex  height  nkids
672    155    68.0    60.0   F    56.0      7
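
If SciPy is available, the same check can be written with scipy.stats.zscore (an assumption on my part, not part of the original listing):

from scipy import stats

z = np.abs(stats.zscore(df['height']))    # absolute z-score of every height
print(df[z > 3])                          # should flag the same three rows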
