数据科学 5.4 数据清洗(代码)
5.4 数据清洗
import pandas as pd
import os
import numpy as np
# os.chdir(r"D:\Python_Training\script_Python\5Preprocessing")
camp = pd.read_csv(r'.\teleco_camp_orig.csv')
camp.head()
ID | Suc_flag | ARPU | PromCnt12 | PromCnt36 | PromCntMsg12 | PromCntMsg36 | Class | Age | Gender | HomeOwner | AvgARPU | AvgHomeValue | AvgIncome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12 | 1 | 50.0 | 6 | 10 | 2 | 3 | 4 | 57.0 | M | H | 49.894904 | 33400 | 39460 |
1 | 53 | 0 | NaN | 5 | 9 | 1 | 4 | 3 | 55.0 | M | H | 48.574742 | 37600 | 33545 |
2 | 67 | 1 | 25.0 | 6 | 11 | 2 | 4 | 1 | 57.0 | F | H | 49.272646 | 100400 | 42091 |
3 | 71 | 1 | 80.0 | 7 | 10 | 2 | 4 | 1 | 52.0 | F | H | 47.334953 | 39900 | 39313 |
4 | 142 | 1 | 15.0 | 6 | 11 | 2 | 4 | 1 | NaN | F | U | 47.827404 | 47500 | 0 |
5.4.1、脏数据或数据不正确
import matplotlib.pyplot as plt
plt.hist(camp['AvgIncome'], bins=20, density=True)#查看分布情况
camp['AvgIncome'].describe(include='all')
count 9686.000000
mean 40491.444249
std 28707.494146
min 0.000000
25% 24464.000000
50% 43100.000000
75% 56876.000000
max 200001.000000
Name: AvgIncome, dtype: float64
plt.hist(camp['AvgHomeValue'], bins=20, density=True)#查看分布情况
camp['AvgHomeValue'].describe(include='all')
count 9686.000000
mean 110986.299814
std 98670.855450
min 0.000000
25% 52300.000000
50% 76900.000000
75% 128175.000000
max 600000.000000
Name: AvgHomeValue, dtype: float64
#这里的0值应该是缺失值
camp['AvgIncome']=camp['AvgIncome'].replace({0: np.NaN})
#像这种外部获取的数据要比较小心,经常出现意义不清晰或这错误值。AvgHomeValue也有这种情况
plt.hist(camp['AvgIncome'], bins=20, density=True,range=(camp.AvgIncome.min(),camp.AvgIncome.max()))#由于数据中存在缺失值,需要指定绘图的值域
camp['AvgIncome'].describe(include='all')
count 7329.000000
mean 53513.457361
std 19805.168339
min 2499.000000
25% 40389.000000
50% 48699.000000
75% 62385.000000
max 200001.000000
Name: AvgIncome, dtype: float64
camp['AvgHomeValue']=camp['AvgHomeValue'].replace({0: np.NaN})
plt.hist(camp['AvgHomeValue'], bins=20, density=True,range=(camp.AvgHomeValue.min(),camp.AvgHomeValue.max()))#由于数据中存在缺失值,需要指定绘图的值域
camp['AvgHomeValue'].describe(include='all')
count 9583.000000
mean 112179.202755
std 98522.888583
min 7500.000000
25% 53200.000000
50% 77700.000000
75% 129350.000000
max 600000.000000
Name: AvgHomeValue, dtype: float64
5.4.2、数据不一致
- 这个问题需要详细的结合描述统计进行变量说明核对
1、数据重复
camp['dup'] = camp.duplicated() # 生成重复标识变量
camp.dup.head()
0 False
1 False
2 False
3 False
4 False
Name: dup, dtype: bool
#本数据没有重复记录,此处只是示例
camp_dup = camp[camp['dup'] == True] # 把有重复的数据保存出来,以备核查
camp_nodup = camp[camp['dup'] == False] # 注意与camp.drop_duplicates()的区别
camp_nodup.head()
ID | Suc_flag | ARPU | PromCnt12 | PromCnt36 | PromCntMsg12 | PromCntMsg36 | Class | Age | Gender | HomeOwner | AvgARPU | AvgHomeValue | AvgIncome | dup | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12 | 1 | 50.0 | 6 | 10 | 2 | 3 | 4 | 57.0 | M | H | 49.894904 | 33400.0 | 39460.0 | False |
1 | 53 | 0 | NaN | 5 | 9 | 1 | 4 | 3 | 55.0 | M | H | 48.574742 | 37600.0 | 33545.0 | False |
2 | 67 | 1 | 25.0 | 6 | 11 | 2 | 4 | 1 | 57.0 | F | H | 49.272646 | 100400.0 | 42091.0 | False |
3 | 71 | 1 | 80.0 | 7 | 10 | 2 | 4 | 1 | 52.0 | F | H | 47.334953 | 39900.0 | 39313.0 | False |
4 | 142 | 1 | 15.0 | 6 | 11 | 2 | 4 | 1 | NaN | F | U | 47.827404 | 47500.0 | NaN | False |
camp['dup1'] = camp['ID'].duplicated() # 按照主键进行重复记录标识
2、缺失值处理
-
如果count数量少于样本量,说明存在缺失
-
缺失最多的两个变量是Age和AvgIncome,缺失了大概20%。
-
其他有缺失变量请自行填补,找到一个有缺失的分类变量,使用众数进行填补
-
多重插补:sklearn.preprocessing.Imputer仅可用于填补均值、中位数、众数,多重插补可考虑使用Orange、impute、Theano等包
-
多重插补的处理有两个要点:1、被解释变量有缺失值的观测不能填补,只能删除;2、只对放入模型的解释变量进行插补。
camp.describe()
ID | Suc_flag | ARPU | PromCnt12 | PromCnt36 | PromCntMsg12 | PromCntMsg36 | Class | Age | AvgARPU | AvgHomeValue | AvgIncome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 9686.000000 | 9686.000000 | 4843.000000 | 9686.000000 | 9686.000000 | 9686.000000 | 9686.000000 | 9686.000000 | 7279.000000 | 9686.000000 | 9583.000000 | 7329.000000 |
mean | 97975.474086 | 0.500000 | 78.121722 | 3.495251 | 7.466963 | 1.034586 | 2.323044 | 2.424530 | 49.567386 | 52.905156 | 112179.202755 | 53513.457361 |
std | 56550.171120 | 0.500026 | 62.225686 | 1.270258 | 1.977909 | 0.244171 | 0.904083 | 1.049047 | 6.991306 | 4.993775 | 98522.888583 | 19805.168339 |
min | 12.000000 | 0.000000 | 5.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 16.000000 | 46.138968 | 7500.000000 | 2499.000000 |
25% | 48835.500000 | 0.000000 | 50.000000 | 3.000000 | 6.000000 | 1.000000 | 1.000000 | 2.000000 | 45.000000 | 49.760116 | 53200.000000 | 40389.000000 |
50% | 99106.000000 | 0.500000 | 65.000000 | 3.000000 | 8.000000 | 1.000000 | 3.000000 | 2.000000 | 50.000000 | 50.876672 | 77700.000000 | 48699.000000 |
75% | 148538.750000 | 1.000000 | 100.000000 | 4.000000 | 8.000000 | 1.000000 | 3.000000 | 3.000000 | 55.000000 | 54.452822 | 129350.000000 | 62385.000000 |
max | 191779.000000 | 1.000000 | 1000.000000 | 15.000000 | 20.000000 | 4.000000 | 6.000000 | 4.000000 | 60.000000 | 99.444787 | 600000.000000 | 200001.000000 |
vmean = camp['Age'].mean(axis=0, skipna=True)
camp['Age_empflag'] = camp['Age'].isnull() #空值标记
camp['Age']= camp['Age'].fillna(vmean) #用平均值填补空值
camp['Age'].describe()
count 9686.000000
mean 49.567386
std 6.060585
min 16.000000
25% 47.000000
50% 49.567386
75% 54.000000
max 60.000000
Name: Age, dtype: float64
camp
ID | Suc_flag | ARPU | PromCnt12 | PromCnt36 | PromCntMsg12 | PromCntMsg36 | Class | Age | Gender | HomeOwner | AvgARPU | AvgHomeValue | AvgIncome | dup | dup1 | Age_empflag | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12 | 1 | 50.0 | 6 | 10 | 2 | 3 | 4 | 57.000000 | M | H | 49.894904 | 33400.0 | 39460.0 | False | False | False |
1 | 53 | 0 | NaN | 5 | 9 | 1 | 4 | 3 | 55.000000 | M | H | 48.574742 | 37600.0 | 33545.0 | False | False | False |
2 | 67 | 1 | 25.0 | 6 | 11 | 2 | 4 | 1 | 57.000000 | F | H | 49.272646 | 100400.0 | 42091.0 | False | False | False |
3 | 71 | 1 | 80.0 | 7 | 10 | 2 | 4 | 1 | 52.000000 | F | H | 47.334953 | 39900.0 | 39313.0 | False | False | False |
4 | 142 | 1 | 15.0 | 6 | 11 | 2 | 4 | 1 | 49.567386 | F | U | 47.827404 | 47500.0 | NaN | False | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9681 | 191547 | 1 | 75.0 | 4 | 9 | 2 | 3 | 2 | 58.000000 | F | U | 58.403995 | 31800.0 | 42358.0 | False | False | False |
9682 | 191649 | 0 | NaN | 3 | 8 | 1 | 3 | 1 | 55.000000 | F | U | 49.166689 | 38200.0 | 42373.0 | False | False | False |
9683 | 191663 | 1 | 50.0 | 2 | 7 | 1 | 3 | 2 | 54.000000 | M | U | 49.434637 | 53800.0 | NaN | False | False | False |
9684 | 191672 | 0 | NaN | 4 | 8 | 1 | 3 | 1 | 59.000000 | F | U | 48.634900 | 36600.0 | 44023.0 | False | False | False |
9685 | 191779 | 1 | 750.0 | 3 | 5 | 1 | 2 | 2 | 54.000000 | M | U | 62.081260 | 143200.0 | 79635.0 | False | False | False |
9686 rows × 17 columns
vmean = camp['AvgHomeValue'].mean(axis=0, skipna=True)
camp['AvgHomeValue_empflag'] = camp['AvgHomeValue'].isnull()
camp['AvgHomeValue']= camp['AvgHomeValue'].fillna(vmean)
camp['AvgHomeValue'].describe()
count 9686.000000
mean 112179.202755
std 97997.592632
min 7500.000000
25% 53500.000000
50% 78450.000000
75% 128175.000000
max 600000.000000
Name: AvgHomeValue, dtype: float64
vmean = camp['AvgIncome'].mean(axis=0, skipna=True)
camp['AvgIncome_empflag'] = camp['AvgIncome'].isnull()
camp['AvgIncome']= camp['AvgIncome'].fillna(vmean)
camp['AvgIncome'].describe()
count 9686.000000
mean 53513.457361
std 17227.468161
min 2499.000000
25% 42775.000000
50% 53513.457361
75% 56876.000000
max 200001.000000
Name: AvgIncome, dtype: float64
5.4.3、噪声值处理
1、盖帽法
def blk(floor, root): # 'blk' will return a function
def f(x):
if x < floor:
x = floor
elif x > root:
x = root
return x
return f
q1 = camp['Age'].quantile(0.01) # 计算百分位数
q99 = camp['Age'].quantile(0.99)
blk_tot = blk(floor=q1, root=q99) # 'blk_tot' is a function
camp['Age']= camp['Age'].map(blk_tot)
camp['Age'].describe()
count 9686.000000
mean 49.624685
std 5.835803
min 31.000000
25% 47.000000
50% 49.567386
75% 54.000000
max 60.000000
Name: Age, dtype: float64
2、分箱法
- 分箱法——等宽分箱
camp['Age_group1'] = pd.qcut( camp['Age'], 4) # 这里以age_oldest_tr字段等宽分为4段
camp.Age_group1
0 (54.0, 60.0]
1 (54.0, 60.0]
2 (54.0, 60.0]
3 (49.567, 54.0]
4 (47.0, 49.567]
...
9681 (54.0, 60.0]
9682 (54.0, 60.0]
9683 (49.567, 54.0]
9684 (54.0, 60.0]
9685 (49.567, 54.0]
Name: Age_group1, Length: 9686, dtype: category
Categories (4, interval[float64]): [(30.999, 47.0] < (47.0, 49.567] < (49.567, 54.0] < (54.0, 60.0]]
- 分箱法——等深分箱
camp['Age_group2'] = pd.cut( camp['Age'], 4) # 这里以age_oldest_tr字段等宽分为4段
camp.Age_group2
0 (52.75, 60.0]
1 (52.75, 60.0]
2 (52.75, 60.0]
3 (45.5, 52.75]
4 (45.5, 52.75]
...
9681 (52.75, 60.0]
9682 (52.75, 60.0]
9683 (52.75, 60.0]
9684 (52.75, 60.0]
9685 (52.75, 60.0]
Name: Age_group2, Length: 9686, dtype: category
Categories (4, interval[float64]): [(30.971, 38.25] < (38.25, 45.5] < (45.5, 52.75] < (52.75, 60.0]]
camp.to_csv('tele_camp_ok.csv')