ch7:数据清洗和准备

数据清洗和准备

import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

处理缺失值

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool

1.过滤掉丢失的数据

from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
data[data.notnull()]
0    1.0
2    3.5
4    7.0
dtype: float64

2.按行删除缺失值

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
cleaned = data.dropna()
cleaned
     0    1    2
0  1.0  6.5  3.0

3.仅删除行值全为NaN的对应行

data.dropna(how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0

4.删除列全为NaN的对应的列

data[4] = NA
data
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
data.dropna(axis=1, how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0

举例

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
          0         1         2
0 -0.204708       NaN       NaN
1 -0.555730       NaN       NaN
2  0.092908       NaN  0.769023
3  1.246435       NaN -1.296221
4  0.274992  0.228913  1.352917
5  0.886429 -2.001637 -0.371843
6  1.669025 -0.438570 -0.539741
df.dropna()
          0         1         2
4  0.274992  0.228913  1.352917
5  0.886429 -2.001637 -0.371843
6  1.669025 -0.438570 -0.539741
df.dropna(thresh=2)
          0         1         2
2  0.092908       NaN  0.769023
3  1.246435       NaN -1.296221
4  0.274992  0.228913  1.352917
5  0.886429 -2.001637 -0.371843
6  1.669025 -0.438570 -0.539741

填充缺失值

1.填充0

df.fillna(0)   
          0         1         2
0 -0.204708  0.000000  0.000000
1 -0.555730  0.000000  0.000000
2  0.092908  0.000000  0.769023
3  1.246435  0.000000 -1.296221
4  0.274992  0.228913  1.352917
5  0.886429 -2.001637 -0.371843
6  1.669025 -0.438570 -0.539741

2.按列填充不同的数字

df.fillna({1: 0.5, 2: 0})   
          0         1         2
0 -0.204708  0.500000  0.000000
1 -0.555730  0.500000  0.000000
2  0.092908  0.500000  0.769023
3  1.246435  0.500000 -1.296221
4  0.274992  0.228913  1.352917
5  0.886429 -2.001637 -0.371843
6  1.669025 -0.438570 -0.539741

数据转换

1.对数据进行去重处理(各指标都相同)

data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4
data.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
data.drop_duplicates()
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
data['v1'] = range(7)
data
    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6

2.对数据进行去重处理(某个或者某几个指标都相同)

data.drop_duplicates(['k1'])
    k1  k2  v1
0  one   1   0
1  two   1   1
data.drop_duplicates(['k1', 'k2'], keep='last')
    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
6  two   4   6

对Series进行替换值

data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
data.replace(-999, np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
data.replace([-999, -1000], np.nan)
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
data.replace([-999, -1000], [np.nan, 0])
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
data.replace({-999: np.nan, -1000: 0})
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Renaming Axis Indexes

data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
data.rename(index=str.title, columns=str.upper)
          ONE  TWO  THREE  FOUR
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11

1.将一维数据进行频率统计,分区间

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')
pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

2.将一维数据区间进行重命名,概念化

group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats1=pd.cut(ages, bins, labels=group_names)
cats1
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats
[(0.678, 3.26], (-3.746, -0.64], (-0.0251, 0.678], (-3.746, -0.64], (-0.64, -0.0251], ..., (-3.746, -0.64], (-0.0251, 0.678], (-3.746, -0.64], (-0.64, -0.0251], (-3.746, -0.64]]
Length: 1000
Categories (4, interval[float64]): [(-3.746, -0.64] < (-0.64, -0.0251] < (-0.0251, 0.678] < (0.678, 3.26]]
pd.value_counts(cats)
(0.678, 3.26]       250
(-0.0251, 0.678]    250
(-0.64, -0.0251]    250
(-3.746, -0.64]     250
dtype: int64
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
[(1.338, 3.26], (-1.296, -0.0251], (-0.0251, 1.338], (-1.296, -0.0251], (-1.296, -0.0251], ..., (-1.296, -0.0251], (-0.0251, 1.338], (-3.746, -1.296], (-1.296, -0.0251], (-1.296, -0.0251]]
Length: 1000
Categories (4, interval[float64]): [(-3.746, -1.296] < (-1.296, -0.0251] < (-0.0251, 1.338] < (1.338, 3.26]]

检测和过滤异常值

data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.048908    -0.062646     0.007473    -0.008516
std       0.991455     1.007185     1.025612     1.001500
min      -3.194414    -3.530912    -3.183867    -3.481593
25%      -0.752792    -0.755464    -0.690993    -0.694234
50%      -0.041676    -0.032640     0.004415     0.025105
75%       0.617693     0.617544     0.713606     0.694813
max       3.023720     2.916153     3.189940     2.961194

1.对某一列进行检测

col = data[2]     #第三列
col
0     -2.296733
1     -0.908114
2     -1.488200
3      0.074264
4      1.242359
         ...   
995   -0.105099
996   -1.343030
997   -0.671807
998    0.307430
999    0.998089
Name: 2, Length: 1000, dtype: float64
col[np.abs(col) > 3]
17    -3.183867
64    -3.140963
457    3.082067
485    3.189940
851   -3.024110
Name: 2, dtype: float64

2.对整个数据框进行检测

data[(np.abs(data) > 3).any(1)]
            0         1         2         3
17  -0.274138  1.188742 -3.183867  1.050471
64   1.741426 -2.214074 -3.140963 -1.509976
230 -3.194414  0.077839 -1.733549  0.235425
260  3.023720 -1.105312  0.105141  0.995257
288  0.062528  2.368010  0.452649 -3.481593
457 -0.071320  0.164293  3.082067 -0.516982
485  0.617599 -0.843849  3.189940  0.070978
650 -3.044612 -1.193980  0.862312  1.012656
683  0.069036  0.617561 -1.148738 -3.170292
840 -3.105636 -0.369009  0.131459 -2.540833
851 -1.414637  0.123291 -3.024110 -1.168413
912  0.691626 -3.530912 -0.576175 -0.750648

3.前五行数据的正负

np.sign(data).head()
     0    1    2    3
0  1.0 -1.0 -1.0 -1.0
1 -1.0 -1.0 -1.0 -1.0
2 -1.0  1.0 -1.0 -1.0
3  1.0  1.0  1.0  1.0
4 -1.0 -1.0  1.0 -1.0

排列和随机抽样

df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler
array([1, 3, 2, 4, 0])
df
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19

1.改变行排序

df.take(sampler)
    0   1   2   3
1   4   5   6   7
3  12  13  14  15
2   8   9  10  11
4  16  17  18  19
0   0   1   2   3

2.按行抽样

df.sample(n=3)
    0   1   2   3
3  12  13  14  15
1   4   5   6   7
2   8   9  10  11
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)  #replace 是否放回抽样,默认false为不放回
draws
3    6
0    5
1    7
4    4
2   -1
0    5
0    5
1    7
4    4
0    5
dtype: int64

字符串操作

字符串对象方法

val = 'a,b,  guido'
val.split(',')
['a', 'b', '  guido']
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
first + '::' + second + '::' + third
'a::b::guido'
'::'.join(pieces)
'a::b::guido'
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值