Pandas: Data Cleaning (1)

Data wrangling:

~

df1:

    fruit   color  price
0   apple     red    8.5
1  banana  yellow    6.8
2  orange  yellow    5.6
3   apple    cyan    7.8
4  banana    cyan    6.4
# group by multiple column names
print(df1.groupby(by=['fruit', 'color'])['price'].mean())
fruit   color 
apple   cyan      7.8
        red       8.5
banana  cyan      6.4
        yellow    6.8
orange  yellow    5.6
Name: price, dtype: float64

df2:
 
        a         b         c         d         e
sum  -0.792502 -0.968316 -0.617287  0.437361 -0.053810
make -0.475013 -1.281450  0.313900 -1.278580 -0.937860
tom   0.341775  0.796679 -1.321376  0.133645 -0.227819
bob  -0.666465 -0.408208 -0.173661 -2.080733 -0.003080
# group the columns by a dict (keys are column labels, values are group names)
m = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red'}
print(df2.groupby(by=m, axis=1).sum())
          blue       red
sum  -0.179926 -1.814628
make -0.964680 -2.694323
tom  -1.187731  0.910635
bob  -2.254394 -1.077752

 

# group by a Series and count the number of members in each group
s1 = pd.Series(m)
print(df2.groupby(by=s1, axis=1).count())
      blue  red
sum      2    3
make     2    3
tom      2    3
bob      2    3

 

# group by a function: len() is applied to each index label
print(df2.groupby(by=len).sum())
          a         b         c         d         e
3 -1.117192 -0.579845 -2.112324 -1.509727 -0.284709
4 -0.475013 -1.281450  0.313900 -1.278580 -0.937860
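
Note: grouping over columns with groupby(..., axis=1) is deprecated in newer pandas (2.x). A minimal sketch of an equivalent, assuming the same df2 and mapping m as above:

# transpose, group the (former) column labels via the dict m, sum, then transpose back
print(df2.T.groupby(m).sum().T)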

 

Reading files:

# read a file: if the CSV has no header row, pandas would otherwise treat the first data row as the header, so pass header=None to keep it as data
df = pd.read_csv('房屋信息.csv', header=None)
print(df)

# you can also supply column names explicitly with names
df = pd.read_csv('房屋信息.csv', names=['a', 'b', 'c', 'd', 'message'])
print(df)

# change the index column: use 'message' as the row index (index_col picks the column)
df = pd.read_csv('房屋信息.csv', names=['a', 'b', 'c', 'd', 'message'], index_col='message')
print(df)

# index_col can also be a list, which builds a hierarchical (multi-level) index
df = pd.read_csv('房屋信息.csv', names=['a', 'b', 'c', 'd', 'message'], index_col=['a', 'b'])
print(df)
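
The file 房屋信息.csv itself is not shown here, so the following is a self-contained sketch of the same header/names/index_col behaviour using an in-memory CSV (the rows below are made up purely for illustration):

import io
import pandas as pd

# a small headerless CSV standing in for 房屋信息.csv
raw = "1,2,3,4,hello\n5,6,7,8,world\n9,10,11,12,foo"

print(pd.read_csv(io.StringIO(raw), header=None))                            # keep the first row as data
print(pd.read_csv(io.StringIO(raw), names=['a', 'b', 'c', 'd', 'message']))  # supply names explicitly
print(pd.read_csv(io.StringIO(raw), names=['a', 'b', 'c', 'd', 'message'],
                  index_col='message'))                                      # use a column as the index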

Data cleaning:

~

data1:


     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
# drop rows in which every value is NaN
print(data1.dropna(how='all'))
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
# drop columns in which every value is NaN
print(data1.dropna(how='all', axis=1))
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN

df:
          0         1         2
0  1.714094 -1.506848 -0.110878
1  0.000021 -0.505049 -0.004595
2 -2.349841  0.638547 -0.087270
3  0.560394 -0.653842 -0.636631
4  0.447427  0.480138  0.196381
5  1.103654  0.899888 -0.717674
6 -0.098616  0.458054  1.213029
# set some values to NaN
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
print(df)
          0         1         2
0  1.714094       NaN       NaN
1  0.000021       NaN       NaN
2 -2.349841       NaN -0.087270
3  0.560394       NaN -0.636631
4  0.447427  0.480138  0.196381
5  1.103654  0.899888 -0.717674
6 -0.098616  0.458054  1.213029
# keep only rows with at least 2 non-NaN values (thresh=2); rows with fewer are dropped
print(df.dropna(thresh=2))
          0         1         2
2 -2.349841       NaN -0.087270
3  0.560394       NaN -0.636631
4  0.447427  0.480138  0.196381
5  1.103654  0.899888 -0.717674
6 -0.098616  0.458054  1.213029
# fill missing values
print(df.fillna(0))
          0         1         2
0  1.714094  0.000000  0.000000
1  0.000021  0.000000  0.000000
2 -2.349841  0.000000 -0.087270
3  0.560394  0.000000 -0.636631
4  0.447427  0.480138  0.196381
5  1.103654  0.899888 -0.717674
6 -0.098616  0.458054  1.213029
# fill per column: NaN in column 1 becomes 1, NaN in column 2 becomes 0
print(df.fillna({1:1, 2:0}))  # add inplace=True to modify in place
          0         1         2
0  1.714094  1.000000  0.000000
1  0.000021  1.000000  0.000000
2 -2.349841  1.000000 -0.087270
3  0.560394  1.000000 -0.636631
4  0.447427  0.480138  0.196381
5  1.103654  0.899888 -0.717674
6 -0.098616  0.458054  1.213029
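
Another common pattern, sketched here on the same df, is to fill each column's NaN with that column's mean:

# fillna accepts a Series; df.mean() is indexed by column,
# so each column's NaN is replaced by that column's mean
print(df.fillna(df.mean()))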

 ~

df2:
          0         1         2
0 -0.070229 -0.853853 -0.739137
1 -0.947690 -1.608765  0.059307
2  2.758675       NaN -0.603239
3 -1.165221       NaN -0.762415
4  0.083660       NaN       NaN
5 -0.244539       NaN       NaN
# forward-fill: each NaN takes the nearest preceding non-NaN value in the same column
print(df2.fillna(method='ffill'))               # remember this parameter: ffill
          0         1         2
0 -0.070229 -0.853853 -0.739137
1 -0.947690 -1.608765  0.059307
2  2.758675 -1.608765 -0.603239
3 -1.165221 -1.608765 -0.762415
4  0.083660 -1.608765 -0.762415
5 -0.244539 -1.608765 -0.762415
# limit how many consecutive NaNs get forward-filled
print(df2.fillna(method='ffill', limit=2))
          0         1         2
0 -0.070229 -0.853853 -0.739137
1 -0.947690 -1.608765  0.059307
2  2.758675 -1.608765 -0.603239
3 -1.165221 -1.608765 -0.762415
4  0.083660       NaN -0.762415
5 -0.244539       NaN -0.762415
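
In newer pandas versions fillna(method='ffill') is deprecated in favour of the ffill() method; a sketch with the same df2:

# same result as fillna(method='ffill', limit=2)
print(df2.ffill(limit=2))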

 ~

data:

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4
# returns a boolean Series: is each row a duplicate of an earlier one?
print(data.duplicated())
0    False
1    False
2    False
3    False
4    False
5    False
6     True
# drop duplicate rows and return a new DataFrame
print(data.drop_duplicates())
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
# deduplicate on specific columns: if k1 repeats, drop the whole row and keep the first occurrence
print(data.drop_duplicates(['k1']))
    k1  k2
0  one   1
1  two   1
# when duplicates occur, keep the last occurrence and drop the earlier ones
print(data.drop_duplicates(keep='last'))            # keep='last' keeps the final occurrence
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
6  two   4
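
The two options can be combined; a small sketch on the same data, deduplicating on k1 only while keeping the last occurrence:

# subset restricts the comparison to k1; keep='last' keeps the final row of each duplicate group
print(data.drop_duplicates(subset=['k1'], keep='last'))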

~

datas:

      food  price
0    apple    1.0
1    bnana    2.0
2   orange    3.5
3  mangguo    2.9
4    Apple    3.5
meat = {
    'apple': 'furit',
    'bnana': 'veg',
    'mangguo': 'fruit',
    'orange': 'fru'
}
# map works element-wise on a Series; apply works on one row or column of a DataFrame at a time; applymap works on every element of a DataFrame (see the sketch after this example)
print(datas['food'].map(meat))               # map takes a mapping: a dict or a function
0    furit
1      veg
2      fru
3    fruit
4      NaN
# convert all values to lowercase with lower
low = datas['food'].str.lower()              # .str exposes string methods on the column
print(low)
0      apple
1      bnana
2     orange
3    mangguo
4      apple
datas['class'] = datas['food'].map(meat)
print(datas)
      food  price  class
0    apple    1.0  furit
1    bnana    2.0    veg
2   orange    3.5    fru
3  mangguo    2.9  fruit
4    Apple    3.5    NaN
# 'Apple' and 'apple' are the same fruit, but the dict keys are all lowercase, so lowercase each value inside a function before looking it up
print(datas['food'].map(lambda x: meat[x.lower()]))
0    furit
1      veg
2      fru
3    fruit
4    furit
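
To illustrate the map/apply/applymap distinction mentioned above, a minimal sketch on a throwaway DataFrame (the frame and the lambdas here are made up for illustration):

import pandas as pd

frame = pd.DataFrame({'x': [1.111, 2.222], 'y': [3.333, 4.444]})

print(frame['x'].map(lambda v: round(v, 1)))            # map: element-wise on a Series
print(frame.apply(lambda col: col.max() - col.min()))   # apply: one column (or row) at a time
print(frame.applymap(lambda v: round(v, 1)))            # applymap: element-wise on the whole DataFrame
                                                        # (newer pandas spells this DataFrame.map)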

df:
      a  b
0  111万  1
1   22万  2
# pull the number out of each string (note: this lambda prints each value and returns None; a variant that returns values is sketched just below)
df['a'].map(lambda x: print(eval(x[0:-1])))
111
22
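
Since the lambda above only prints, here is a sketch of variants that actually return a numeric Series (same df, column 'a'):

# strip the trailing character and convert; both return an int Series
print(df['a'].map(lambda x: int(x[:-1])))
print(df['a'].str[:-1].astype(int))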

~

data:

0        1
1     -999
2        2
3    77777
dtype: int64
# the replace method

print(data.replace([-999, 77777], [np.nan, 0]))     # replace -999 with NaN and 77777 with 0
print(data.replace({-999: np.nan, 77777: 0}))       # the replacement can also be given as a dict
0    1.0
1    NaN
2    2.0
3    0.0
dtype: float64
0    1.0
1    NaN
2    2.0
3    0.0
dtype: float64

 ~

datas = pd.DataFrame(np.arange(12).reshape((3, 4)),
                     index=['bj', 'tk', 'ny'],
                     columns=['one', 'two', 'three', 'four'])

print(datas)
    one  two  three  four
bj    0    1      2     3
tk    4    5      6     7
ny    8    9     10    11
# reindex (reorder the rows)
print(datas.reindex(['tk', 'ny', 'bj']))
      one  two  three  four
tk    4    5      6     7
ny    8    9     10    11
bj    0    1      2     3
# rename labels with rename
print(datas.rename(index={'tk': 'a', 'ny': 'b', 'bj': 'c'}, columns={'one': '1'}))
   1  two  three  four
c  0    1      2     3
a  4    5      6     7
b  8    9     10    11
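
rename also accepts functions as mappers; a small extra sketch on the same datas:

# apply str.upper to every index label and str.title to every column label
print(datas.rename(index=str.upper, columns=str.title))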

~

# discretization and binning
# split ages into groups
ages = [20, 22, 25, 27, 30, 31, 35, 37, 41, 46, 51, 67]
# bin edges
bins = [18, 25, 35, 60, 100]
'''
cut takes two main arguments: the array to bin and the bin edges.
The printed result shows, first, the interval each value falls into
(by default left-open and right-closed, e.g. (18, 25]) and, second,
the ordered list of bin categories. See the right=False sketch
further below for left-closed bins.
'''
cats = pd.cut(ages, bins)

print(cats)
[(18, 25], (18, 25], (18, 25], (25, 35], (25, 35], ..., (35, 60], (35, 60], (35, 60], (35, 60], (60, 100]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# print the bin code (integer index) of each element
print(cats.codes)
[0 0 0 1 1 1 1 2 2 2 2 3]
# print the bin intervals
print(cats.categories)
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
# count how many values fall into each bin, in descending order
print(pd.value_counts(cats))
(25, 35]     4
(35, 60]     4
(18, 25]     3
(60, 100]    1
dtype: int64
names = ['青少年', '青年', '中年', '老年']
print(pd.cut(ages, bins, labels=names))
['青少年', '青少年', '青少年', '青年', '青年', ..., '中年', '中年', '中年', '中年', '老年']
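
As noted above, cut produces left-open, right-closed bins by default; a sketch of flipping that with right=False:

# left-closed, right-open bins: [18, 25), [25, 35), ...
print(pd.cut(ages, bins, right=False))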

~

data3:
[ 0.27299673 -0.76994187 -0.13676037  1.78912866  0.0323877  -0.26431203
  0.47883635  0.31424239  1.16599465 -1.57052829  0.11403125  0.68940353
  0.31311914 -1.86112748 -0.31328714  0.77800633  0.00404619 -0.6806554
  0.57488494  1.22218326]
# split into 4 equal-width bins; precision limits the number of decimal places
print(pd.cut(data3, 4,  precision=2))
[(-0.036, 0.88], (-0.95, -0.036], (-0.95, -0.036], (0.88, 1.79], (-0.036, 0.88], ..., (-0.036, 0.88], (-0.036, 0.88], (-0.95, -0.036], (-0.036, 0.88], (0.88, 1.79]]
Length: 20
Categories (4, interval[float64, right]): [(-1.86, -0.95] < (-0.95, -0.036] < (-0.036, 0.88] <
                                           (0.88, 1.79]]

 ~

# qcut bins by sample quantiles, so each bin holds (roughly) the same number of values
data4 = np.random.randn(100)
print(pd.qcut(data4, 4))
print(pd.value_counts(pd.qcut(data4, 4)))
[(0.394, 1.686], (0.00123, 0.394], (0.00123, 0.394], (-2.541, -0.725], (0.00123, 0.394], ..., (-0.725, 0.00123], (0.394, 1.686], (-0.725, 0.00123], (0.394, 1.686], (-2.541, -0.725]]
Length: 100
Categories (4, interval[float64, right]): [(-2.541, -0.725] < (-0.725, 0.00123] < (0.00123, 0.394] <
                                           (0.394, 1.686]]
(-2.541, -0.725]     25
(-0.725, 0.00123]    25
(0.00123, 0.394]     25
(0.394, 1.686]       25
dtype: int64
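
qcut can also take explicit quantile edges instead of a bin count; a sketch with the same data4:

# cut at the 10th, 50th and 90th percentiles
print(pd.value_counts(pd.qcut(data4, [0, 0.1, 0.5, 0.9, 1.0])))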

# detecting and filtering outliers
data5 = pd.DataFrame(np.random.randn(100, 4))
print(data5)

print('-'*50)
# find values whose absolute value exceeds 1
print(np.abs(data5) > 1)
print(data5[(np.abs(data5) > 1).any(axis=1)])        # any: key point! a row gives True if at least one of its values is True
print(data5.describe())
           0         1         2         3
0   0.888472  0.352992 -0.137076  1.734222
1  -0.178694 -0.995964 -0.220334 -0.240809
2  -0.956168  0.056116 -1.161986  0.621710
3  -0.132241 -0.772302  1.752085 -0.198287
4  -0.929681  1.649626 -0.640536  0.218673
..       ...       ...       ...       ...
95  2.107463  0.503272  1.088572  1.518699
96  0.391785 -0.386596 -1.014523 -0.204848
97 -1.149701 -0.007344  0.082031 -0.711486
98  1.757191 -0.072677 -0.232690 -0.061123
99 -0.478114  1.468151 -0.517214 -1.730135

[100 rows x 4 columns]
--------------------------------------------------
        0      1      2      3
0   False  False  False   True
1   False  False  False  False
2   False  False   True  False
3   False  False   True  False
4   False   True  False  False
..    ...    ...    ...    ...
95   True  False   True   True
96  False  False   True  False
97   True  False  False  False
98   True  False  False  False
99  False   True  False   True

[100 rows x 4 columns]





           0         1         2         3
0   0.888472  0.352992 -0.137076  1.734222
2  -0.956168  0.056116 -1.161986  0.621710
3  -0.132241 -0.772302  1.752085 -0.198287
4  -0.929681  1.649626 -0.640536  0.218673
6   0.683864  2.038768 -0.202977  0.137352
..       ...       ...       ...       ...
95  2.107463  0.503272  1.088572  1.518699
96  0.391785 -0.386596 -1.014523 -0.204848
97 -1.149701 -0.007344  0.082031 -0.711486
98  1.757191 -0.072677 -0.232690 -0.061123
99 -0.478114  1.468151 -0.517214 -1.730135

[79 rows x 4 columns]



                0           1           2           3
count  100.000000  100.000000  100.000000  100.000000
mean     0.148266    0.179241    0.003147    0.039621
std      0.958056    1.055200    1.049992    0.949855
min     -1.980334   -2.545607   -2.459421   -2.134215
25%     -0.481164   -0.381006   -0.655402   -0.518193
50%      0.083503    0.233742    0.046225   -0.047955
75%      0.813610    0.922826    0.716252    0.685000
max      2.573760    3.501705    2.311853    2.250750
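
A common follow-up, sketched here on the same data5, is to cap the outliers instead of only selecting them:

# cap every value whose absolute value exceeds 3 at +/-3, keeping its sign
data5[np.abs(data5) > 3] = np.sign(data5) * 3
print(data5.describe())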
