[Python Data Analysis Study Notes, Day 3] (3) The pandas data-analysis library, data cleaning, and K-Means clustering


Pandas tutorial reference: https://blog.csdn.net/qq_41251963/article/details/103904044


Pandas Data Structures

import pandas as pd
  • Series
# Build a Series from a list
ser_obj = pd.Series(range(10, 20))
print(type(ser_obj))
<class 'pandas.core.series.Series'>
# Get the values
print(ser_obj.values)

# Get the index
print(ser_obj.index)
[10 11 12 13 14 15 16 17 18 19]
RangeIndex(start=0, stop=10, step=1)
# Preview the first rows
print(ser_obj.head(3))
0    10
1    11
2    12
dtype: int32
print(ser_obj)
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32
# Access values by index
print(ser_obj[0])
print(ser_obj[8])
10
18
# The index-to-value correspondence is preserved in the results of array operations
print(ser_obj * 2)
print(ser_obj > 15)
0    20
1    22
2    24
3    26
4    28
5    30
6    32
7    34
8    36
9    38
dtype: int32
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool
# Build a Series from a dict
year_data = {2001: 17.8, 2002: 20.1, 2003: 16.5}
ser_obj2 = pd.Series(year_data)
print(ser_obj2.head())
print(ser_obj2.index)
2001    17.8
2002    20.1
2003    16.5
dtype: float64
Int64Index([2001, 2002, 2003], dtype='int64')
# The name attribute
ser_obj2.name = 'temp'
ser_obj2.index.name = 'year'
print(ser_obj2.head())
year
2001    17.8
2002    20.1
2003    16.5
Name: temp, dtype: float64
  • DataFrame
import numpy as np

# Build a DataFrame from an ndarray
array = np.random.randn(5,4)
print(array)

df_obj = pd.DataFrame(array)
print(df_obj.head())
[[-1.4293778   0.67288062 -0.45636593  0.21228057]
 [ 2.25005309  0.88655282  0.90643595 -0.70144272]
 [ 0.07317507  0.19102656 -0.04193859 -0.5824688 ]
 [ 0.40290136 -1.07145513 -1.15744649  1.65085608]
 [-1.1389536   0.46239919 -0.7448839   0.66148365]]
          0         1         2         3
0 -1.429378  0.672881 -0.456366  0.212281
1  2.250053  0.886553  0.906436 -0.701443
2  0.073175  0.191027 -0.041939 -0.582469
3  0.402901 -1.071455 -1.157446  1.650856
4 -1.138954  0.462399 -0.744884  0.661484
# Build a DataFrame from a dict
dict_data = {'A': 1., 
             'B': pd.Timestamp('20161217'),
             'C': pd.Series(1, index=list(range(4)),dtype='float32'),
             'D': np.array([3] * 4,dtype='int32'),
             'E' : pd.Categorical(["Python","Java","C++","C#"]),
             'F' : 'ChinaHadoop' }
#print(dict_data)
df_obj2 = pd.DataFrame(dict_data)
print(df_obj2.head())
     A          B    C  D       E            F
0  1.0 2016-12-17  1.0  3  Python  ChinaHadoop
1  1.0 2016-12-17  1.0  3    Java  ChinaHadoop
2  1.0 2016-12-17  1.0  3     C++  ChinaHadoop
3  1.0 2016-12-17  1.0  3      C#  ChinaHadoop
# Access a column by its label
print(df_obj2['A'])
print(type(df_obj2['A']))

print(df_obj2.A)
0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64
<class 'pandas.core.series.Series'>
0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64
# Add a column
df_obj2['G'] = df_obj2['D'] + 4
print(df_obj2.head())
     A          B    C  D       E            F  G
0  1.0 2016-12-17  1.0  3  Python  ChinaHadoop  7
1  1.0 2016-12-17  1.0  3    Java  ChinaHadoop  7
2  1.0 2016-12-17  1.0  3     C++  ChinaHadoop  7
3  1.0 2016-12-17  1.0  3      C#  ChinaHadoop  7
# Delete a column
del df_obj2['G']
print(df_obj2.head())
     A          B    C  D       E            F
0  1.0 2016-12-17  1.0  3  Python  ChinaHadoop
1  1.0 2016-12-17  1.0  3    Java  ChinaHadoop
2  1.0 2016-12-17  1.0  3     C++  ChinaHadoop
3  1.0 2016-12-17  1.0  3      C#  ChinaHadoop
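As a side note, a non-destructive alternative to del is drop, which returns a new DataFrame and leaves the original untouched (a minimal sketch; the columns= keyword needs pandas 0.21+):

# Return a copy without column 'F' instead of mutating df_obj2 in place
df_obj2_no_f = df_obj2.drop(columns=['F'])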
  • Index objects
print(type(ser_obj.index))
print(type(df_obj2.index))

print(df_obj2.index)
<class 'pandas.core.indexes.range.RangeIndex'>
<class 'pandas.core.indexes.numeric.Int64Index'>
Int64Index([0, 1, 2, 3], dtype='int64')
# Index objects are immutable
df_obj2.index[0] = 2
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-22-7f40a356d7d1> in <module>()
      1 # 索引对象不可变
----> 2 df_obj2.index[0] = 2


D:\Anaconda\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
   1668 
   1669     def __setitem__(self, key, value):
-> 1670         raise TypeError("Index does not support mutable operations")
   1671 
   1672     def __getitem__(self, key):


TypeError: Index does not support mutable operations
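Immutability only forbids element-wise assignment; replacing the whole index object is still allowed (a minimal sketch):

# Assign an entirely new index instead of mutating the old one
df_obj2.index = [3, 2, 1, 0]
print(df_obj2.index)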

Pandas Data Operations

import pandas as pd
  • Series indexing
ser_obj = pd.Series(range(5), index = ['a', 'b', 'c', 'd', 'e'])
print(ser_obj.head())
a    0
b    1
c    2
d    3
e    4
dtype: int32
# Label indexing
print(ser_obj['a'])
print(ser_obj[0])
0
0
# Slice indexing
print(ser_obj[1:3])
print(ser_obj['b':'d'])
b    1
c    2
dtype: int32
b    1
c    2
d    3
dtype: int32
# Fancy (non-contiguous) indexing
print(ser_obj[[0, 2, 4]])
print(ser_obj[['a', 'e']])
a    0
c    2
e    4
dtype: int32
a    0
e    4
dtype: int32
# Boolean indexing
ser_bool = ser_obj > 2
print(ser_bool)
print(ser_obj[ser_bool])

print(ser_obj[ser_obj > 2])
a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int32
d    3
e    4
dtype: int32
  • DataFrame indexing
import numpy as np

df_obj = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])
print(df_obj.head())
          a         b         c         d
0 -0.595692  0.813699 -0.551327 -0.059703
1  0.339194 -2.335579  0.230472 -0.680213
2 -0.252306  0.212406 -0.979523  0.408522
3  0.216677  0.574524 -0.819607  2.170009
4 -1.099175 -0.665488  0.391421 -0.400642
# Column indexing
print('Column indexing')
print(df_obj['a']) # returns a Series
print(type(df_obj[[0]])) # returns a DataFrame (positional column selection; only worked in older pandas)

# Fancy indexing
print('Fancy indexing')
print(df_obj[['a','c']])
print(df_obj[[1, 3]])
Column indexing
0   -0.595692
1    0.339194
2   -0.252306
3    0.216677
4   -1.099175
Name: a, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Fancy indexing
          a         c
0 -0.595692 -0.551327
1  0.339194  0.230472
2 -0.252306 -0.979523
3  0.216677 -0.819607
4 -1.099175  0.391421
          b         d
0  0.813699 -0.059703
1 -2.335579 -0.680213
2  0.212406  0.408522
3  0.574524  2.170009
4 -0.665488 -0.400642
  • Three indexing methods
# Label indexing: loc
# Series
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d'])

# DataFrame
print(df_obj['a'])
print(df_obj.loc[0:2, 'a'])
b    1
c    2
d    3
dtype: int32
b    1
c    2
d    3
dtype: int32
0   -0.595692
1    0.339194
2   -0.252306
3    0.216677
4   -1.099175
Name: a, dtype: float64
0   -0.595692
1    0.339194
2   -0.252306
Name: a, dtype: float64
# Integer position indexing: iloc
print(ser_obj[1:3])
print(ser_obj.iloc[1:3])

# DataFrame
print(df_obj.iloc[0:2, 0]) # note the difference from df_obj.loc[0:2, 'a']: iloc slices exclude the end position
b    1
c    2
dtype: int32
b    1
c    2
dtype: int32
0   -0.595692
1    0.339194
Name: a, dtype: float64
# Mixed indexing: ix
print(ser_obj.ix[1:3])
print(ser_obj.ix['b':'c'])

# DataFrame
print(df_obj.ix[0:2, 0]) # tries label indexing first, then falls back to positional indexing
b    1
c    2
dtype: int32
b    1
c    2
dtype: int32
0   -0.595692
1    0.339194
2   -0.252306
Name: a, dtype: float64
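Note: .ix was deprecated in pandas 0.20 and removed in pandas 1.0, so the calls above only run on older versions. The surviving accessors cover both cases (a minimal sketch):

# Explicit replacements for the .ix calls above
print(ser_obj.iloc[1:3])     # positional slice, replaces ser_obj.ix[1:3]
print(ser_obj.loc['b':'c'])  # label slice, replaces ser_obj.ix['b':'c']
print(df_obj.loc[0:2, 'a'])  # the row labels here are integers, so loc applies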
  • Arithmetic and alignment
s1 = pd.Series(range(10, 20), index = range(10))
s2 = pd.Series(range(20, 25), index = range(5))

print('s1: ' )
print(s1)

print('') 

print('s2: ')
print(s2)
s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32

s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int32
# Series alignment in arithmetic
s1 + s2
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64
import numpy as np

df1 = pd.DataFrame(np.ones((2,2)), columns = ['a', 'b'])
df2 = pd.DataFrame(np.ones((3,3)), columns = ['a', 'b', 'c'])

print('df1: ')
print(df1)

print('') 
print('df2: ')
print(df2)
df1: 
     a    b
0  1.0  1.0
1  1.0  1.0

df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0
# DataFrame alignment in arithmetic
df1 + df2
     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN
# Supply fill_value so unaligned positions use it in place of the missing operand (e.g. index 5: 15 + (-1) = 14)
print(s1)
print(s2)

s1.add(s2, fill_value = -1)
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32
0    20
1    21
2    22
3    23
4    24
dtype: int32





0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    14.0
6    15.0
7    16.0
8    17.0
9    18.0
dtype: float64
df1.sub(df2, fill_value = 2.)
     a    b    c
0  0.0  0.0  1.0
1  0.0  0.0  1.0
2  1.0  1.0  1.0
# Filling NaN
s3 = s1 + s2
print(s3)
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64
s3_filled = s3.fillna(-1)
print(s3_filled)
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    -1.0
6    -1.0
7    -1.0
8    -1.0
9    -1.0
dtype: float64
df3 = df1 + df2
print(df3)
     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN
df3.fillna(100, inplace = True)
print(df3)
       a      b      c
0    2.0    2.0  100.0
1    2.0    2.0  100.0
2  100.0  100.0  100.0
  • Applying functions
# NumPy ufuncs apply element-wise
df = pd.DataFrame(np.random.randn(5,4) - 1)
print(df)

print(np.abs(df))
          0         1         2         3
0 -2.193022 -2.090432 -2.288651 -0.026022
1 -0.720957 -1.501025 -1.734828 -1.858286
2  0.300216 -3.391127 -0.872570 -0.686669
3 -2.552131 -1.452268 -1.188845 -0.597845
4  2.111044 -1.203676 -1.143487 -0.542755
          0         1         2         3
0  2.193022  2.090432  2.288651  0.026022
1  0.720957  1.501025  1.734828  1.858286
2  0.300216  3.391127  0.872570  0.686669
3  2.552131  1.452268  1.188845  0.597845
4  2.111044  1.203676  1.143487  0.542755
# Use apply to operate on each column (or row)
#f = lambda x : x.max()
print(df.apply(lambda x : x.max()))
0    2.111044
1   -1.203676
2   -0.872570
3   -0.026022
dtype: float64
# Specify the axis
print(df.apply(lambda x : x.max(), axis=1))
0   -0.026022
1   -0.720957
2    0.300216
3   -0.597845
4    2.111044
dtype: float64
# Use applymap to apply a function to every element
f2 = lambda x : '%.2f' % x
print(df.applymap(f2))
       0      1      2      3
0  -2.19  -2.09  -2.29  -0.03
1  -0.72  -1.50  -1.73  -1.86
2   0.30  -3.39  -0.87  -0.69
3  -2.55  -1.45  -1.19  -0.60
4   2.11  -1.20  -1.14  -0.54
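Note: applymap was deprecated in pandas 2.1 in favor of the element-wise DataFrame.map; on a recent version the equivalent call would be (a sketch):

# Element-wise formatting with the newer API (pandas >= 2.1)
print(df.map(lambda x: '%.2f' % x))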
  • Sorting
s4 = pd.Series(range(10, 15), index = np.random.randint(5, size=5))
print(s4)
4    10
3    11
1    12
4    13
4    14
dtype: int32
# Sort by index
s4.sort_index()
1    12
3    11
4    10
4    13
4    14
dtype: int32
df4 = pd.DataFrame(np.random.randn(3, 4), 
                   index=np.random.randint(3, size=3),
                   columns=np.random.randint(4, size=4))
print(df4)
          3         2         2         1
2  0.244068 -1.977220  0.045238 -2.064546
2  0.218196 -0.419284 -0.698839  0.241649
2  0.296747 -0.021311  0.225724 -0.325439
#df4.sort_index(ascending=False)
df4.sort_index(axis=1)
          1         2         2         3
2 -2.064546 -1.977220  0.045238  0.244068
2  0.241649 -0.419284 -0.698839  0.218196
2 -0.325439 -0.021311  0.225724  0.296747
# Sort by values
df4.sort_values(by=1)
          3         2         2         1
2  0.244068 -1.977220  0.045238 -2.064546
2  0.296747 -0.021311  0.225724 -0.325439
2  0.218196 -0.419284 -0.698839  0.241649
  • Handling missing data
df_data = pd.DataFrame([np.random.randn(3), [1., np.nan, np.nan],
                       [4., np.nan, np.nan], [1., np.nan, 2.]])
df_data.head()
          0         1         2
0  1.619463  0.548047 -1.027003
1  1.000000       NaN       NaN
2  4.000000       NaN       NaN
3  1.000000       NaN  2.000000
# isnull
df_data.isnull()
       0      1      2
0  False  False  False
1  False   True   True
2  False   True   True
3  False   True  False
# dropna
df_data.dropna()
#df_data.dropna(axis=1)
          0         1         2
0  1.619463  0.548047 -1.027003
# fillna
df_data.fillna(-100.)
          0           1           2
0  1.619463    0.548047   -1.027003
1  1.000000 -100.000000 -100.000000
2  4.000000 -100.000000 -100.000000
3  1.000000 -100.000000    2.000000

Pandas Statistics and Description

import numpy as np
import pandas as pd
  • Common statistical computations
df_obj = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])
df_obj
          a         b         c         d
0  0.715594  0.123322 -0.628493 -0.103682
1  0.783228  0.140333 -0.211933 -1.403887
2 -0.713423 -1.483364  0.276417 -0.664303
3  1.580704 -0.053138  0.562683 -0.424985
4  2.046556 -1.600219  0.021936  0.320219
df_obj.sum()
a    4.412658
b   -2.873065
c    0.020610
d   -2.276637
dtype: float64
df_obj.max()
a    2.046556
b    0.140333
c    0.562683
d    0.320219
dtype: float64
df_obj.min(axis=1)
0   -0.628493
1   -1.403887
2   -1.483364
3   -0.424985
4   -1.600219
dtype: float64
  • Statistical description
df_obj.describe()
              a         b         c         d
count  5.000000  5.000000  5.000000  5.000000
mean   0.882532 -0.574613  0.004122 -0.455327
std    1.052045  0.887115  0.456436  0.646042
min   -0.713423 -1.600219 -0.628493 -1.403887
25%    0.715594 -1.483364 -0.211933 -0.664303
50%    0.783228 -0.053138  0.021936 -0.424985
75%    1.580704  0.123322  0.276417 -0.103682
max    2.046556  0.140333  0.562683  0.320219

Pandas Hierarchical Indexing

import pandas as pd
import numpy as np
ser_obj = pd.Series(np.random.randn(12),
                    index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'd'],
                           [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]])
print(ser_obj)
a  0    0.078539
   1    0.643005
   2    1.254099
b  0    0.569994
   1   -1.267482
   2   -0.751972
c  0    2.579259
   1    0.566795
   2   -0.796418
d  0    1.444369
   1   -0.013740
   2   -1.541993
dtype: float64
  • The MultiIndex object
print(type(ser_obj.index))
print(ser_obj.index)
<class 'pandas.indexes.multi.MultiIndex'>
MultiIndex(levels=[['a', 'b', 'c', 'd'], [0, 1, 2]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]])
  • Selecting subsets
# Select on the outer level
print(ser_obj['c'])
0    2.579259
1    0.566795
2   -0.796418
dtype: float64
# Select on the inner level
print(ser_obj[:, 2])
a    1.254099
b   -0.751972
c   -0.796418
d   -1.541993
dtype: float64
  • Swapping levels
print(ser_obj.swaplevel())
0  a    0.078539
1  a    0.643005
2  a    1.254099
0  b    0.569994
1  b   -1.267482
2  b   -0.751972
0  c    2.579259
1  c    0.566795
2  c   -0.796418
0  d    1.444369
1  d   -0.013740
2  d   -1.541993
dtype: float64
  • Swapping and sorting levels
print(ser_obj.swaplevel().sortlevel())
0  a    0.078539
   b    0.569994
   c    2.579259
   d    1.444369
1  a    0.643005
   b   -1.267482
   c    0.566795
   d   -0.013740
2  a    1.254099
   b   -0.751972
   c   -0.796418
   d   -1.541993
dtype: float64
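Note: sortlevel was deprecated in pandas 0.20 and later removed; on current versions the same result comes from sort_index (a minimal sketch):

# Modern equivalent of swaplevel().sortlevel()
print(ser_obj.swaplevel().sort_index())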

Grouping and Aggregation

  • The GroupBy object
import pandas as pd
import numpy as np
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randn(8),
            'data2': np.random.randn(8)}
df_obj = pd.DataFrame(dict_obj)
print(df_obj)
      data1     data2 key1   key2
0 -0.943078  0.820645    a    one
1 -1.429043  0.142617    b    one
2  0.832261  0.843898    a    two
3  0.906262  0.688165    b  three
4  0.541173  0.117232    a    two
5 -0.213385 -0.098734    b    two
6 -1.291468 -1.186638    a    one
7  1.186941  0.809122    a  three
# Group the DataFrame by key1
print(type(df_obj.groupby('key1')))
<class 'pandas.core.groupby.DataFrameGroupBy'>
# Group the data1 column by key1
print(type(df_obj['data1'].groupby(df_obj['key1'])))
<class 'pandas.core.groupby.SeriesGroupBy'>
# Grouped computation
grouped1 = df_obj.groupby('key1')
print(grouped1.mean())

grouped2 = df_obj['data1'].groupby(df_obj['key1'])
print(grouped2.mean())
         data1     data2
key1                    
a     0.065166  0.280852
b    -0.245389  0.244016
key1
a    0.065166
b   -0.245389
Name: data1, dtype: float64
# size
print(grouped1.size())
print(grouped2.size())
key1
a    5
b    3
dtype: int64
key1
a    5
b    3
dtype: int64
# Group by a column name
df_obj.groupby('key1')
<pandas.core.groupby.DataFrameGroupBy object at 0x00000224B6DA5DD8>
# Group by a custom key: a list
self_def_key = [1, 1, 2, 2, 2, 1, 1, 1]
df_obj.groupby(self_def_key).size()
1    5
2    3
dtype: int64
# Group by custom keys: a list of Series (multi-level)
df_obj.groupby([df_obj['key1'], df_obj['key2']]).size()
key1  key2 
a     one      2
      three    1
      two      2
b     one      1
      three    1
      two      1
dtype: int64
# Multi-level grouping by several columns
grouped2 = df_obj.groupby(['key1', 'key2'])
print(grouped2.size())
key1  key2 
a     one      2
      three    1
      two      2
b     one      1
      three    1
      two      1
dtype: int64
# Multi-level grouping follows the order of the keys
grouped3 = df_obj.groupby(['key2', 'key1'])
print(grouped3.mean())
print()
print(grouped3.mean().unstack())
               data1     data2
key2  key1                    
one   a    -1.117273 -0.182997
      b    -1.429043  0.142617
three a     1.186941  0.809122
      b     0.906262  0.688165
two   a     0.686717  0.480565
      b    -0.213385 -0.098734

          data1               data2          
key1          a         b         a         b
key2                                         
one   -1.117273 -1.429043 -0.182997  0.142617
three  1.186941  0.906262  0.809122  0.688165
two    0.686717 -0.213385  0.480565 -0.098734
  • Iterating over a GroupBy object
# Single-level grouping
for group_name, group_data in grouped1:
    print(group_name)
    print(group_data)
a
      data1     data2 key1   key2
0 -0.943078  0.820645    a    one
2  0.832261  0.843898    a    two
4  0.541173  0.117232    a    two
6 -1.291468 -1.186638    a    one
7  1.186941  0.809122    a  three
b
      data1     data2 key1   key2
1 -1.429043  0.142617    b    one
3  0.906262  0.688165    b  three
5 -0.213385 -0.098734    b    two
# Multi-level grouping
for group_name, group_data in grouped2:
    print(group_name)
    print(group_data)
('a', 'one')
      data1     data2 key1 key2
0 -0.943078  0.820645    a  one
6 -1.291468 -1.186638    a  one
('a', 'three')
      data1     data2 key1   key2
7  1.186941  0.809122    a  three
('a', 'two')
      data1     data2 key1 key2
2  0.832261  0.843898    a  two
4  0.541173  0.117232    a  two
('b', 'one')
      data1     data2 key1 key2
1 -1.429043  0.142617    b  one
('b', 'three')
      data1     data2 key1   key2
3  0.906262  0.688165    b  three
('b', 'two')
      data1     data2 key1 key2
5 -0.213385 -0.098734    b  two
# Convert a GroupBy object to a list
list(grouped1)
[('a',       data1     data2 key1   key2
  0 -0.943078  0.820645    a    one
  2  0.832261  0.843898    a    two
  4  0.541173  0.117232    a    two
  6 -1.291468 -1.186638    a    one
  7  1.186941  0.809122    a  three), ('b',       data1     data2 key1   key2
  1 -1.429043  0.142617    b    one
  3  0.906262  0.688165    b  three
  5 -0.213385 -0.098734    b    two)]
# Convert a GroupBy object to a dict
dict(list(grouped1))
{'a':       data1     data2 key1   key2
 0 -0.943078  0.820645    a    one
 2  0.832261  0.843898    a    two
 4  0.541173  0.117232    a    two
 6 -1.291468 -1.186638    a    one
 7  1.186941  0.809122    a  three, 'b':       data1     data2 key1   key2
 1 -1.429043  0.142617    b    one
 3  0.906262  0.688165    b  three
 5 -0.213385 -0.098734    b    two}
# Grouping along the column axis
print(df_obj.dtypes)

# Group columns by data type
df_obj.groupby(df_obj.dtypes, axis=1).size()
df_obj.groupby(df_obj.dtypes, axis=1).sum()
data1    float64
data2    float64
key1      object
key2      object
dtype: object
    float64  object
0 -0.122433    aone
1 -1.286426    bone
2  1.676158    atwo
3  1.594427  bthree
4  0.658404    atwo
5 -0.312119    btwo
6 -2.478106    aone
7  1.996064  athree
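Note: grouping along columns with axis=1 is deprecated in pandas 2.x. The numeric half of the result above can be reproduced on current versions with select_dtypes (a sketch; it ignores the object columns):

# Sum of the numeric columns per row (matches the float64 column above)
print(df_obj.select_dtypes('number').sum(axis=1))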
  • Other grouping methods
df_obj2 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['A', 'B', 'C', 'D', 'E'])
df_obj2.iloc[1, 1:4] = np.nan  # positional assignment; the original used the now-removed .ix
df_obj2
   a    b    c    d  e
A  1  1.0  1.0  6.0  5
B  2  NaN  NaN  NaN  6
C  5  5.0  7.0  5.0  7
D  2  8.0  5.0  6.0  2
E  5  1.0  4.0  4.0  4
# Group via a dict mapping
mapping_dict = {'a':'python', 'b':'python', 'c':'java', 'd':'C', 'e':'java'}
df_obj2.groupby(mapping_dict, axis=1).size()
df_obj2.groupby(mapping_dict, axis=1).count() # number of non-NaN values
df_obj2.groupby(mapping_dict, axis=1).sum()
     C  java  python
A  6.0   6.0     2.0
B  NaN   6.0     2.0
C  5.0  14.0    10.0
D  6.0   7.0    10.0
E  4.0   8.0     6.0
# Group via a function
df_obj3 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['AA', 'BBB', 'CC', 'D', 'EE'])
#df_obj3

def group_key(idx):
    """
        idx is a row or column index label
    """
    #return idx
    return len(idx)

df_obj3.groupby(group_key).size()

# The custom function above is equivalent to
#df_obj3.groupby(len).size()
1    1
2    3
3    1
dtype: int64
# Group by an index level
columns = pd.MultiIndex.from_arrays([['Python', 'Java', 'Python', 'Java', 'Python'],
                                     ['A', 'A', 'B', 'C', 'B']], names=['language', 'index'])
df_obj4 = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
df_obj4
language Python Java Python Java Python
index         A    A      B    C      B
0             1    6      4    7      2
1             9    7      2    2      4
2             3    9      9    7      5
3             1    6      1    6      6
4             5    1      7    3      6
# Group by the 'language' level, then by the 'index' level
df_obj4.groupby(level='language', axis=1).sum()
df_obj4.groupby(level='index', axis=1).sum()  # only this last result is displayed
index   A   B  C
0       7   6  7
1      16   6  2
2      12  14  7
3       7   7  6
4       6  13  3
  • Aggregation
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randint(1,10, 8),
            'data2': np.random.randint(1,10, 8)}
df_obj5 = pd.DataFrame(dict_obj)
print(df_obj5)
   data1  data2 key1   key2
0      4      2    a    one
1      7      1    b    one
2      2      8    a    two
3      9      4    b  three
4      3      2    a    two
5      8      5    b    two
6      6      8    a    one
7      9      3    a  three
# Built-in aggregation functions
print(df_obj5.groupby('key1').sum())
print(df_obj5.groupby('key1').max())
print(df_obj5.groupby('key1').min())
print(df_obj5.groupby('key1').mean())
print(df_obj5.groupby('key1').size())
print(df_obj5.groupby('key1').count())
print(df_obj5.groupby('key1').describe())
      data1  data2
key1              
a        24     23
b        24     10
      data1  data2 key2
key1                   
a         9      8  two
b         9      5  two
      data1  data2 key2
key1                   
a         2      2  one
b         7      1  one
      data1     data2
key1                 
a       4.8  4.600000
b       8.0  3.333333
key1
a    5
b    3
dtype: int64
      data1  data2  key2
key1                    
a         5      5     5
b         3      3     3
               data1     data2
key1                          
a    count  5.000000  5.000000
     mean   4.800000  4.600000
     std    2.774887  3.130495
     min    2.000000  2.000000
     25%    3.000000  2.000000
     50%    4.000000  3.000000
     75%    6.000000  8.000000
     max    9.000000  8.000000
b    count  3.000000  3.000000
     mean   8.000000  3.333333
     std    1.000000  2.081666
     min    7.000000  1.000000
     25%    7.500000  2.500000
     50%    8.000000  4.000000
     75%    8.500000  4.500000
     max    9.000000  5.000000
# Custom aggregation function
def peak_range(df):
    """
        Return the value range (max - min)
    """
    #print(type(df)) # the argument is the group of records for each key
    return df.max() - df.min()

print(df_obj5.groupby('key1').agg(peak_range))
print(df_obj.groupby('key1').agg(lambda df : df.max() - df.min()))
      data1  data2
key1              
a         7      6
b         2      4
         data1     data2
key1                    
a     2.478410  2.030536
b     2.335305  0.786899
# Apply several aggregation functions at once
print(df_obj.groupby('key1').agg(['mean', 'std', 'count', peak_range])) # column names default to the function names
         data1                                data2                           
          mean       std count peak_range      mean       std count peak_range
key1                                                                          
a     0.065166  1.110226     5   2.478410  0.280852  0.875752     5   2.030536
b    -0.245389  1.167982     3   2.335305  0.244016  0.403130     3   0.786899
print(df_obj.groupby('key1').agg(['mean', 'std', 'count', ('range', peak_range)])) # a (name, function) tuple supplies a custom column name
         data1                               data2                          
          mean       std count     range      mean       std count     range
key1                                                                        
a     0.065166  1.110226     5  2.478410  0.280852  0.875752     5  2.030536
b    -0.245389  1.167982     3  2.335305  0.244016  0.403130     3  0.786899
# Apply a different aggregation function to each column
dict_mapping = {'data1':'mean',
                'data2':'sum'}
print(df_obj.groupby('key1').agg(dict_mapping))
         data2     data1
key1                    
a     1.404259  0.065166
b     0.732047 -0.245389
dict_mapping = {'data1':['mean','max'],
                'data2':'sum'}
print(df_obj.groupby('key1').agg(dict_mapping))
         data2     data1          
           sum      mean       max
key1                              
a     1.404259  0.065166  1.186941
b     0.732047 -0.245389  0.906262
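Since pandas 0.25, named aggregation gives the same per-column control while also setting readable output column names (a sketch; the names data1_mean / data2_sum are my own choice):

# Named aggregation: output column = (input column, function)
print(df_obj.groupby('key1').agg(data1_mean=('data1', 'mean'),
                                 data2_sum=('data2', 'sum')))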

Grouped Data Operations

import pandas as pd
import numpy as np
# Keep the original shape after a grouped computation
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randint(1, 10, 8),
            'data2': np.random.randint(1, 10, 8)}
df_obj = pd.DataFrame(dict_obj)
df_obj
   data1  data2 key1   key2
0      4      9    a    one
1      8      4    b    one
2      7      5    a    two
3      8      5    b  three
4      5      2    a    two
5      7      8    b    two
6      5      3    a    one
7      4      1    a  three
# Group by key1, compute statistics of data1/data2, then attach them to the original table
k1_sum = df_obj.groupby('key1').sum().add_prefix('sum_')
k1_sum
      sum_data1  sum_data2
key1
a            25         20
b            23         17
# Method 1: merge
pd.merge(df_obj, k1_sum, left_on='key1', right_index=True)
   data1  data2 key1   key2  sum_data1  sum_data2
0      4      9    a    one         25         20
2      7      5    a    two         25         20
4      5      2    a    two         25         20
6      5      3    a    one         25         20
7      4      1    a  three         25         20
1      8      4    b    one         23         17
3      8      5    b  three         23         17
5      7      8    b    two         23         17
  • The transform method
# Method 2: transform
k1_sum_tf = df_obj.groupby('key1').transform(np.sum).add_prefix('sum_')
df_obj[k1_sum_tf.columns] = k1_sum_tf
df_obj
   data1  data2 key1   key2  sum_data1  sum_data2 sum_key2
0      4      9    a    one          4          9      one
1      8      4    b    one          8          4      one
2      7      5    a    two          7          5      two
3      8      5    b  three          8          5    three
4      5      2    a    two          5          2      two
5      7      8    b    two          7          8      two
6      5      3    a    one          5          3      one
7      4      1    a  three          4          1    three
# Pass a custom function to transform
def diff_mean(s):
    """
        Return each value's deviation from the group mean
    """
    return s - s.mean()

df_obj.groupby('key1').transform(diff_mean)
      data1     data2  sum_data1  sum_data2
0 -1.000000  5.000000  -1.000000   5.000000
1  0.333333 -1.666667   0.333333  -1.666667
2  2.000000  1.000000   2.000000   1.000000
3  0.333333 -0.666667   0.333333  -0.666667
4  0.000000 -2.000000   0.000000  -2.000000
5 -0.666667  2.333333  -0.666667   2.333333
6  0.000000 -1.000000   0.000000  -1.000000
7 -1.000000 -3.000000  -1.000000  -3.000000
dataset_path = './starcraft.csv'
df_data = pd.read_csv(dataset_path, usecols=['LeagueIndex', 'Age', 'HoursPerWeek', 
                                             'TotalHours', 'APM'])
  • apply
def top_n(df, n=3, column='APM'):
    """
        Return the top n rows of each group, sorted by column
    """
    return df.sort_values(by=column, ascending=False)[:n]

df_data.groupby('LeagueIndex').apply(top_n)
                  LeagueIndex   Age  HoursPerWeek  TotalHours       APM
LeagueIndex
1           2214            1  20.0          12.0       730.0  172.9530
            2246            1  27.0           8.0       250.0  141.6282
            1753            1  20.0          28.0       100.0  139.6362
2           3062            2  20.0           6.0       100.0  179.6250
            3229            2  16.0          24.0       110.0  156.7380
            1520            2  29.0           6.0       250.0  151.6470
3           1557            3  22.0           6.0       200.0  226.6554
            484             3  19.0          42.0       450.0  220.0692
            2883            3  16.0           8.0       800.0  208.9500
4           2688            4  26.0          24.0       990.0  249.0210
            1759            4  16.0           6.0        75.0  229.9122
            2637            4  23.0          24.0       650.0  227.2272
5           3277            5  18.0          16.0       950.0  372.6426
            93              5  17.0          36.0       720.0  335.4990
            202             5  37.0          14.0       800.0  327.7218
6           734             6  16.0          28.0       730.0  389.8314
            2746            6  16.0          28.0      4000.0  350.4114
            1810            6  21.0          14.0       730.0  323.2506
7           3127            7  23.0          42.0      2000.0  298.7952
            104             7  21.0          24.0      1000.0  286.4538
            1654            7  18.0          98.0       700.0  236.0316
8           3393            8   NaN           NaN         NaN  375.8664
            3373            8   NaN           NaN         NaN  364.8504
            3372            8   NaN           NaN         NaN  355.3518
# Extra arguments to apply are passed through to the custom function
df_data.groupby('LeagueIndex').apply(top_n, n=2, column='Age')
                  LeagueIndex   Age  HoursPerWeek  TotalHours       APM
LeagueIndex
1           3146            1  40.0          12.0       150.0   38.5590
            3040            1  39.0          10.0       500.0   29.8764
2           2920            2  43.0          10.0       730.0   86.0586
            2437            2  41.0           4.0       200.0   54.2166
3           1258            3  41.0          14.0       800.0   77.6472
            2972            3  40.0          10.0       500.0   60.5970
4           1696            4  44.0           6.0       500.0   89.5266
            1729            4  39.0           8.0       500.0   86.7246
5           202             5  37.0          14.0       800.0  327.7218
            2745            5  37.0          18.0      1000.0  123.4098
6           3069            6  31.0           8.0       800.0  133.1790
            2706            6  31.0           8.0       700.0   66.9918
7           2813            7  26.0          36.0      1300.0  188.5512
            1992            7  26.0          24.0      1000.0  219.6690
8           3340            8   NaN           NaN         NaN  189.7404
            3341            8   NaN           NaN         NaN  287.8128
  • Omitting group keys from the result: group_keys=False
df_data.groupby('LeagueIndex', group_keys=False).apply(top_n)
      LeagueIndex   Age  HoursPerWeek  TotalHours       APM
2214            1  20.0          12.0       730.0  172.9530
2246            1  27.0           8.0       250.0  141.6282
1753            1  20.0          28.0       100.0  139.6362
3062            2  20.0           6.0       100.0  179.6250
3229            2  16.0          24.0       110.0  156.7380
1520            2  29.0           6.0       250.0  151.6470
1557            3  22.0           6.0       200.0  226.6554
484             3  19.0          42.0       450.0  220.0692
2883            3  16.0           8.0       800.0  208.9500
2688            4  26.0          24.0       990.0  249.0210
1759            4  16.0           6.0        75.0  229.9122
2637            4  23.0          24.0       650.0  227.2272
3277            5  18.0          16.0       950.0  372.6426
93              5  17.0          36.0       720.0  335.4990
202             5  37.0          14.0       800.0  327.7218
734             6  16.0          28.0       730.0  389.8314
2746            6  16.0          28.0      4000.0  350.4114
1810            6  21.0          14.0       730.0  323.2506
3127            7  23.0          42.0      2000.0  298.7952
104             7  21.0          24.0      1000.0  286.4538
1654            7  18.0          98.0       700.0  236.0316
3393            8   NaN           NaN         NaN  375.8664
3373            8   NaN           NaN         NaN  364.8504
3372            8   NaN           NaN         NaN  355.3518

Joining Data: merge

import pandas as pd
import numpy as np
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2' : np.random.randint(0,10,3)})

print(df_obj1)
print(df_obj2)
   data1 key
0      5   b
1      9   b
2      1   a
3      0   c
4      3   a
5      9   a
6      0   b
   data2 key
0      9   a
1      3   b
2      8   d
# By default, columns with overlapping names are used as the join key
pd.merge(df_obj1, df_obj2)
   data1 key  data2
0      5   b      3
1      9   b      3
2      0   b      3
3      1   a      9
4      3   a      9
5      9   a      9
# on explicitly specifies the join key
pd.merge(df_obj1, df_obj2, on='key')
   data1 key  data2
0      5   b      3
1      9   b      3
2      0   b      3
3      1   a      9
4      3   a      9
5      9   a      9
# left_on / right_on specify the join keys of the left and right tables respectively

# Rename the columns
df_obj1 = df_obj1.rename(columns={'key':'key1'})
df_obj2 = df_obj2.rename(columns={'key':'key2'})
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')
   data1 key1  data2 key2
0      5    b      3    b
1      9    b      3    b
2      0    b      3    b
3      1    a      9    a
4      3    a      9    a
5      9    a      9    a
# Outer join
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')
   data1 key1  data2 key2
0    5.0    b    3.0    b
1    9.0    b    3.0    b
2    0.0    b    3.0    b
3    1.0    a    9.0    a
4    3.0    a    9.0    a
5    9.0    a    9.0    a
6    0.0    c    NaN  NaN
7    NaN  NaN    8.0    d
# Left join
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')
   data1 key1  data2 key2
0      5    b    3.0    b
1      9    b    3.0    b
2      1    a    9.0    a
3      0    c    NaN  NaN
4      3    a    9.0    a
5      9    a    9.0    a
6      0    b    3.0    b
# Right join
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')
   data1 key1  data2 key2
0    5.0    b      3    b
1    9.0    b      3    b
2    0.0    b      3    b
3    1.0    a      9    a
4    3.0    a      9    a
5    9.0    a      9    a
6    NaN  NaN      8    d
# Handle duplicate column names with suffixes
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data' : np.random.randint(0,10,3)})

pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right'))
   data_left key  data_right
0          4   b           5
1          2   b           5
2          5   b           5
3          9   a           7
4          6   a           7
5          6   a           7
# Join on the index
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)
   data1 key  data2
0      6   b      5
1      7   b      5
6      9   b      5
2      2   a      3
4      4   a      3
5      4   a      3
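For index joins like the last one, DataFrame.join is a convenient shorthand: it matches the calling frame's column (or index) against the other frame's index and defaults to a left join (a sketch):

# Roughly equivalent to the merge above, but keeps unmatched left rows (left join)
print(df_obj1.join(df_obj2, on='key'))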

Concatenating Data: concat

import numpy as np
import pandas as pd
  • NumPy concatenation
arr1 = np.random.randint(0, 10, (3, 4))
arr2 = np.random.randint(0, 10, (3, 4))

print(arr1)
print(arr2)
[[8 5 2 4]
 [2 9 9 0]
 [6 2 1 3]]
[[0 4 5 3]
 [6 4 1 3]
 [1 2 9 9]]
np.concatenate([arr1, arr2])
array([[8, 5, 2, 4],
       [2, 9, 9, 0],
       [6, 2, 1, 3],
       [0, 4, 5, 3],
       [6, 4, 1, 3],
       [1, 2, 9, 9]])
np.concatenate([arr1, arr2], axis=1)
array([[8, 5, 2, 4, 0, 4, 5, 3],
       [2, 9, 9, 0, 6, 4, 1, 3],
       [6, 2, 1, 3, 1, 2, 9, 9]])
  • concat on Series
# Case: no overlapping index values
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(0,5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(5,9))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(9,12))

print(ser_obj1)
print(ser_obj2)
print(ser_obj3)
0    9
1    9
2    5
3    6
4    4
dtype: int32
5    1
6    9
7    2
8    5
dtype: int32
9     4
10    7
11    4
dtype: int32
pd.concat([ser_obj1, ser_obj2, ser_obj3])
0     9
1     9
2     5
3     6
4     4
5     1
6     9
7     2
8     5
9     4
10    7
11    4
dtype: int32
pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1)
      0    1    2
0   9.0  NaN  NaN
1   9.0  NaN  NaN
2   5.0  NaN  NaN
3   6.0  NaN  NaN
4   4.0  NaN  NaN
5   NaN  1.0  NaN
6   NaN  9.0  NaN
7   NaN  2.0  NaN
8   NaN  5.0  NaN
9   NaN  NaN  4.0
10  NaN  NaN  7.0
11  NaN  NaN  4.0
# Case: overlapping index values
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(4))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(3))

print(ser_obj1)
print(ser_obj2)
print(ser_obj3)
0    7
1    3
2    9
3    1
4    7
dtype: int32
0    6
1    1
2    4
3    7
dtype: int32
0    7
1    2
2    3
dtype: int32
pd.concat([ser_obj1, ser_obj2, ser_obj3])
0    7
1    3
2    9
3    1
4    7
0    6
1    1
2    4
3    7
0    7
1    2
2    3
dtype: int32
pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1, join='inner')
   0  1  2
0  7  6  7
1  3  1  2
2  9  4  3
  • concat on DataFrame
df_obj1 = pd.DataFrame(np.random.randint(0, 10, (3, 2)), index=['a', 'b', 'c'],
                       columns=['A', 'B'])
df_obj2 = pd.DataFrame(np.random.randint(0, 10, (2, 2)), index=['a', 'b'],
                       columns=['C', 'D'])
print(df_obj1)
print(df_obj2)
   A  B
a  1  6
b  1  0
c  1  6
   C  D
a  2  1
b  7  4
pd.concat([df_obj1, df_obj2])
     A    B    C    D
a  1.0  6.0  NaN  NaN
b  1.0  0.0  NaN  NaN
c  1.0  6.0  NaN  NaN
a  NaN  NaN  2.0  1.0
b  NaN  NaN  7.0  4.0
pd.concat([df_obj1, df_obj2], axis=1)
   A  B    C    D
a  1  6  2.0  1.0
b  1  0  7.0  4.0
c  1  6  NaN  NaN

Reshaping Data

import numpy as np
import pandas as pd
  • stack
df_obj = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2'])
df_obj
   data1  data2
0      0      8
1      9      5
2      1      1
3      8      5
4      9      4
stacked = df_obj.stack()
print(stacked)
0  data1    0
   data2    8
1  data1    9
   data2    5
2  data1    1
   data2    1
3  data1    8
   data2    5
4  data1    9
   data2    4
dtype: int32
print(type(stacked))
print(type(stacked.index))
<class 'pandas.core.series.Series'>
<class 'pandas.indexes.multi.MultiIndex'>
# By default, unstack operates on the innermost level
stacked.unstack()
   data1  data2
0      0      8
1      9      5
2      1      1
3      8      5
4      9      4
# Use level to choose which index level to unstack
stacked.unstack(level=0)
       0  1  2  3  4
data1  0  9  1  8  9
data2  8  5  1  5  4

Data Transformation

import numpy as np
import pandas as pd
  • Duplicate data
df_obj = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4,
                       'data2' : np.random.randint(0, 4, 8)})
df_obj
  data1  data2
0     a      1
1     a      1
2     a      0
3     a      0
4     b      2
5     b      3
6     b      3
7     b      1
df_obj.duplicated()
0    False
1     True
2    False
3     True
4    False
5    False
6     True
7    False
dtype: bool
df_obj.drop_duplicates()
  data1  data2
0     a      1
2     a      0
4     b      2
5     b      3
7     b      1
df_obj.drop_duplicates('data2')
  data1  data2
0     a      1
2     a      0
4     b      2
5     b      3
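drop_duplicates keeps the first occurrence by default; the keep parameter selects a different survivor (a small sketch):

# Keep the last occurrence of each data2 value instead of the first
df_obj.drop_duplicates('data2', keep='last')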
  • The map function
ser_obj = pd.Series(np.random.randint(0,10,10))
ser_obj
0    3
1    3
2    0
3    6
4    2
5    0
6    3
7    1
8    7
9    0
dtype: int32
ser_obj.map(lambda x : x ** 2)
0     9
1     9
2     0
3    36
4     4
5     0
6     9
7     1
8    49
9     0
dtype: int64
  • Replacing values: replace
# Replace a single value
ser_obj.replace(0, -100)
0      3
1      3
2   -100
3      6
4      2
5   -100
6      3
7      1
8      7
9   -100
dtype: int32
# Replace several values with one value
ser_obj.replace([0, 2], -100)
0      3
1      3
2   -100
3      6
4   -100
5   -100
6      3
7      1
8      7
9   -100
dtype: int32
# Replace several values with corresponding new values
ser_obj.replace([0, 2], [-100, -200])
0      3
1      3
2   -100
3      6
4   -200
5   -100
6      3
7      1
8      7
9   -100
dtype: int32
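replace also accepts a dict mapping each old value to its replacement, equivalent to the two-list form above:

# Dict form of the same replacement
ser_obj.replace({0: -100, 2: -200})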

Clustering: K-Means

K-Means reference: https://blog.csdn.net/qq_41251963/article/details/81603169

K-Means alternates between two steps until convergence: assign every sample to its nearest cluster centroid, then recompute each centroid as the mean of the samples assigned to it. Iteration stops once the largest centroid shift falls below a cutoff. An implementation of the k-means algorithm:
kmeans_tools.py

# -*- coding: utf-8 -*-


import math
import random


class Cluster(object):
    """
        A cluster of sample points
    """

    def __init__(self, samples):
        if len(samples) == 0:
            # the cluster contains no sample points
            raise Exception("Error: an empty cluster!")

        # sample points belonging to this cluster
        self.samples = samples

        # dimensionality of the sample points in this cluster
        self.n_dim = samples[0].n_dim

        # verify that all sample points in the cluster have the same dimensionality
        for sample in samples:
            if sample.n_dim != self.n_dim:
                raise Exception("Error: sample points in the cluster have inconsistent dimensions!")

        # initialize the cluster centroid
        self.centroid = self.cal_centroid()

    def __repr__(self):
        """
            Readable representation of the object
        """
        return str(self.samples)

    def update(self, samples):
        """
            Replace the samples, recompute the centroid, and return how far the centroid moved
        """

        old_centroid = self.centroid
        self.samples = samples
        self.centroid = self.cal_centroid()
        shift = get_distance(old_centroid, self.centroid)
        return shift

    def cal_centroid(self):
        """
           Compute the centroid of a set of sample points
        """
        n_samples = len(self.samples)
        # collect the coordinates (features) of all sample points
        coords = [sample.coords for sample in self.samples]
        unzipped = zip(*coords)
        # mean of each dimension
        centroid_coords = [math.fsum(d_list)/n_samples for d_list in unzipped]

        return Sample(centroid_coords)


class Sample(object):
    """
        A sample point
    """
    def __init__(self, coords):
        self.coords = coords    # coordinates of the sample point
        self.n_dim = len(coords)    # dimensionality of the sample point

    def __repr__(self):
        """
            Readable representation of the object
        """
        return str(self.coords)


def get_distance(a, b):
    """
        Return the Euclidean distance between sample points a and b
        Reference: https://en.wikipedia.org/wiki/Euclidean_distance#n_dimensions
    """
    if a.n_dim != b.n_dim:
        # the sample points have different dimensionality
        raise Exception("Error: sample points differ in dimensionality; cannot compute a distance!")

    acc_diff = 0.0
    for i in range(a.n_dim):
        square_diff = pow((a.coords[i]-b.coords[i]), 2)
        acc_diff += square_diff
    distance = math.sqrt(acc_diff)

    return distance


def gen_random_sample(n_dim, lower, upper):
    """
        Generate a random sample point with n_dim features in [lower, upper]
    """
    sample = Sample([random.uniform(lower, upper) for _ in range(n_dim)])
    return sample

main.py

# -*- coding: utf-8 -*-

import random
from kmeans_tools import Cluster, get_distance, gen_random_sample
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors


def kmeans(samples, k, cutoff):
    """
        Run k-means on samples with k clusters; stop once no centroid moves farther than cutoff
    """

    # randomly pick k sample points as the initial cluster centers
    init_samples = random.sample(samples, k)

    # create k clusters, each centered at one of the random initial samples
    clusters = [Cluster([sample]) for sample in init_samples]

    # iterate until the cluster assignment stabilizes
    n_loop = 0
    while True:
        # one empty list per cluster, to collect the samples assigned to it
        lists = [[] for _ in clusters]

        # start this iteration
        n_loop += 1
        # visit every sample in the dataset
        for sample in samples:
            # distance from the sample to the first cluster's centroid
            smallest_distance = get_distance(sample, clusters[0].centroid)
            # initially assign the sample to cluster 0
            cluster_index = 0

            # distances to the remaining cluster centroids
            for i in range(k - 1):
                # distance from the sample to this centroid
                distance = get_distance(sample, clusters[i+1].centroid)
                # keep the smallest distance seen so far
                if distance < smallest_distance:
                    smallest_distance = distance
                    cluster_index = i + 1

            # assign the sample to its nearest cluster
            lists[cluster_index].append(sample)

        # largest centroid shift in this iteration
        biggest_shift = 0.0

        # update each cluster and measure how far its centroid moved
        for i in range(k):
            shift = clusters[i].update(lists[i])
            # track the maximum shift
            biggest_shift = max(biggest_shift, shift)

        # if the largest shift is below the convergence threshold, the clustering is stable
        if biggest_shift < cutoff:
            print("Clustering stabilized after {} iterations.".format(n_loop))
            break
    # return the clustering result
    return clusters


def run_main():
    """
        Main entry point
    """
    # number of samples
    n_samples = 1000

    # number of features (dimensionality)
    n_feat = 2

    # value range of the features
    lower = 0
    upper = 200

    # number of clusters
    n_cluster = 5

    # generate random samples
    samples = [gen_random_sample(n_feat, lower, upper) for _ in range(n_samples)]

    # convergence threshold
    cutoff = 0.2

    clusters = kmeans(samples, n_cluster, cutoff)

    # print the result
    for i, c in enumerate(clusters):
        for sample in c.samples:
            print('Cluster {}, sample {}'.format(i, sample))

    # visualize the result
    plt.subplot()
    color_names = list(mcolors.cnames)
    for i, c in enumerate(clusters):
        x = []
        y = []
        color = [color_names[i]] * len(c.samples)
        for sample in c.samples:
            x.append(sample.coords[0])
            y.append(sample.coords[1])
        plt.scatter(x, y, c=color)
    plt.show()

if __name__ == '__main__':
    run_main()

[Figure: scatter plot of the five clusters found by the K-Means run]
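For comparison, scikit-learn's KMeans produces the same kind of clustering in a few lines. A minimal sketch, assuming scikit-learn is installed (it is not used in the original notes):

# -*- coding: utf-8 -*-
import numpy as np
from sklearn.cluster import KMeans

# 1000 random 2-D samples in [0, 200], mirroring run_main() above
X = np.random.uniform(0, 200, size=(1000, 2))
km = KMeans(n_clusters=5, n_init=10, tol=0.2).fit(X)
print(km.cluster_centers_)  # final centroids
print(km.labels_[:10])      # cluster index of the first ten samples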

Hands-On: Global Food Data Analysis

Goal: analyze the number of food-additive types in each country's foods.

https://www.kaggle.com/openfoodfacts/world-food-facts

# -*- coding: utf-8 -*-

import zipfile
import os
import pandas as pd
import matplotlib.pyplot as plt


def unzip(zip_filepath, dest_path):
    """
        Extract a zip archive
    """
    with zipfile.ZipFile(zip_filepath) as zf:
        zf.extractall(path=dest_path)


def get_dataset_filename(zip_filepath):
    """
        Get the dataset file name (inside the zip)
    """
    with zipfile.ZipFile(zip_filepath) as zf:
        return zf.namelist()[0]


def run_main():
    """
        Main entry point
    """
    # declare variables
    dataset_path = './data'  # dataset directory
    zip_filename = 'open-food-facts.zip'  # zip file name
    zip_filepath = os.path.join(dataset_path, zip_filename)  # zip file path
    dataset_filename = get_dataset_filename(zip_filepath)  # dataset file name (inside the zip)
    dataset_filepath = os.path.join(dataset_path, dataset_filename)  # dataset file path

    print('Extracting zip...', end='')
    unzip(zip_filepath, dataset_path)
    print('done.')

    # load the data
    data = pd.read_csv(dataset_filepath, usecols=['countries_en', 'additives_n'])

    # analyze the number of food-additive types in each country's foods
    # 1. Data cleaning
    # drop missing data
    data = data.dropna()    # or data.dropna(inplace=True)

    # convert country names to lowercase
    # Exercise: on inspection, the values in 'countries_en' are not always single country names;
    # some are several names separated by commas, e.g. Albania,Belgium,France,Germany,Italy,Netherlands,Spain.
    # A correct count should split such values into separate row records before grouping
    # (see the sketch after the result figure below).
    data['countries_en'] = data['countries_en'].str.lower()

    # 2. Grouped statistics
    country_additives = data['additives_n'].groupby(data['countries_en']).mean()

    # 3. Sort by value, descending
    result = country_additives.sort_values(ascending=False)

    # 4. Plot the top 10 with pandas
    result.iloc[:10].plot.bar()
    plt.show()

    # 5. Save the result
    result.to_csv('./country_additives.csv')

    # remove the extracted data to free disk space
    if os.path.exists(dataset_filepath):
        os.remove(dataset_filepath)

if __name__ == '__main__':
    run_main()

[Figure: bar chart of the ten countries with the highest average additives_n]
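A sketch for the exercise noted in the script: split comma-separated country lists into one row per country before grouping. explode requires pandas 0.25+, and this is untested against the full dataset, so treat it as an outline:

# Split 'albania,belgium,...' into a list, then expand to one row per country
split_data = data.copy()
split_data['countries_en'] = split_data['countries_en'].str.split(',')
split_data = split_data.explode('countries_en')
country_additives = split_data.groupby('countries_en')['additives_n'].mean()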
