Pandas从入门到入墓

参考文章:https://jizhi.im/blog/post/10min2pandas01
https://jizhi.im/blog/post/10min2pandas02

导入pandas和numpy库

import pandas as pd
import numpy as np

一、创建对象

1、创建Series对象

创建Series对象,需要传递一个List参数。

# Series,可以指定index,同样是传递List参数
s = pd.Series([1,3,5,np.nan,6,8],index=[i+1 for i in range(len([1,3,5,np.nan,6,8]))])
print(s)

#输出
'''
1    1.0
2    3.0
3    5.0
4    NaN
5    6.0
6    8.0
dtype: float64
'''

2、创建Dataframe对象

# 首先创建一个时间序列
dates = pd.date_range('20130101', periods=6,freq='d')
'''
pd.date_range 常用参数:
:param start: 序列开始时间
:param end: 序列结束时间, 默认情况下生成的序列同时包含start和end
:param periods: int, 生成的时间序列长度
:param freq: 要生成时间序列的时间间隔,可以是天(d)、多天(2d)、月(M)、多月(3M)、年(Y)、多年(2Y)
					  以及时(h)分(min)秒(s)等
注意: pd.date_range 本身没有 out_format / input_format 参数:
- 若要输出格式化后的字符串, 可对结果调用 .strftime("%Y-%m-%d %H:%M:%S")
- 若start或end是字符串且无法自动推断时间格式, 可先用 pd.to_datetime(..., format=...) 按指定格式解析
'''
print(dates)

# 输出
'''
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
'''


# 再创建一个6行4列的随机数组
data = np.random.randn(6,4)
print(data)

# 输出
'''
[[ 0.52615987 -0.10511352  0.93578156 -1.0171928 ]
 [ 0.14465037  0.92757347  0.31852467 -0.82320396]
 [-0.63506826  0.62381754 -0.61869088 -0.23659419]
 [ 0.53514965  0.17191194 -1.25918251 -2.80756598]
 [-0.97158469  0.51641578  0.71967892  0.48177381]
 [-0.82879759  1.21244248  0.21248849 -0.62200675]]
'''


# 最后创建DataFrame,index指定索引,columns通过传递List指定列名
df = pd.DataFrame(data,index=dates,columns=['A','B','C','D'])
print(df)

#输出
'''
                   A         B         C         D
2013-01-01  0.526160 -0.105114  0.935782 -1.017193
2013-01-02  0.144650  0.927573  0.318525 -0.823204
2013-01-03 -0.635068  0.623818 -0.618691 -0.236594
2013-01-04  0.535150  0.171912 -1.259183 -2.807566
2013-01-05 -0.971585  0.516416  0.719679  0.481774
2013-01-06 -0.828798  1.212442  0.212488 -0.622007
'''


# 也可以通过传递字典构建DataFrame
dict = { 'A' : 1., 'B' : pd.Timestamp('20130102'), 'C' : pd.Series(1,index=list(range(4)),dtype='float32'), 
'D' : np.array([3] * 4,dtype='int32'), 'E' : pd.Categorical(["test","train","test","train"]), 'F' : 'foo' }
print(dict)

# 输出
{'A': 1.0, 'B': Timestamp('2013-01-02 00:00:00'), 'C': 0    1.0
1    1.0
2    1.0
3    1.0
dtype: float32, 'D': array([3, 3, 3, 3]), 'E': [test, train, test, train]
Categories (2, object): [test, train], 'F': 'foo'}

df2 = pd.DataFrame(dict)
print(df2)
print('=' * 60)
print(df2.dtypes)

# 输出
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
============================================================
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

二、观察数据

# 默认情况下,.head()和.tail()输出首尾的前5行,也可以手动指定输出行数。
df.head()
df.tail(n=5)

# 索引,如果不添加.values,返回的数据类型是index,否则返回的数据类型是数组
print(df.index)
print(type(df.index))
# 输出
'''
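DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>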
'''

print(df.index.values)
print(type(df.index.values))
# 输出
'''
['2013-01-01T00:00:00.000000000' '2013-01-02T00:00:00.000000000'
 '2013-01-03T00:00:00.000000000' '2013-01-04T00:00:00.000000000'
 '2013-01-05T00:00:00.000000000' '2013-01-06T00:00:00.000000000']
<class 'numpy.ndarray'>
'''

# 列标签
print(df.columns)
print(df.columns.values)

# 数值,返回数组
df.values

# 描述统计
# 以下以包含多种数据类型的df2为例
df2.describe()						# 默认只统计数值类型的特征

df2.describe(include='all')			# 全统计

df2.describe(include=[np.number])	# 只统计数值类型的特征

df2.describe(include=[object])		# 只统计字符类型的特征(np.object 已从NumPy中移除,直接使用内置的 object)

df2['E'].describe()					# 统计指定列(include参数只接受数据类型,不接受列名,因此需先选取列;此处以'E'列为例)

df2.describe(exclude=[np.number])	# 反向指定

三、转置与排序

1、转置

print(df.T)

2、排序

print(df2.sort_index(axis=1, ascending=False))	# 按轴排序,逐列递减
print('=' * 60)
print(df2.sort_values(by='E', ascending=False))	# 按值排序,'E'列逐行递减
print('=' * 60)
print(df2[['A','C','E']].sort_values(by=['E', 'C'], ascending=False))	# by可以传递List

# 输出
     F      E  D    C          B    A
0  foo   test  3  1.0 2013-01-02  1.0
1  foo  train  3  1.0 2013-01-02  1.0
2  foo   test  3  1.0 2013-01-02  1.0
3  foo  train  3  1.0 2013-01-02  1.0
============================================================
     A          B    C  D      E    F
3  1.0 2013-01-02  1.0  3  train  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
0  1.0 2013-01-02  1.0  3   test  foo
============================================================
     A    C      E
3  1.0  1.0  train
1  1.0  1.0  train
2  1.0  1.0   test
0  1.0  1.0   test

四、聚合分组

print(df2.groupby(['E'],as_index=False).mean(numeric_only=True))	# pandas 2.0起需显式指定numeric_only=True,否则遇到非数值列会报错
print('=' * 60)
print(df2[['A','E']].groupby(['E'],as_index=False).mean())
print('=' * 60)
print(df2['A'].groupby(df2['E']).mean())

# 输出
       E    A    C  D
0   test  1.0  1.0  3
1  train  1.0  1.0  3
============================================================
       E    A
0   test  1.0
1  train  1.0
============================================================
E
test     1.0
train    1.0
Name: A, dtype: float64

聚合分组加排序

print(df2[['A','E']].groupby(['E'],as_index=False).sum().sort_values(by='E', ascending=False))

# 输出
       E    A
1  train  2.0
0   test  2.0

五、切片

1、通过[](List)

# 用[]选取行
print(df[0:1])		# 选取一行
print('=' * 60)
print(df[0:-1])		# 选取除最后一行以外的所有行
print('=' * 60)
print(df[0:3])		# 选取前三行
print('=' * 60)
print(df['20130102':'20130104'])		# 选取特定值行

# 输出
                  A        B         C         D
2013-01-01 -1.584409 -1.85485 -0.284502 -0.168737
============================================================
                   A         B         C         D
2013-01-01 -1.584409 -1.854850 -0.284502 -0.168737
2013-01-02 -0.182270 -1.599832 -0.242016  0.055275
2013-01-03  0.200055  0.797812  0.007122 -0.587346
2013-01-04  0.463844  0.072262  0.253212 -0.839793
2013-01-05 -0.631868  1.829623  1.135196  0.693634
============================================================
                   A         B         C         D
2013-01-01 -1.584409 -1.854850 -0.284502 -0.168737
2013-01-02 -0.182270 -1.599832 -0.242016  0.055275
2013-01-03  0.200055  0.797812  0.007122 -0.587346
============================================================
                   A         B         C         D
2013-01-02 -0.182270 -1.599832 -0.242016  0.055275
2013-01-03  0.200055  0.797812  0.007122 -0.587346
2013-01-04  0.463844  0.072262  0.253212 -0.839793

# 用[]选取多列
print(df[['A','B','C']])

# 输出
                   A         B         C
2013-01-01 -1.584409 -1.854850 -0.284502
2013-01-02 -0.182270 -1.599832 -0.242016
2013-01-03  0.200055  0.797812  0.007122
2013-01-04  0.463844  0.072262  0.253212
2013-01-05 -0.631868  1.829623  1.135196
2013-01-06 -1.018113  0.523377 -1.427537

# 选取单列
print(df.A)
print('=' * 60)
print(df['A'])	# 效果同上

#输出
2013-01-01   -1.584409
2013-01-02   -0.182270
2013-01-03    0.200055
2013-01-04    0.463844
2013-01-05   -0.631868
2013-01-06   -1.018113
Freq: D, Name: A, dtype: float64
============================================================
2013-01-01   -1.584409
2013-01-02   -0.182270
2013-01-03    0.200055
2013-01-04    0.463844
2013-01-05   -0.631868
2013-01-06   -1.018113
Freq: D, Name: A, dtype: float64

2、通过标签(loc,at)选择

# 选取行
print(df.loc[dates[0]])		# 选取第一行
print('=' * 60)
print(df.loc[dates[0:2]])	# 选取前两行
print('=' * 60)
print(df.loc['2013-01-01':'2013-01-02'])	# 同上

# 输出
A   -1.584409
B   -1.854850
C   -0.284502
D   -0.168737
Name: 2013-01-01 00:00:00, dtype: float64
============================================================
                   A         B         C         D
2013-01-01 -1.584409 -1.854850 -0.284502 -0.168737
2013-01-02 -0.182270 -1.599832 -0.242016  0.055275
============================================================
                   A         B         C         D
2013-01-01 -1.584409 -1.854850 -0.284502 -0.168737
2013-01-02 -0.182270 -1.599832 -0.242016  0.055275

# 选取列
print(df.loc[:,['A','B']])	# 选取A列、B列
print('=' * 60)
print(df.loc[:,'A'])		# 选取A列

# 输出
                  A         B
2013-01-01 -1.584409 -1.854850
2013-01-02 -0.182270 -1.599832
2013-01-03  0.200055  0.797812
2013-01-04  0.463844  0.072262
2013-01-05 -0.631868  1.829623
2013-01-06 -1.018113  0.523377
============================================================
2013-01-01   -1.584409
2013-01-02   -0.182270
2013-01-03    0.200055
2013-01-04    0.463844
2013-01-05   -0.631868
2013-01-06   -1.018113
Freq: D, Name: A, dtype: float64

# 同时选取行列
print(df.loc[dates[1:3],['B','C']])
print('=' * 60)
print(df.loc['2013-01-02':'2013-01-05',['B','D']])

# 输出
                   B         C
2013-01-02 -1.599832 -0.242016
2013-01-03  0.797812  0.007122
============================================================
                   B         D
2013-01-02 -1.599832  0.055275
2013-01-03  0.797812 -0.587346
2013-01-04  0.072262 -0.839793
2013-01-05  1.829623  0.693634

# 选取某一个特定值
print(df.loc[dates[0],'B'])		# 等同于 df.at[dates[0],'B'],at不支持List
print('=' * 60)
print(df.loc['2013-01-02','D'])

# 输出
-1.8548503338248823
============================================================
0.0552752957849636

3、通过位置(iloc,iat)选择

# 选取行
print(df.iloc[3])
print('=' * 60)
print(df.iloc[2:4,:])
print('=' * 60)
print(df.iloc[[2,4],:])

# 输出
A    0.463844
B    0.072262
C    0.253212
D   -0.839793
Name: 2013-01-04 00:00:00, dtype: float64
============================================================
                   A         B         C         D
2013-01-03  0.200055  0.797812  0.007122 -0.587346
2013-01-04  0.463844  0.072262  0.253212 -0.839793
============================================================
                   A         B         C         D
2013-01-03  0.200055  0.797812  0.007122 -0.587346
2013-01-05 -0.631868  1.829623  1.135196  0.693634

# 选取列(注意:第一个示例df.iloc[:3]选取的是前三行,后两个示例才是选取列)
print(df.iloc[:3])
print('=' * 60)
print(df.iloc[:,1:3])
print('=' * 60)
print(df.iloc[:,[1,3]])

# 输出
                  A         B         C         D
2013-01-01 -1.584409 -1.854850 -0.284502 -0.168737
2013-01-02 -0.182270 -1.599832 -0.242016  0.055275
2013-01-03  0.200055  0.797812  0.007122 -0.587346
============================================================
                   B         C
2013-01-01 -1.854850 -0.284502
2013-01-02 -1.599832 -0.242016
2013-01-03  0.797812  0.007122
2013-01-04  0.072262  0.253212
2013-01-05  1.829623  1.135196
2013-01-06  0.523377 -1.427537
============================================================
                   B         D
2013-01-01 -1.854850 -0.168737
2013-01-02 -1.599832  0.055275
2013-01-03  0.797812 -0.587346
2013-01-04  0.072262 -0.839793
2013-01-05  1.829623  0.693634
2013-01-06  0.523377 -0.549401

# 同时选取多行多列
print(df.iloc[2:4,1:3])
print('=' * 60)
print(df.iloc[[2,4],[1,3]])

# 输出
                   B         C
2013-01-03  0.797812  0.007122
2013-01-04  0.072262  0.253212
============================================================
                   B         D
2013-01-03  0.797812 -0.587346
2013-01-05  1.829623  0.693634

# 选取某一个特定值
print(df.iloc[1,1])
print('=' * 60)
print(df.iat[1,1])

# 输出
-1.599831597530119
============================================================
-1.599831597530119

4、按布尔值取值

# 条件判断
print(df.A >0)
boolean = df.A > 0
print(df[boolean])

# 输出
2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04     True
2013-01-05    False
2013-01-06    False
Freq: D, Name: A, dtype: bool
                   A         B         C         D
2013-01-03  0.200055  0.797812  0.007122 -0.587346
2013-01-04  0.463844  0.072262  0.253212 -0.839793

# 使用.isin()
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
print(df2[df2['E'].isin(['two','four'])])
print('=' * 60)
print(df2[~df2['E'].isin(['two','four'])])
print('=' * 60)
row_index = [x for i,x in enumerate(df2.index) if df2.iat[i,4] in ['two','four']]
print(df2.loc[row_index])
print('=' * 60)
row_index = [x for i,x in enumerate(df2.index) if df2.iat[i,4] == 'two']
print(df2.loc[row_index])

# 输出
                   A         B         C         D     E
2013-01-03  0.200055  0.797812  0.007122 -0.587346   two
2013-01-05 -0.631868  1.829623  1.135196  0.693634  four
============================================================
                   A         B         C         D      E
2013-01-01 -1.584409 -1.854850 -0.284502 -0.168737    one
2013-01-02 -0.182270 -1.599832 -0.242016  0.055275    one
2013-01-04  0.463844  0.072262  0.253212 -0.839793  three
2013-01-06 -1.018113  0.523377 -1.427537 -0.549401  three
============================================================
                   A         B         C         D     E
2013-01-03  0.200055  0.797812  0.007122 -0.587346   two
2013-01-05 -0.631868  1.829623  1.135196  0.693634  four
============================================================
                   A         B         C         D    E
2013-01-03  0.200055  0.797812  0.007122 -0.587346  two

六、缺失值处理

# Pandas默认使用np.nan来代表缺失数据。
# 创建DataFrame对象df1,以dates[0:4]为索引,在df的基础上再加一个新的列'E'(初始均为NaN)
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
# 将'E'列的前两个行设为1
df1.loc[dates[0]:dates[1],'E'] = 1
print(df1)

# 输出
                   A         B         C         D    E
2013-01-01 -1.584409 -1.854850 -0.284502 -0.168737  1.0
2013-01-02 -0.182270 -1.599832 -0.242016  0.055275  1.0
2013-01-03  0.200055  0.797812  0.007122 -0.587346  NaN
2013-01-04  0.463844  0.072262  0.253212 -0.839793  NaN

# 删除、填充缺失值
print(df1.dropna(how='any'))	# 剔除df1中含NaN的行(只要任一一列出现NaN)
print('=' * 60)
print(df1.dropna(how='all'))	# 剔除df1中行内值全是NaN的行
print('=' * 60)
print(df1.fillna(value=5))		# 用5填充df1里的缺失值
print('=' * 60)
print(pd.isnull(df1))			# 判断df1中的值是否为缺失数据,返回True/False

七、函数应用

1、函数

print(df.mean(0))	# 按列求平均数
print('=' * 60)
print(df.mean(1))	# 按行求平均数

# 输出
A   -0.458793
B   -0.038601
C   -0.093088
D   -0.232728
dtype: float64
============================================================
2013-01-01   -0.973125
2013-01-02   -0.492211
2013-01-03    0.104411
2013-01-04   -0.012619
2013-01-05    0.756646
2013-01-06   -0.617919
Freq: D, dtype: float64

2、apply函数

print(df)
print(df.apply(np.cumsum))
print('=' * 60)
print(df.apply(lambda x: x.max()-x.min()))

# 输出
                   A         B         C         D
2013-01-01 -1.584409 -1.854850 -0.284502 -0.168737
2013-01-02 -0.182270 -1.599832 -0.242016  0.055275
2013-01-03  0.200055  0.797812  0.007122 -0.587346
2013-01-04  0.463844  0.072262  0.253212 -0.839793
2013-01-05 -0.631868  1.829623  1.135196  0.693634
2013-01-06 -1.018113  0.523377 -1.427537 -0.549401
                   A         B         C         D
2013-01-01 -1.584409 -1.854850 -0.284502 -0.168737
2013-01-02 -1.766679 -3.454682 -0.526517 -0.113462
2013-01-03 -1.566624 -2.656870 -0.519395 -0.700808
2013-01-04 -1.102780 -2.584609 -0.266183 -1.540601
2013-01-05 -1.734648 -0.754986  0.869012 -0.846967
2013-01-06 -2.752761 -0.231609 -0.558525 -1.396368
============================================================
A    2.048253
B    3.684473
C    2.562733
D    1.533426
dtype: float64

八、数据集合并

1、连接

df = pd.DataFrame(np.random.randn(10,4))
pieces = [df[:3],df[3:7],df[7:]]
print(pieces)
print('=' * 60)
print(pd.concat(pieces))

# 输出
[          0         1         2         3
0  0.639723  0.444278  0.742950  0.017251
1 -0.885244  0.728299 -0.086944 -2.956531
2  1.549633 -0.388373  0.604226 -2.166363,           0         1         2         3
3 -1.178607 -0.275841 -0.747447  1.147744
4  0.821686  0.050840 -0.567725  0.994611
5  0.908378 -1.193426  0.160251  0.701478
6  0.706225 -1.959642 -1.132494  0.672372,           0         1         2         3
7 -0.583920 -1.108532  0.118670  0.133175
8  1.801885  0.053336 -0.665703  0.628828
9  0.146570 -0.694837 -1.508931  0.583960]
============================================================
          0         1         2         3
0  0.639723  0.444278  0.742950  0.017251
1 -0.885244  0.728299 -0.086944 -2.956531
2  1.549633 -0.388373  0.604226 -2.166363
3 -1.178607 -0.275841 -0.747447  1.147744
4  0.821686  0.050840 -0.567725  0.994611
5  0.908378 -1.193426  0.160251  0.701478
6  0.706225 -1.959642 -1.132494  0.672372
7 -0.583920 -1.108532  0.118670  0.133175
8  1.801885  0.053336 -0.665703  0.628828
9  0.146570 -0.694837 -1.508931  0.583960

2、追加行

df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
print(df)
print("=" * 60)
s = df.iloc[3]
print(pd.concat([df, s.to_frame().T], ignore_index=True))	# DataFrame.append已在pandas 2.0中移除,改用pd.concat追加行

# 输出
          A         B         C         D
0 -0.578743 -0.150960 -0.446618 -0.988564
1  1.225723 -0.047997 -1.271592  0.035804
2  0.138365  1.788831  1.388763 -0.205168
3  1.245087 -1.409133 -1.346689 -0.012804
4  0.783840 -3.361609  1.415269  1.091229
5 -0.614037  0.302538 -1.202600 -1.220887
6  0.361368 -0.812398 -1.773582 -0.345263
7  0.636777 -0.593689  0.219574  0.239153
============================================================
          A         B         C         D
0 -0.578743 -0.150960 -0.446618 -0.988564
1  1.225723 -0.047997 -1.271592  0.035804
2  0.138365  1.788831  1.388763 -0.205168
3  1.245087 -1.409133 -1.346689 -0.012804
4  0.783840 -3.361609  1.415269  1.091229
5 -0.614037  0.302538 -1.202600 -1.220887
6  0.361368 -0.812398 -1.773582 -0.345263
7  0.636777 -0.593689  0.219574  0.239153
8  1.245087 -1.409133 -1.346689 -0.012804

九、重塑数据框

1、层次化(stack)

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
# 多重索引
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df)
print("=" * 60)
# .stack()方法将DataFrame的列“压缩”了一级
stacked = df.stack()
print(stacked)

2、去层次化

print(stacked)
print("=" * 60)
print(stacked.unstack())	# 默认从最里边开始unstack
print("=" * 60)
print(stacked.unstack(1))	# 指定索引位置
print("=" * 60)
print(stacked.unstack(0))

# 输出
first  second   
bar    one     A    0.195572
               B   -0.108570
       two     A   -1.660513
               B   -1.583929
baz    one     A    0.077588
               B    2.668222
       two     A    0.147339
               B   -0.063172
foo    one     A    0.141091
               B    1.345705
       two     A   -0.238587
               B    0.328344
qux    one     A   -0.071885
               B    0.052804
       two     A   -0.896821
               B    1.147172
dtype: float64
============================================================
                     A         B
first second                    
bar   one     0.195572 -0.108570
      two    -1.660513 -1.583929
baz   one     0.077588  2.668222
      two     0.147339 -0.063172
foo   one     0.141091  1.345705
      two    -0.238587  0.328344
qux   one    -0.071885  0.052804
      two    -0.896821  1.147172
============================================================
second        one       two
first                      
bar   A  0.195572 -1.660513
      B -0.108570 -1.583929
baz   A  0.077588  0.147339
      B  2.668222 -0.063172
foo   A  0.141091 -0.238587
      B  1.345705  0.328344
qux   A -0.071885 -0.896821
      B  0.052804  1.147172
============================================================
first          bar       baz       foo       qux
second                                          
one    A  0.195572  0.077588  0.141091 -0.071885
       B -0.108570  2.668222  1.345705  0.052804
two    A -1.660513  0.147339 -0.238587 -0.896821
       B -1.583929 -0.063172  0.328344  1.147172

3、数据透视表

# 常见参数有index,values,columns,aggfunc,都支持传递List。
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3, 'B' : ['A', 'B', 'C'] * 4, 
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, 'D' : np.random.randn(12), 
                   'E' : np.random.randn(12)})
print(df)
print("=" * 60)
print(pd.pivot_table(df, index=['A', 'B'], values='D', columns=['C']))
print("=" * 60)
print(pd.pivot_table(df, index=['A', 'B'], values='D', columns=['C'], aggfunc=[np.mean,len]))

# 输出
       A  B    C         D         E
0     one  A  foo  0.163998  0.499117
1     one  B  foo -0.968083 -0.395698
2     two  C  foo -0.696637  0.263027
3   three  A  bar -1.765157  0.281752
4     one  B  bar  0.847807  0.432454
5     one  C  bar  1.107584  0.121205
6     two  A  foo  1.513330 -0.553265
7   three  B  foo  1.837021  0.388208
8     one  C  foo  0.322434  1.388784
9     one  A  bar -0.596184 -0.194123
10    two  B  bar  0.773536 -0.881527
11  three  C  bar  1.181760 -0.391355
============================================================
C             bar       foo
A     B                    
one   A -0.596184  0.163998
      B  0.847807 -0.968083
      C  1.107584  0.322434
three A -1.765157       NaN
      B       NaN  1.837021
      C  1.181760       NaN
two   A       NaN  1.513330
      B  0.773536       NaN
      C       NaN -0.696637
============================================================
             mean            len     
C             bar       foo  bar  foo
A     B                              
one   A -0.596184  0.163998  1.0  1.0
      B  0.847807 -0.968083  1.0  1.0
      C  1.107584  0.322434  1.0  1.0
three A -1.765157       NaN  1.0  NaN
      B       NaN  1.837021  NaN  1.0
      C  1.181760       NaN  1.0  NaN
two   A       NaN  1.513330  NaN  1.0
      B  0.773536       NaN  1.0  NaN
      C       NaN -0.696637  NaN  1.0

4、列联表

print(pd.crosstab(df['A'],df['C'],margins=True,normalize=True))

# 输出
C           bar       foo   All
A                              
one    0.250000  0.250000  0.50
three  0.166667  0.083333  0.25
two    0.083333  0.166667  0.25
All    0.500000  0.500000  1.00

# 也可以循环实现列联表
val_list = ['A','B','C']
for i in val_list:
    print('='*100)
    print(pd.crosstab(df[df['B'] == i]['B'], df[df['B'] == i]['C'],margins=True,normalize=True))

# 输出
====================================================================================================
C    bar  foo  All
B                 
A    0.5  0.5  1.0
All  0.5  0.5  1.0
====================================================================================================
C    bar  foo  All
B                 
B    0.5  0.5  1.0
All  0.5  0.5  1.0
====================================================================================================
C    bar  foo  All
B                 
C    0.5  0.5  1.0
All  0.5  0.5  1.0
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值