import pandas as pd
import numpy as np
dataset_path = './starcraft.csv'
df_data = pd.read_csv(dataset_path, usecols=['LeagueIndex', 'Age', 'HoursPerWeek',
'TotalHours', 'APM'])
1.透视表 (pivot table)
透视表指根据一个或多个键值对数据进行聚合,根据行或列的分组键将数据划分到各个区域中。
调用.pivot_tabel(),其中有四个参数可以自定义设置:
- index参数:透视表中的索引值
- columns参数:分组的列
- agggunc:应用在每个区域的聚合函数,默认为np.mean
- fill_value:替换结果中的缺失值
如下案例,行索引为age,值为LeagueIndex,列索引为其余的3个列的值。
a = pd.pivot_table(df_data,
index=['Age'],
columns=['LeagueIndex'],
aggfunc=np.sum)
print a
HoursPerWeek \
LeagueIndex 1 2 3 4 5 6 7
Age
16.0 324.0 720.0 778.0 758.0 1220.0 1280.0 56.0
17.0 184.0 378.0 664.0 744.0 1460.0 1116.0 104.0
18.0 204.0 548.0 886.0 1220.0 1644.0 1194.0 164.0
19.0 126.0 458.0 950.0 1024.0 962.0 1642.0 168.0
20.0 328.0 288.0 654.0 1414.0 1290.0 1816.0 NaN
21.0 162.0 270.0 580.0 1112.0 1446.0 1858.0 62.0
22.0 146.0 372.0 680.0 954.0 1304.0 1448.0 112.0
23.0 46.0 320.0 598.0 1100.0 998.0 726.0 296.0
24.0 116.0 344.0 406.0 700.0 668.0 1048.0 36.0
25.0 124.0 166.0 268.0 560.0 682.0 256.0 52.0
26.0 96.0 148.0 272.0 330.0 418.0 354.0 60.0
27.0 40.0 152.0 226.0 324.0 340.0 164.0 NaN
28.0 28.0 90.0 220.0 286.0 186.0 44.0 NaN
29.0 54.0 56.0 80.0 178.0 180.0 106.0 NaN
30.0 14.0 54.0 76.0 146.0 90.0 28.0 NaN
31.0 12.0 36.0 68.0 180.0 12.0 16.0 NaN
32.0 40.0 56.0 68.0 94.0 10.0 NaN NaN
33.0 32.0 12.0 42.0 16.0 60.0 NaN NaN
34.0 12.0 56.0 28.0 126.0 14.0 NaN NaN
35.0 54.0 8.0 26.0 82.0 12.0 NaN NaN
36.0 NaN 16.0 40.0 10.0 NaN NaN NaN
37.0 12.0 30.0 6.0 NaN 32.0 NaN NaN
38.0 16.0 NaN 46.0 NaN NaN NaN NaN
39.0 10.0 NaN 12.0 8.0 NaN NaN NaN
40.0 12.0 14.0 26.0 NaN NaN NaN NaN
41.0 NaN 12.0 14.0 NaN NaN NaN NaN
43.0 NaN 10.0 NaN NaN NaN NaN NaN
44.0 NaN NaN NaN 6.0 NaN NaN NaN
TotalHours ... \
LeagueIndex 1 2 3 ... 5 6
Age ...
16.0 4307.0 13143.0 29211.0 ... 49233.0 51320.0
17.0 2044.0 7423.0 16602.0 ... 53375.0 45421.0
18.0 3570.0 11471.0 21037.0 ... 1056486.0 50378.0
19.0 2355.0 8467.0 31861.0 ... 44697.0 70331.0
20.0 7212.0 6325.0 20174.0 ... 64170.0 105131.0
21.0 3377.0 7673.0 19095.0 ... 68739.0 82061.0
22.0 4225.0 10861.0 23030.0 ... 84330.0 67069.0
23.0 896.0 12350.0 23081.0 ... 56097.0 43176.0
24.0 2070.0 9543.0 25421.0 ... 36147.0 43114.0
25.0 2440.0 5846.0 11270.0 ... 40681.0 14890.0
26.0 1608.0 3417.0 10548.0 ... 20100.0 17663.0
27.0 1100.0 3615.0 7525.0 ... 19770.0 11796.0
28.0 466.0 1860.0 7901.0 ... 10872.0 2500.0
29.0 2490.0 2000.0 3816.0 ... 10292.0 5950.0
30.0 210.0 2440.0 4370.0 ... 3440.0 1500.0
31.0 200.0 1300.0 3500.0 ... 1050.0 1500.0
32.0 1600.0 860.0 2300.0 ... 800.0 NaN
33.0 1200.0 220.0 2065.0 ... 2050.0 NaN
34.0 150.0 3380.0 1150.0 ... 2764.0 NaN
35.0 1350.0 500.0 1950.0 ... 800.0 NaN
36.0 NaN 500.0 1950.0 ... NaN NaN
37.0 300.0 125.0 600.0 ... 1800.0 NaN
38.0 300.0 NaN 2280.0 ... NaN NaN
39.0 500.0 NaN 450.0 ... NaN NaN
40.0 150.0 500.0 1080.0 ... NaN NaN
41.0 NaN 450.0 800.0 ... NaN NaN
43.0 NaN 730.0 NaN ... NaN NaN
44.0 NaN NaN NaN ... NaN NaN
APM \
LeagueIndex 7 1 2 3 4
Age
16.0 3000.0 1062.44754 2919.70434 4851.9222 5149.7310
17.0 12700.0 655.67280 1661.01540 4181.8920 5525.3586
18.0 3200.0 704.47680 3300.41040 4847.2152 8763.0783
19.0 4166.0 734.55600 2216.81880 5183.7888 8030.1960
20.0 NaN 1624.89660 2147.23200 4211.5686 10596.2070
21.0 3180.0 780.67950 1578.02880 3949.3062 8689.8804
22.0 6950.0 674.59860 2147.50980 4379.3424 7818.7302
23.0 14290.0 359.65980 1575.06120 4602.7416 7616.9298
24.0 2250.0 439.43040 1717.55340 2876.8572 5503.7736
25.0 3300.0 572.61420 1178.02440 2201.6388 4710.9924
26.0 2300.0 418.70874 1165.96680 1794.1890 3139.2852
27.0 NaN 359.17320 1164.15960 1426.4550 2850.1320
28.0 NaN 333.84240 479.34000 1152.5958 2205.8778
29.0 NaN 236.74020 695.88480 568.2594 1447.5906
30.0 NaN 125.53740 441.14160 733.6416 743.4468
31.0 NaN 41.58600 314.92980 659.2626 1166.7606
32.0 NaN 179.14380 315.54180 457.5174 541.8996
33.0 NaN 198.77880 153.34680 284.8218 116.7516
34.0 NaN 49.11360 276.88260 268.4100 340.7124
35.0 NaN 229.31280 54.04680 170.4930 634.7688
36.0 NaN NaN 150.13140 333.6474 73.6980
37.0 NaN 22.05960 49.89600 44.9682 NaN
38.0 NaN 71.59500 NaN 334.6878 NaN
39.0 NaN 29.87640 NaN 53.7690 86.7246
40.0 NaN 38.55900 51.83580 107.4684 NaN
41.0 NaN NaN 108.45720 77.6472 NaN
43.0 NaN NaN 86.05860 NaN NaN
44.0 NaN NaN NaN NaN 89.5266
LeagueIndex 5 6 7
Age
16.0 7787.37780 9042.1722 386.7774
17.0 10052.72100 8310.0858 573.8286
18.0 10988.66100 9134.7240 618.5790
19.0 9271.09260 11955.6030 696.7770
20.0 10871.65440 14291.8692 NaN
21.0 11954.91660 13165.7649 867.3474
22.0 10473.28380 10165.8672 493.1586
23.0 8292.86160 6131.1936 1799.6520
24.0 7292.32740 7240.4076 428.6538
25.0 6168.19260 2200.6362 361.4550
26.0 4016.67060 3301.8498 408.2202
27.0 3498.30300 2040.8454 NaN
28.0 1992.60540 521.7666 NaN
29.0 1398.78540 715.9404 NaN
30.0 578.32020 123.3774 NaN
31.0 315.53460 200.1708 NaN
32.0 66.19740 NaN NaN
33.0 245.45166 NaN NaN
34.0 174.54540 NaN NaN
35.0 115.06440 NaN NaN
36.0 NaN NaN NaN
37.0 451.13160 NaN NaN
38.0 NaN NaN NaN
39.0 NaN NaN NaN
40.0 NaN NaN NaN
41.0 NaN NaN NaN
43.0 NaN NaN NaN
44.0 NaN NaN NaN
[28 rows x 21 columns]
b = pd.pivot_table(df_data,
index=['Age'],
columns=['LeagueIndex'],
aggfunc=np.sum,
fill_value=-100.)
print b
HoursPerWeek TotalHours \
LeagueIndex 1 2 3 4 5 6 7 1 2
Age
16.0 324 720 778 758 1220 1280 56 4307 13143
17.0 184 378 664 744 1460 1116 104 2044 7423
18.0 204 548 886 1220 1644 1194 164 3570 11471
19.0 126 458 950 1024 962 1642 168 2355 8467
20.0 328 288 654 1414 1290 1816 -100 7212 6325
21.0 162 270 580 1112 1446 1858 62 3377 7673
22.0 146 372 680 954 1304 1448 112 4225 10861
23.0 46 320 598 1100 998 726 296 896 12350
24.0 116 344 406 700 668 1048 36 2070 9543
25.0 124 166 268 560 682 256 52 2440 5846
26.0 96 148 272 330 418 354 60 1608 3417
27.0 40 152 226 324 340 164 -100 1100 3615
28.0 28 90 220 286 186 44 -100 466 1860
29.0 54 56 80 178 180 106 -100 2490 2000
30.0 14 54 76 146 90 28 -100 210 2440
31.0 12 36 68 180 12 16 -100 200 1300
32.0 40 56 68 94 10 -100 -100 1600 860
33.0 32 12 42 16 60 -100 -100 1200 220
34.0 12 56 28 126 14 -100 -100 150 3380
35.0 54 8 26 82 12 -100 -100 1350 500
36.0 -100 16 40 10 -100 -100 -100 -100 500
37.0 12 30 6 -100 32 -100 -100 300 125
38.0 16 -100 46 -100 -100 -100 -100 300 -100
39.0 10 -100 12 8 -100 -100 -100 500 -100
40.0 12 14 26 -100 -100 -100 -100 150 500
41.0 -100 12 14 -100 -100 -100 -100 -100 450
43.0 -100 10 -100 -100 -100 -100 -100 -100 730
44.0 -100 -100 -100 6 -100 -100 -100 -100 -100
... APM \
LeagueIndex 3 ... 5 6 7 1 2
Age ...
16.0 29211 ... 49233 51320 3000 1062.44754 2919.70434
17.0 16602 ... 53375 45421 12700 655.67280 1661.01540
18.0 21037 ... 1056486 50378 3200 704.47680 3300.41040
19.0 31861 ... 44697 70331 4166 734.55600 2216.81880
20.0 20174 ... 64170 105131 -100 1624.89660 2147.23200
21.0 19095 ... 68739 82061 3180 780.67950 1578.02880
22.0 23030 ... 84330 67069 6950 674.59860 2147.50980
23.0 23081 ... 56097 43176 14290 359.65980 1575.06120
24.0 25421 ... 36147 43114 2250 439.43040 1717.55340
25.0 11270 ... 40681 14890 3300 572.61420 1178.02440
26.0 10548 ... 20100 17663 2300 418.70874 1165.96680
27.0 7525 ... 19770 11796 -100 359.17320 1164.15960
28.0 7901 ... 10872 2500 -100 333.84240 479.34000
29.0 3816 ... 10292 5950 -100 236.74020 695.88480
30.0 4370 ... 3440 1500 -100 125.53740 441.14160
31.0 3500 ... 1050 1500 -100 41.58600 314.92980
32.0 2300 ... 800 -100 -100 179.14380 315.54180
33.0 2065 ... 2050 -100 -100 198.77880 153.34680
34.0 1150 ... 2764 -100 -100 49.11360 276.88260
35.0 1950 ... 800 -100 -100 229.31280 54.04680
36.0 1950 ... -100 -100 -100 -100.00000 150.13140
37.0 600 ... 1800 -100 -100 22.05960 49.89600
38.0 2280 ... -100 -100 -100 71.59500 -100.00000
39.0 450 ... -100 -100 -100 29.87640 -100.00000
40.0 1080 ... -100 -100 -100 38.55900 51.83580
41.0 800 ... -100 -100 -100 -100.00000 108.45720
43.0 -100 ... -100 -100 -100 -100.00000 86.05860
44.0 -100 ... -100 -100 -100 -100.00000 -100.00000
LeagueIndex 3 4 5 6 7
Age
16.0 4851.9222 5149.7310 7787.37780 9042.1722 386.7774
17.0 4181.8920 5525.3586 10052.72100 8310.0858 573.8286
18.0 4847.2152 8763.0783 10988.66100 9134.7240 618.5790
19.0 5183.7888 8030.1960 9271.09260 11955.6030 696.7770
20.0 4211.5686 10596.2070 10871.65440 14291.8692 -100.0000
21.0 3949.3062 8689.8804 11954.91660 13165.7649 867.3474
22.0 4379.3424 7818.7302 10473.28380 10165.8672 493.1586
23.0 4602.7416 7616.9298 8292.86160 6131.1936 1799.6520
24.0 2876.8572 5503.7736 7292.32740 7240.4076 428.6538
25.0 2201.6388 4710.9924 6168.19260 2200.6362 361.4550
26.0 1794.1890 3139.2852 4016.67060 3301.8498 408.2202
27.0 1426.4550 2850.1320 3498.30300 2040.8454 -100.0000
28.0 1152.5958 2205.8778 1992.60540 521.7666 -100.0000
29.0 568.2594 1447.5906 1398.78540 715.9404 -100.0000
30.0 733.6416 743.4468 578.32020 123.3774 -100.0000
31.0 659.2626 1166.7606 315.53460 200.1708 -100.0000
32.0 457.5174 541.8996 66.19740 -100.0000 -100.0000
33.0 284.8218 116.7516 245.45166 -100.0000 -100.0000
34.0 268.4100 340.7124 174.54540 -100.0000 -100.0000
35.0 170.4930 634.7688 115.06440 -100.0000 -100.0000
36.0 333.6474 73.6980 -100.00000 -100.0000 -100.0000
37.0 44.9682 -100.0000 451.13160 -100.0000 -100.0000
38.0 334.6878 -100.0000 -100.00000 -100.0000 -100.0000
39.0 53.7690 86.7246 -100.00000 -100.0000 -100.0000
40.0 107.4684 -100.0000 -100.00000 -100.0000 -100.0000
41.0 77.6472 -100.0000 -100.00000 -100.0000 -100.0000
43.0 -100.0000 -100.0000 -100.00000 -100.0000 -100.0000
44.0 -100.0000 89.5266 -100.00000 -100.0000 -100.0000
[28 rows x 21 columns]
2. 交叉表 (crosstab)
交叉表用于计算分组频率,是一种特殊的透视表。
pd.crosstab(index, columns)其中有两个参数:
- index: 分组数据,交叉表的行索引
- columns: 交叉表的列索引
print pd.crosstab(df_data['LeagueIndex'],
df_data['Age'])
Age 16.0 17.0 18.0 19.0 20.0 21.0 22.0 23.0 24.0 25.0 ... \
LeagueIndex ...
1 18 9 13 12 22 12 12 6 9 11 ...
2 38 22 38 32 25 23 28 24 24 18 ...
3 48 43 56 53 47 44 47 47 32 26 ...
4 45 49 78 71 97 81 75 72 52 45 ...
5 55 71 81 68 80 96 81 59 58 51 ...
6 50 51 56 73 86 83 68 42 48 15 ...
7 2 3 3 4 0 5 3 9 2 2 ...
Age 34.0 35.0 36.0 37.0 38.0 39.0 40.0 41.0 43.0 44.0
LeagueIndex
1 1 5 0 1 1 1 1 0 0 0
2 5 1 2 1 0 0 1 2 1 0
3 3 3 5 1 4 1 2 1 0 0
4 4 7 1 0 0 1 0 0 0 1
5 2 1 0 2 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0
[7 rows x 28 columns]
print pd.crosstab(df_data['LeagueIndex'],
[df_data['Age'], df_data['HoursPerWeek']])
Age 16.0 ... 39.0 40.0 \
HoursPerWeek 0.0 2.0 4.0 6.0 8.0 10.0 12.0 14.0 16.0 20.0 ... 12.0 10.0
LeagueIndex ...
1 0 0 0 1 3 1 1 2 3 3 ... 0 0
2 0 0 2 1 0 9 4 4 3 3 ... 0 0
3 0 0 1 6 7 6 6 7 2 1 ... 1 1
4 0 2 5 1 6 4 3 6 3 1 ... 0 0
5 0 1 2 1 3 8 3 6 3 6 ... 0 0
6 1 1 4 2 1 1 4 3 1 7 ... 0 0
7 0 0 0 0 0 0 0 0 0 0 ... 0 0
Age 41.0 43.0 44.0
HoursPerWeek 12.0 14.0 16.0 4.0 8.0 14.0 10.0 6.0
LeagueIndex
1 1 0 0 0 0 0 0 0
2 0 1 0 1 1 0 1 0
3 0 0 1 0 0 1 0 0
4 0 0 0 0 0 0 0 1
5 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0
[7 rows x 325 columns]
注:部分例子来自于小象学院Robin课程