import pandas as pd
import numpy as np
import matplotlib. pyplot as plot
% matplotlib inline
df= pd. read_csv( r'C:\Users\yxl22\Desktop\Python\pandas\insurance.csv' )
type ( df)
pandas.core.frame.DataFrame
df. head( )
age sex bmi children smoker region charges 0 19 female 27.900 0 yes southwest 16884.92400 1 18 male 33.770 1 no southeast 1725.55230 2 28 male 33.000 3 no southeast 4449.46200 3 33 male 22.705 0 no northwest 21984.47061 4 32 male 28.880 0 no northwest 3866.85520
df. head( n= 2 )
age sex bmi children smoker region charges 0 19 female 27.90 0 yes southwest 16884.9240 1 18 male 33.77 1 no southeast 1725.5523
df. tail( )
age sex bmi children smoker region charges 1333 50 male 30.97 3 no northwest 10600.5483 1334 18 female 31.92 0 no northeast 2205.9808 1335 18 female 36.85 0 no southeast 1629.8335 1336 21 female 25.80 0 no southwest 2007.9450 1337 61 female 29.07 0 yes northwest 29141.3603
df. shape
(1338, 7)
df. index
RangeIndex(start=0, stop=1338, step=1)
df. columns
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
df. values
array([[19, 'female', 27.9, ..., 'yes', 'southwest', 16884.924],
[18, 'male', 33.77, ..., 'no', 'southeast', 1725.5523],
[28, 'male', 33.0, ..., 'no', 'southeast', 4449.462],
...,
[18, 'female', 36.85, ..., 'no', 'southeast', 1629.8335],
[21, 'female', 25.8, ..., 'no', 'southwest', 2007.945],
[61, 'female', 29.07, ..., 'yes', 'northwest', 29141.3603]],
dtype=object)
myvalues= df. values
type ( myvalues)
numpy.ndarray
np. random. seed( 123 )
mydata= np. random. randint( 0 , 10 , 30 ) . reshape( 6 , 5 )
mydata
array([[2, 2, 6, 1, 3],
[9, 6, 1, 0, 1],
[9, 0, 0, 9, 3],
[4, 0, 0, 4, 1],
[7, 3, 2, 4, 7],
[2, 4, 8, 0, 7]])
myindex= 'R1 R2 R3 R4 R5 R6' . split( )
myindex
['R1', 'R2', 'R3', 'R4', 'R5', 'R6']
mycols= 'C1 C2 C3 C4 C5' . split( )
mycols
['C1', 'C2', 'C3', 'C4', 'C5']
df2= pd. DataFrame( mydata, index= myindex, columns= mycols)
type ( df2)
pandas.core.frame.DataFrame
df2. index
Index(['R1', 'R2', 'R3', 'R4', 'R5', 'R6'], dtype='object')
df2. columns
Index(['C1', 'C2', 'C3', 'C4', 'C5'], dtype='object')
df2. values== mydata
array([[ True, True, True, True, True],
[ True, True, True, True, True],
[ True, True, True, True, True],
[ True, True, True, True, True],
[ True, True, True, True, True],
[ True, True, True, True, True]])
df. head( )
age sex bmi children smoker region charges 0 19 female 27.900 0 yes southwest 16884.92400 1 18 male 33.770 1 no southeast 1725.55230 2 28 male 33.000 3 no southeast 4449.46200 3 33 male 22.705 0 no northwest 21984.47061 4 32 male 28.880 0 no northwest 3866.85520
charge= df[ 'charges' ]
type ( charge)
pandas.core.series.Series
charge. head( )
0 16884.92400
1 1725.55230
2 4449.46200
3 21984.47061
4 3866.85520
Name: charges, dtype: float64
df. dtypes
age int64
sex object
bmi float64
children int64
smoker object
region object
charges float64
dtype: object
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age 1338 non-null int64
sex 1338 non-null object
bmi 1338 non-null float64
children 1338 non-null int64
smoker 1338 non-null object
region 1338 non-null object
charges 1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.2+ KB
df. describe( )
age bmi children charges count 1338.000000 1338.000000 1338.000000 1338.000000 mean 39.207025 30.663397 1.094918 13270.422265 std 14.049960 6.098187 1.205493 12110.011237 min 18.000000 15.960000 0.000000 1121.873900 25% 27.000000 26.296250 0.000000 4740.287150 50% 39.000000 30.400000 1.000000 9382.033000 75% 51.000000 34.693750 2.000000 16639.912515 max 64.000000 53.130000 5.000000 63770.428010
df= df. iloc[ 0 : 10 , : ]
df. iloc[ : , 0 : 2 ]
age sex 0 19 female 1 18 male 2 28 male 3 33 male 4 32 male 5 31 female 6 46 female 7 37 female 8 37 male 9 60 female
df
age sex bmi children smoker region charges 0 19 female 27.900 0 yes southwest 16884.92400 1 18 male 33.770 1 no southeast 1725.55230 2 28 male 33.000 3 no southeast 4449.46200 3 33 male 22.705 0 no northwest 21984.47061 4 32 male 28.880 0 no northwest 3866.85520 5 31 female 25.740 0 no southeast 3756.62160 6 46 female 33.440 1 no southeast 8240.58960 7 37 female 27.740 3 no northwest 7281.50560 8 37 male 29.830 2 no northeast 6406.41070 9 60 female 25.840 0 no northwest 28923.13692
mycols= [ 'sex' , 'region' , 'charges' ]
df[ mycols]
sex region charges 0 female southwest 16884.92400 1 male southeast 1725.55230 2 male southeast 4449.46200 3 male northwest 21984.47061 4 male northwest 3866.85520 5 female southeast 3756.62160 6 female southeast 8240.58960 7 female northwest 7281.50560 8 male northeast 6406.41070 9 female northwest 28923.13692
df
age sex bmi children smoker region charges 0 19 female 27.900 0 yes southwest 16884.92400 1 18 male 33.770 1 no southeast 1725.55230 2 28 male 33.000 3 no southeast 4449.46200 3 33 male 22.705 0 no northwest 21984.47061 4 32 male 28.880 0 no northwest 3866.85520 5 31 female 25.740 0 no southeast 3756.62160 6 46 female 33.440 1 no southeast 8240.58960 7 37 female 27.740 3 no northwest 7281.50560 8 37 male 29.830 2 no northeast 6406.41070 9 60 female 25.840 0 no northwest 28923.13692
dfsmall= df[ mycols]
dfsmall
sex region charges 0 female southwest 16884.92400 1 male southeast 1725.55230 2 male southeast 4449.46200 3 male northwest 21984.47061 4 male northwest 3866.85520 5 female southeast 3756.62160 6 female southeast 8240.58960 7 female northwest 7281.50560 8 male northeast 6406.41070 9 female northwest 28923.13692
mycols= [ 'C2' , 'C4' ]
df2[ mycols]
C2 C4 R1 2 1 R2 6 0 R3 0 9 R4 0 4 R5 3 4 R6 4 0
df2. loc[ : , [ 'C2' , 'C4' ] ]
C2 C4 R1 2 1 R2 6 0 R3 0 9 R4 0 4 R5 3 4 R6 4 0
myrows= [ 'R1' , 'R5' ]
df2. loc[ myrows, : ]
df2
C1 C2 C3 C4 C5 R1 2 2 6 1 3 R2 9 6 1 0 1 R3 9 0 0 9 3 R4 4 0 0 4 1 R5 7 3 2 4 7 R6 2 4 8 0 7
df2. loc[ : , 'C1' : 'C4' ]
C1 C2 C3 C4 R1 2 2 6 1 R2 9 6 1 0 R3 9 0 0 9 R4 4 0 0 4 R5 7 3 2 4 R6 2 4 8 0
df2. loc[ [ 'R1' , 'R5' ] , [ 'C2' , 'C4' , 'C5' ] ]
df2. loc[ 'R1' : 'R6' : 2 , 'C1' : 'C5' : 2 ]
df2
C1 C2 C3 C4 C5 R1 2 2 6 1 3 R2 9 6 1 0 1 R3 9 0 0 9 3 R4 4 0 0 4 1 R5 7 3 2 4 7 R6 2 4 8 0 7
myrows= [ 1 , 3 ]
df2. iloc[ myrows, : ]
myrows= [ 1 , 3 ]
df2. iloc[ : , myrows]
C2 C4 R1 2 1 R2 6 0 R3 0 9 R4 0 4 R5 3 4 R6 4 0
myrows= [ 1 , 3 ]
df2. iloc[ myrows]
df2. iloc[ 4 , 2 ]
2
df2. iloc[ 0 , 0 ] = 99
df2
C1 C2 C3 C4 C5 R1 99 2 6 1 3 R2 9 6 1 0 1 R3 9 0 0 9 3 R4 4 0 0 4 1 R5 7 3 2 4 7 R6 2 4 8 0 7
df2
C1 C2 C3 C4 C5 R1 99 2 6 1 3 R2 9 6 1 0 1 R3 9 0 0 9 3 R4 4 0 0 4 1 R5 7 3 2 4 7 R6 2 4 8 0 7
df2[ 'mysum' ] = df2[ 'C1' ] + df2[ 'C2' ]
df2
C1 C2 C3 C4 C5 mysum R1 99 2 6 1 3 101 R2 9 6 1 0 1 15 R3 9 0 0 9 3 9 R4 4 0 0 4 1 4 R5 7 3 2 4 7 10 R6 2 4 8 0 7 6
df2= df2. drop( 'mysum' , axis= 1 )
df2
C1 C2 C3 C4 C5 R1 99 2 6 1 3 R2 9 6 1 0 1 R3 9 0 0 9 3 R4 4 0 0 4 1 R5 7 3 2 4 7 R6 2 4 8 0 7
df= pd. read_csv( r'C:\Users\yxl22\Desktop\Python\pandas\insurance.csv' )
df. head( )
age sex bmi children smoker region charges 0 19 female 27.900 0 yes southwest 16884.92400 1 18 male 33.770 1 no southeast 1725.55230 2 28 male 33.000 3 no southeast 4449.46200 3 33 male 22.705 0 no northwest 21984.47061 4 32 male 28.880 0 no northwest 3866.85520
df. describe( )
age bmi children charges count 1338.000000 1338.000000 1338.000000 1338.000000 mean 39.207025 30.663397 1.094918 13270.422265 std 14.049960 6.098187 1.205493 12110.011237 min 18.000000 15.960000 0.000000 1121.873900 25% 27.000000 26.296250 0.000000 4740.287150 50% 39.000000 30.400000 1.000000 9382.033000 75% 51.000000 34.693750 2.000000 16639.912515 max 64.000000 53.130000 5.000000 63770.428010
df[ 'charges' ]
0 16884.92400
1 1725.55230
2 4449.46200
3 21984.47061
4 3866.85520
5 3756.62160
6 8240.58960
7 7281.50560
8 6406.41070
9 28923.13692
10 2721.32080
11 27808.72510
12 1826.84300
13 11090.71780
14 39611.75770
15 1837.23700
16 10797.33620
17 2395.17155
18 10602.38500
19 36837.46700
20 13228.84695
21 4149.73600
22 1137.01100
23 37701.87680
24 6203.90175
25 14001.13380
26 14451.83515
27 12268.63225
28 2775.19215
29 38711.00000
...
1308 33900.65300
1309 6875.96100
1310 6940.90985
1311 4571.41305
1312 4536.25900
1313 36397.57600
1314 18765.87545
1315 11272.33139
1316 1731.67700
1317 1163.46270
1318 19496.71917
1319 7201.70085
1320 5425.02335
1321 28101.33305
1322 12981.34570
1323 43896.37630
1324 4239.89265
1325 13143.33665
1326 7050.02130
1327 9377.90470
1328 22395.74424
1329 10325.20600
1330 12629.16560
1331 10795.93733
1332 11411.68500
1333 10600.54830
1334 2205.98080
1335 1629.83350
1336 2007.94500
1337 29141.36030
Name: charges, Length: 1338, dtype: float64
filt20= df[ 'charges' ] > 20000
filt20. head( )
0 False
1 False
2 False
3 True
4 False
Name: charges, dtype: bool
df20= df[ filt20]
df20. head( )
age sex bmi children smoker region charges 3 33 male 22.705 0 no northwest 21984.47061 9 60 female 25.840 0 no northwest 28923.13692 11 62 female 26.290 0 yes southeast 27808.72510 14 27 male 42.130 0 yes southeast 39611.75770 19 30 male 35.300 0 yes southwest 36837.46700
filt1= df[ 'charges' ] > 20000
filt2= df[ 'sex' ] == 'male'
filt= filt1 & filt2
m20= df[ filt] . head( )
m20
age sex bmi children smoker region charges 3 33 male 22.705 0 no northwest 21984.47061 14 27 male 42.130 0 yes southeast 39611.75770 19 30 male 35.300 0 yes southwest 36837.46700 29 31 male 36.300 2 yes southwest 38711.00000 30 22 male 35.600 0 yes southwest 35585.57600
m20. describe( )
age bmi children charges count 5.00000 5.000000 5.000000 5.000000 mean 28.60000 34.407000 0.400000 34546.054262 std 4.27785 7.113116 0.894427 7196.052493 min 22.00000 22.705000 0.000000 21984.470610 25% 27.00000 35.300000 0.000000 35585.576000 50% 30.00000 35.600000 0.000000 36837.467000 75% 31.00000 36.300000 0.000000 38711.000000 max 33.00000 42.130000 2.000000 39611.757700
filt= filt1 | filt2
df[ filt] . head( )
age sex bmi children smoker region charges 1 18 male 33.770 1 no southeast 1725.55230 2 28 male 33.000 3 no southeast 4449.46200 3 33 male 22.705 0 no northwest 21984.47061 4 32 male 28.880 0 no northwest 3866.85520 8 37 male 29.830 2 no northeast 6406.41070
g= df. groupby( 'sex' )
g
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002072C5FD390>
g. describe( )
age bmi ... children charges count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max sex female 662.0 39.503021 14.054223 18.0 27.0 40.0 51.75 64.0 662.0 30.377749 ... 2.0 5.0 662.0 12569.578844 11128.703801 1607.5101 4885.1587 9412.96250 14454.691825 63770.42801 male 676.0 38.917160 14.050141 18.0 26.0 39.0 51.00 64.0 676.0 30.943129 ... 2.0 5.0 676.0 13956.751178 12971.025915 1121.8739 4619.1340 9369.61575 18989.590250 62592.87309
2 rows × 32 columns
g. corr( )
age bmi children charges sex female age 1.000000 0.097214 0.078500 0.324575 bmi 0.097214 1.000000 0.022151 0.161419 children 0.078500 0.022151 1.000000 0.058492 charges 0.324575 0.161419 0.058492 1.000000 male age 1.000000 0.123088 0.008690 0.282369 bmi 0.123088 1.000000 0.002385 0.225847 children 0.008690 0.002385 1.000000 0.074496 charges 0.282369 0.225847 0.074496 1.000000
type ( g)
pandas.core.groupby.generic.DataFrameGroupBy
for gp, df_gp in g:
print ( gp)
print ( df_gp)
print ( '' )
female
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
5 31 female 25.740 0 no southeast 3756.62160
6 46 female 33.440 1 no southeast 8240.58960
7 37 female 27.740 3 no northwest 7281.50560
9 60 female 25.840 0 no northwest 28923.13692
11 62 female 26.290 0 yes southeast 27808.72510
13 56 female 39.820 0 no southeast 11090.71780
16 52 female 30.780 1 no northeast 10797.33620
20 60 female 36.005 0 no northeast 13228.84695
21 30 female 32.400 1 no southwest 4149.73600
23 34 female 31.920 1 yes northeast 37701.87680
25 59 female 27.720 3 no southeast 14001.13380
26 63 female 23.085 0 no northeast 14451.83515
27 55 female 32.775 2 no northwest 12268.63225
31 18 female 26.315 0 no northeast 2198.18985
32 19 female 28.600 5 no southwest 4687.79700
36 62 female 32.965 3 no northwest 15612.19335
40 24 female 26.600 0 no northeast 3046.06200
41 31 female 36.630 2 no southeast 4949.75870
43 37 female 30.800 2 no southeast 6313.75900
46 18 female 38.665 2 no northeast 3393.35635
47 28 female 34.770 0 no northwest 3556.92230
48 60 female 24.530 0 no southeast 12629.89670
50 18 female 35.625 0 no northeast 2211.13075
51 21 female 33.630 2 no northwest 3579.82870
54 40 female 28.690 3 no northwest 8059.67910
56 58 female 31.825 2 no northeast 13607.36875
58 53 female 22.880 1 yes southeast 23244.79020
59 34 female 37.335 2 no northwest 5989.52365
63 28 female 25.935 1 no northwest 4133.64165
... ... ... ... ... ... ... ...
1277 32 female 29.735 0 no northwest 4357.04365
1279 25 female 26.790 2 no northwest 4189.11310
1280 48 female 33.330 0 no southeast 8283.68070
1281 47 female 27.645 2 yes northwest 24535.69855
1282 18 female 21.660 0 yes northeast 14283.45940
1285 47 female 24.320 0 no northeast 8534.67180
1286 28 female 17.290 0 no northeast 3732.62510
1287 36 female 25.900 1 no southwest 5472.44900
1290 38 female 19.950 2 no northeast 7133.90250
1297 28 female 26.510 2 no southeast 4340.44090
1299 19 female 25.745 1 no northwest 2710.82855
1302 25 female 20.800 1 no southwest 3208.78700
1305 24 female 27.720 0 no southeast 2464.61880
1306 29 female 21.850 0 yes northeast 16115.30450
1308 25 female 30.200 0 yes southwest 33900.65300
1311 33 female 26.695 0 no northwest 4571.41305
1313 19 female 34.700 2 yes southwest 36397.57600
1314 30 female 23.655 3 yes northwest 18765.87545
1316 19 female 20.600 0 no southwest 1731.67700
1319 39 female 26.315 2 no northwest 7201.70085
1323 42 female 40.370 2 yes southeast 43896.37630
1326 42 female 32.870 0 no northeast 7050.02130
1328 23 female 24.225 2 no northeast 22395.74424
1330 57 female 25.740 2 no southeast 12629.16560
1331 23 female 33.400 0 no southwest 10795.93733
1332 52 female 44.700 3 no southwest 11411.68500
1334 18 female 31.920 0 no northeast 2205.98080
1335 18 female 36.850 0 no southeast 1629.83350
1336 21 female 25.800 0 no southwest 2007.94500
1337 61 female 29.070 0 yes northwest 29141.36030
[662 rows x 7 columns]
male
age sex bmi children smoker region charges
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
8 37 male 29.830 2 no northeast 6406.41070
10 25 male 26.220 0 no northeast 2721.32080
12 23 male 34.400 0 no southwest 1826.84300
14 27 male 42.130 0 yes southeast 39611.75770
15 19 male 24.600 1 no southwest 1837.23700
17 23 male 23.845 0 no northeast 2395.17155
18 56 male 40.300 0 no southwest 10602.38500
19 30 male 35.300 0 yes southwest 36837.46700
22 18 male 34.100 0 no southeast 1137.01100
24 37 male 28.025 2 no northwest 6203.90175
28 23 male 17.385 1 no northwest 2775.19215
29 31 male 36.300 2 yes southwest 38711.00000
30 22 male 35.600 0 yes southwest 35585.57600
33 63 male 28.310 0 no northwest 13770.09790
34 28 male 36.400 1 yes southwest 51194.55914
35 19 male 20.425 0 no northwest 1625.43375
37 26 male 20.800 0 no southwest 2302.30000
38 35 male 36.670 1 yes northeast 39774.27630
39 60 male 39.900 0 yes southwest 48173.36100
42 41 male 21.780 1 no southeast 6272.47720
44 38 male 37.050 1 no northeast 6079.67150
45 55 male 37.300 0 no southwest 20630.28351
49 36 male 35.200 1 yes southeast 38709.17600
52 48 male 28.000 1 yes southwest 23568.27200
53 36 male 34.430 0 yes southeast 37742.57570
55 58 male 36.955 2 yes northwest 47496.49445
... ... ... ... ... ... ... ...
1283 18 male 30.030 1 no southeast 1720.35370
1284 61 male 36.300 1 yes southwest 47403.88000
1288 20 male 39.400 2 yes southwest 38344.56600
1289 44 male 34.320 1 no southeast 7147.47280
1291 19 male 34.900 0 yes southwest 34828.65400
1292 21 male 23.210 0 no southeast 1515.34490
1293 46 male 25.745 3 no northwest 9301.89355
1294 58 male 25.175 0 no northeast 11931.12525
1295 20 male 22.000 1 no southwest 1964.78000
1296 18 male 26.125 0 no northeast 1708.92575
1298 33 male 27.455 2 no northwest 5261.46945
1300 45 male 30.360 0 yes southeast 62592.87309
1301 62 male 30.875 3 yes northwest 46718.16325
1303 43 male 27.800 0 yes southwest 37829.72420
1304 42 male 24.605 2 yes northeast 21259.37795
1307 32 male 28.120 4 yes northwest 21472.47880
1309 41 male 32.200 2 no southwest 6875.96100
1310 42 male 26.315 1 no northwest 6940.90985
1312 34 male 42.900 1 no southwest 4536.25900
1315 18 male 28.310 1 no northeast 11272.33139
1317 18 male 53.130 0 no southeast 1163.46270
1318 35 male 39.710 4 no northeast 19496.71917
1320 31 male 31.065 3 no northwest 5425.02335
1321 62 male 26.695 0 yes northeast 28101.33305
1322 62 male 38.830 0 no southeast 12981.34570
1324 31 male 25.935 1 no northwest 4239.89265
1325 61 male 33.535 0 no northeast 13143.33665
1327 51 male 30.030 1 no southeast 9377.90470
1329 52 male 38.600 2 no southwest 10325.20600
1333 50 male 30.970 3 no northwest 10600.54830
[676 rows x 7 columns]
g. size( )
sex
female 662
male 676
dtype: int64
g. count( )
age bmi children smoker region charges sex female 662 662 662 662 662 662 male 676 676 676 676 676 676
mymeans= g. mean( )
mymeans
age bmi children charges sex female 39.503021 30.377749 1.074018 12569.578844 male 38.917160 30.943129 1.115385 13956.751178
mymeans[ [ 'age' , 'bmi' , 'children' ] ] . plot( kind= 'barh' ) ;
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-21vYpy4W-1576119946192)(output_75_0.png)]
df_region= df. groupby( 'region' )
df_region[ 'charges' ] . mean( ) . round ( 2 )
region
northeast 13406.38
northwest 12417.58
southeast 14735.41
southwest 12346.94
Name: charges, dtype: float64
df_region. describe( )
age bmi ... children charges count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max region northeast 324.0 39.268519 14.069007 18.0 27.00 39.5 51.0 64.0 324.0 29.173503 ... 2.0 5.0 324.0 13406.384516 11255.803066 1694.7964 5194.322288 10057.652025 16687.3641 58571.07448 northwest 325.0 39.196923 14.051646 19.0 26.00 39.0 51.0 64.0 325.0 29.199785 ... 2.0 5.0 325.0 12417.575374 11072.276928 1621.3402 4719.736550 8965.795750 14711.7438 60021.39897 southeast 364.0 38.939560 14.164585 18.0 26.75 39.0 51.0 64.0 364.0 33.355989 ... 2.0 5.0 364.0 14735.411438 13971.098589 1121.8739 4440.886200 9294.131950 19526.2869 63770.42801 southwest 325.0 39.455385 13.959886 19.0 27.00 39.0 51.0 64.0 325.0 30.596615 ... 2.0 5.0 325.0 12346.937377 11557.179101 1241.5650 4751.070000 8798.593000 13462.5200 52590.82939
4 rows × 32 columns
mylist= [ 'charges' , 'bmi' ]
df_region[ mylist] . describe( ) . round ( 2 )
charges bmi count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max region northeast 324.0 13406.38 11255.80 1694.80 5194.32 10057.65 16687.36 58571.07 324.0 29.17 5.94 15.96 24.87 28.88 32.89 48.07 northwest 325.0 12417.58 11072.28 1621.34 4719.74 8965.80 14711.74 60021.40 325.0 29.20 5.14 17.39 25.74 28.88 32.78 42.94 southeast 364.0 14735.41 13971.10 1121.87 4440.89 9294.13 19526.29 63770.43 364.0 33.36 6.48 19.80 28.57 33.33 37.81 53.13 southwest 325.0 12346.94 11557.18 1241.56 4751.07 8798.59 13462.52 52590.83 325.0 30.60 5.69 17.40 26.90 30.30 34.60 47.60
Multiple Grouping Columns
-This is my first comment
-This is my second comment
mylist = [ 'region' , 'sex' ]
mygroups = df. groupby( mylist)
mygroups
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002072BC8E908>
mygroups[ 'charges' ] . mean( ) . round ( 2 )
region sex
northeast female 12953.20
male 13854.01
northwest female 12479.87
male 12354.12
southeast female 13499.67
male 15879.62
southwest female 11274.41
male 13412.88
Name: charges, dtype: float64
mm= [ 'sex' , 'region' ]
mmgroups = df. groupby( mm)
mmgroups[ 'charges' ] . mean( ) . round ( 2 )
sex region
female northeast 12953.20
northwest 12479.87
southeast 13499.67
southwest 11274.41
male northeast 13854.01
northwest 12354.12
southeast 15879.62
southwest 13412.88
Name: charges, dtype: float64
mytable = df. pivot_table( index = 'sex' , columns = 'region' , values = 'charges' , aggfunc = 'mean' ) . round ( 2 )
mytable
region northeast northwest southeast southwest sex female 12953.20 12479.87 13499.67 11274.41 male 13854.01 12354.12 15879.62 13412.88
df= pd. read_csv( r'C:\Users\yxl22\Desktop\Python\pandas\mycountry.csv' )
df. head( )
country country year POP XRAT tcgdp cc cg 0 Argentina ARG 2000 37335.653 0.999500 2.950722e+05 75.716805 5.578804 1 Australia AUS 2000 19053.186 1.724830 5.418047e+05 67.759026 6.720098 2 India IND 2000 1006300.297 44.941600 1.728144e+06 64.575551 14.072206 3 Israel ISR 2000 6114.570 4.077330 1.292539e+05 64.436451 10.266688 4 Malawi MWI 2000 11801.505 59.543808 5.026222e+03 74.707624 11.658954
df. columns
Index(['country', 'country ', 'year', 'POP', 'XRAT', 'tcgdp', 'cc', 'cg'], dtype='object')
df = df[ [ 'country' , 'POP' , 'tcgdp' ] ]
df. head( )
country POP tcgdp 0 Argentina 37335.653 2.950722e+05 1 Australia 19053.186 5.418047e+05 2 India 1006300.297 1.728144e+06 3 Israel 6114.570 1.292539e+05 4 Malawi 11801.505 5.026222e+03
df= df. set_index( 'country' )
df. head( )
POP tcgdp country Argentina 37335.653 2.950722e+05 Australia 19053.186 5.418047e+05 India 1006300.297 1.728144e+06 Israel 6114.570 1.292539e+05 Malawi 11801.505 5.026222e+03
df. columns
Index(['POP', 'tcgdp'], dtype='object')
df. columns= [ 'population' , 'total_gdp' ]
df. head( )
population total_gdp country Argentina 37335.653 2.950722e+05 Australia 19053.186 5.418047e+05 India 1006300.297 1.728144e+06 Israel 6114.570 1.292539e+05 Malawi 11801.505 5.026222e+03
df[ 'GDP_percap' ] = df[ 'total_gdp' ] * 1000000 / df[ 'population' ]
df. head( )
population total_gdp GDP_percap country Argentina 37335.653 2.950722e+05 7.903229e+06 Australia 19053.186 5.418047e+05 2.843643e+07 India 1006300.297 1.728144e+06 1.717325e+06 Israel 6114.570 1.292539e+05 2.113867e+07 Malawi 11801.505 5.026222e+03 4.258967e+05
df[ 'GDP_percap' ] . plot( kind = 'bar' )
<matplotlib.axes._subplots.AxesSubplot at 0x2072bc8e748>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-265aMTfM-1576119946194)(output_101_1.png)]
df= df. sort_values( by= 'GDP_percap' , ascending = True )
df. head( )
population total_gdp GDP_percap country Malawi 11801.505 5.026222e+03 4.258967e+05 India 1006300.297 1.728144e+06 1.717325e+06 South Africa 45064.098 2.272424e+05 5.042648e+06 Uruguay 3219.793 2.525596e+04 7.843971e+06 Argentina 37335.653 2.950722e+05 7.903229e+06
df[ 'GDP_percap' ] . plot( kind = 'bar' ) ;
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-u0TsPaBD-1576119946194)(output_104_0.png)]
df. GDP_percap. plot( kind = 'barh' , figsize = ( 12 , 6 ) , colormap= 'summer' , title= 'This a hot' )
<matplotlib.axes._subplots.AxesSubplot at 0x2072d8da780>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Hrb1YN1d-1576119946195)(output_105_1.png)]
df[ 'GDP_percap' ] . plot( kind = 'pie' , figsize = ( 12 , 6 ) , colormap= 'Blues' ) ;
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XzceHeJp-1576119946195)(output_106_0.png)]
def times2 ( x) :
return x* 2
df2[ 'double' ] = df2[ 'C2' ] . apply ( times2)
df2. head( )
C1 C2 C3 C4 C5 double R1 99 2 6 1 3 4 R2 9 6 1 0 1 12 R3 9 0 0 9 3 0 R4 4 0 0 4 1 0 R5 7 3 2 4 7 6
def weight ( x) :
if ( x[ 4 ] < 5 ) :
ans = x[ 1 ] + x[ 2 ]
else :
ans = 99
return ans
df2[ 'summary' ] = df2. apply ( weight, axis = 1 )
df2. head( )
C1 C2 C3 C4 C5 double summary R1 99 2 6 1 3 4 8 R2 9 6 1 0 1 12 7 R3 9 0 0 9 3 0 0 R4 4 0 0 4 1 0 0 R5 7 3 2 4 7 6 99
df2. apply ( np. max , axis = 0 )
C1 99
C2 6
C3 8
C4 9
C5 7
double 12
summary 99
dtype: int64
df2. apply ( np. max , axis = 1 )
R1 99
R2 12
R3 9
R4 4
R5 99
R6 99
dtype: int64
dfM = pd. DataFrame( np. arange( 0 , 15 ) . reshape( 3 , 5 ) )
dfM
0 1 2 3 4 0 0 1 2 3 4 1 5 6 7 8 9 2 10 11 12 13 14
dfM. loc[ 1 , 2 ] = np. nan
dfM
0 1 2 3 4 0 0 1 2.0 3 4 1 5 6 NaN 8 9 2 10 11 12.0 13 14
import Quandl
conda. . .