Pandas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
%matplotlib inline
df=pd.read_csv(r'C:\Users\yxl22\Desktop\Python\pandas\insurance.csv') #读入文件并且命名数据集
#Pandas使用一个二维的数据结构DataFrame来表示表格式的数据
type(df) #DataFrame来自frame模块,而frame模块是属于core模块的,同时pandas包又是由core模块等组成
pandas.core.frame.DataFrame
df.head() #默认取前五个,显示索引index
agesexbmichildrensmokerregioncharges
019female27.9000yessouthwest16884.92400
118male33.7701nosoutheast1725.55230
228male33.0003nosoutheast4449.46200
333male22.7050nonorthwest21984.47061
432male28.8800nonorthwest3866.85520
df.head(n=2) #返回前N条
agesexbmichildrensmokerregioncharges
019female27.900yessouthwest16884.9240
118male33.771nosoutheast1725.5523
df.tail() #默认取后五个
agesexbmichildrensmokerregioncharges
133350male30.973nonorthwest10600.5483
133418female31.920nonortheast2205.9808
133518female36.850nosoutheast1629.8335
133621female25.800nosouthwest2007.9450
133761female29.070yesnorthwest29141.3603
df.shape #返回tuple,行x列
(1338, 7)
#关注表中的黑体字
df.index #不包括1338,label索引名,返回Index类型的索引的集合(行索引)
RangeIndex(start=0, stop=1338, step=1)
df.columns #列名,返回Index类型的列的集合(列索引)
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
df.values #值的二维数组,以numpy.ndarray矩阵返回
array([[19, 'female', 27.9, ..., 'yes', 'southwest', 16884.924],
       [18, 'male', 33.77, ..., 'no', 'southeast', 1725.5523],
       [28, 'male', 33.0, ..., 'no', 'southeast', 4449.462],
       ...,
       [18, 'female', 36.85, ..., 'no', 'southeast', 1629.8335],
       [21, 'female', 25.8, ..., 'no', 'southwest', 2007.945],
       [61, 'female', 29.07, ..., 'yes', 'northwest', 29141.3603]],
      dtype=object)
myvalues=df.values
type(myvalues)
numpy.ndarray
np.random.seed(123)
mydata=np.random.randint(0,10,30).reshape(6,5)#0到10的10个随机数,分成6x5的矩阵
mydata
array([[2, 2, 6, 1, 3],
       [9, 6, 1, 0, 1],
       [9, 0, 0, 9, 3],
       [4, 0, 0, 4, 1],
       [7, 3, 2, 4, 7],
       [2, 4, 8, 0, 7]])
myindex='R1 R2 R3 R4 R5 R6'.split()
myindex
['R1', 'R2', 'R3', 'R4', 'R5', 'R6']
mycols='C1 C2 C3 C4 C5'.split()
mycols
['C1', 'C2', 'C3', 'C4', 'C5']
#创建一个新的dataframe,结合mydata
df2=pd.DataFrame(mydata,index=myindex,columns=mycols)#index是行名,columns是列名
type(df2)
pandas.core.frame.DataFrame
df2.index #list of row
Index(['R1', 'R2', 'R3', 'R4', 'R5', 'R6'], dtype='object')
df2.columns
Index(['C1', 'C2', 'C3', 'C4', 'C5'], dtype='object')
df2.values==mydata
array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])
df.head()
agesexbmichildrensmokerregioncharges
019female27.9000yessouthwest16884.92400
118male33.7701nosoutheast1725.55230
228male33.0003nosoutheast4449.46200
333male22.7050nonorthwest21984.47061
432male28.8800nonorthwest3866.85520
charge=df['charges']
type(charge) #series, just one column of DataFrame
pandas.core.series.Series
charge.head()#index+value
0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64
df.dtypes #显示各种列的种类
age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object
df.info()#overall information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.2+ KB
df.describe()#默认只输出数值型数据的统计信息
agebmichildrencharges
count1338.0000001338.0000001338.0000001338.000000
mean39.20702530.6633971.09491813270.422265
std14.0499606.0981871.20549312110.011237
min18.00000015.9600000.0000001121.873900
25%27.00000026.2962500.0000004740.287150
50%39.00000030.4000001.0000009382.033000
75%51.00000034.6937502.00000016639.912515
max64.00000053.1300005.00000063770.428010
df=df.iloc[0:10,:]#从第一行到第10行,所有列
df.iloc[:,0:2]#从第一列和第二列
agesex
019female
118male
228male
333male
432male
531female
646female
737female
837male
960female
df
agesexbmichildrensmokerregioncharges
019female27.9000yessouthwest16884.92400
118male33.7701nosoutheast1725.55230
228male33.0003nosoutheast4449.46200
333male22.7050nonorthwest21984.47061
432male28.8800nonorthwest3866.85520
531female25.7400nosoutheast3756.62160
646female33.4401nosoutheast8240.58960
737female27.7403nonorthwest7281.50560
837male29.8302nonortheast6406.41070
960female25.8400nonorthwest28923.13692
mycols=['sex','region','charges']
df[mycols]#只要其中指定的三列,但是并没有存
sexregioncharges
0femalesouthwest16884.92400
1malesoutheast1725.55230
2malesoutheast4449.46200
3malenorthwest21984.47061
4malenorthwest3866.85520
5femalesoutheast3756.62160
6femalesoutheast8240.58960
7femalenorthwest7281.50560
8malenortheast6406.41070
9femalenorthwest28923.13692
df #但是并没有存
agesexbmichildrensmokerregioncharges
019female27.9000yessouthwest16884.92400
118male33.7701nosoutheast1725.55230
228male33.0003nosoutheast4449.46200
333male22.7050nonorthwest21984.47061
432male28.8800nonorthwest3866.85520
531female25.7400nosoutheast3756.62160
646female33.4401nosoutheast8240.58960
737female27.7403nonorthwest7281.50560
837male29.8302nonortheast6406.41070
960female25.8400nonorthwest28923.13692
dfsmall=df[mycols] #新定义了才存上
dfsmall
sexregioncharges
0femalesouthwest16884.92400
1malesoutheast1725.55230
2malesoutheast4449.46200
3malenorthwest21984.47061
4malenorthwest3866.85520
5femalesoutheast3756.62160
6femalesoutheast8240.58960
7femalenorthwest7281.50560
8malenortheast6406.41070
9femalenorthwest28923.13692
mycols=['C2','C4']
df2[mycols]#只能这种形式
C2C4
R121
R260
R309
R404
R534
R640
df2.loc[:,['C2','C4']]
C2C4
R121
R260
R309
R404
R534
R640
myrows=['R1','R5']
df2.loc[myrows,:]
C1C2C3C4C5
R122613
R573247
df2
C1C2C3C4C5
R122613
R296101
R390093
R440041
R573247
R624807
df2.loc[:,'C1':'C4']#包括C4
#loc利用index的名称,来获取想要的行(或列)
#iloc利用index的具体位置(所以它只能是整数型参数)
C1C2C3C4
R12261
R29610
R39009
R44004
R57324
R62480
df2.loc[['R1','R5'],['C2','C4','C5']]
C2C4C5
R1213
R5347
df2.loc['R1':'R6':2,'C1':'C5':2]
C1C3C5
R1263
R3903
R5727
df2 #还是不变
C1C2C3C4C5
R122613
R296101
R390093
R440041
R573247
R624807
myrows=[1,3]
df2.iloc[myrows,:]
C1C2C3C4C5
R296101
R440041
myrows=[1,3]
df2.iloc[:,myrows]
C2C4
R121
R260
R309
R404
R534
R640
myrows=[1,3]
df2.iloc[myrows]
C1C2C3C4C5
R296101
R440041
df2.iloc[4,2]
2
df2.iloc[0,0]=99 #赋值
df2
C1C2C3C4C5
R1992613
R296101
R390093
R440041
R573247
R624807
df2
C1C2C3C4C5
R1992613
R296101
R390093
R440041
R573247
R624807
df2['mysum']=df2['C1']+df2['C2'] #create a new column
df2
C1C2C3C4C5mysum
R1992613101
R29610115
R3900939
R4400414
R57324710
R6248076
df2=df2.drop('mysum',axis=1)#再扔掉这一行,只有drop类函数才会这么反着axis
df2
C1C2C3C4C5
R1992613
R296101
R390093
R440041
R573247
R624807
df=pd.read_csv(r'C:\Users\yxl22\Desktop\Python\pandas\insurance.csv')#重新读入
df.head()
agesexbmichildrensmokerregioncharges
019female27.9000yessouthwest16884.92400
118male33.7701nosoutheast1725.55230
228male33.0003nosoutheast4449.46200
333male22.7050nonorthwest21984.47061
432male28.8800nonorthwest3866.85520
df.describe()
agebmichildrencharges
count1338.0000001338.0000001338.0000001338.000000
mean39.20702530.6633971.09491813270.422265
std14.0499606.0981871.20549312110.011237
min18.00000015.9600000.0000001121.873900
25%27.00000026.2962500.0000004740.287150
50%39.00000030.4000001.0000009382.033000
75%51.00000034.6937502.00000016639.912515
max64.00000053.1300005.00000063770.428010
df['charges']
0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
5        3756.62160
6        8240.58960
7        7281.50560
8        6406.41070
9       28923.13692
10       2721.32080
11      27808.72510
12       1826.84300
13      11090.71780
14      39611.75770
15       1837.23700
16      10797.33620
17       2395.17155
18      10602.38500
19      36837.46700
20      13228.84695
21       4149.73600
22       1137.01100
23      37701.87680
24       6203.90175
25      14001.13380
26      14451.83515
27      12268.63225
28       2775.19215
29      38711.00000
           ...     
1308    33900.65300
1309     6875.96100
1310     6940.90985
1311     4571.41305
1312     4536.25900
1313    36397.57600
1314    18765.87545
1315    11272.33139
1316     1731.67700
1317     1163.46270
1318    19496.71917
1319     7201.70085
1320     5425.02335
1321    28101.33305
1322    12981.34570
1323    43896.37630
1324     4239.89265
1325    13143.33665
1326     7050.02130
1327     9377.90470
1328    22395.74424
1329    10325.20600
1330    12629.16560
1331    10795.93733
1332    11411.68500
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64
filt20=df['charges']>20000 #是否大于20000
filt20.head()
0    False
1    False
2    False
3     True
4    False
Name: charges, dtype: bool
df20=df[filt20] #剔除false的
df20.head()
agesexbmichildrensmokerregioncharges
333male22.7050nonorthwest21984.47061
960female25.8400nonorthwest28923.13692
1162female26.2900yessoutheast27808.72510
1427male42.1300yessoutheast39611.75770
1930male35.3000yessouthwest36837.46700
filt1=df['charges']>20000
filt2=df['sex']=='male'
filt=filt1 & filt2  #且
m20=df[filt].head()
m20
agesexbmichildrensmokerregioncharges
333male22.7050nonorthwest21984.47061
1427male42.1300yessoutheast39611.75770
1930male35.3000yessouthwest36837.46700
2931male36.3002yessouthwest38711.00000
3022male35.6000yessouthwest35585.57600
m20.describe()
agebmichildrencharges
count5.000005.0000005.0000005.000000
mean28.6000034.4070000.40000034546.054262
std4.277857.1131160.8944277196.052493
min22.0000022.7050000.00000021984.470610
25%27.0000035.3000000.00000035585.576000
50%30.0000035.6000000.00000036837.467000
75%31.0000036.3000000.00000038711.000000
max33.0000042.1300002.00000039611.757700
filt=filt1 | filt2 #或
df[filt].head()
agesexbmichildrensmokerregioncharges
118male33.7701nosoutheast1725.55230
228male33.0003nosoutheast4449.46200
333male22.7050nonorthwest21984.47061
432male28.8800nonorthwest3866.85520
837male29.8302nonortheast6406.41070
g=df.groupby('sex') #分组
g #like dict
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002072C5FD390>
g.describe()
agebmi...childrencharges
countmeanstdmin25%50%75%maxcountmean...75%maxcountmeanstdmin25%50%75%max
sex
female662.039.50302114.05422318.027.040.051.7564.0662.030.377749...2.05.0662.012569.57884411128.7038011607.51014885.15879412.9625014454.69182563770.42801
male676.038.91716014.05014118.026.039.051.0064.0676.030.943129...2.05.0676.013956.75117812971.0259151121.87394619.13409369.6157518989.59025062592.87309

2 rows × 32 columns

g.corr()
agebmichildrencharges
sex
femaleage1.0000000.0972140.0785000.324575
bmi0.0972141.0000000.0221510.161419
children0.0785000.0221511.0000000.058492
charges0.3245750.1614190.0584921.000000
maleage1.0000000.1230880.0086900.282369
bmi0.1230881.0000000.0023850.225847
children0.0086900.0023851.0000000.074496
charges0.2823690.2258470.0744961.000000
type(g)
pandas.core.groupby.generic.DataFrameGroupBy
for gp,df_gp in g:
    print(gp)#显示分类
    print(df_gp)#显示具体数据
    print('')
female
      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
5      31  female  25.740         0     no  southeast   3756.62160
6      46  female  33.440         1     no  southeast   8240.58960
7      37  female  27.740         3     no  northwest   7281.50560
9      60  female  25.840         0     no  northwest  28923.13692
11     62  female  26.290         0    yes  southeast  27808.72510
13     56  female  39.820         0     no  southeast  11090.71780
16     52  female  30.780         1     no  northeast  10797.33620
20     60  female  36.005         0     no  northeast  13228.84695
21     30  female  32.400         1     no  southwest   4149.73600
23     34  female  31.920         1    yes  northeast  37701.87680
25     59  female  27.720         3     no  southeast  14001.13380
26     63  female  23.085         0     no  northeast  14451.83515
27     55  female  32.775         2     no  northwest  12268.63225
31     18  female  26.315         0     no  northeast   2198.18985
32     19  female  28.600         5     no  southwest   4687.79700
36     62  female  32.965         3     no  northwest  15612.19335
40     24  female  26.600         0     no  northeast   3046.06200
41     31  female  36.630         2     no  southeast   4949.75870
43     37  female  30.800         2     no  southeast   6313.75900
46     18  female  38.665         2     no  northeast   3393.35635
47     28  female  34.770         0     no  northwest   3556.92230
48     60  female  24.530         0     no  southeast  12629.89670
50     18  female  35.625         0     no  northeast   2211.13075
51     21  female  33.630         2     no  northwest   3579.82870
54     40  female  28.690         3     no  northwest   8059.67910
56     58  female  31.825         2     no  northeast  13607.36875
58     53  female  22.880         1    yes  southeast  23244.79020
59     34  female  37.335         2     no  northwest   5989.52365
63     28  female  25.935         1     no  northwest   4133.64165
...   ...     ...     ...       ...    ...        ...          ...
1277   32  female  29.735         0     no  northwest   4357.04365
1279   25  female  26.790         2     no  northwest   4189.11310
1280   48  female  33.330         0     no  southeast   8283.68070
1281   47  female  27.645         2    yes  northwest  24535.69855
1282   18  female  21.660         0    yes  northeast  14283.45940
1285   47  female  24.320         0     no  northeast   8534.67180
1286   28  female  17.290         0     no  northeast   3732.62510
1287   36  female  25.900         1     no  southwest   5472.44900
1290   38  female  19.950         2     no  northeast   7133.90250
1297   28  female  26.510         2     no  southeast   4340.44090
1299   19  female  25.745         1     no  northwest   2710.82855
1302   25  female  20.800         1     no  southwest   3208.78700
1305   24  female  27.720         0     no  southeast   2464.61880
1306   29  female  21.850         0    yes  northeast  16115.30450
1308   25  female  30.200         0    yes  southwest  33900.65300
1311   33  female  26.695         0     no  northwest   4571.41305
1313   19  female  34.700         2    yes  southwest  36397.57600
1314   30  female  23.655         3    yes  northwest  18765.87545
1316   19  female  20.600         0     no  southwest   1731.67700
1319   39  female  26.315         2     no  northwest   7201.70085
1323   42  female  40.370         2    yes  southeast  43896.37630
1326   42  female  32.870         0     no  northeast   7050.02130
1328   23  female  24.225         2     no  northeast  22395.74424
1330   57  female  25.740         2     no  southeast  12629.16560
1331   23  female  33.400         0     no  southwest  10795.93733
1332   52  female  44.700         3     no  southwest  11411.68500
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[662 rows x 7 columns]

male
      age   sex     bmi  children smoker     region      charges
1      18  male  33.770         1     no  southeast   1725.55230
2      28  male  33.000         3     no  southeast   4449.46200
3      33  male  22.705         0     no  northwest  21984.47061
4      32  male  28.880         0     no  northwest   3866.85520
8      37  male  29.830         2     no  northeast   6406.41070
10     25  male  26.220         0     no  northeast   2721.32080
12     23  male  34.400         0     no  southwest   1826.84300
14     27  male  42.130         0    yes  southeast  39611.75770
15     19  male  24.600         1     no  southwest   1837.23700
17     23  male  23.845         0     no  northeast   2395.17155
18     56  male  40.300         0     no  southwest  10602.38500
19     30  male  35.300         0    yes  southwest  36837.46700
22     18  male  34.100         0     no  southeast   1137.01100
24     37  male  28.025         2     no  northwest   6203.90175
28     23  male  17.385         1     no  northwest   2775.19215
29     31  male  36.300         2    yes  southwest  38711.00000
30     22  male  35.600         0    yes  southwest  35585.57600
33     63  male  28.310         0     no  northwest  13770.09790
34     28  male  36.400         1    yes  southwest  51194.55914
35     19  male  20.425         0     no  northwest   1625.43375
37     26  male  20.800         0     no  southwest   2302.30000
38     35  male  36.670         1    yes  northeast  39774.27630
39     60  male  39.900         0    yes  southwest  48173.36100
42     41  male  21.780         1     no  southeast   6272.47720
44     38  male  37.050         1     no  northeast   6079.67150
45     55  male  37.300         0     no  southwest  20630.28351
49     36  male  35.200         1    yes  southeast  38709.17600
52     48  male  28.000         1    yes  southwest  23568.27200
53     36  male  34.430         0    yes  southeast  37742.57570
55     58  male  36.955         2    yes  northwest  47496.49445
...   ...   ...     ...       ...    ...        ...          ...
1283   18  male  30.030         1     no  southeast   1720.35370
1284   61  male  36.300         1    yes  southwest  47403.88000
1288   20  male  39.400         2    yes  southwest  38344.56600
1289   44  male  34.320         1     no  southeast   7147.47280
1291   19  male  34.900         0    yes  southwest  34828.65400
1292   21  male  23.210         0     no  southeast   1515.34490
1293   46  male  25.745         3     no  northwest   9301.89355
1294   58  male  25.175         0     no  northeast  11931.12525
1295   20  male  22.000         1     no  southwest   1964.78000
1296   18  male  26.125         0     no  northeast   1708.92575
1298   33  male  27.455         2     no  northwest   5261.46945
1300   45  male  30.360         0    yes  southeast  62592.87309
1301   62  male  30.875         3    yes  northwest  46718.16325
1303   43  male  27.800         0    yes  southwest  37829.72420
1304   42  male  24.605         2    yes  northeast  21259.37795
1307   32  male  28.120         4    yes  northwest  21472.47880
1309   41  male  32.200         2     no  southwest   6875.96100
1310   42  male  26.315         1     no  northwest   6940.90985
1312   34  male  42.900         1     no  southwest   4536.25900
1315   18  male  28.310         1     no  northeast  11272.33139
1317   18  male  53.130         0     no  southeast   1163.46270
1318   35  male  39.710         4     no  northeast  19496.71917
1320   31  male  31.065         3     no  northwest   5425.02335
1321   62  male  26.695         0    yes  northeast  28101.33305
1322   62  male  38.830         0     no  southeast  12981.34570
1324   31  male  25.935         1     no  northwest   4239.89265
1325   61  male  33.535         0     no  northeast  13143.33665
1327   51  male  30.030         1     no  southeast   9377.90470
1329   52  male  38.600         2     no  southwest  10325.20600
1333   50  male  30.970         3     no  northwest  10600.54830

[676 rows x 7 columns]
g.size() #不能用shape
sex
female    662
male      676
dtype: int64
g.count() #无missing value
agebmichildrensmokerregioncharges
sex
female662662662662662662
male676676676676676676
mymeans=g.mean()
mymeans#index是性别
agebmichildrencharges
sex
female39.50302130.3777491.07401812569.578844
male38.91716030.9431291.11538513956.751178
mymeans[['age','bmi','children']].plot(kind='barh');

在这里插入图片描述

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-21vYpy4W-1576119946192)(output_75_0.png)]

df_region=df.groupby('region')
df_region['charges'].mean().round(2)
region
northeast    13406.38
northwest    12417.58
southeast    14735.41
southwest    12346.94
Name: charges, dtype: float64
df_region.describe()
agebmi...childrencharges
countmeanstdmin25%50%75%maxcountmean...75%maxcountmeanstdmin25%50%75%max
region
northeast324.039.26851914.06900718.027.0039.551.064.0324.029.173503...2.05.0324.013406.38451611255.8030661694.79645194.32228810057.65202516687.364158571.07448
northwest325.039.19692314.05164619.026.0039.051.064.0325.029.199785...2.05.0325.012417.57537411072.2769281621.34024719.7365508965.79575014711.743860021.39897
southeast364.038.93956014.16458518.026.7539.051.064.0364.033.355989...2.05.0364.014735.41143813971.0985891121.87394440.8862009294.13195019526.286963770.42801
southwest325.039.45538513.95988619.027.0039.051.064.0325.030.596615...2.05.0325.012346.93737711557.1791011241.56504751.0700008798.59300013462.520052590.82939

4 rows × 32 columns

mylist=['charges','bmi']
df_region[mylist].describe().round(2)
chargesbmi
countmeanstdmin25%50%75%maxcountmeanstdmin25%50%75%max
region
northeast324.013406.3811255.801694.805194.3210057.6516687.3658571.07324.029.175.9415.9624.8728.8832.8948.07
northwest325.012417.5811072.281621.344719.748965.8014711.7460021.40325.029.205.1417.3925.7428.8832.7842.94
southeast364.014735.4113971.101121.874440.899294.1319526.2963770.43364.033.366.4819.8028.5733.3337.8153.13
southwest325.012346.9411557.181241.564751.078798.5913462.5252590.83325.030.605.6917.4026.9030.3034.6047.60

Multiple Grouping Columns

-This is my first comment
-This is my second comment
mylist = ['region', 'sex']
mygroups = df.groupby(mylist)
mygroups
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002072BC8E908>
mygroups['charges'].mean().round(2)
region     sex   
northeast  female    12953.20
           male      13854.01
northwest  female    12479.87
           male      12354.12
southeast  female    13499.67
           male      15879.62
southwest  female    11274.41
           male      13412.88
Name: charges, dtype: float64
mm=['sex', 'region']
mmgroups = df.groupby(mm)
mmgroups['charges'].mean().round(2)
sex     region   
female  northeast    12953.20
        northwest    12479.87
        southeast    13499.67
        southwest    11274.41
male    northeast    13854.01
        northwest    12354.12
        southeast    15879.62
        southwest    13412.88
Name: charges, dtype: float64
mytable =df.pivot_table(index = 'sex', columns = 'region', values = 'charges', aggfunc = 'mean').round(2) #aggfun指的是想要的数据
mytable
regionnortheastnorthwestsoutheastsouthwest
sex
female12953.2012479.8713499.6711274.41
male13854.0112354.1215879.6213412.88
df=pd.read_csv(r'C:\Users\yxl22\Desktop\Python\pandas\mycountry.csv') #读入文件并且命名数据集
df.head()
countrycountryyearPOPXRATtcgdpcccg
0ArgentinaARG200037335.6530.9995002.950722e+0575.7168055.578804
1AustraliaAUS200019053.1861.7248305.418047e+0567.7590266.720098
2IndiaIND20001006300.29744.9416001.728144e+0664.57555114.072206
3IsraelISR20006114.5704.0773301.292539e+0564.43645110.266688
4MalawiMWI200011801.50559.5438085.026222e+0374.70762411.658954
df.columns
Index(['country', 'country ', 'year', 'POP', 'XRAT', 'tcgdp', 'cc', 'cg'], dtype='object')
df = df[['country', 'POP', 'tcgdp']]
df.head()
countryPOPtcgdp
0Argentina37335.6532.950722e+05
1Australia19053.1865.418047e+05
2India1006300.2971.728144e+06
3Israel6114.5701.292539e+05
4Malawi11801.5055.026222e+03
df=df.set_index('country') #country变成了index
df.head()
POPtcgdp
country
Argentina37335.6532.950722e+05
Australia19053.1865.418047e+05
India1006300.2971.728144e+06
Israel6114.5701.292539e+05
Malawi11801.5055.026222e+03
df.columns #country就被剔除了column中
Index(['POP', 'tcgdp'], dtype='object')
df.columns=['population','total_gdp'] #重新命名
df.head()
populationtotal_gdp
country
Argentina37335.6532.950722e+05
Australia19053.1865.418047e+05
India1006300.2971.728144e+06
Israel6114.5701.292539e+05
Malawi11801.5055.026222e+03
df['GDP_percap']=df['total_gdp']*1000000/df['population']
df.head()
populationtotal_gdpGDP_percap
country
Argentina37335.6532.950722e+057.903229e+06
Australia19053.1865.418047e+052.843643e+07
India1006300.2971.728144e+061.717325e+06
Israel6114.5701.292539e+052.113867e+07
Malawi11801.5055.026222e+034.258967e+05
df['GDP_percap'].plot(kind = 'bar') #横坐标是index=country
<matplotlib.axes._subplots.AxesSubplot at 0x2072bc8e748>

在这里插入图片描述
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-265aMTfM-1576119946194)(output_101_1.png)]

df=df.sort_values(by='GDP_percap', ascending = True)#降序
df.head()
populationtotal_gdpGDP_percap
country
Malawi11801.5055.026222e+034.258967e+05
India1006300.2971.728144e+061.717325e+06
South Africa45064.0982.272424e+055.042648e+06
Uruguay3219.7932.525596e+047.843971e+06
Argentina37335.6532.950722e+057.903229e+06
df['GDP_percap'].plot(kind = 'bar');

在这里插入图片描述
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-u0TsPaBD-1576119946194)(output_104_0.png)]

df.GDP_percap.plot(kind = 'barh', figsize =(12,6),colormap='summer', title='This a hot') #横图
<matplotlib.axes._subplots.AxesSubplot at 0x2072d8da780>

在这里插入图片描述

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Hrb1YN1d-1576119946195)(output_105_1.png)]

df['GDP_percap'].plot(kind = 'pie',figsize =(12,6),colormap='Blues');

在这里插入图片描述
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XzceHeJp-1576119946195)(output_106_0.png)]

#https://blog.csdn.net/h_hxx/article/details/90635650
def times2(x):
    return x*2
df2['double']= df2['C2'].apply(times2) #应用函数
df2.head()
C1C2C3C4C5double
R19926134
R29610112
R3900930
R4400410
R5732476
def weight(x):
    if(x[4]<5):
        ans =x[1] +x[2]
    else:
        ans = 99
    return ans
df2['summary']=df2.apply(weight, axis = 1)#行
df2.head()
C1C2C3C4C5doublesummary
R199261348
R296101127
R39009300
R44004100
R573247699
df2.apply(np.max, axis = 0)
C1         99
C2          6
C3          8
C4          9
C5          7
double     12
summary    99
dtype: int64
df2.apply(np.max, axis = 1)
R1    99
R2    12
R3     9
R4     4
R5    99
R6    99
dtype: int64
dfM = pd.DataFrame(np.arange(0,15).reshape(3,5))
dfM
01234
001234
156789
21011121314
dfM.loc[1,2]=np.nan
dfM
01234
0012.034
156NaN89
2101112.01314
# Quandl 
import Quandl
conda...
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值