Pandas

最新推荐文章于 2022-04-19 23:07:50 发布

weixin_41656898

最新推荐文章于 2022-04-19 23:07:50 发布

阅读量310

点赞数

本文链接：https://blog.csdn.net/weixin_41656898/article/details/103505557

版权

import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
%matplotlib inline

df=pd.read_csv(r'C:\Users\yxl22\Desktop\Python\pandas\insurance.csv') #读入文件并且命名数据集

#Pandas使用一个二维的数据结构DataFrame来表示表格式的数据

type(df) #DataFrame来自frame模块，而frame模块是属于core模块的，同时pandas包又是由core模块等组成

pandas.core.frame.DataFrame

df.head() #默认取前五个,显示索引index

	age	sex	bmi	children	smoker	region	charges
0	19	female	27.900	0	yes	southwest	16884.92400
1	18	male	33.770	1	no	southeast	1725.55230
2	28	male	33.000	3	no	southeast	4449.46200
3	33	male	22.705	0	no	northwest	21984.47061
4	32	male	28.880	0	no	northwest	3866.85520

df.head(n=2) #返回前N条

	age	sex	bmi	children	smoker	region	charges
0	19	female	27.90	0	yes	southwest	16884.9240
1	18	male	33.77	1	no	southeast	1725.5523

df.tail() #默认取后五个

	age	sex	bmi	children	smoker	region	charges
1333	50	male	30.97	3	no	northwest	10600.5483
1334	18	female	31.92	0	no	northeast	2205.9808
1335	18	female	36.85	0	no	southeast	1629.8335
1336	21	female	25.80	0	no	southwest	2007.9450
1337	61	female	29.07	0	yes	northwest	29141.3603

df.shape #返回tuple，行x列

(1338, 7)

#关注表中的黑体字

df.index #不包括1338，label索引名，返回Index类型的索引的集合(行索引)

RangeIndex(start=0, stop=1338, step=1)

df.columns #列名，返回Index类型的列的集合(列索引)

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

df.values #值的二维数组，以numpy.ndarray矩阵返回

array([[19, 'female', 27.9, ..., 'yes', 'southwest', 16884.924],
       [18, 'male', 33.77, ..., 'no', 'southeast', 1725.5523],
       [28, 'male', 33.0, ..., 'no', 'southeast', 4449.462],
       ...,
       [18, 'female', 36.85, ..., 'no', 'southeast', 1629.8335],
       [21, 'female', 25.8, ..., 'no', 'southwest', 2007.945],
       [61, 'female', 29.07, ..., 'yes', 'northwest', 29141.3603]],
      dtype=object)

myvalues=df.values

type(myvalues)

numpy.ndarray

np.random.seed(123)
mydata=np.random.randint(0,10,30).reshape(6,5)#0到10的10个随机数，分成6x5的矩阵
mydata

array([[2, 2, 6, 1, 3],
       [9, 6, 1, 0, 1],
       [9, 0, 0, 9, 3],
       [4, 0, 0, 4, 1],
       [7, 3, 2, 4, 7],
       [2, 4, 8, 0, 7]])

myindex='R1 R2 R3 R4 R5 R6'.split()
myindex

['R1', 'R2', 'R3', 'R4', 'R5', 'R6']

mycols='C1 C2 C3 C4 C5'.split()
mycols

['C1', 'C2', 'C3', 'C4', 'C5']

#创建一个新的dataframe，结合mydata

df2=pd.DataFrame(mydata,index=myindex,columns=mycols)#index是行名,columns是列名

type(df2)

pandas.core.frame.DataFrame

df2.index #list of row

Index(['R1', 'R2', 'R3', 'R4', 'R5', 'R6'], dtype='object')

df2.columns

Index(['C1', 'C2', 'C3', 'C4', 'C5'], dtype='object')

df2.values==mydata

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

df.head()

	age	sex	bmi	children	smoker	region	charges
0	19	female	27.900	0	yes	southwest	16884.92400
1	18	male	33.770	1	no	southeast	1725.55230
2	28	male	33.000	3	no	southeast	4449.46200
3	33	male	22.705	0	no	northwest	21984.47061
4	32	male	28.880	0	no	northwest	3866.85520

charge=df['charges']

type(charge) #series, just one column of DataFrame

pandas.core.series.Series

charge.head()#index+value

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

df.dtypes #显示各种列的种类

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

df.info()#overall information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.2+ KB

df.describe()#默认只输出数值型数据的统计信息

	age	bmi	children	charges
count	1338.000000	1338.000000	1338.000000	1338.000000
mean	39.207025	30.663397	1.094918	13270.422265
std	14.049960	6.098187	1.205493	12110.011237
min	18.000000	15.960000	0.000000	1121.873900
25%	27.000000	26.296250	0.000000	4740.287150
50%	39.000000	30.400000	1.000000	9382.033000
75%	51.000000	34.693750	2.000000	16639.912515
max	64.000000	53.130000	5.000000	63770.428010

df=df.iloc[0:10,:]#从第一行到第10行，所有列

df.iloc[:,0:2]#从第一列和第二列

	age	sex
0	19	female
1	18	male
2	28	male
3	33	male
4	32	male
5	31	female
6	46	female
7	37	female
8	37	male
9	60	female

df

	age	sex	bmi	children	smoker	region	charges
0	19	female	27.900	0	yes	southwest	16884.92400
1	18	male	33.770	1	no	southeast	1725.55230
2	28	male	33.000	3	no	southeast	4449.46200
3	33	male	22.705	0	no	northwest	21984.47061
4	32	male	28.880	0	no	northwest	3866.85520
5	31	female	25.740	0	no	southeast	3756.62160
6	46	female	33.440	1	no	southeast	8240.58960
7	37	female	27.740	3	no	northwest	7281.50560
8	37	male	29.830	2	no	northeast	6406.41070
9	60	female	25.840	0	no	northwest	28923.13692

mycols=['sex','region','charges']

df[mycols]#只要其中指定的三列，但是并没有存

	sex	region	charges
0	female	southwest	16884.92400
1	male	southeast	1725.55230
2	male	southeast	4449.46200
3	male	northwest	21984.47061
4	male	northwest	3866.85520
5	female	southeast	3756.62160
6	female	southeast	8240.58960
7	female	northwest	7281.50560
8	male	northeast	6406.41070
9	female	northwest	28923.13692

df #但是并没有存

	age	sex	bmi	children	smoker	region	charges
0	19	female	27.900	0	yes	southwest	16884.92400
1	18	male	33.770	1	no	southeast	1725.55230
2	28	male	33.000	3	no	southeast	4449.46200
3	33	male	22.705	0	no	northwest	21984.47061
4	32	male	28.880	0	no	northwest	3866.85520
5	31	female	25.740	0	no	southeast	3756.62160
6	46	female	33.440	1	no	southeast	8240.58960
7	37	female	27.740	3	no	northwest	7281.50560
8	37	male	29.830	2	no	northeast	6406.41070
9	60	female	25.840	0	no	northwest	28923.13692

dfsmall=df[mycols] #新定义了才存上

dfsmall

	sex	region	charges
0	female	southwest	16884.92400
1	male	southeast	1725.55230
2	male	southeast	4449.46200
3	male	northwest	21984.47061
4	male	northwest	3866.85520
5	female	southeast	3756.62160
6	female	southeast	8240.58960
7	female	northwest	7281.50560
8	male	northeast	6406.41070
9	female	northwest	28923.13692

mycols=['C2','C4']
df2[mycols]#只能这种形式

	C2	C4
R1	2	1
R2	6	0
R3	0	9
R4	0	4
R5	3	4
R6	4	0

df2.loc[:,['C2','C4']]

	C2	C4
R1	2	1
R2	6	0
R3	0	9
R4	0	4
R5	3	4
R6	4	0

myrows=['R1','R5']

df2.loc[myrows,:]

	C1	C2	C3	C4	C5
R1	2	2	6	1	3
R5	7	3	2	4	7

df2

	C1	C2	C3	C4	C5
R1	2	2	6	1	3
R2	9	6	1	0	1
R3	9	0	0	9	3
R4	4	0	0	4	1
R5	7	3	2	4	7
R6	2	4	8	0	7

df2.loc[:,'C1':'C4']#包括C4
#loc利用index的名称，来获取想要的行（或列）
#iloc利用index的具体位置（所以它只能是整数型参数）

	C1	C2	C3	C4
R1	2	2	6	1
R2	9	6	1	0
R3	9	0	0	9
R4	4	0	0	4
R5	7	3	2	4
R6	2	4	8	0

df2.loc[['R1','R5'],['C2','C4','C5']]

	C2	C4	C5
R1	2	1	3
R5	3	4	7

df2.loc['R1':'R6':2,'C1':'C5':2]

	C1	C3	C5
R1	2	6	3
R3	9	0	3
R5	7	2	7

df2 #还是不变

	C1	C2	C3	C4	C5
R1	2	2	6	1	3
R2	9	6	1	0	1
R3	9	0	0	9	3
R4	4	0	0	4	1
R5	7	3	2	4	7
R6	2	4	8	0	7

myrows=[1,3]
df2.iloc[myrows,:]

	C1	C2	C3	C4	C5
R2	9	6	1	0	1
R4	4	0	0	4	1

myrows=[1,3]
df2.iloc[:,myrows]

	C2	C4
R1	2	1
R2	6	0
R3	0	9
R4	0	4
R5	3	4
R6	4	0

myrows=[1,3]
df2.iloc[myrows]

	C1	C2	C3	C4	C5
R2	9	6	1	0	1
R4	4	0	0	4	1

df2.iloc[4,2]

df2.iloc[0,0]=99 #赋值
df2

	C1	C2	C3	C4	C5
R1	99	2	6	1	3
R2	9	6	1	0	1
R3	9	0	0	9	3
R4	4	0	0	4	1
R5	7	3	2	4	7
R6	2	4	8	0	7

df2

	C1	C2	C3	C4	C5
R1	99	2	6	1	3
R2	9	6	1	0	1
R3	9	0	0	9	3
R4	4	0	0	4	1
R5	7	3	2	4	7
R6	2	4	8	0	7

df2['mysum']=df2['C1']+df2['C2'] #create a new column
df2

	C1	C2	C3	C4	C5	mysum
R1	99	2	6	1	3	101
R2	9	6	1	0	1	15
R3	9	0	0	9	3	9
R4	4	0	0	4	1	4
R5	7	3	2	4	7	10
R6	2	4	8	0	7	6

df2=df2.drop('mysum',axis=1)#再扔掉这一行,只有drop类函数才会这么反着axis

df2

	C1	C2	C3	C4	C5
R1	99	2	6	1	3
R2	9	6	1	0	1
R3	9	0	0	9	3
R4	4	0	0	4	1
R5	7	3	2	4	7
R6	2	4	8	0	7

df=pd.read_csv(r'C:\Users\yxl22\Desktop\Python\pandas\insurance.csv')#重新读入

df.head()

	age	sex	bmi	children	smoker	region	charges
0	19	female	27.900	0	yes	southwest	16884.92400
1	18	male	33.770	1	no	southeast	1725.55230
2	28	male	33.000	3	no	southeast	4449.46200
3	33	male	22.705	0	no	northwest	21984.47061
4	32	male	28.880	0	no	northwest	3866.85520

df.describe()

	age	bmi	children	charges
count	1338.000000	1338.000000	1338.000000	1338.000000
mean	39.207025	30.663397	1.094918	13270.422265
std	14.049960	6.098187	1.205493	12110.011237
min	18.000000	15.960000	0.000000	1121.873900
25%	27.000000	26.296250	0.000000	4740.287150
50%	39.000000	30.400000	1.000000	9382.033000
75%	51.000000	34.693750	2.000000	16639.912515
max	64.000000	53.130000	5.000000	63770.428010

df['charges']

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
5        3756.62160
6        8240.58960
7        7281.50560
8        6406.41070
9       28923.13692
10       2721.32080
11      27808.72510
12       1826.84300
13      11090.71780
14      39611.75770
15       1837.23700
16      10797.33620
17       2395.17155
18      10602.38500
19      36837.46700
20      13228.84695
21       4149.73600
22       1137.01100
23      37701.87680
24       6203.90175
25      14001.13380
26      14451.83515
27      12268.63225
28       2775.19215
29      38711.00000
           ...     
1308    33900.65300
1309     6875.96100
1310     6940.90985
1311     4571.41305
1312     4536.25900
1313    36397.57600
1314    18765.87545
1315    11272.33139
1316     1731.67700
1317     1163.46270
1318    19496.71917
1319     7201.70085
1320     5425.02335
1321    28101.33305
1322    12981.34570
1323    43896.37630
1324     4239.89265
1325    13143.33665
1326     7050.02130
1327     9377.90470
1328    22395.74424
1329    10325.20600
1330    12629.16560
1331    10795.93733
1332    11411.68500
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

filt20=df['charges']>20000 #是否大于20000
filt20.head()

0    False
1    False
2    False
3     True
4    False
Name: charges, dtype: bool

df20=df[filt20] #剔除false的

df20.head()

	age	sex	bmi	smoker	region	charges
3	33	male	22.705	no	northwest	21984.47061
9	60	female	25.840	no	northwest	28923.13692
11	62	female	26.290	yes	southeast	27808.72510
14	27	male	42.130	yes	southeast	39611.75770
19	30	male	35.300	yes	southwest	36837.46700

filt1=df['charges']>20000
filt2=df['sex']=='male'
filt=filt1 & filt2  #且
m20=df[filt].head()
m20

	age	sex	bmi	children	smoker	region	charges
3	33	male	22.705	0	no	northwest	21984.47061
14	27	male	42.130	0	yes	southeast	39611.75770
19	30	male	35.300	0	yes	southwest	36837.46700
29	31	male	36.300	2	yes	southwest	38711.00000
30	22	male	35.600	0	yes	southwest	35585.57600

m20.describe()

	age	bmi	children	charges
count	5.00000	5.000000	5.000000	5.000000
mean	28.60000	34.407000	0.400000	34546.054262
std	4.27785	7.113116	0.894427	7196.052493
min	22.00000	22.705000	0.000000	21984.470610
25%	27.00000	35.300000	0.000000	35585.576000
50%	30.00000	35.600000	0.000000	36837.467000
75%	31.00000	36.300000	0.000000	38711.000000
max	33.00000	42.130000	2.000000	39611.757700

filt=filt1 | filt2 #或
df[filt].head()

	age	sex	bmi	children	smoker	region	charges
1	18	male	33.770	1	no	southeast	1725.55230
2	28	male	33.000	3	no	southeast	4449.46200
3	33	male	22.705	0	no	northwest	21984.47061
4	32	male	28.880	0	no	northwest	3866.85520
8	37	male	29.830	2	no	northeast	6406.41070

g=df.groupby('sex') #分组

g #like dict

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002072C5FD390>

g.describe()

	age								bmi		...	children		charges
	count	mean	std	min	25%	50%	75%	max	count	mean	...	75%	max	count	mean	std	min	25%	50%	75%	max
sex
female	662.0	39.503021	14.054223	18.0	27.0	40.0	51.75	64.0	662.0	30.377749	...	2.0	5.0	662.0	12569.578844	11128.703801	1607.5101	4885.1587	9412.96250	14454.691825	63770.42801
male	676.0	38.917160	14.050141	18.0	26.0	39.0	51.00	64.0	676.0	30.943129	...	2.0	5.0	676.0	13956.751178	12971.025915	1121.8739	4619.1340	9369.61575	18989.590250	62592.87309

2 rows × 32 columns

g.corr()

		age	bmi	children	charges
sex
female	age	1.000000	0.097214	0.078500	0.324575
	bmi	0.097214	1.000000	0.022151	0.161419
	children	0.078500	0.022151	1.000000	0.058492
	charges	0.324575	0.161419	0.058492	1.000000
male	age	1.000000	0.123088	0.008690	0.282369
	bmi	0.123088	1.000000	0.002385	0.225847
	children	0.008690	0.002385	1.000000	0.074496
	charges	0.282369	0.225847	0.074496	1.000000

type(g)

pandas.core.groupby.generic.DataFrameGroupBy

for gp,df_gp in g:
    print(gp)#显示分类
    print(df_gp)#显示具体数据
    print('')

female
      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
5      31  female  25.740         0     no  southeast   3756.62160
6      46  female  33.440         1     no  southeast   8240.58960
7      37  female  27.740         3     no  northwest   7281.50560
9      60  female  25.840         0     no  northwest  28923.13692
11     62  female  26.290         0    yes  southeast  27808.72510
13     56  female  39.820         0     no  southeast  11090.71780
16     52  female  30.780         1     no  northeast  10797.33620
20     60  female  36.005         0     no  northeast  13228.84695
21     30  female  32.400         1     no  southwest   4149.73600
23     34  female  31.920         1    yes  northeast  37701.87680
25     59  female  27.720         3     no  southeast  14001.13380
26     63  female  23.085         0     no  northeast  14451.83515
27     55  female  32.775         2     no  northwest  12268.63225
31     18  female  26.315         0     no  northeast   2198.18985
32     19  female  28.600         5     no  southwest   4687.79700
36     62  female  32.965         3     no  northwest  15612.19335
40     24  female  26.600         0     no  northeast   3046.06200
41     31  female  36.630         2     no  southeast   4949.75870
43     37  female  30.800         2     no  southeast   6313.75900
46     18  female  38.665         2     no  northeast   3393.35635
47     28  female  34.770         0     no  northwest   3556.92230
48     60  female  24.530         0     no  southeast  12629.89670
50     18  female  35.625         0     no  northeast   2211.13075
51     21  female  33.630         2     no  northwest   3579.82870
54     40  female  28.690         3     no  northwest   8059.67910
56     58  female  31.825         2     no  northeast  13607.36875
58     53  female  22.880         1    yes  southeast  23244.79020
59     34  female  37.335         2     no  northwest   5989.52365
63     28  female  25.935         1     no  northwest   4133.64165
...   ...     ...     ...       ...    ...        ...          ...
1277   32  female  29.735         0     no  northwest   4357.04365
1279   25  female  26.790         2     no  northwest   4189.11310
1280   48  female  33.330         0     no  southeast   8283.68070
1281   47  female  27.645         2    yes  northwest  24535.69855
1282   18  female  21.660         0    yes  northeast  14283.45940
1285   47  female  24.320         0     no  northeast   8534.67180
1286   28  female  17.290         0     no  northeast   3732.62510
1287   36  female  25.900         1     no  southwest   5472.44900
1290   38  female  19.950         2     no  northeast   7133.90250
1297   28  female  26.510         2     no  southeast   4340.44090
1299   19  female  25.745         1     no  northwest   2710.82855
1302   25  female  20.800         1     no  southwest   3208.78700
1305   24  female  27.720         0     no  southeast   2464.61880
1306   29  female  21.850         0    yes  northeast  16115.30450
1308   25  female  30.200         0    yes  southwest  33900.65300
1311   33  female  26.695         0     no  northwest   4571.41305
1313   19  female  34.700         2    yes  southwest  36397.57600
1314   30  female  23.655         3    yes  northwest  18765.87545
1316   19  female  20.600         0     no  southwest   1731.67700
1319   39  female  26.315         2     no  northwest   7201.70085
1323   42  female  40.370         2    yes  southeast  43896.37630
1326   42  female  32.870         0     no  northeast   7050.02130
1328   23  female  24.225         2     no  northeast  22395.74424
1330   57  female  25.740         2     no  southeast  12629.16560
1331   23  female  33.400         0     no  southwest  10795.93733
1332   52  female  44.700         3     no  southwest  11411.68500
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[662 rows x 7 columns]

male
      age   sex     bmi  children smoker     region      charges
1      18  male  33.770         1     no  southeast   1725.55230
2      28  male  33.000         3     no  southeast   4449.46200
3      33  male  22.705         0     no  northwest  21984.47061
4      32  male  28.880         0     no  northwest   3866.85520
8      37  male  29.830         2     no  northeast   6406.41070
10     25  male  26.220         0     no  northeast   2721.32080
12     23  male  34.400         0     no  southwest   1826.84300
14     27  male  42.130         0    yes  southeast  39611.75770
15     19  male  24.600         1     no  southwest   1837.23700
17     23  male  23.845         0     no  northeast   2395.17155
18     56  male  40.300         0     no  southwest  10602.38500
19     30  male  35.300         0    yes  southwest  36837.46700
22     18  male  34.100         0     no  southeast   1137.01100
24     37  male  28.025         2     no  northwest   6203.90175
28     23  male  17.385         1     no  northwest   2775.19215
29     31  male  36.300         2    yes  southwest  38711.00000
30     22  male  35.600         0    yes  southwest  35585.57600
33     63  male  28.310         0     no  northwest  13770.09790
34     28  male  36.400         1    yes  southwest  51194.55914
35     19  male  20.425         0     no  northwest   1625.43375
37     26  male  20.800         0     no  southwest   2302.30000
38     35  male  36.670         1    yes  northeast  39774.27630
39     60  male  39.900         0    yes  southwest  48173.36100
42     41  male  21.780         1     no  southeast   6272.47720
44     38  male  37.050         1     no  northeast   6079.67150
45     55  male  37.300         0     no  southwest  20630.28351
49     36  male  35.200         1    yes  southeast  38709.17600
52     48  male  28.000         1    yes  southwest  23568.27200
53     36  male  34.430         0    yes  southeast  37742.57570
55     58  male  36.955         2    yes  northwest  47496.49445
...   ...   ...     ...       ...    ...        ...          ...
1283   18  male  30.030         1     no  southeast   1720.35370
1284   61  male  36.300         1    yes  southwest  47403.88000
1288   20  male  39.400         2    yes  southwest  38344.56600
1289   44  male  34.320         1     no  southeast   7147.47280
1291   19  male  34.900         0    yes  southwest  34828.65400
1292   21  male  23.210         0     no  southeast   1515.34490
1293   46  male  25.745         3     no  northwest   9301.89355
1294   58  male  25.175         0     no  northeast  11931.12525
1295   20  male  22.000         1     no  southwest   1964.78000
1296   18  male  26.125         0     no  northeast   1708.92575
1298   33  male  27.455         2     no  northwest   5261.46945
1300   45  male  30.360         0    yes  southeast  62592.87309
1301   62  male  30.875         3    yes  northwest  46718.16325
1303   43  male  27.800         0    yes  southwest  37829.72420
1304   42  male  24.605         2    yes  northeast  21259.37795
1307   32  male  28.120         4    yes  northwest  21472.47880
1309   41  male  32.200         2     no  southwest   6875.96100
1310   42  male  26.315         1     no  northwest   6940.90985
1312   34  male  42.900         1     no  southwest   4536.25900
1315   18  male  28.310         1     no  northeast  11272.33139
1317   18  male  53.130         0     no  southeast   1163.46270
1318   35  male  39.710         4     no  northeast  19496.71917
1320   31  male  31.065         3     no  northwest   5425.02335
1321   62  male  26.695         0    yes  northeast  28101.33305
1322   62  male  38.830         0     no  southeast  12981.34570
1324   31  male  25.935         1     no  northwest   4239.89265
1325   61  male  33.535         0     no  northeast  13143.33665
1327   51  male  30.030         1     no  southeast   9377.90470
1329   52  male  38.600         2     no  southwest  10325.20600
1333   50  male  30.970         3     no  northwest  10600.54830

[676 rows x 7 columns]

g.size() #不能用shape

sex
female    662
male      676
dtype: int64

g.count() #无missing value

	age	bmi	children	smoker	region	charges
sex
female	662	662	662	662	662	662
male	676	676	676	676	676	676

mymeans=g.mean()
mymeans#index是性别

	age	bmi	children	charges
sex
female	39.503021	30.377749	1.074018	12569.578844
male	38.917160	30.943129	1.115385	13956.751178

mymeans[['age','bmi','children']].plot(kind='barh');

在这里插入图片描述

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-21vYpy4W-1576119946192)(output_75_0.png)]

df_region=df.groupby('region')

df_region['charges'].mean().round(2)

region
northeast    13406.38
northwest    12417.58
southeast    14735.41
southwest    12346.94
Name: charges, dtype: float64

df_region.describe()

	age								bmi		...	children		charges
	count	mean	std	min	25%	50%	75%	max	count	mean	...	75%	max	count	mean	std	min	25%	50%	75%	max
region
northeast	324.0	39.268519	14.069007	18.0	27.00	39.5	51.0	64.0	324.0	29.173503	...	2.0	5.0	324.0	13406.384516	11255.803066	1694.7964	5194.322288	10057.652025	16687.3641	58571.07448
northwest	325.0	39.196923	14.051646	19.0	26.00	39.0	51.0	64.0	325.0	29.199785	...	2.0	5.0	325.0	12417.575374	11072.276928	1621.3402	4719.736550	8965.795750	14711.7438	60021.39897
southeast	364.0	38.939560	14.164585	18.0	26.75	39.0	51.0	64.0	364.0	33.355989	...	2.0	5.0	364.0	14735.411438	13971.098589	1121.8739	4440.886200	9294.131950	19526.2869	63770.42801
southwest	325.0	39.455385	13.959886	19.0	27.00	39.0	51.0	64.0	325.0	30.596615	...	2.0	5.0	325.0	12346.937377	11557.179101	1241.5650	4751.070000	8798.593000	13462.5200	52590.82939

4 rows × 32 columns

mylist=['charges','bmi']
df_region[mylist].describe().round(2)

	charges								bmi
	count	mean	std	min	25%	50%	75%	max	count	mean	std	min	25%	50%	75%	max
region
northeast	324.0	13406.38	11255.80	1694.80	5194.32	10057.65	16687.36	58571.07	324.0	29.17	5.94	15.96	24.87	28.88	32.89	48.07
northwest	325.0	12417.58	11072.28	1621.34	4719.74	8965.80	14711.74	60021.40	325.0	29.20	5.14	17.39	25.74	28.88	32.78	42.94
southeast	364.0	14735.41	13971.10	1121.87	4440.89	9294.13	19526.29	63770.43	364.0	33.36	6.48	19.80	28.57	33.33	37.81	53.13
southwest	325.0	12346.94	11557.18	1241.56	4751.07	8798.59	13462.52	52590.83	325.0	30.60	5.69	17.40	26.90	30.30	34.60	47.60

Multiple Grouping Columns

-This is my first comment
-This is my second comment

mylist = ['region', 'sex']
mygroups = df.groupby(mylist)

mygroups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002072BC8E908>

mygroups['charges'].mean().round(2)

region     sex   
northeast  female    12953.20
           male      13854.01
northwest  female    12479.87
           male      12354.12
southeast  female    13499.67
           male      15879.62
southwest  female    11274.41
           male      13412.88
Name: charges, dtype: float64

mm=['sex', 'region']

mmgroups = df.groupby(mm)

mmgroups['charges'].mean().round(2)

sex     region   
female  northeast    12953.20
        northwest    12479.87
        southeast    13499.67
        southwest    11274.41
male    northeast    13854.01
        northwest    12354.12
        southeast    15879.62
        southwest    13412.88
Name: charges, dtype: float64

mytable =df.pivot_table(index = 'sex', columns = 'region', values = 'charges', aggfunc = 'mean').round(2) #aggfun指的是想要的数据

mytable

region	northeast	northwest	southeast	southwest
sex
female	12953.20	12479.87	13499.67	11274.41
male	13854.01	12354.12	15879.62	13412.88

df=pd.read_csv(r'C:\Users\yxl22\Desktop\Python\pandas\mycountry.csv') #读入文件并且命名数据集

df.head()

	country	country	year	POP	XRAT	tcgdp	cc	cg
0	Argentina	ARG	2000	37335.653	0.999500	2.950722e+05	75.716805	5.578804
1	Australia	AUS	2000	19053.186	1.724830	5.418047e+05	67.759026	6.720098
2	India	IND	2000	1006300.297	44.941600	1.728144e+06	64.575551	14.072206
3	Israel	ISR	2000	6114.570	4.077330	1.292539e+05	64.436451	10.266688
4	Malawi	MWI	2000	11801.505	59.543808	5.026222e+03	74.707624	11.658954

df.columns

Index(['country', 'country ', 'year', 'POP', 'XRAT', 'tcgdp', 'cc', 'cg'], dtype='object')

df = df[['country', 'POP', 'tcgdp']]

df.head()

	country	POP	tcgdp
0	Argentina	37335.653	2.950722e+05
1	Australia	19053.186	5.418047e+05
2	India	1006300.297	1.728144e+06
3	Israel	6114.570	1.292539e+05
4	Malawi	11801.505	5.026222e+03

df=df.set_index('country') #country变成了index

df.head()

	POP	tcgdp
country
Argentina	37335.653	2.950722e+05
Australia	19053.186	5.418047e+05
India	1006300.297	1.728144e+06
Israel	6114.570	1.292539e+05
Malawi	11801.505	5.026222e+03

df.columns #country就被剔除了column中

Index(['POP', 'tcgdp'], dtype='object')

df.columns=['population','total_gdp'] #重新命名

df.head()

	population	total_gdp
country
Argentina	37335.653	2.950722e+05
Australia	19053.186	5.418047e+05
India	1006300.297	1.728144e+06
Israel	6114.570	1.292539e+05
Malawi	11801.505	5.026222e+03

df['GDP_percap']=df['total_gdp']*1000000/df['population']

df.head()

	population	total_gdp	GDP_percap
country
Argentina	37335.653	2.950722e+05	7.903229e+06
Australia	19053.186	5.418047e+05	2.843643e+07
India	1006300.297	1.728144e+06	1.717325e+06
Israel	6114.570	1.292539e+05	2.113867e+07
Malawi	11801.505	5.026222e+03	4.258967e+05

df['GDP_percap'].plot(kind = 'bar') #横坐标是index=country

<matplotlib.axes._subplots.AxesSubplot at 0x2072bc8e748>

在这里插入图片描述
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-265aMTfM-1576119946194)(output_101_1.png)]

df=df.sort_values(by='GDP_percap', ascending = True)#降序

df.head()

	population	total_gdp	GDP_percap
country
Malawi	11801.505	5.026222e+03	4.258967e+05
India	1006300.297	1.728144e+06	1.717325e+06
South Africa	45064.098	2.272424e+05	5.042648e+06
Uruguay	3219.793	2.525596e+04	7.843971e+06
Argentina	37335.653	2.950722e+05	7.903229e+06

df['GDP_percap'].plot(kind = 'bar');

在这里插入图片描述
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-u0TsPaBD-1576119946194)(output_104_0.png)]

df.GDP_percap.plot(kind = 'barh', figsize =(12,6),colormap='summer', title='This a hot') #横图

<matplotlib.axes._subplots.AxesSubplot at 0x2072d8da780>

在这里插入图片描述

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Hrb1YN1d-1576119946195)(output_105_1.png)]

df['GDP_percap'].plot(kind = 'pie',figsize =(12,6),colormap='Blues');

在这里插入图片描述
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XzceHeJp-1576119946195)(output_106_0.png)]

#https://blog.csdn.net/h_hxx/article/details/90635650

def times2(x):
    return x*2

df2['double']= df2['C2'].apply(times2) #应用函数

df2.head()

	C1	C2	C3	C4	C5	double
R1	99	2	6	1	3	4
R2	9	6	1	0	1	12
R3	9	0	0	9	3	0
R4	4	0	0	4	1	0
R5	7	3	2	4	7	6

def weight(x):
    if(x[4]<5):
        ans =x[1] +x[2]
    else:
        ans = 99
    return ans

df2['summary']=df2.apply(weight, axis = 1)#行

df2.head()

	C1	C2	C3	C4	C5	double	summary
R1	99	2	6	1	3	4	8
R2	9	6	1	0	1	12	7
R3	9	0	0	9	3	0	0
R4	4	0	0	4	1	0	0
R5	7	3	2	4	7	6	99

df2.apply(np.max, axis = 0)

C1         99
C2          6
C3          8
C4          9
C5          7
double     12
summary    99
dtype: int64

df2.apply(np.max, axis = 1)

R1    99
R2    12
R3     9
R4     4
R5    99
R6    99
dtype: int64

dfM = pd.DataFrame(np.arange(0,15).reshape(3,5))
dfM

	0	1	2	3	4
0	0	1	2	3	4
1	5	6	7	8	9
2	10	11	12	13	14

dfM.loc[1,2]=np.nan
dfM

	0	1	2	3	4
0	0	1	2.0	3	4
1	5	6	NaN	8	9
2	10	11	12.0	13	14

# Quandl 
import Quandl
conda...

weixin_41656898

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫