1. 基础:数据导入,读取,索引
1.1 数据导入,读取
import pandas as pa
titan= pa. read_csv( './titanic_train911.csv' )
titan. head( 5 )
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
titan. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
titan. columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
titan. dtypes
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
titan. values
array([[1, 0, 3, ..., 7.25, nan, 'S'],
[2, 1, 1, ..., 71.2833, 'C85', 'C'],
[3, 1, 3, ..., 7.925, nan, 'S'],
...,
[889, 0, 3, ..., 23.45, nan, 'S'],
[890, 1, 1, ..., 30.0, 'C148', 'C'],
[891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)
titan. index
RangeIndex(start=0, stop=891, step=1)
titan= titan. set_index( 'PassengerId' )
titan. head( 8 )
Survived Pclass Age SibSp Parch Ticket Cabin Embarked PassengerId 1 0 3 22.0 1 0 A/5 21171 NaN S 2 1 1 38.0 1 0 PC 17599 C85 C 3 1 3 26.0 0 0 STON/O2. 3101282 NaN S 4 1 1 35.0 1 0 113803 C123 S 5 0 3 35.0 0 0 373450 NaN S 6 0 3 NaN 0 0 330877 NaN Q 7 0 1 54.0 0 0 17463 E46 S 8 0 3 2.0 3 1 349909 NaN S
titan. describe( )
Survived Pclass Age SibSp Parch count 891.000000 891.000000 714.000000 891.000000 891.000000 mean 0.383838 2.308642 29.699118 0.523008 0.381594 std 0.486592 0.836071 14.526497 1.102743 0.806057 min 0.000000 1.000000 0.420000 0.000000 0.000000 25% 0.000000 2.000000 20.125000 0.000000 0.000000 50% 0.000000 3.000000 28.000000 0.000000 0.000000 75% 1.000000 3.000000 38.000000 1.000000 0.000000 max 1.000000 3.000000 80.000000 8.000000 6.000000
1.2 索引
titan[ 'Survived' ] [ : 8 ]
PassengerId
1 0
2 1
3 1
4 1
5 0
6 0
7 0
8 0
Name: Survived, dtype: int64
findlist= [ 'Survived' , 'Age' ]
titan[ findlist] [ : 8 ]
Survived Age PassengerId 1 0 22.0 2 1 38.0 3 1 26.0 4 1 35.0 5 0 35.0 6 0 NaN 7 0 54.0 8 0 2.0
titan= titan. set_index( 'Age' )
titan. loc[ 38 ]
Survived Pclass SibSp Parch Ticket Cabin Embarked Age 38.0 1 1 1 0 PC 17599 C85 C 38.0 1 3 1 5 347077 NaN S 38.0 1 1 0 0 113572 B28 NaN 38.0 0 3 0 0 349249 NaN S 38.0 1 1 1 0 19943 C93 S 38.0 0 1 0 1 PC 17582 C91 S 38.0 0 2 0 0 237671 NaN S 38.0 0 3 0 0 SOTON/O.Q. 3101306 NaN S 38.0 0 3 0 0 315089 NaN S 38.0 1 1 0 0 PC 17757 C45 C 38.0 0 1 0 0 19972 NaN S
titan. iloc[ 666 ]
Survived 0
Pclass 2
SibSp 0
Parch 0
Ticket 234686
Cabin NaN
Embarked S
Name: 25.0, dtype: object
titan. iloc[ 666 : 668 , 2 : 5 ]
SibSp Parch Ticket Age 25.0 0 0 234686 NaN 0 0 312993
titan= pa. read_csv( './titanic_train911.csv' )
titan. head( )
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
titan= titan. set_index( 'Name' )
titan. head( )
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked Name Braund, Mr. Owen Harris 1 0 3 male 22.0 1 0 A/5 21171 7.2500 NaN S Cumings, Mrs. John Bradley (Florence Briggs Thayer) 2 1 1 female 38.0 1 0 PC 17599 71.2833 C85 C Heikkinen, Miss. Laina 3 1 3 female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 1 1 female 35.0 1 0 113803 53.1000 C123 S Allen, Mr. William Henry 5 0 3 male 35.0 0 0 373450 8.0500 NaN S
titan[ titan[ 'Age' ] > 18 ] [ 2 : 8 ]
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked Name Heikkinen, Miss. Laina 3 1 3 female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 1 1 female 35.0 1 0 113803 53.1000 C123 S Allen, Mr. William Henry 5 0 3 male 35.0 0 0 373450 8.0500 NaN S McCarthy, Mr. Timothy J 7 0 1 male 54.0 0 0 17463 51.8625 E46 S Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 9 1 3 female 27.0 0 2 347742 11.1333 NaN S Bonnell, Miss. Elizabeth 12 1 1 female 58.0 0 0 113783 26.5500 C103 S
titan[ titan[ 'Age' ] == 18 ] [ : 8 ]
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked Name Vander Planke, Miss. Augusta Maria 39 0 3 female 18.0 2 0 345764 18.0000 NaN S Arnold-Franchi, Mrs. Josef (Josefine Franchi) 50 0 3 female 18.0 1 0 349237 17.8000 NaN S Andrew, Mr. Edgardo Samuel 145 0 2 male 18.0 0 0 231945 11.5000 NaN S Klasen, Mr. Klas Albin 176 0 3 male 18.0 1 1 350404 7.8542 NaN S Cohen, Mr. Gurshon "Gus" 205 1 3 male 18.0 0 0 A/5 3540 8.0500 NaN S Fahlstrom, Mr. Arne Jonas 229 0 2 male 18.0 0 0 236171 13.0000 NaN S Ryerson, Miss. Emily Borie 312 1 1 female 18.0 2 2 PC 17608 262.3750 B57 B59 B63 B66 C Wiklund, Mr. Jakob Alfred 372 0 3 male 18.0 1 0 3101267 6.4958 NaN S
1.3 排序
titan. groupby( 'Sex' ) . sum ( )
PassengerId Survived Pclass Age SibSp Parch Fare Sex female 135343 233 678 7286.00 218 204 13966.6628 male 262043 109 1379 13919.17 248 136 14727.2865
titan. groupby( 'Sex' ) [ 'Age' ] . mean( )
Sex
female 27.915709
male 30.726645
Name: Age, dtype: float64
2 统计计算
mac= pa. read_csv( 'macrodata.csv' )
mac. head( )
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint 0 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00 1 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74 2 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09 3 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06 4 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19
mac[ 'realinv' ] . sum ( )
205611.364
mac. sum ( axis= 0 )
year 402727.000
quarter 506.000
realgdp 1465897.896
realcons 979534.500
realinv 205611.364
realgovt 134655.714
realdpi 1078039.800
cpi 21330.385
m1 135589.300
tbilrate 1078.290
unemp 1194.600
pop 48664.003
infl 804.150
realint 271.310
dtype: float64
mac. mean( axis= 1 )
0 669.717000
1 681.807214
2 681.973643
3 684.259143
4 692.140500
5 691.066643
6 692.083643
7 687.265786
8 690.154643
9 699.882857
10 708.679857
11 719.592571
12 728.865429
13 734.630857
14 740.307857
15 743.272714
16 748.771071
17 754.663357
18 765.761786
19 770.859429
20 783.005000
21 792.656357
22 801.510857
23 804.556643
24 818.246500
25 825.562357
26 841.191571
27 857.239786
28 871.669929
29 873.893786
...
173 2461.527500
174 2469.052643
175 2475.373643
176 2485.468214
177 2513.350214
178 2550.004357
179 2570.842071
180 2588.743786
181 2611.854143
182 2632.526786
183 2659.255071
184 2667.557429
185 2680.025786
186 2699.507571
187 2714.498857
188 2750.110786
189 2761.561000
190 2766.796214
191 2785.227357
192 2794.254500
193 2807.221143
194 2824.258929
195 2828.893143
196 2820.595929
197 2840.309786
198 2816.360214
199 2802.594286
200 2767.597071
201 2774.576071
202 2788.397500
Length: 203, dtype: float64
mac. sum ( axis= 1 )
0 9376.038
1 9545.301
2 9547.631
3 9579.628
4 9689.967
5 9674.933
6 9689.171
7 9621.721
8 9662.165
9 9798.360
10 9921.518
11 10074.296
12 10204.116
13 10284.832
14 10364.310
15 10405.818
16 10482.795
17 10565.287
18 10720.665
19 10792.032
20 10962.070
21 11097.189
22 11221.152
23 11263.793
24 11455.451
25 11557.873
26 11776.682
27 12001.357
28 12203.379
29 12234.513
...
173 34461.385
174 34566.737
175 34655.231
176 34796.555
177 35186.903
178 35700.061
179 35991.789
180 36242.413
181 36565.958
182 36855.375
183 37229.571
184 37345.804
185 37520.361
186 37793.106
187 38002.984
188 38501.551
189 38661.854
190 38735.147
191 38993.183
192 39119.563
193 39301.096
194 39539.625
195 39604.504
196 39488.343
197 39764.337
198 39429.043
199 39236.320
200 38746.359
201 38844.065
202 39037.565
Length: 203, dtype: float64
mac. cov( )
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint year 215.702580 -0.186558 4.647762e+04 3.329587e+04 8.096633e+03 1782.201530 3.510097e+04 889.385028 6.510849e+03 -7.164087 0.914942 547.621880 -6.729093 -0.093007 quarter -0.186558 1.251183 2.187058e+01 1.421728e+01 4.628355e+00 1.913510 1.622233e+01 0.345575 2.667532e+00 0.035261 -0.006797 0.348573 -0.008926 0.065246 realgdp 46477.622176 21.870583 1.033594e+07 7.431573e+06 1.839145e+06 393386.359633 7.783769e+06 194353.924345 1.432731e+06 -2484.948926 -301.503958 119402.877291 -2006.862002 -416.061514 realcons 33295.865483 14.217276 7.431573e+06 5.351571e+06 1.321452e+06 283735.957690 5.601336e+06 139453.560568 1.029089e+06 -1906.458928 -189.504759 85662.398005 -1496.161105 -367.448608 realinv 8096.633424 4.628355 1.839145e+06 1.321452e+06 3.423447e+05 65506.893003 1.377683e+06 33957.136254 2.486486e+05 -448.762616 -158.177573 20952.267936 -366.256662 -72.505352 realgovt 1782.201530 1.913510 3.933864e+05 2.837360e+05 6.550689e+04 19842.569221 2.977704e+05 7619.553691 5.711414e+04 -119.590326 -5.058225 4539.121687 -135.480431 18.526286 realdpi 35100.969912 16.222333 7.783769e+06 5.601336e+06 1.377683e+06 297770.416433 5.873430e+06 146538.249776 1.078988e+06 -1834.742182 -108.863828 90094.189290 -1471.848862 -315.850267 cpi 889.385028 0.345575 1.943539e+05 1.394536e+05 3.395714e+04 7619.553691 1.465382e+05 3755.100856 2.759093e+04 -40.682744 2.210069 2268.750456 -44.328471 4.689722 m1 6510.849454 2.667532 1.432731e+06 1.029089e+06 2.486486e+05 57114.142869 1.078988e+06 27590.930981 2.073403e+05 -421.744940 -10.890270 16668.489012 -395.866750 -18.638259 tbilrate -7.164087 0.035261 -2.484949e+03 -1.906459e+03 -4.487626e+02 -119.590326 -1.834742e+03 -40.682744 -4.217449e+02 7.857206 0.940225 -22.802668 5.658413 2.233145 unemp 0.914942 -0.006797 -3.015040e+02 -1.895048e+02 -1.581776e+02 -5.058225 -1.088638e+02 2.210069 -1.089027e+01 0.940225 2.127439 0.487778 0.308501 0.632511 pop 547.621880 0.348573 1.194029e+05 8.566240e+04 2.095227e+04 4539.121687 9.009419e+04 2268.750456 1.666849e+04 -22.802668 0.487778 1398.045724 -19.926368 -2.014960 infl -6.729093 -0.008926 -2.006862e+03 -1.496161e+03 -3.662567e+02 -135.480431 -1.471849e+03 -44.328471 -3.958667e+02 5.658413 0.308501 -19.926368 10.583418 -4.870398 realint -0.093007 0.065246 -4.160615e+02 -3.674486e+02 -7.250535e+01 18.526286 -3.158503e+02 4.689722 -1.863826e+01 2.233145 0.632511 -2.014960 -4.870398 7.122486
mac. corr( )
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint year 1.000000 -0.011356 0.984331 0.979991 0.942204 0.861450 0.986156 0.988215 0.973572 -0.174020 0.042711 0.997223 -0.140837 -0.002373 quarter -0.011356 1.000000 0.006082 0.005494 0.007072 0.012144 0.005984 0.005042 0.005237 0.011246 -0.004166 0.008334 -0.002453 0.021856 realgdp 0.984331 0.006082 1.000000 0.999229 0.977708 0.868651 0.999008 0.986524 0.978696 -0.275745 -0.064297 0.993297 -0.191880 -0.048492 realcons 0.979991 0.005494 0.999229 1.000000 0.976290 0.870713 0.999091 0.983735 0.976946 -0.294004 -0.056163 0.990350 -0.198804 -0.059517 realinv 0.942204 0.007072 0.977708 0.976290 1.000000 0.794797 0.971564 0.947084 0.933281 -0.273622 -0.185347 0.957720 -0.192416 -0.046433 realgovt 0.861450 0.012144 0.868651 0.870713 0.794797 1.000000 0.872241 0.882714 0.890436 -0.302875 -0.024619 0.861811 -0.295641 0.049280 realdpi 0.986156 0.005984 0.999008 0.999091 0.971564 0.872241 1.000000 0.986721 0.977752 -0.270082 -0.030797 0.994238 -0.186683 -0.048834 cpi 0.988215 0.005042 0.986524 0.983735 0.947084 0.882714 0.986721 1.000000 0.988812 -0.236846 0.024727 0.990182 -0.222361 0.028676 m1 0.973572 0.005237 0.978696 0.976946 0.933281 0.890436 0.977752 0.988812 1.000000 -0.330426 -0.016397 0.979025 -0.267236 -0.015337 tbilrate -0.174020 0.011246 -0.275745 -0.294004 -0.273622 -0.302875 -0.270082 -0.236846 -0.330426 1.000000 0.229969 -0.217566 0.620508 0.298516 unemp 0.042711 -0.004166 -0.064297 -0.056163 -0.185347 -0.024619 -0.030797 0.024727 -0.016397 0.229969 1.000000 0.008944 0.065015 0.162489 pop 0.997223 0.008334 0.993297 0.990350 0.957720 0.861811 0.994238 0.990182 0.979025 -0.217566 0.008944 1.000000 -0.163815 -0.020192 infl -0.140837 -0.002453 -0.191880 -0.198804 -0.192416 -0.295641 -0.186683 -0.222361 -0.267236 0.620508 0.065015 -0.163815 1.000000 -0.560965 realint -0.002373 0.021856 -0.048492 -0.059517 -0.046433 0.049280 -0.048834 0.028676 -0.015337 0.298516 0.162489 -0.020192 -0.560965 1.000000
mac[ 'cpi' ] . value_counts( ascending = True , bins= 4 )
(76.388, 123.795] 38
(171.203, 218.61] 38
(123.795, 171.203] 44
(28.788999999999998, 76.388] 83
Name: cpi, dtype: int64
3. 对象操作
3.1 Dataframe结构
import pandas as pa
data = [ [ 1 , 2 , 3 ] , [ 4 , 5 , 6 ] ]
index = [ 'a' , 'b' ]
columns = [ 'A' , 'B' , 'C' ]
df = pa. DataFrame( data= data, index= index, columns = columns)
df
增:df.loc()/pd.concat()(练接两个dataframe)/直接添加;删:df.drop()/del df[]/;改:通过loc和index重新定义;查:iloc/loc/索引;
df. loc[ 'c' ] = [ 7 , 8 , 9 ]
df
df. drop( [ 'D' ] , axis= 0 , inplace= True )
df
df. loc[ 'a' ] [ 'B' ] = 20
df
df. loc[ 'b2' ]
B 5
C 6
D 200
Name: b2, dtype: int64
df. iloc[ 2 ]
B 8
C 9
D 300
Name: c3, dtype: int64
df[ 'D' ] = [ 100 , 200 , 300 ]
df
A B C D a 1 20 3 100 b 4 5 6 200 c 7 8 9 300
del df[ 'A' ]
df
df. index= [ 'a1' , 'b2' , 'c3' ]
df
B C D a1 20 3 100 b2 5 6 200 c3 8 9 300
df[ 'D' ]
a1 100
b2 200
c3 300
Name: D, dtype: int64
data = [ [ 10 , 20 , 30 ] , [ 40 , 50 , 60 ] , [ 70 , 80 , 90 ] ]
index = [ 'j' , 'k' , 'l' ]
columns = [ 'B' , 'C' , 'D' ]
df1 = pa. DataFrame( data= data, index= index, columns = columns)
df1
df2= pa. concat( [ df, df1] )
df2
B C D a1 20 3 100 b2 5 6 200 c3 8 9 300 j 10 20 30 k 40 50 60 l 70 80 90
df2= pa. concat( [ df, df1] , axis= 1 )
df2
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
"""Entry point for launching an IPython kernel.
B C D B C D a1 20.0 3.0 100.0 NaN NaN NaN b2 5.0 6.0 200.0 NaN NaN NaN c3 8.0 9.0 300.0 NaN NaN NaN j NaN NaN NaN 10.0 20.0 30.0 k NaN NaN NaN 40.0 50.0 60.0 l NaN NaN NaN 70.0 80.0 90.0
left = pa. DataFrame( { 'key' : [ 'K0' , 'K1' , 'K2' , 'K3' ] ,
'A' : [ 'A0' , 'A1' , 'A2' , 'A3' ] ,
'B' : [ 'B0' , 'B1' , 'B2' , 'B3' ] } )
right = pa. DataFrame( { 'key' : [ 'K0' , 'K1' , 'K2' , 'K3' ] ,
'C' : [ 'C0' , 'C1' , 'C2' , 'C3' ] ,
'D' : [ 'D0' , 'D1' , 'D2' , 'D3' ] } )
key A B 0 K0 A0 B0 1 K1 A1 B1 2 K2 A2 B2 3 K3 A3 B3
merge= pa. merge( left, right)
merge
key A B C D 0 K0 A0 B0 C0 D0 1 K1 A1 B1 C1 D1 2 K2 A2 B2 C2 D2 3 K3 A3 B3 C3 D3
merge= pa. merge( left, right, on= 'key' )
merge
key A B C D 0 K0 A0 B0 C0 D0 1 K1 A1 B1 C1 D1 2 K2 A2 B2 C2 D2 3 K3 A3 B3 C3 D3
3.2 series结构
data = [ 10 , 11 , 12 ]
index = [ 'a' , 'b' , 'c' ]
s = pa. Series( data = data, index = index)
s
a 10
b 11
c 12
dtype: int64
data = [ 100 , 110 ]
index = [ 'h' , 'k' ]
s2 = pa. Series( data = data, index = index)
s3= s. append( s2)
s3
a 10
b 11
c 12
h 100
k 110
dtype: int64
s3[ 'L' ] = 900
s3
a 10
b 11
c 12
h 100
k 110
L 900
dtype: int64
del s2[ 'h' ]
s2
k 110
dtype: int64
s. drop( [ 'b' , 'a' ] , inplace = True )
s
c 12
dtype: int64
s3. replace( to_replace = 100 , value = 101 , inplace = True )
s3
a 10
b 11
c 12
h 101
k 110
L 900
dtype: int64
s3. rename( index = { 'a' : 'A' } , inplace = True )
s3
A 10
b 11
c 12
h 101
k 110
L 900
dtype: int64
s3. loc[ 'b' ]
11
s3. iloc[ 2 ]