Pandas基础学习入门级别

1. 基础:数据导入,读取,索引

1.1 数据导入,读取

import pandas as pa#导入库
titan=pa.read_csv('./titanic_train911.csv')#读取文件
titan.head(5)#查看前5行
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
titan.info()#信息
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
titan.columns#查看表格的列名
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
titan.dtypes#查看数据类型
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
titan.values#查看数据
array([[1, 0, 3, ..., 7.25, nan, 'S'],
       [2, 1, 1, ..., 71.2833, 'C85', 'C'],
       [3, 1, 3, ..., 7.925, nan, 'S'],
       ...,
       [889, 0, 3, ..., 23.45, nan, 'S'],
       [890, 1, 1, ..., 30.0, 'C148', 'C'],
       [891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)
titan.index#查看索引
RangeIndex(start=0, stop=891, step=1)
titan=titan.set_index('PassengerId')#自己定义索引(可取列名)
titan.head(8)
SurvivedPclassAgeSibSpParchTicketCabinEmbarked
PassengerId
10322.010A/5 21171NaNS
21138.010PC 17599C85C
31326.000STON/O2. 3101282NaNS
41135.010113803C123S
50335.000373450NaNS
603NaN00330877NaNQ
70154.00017463E46S
8032.031349909NaNS
titan.describe()#查看基本统计特性
SurvivedPclassAgeSibSpParch
count891.000000891.000000714.000000891.000000891.000000
mean0.3838382.30864229.6991180.5230080.381594
std0.4865920.83607114.5264971.1027430.806057
min0.0000001.0000000.4200000.0000000.000000
25%0.0000002.00000020.1250000.0000000.000000
50%0.0000003.00000028.0000000.0000000.000000
75%1.0000003.00000038.0000001.0000000.000000
max1.0000003.00000080.0000008.0000006.000000

1.2 索引

titan['Survived'][:8]#通过列名取出Survived列,再切片查看前8行(第0~7行)
PassengerId
1    0
2    1
3    1
4    1
5    0
6    0
7    0
8    0
Name: Survived, dtype: int64
findlist=['Survived','Age']
titan[findlist][:8]#将要查询的属性列名保存到list中传入进行查看
SurvivedAge
PassengerId
1022.0
2138.0
3126.0
4135.0
5035.0
60NaN
7054.0
802.0
titan=titan.set_index('Age')#先指定索引
titan.loc[38]#loc方法:按索引标签定位(这里取出Age为38的所有行)
SurvivedPclassSibSpParchTicketCabinEmbarked
Age
38.01110PC 17599C85C
38.01315347077NaNS
38.01100113572B28NaN
38.00300349249NaNS
38.0111019943C93S
38.00101PC 17582C91S
38.00200237671NaNS
38.00300SOTON/O.Q. 3101306NaNS
38.00300315089NaNS
38.01100PC 17757C45C
38.0010019972NaNS
titan.iloc[666]#iloc方法:按整数位置定位(位置从0开始计数)
Survived         0
Pclass           2
SibSp            0
Parch            0
Ticket      234686
Cabin          NaN
Embarked         S
Name: 25.0, dtype: object
titan.iloc[666:668,2:5]
SibSpParchTicket
Age
25.000234686
NaN00312993
titan=pa.read_csv('./titanic_train911.csv')#读取文件
titan.head()
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
titan=titan.set_index('Name')
titan.head()
PassengerIdSurvivedPclassSexAgeSibSpParchTicketFareCabinEmbarked
Name
Braund, Mr. Owen Harris103male22.010A/5 211717.2500NaNS
Cumings, Mrs. John Bradley (Florence Briggs Thayer)211female38.010PC 1759971.2833C85C
Heikkinen, Miss. Laina313female26.000STON/O2. 31012827.9250NaNS
Futrelle, Mrs. Jacques Heath (Lily May Peel)411female35.01011380353.1000C123S
Allen, Mr. William Henry503male35.0003734508.0500NaNS
titan[titan['Age']>18][2:8]#bool查询
PassengerIdSurvivedPclassSexAgeSibSpParchTicketFareCabinEmbarked
Name
Heikkinen, Miss. Laina313female26.000STON/O2. 31012827.9250NaNS
Futrelle, Mrs. Jacques Heath (Lily May Peel)411female35.01011380353.1000C123S
Allen, Mr. William Henry503male35.0003734508.0500NaNS
McCarthy, Mr. Timothy J701male54.0001746351.8625E46S
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)913female27.00234774211.1333NaNS
Bonnell, Miss. Elizabeth1211female58.00011378326.5500C103S
titan[titan['Age']==18][:8]
PassengerIdSurvivedPclassSexAgeSibSpParchTicketFareCabinEmbarked
Name
Vander Planke, Miss. Augusta Maria3903female18.02034576418.0000NaNS
Arnold-Franchi, Mrs. Josef (Josefine Franchi)5003female18.01034923717.8000NaNS
Andrew, Mr. Edgardo Samuel14502male18.00023194511.5000NaNS
Klasen, Mr. Klas Albin17603male18.0113504047.8542NaNS
Cohen, Mr. Gurshon "Gus"20513male18.000A/5 35408.0500NaNS
Fahlstrom, Mr. Arne Jonas22902male18.00023617113.0000NaNS
Ryerson, Miss. Emily Borie31211female18.022PC 17608262.3750B57 B59 B63 B66C
Wiklund, Mr. Jakob Alfred37203male18.01031012676.4958NaNS

1.3 分组聚合

titan.groupby('Sex').sum()#按照性别分组后统计各组的和
PassengerIdSurvivedPclassAgeSibSpParchFare
Sex
female1353432336787286.0021820413966.6628
male262043109137913919.1724813614727.2865
titan.groupby('Sex')['Age'].mean()#按照性别分组后求各组中年龄的均值
Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64

2. 统计计算

mac=pa.read_csv('macrodata.csv')
mac.head()
yearquarterrealgdprealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealint
01959.01.02710.3491707.4286.898470.0451886.928.98139.72.825.8177.1460.000.00
11959.02.02778.8011733.7310.859481.3011919.729.15141.73.085.1177.8302.340.74
21959.03.02775.4881751.8289.226491.2601916.429.35140.53.825.3178.6572.741.09
31959.04.02785.2041753.7299.356484.0521931.329.37140.04.335.6179.3860.274.06
41960.01.02847.6991770.5331.722462.1991955.529.54139.63.505.2180.0072.311.19
mac['realinv'].sum()#对realinv这一列求和
205611.364
mac.sum(axis=0)#按列求和
year         402727.000
quarter         506.000
realgdp     1465897.896
realcons     979534.500
realinv      205611.364
realgovt     134655.714
realdpi     1078039.800
cpi           21330.385
m1           135589.300
tbilrate       1078.290
unemp          1194.600
pop           48664.003
infl            804.150
realint         271.310
dtype: float64
mac.mean(axis=1)#按行求均值
0       669.717000
1       681.807214
2       681.973643
3       684.259143
4       692.140500
5       691.066643
6       692.083643
7       687.265786
8       690.154643
9       699.882857
10      708.679857
11      719.592571
12      728.865429
13      734.630857
14      740.307857
15      743.272714
16      748.771071
17      754.663357
18      765.761786
19      770.859429
20      783.005000
21      792.656357
22      801.510857
23      804.556643
24      818.246500
25      825.562357
26      841.191571
27      857.239786
28      871.669929
29      873.893786
          ...     
173    2461.527500
174    2469.052643
175    2475.373643
176    2485.468214
177    2513.350214
178    2550.004357
179    2570.842071
180    2588.743786
181    2611.854143
182    2632.526786
183    2659.255071
184    2667.557429
185    2680.025786
186    2699.507571
187    2714.498857
188    2750.110786
189    2761.561000
190    2766.796214
191    2785.227357
192    2794.254500
193    2807.221143
194    2824.258929
195    2828.893143
196    2820.595929
197    2840.309786
198    2816.360214
199    2802.594286
200    2767.597071
201    2774.576071
202    2788.397500
Length: 203, dtype: float64
#mac.sum(axis ='columns')
mac.sum(axis=1)#按行求和(axis=1与axis='columns'等价:沿列方向把每一行的各列相加)
0       9376.038
1       9545.301
2       9547.631
3       9579.628
4       9689.967
5       9674.933
6       9689.171
7       9621.721
8       9662.165
9       9798.360
10      9921.518
11     10074.296
12     10204.116
13     10284.832
14     10364.310
15     10405.818
16     10482.795
17     10565.287
18     10720.665
19     10792.032
20     10962.070
21     11097.189
22     11221.152
23     11263.793
24     11455.451
25     11557.873
26     11776.682
27     12001.357
28     12203.379
29     12234.513
         ...    
173    34461.385
174    34566.737
175    34655.231
176    34796.555
177    35186.903
178    35700.061
179    35991.789
180    36242.413
181    36565.958
182    36855.375
183    37229.571
184    37345.804
185    37520.361
186    37793.106
187    38002.984
188    38501.551
189    38661.854
190    38735.147
191    38993.183
192    39119.563
193    39301.096
194    39539.625
195    39604.504
196    39488.343
197    39764.337
198    39429.043
199    39236.320
200    38746.359
201    38844.065
202    39037.565
Length: 203, dtype: float64
mac.cov()#协方差
yearquarterrealgdprealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealint
year215.702580-0.1865584.647762e+043.329587e+048.096633e+031782.2015303.510097e+04889.3850286.510849e+03-7.1640870.914942547.621880-6.729093-0.093007
quarter-0.1865581.2511832.187058e+011.421728e+014.628355e+001.9135101.622233e+010.3455752.667532e+000.035261-0.0067970.348573-0.0089260.065246
realgdp46477.62217621.8705831.033594e+077.431573e+061.839145e+06393386.3596337.783769e+06194353.9243451.432731e+06-2484.948926-301.503958119402.877291-2006.862002-416.061514
realcons33295.86548314.2172767.431573e+065.351571e+061.321452e+06283735.9576905.601336e+06139453.5605681.029089e+06-1906.458928-189.50475985662.398005-1496.161105-367.448608
realinv8096.6334244.6283551.839145e+061.321452e+063.423447e+0565506.8930031.377683e+0633957.1362542.486486e+05-448.762616-158.17757320952.267936-366.256662-72.505352
realgovt1782.2015301.9135103.933864e+052.837360e+056.550689e+0419842.5692212.977704e+057619.5536915.711414e+04-119.590326-5.0582254539.121687-135.48043118.526286
realdpi35100.96991216.2223337.783769e+065.601336e+061.377683e+06297770.4164335.873430e+06146538.2497761.078988e+06-1834.742182-108.86382890094.189290-1471.848862-315.850267
cpi889.3850280.3455751.943539e+051.394536e+053.395714e+047619.5536911.465382e+053755.1008562.759093e+04-40.6827442.2100692268.750456-44.3284714.689722
m16510.8494542.6675321.432731e+061.029089e+062.486486e+0557114.1428691.078988e+0627590.9309812.073403e+05-421.744940-10.89027016668.489012-395.866750-18.638259
tbilrate-7.1640870.035261-2.484949e+03-1.906459e+03-4.487626e+02-119.590326-1.834742e+03-40.682744-4.217449e+027.8572060.940225-22.8026685.6584132.233145
unemp0.914942-0.006797-3.015040e+02-1.895048e+02-1.581776e+02-5.058225-1.088638e+022.210069-1.089027e+010.9402252.1274390.4877780.3085010.632511
pop547.6218800.3485731.194029e+058.566240e+042.095227e+044539.1216879.009419e+042268.7504561.666849e+04-22.8026680.4877781398.045724-19.926368-2.014960
infl-6.729093-0.008926-2.006862e+03-1.496161e+03-3.662567e+02-135.480431-1.471849e+03-44.328471-3.958667e+025.6584130.308501-19.92636810.583418-4.870398
realint-0.0930070.065246-4.160615e+02-3.674486e+02-7.250535e+0118.526286-3.158503e+024.689722-1.863826e+012.2331450.632511-2.014960-4.8703987.122486
mac.corr()#相关系数
yearquarterrealgdprealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealint
year1.000000-0.0113560.9843310.9799910.9422040.8614500.9861560.9882150.973572-0.1740200.0427110.997223-0.140837-0.002373
quarter-0.0113561.0000000.0060820.0054940.0070720.0121440.0059840.0050420.0052370.011246-0.0041660.008334-0.0024530.021856
realgdp0.9843310.0060821.0000000.9992290.9777080.8686510.9990080.9865240.978696-0.275745-0.0642970.993297-0.191880-0.048492
realcons0.9799910.0054940.9992291.0000000.9762900.8707130.9990910.9837350.976946-0.294004-0.0561630.990350-0.198804-0.059517
realinv0.9422040.0070720.9777080.9762901.0000000.7947970.9715640.9470840.933281-0.273622-0.1853470.957720-0.192416-0.046433
realgovt0.8614500.0121440.8686510.8707130.7947971.0000000.8722410.8827140.890436-0.302875-0.0246190.861811-0.2956410.049280
realdpi0.9861560.0059840.9990080.9990910.9715640.8722411.0000000.9867210.977752-0.270082-0.0307970.994238-0.186683-0.048834
cpi0.9882150.0050420.9865240.9837350.9470840.8827140.9867211.0000000.988812-0.2368460.0247270.990182-0.2223610.028676
m10.9735720.0052370.9786960.9769460.9332810.8904360.9777520.9888121.000000-0.330426-0.0163970.979025-0.267236-0.015337
tbilrate-0.1740200.011246-0.275745-0.294004-0.273622-0.302875-0.270082-0.236846-0.3304261.0000000.229969-0.2175660.6205080.298516
unemp0.042711-0.004166-0.064297-0.056163-0.185347-0.024619-0.0307970.024727-0.0163970.2299691.0000000.0089440.0650150.162489
pop0.9972230.0083340.9932970.9903500.9577200.8618110.9942380.9901820.979025-0.2175660.0089441.000000-0.163815-0.020192
infl-0.140837-0.002453-0.191880-0.198804-0.192416-0.295641-0.186683-0.222361-0.2672360.6205080.065015-0.1638151.000000-0.560965
realint-0.0023730.021856-0.048492-0.059517-0.0464330.049280-0.0488340.028676-0.0153370.2985160.162489-0.020192-0.5609651.000000
mac['cpi'].value_counts(ascending = True,bins=4)#按计数升序排列;bins=4:把取值范围等宽划分成4个区间,再统计落入各区间的个数
(76.388, 123.795]               38
(171.203, 218.61]               38
(123.795, 171.203]              44
(28.788999999999998, 76.388]    83
Name: cpi, dtype: int64

3. 对象操作

3.1 Dataframe结构

import pandas as pa#导入库
#Dataframe结构
data = [[1,2,3],[4,5,6]]
index = ['a','b']
columns = ['A','B','C']

df = pa.DataFrame(data=data,index=index,columns = columns)
df
ABC
a123
b456

增:df.loc[]/pa.concat()(连接两个DataFrame)/直接添加;删:df.drop()/del df[];改:通过loc和index重新定义;查:iloc/loc/索引;

df.loc['c']=[7,8,9]#增加行(注:下面输出中的D行来自此前执行过、但本文未列出的语句,例如 df.loc['D']=[7,7,7])
df
ABC
a123
b456
D777
c789
df.drop(['D'],axis=0,inplace=True)#删除行
df
ABC
a123
b456
c789
df.loc['a']['B']=20#改(注:这是链式赋值,新版pandas中可能不生效或给出警告,推荐写成 df.loc['a','B']=20)
df
ABC
a1203
b456
c789
df.loc['b2']#查行(注:此处及下面iloc的输出,是在后文增加列D、删除列A、把索引改为a1/b2/c3之后执行得到的)
B      5
C      6
D    200
Name: b2, dtype: int64
df.iloc[2]
B      8
C      9
D    300
Name: c3, dtype: int64
df['D']=[100,200,300]#增加列
df
ABCD
a1203100
b456200
c789300
del df['A']#删除列
df
BCD
a203100
b56200
c89300
df.index=['a1','b2','c3']#修改索引
df
BCD
a1203100
b256200
c389300
df['D']#查列
a1    100
b2    200
c3    300
Name: D, dtype: int64
data = [[10,20,30],[40,50,60],[70,80,90]]
index = ['j','k','l']
columns = ['B','C','D']

df1 = pa.DataFrame(data=data,index=index,columns = columns)
df1
BCD
j102030
k405060
l708090
df2=pa.concat([df,df1])#拼接(默认axis=0,按行纵向拼接)
df2
BCD
a1203100
b256200
c389300
j102030
k405060
l708090
df2=pa.concat([df,df1],axis=1)#axis=1:按列横向拼接,两边索引对不上的位置填充NaN
df2
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.
BCDBCD
a120.03.0100.0NaNNaNNaN
b25.06.0200.0NaNNaNNaN
c38.09.0300.0NaNNaNNaN
jNaNNaNNaN10.020.030.0
kNaNNaNNaN40.050.060.0
lNaNNaNNaN70.080.090.0
left = pa.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'], 
                    'B': ['B0', 'B1', 'B2', 'B3']})
right = pa.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'C': ['C0', 'C1', 'C2', 'C3'], 
                    'D': ['D0', 'D1', 'D2', 'D3']})
keyAB
0K0A0B0
1K1A1B1
2K2A2B2
3K3A3B3
merge=pa.merge(left,right)
merge
keyABCD
0K0A0B0C0D0
1K1A1B1C1D1
2K2A2B2C2D2
3K3A3B3C3D3
merge=pa.merge(left,right,on='key')
merge
keyABCD
0K0A0B0C0D0
1K1A1B1C1D1
2K2A2B2C2D2
3K3A3B3C3D3

3.2 Series结构

data = [10,11,12]
index = ['a','b','c']
s = pa.Series(data = data,index = index)#Series结构
s
a    10
b    11
c    12
dtype: int64
data = [100,110]
index = ['h','k']
s2 = pa.Series(data = data,index = index)
s3=s.append(s2)#增:append()(注:Series.append自pandas 2.0起已移除,新版本可改用 pa.concat([s,s2]))
s3
a     10
b     11
c     12
h    100
k    110
dtype: int64
s3['L']=900#直接添加
s3
a     10
b     11
c     12
h    100
k    110
L    900
dtype: int64
del s2['h']#删除一行
s2
k    110
dtype: int64
s.drop(['b','a'],inplace = True)#删除多行用drop()
s
c    12
dtype: int64
s3.replace(to_replace = 100,value = 101,inplace = True)#改,replace()、修改索引rename()
s3
a     10
b     11
c     12
h    101
k    110
L    900
dtype: int64
s3.rename(index = {'a':'A'},inplace = True)
s3
A     10
b     11
c     12
h    101
k    110
L    900
dtype: int64
s3.loc['b']#查:可用索引/loc/iloc等查找定位
11
s3.iloc[2]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值