import numpy as np
import pandas as pd
# Show which pandas version is installed; indexer behaviour discussed further
# down (e.g. the removal of ``.ix``) depends on the version.
print(pd.__version__)
1.0.5
# A pandas Series is a one-dimensional array of indexed data.
# It binds a sequence of values to a sequence of index labels; the two parts
# are available through the ``values`` and ``index`` attributes.
data = pd.Series([0.25, 0.5, 0.75, 1])
print(data)
print(data.values)
print(data.index)

# The index can hold labels of any desired type, not only integers.
data = pd.Series([0.25, 0.5, 0.75, 1], index=['a', 'b', 'c', 'd'])
print(data)
print(data['a'])

# A Series can also be initialised from a Python dict.
data = pd.Series({2: 'a', 1: 'b', 3: 'c'})
print(data)
a 0.25
b 0.50
c 0.75
d 1.00
dtype: float64
0.25
2 a
1 b
3 c
dtype: object
# Whatever the construction form, passing ``index`` explicitly selects (and
# orders) the entries that end up in the result.
data = pd.Series({2: 'a', 1: 'b', 3: 'c'}, index=[3, 2])
print(data)

# The Series keeps only the explicitly requested keys; labels that are not
# present in the dict show up as NaN.
data = pd.Series({2: 'a', 1: 'b', 3: 'c'}, index=[4, 5, 6])
print(data)
3 c
2 a
dtype: object
4 NaN
5 NaN
6 NaN
dtype: object
# If a Series is a one-dimensional array with a flexible index, a DataFrame
# can be seen as a two-dimensional array with both a flexible row index and
# flexible column names.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
states = pd.DataFrame({'Population': population_dict, 'area': area_dict})
print(states)

# Row labels and column labels of a DataFrame are exposed as ``index`` and
# ``columns`` respectively.
print(states.index)
print(states.columns)
Population area
California 38332521 423967
Texas 26448193 695662
New York 19651127 141297
Florida 19552860 170312
Illinois 12882135 149995
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['Population', 'area'], dtype='object')
# --- Ways to create a DataFrame ---

# From a single Series object.
print(pd.DataFrame(pd.Series(population_dict), columns=['population']))

# From a list of dicts: any list whose elements are dicts can become a
# DataFrame (keys become column names).
data = [{'a': i, 'b': 2 * i, 'c': 3 * i} for i in range(2)]
print(pd.DataFrame(data))

# From a dict of Series objects ...
states = pd.DataFrame({'Population': pd.Series(population_dict),
                       'area': pd.Series(area_dict)})
print(states)

# ... or, equivalently, from a dict of dicts.
states = pd.DataFrame({'Population': population_dict, 'area': area_dict})
print(states)
population
California 38332521
Texas 26448193
New York 19651127
Florida 19552860
Illinois 12882135
a b c
0 0 0 0
1 1 2 3
Population area
California 38332521 423967
Texas 26448193 695662
New York 19651127 141297
Florida 19552860 170312
Illinois 12882135 149995
Population area
California 38332521 423967
Texas 26448193 695662
New York 19651127 141297
Florida 19552860 170312
Illinois 12882135 149995
# From a two-dimensional NumPy array, supplying the column names and the row
# index explicitly.
data = pd.DataFrame(
    np.random.rand(3, 2),
    columns=['foo', 'bar'],
    index=['a', 'b', 'c'],
)
print(data)
foo bar
a 0.751958 0.188040
b 0.128621 0.814735
c 0.768985 0.817199
# From a NumPy structured array: each named field ('A' as int64, 'B' as
# float64) becomes a DataFrame column.
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
print(pd.DataFrame(A))
# --- Selecting data in a Series ---
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

# Assigning to a label that does not exist yet appends a new entry.
data['e'] = 1.25
print(data)

# Dict-like views of the Series: (label, value) pairs, labels, raw values.
print(list(data.items()))
print(data.keys())
print(data.values)
a 0.25
b 0.50
c 0.75
d 1.00
e 1.25
dtype: float64
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0), ('e', 1.25)]
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
[0.25 0.5 0.75 1. 1.25]
a 0.25
b 0.50
c 0.75
dtype: float64
a 0.25
b 0.50
dtype: float64
b 0.50
c 0.75
dtype: float64
a 0.25
e 1.25
dtype: float64
<class 'pandas.core.series.Series'>
# With an explicit integer index, an item lookup such as data[1] uses the
# explicit labels, whereas a slice such as data[1:3] uses the implicit
# (positional) index.
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
print(data[1:3])

# To avoid that confusion pandas offers indexer attributes that expose a
# slicing interface.

# loc: both item lookup and slicing use the explicit labels.
print(data.loc[1:3])

# iloc: both item lookup and slicing use implicit, Python-style positions.
print(data.iloc[1:3])

# The ``ix`` indexer was a hybrid of the two (for a Series it behaved like
# plain []), mainly used with DataFrames. It was removed in pandas 1.0.
3 b
5 c
dtype: object
1 a
3 b
dtype: object
3 b
5 c
dtype: object
# --- Selecting data in a DataFrame ---
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})
data = pd.DataFrame({'area': area, 'pop': pop})
print(data)

# Fetch a single column by key; when the column name is a plain string,
# attribute-style access works as well.
print(data['pop'])
print(data.area)
area pop
California 423967 38332521
Texas 695662 26448193
New York 141297 19651127
Florida 170312 19552860
Illinois 149995 12882135
California 38332521
Texas 26448193
New York 19651127
Florida 19552860
Illinois 12882135
Name: pop, dtype: int64
California 423967
Texas 695662
New York 141297
Florida 170312
Illinois 149995
Name: area, dtype: int64
area pop density
California 423967 38332521 90.413926
Texas 695662 26448193 38.018740
New York 141297 19651127 139.076746
Florida 170312 19552860 114.806121
Illinois 149995 12882135 85.883763
[[4.23967000e+05 3.83325210e+07 9.04139261e+01]
[6.95662000e+05 2.64481930e+07 3.80187404e+01]
[1.41297000e+05 1.96511270e+07 1.39076746e+02]
[1.70312000e+05 1.95528600e+07 1.14806121e+02]
[1.49995000e+05 1.28821350e+07 8.58837628e+01]]
California Texas New York Florida Illinois
area 4.239670e+05 6.956620e+05 1.412970e+05 1.703120e+05 1.499950e+05
pop 3.833252e+07 2.644819e+07 1.965113e+07 1.955286e+07 1.288214e+07
density 9.041393e+01 3.801874e+01 1.390767e+02 1.148061e+02 8.588376e+01
# NOTE(review): the cell that created the 'density' column is missing from
# this export. The recorded output is consistent with pop / area, so the
# column is rebuilt here for the mask below to work -- confirm against the
# original notebook.
data['density'] = data['pop'] / data['area']

# With the iloc indexer the underlying array can be indexed as if it were a
# plain NumPy array (implicit Python-style positions); the DataFrame row and
# column labels are kept in the result.
print(data.iloc[:3, :2])

# The ``ix`` indexer used to allow a hybrid of both styles, e.g.
# data.ix[:3, :'pop'], but it was removed in pandas 1.0.

# Any NumPy-style access pattern works with these indexers. For example, a
# boolean mask can be combined with fancy indexing inside loc:
print(data.loc[data.density > 100, ['pop', 'density']])
area pop
California 423967 38332521
Texas 695662 26448193
New York 141297 19651127
pop density
New York 19651127 139.076746
Florida 19552860 114.806121
A B C D
0 6 9 2 6
1 7 4 3 7
2 7 2 5 4
A B C D
0 -1.000000 7.071068e-01 1.000000 -1.000000e+00
1 -0.707107 1.224647e-16 0.707107 -7.071068e-01
2 -0.707107 1.000000e+00 -0.707107 1.224647e-16
# Binary operations align their operands on the index; labels present in only
# one operand yield NaN.
area = pd.Series(
    {'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    {'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)
print(population / area)

# A fill value can be supplied for missing entries: a label missing from
# either operand is replaced by 0 before the division is carried out.
print(population.divide(area, fill_value=0))
Alaska NaN
California 90.413926
New York NaN
Texas 38.018740
dtype: float64
Alaska 0.000000
California 90.413926
New York inf
Texas 38.018740
dtype: float64
# --- Index alignment with DataFrames ---

# NOTE(review): ``rng`` is not defined anywhere in the visible file (its cell
# appears to be missing from the export). A seeded generator is created here
# so the example runs and is reproducible; the seed is an assumption, and the
# printed numbers will not match output recorded from the original session.
rng = np.random.RandomState(42)

A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
print(A)
print(B)

# Rows and columns are aligned by label; cells missing from either operand
# become NaN.
print(A + B)

# Use the mean of all values in A as the fill value. ``stack`` flattens the
# two-dimensional frame into one dimension so a single overall mean can be
# taken.
fill = A.stack().mean()
print(A.add(B, fill_value=fill))
A B
0 8 1
1 19 14
B A C
0 6 7 2
1 0 3 1
2 7 3 1
A B C
0 15.0 7.0 NaN
1 22.0 14.0 NaN
2 NaN NaN NaN
A B C
0 15.0 7.0 12.5
1 22.0 14.0 11.5
2 13.5 17.5 11.5
[[6 3 6 7]
[0 5 7 4]
[3 1 5 5]]
A B C D
0 0 0 0 0
1 -6 2 1 -3
2 -3 -2 -1 -2
A B C D
0 0 -3 0 1
1 0 5 7 4
2 0 -2 2 2
A B C D 0 1 2
0 NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN