Pandas基础复习-DataFrame-CSDN博客

数据类型-DataFrame

DataFrame是由多个Series数据列组成的表格数据类型，每一列都是一个Series，所有列共用同一个行索引
既有行索引，又有列索引
- 行索引，表明不同行，横向索引，叫index，0轴，axis=0
- 列索引，表明不同列，纵向索引，叫columns，1轴，axis=1
DataFrame数据类型可视为：二维带标签数组
每列值的类型可以不同
基本操作类似Series，依据行列索引操作
常用于表达二维数据，但也可以表达多维数据(DataFrame嵌套，极少用)

DataFrame数据类型创建

Python list列表 创建DataFrame

import pandas as pd

df = pd.DataFrame([True, 1, 2.3, 'a', '你好']) # 1维
df

	0
0	True
1	1
2	2.3
3	a
4	你好

df = pd.DataFrame([[True,1,2.3,'a','你好'],[1,2,3,4,5]]) #2维
df

	0	1	2	3	4
0	True	1	2.3	a	你好
1	1	2	3.0	4	5

# 3维，不建议
df = pd.DataFrame([[[True,1,2.3,'a','你好'],
                    [1,2,3,4,5]],
                   [[True,1,2.3,'a','你好'],
                    [1,2,3,4,5]]
                  ]) 
df

	0	1
0	[True, 1, 2.3, a, 你好]	[1, 2, 3, 4, 5]
1	[True, 1, 2.3, a, 你好]	[1, 2, 3, 4, 5]

Python 字典 创建DataFrame

df = pd.DataFrame({'one':[1,2,3,4],
                   'two':[9,8,7,6]})
df

	one	two
0	1	9
1	2	8
2	3	7
3	4	6

# 自定义行索引
df = pd.DataFrame({'one':[1,2,3,4],
                   'two':[9,8,7,6]},index = ['a','b','c','d']) 
df

	one	two
a	1	9
b	2	8
c	3	7
d	4	6

df = pd.DataFrame({
    'A' : 1,
    'B' : 2.3,
    'C' : ['x','y',5] #需要多行
})
df

	A	B	C
0	1	2.3	x
1	1	2.3	y
2	1	2.3	5

dt = {
    'one' : pd.Series([1,2,3],index=['a','b','c']),
    'two' : pd.Series([9,8,7,6],index=['a','b','c','d',])
}
dt

{'one': a    1
 b    2
 c    3
 dtype: int64, 'two': a    9
 b    8
 c    7
 d    6
 dtype: int64}

# one two自动列索引,abcd自动行索引.每个元素对应DataFrame的一列,每个元素内的键值对应一行
d = pd.DataFrame(dt) 
d

	one	two
a	1.0	9
b	2.0	8
c	3.0	7
d	NaN	6

# 数据根据行列索引自动补齐
d_2 = pd.DataFrame(dt,index=['b','c','d'],columns=['two','three']) 
d_2

	two	three
b	8	NaN
c	7	NaN
d	6	NaN

ndarray数组 创建DataFrame

import numpy as np

df = pd.DataFrame(np.arange(10).reshape(2,5)) # 自动生成行/列索引
df

	0	1	2	3	4
0	0	1	2	3	4
1	5	6	7	8	9

# 自定义行列索引
df = pd.DataFrame(np.random.randn(6,4),
                  index=[1,2,3,4,5,6],
                  columns=['a','b','c','d']) 
df

	a	b	c	d
1	0.274340	0.296507	0.751198	0.763512
2	0.181134	0.675380	0.553695	0.632163
3	-0.059765	0.347702	1.138297	-0.143998
4	-1.370677	-0.951640	0.135964	-0.665875
5	1.490610	0.420539	0.628784	2.119896
6	-1.669737	1.167765	1.254722	-0.948624

Series 创建DataFrame

e = pd.DataFrame([pd.Series([1,2,3]),
                  pd.Series([9,8,7,6])],
                 index=['a','b'])
e

	0	1	2	3
a	1.0	2.0	3.0	NaN
b	9.0	8.0	7.0	6.0

DataFrame属性


di = {
    '姓名':['张三','李四','王五','赵六'],
    '性别':['男','女','女','男'],
    '年龄':[12,22,32,42],
    '地址':['北京','上海','广州','深圳']
}
di

{'地址': ['北京', '上海', '广州', '深圳'],
 '姓名': ['张三', '李四', '王五', '赵六'],
 '年龄': [12, 22, 32, 42],
 '性别': ['男', '女', '女', '男']}

d = pd.DataFrame(di,index=['d1','d2','d3','d4'])
d

	地址	姓名	年龄	性别
d1	北京	张三	12	男
d2	上海	李四	22	女
d3	广州	王五	32	女
d4	深圳	赵六	42	男

d.head() # 显示头部几行（默认前5行，此处数据只有4行，故全部显示）

	地址	姓名	年龄	性别
d1	北京	张三	12	男
d2	上海	李四	22	女
d3	广州	王五	32	女
d4	深圳	赵六	42	男

d.tail(3) # 显示末尾几行

	地址	姓名	年龄	性别
d2	上海	李四	22	女
d3	广州	王五	32	女
d4	深圳	赵六	42	男

d.info() # 相关信息概览

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, d1 to d4
Data columns (total 4 columns):
地址    4 non-null object
姓名    4 non-null object
年龄    4 non-null int64
性别    4 non-null object
dtypes: int64(1), object(3)
memory usage: 160.0+ bytes

d.shape # 行数 列数

(4, 4)

d.dtypes # 列数据类型

地址    object
姓名    object
年龄     int64
性别    object
dtype: object

d.index # 获取行索引

Index(['d1', 'd2', 'd3', 'd4'], dtype='object')

d.columns # 获取列索引

Index(['地址', '姓名', '年龄', '性别'], dtype='object')

d.values # 获取值

array([['北京', '张三', 12, '男'],
       ['上海', '李四', 22, '女'],
       ['广州', '王五', 32, '女'],
       ['深圳', '赵六', 42, '男']], dtype=object)

DataFrame查增改删

查 Read

类list/ndarray数据访问方式

dates = pd.date_range('20130101',periods=10)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10'],
              dtype='datetime64[ns]', freq='D')

df = pd.DataFrame(np.random.randn(10,4),index=dates,columns=['A','B','C','D'])
df

	A	B	C	D
2013-01-01	0.754077	-0.346202	-0.557050	0.778106
2013-01-02	0.103394	-1.051044	-0.413054	0.268955
2013-01-03	0.174730	2.056007	1.781379	1.643397
2013-01-04	-0.950517	-0.226887	-0.097138	-0.442010
2013-01-05	0.076178	-0.518970	1.142290	-0.952401
2013-01-06	1.371702	-1.028873	-1.470106	-0.113098
2013-01-07	0.126720	-0.251519	-2.212507	1.050036
2013-01-08	-1.246918	1.530266	1.761499	0.940741
2013-01-09	0.941099	-2.420932	1.927863	-0.549143
2013-01-10	1.951555	-0.264012	-0.171690	0.869293

#索引
df['A']

2013-01-01    0.754077
2013-01-02    0.103394
2013-01-03    0.174730
2013-01-04   -0.950517
2013-01-05    0.076178
2013-01-06    1.371702
2013-01-07    0.126720
2013-01-08   -1.246918
2013-01-09    0.941099
2013-01-10    1.951555
Freq: D, Name: A, dtype: float64

df.A

2013-01-01    0.754077
2013-01-02    0.103394
2013-01-03    0.174730
2013-01-04   -0.950517
2013-01-05    0.076178
2013-01-06    1.371702
2013-01-07    0.126720
2013-01-08   -1.246918
2013-01-09    0.941099
2013-01-10    1.951555
Freq: D, Name: A, dtype: float64

df['A']['2013-01-01'] # 先列后行

0.75407705661157032

df.A['2013-01-01']

0.75407705661157032

df[['A','C']]

	A	C
2013-01-01	0.754077	-0.557050
2013-01-02	0.103394	-0.413054
2013-01-03	0.174730	1.781379
2013-01-04	-0.950517	-0.097138
2013-01-05	0.076178	1.142290
2013-01-06	1.371702	-1.470106
2013-01-07	0.126720	-2.212507
2013-01-08	-1.246918	1.761499
2013-01-09	0.941099	1.927863
2013-01-10	1.951555	-0.171690

Pandas专用的数据访问方式 — .loc 通过自定义索引获取数据

#选取某行
df.loc['2013-01-01']

A    0.754077
B   -0.346202
C   -0.557050
D    0.778106
Name: 2013-01-01 00:00:00, dtype: float64

#选取某列
df.loc[:,'A']

2013-01-01    0.754077
2013-01-02    0.103394
2013-01-03    0.174730
2013-01-04   -0.950517
2013-01-05    0.076178
2013-01-06    1.371702
2013-01-07    0.126720
2013-01-08   -1.246918
2013-01-09    0.941099
2013-01-10    1.951555
Freq: D, Name: A, dtype: float64

# 选取特定值
df.loc['2013-01-01','A'] # 先行后列

0.75407705661157032

# 选取指定的行/列
df.loc[[dates[0],dates[2]],:] # 指定行

	A	B	C	D
2013-01-01	0.754077	-0.346202	-0.557050	0.778106
2013-01-03	0.174730	2.056007	1.781379	1.643397

df.loc[:,['A','B']] # 指定列

	A	B
2013-01-01	0.754077	-0.346202
2013-01-02	0.103394	-1.051044
2013-01-03	0.174730	2.056007
2013-01-04	-0.950517	-0.226887
2013-01-05	0.076178	-0.518970
2013-01-06	1.371702	-1.028873
2013-01-07	0.126720	-0.251519
2013-01-08	-1.246918	1.530266
2013-01-09	0.941099	-2.420932
2013-01-10	1.951555	-0.264012

df.loc[[dates[0],dates[2]],['A','B']] # 指定行列

	A	B
2013-01-01	0.754077	-0.346202
2013-01-03	0.174730	2.056007

# 切片
df.loc['2013-01-01':'2013-01-04',:] # 对行切片

	A	B	C	D
2013-01-01	0.754077	-0.346202	-0.557050	0.778106
2013-01-02	0.103394	-1.051044	-0.413054	0.268955
2013-01-03	0.174730	2.056007	1.781379	1.643397
2013-01-04	-0.950517	-0.226887	-0.097138	-0.442010

df.loc[:,'A':'C'] # 对列切片

	A	B	C
2013-01-01	0.754077	-0.346202	-0.557050
2013-01-02	0.103394	-1.051044	-0.413054
2013-01-03	0.174730	2.056007	1.781379
2013-01-04	-0.950517	-0.226887	-0.097138
2013-01-05	0.076178	-0.518970	1.142290
2013-01-06	1.371702	-1.028873	-1.470106
2013-01-07	0.126720	-0.251519	-2.212507
2013-01-08	-1.246918	1.530266	1.761499
2013-01-09	0.941099	-2.420932	1.927863
2013-01-10	1.951555	-0.264012	-0.171690

# 切片选取连续区块。行，列。.loc按标签切片，左闭右闭（起止标签都包含在内）
df.loc['2013-01-01':'2013-01-04','A':'C']

	A	B	C
2013-01-01	0.754077	-0.346202	-0.557050
2013-01-02	0.103394	-1.051044	-0.413054
2013-01-03	0.174730	2.056007	1.781379
2013-01-04	-0.950517	-0.226887	-0.097138

.iloc 通过整数位置索引（从0开始的默认索引）获取数据

# 选取某行
df.iloc[3]

A   -0.950517
B   -0.226887
C   -0.097138
D   -0.442010
Name: 2013-01-04 00:00:00, dtype: float64

# 选取某列
df.iloc[:,2]

2013-01-01   -0.557050
2013-01-02   -0.413054
2013-01-03    1.781379
2013-01-04   -0.097138
2013-01-05    1.142290
2013-01-06   -1.470106
2013-01-07   -2.212507
2013-01-08    1.761499
2013-01-09    1.927863
2013-01-10   -0.171690
Freq: D, Name: C, dtype: float64

# 选取特定值:
df.iloc[1,2]

-0.41305425875508139

# 选取指定的行/列
df.iloc[[1,2,4],:] # 指定行

	A	B	C	D
2013-01-02	0.103394	-1.051044	-0.413054	0.268955
2013-01-03	0.174730	2.056007	1.781379	1.643397
2013-01-05	0.076178	-0.518970	1.142290	-0.952401

df.iloc[:,[0,2]] # 指定列

	A	C
2013-01-01	0.754077	-0.557050
2013-01-02	0.103394	-0.413054
2013-01-03	0.174730	1.781379
2013-01-04	-0.950517	-0.097138
2013-01-05	0.076178	1.142290
2013-01-06	1.371702	-1.470106
2013-01-07	0.126720	-2.212507
2013-01-08	-1.246918	1.761499
2013-01-09	0.941099	1.927863
2013-01-10	1.951555	-0.171690

df.iloc[[1,2,4],[0,2]] # 指定行列 ，先行后列

	A	C
2013-01-02	0.103394	-0.413054
2013-01-03	0.174730	1.781379
2013-01-05	0.076178	1.142290

# 切片
df.iloc[1:3,:] # 对行切片:

	A	B	C	D
2013-01-02	0.103394	-1.051044	-0.413054	0.268955
2013-01-03	0.174730	2.056007	1.781379	1.643397

df.iloc[:,1:3] # 对列切片:

	B	C
2013-01-01	-0.346202	-0.557050
2013-01-02	-1.051044	-0.413054
2013-01-03	2.056007	1.781379
2013-01-04	-0.226887	-0.097138
2013-01-05	-0.518970	1.142290
2013-01-06	-1.028873	-1.470106
2013-01-07	-0.251519	-2.212507
2013-01-08	1.530266	1.761499
2013-01-09	-2.420932	1.927863
2013-01-10	-0.264012	-0.171690

df.iloc[3:5,0:2] # 切片选取连续区块。行，列。.iloc按位置切片，左闭右开（不包含结束位置）

	A	B
2013-01-04	-0.950517	-0.226887
2013-01-05	0.076178	-0.518970

Boolean索引

# 通过某列选择数据:
df[df.A > 0]

	A	B	C	D
2013-01-01	0.754077	-0.346202	-0.557050	0.778106
2013-01-02	0.103394	-1.051044	-0.413054	0.268955
2013-01-03	0.174730	2.056007	1.781379	1.643397
2013-01-05	0.076178	-0.518970	1.142290	-0.952401
2013-01-06	1.371702	-1.028873	-1.470106	-0.113098
2013-01-07	0.126720	-0.251519	-2.212507	1.050036
2013-01-09	0.941099	-2.420932	1.927863	-0.549143
2013-01-10	1.951555	-0.264012	-0.171690	0.869293

# 通过where选择数据:
b = df[df > 0]
b

	A	B	C	D
2013-01-01	0.754077	NaN	NaN	0.778106
2013-01-02	0.103394	NaN	NaN	0.268955
2013-01-03	0.174730	2.056007	1.781379	1.643397
2013-01-04	NaN	NaN	NaN	NaN
2013-01-05	0.076178	NaN	1.142290	NaN
2013-01-06	1.371702	NaN	NaN	NaN
2013-01-07	0.126720	NaN	NaN	1.050036
2013-01-08	NaN	1.530266	1.761499	0.940741
2013-01-09	0.941099	NaN	1.927863	NaN
2013-01-10	1.951555	NaN	NaN	0.869293

type(b['A']['2013-01-01'])

numpy.float64

# 通过 isin() 过滤数据:
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three','five','four','three','five']
df2

	A	B	C	D	E
2013-01-01	0.754077	-0.346202	-0.557050	0.778106	one
2013-01-02	0.103394	-1.051044	-0.413054	0.268955	one
2013-01-03	0.174730	2.056007	1.781379	1.643397	two
2013-01-04	-0.950517	-0.226887	-0.097138	-0.442010	three
2013-01-05	0.076178	-0.518970	1.142290	-0.952401	four
2013-01-06	1.371702	-1.028873	-1.470106	-0.113098	three
2013-01-07	0.126720	-0.251519	-2.212507	1.050036	five
2013-01-08	-1.246918	1.530266	1.761499	0.940741	four
2013-01-09	0.941099	-2.420932	1.927863	-0.549143	three
2013-01-10	1.951555	-0.264012	-0.171690	0.869293	five

df2['E'].isin(['one','four'])

2013-01-01     True
2013-01-02     True
2013-01-03    False
2013-01-04    False
2013-01-05     True
2013-01-06    False
2013-01-07    False
2013-01-08     True
2013-01-09    False
2013-01-10    False
Freq: D, Name: E, dtype: bool

df2[df2['E'].isin(['one','four'])]

	A	B	C	D	E
2013-01-01	0.754077	-0.346202	-0.557050	0.778106	one
2013-01-02	0.103394	-1.051044	-0.413054	0.268955	one
2013-01-05	0.076178	-0.518970	1.142290	-0.952401	four
2013-01-08	-1.246918	1.530266	1.761499	0.940741	four

增 Create

s1 = pd.Series([1,2,3,4,5,6], 
               index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

# 新增一列数据
df2['F'] = s1
df2

	A	B	C	D	E	F
2013-01-01	0.754077	-0.346202	-0.557050	0.778106	one	NaN
2013-01-02	0.103394	-1.051044	-0.413054	0.268955	one	1.0
2013-01-03	0.174730	2.056007	1.781379	1.643397	two	2.0
2013-01-04	-0.950517	-0.226887	-0.097138	-0.442010	three	3.0
2013-01-05	0.076178	-0.518970	1.142290	-0.952401	four	4.0
2013-01-06	1.371702	-1.028873	-1.470106	-0.113098	three	5.0
2013-01-07	0.126720	-0.251519	-2.212507	1.050036	five	6.0
2013-01-08	-1.246918	1.530266	1.761499	0.940741	four	NaN
2013-01-09	0.941099	-2.420932	1.927863	-0.549143	three	NaN
2013-01-10	1.951555	-0.264012	-0.171690	0.869293	five	NaN

改 Update

# 更新一列值
df2.loc[:,'D']

2013-01-01    0.778106
2013-01-02    0.268955
2013-01-03    1.643397
2013-01-04   -0.442010
2013-01-05   -0.952401
2013-01-06   -0.113098
2013-01-07    1.050036
2013-01-08    0.940741
2013-01-09   -0.549143
2013-01-10    0.869293
Freq: D, Name: D, dtype: float64

df2.loc[:,'D'] = 5
df2

	A	B	C	D	E	F
2013-01-01	0.754077	-0.346202	-0.557050	5	one	NaN
2013-01-02	0.103394	-1.051044	-0.413054	5	one	1.0
2013-01-03	0.174730	2.056007	1.781379	5	two	2.0
2013-01-04	-0.950517	-0.226887	-0.097138	5	three	3.0
2013-01-05	0.076178	-0.518970	1.142290	5	four	4.0
2013-01-06	1.371702	-1.028873	-1.470106	5	three	5.0
2013-01-07	0.126720	-0.251519	-2.212507	5	five	6.0
2013-01-08	-1.246918	1.530266	1.761499	5	four	NaN
2013-01-09	0.941099	-2.420932	1.927863	5	three	NaN
2013-01-10	1.951555	-0.264012	-0.171690	5	five	NaN

df2.iloc[1,3]

df2.iloc[1,3] = 10.1
df2

	A	B	C	D	E	F
2013-01-01	0.754077	-0.346202	-0.557050	5.0	one	NaN
2013-01-02	0.103394	-1.051044	-0.413054	10.1	one	1.0
2013-01-03	0.174730	2.056007	1.781379	5.0	two	2.0
2013-01-04	-0.950517	-0.226887	-0.097138	5.0	three	3.0
2013-01-05	0.076178	-0.518970	1.142290	5.0	four	4.0
2013-01-06	1.371702	-1.028873	-1.470106	5.0	three	5.0
2013-01-07	0.126720	-0.251519	-2.212507	5.0	five	6.0
2013-01-08	-1.246918	1.530266	1.761499	5.0	four	NaN
2013-01-09	0.941099	-2.420932	1.927863	5.0	three	NaN
2013-01-10	1.951555	-0.264012	-0.171690	5.0	five	NaN

# 通过where更新
df3 = df.copy()
df3[df3 > 0] = -df3
df3

	A	B	C	D
2013-01-01	-0.754077	-0.346202	-0.557050	-0.778106
2013-01-02	-0.103394	-1.051044	-0.413054	-0.268955
2013-01-03	-0.174730	-2.056007	-1.781379	-1.643397
2013-01-04	-0.950517	-0.226887	-0.097138	-0.442010
2013-01-05	-0.076178	-0.518970	-1.142290	-0.952401
2013-01-06	-1.371702	-1.028873	-1.470106	-0.113098
2013-01-07	-0.126720	-0.251519	-2.212507	-1.050036
2013-01-08	-1.246918	-1.530266	-1.761499	-0.940741
2013-01-09	-0.941099	-2.420932	-1.927863	-0.549143
2013-01-10	-1.951555	-0.264012	-0.171690	-0.869293