Pandas数据结构Series
'''
【课程2.2】 Pandas数据结构Series:基本概念及创建
"一维数组"Series
'''
Series 基本概念及创建
# Series data structure
# A Series is a labelled one-dimensional array that can hold any data type
# (integers, strings, floats, Python objects, ...). Its axis labels are
# collectively called the index.
import numpy as np
import pandas as pd

ar = np.random.rand(5)
# When index= is omitted the labels are integers; an explicit index (letters
# here) must supply exactly one label per element.
s = pd.Series(data=ar, index=['a', 'b', 'c', 'd', 'e'])
print(ar, s, type(s), '------', sep='\n')
# .index is an Index object (not a generator); list() turns it into a plain list.
print(s.index, s.index.tolist())
# .values exposes the underlying data as a numpy ndarray (not a Python list).
print(s.values, type(s.values))
# .index  -> the labels; with the explicit letter labels used here it prints as
#            Index([...], dtype='object').
# .values -> the data, as an ndarray.
# Key idea: a Series is an ndarray that carries its own index,
#           i.e. one-dimensional array + matching labels.
# Looking only at the values of a Series therefore gives you an ndarray.
# Indexing and slicing a Series works much like it does for an ndarray.
# Compared with a dict, a Series behaves like an ordered mapping: values are
# looked up by index label in the same way a dict looks them up by key.
[0.65459383 0.50536351 0.60767807 0.57021068 0.19890865]
a 0.654594
b 0.505364
c 0.607678
d 0.570211
e 0.198909
dtype: float64
<class 'pandas.core.series.Series'>
------
Index(['a', 'b', 'c', 'd', 'e'], dtype='object') ['a', 'b', 'c', 'd', 'e']
[0.65459383 0.50536351 0.60767807 0.57021068 0.19890865] <class 'numpy.ndarray'>
Series 创建方法一:由字典创建
# Series 创建方法一:由字典创建,字典的key就是index,values就是values
dic = {'a':1 ,'b':2 , 'c':3, '4':4, '5':5} # 这里的key可以为任意的基本类型
s = pd.Series(dic)
print(s)
# 注意:本例中key都是字符串(key也可以是整数等其他可哈希类型);假如values类型不止一个会怎么样?(dtype会变为object) → dic = {'a':1 ,'b':'hello' , 'c':3, '4':4, '5':5}
a 1
b 2
c 3
4 4
5 5
dtype: int64
Series 创建方法二:由数组创建(一维数组)
# Series creation method 2: from a one-dimensional array
arr = np.random.randn(5)
s = pd.Series(arr)
print(arr)
print(s)
# With no index given, the labels default to integers starting at 0, step 1.

# `np.object` was only an alias of the builtin `object`; the alias was
# deprecated in NumPy 1.20 and removed in 1.24, so pass `object` directly.
# object dtype stores arbitrary Python objects - it is NOT a string dtype.
s = pd.Series(arr, index=['a', 'b', 'c', 'd', 'e'], dtype=object)
print(s)
# index parameter: sets the labels; its length must match the data.
# dtype parameter: sets the element type of the resulting Series.
[ 0.91786854 0.86101153 -0.66725857 -0.40829045 1.69792589]
0 0.917869
1 0.861012
2 -0.667259
3 -0.408290
4 1.697926
dtype: float64
a 0.917869
b 0.861012
c -0.667259
d -0.40829
e 1.69793
dtype: object
Series 创建方法三:由标量创建
# Series 创建方法三:由标量创建
s = pd.Series(10, index = range(4)) # 这里的index就等于创建的长度
print(s)
# 如果data是标量值,则必须提供索引。该值会重复,来匹配索引的长度
0 10
1 10
2 10
3 10
dtype: int64
课堂作业
# 字典方式创建
dictionary = {'Jack':92,'Marry':92,'Tom':89,'Zack':65}
s = pd.Series(dictionary,dtype=np.float64,name='作业一')
print(s)
Jack 92.0
Marry 92.0
Tom 89.0
Zack 65.0
Name: 作业一, dtype: float64
# 数组方式创建
dictionary = {'Jack':92,'Marry':92,'Tom':89,'Zack':65}
s = pd.Series(list(dictionary.values()),index=list(dictionary.keys()),dtype=np.float64,name='作业一')
print(s)
Jack 92.0
Marry 92.0
Tom 89.0
Zack 65.0
Name: 作业一, dtype: float64
Series基本技巧
'''
【课程2.3】 Pandas数据结构Series:基本技巧
数据查看 / 重新索引 / 对齐 / 添加、修改、删除值
'''
数据查看
# 数据查看
s = pd.Series(np.random.rand(15))
print(s.head(3)) # 查看头n条 默认显示5条
print(s.tail()) # 查看尾n条 默认显示5条
0 0.658717
1 0.446246
2 0.779948
dtype: float64
10 0.387464
11 0.000036
12 0.877476
13 0.068350
14 0.151361
dtype: float64
重新索引reindex
# 重新索引reindex
# .reindex将会根据索引重新排序,如果当前索引不存在,则引入缺失值
s = pd.Series(np.random.rand(5),index=list('abcde'))
print(s)
print('----------')
s1 = s.reindex(index=list('bdcf'))
print(s)
print(s1)
print(s.reindex(index=list('bdef'),fill_value = 0)) # 对于找不到的索引,会使用0来填充
a 0.650794
b 0.892547
c 0.709360
d 0.097026
e 0.547834
dtype: float64
----------
a 0.650794
b 0.892547
c 0.709360
d 0.097026
e 0.547834
dtype: float64
b 0.892547
d 0.097026
c 0.709360
f NaN
dtype: float64
b 0.892547
d 0.097026
e 0.547834
f 0.000000
dtype: float64
Series对齐 自动将名称相同的索引对齐,并进行相应的运算
# Series对齐 自动将名称相同的索引对齐,并进行相应的运算
s1 = pd.Series(np.random.rand(3), index = ['Jack','Marry','Tom'])
s2 = pd.Series(np.random.rand(3), index = ['Wang','Jack','Marry'])
print(s1)
print(s2)
print(s1+s2)
# Series 和 ndarray 之间的主要区别是,Series 上的操作会根据标签自动对齐
# index顺序不会影响数值计算,以标签来计算
# 空值和任何值计算结果仍为空值
Jack 0.970366
Marry 0.105214
Tom 0.609774
dtype: float64
Wang 0.959125
Jack 0.205666
Marry 0.892832
dtype: float64
Jack 1.176031
Marry 0.998046
Tom NaN
Wang NaN
dtype: float64
删除:.drop
# 删除:.drop
s = pd.Series(np.random.rand(5), index = list('ngjur'))
print(s)
s1 = s.drop('n')
s2 = s.drop(['g','j'])
print(s1)
print(s2)
print(s)
# drop 删除元素之后返回副本(inplace=False),如果是(inplace=True)就会在原数据上更改
n 0.360458
g 0.068896
j 0.217887
u 0.486113
r 0.475699
dtype: float64
g 0.068896
j 0.217887
u 0.486113
r 0.475699
dtype: float64
n 0.360458
u 0.486113
r 0.475699
dtype: float64
n 0.360458
g 0.068896
j 0.217887
u 0.486113
r 0.475699
dtype: float64
添加,类似字典
# Adding values to a Series
s1 = pd.Series(np.random.rand(5))
s2 = pd.Series(np.random.rand(5), index=list('ngjur'))
print(s1)
print(s2)
s1[5] = 100
s2['a'] = 100
print(s1)
print(s2)
print('-----')
# Assigning to a label that does not exist yet adds a new element in place.

# Series.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to join two Series end to end.
s3 = pd.concat([s1, s2])
print(s3)
print(s1)
# pd.concat builds a new Series; s1 and s2 themselves are left unchanged.
0 0.539677
1 0.807219
2 0.816236
3 0.867660
4 0.101201
dtype: float64
n 0.176133
g 0.786885
j 0.572897
u 0.506762
r 0.812056
dtype: float64
0 0.539677
1 0.807219
2 0.816236
3 0.867660
4 0.101201
5 100.000000
dtype: float64
n 0.176133
g 0.786885
j 0.572897
u 0.506762
r 0.812056
a 100.000000
dtype: float64
-----
0 0.539677
1 0.807219
2 0.816236
3 0.867660
4 0.101201
5 100.000000
n 0.176133
g 0.786885
j 0.572897
u 0.506762
r 0.812056
a 100.000000
dtype: float64
0 0.539677
1 0.807219
2 0.816236
3 0.867660
4 0.101201
5 100.000000
dtype: float64
课堂作业
s = pd.Series(np.arange(10),index=list('abcdefghij'),dtype=np.int32)
print('创建series:')
print(s)
print('-------')
print('修改后:')
s1 = s.reindex(index=list('acdefghij'))
# s1 = s.drop('b') # 也可以直接删除
# s.drop('b',inplace=True)
s1['e'] = 100
s1['f'] = 100
print(s1)
创建series:
a 0
b 1
c 2
d 3
e 4
f 5
g 6
h 7
i 8
j 9
dtype: int32
-------
修改后:
a 0
c 2
d 3
e 100
f 100
g 6
h 7
i 8
j 9
dtype: int32
Series:索引
'''
【课程2.4】 Pandas数据结构Series:索引
位置下标 / 标签索引 / 切片索引 / 布尔型索引
'''
位置下标索引,类似序列
# 位置下标,类似序列
s = pd.Series(np.random.rand(5))
print(s)
print(s[0],type(s[0]),s[0].dtype)
print(float(s[0]),type(float(s[0])))
#print(s[-1])
# 位置下标从0开始
# 输出结果为numpy.float格式,
# 可以通过float()函数转换为python float格式
# numpy.float与float占用字节不同
# s[-1]结果如何? 报错,与列表的操作并不完全相同,下标-1做索引报错,但是-1做切片是可以的
0 0.669082
1 0.026808
2 0.609000
3 0.883043
4 0.764092
dtype: float64
0.6690815408230547 <class 'numpy.float64'> float64
0.6690815408230547 <class 'float'>
标签index索引
# 标签索引
s = pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])
print(s)
print(s['a'],type(s['a']),s['a'].dtype)
# 方法类似下标索引,用[]表示,内写上index,注意index是字符串
sci = s[['a','b','e']] # 返回的是一个新的Series
print(sci,type(sci))
# 如果需要选择多个标签的值,用[[]]来表示(相当于[]中包含一个列表)
# 多标签索引结果是新的数组!!!!!!
a 0.106570
b 0.804462
c 0.824573
d 0.972822
e 0.147719
dtype: float64
0.10656950971706569 <class 'numpy.float64'> float64
a 0.106570
b 0.804462
e 0.147719
dtype: float64 <class 'pandas.core.series.Series'>
切片索引
# Slice indexing
s1 = pd.Series(np.random.rand(5))
s2 = pd.Series(np.random.rand(5), index=['a', 'b', 'c', 'd', 'e'])
print(s1.iloc[1:4], s1.iloc[4])      # positions [1, 4): the end is excluded
print(s2.loc['a':'c'], s2.loc['c'])  # labels ['a', 'c']: the end is included
# Integer access on a string-labelled Series through plain [] relies on a
# positional fallback that pandas has deprecated, so positions go through
# .iloc explicitly.
print(s2.iloc[0:3], s2.iloc[3])
print('-----')
# Note: slicing by label includes the end point, slicing by position does not.
# With the default integer index (0, 1, 2, ...) labels and positions look the
# same, so spell out .loc / .iloc to make the intent unambiguous.
print(s2.iloc[:-1])
print(s2.iloc[::2])
# Positional slices use the same start:stop:step syntax as a list.
1 0.489538
2 0.272762
3 0.732089
dtype: float64 0.17599774297103532
a 0.737894
b 0.527086
c 0.568121
dtype: float64 0.5681207964616851
a 0.737894
b 0.527086
c 0.568121
dtype: float64 0.7325959705989727
-----
a 0.737894
b 0.527086
c 0.568121
d 0.732596
dtype: float64
a 0.737894
c 0.568121
e 0.540423
dtype: float64
布尔型索引
# 布尔型索引
s = pd.Series(np.random.rand(3)*100)
# s[4] = None # 添加一个空值
# s[5] = np.NAN # 也可以这样添加空值
print(s)
bs1 = s > 50
bs2 = s.isnull()
bs3 = s.notnull()
print(bs1, type(bs1), bs1.dtype)
print(bs2, type(bs2), bs2.dtype)
print(bs3, type(bs3), bs3.dtype)
print('-----')
# 数组做判断之后,返回的是一个由布尔值组成的新的数组
# .isnull() / .notnull() 判断是否为空值 (None代表空值,NaN代表有问题的数值,两个都会识别为空值)
print(s[s > 50])
print(s[bs3])
# 布尔型索引方法:用[判断条件]表示,其中判断条件可以是 一个语句,或者是 一个布尔型数组!
0 29.382354
1 76.100793
2 42.179288
dtype: float64
0 False
1 True
2 False
dtype: bool <class 'pandas.core.series.Series'> bool
0 False
1 False
2 False
dtype: bool <class 'pandas.core.series.Series'> bool
0 True
1 True
2 True
dtype: bool <class 'pandas.core.series.Series'> bool
-----
1 76.100793
dtype: float64
0 29.382354
1 76.100793
2 42.179288
dtype: float64
课堂作业
s = pd.Series(np.random.rand(10)*100,index=list([chr(i) for i in range(97,107)]))
print(s)
print('------')
print(s[['b','c']])
print('------')
# print(s[4:7])
print(s['d':'f'])
print('------')
print(s[s>50])
a 86.207480
b 47.007835
c 38.695491
d 15.130944
e 13.137920
f 15.688881
g 92.948870
h 81.823954
i 85.160048
j 69.286039
dtype: float64
------
b 47.007835
c 38.695491
dtype: float64
------
d 15.130944
e 13.137920
f 15.688881
dtype: float64
------
a 86.207480
g 92.948870
h 81.823954
i 85.160048
j 69.286039
dtype: float64
Pandas数据结构Dataframe
'''
【课程2.5】 Pandas数据结构Dataframe:基本概念及创建
"二维数组"Dataframe:是一个表格型的数据结构,包含一组有序的列,其列的值类型可以是数值、字符串、布尔值等。
Dataframe中的数据以一个或多个二维块存放,不是列表、字典或一维数组结构。
'''
Dataframe:基本概念及创建
# Dataframe 数据结构 其实可以看成一个excel表格
# Dataframe是一个表格型的数据结构,“带有标签的二维数组”。
# Dataframe带有index(行标签)和columns(列标签)
data = {'name':['Jack','Tom','Mary'],
'age':[18,19,20],
'gender':['m','m','w']}
frame = pd.DataFrame(data)
print(frame)
print(type(frame))
print(frame.index,'该数据类型为:',type(frame.index))
print(frame.columns,'该数据类型为:',type(frame.columns)) # 注意:字段名其实也是一个Index
print(frame.values,'该数据类型为:',type(frame.values))
# 查看数据,数据类型为dataframe
# .index查看行标签
# .columns查看列标签
# .values查看值,数据类型为ndarray
name age gender
0 Jack 18 m
1 Tom 19 m
2 Mary 20 w
<class 'pandas.core.frame.DataFrame'>
RangeIndex(start=0, stop=3, step=1) 该数据类型为: <class 'pandas.core.indexes.range.RangeIndex'>
Index(['name', 'age', 'gender'], dtype='object') 该数据类型为: <class 'pandas.core.indexes.base.Index'>
[['Jack' 18 'm']
['Tom' 19 'm']
['Mary' 20 'w']] 该数据类型为: <class 'numpy.ndarray'>
Dataframe 创建方法一:由数组/list组成的字典
# Dataframe 创建方法一:由数组/list组成的字典
# 创建方法:pandas.DataFrame()
data1 = {'a':[1,2,3],
'b':[3,4,5],
'c':[5,6,7]}
data2 = {'one':np.random.rand(3),
'two':np.random.rand(3)} # 这里如果尝试 'two':np.random.rand(4) 会怎么样? 会报错
print(data1)
print(data2)
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print(df2)
# 由数组/list组成的字典 创建Dataframe,columns为字典key,index为默认数字标签
# 字典的值的长度必须保持一致!
df1 = pd.DataFrame(data1, columns = ['b','c','a','d']) # 这里的用法和reindex类似,如果有则重新排序,没有就默认为Nan
print(df1)
df1 = pd.DataFrame(data1, columns = ['b','c'])
print(df1)
# columns参数:可以重新指定列的顺序,格式为list,如果现有数据中没有该列(比如'd'),则产生NaN值
# 如果columns重新指定时候,列的数量可以少于原数据
df2 = pd.DataFrame(data2, index = ['f1','f2','f3']) # 这里如果尝试 index = ['f1','f2','f3','f4'] 会怎么样? 会报错,因此Shape必须一致
print(df2)
# index参数:重新定义index,格式为list,长度必须保持一致
{'a': [1, 2, 3], 'b': [3, 4, 5], 'c': [5, 6, 7]}
{'one': array([0.95067378, 0.49663747, 0.91243946]), 'two': array([0.77852305, 0.46251614, 0.16788786])}
a b c
0 1 3 5
1 2 4 6
2 3 5 7
one two
0 0.950674 0.778523
1 0.496637 0.462516
2 0.912439 0.167888
b c a d
0 3 5 1 NaN
1 4 6 2 NaN
2 5 7 3 NaN
b c
0 3 5
1 4 6
2 5 7
one two
f1 0.950674 0.778523
f2 0.496637 0.462516
f3 0.912439 0.167888
Dataframe 创建方法二:由Series组成的字典
# Dataframe 创建方法二:由Series组成的字典
data1 = {'one':pd.Series(np.random.rand(2)),
'two':pd.Series(np.random.rand(3))} # 没有设置index的Series
data2 = {'one':pd.Series(np.random.rand(2), index = ['a','b']),
'two':pd.Series(np.random.rand(3),index = ['a','b','c'])} # 设置了index的Series 和数组不同,series可以行数不一样
print(data1)
print(data2)
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2) # 会默认将Series中相同index的列,对齐
print(df1)
print(df2)
# 由Series组成的字典 创建Dataframe,columns为字典key,index为Series的标签(如果Series没有指定标签,则是默认数字标签)
# Series可以长度不一样,生成的Dataframe会出现NaN值!!!!!!
{'one': 0 0.379779
1 0.756374
dtype: float64, 'two': 0 0.363515
1 0.557073
2 0.095631
dtype: float64}
{'one': a 0.565477
b 0.476197
dtype: float64, 'two': a 0.654894
b 0.898691
c 0.513671
dtype: float64}
one two
0 0.379779 0.363515
1 0.756374 0.557073
2 NaN 0.095631
one two
a 0.565477 0.654894
b 0.476197 0.898691
c NaN 0.513671
Dataframe 创建方法三:通过二维数组直接创建
# Dataframe 创建方法三:通过二维数组直接创建
ar = np.random.rand(9).reshape(3,3)
print(ar)
df1 = pd.DataFrame(ar)
df2 = pd.DataFrame(ar, index = ['a', 'b', 'c'], columns = ['one','two','three']) # 可以尝试一下index或columns长度不等于已有数组的情况
print(df1)
print(df2)
# 通过二维数组直接创建Dataframe,得到一样形状的结果数据,如果不指定index和columns,两者均返回默认数字格式
# index和columns指定长度与原数组保持一致
[[0.0483642 0.66503315 0.98202803]
[0.30064878 0.51297485 0.00827587]
[0.01321322 0.08634642 0.58617837]]
0 1 2
0 0.048364 0.665033 0.982028
1 0.300649 0.512975 0.008276
2 0.013213 0.086346 0.586178
one two three
a 0.048364 0.665033 0.982028
b 0.300649 0.512975 0.008276
c 0.013213 0.086346 0.586178
Dataframe 创建方法四:由字典组成的列表
# Dataframe 创建方法四:由字典组成的列表
data = [{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}]
print(data)
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data, index = ['a','b'])
df3 = pd.DataFrame(data, columns = ['one','two'])
print(df1)
print(df2)
print(df3)
# 由字典组成的列表创建Dataframe,columns为字典的key,index不做指定则为默认数组标签
# columns和index参数分别重新指定相应列及行标签
[{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}]
one three two
0 1 NaN 2
1 5 20.0 10
one three two
a 1 NaN 2
b 5 20.0 10
one two
0 1 2
1 5 10
Dataframe 创建方法五:由字典组成的字典
# Dataframe 创建方法五:由字典组成的字典
data = {'Jack':{'math':90,'english':89,'art':78},
'Marry':{'math':82,'english':95,'art':92},
'Tom':{'math':78,'english':67}}
df1 = pd.DataFrame(data)
print(df1)
# 由字典组成的字典创建Dataframe,columns为字典的key,index为子字典的key
df2 = pd.DataFrame(data, columns = ['Jack','Tom','Bob'])
df3 = pd.DataFrame(data, index = ['a','b','c'])
print(df2)
print(df3)
# columns参数可以增加和减少现有列,如出现新的列,值为NaN
# index在这里和之前不同,并不能改变原有index,如果指向新的标签,值为NaN (非常重要!)
Jack Marry Tom
art 78 92 NaN
english 89 95 67.0
math 90 82 78.0
Jack Tom Bob
art 78 NaN NaN
english 89 67.0 NaN
math 90 78.0 NaN
Jack Marry Tom
a NaN NaN NaN
b NaN NaN NaN
c NaN NaN NaN
创建方法总结
###### 总结
# Dataframe 创建方法一:由数组/list组成的字典
# 创建方法:pandas.DataFrame()
data1 = {'a':[1,2,3],
'b':[3,4,5],
'c':[5,6,7]} # index个数必须相同,默认为数字 colums为a,b,c
# Dataframe 创建方法二:由Series组成的字典
data1 = {'one':pd.Series(np.random.rand(2)),
'two':pd.Series(np.random.rand(3))} # 没有设置index的Series index默认为数字,个数可以不同,colums 为one,two
# Dataframe 创建方法三:通过二维数组直接创建
ar = np.random.rand(9).reshape(3,3) # 先指定形状,再设置 index columns
df2 = pd.DataFrame(ar, index = ['a', 'b', 'c'], columns = ['one','two','three']) # 可以尝试一下index或columns长度不等于已有数组的情况
# Dataframe 创建方法四:由字典组成的列表 ,一个字典代表一行数据
data = [{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}] # index为数字,columns为one two three
# Dataframe 创建方法五:由字典组成的字典
data = {'Jack':{'math':90,'english':89,'art':78},
'Marry':{'math':82,'english':95,'art':92},
'Tom':{'math':78,'english':67}} # index 为math english art ,columns为Jack Marry Tom
# columns参数可以增加和减少现有列,如出现新的列,值为NaN
# index在这里和之前不同,并不能改变原有index,如果指向新的标签,值为NaN (非常重要!)
##### Homework #####
# Build the same 5x5 table (rows a-e, columns one-five) in four different ways.
col_names = ['one', 'two', 'three', 'four', 'five']

# 1) dict of lists: keys become the columns.
data = {'one': [1, 2, 3, 4, 5],
        'two': [2, 3, 4, 5, 6],
        'three': [3, 4, 5, 6, 7],
        'four': [4, 5, 6, 7, 8],
        'five': [5, 6, 7, 8, 9]}
df = pd.DataFrame(data, index=list('abcde'))
print(df)

# 2) dict of int32 ndarrays: same layout, explicit element type.
data = {'one': np.arange(1, 6, dtype=np.int32),
        'two': np.arange(2, 7, dtype=np.int32),
        'three': np.arange(3, 8, dtype=np.int32),
        'four': np.arange(4, 9, dtype=np.int32),
        'five': np.arange(5, 10, dtype=np.int32)}
df = pd.DataFrame(data, index=list('abcde'))
print(df)

# 3) two-dimensional array: stack all rows in a single vstack call instead of
#    re-copying the growing array on every loop iteration.
data = np.vstack([np.arange(i, i + 5) for i in range(1, 6)])
df = pd.DataFrame(data, index=list('abcde'), columns=col_names)
print(df)

# 4) list of dicts: each dict is one row, its keys are the column names.
data = [{name: i + offset for offset, name in enumerate(col_names)}
        for i in range(1, 6)]
df = pd.DataFrame(data, index=list('abcde'), columns=col_names)
print(df)
one two three four five
a 1 2 3 4 5
b 2 3 4 5 6
c 3 4 5 6 7
d 4 5 6 7 8
e 5 6 7 8 9
one two three four five
a 1 2 3 4 5
b 2 3 4 5 6
c 3 4 5 6 7
d 4 5 6 7 8
e 5 6 7 8 9
one two three four five
a 1 2 3 4 5
b 2 3 4 5 6
c 3 4 5 6 7
d 4 5 6 7 8
e 5 6 7 8 9
one two three four five
a 1 2 3 4 5
b 2 3 4 5 6
c 3 4 5 6 7
d 4 5 6 7 8
e 5 6 7 8 9
Dataframe:索引
'''
【课程2.6】 Pandas数据结构Dataframe:索引
Dataframe既有行索引也有列索引,可以被看做由Series组成的字典(共用一个索引)
选择列 / 选择行 / 切片 / 布尔判断
'''
选择行与列
# Selecting rows and columns
df = pd.DataFrame(np.random.rand(12).reshape(3, 4) * 100,
                  index=['one', 'two', 'three'],
                  columns=['a', 'b', 'c', 'd'])
print(df)

data1 = df['a']
data2 = df[['a', 'c']]
print(data1, type(data1))
print(data2, type(data2))
print('-----')
# Selecting by column name: one column gives a Series, a list of columns
# gives a DataFrame.

data3 = df.loc['one']
data4 = df.loc[['one', 'two']]
# Print each row selection next to its OWN type (the object and the type
# were previously taken from different variables, mislabelling the output).
print(data3, type(data3))
print(data4, type(data4))
# Selecting by index label: one row gives a Series, a list of rows gives
# a DataFrame.
a b c d
one 63.658051 61.219455 95.970986 58.796278
two 51.466117 37.149830 65.926048 14.213535
three 69.975003 80.048414 86.991389 56.791412
one 63.658051
two 51.466117
three 69.975003
Name: a, dtype: float64 <class 'pandas.core.series.Series'>
a c
one 63.658051 95.970986
two 51.466117 65.926048
three 69.975003 86.991389 <class 'pandas.core.frame.DataFrame'>
-----
a c
one 63.658051 95.970986
two 51.466117 65.926048
three 69.975003 86.991389 <class 'pandas.core.series.Series'>
a 63.658051
b 61.219455
c 95.970986
d 58.796278
Name: one, dtype: float64 <class 'pandas.core.frame.DataFrame'>
df[] - 选择列
# df[] - 选择列
# 一般用于选择列,也可以选择行
df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
index = ['one','two','three'],
columns = ['a','b','c','d'])
print(df)
print('-----')
data1 = df['a']
data2 = df[['b','c']] # 尝试输入 data2 = df[['b','c','e']]
print(data1)
print(data2)
# df[]默认选择列,[]中写列名(所以一般数据columns都会单独指定,不会用默认数字列名,以免和index冲突)
# 单选列为Series,print结果为Series格式
# 多选列为Dataframe,print结果为Dataframe格式
data3 = df[:1]
#data3 = df[0]
#data3 = df['one']
print(data3,type(data3))
# df[]中为数字时,默认选择行,且只能进行切片的选择,不能单独选择(df[0])
# 输出结果为Dataframe,即便只选择一行
# df[]不能通过索引标签名来选择行(df['one'])
# 核心笔记:df[col]一般用于选择列,[]中写列名
# 选择一个为Series,多个为Dataframe
a b c d
one 97.270936 71.105559 65.199722 70.490576
two 25.521698 27.123590 79.141004 72.313144
three 60.423873 74.037118 75.549528 43.943606
-----
one 97.270936
two 25.521698
three 60.423873
Name: a, dtype: float64
b c
one 71.105559 65.199722
two 27.123590 79.141004
three 74.037118 75.549528
a b c d
one 97.270936 71.105559 65.199722 70.490576 <class 'pandas.core.frame.DataFrame'>
df.loc[] - 按index选择行
# df.loc[] - select rows by index label
df1 = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100,
                   index=['one', 'two', 'three', 'four'],
                   columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100,
                   columns=['a', 'b', 'c', 'd'])
print(df1)
print(df2)
print('-----')

data1 = df1.loc['one']
data2 = df2.loc[1]
print(data1)
print(data2)
print('单标签索引\n-----')
# A single label returns a Series.

# 'five' is not a label of df1. Passing a list with a missing label to .loc
# raises KeyError on current pandas (older releases only warned), so use
# .reindex(), which keeps the requested order and fills unknown labels with NaN.
data3 = df1.reindex(['two', 'three', 'five'])
data4 = df2.loc[[3, 2, 1]]
print(data3)
print(data4)
print('多标签索引\n-----')
# A list of labels returns a DataFrame; with .loc every label must exist.
# The rows come back in the order the labels were given.

data5 = df1.loc['one':'three']
data6 = df2.loc[1:3]
print(data5)
print(data6)
print('切片索引')
# .loc also accepts label slices.
# The end label IS included!
# Key takeaway: df.loc[label] selects rows by index label and works with both
# custom labels and the default integer index.
# One label gives a Series, several labels give a DataFrame.
a b c d
one 34.200299 97.300522 91.231261 24.340456
two 51.042480 39.357641 22.455337 59.087299
three 11.484544 61.092752 46.594442 40.344873
four 71.231453 64.665933 8.626087 4.135545
a b c d
0 97.361671 26.151709 16.752567 99.514991
1 25.585227 66.758317 73.043166 24.242259
2 47.956365 50.873741 16.364481 66.536050
3 13.925689 44.546587 54.799909 92.746187
-----
a 34.200299
b 97.300522
c 91.231261
d 24.340456
Name: one, dtype: float64
a 25.585227
b 66.758317
c 73.043166
d 24.242259
Name: 1, dtype: float64
单标签索引
-----
a b c d
two 51.042480 39.357641 22.455337 59.087299
three 11.484544 61.092752 46.594442 40.344873
five NaN NaN NaN NaN
a b c d
3 13.925689 44.546587 54.799909 92.746187
2 47.956365 50.873741 16.364481 66.536050
1 25.585227 66.758317 73.043166 24.242259
多标签索引
-----
a b c d
one 34.200299 97.300522 91.231261 24.340456
two 51.042480 39.357641 22.455337 59.087299
three 11.484544 61.092752 46.594442 40.344873
a b c d
1 25.585227 66.758317 73.043166 24.242259
2 47.956365 50.873741 16.364481 66.536050
3 13.925689 44.546587 54.799909 92.746187
切片索引
D:\python\Anaconda3\lib\site-packages\ipykernel_launcher.py:19: FutureWarning:
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.
See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
df.iloc[] - 按照整数位置(从轴的0到length-1)选择行
# df.iloc[] - 按照整数位置(从轴的0到length-1)选择行
# 类似list的索引,其顺序就是dataframe的整数位置,从0开始计
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
index = ['one','two','three','four'],
columns = ['a','b','c','d'])
print(df)
print('------')
print(df.iloc[0])
print(df.iloc[-1])
#print(df.iloc[4])
print('单位置索引\n-----')
# 单位置索引
# 和loc索引不同,不能索引超出数据行数的整数位置
print(df.iloc[[0,2]])
print(df.iloc[[3,2,1]])
print('多位置索引\n-----')
# 多位置索引
# 顺序可变
print(df.iloc[1:3])
print(df.iloc[::2])
print('切片索引')
# 切片索引
# 末端不包含 !!!!!!!
a b c d
one 15.329859 18.333796 55.690386 80.674277
two 45.645575 34.352875 73.265605 19.522695
three 24.222764 51.737716 0.766872 95.886575
four 28.882672 34.100851 77.533629 7.644694
------
a 15.329859
b 18.333796
c 55.690386
d 80.674277
Name: one, dtype: float64
a 28.882672
b 34.100851
c 77.533629
d 7.644694
Name: four, dtype: float64
单位置索引
-----
a b c d
one 15.329859 18.333796 55.690386 80.674277
three 24.222764 51.737716 0.766872 95.886575
a b c d
four 28.882672 34.100851 77.533629 7.644694
three 24.222764 51.737716 0.766872 95.886575
two 45.645575 34.352875 73.265605 19.522695
多位置索引
-----
a b c d
two 45.645575 34.352875 73.265605 19.522695
three 24.222764 51.737716 0.766872 95.886575
a b c d
one 15.329859 18.333796 55.690386 80.674277
three 24.222764 51.737716 0.766872 95.886575
切片索引
布尔型索引
# 布尔型索引
# 和Series原理相同
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
index = ['one','two','three','four'],
columns = ['a','b','c','d'])
print(df)
print('------')
b1 = df < 20
print(b1,type(b1))
print(df[b1]) # 也可以书写为 df[df < 20]
print('------')
# 不做索引则会对数据每个值进行判断
# 索引结果保留 所有数据:True返回原数据,False返回值为NaN
b2 = df['a'] > 50
print(b2,type(b2))
print(df[b2]) # 也可以书写为 df[df['a'] > 50]
print('------')
# 单列做判断
# 索引结果保留 单列判断为True的行数据,包括其他列
b3 = df[['a','b']] > 50
print(b3,type(b3))
print(df[b3]) # 也可以书写为 df[df[['a','b']] > 50]
print('------')
# 多列做判断 and连接
# 索引结果保留 所有数据:True返回原数据,False返回值为NaN
b4 = df.loc[['one','three']] < 50
print(b4,type(b4))
print(df[b4]) # 也可以书写为 df[df.loc[['one','three']] < 50]
print('------')
# 多行做判断 and连接
# 索引结果保留 所有数据:True返回原数据,False返回值为NaN
a b c d
one 71.684637 37.911621 35.274205 45.574088
two 5.291718 53.618707 99.059019 42.536355
three 59.062514 60.712185 46.012685 68.391096
four 75.862519 45.377423 63.825425 21.287015
------
a b c d
one False False False False
two True False False False
three False False False False
four False False False False <class 'pandas.core.frame.DataFrame'>
a b c d
one NaN NaN NaN NaN
two 5.291718 NaN NaN NaN
three NaN NaN NaN NaN
four NaN NaN NaN NaN
------
one True
two False
three True
four True
Name: a, dtype: bool <class 'pandas.core.series.Series'>
a b c d
one 71.684637 37.911621 35.274205 45.574088
three 59.062514 60.712185 46.012685 68.391096
four 75.862519 45.377423 63.825425 21.287015
------
a b
one True False
two False True
three True True
four True False <class 'pandas.core.frame.DataFrame'>
a b c d
one 71.684637 NaN NaN NaN
two NaN 53.618707 NaN NaN
three 59.062514 60.712185 NaN NaN
four 75.862519 NaN NaN NaN
------
a b c d
one False True True True
three False False True False <class 'pandas.core.frame.DataFrame'>
a b c d
one NaN 37.911621 35.274205 45.574088
two NaN NaN NaN NaN
three NaN NaN 46.012685 NaN
four NaN NaN NaN NaN
------
多重索引:比如同时索引行和列
# 多重索引:比如同时索引行和列
# 先选择列再选择行 —— 相当于对于一个数据,先筛选字段,再选择数据量
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
index = ['one','two','three','four'],
columns = ['a','b','c','d'])
print(df)
print('------')
print(df['a'].loc[['one','three']]) # 选择a列的one,three行
print(df[['b','c','d']].iloc[::2]) # 选择b,c,d列的one,three行
print(df[df['a'] < 50].iloc[:2]) # 选择满足判断索引的前两行数据
a b c d
one 56.875213 9.908319 62.699024 58.315170
two 52.079981 33.619668 3.588995 20.659000
three 0.246695 59.132116 20.443237 67.412348
four 97.904774 46.142361 5.697341 13.133505
------
one 56.875213
three 0.246695
Name: a, dtype: float64
b c d
one 9.908319 62.699024 58.315170
three 59.132116 20.443237 67.412348
a b c d
three 0.246695 59.132116 20.443237 67.412348
######## 课后作业 ########
data = (np.random.rand(16)*100).reshape(4,4)
df = pd.DataFrame(data,index=['one','two','three','four'],columns=list('abcd'))
print(df)
print(df[['b','c']])
print(df.loc[['three','four']])
print(df.iloc[:2])
print(df[df>50])
a b c d
one 16.708037 61.669559 93.579206 56.147879
two 77.357731 53.615608 57.372610 8.899283
three 3.172545 45.956448 52.441762 10.711575
four 54.574644 83.767485 76.276766 10.062513
b c
one 61.669559 93.579206
two 53.615608 57.372610
three 45.956448 52.441762
four 83.767485 76.276766
a b c d
three 3.172545 45.956448 52.441762 10.711575
four 54.574644 83.767485 76.276766 10.062513
a b c d
one 16.708037 61.669559 93.579206 56.147879
two 77.357731 53.615608 57.372610 8.899283
a b c d
one NaN 61.669559 93.579206 56.147879
two 77.357731 53.615608 57.372610 NaN
three NaN NaN 52.441762 NaN
four 54.574644 83.767485 76.276766 NaN
Dataframe:基本技巧
'''
【课程2.7】 Pandas数据结构Dataframe:基本技巧
数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序
'''
数据查看、转置
# 数据查看、转置
df = pd.DataFrame(np.random.rand(16).reshape(8,2)*100,
columns = ['a','b'])
print(df.head(2))
print(df.tail())
# .head()查看头部数据
# .tail()查看尾部数据
# 默认查看5条
print(df.T)
# .T 转置
a b
0 22.193015 88.255882
1 84.893487 96.155443
a b
3 37.605641 84.316712
4 95.337196 99.352520
5 53.815803 22.544153
6 71.428959 71.459992
7 64.036473 99.673388
0 1 2 3 4 5 \
a 22.193015 84.893487 71.382948 37.605641 95.337196 53.815803
b 88.255882 96.155443 39.672617 84.316712 99.352520 22.544153
6 7
a 71.428959 64.036473
b 71.459992 99.673388
添加与修改
# 添加与修改
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
columns = ['a','b','c','d'])
print(df)
df['e'] = 10
df.loc[4] = 20
print(df)
# 新增列/行并赋值
df['e'] = 20
df[['a','c']] = 100
print(df)
# 索引后直接修改值
a b c d
0 33.445828 21.298638 11.622448 93.670966
1 59.519157 9.834715 47.069324 33.137916
2 41.723832 21.524908 4.367143 86.918814
3 22.700945 26.289441 96.076336 43.490014
a b c d e
0 33.445828 21.298638 11.622448 93.670966 10
1 59.519157 9.834715 47.069324 33.137916 10
2 41.723832 21.524908 4.367143 86.918814 10
3 22.700945 26.289441 96.076336 43.490014 10
4 20.000000 20.000000 20.000000 20.000000 20
a b c d e
0 100 21.298638 100 93.670966 20
1 100 9.834715 100 33.137916 20
2 100 21.524908 100 86.918814 20
3 100 26.289441 100 43.490014 20
4 100 20.000000 100 20.000000 20
删除 del / drop()
# 删除 del / drop()
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
columns = ['a','b','c','d'])
print(df)
del df['a']
print(df)
print('-----')
# del语句 - 删除列
print(df.drop(0))
print(df.drop([1,2]))
print(df)
print('-----')
# drop()删除行,inplace=False → 删除后生成新的数据,不改变原数据
print(df.drop(['d'], axis = 1))
print(df)
# drop()删除列,需要加上axis = 1,inplace=False → 删除后生成新的数据,不改变原数据
a b c d
0 12.351620 74.471394 56.015964 61.092441
1 23.061527 91.817518 69.678690 14.832106
2 65.334732 40.489227 7.234896 99.952920
3 47.907514 11.695344 90.186409 18.699640
b c d
0 74.471394 56.015964 61.092441
1 91.817518 69.678690 14.832106
2 40.489227 7.234896 99.952920
3 11.695344 90.186409 18.699640
-----
b c d
1 91.817518 69.678690 14.832106
2 40.489227 7.234896 99.952920
3 11.695344 90.186409 18.699640
b c d
0 74.471394 56.015964 61.092441
3 11.695344 90.186409 18.699640
b c d
0 74.471394 56.015964 61.092441
1 91.817518 69.678690 14.832106
2 40.489227 7.234896 99.952920
3 11.695344 90.186409 18.699640
-----
b c
0 74.471394 56.015964
1 91.817518 69.678690
2 40.489227 7.234896
3 11.695344 90.186409
b c d
0 74.471394 56.015964 61.092441
1 91.817518 69.678690 14.832106
2 40.489227 7.234896 99.952920
3 11.695344 90.186409 18.699640
对齐
# 对齐
df1 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
print(df1 + df2)
# DataFrame对象之间的数据自动按照列和索引(行标签)对齐
A B C D
0 0.188197 -0.864355 1.541761 NaN
1 -0.383886 2.263777 -0.729035 NaN
2 1.408411 -0.265004 -0.041134 NaN
3 -0.028582 0.197926 1.028466 NaN
4 0.689885 -1.450125 1.306084 NaN
5 2.418933 1.130174 -0.817424 NaN
6 0.509692 3.894432 1.126399 NaN
7 NaN NaN NaN NaN
8 NaN NaN NaN NaN
9 NaN NaN NaN NaN
排序
排序1 - 按值排序 .sort_values
# 排序1 - 按值排序 .sort_values
# 同样适用于Series
df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
columns = ['a','b','c','d'])
print(df1)
print(df1.sort_values(['a'], ascending = True)) # 升序
print(df1.sort_values(['a'], ascending = False)) # 降序
print('------')
# ascending参数:设置升序降序,默认升序
# 单列排序
df2 = pd.DataFrame({'a':[1,1,1,1,2,2,2,2],
'b':list(range(8)),
'c':list(range(8,0,-1))})
print(df2)
print(df2.sort_values(['a','c']))
# 多列排序,按列顺序排序 先按照a列排序,然后在a列的基础上分别对c列进行排序
a b c d
0 41.642377 50.688196 45.361126 76.524721
1 40.661101 60.343825 79.966625 61.835813
2 12.831094 54.553397 46.191899 11.115860
3 52.551135 87.637384 69.010097 43.259785
a b c d
2 12.831094 54.553397 46.191899 11.115860
1 40.661101 60.343825 79.966625 61.835813
0 41.642377 50.688196 45.361126 76.524721
3 52.551135 87.637384 69.010097 43.259785
a b c d
3 52.551135 87.637384 69.010097 43.259785
0 41.642377 50.688196 45.361126 76.524721
1 40.661101 60.343825 79.966625 61.835813
2 12.831094 54.553397 46.191899 11.115860
------
a b c
0 1 0 8
1 1 1 7
2 1 2 6
3 1 3 5
4 2 4 4
5 2 5 3
6 2 6 2
7 2 7 1
a b c
3 1 3 5
2 1 2 6
1 1 1 7
0 1 0 8
7 2 7 1
6 2 6 2
5 2 5 3
4 2 4 4
排序2 - 索引排序 .sort_index
# 排序2 - 索引排序 .sort_index
df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
index = [5,4,3,2],
columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
index = ['h','s','x','g'],
columns = ['a','b','c','d'])
print(df1)
print(df1.sort_index())
print(df2)
print(df2.sort_index())
# 按照index排序
# 默认 ascending=True, inplace=False
a b c d
5 77.127624 27.818492 97.237470 57.074214
4 15.119740 33.619821 47.544680 90.845413
3 0.476552 94.617322 87.196046 87.365182
2 21.088018 83.879127 14.659860 43.161499
a b c d
2 21.088018 83.879127 14.659860 43.161499
3 0.476552 94.617322 87.196046 87.365182
4 15.119740 33.619821 47.544680 90.845413
5 77.127624 27.818492 97.237470 57.074214
a b c d
h 24.210562 24.170220 92.366109 11.883535
s 54.510477 92.414721 92.918148 83.001856
x 70.710244 28.175513 53.988156 25.846942
g 48.119542 9.927192 56.889912 82.495444
a b c d
g 48.119542 9.927192 56.889912 82.495444
h 24.210562 24.170220 92.366109 11.883535
s 54.510477 92.414721 92.918148 83.001856
x 70.710244 28.175513 53.988156 25.846942
课堂作业
data = (np.random.rand(9)*100).reshape(3,3)
df = pd.DataFrame(data,index=list('abc'),columns=['v1','v2','v3'])
print(df)
print(df.sort_index())
print(df.sort_index(ascending=False))
print(df.sort_values(['v2']))
print(df.sort_values(['v2'],ascending=False))
v1 v2 v3
a 64.747624 66.416410 29.855549
b 15.363681 79.276846 78.421204
c 14.596782 38.543484 89.849837
v1 v2 v3
a 64.747624 66.416410 29.855549
b 15.363681 79.276846 78.421204
c 14.596782 38.543484 89.849837
v1 v2 v3
c 14.596782 38.543484 89.849837
b 15.363681 79.276846 78.421204
a 64.747624 66.416410 29.855549
v1 v2 v3
c 14.596782 38.543484 89.849837
a 64.747624 66.416410 29.855549
b 15.363681 79.276846 78.421204
v1 v2 v3
b 15.363681 79.276846 78.421204
a 64.747624 66.416410 29.855549
c 14.596782 38.543484 89.849837
data = (np.random.rand(10)*100).reshape(5,2)
df = pd.DataFrame(data,index=list('abcde'),columns=['v1','v2'])
print(df)
df.drop(['e'],inplace=True)
print(df.T)
v1 v2
a 35.339786 74.534705
b 27.846159 11.802441
c 80.493176 37.910645
d 78.444950 23.569637
e 34.169596 83.212744
a b c d
v1 35.339786 27.846159 80.493176 78.444950
v2 74.534705 11.802441 37.910645 23.569637