Pandas数据结构
1 Series
import pandas as pd
# Build a Series from a plain list; no index is given, so the default 0..4 labels are used.
values = [1, 2, 3, 4, 5]
s1 = pd.Series(data=values)
print('s1:{}'.format(s1))
s1:0 1
1 2
2 3
3 4
4 5
dtype: int64
左边的列表示索引,右边的列表示值
# Build a Series whose labels are supplied explicitly through the index argument.
labels = ['第一', '第二', '第三', '第四', '第五']
s2 = pd.Series(data=[1, 2, 3, 4, 5], index=labels)
print('s2:{}'.format(s2))
s2:第一 1
第二 2
第三 3
第四 4
第五 5
dtype: int64
# Series indexing and slicing: show the labels and the underlying values of s2.
for template, attribute in (('s2索引:{}', s2.index), ('s2值:{}', s2.values)):
    print(template.format(attribute))
s2索引:Index(['第一', '第二', '第三', '第四', '第五'], dtype='object')
s2值:[1 2 3 4 5]
# Read the value stored under the label '第二', overwrite it with 10, then read it again.
key = '第二'
template = 's2中第二对应的值:{}'
print(template.format(s2[key]))
s2[key] = 10
print(template.format(s2[key]))
s2中第二对应的值:2
s2中第二对应的值:10
# A list of labels selects exactly those entries.
picked = s2[['第二', '第四', '第五']]
print('s2中 第二第四第五 对应的数值: {}'.format(picked))
# A label slice returns the run from the first label through the last label.
sliced = s2['第二':'第五']
print('s2中 第二到第五 对应的数值: {}'.format(sliced))
s2中 第二第四第五 对应的数值: 第二 10
第四 4
第五 5
dtype: int64
s2中 第二到第五 对应的数值: 第二 10
第三 3
第四 4
第五 5
dtype: int64
# Convert a dict into a Series; the index argument lists the labels to take from it.
ordinal_labels = ['First', 'Second', 'Third', 'Fourth', 'Fifth']
s_dic = dict(zip(ordinal_labels, [1, 2, 3, 4, 5]))
s4 = pd.Series(data=s_dic, index=ordinal_labels)
print('s4: {}'.format(s4))
s4: First 1
Second 2
Third 3
Fourth 4
Fifth 5
dtype: int64
# Check whether 'sixth' is present in s4, and report the negated answer as well.
has_sixth = 'sixth' in s4
print('s4 中含有sixth: {}'.format(has_sixth))
print('s4 中不含有sixth: {}'.format(not has_sixth))
s4 中含有sixth: False
s4 中不含有sixth: True
# Missing values: 'Tenth' is requested in the index but has no entry in the dict.
s_dic = {'First': 1, 'Second': 2, 'Third': 3, 'Fourth': 4, 'Fifth': 5}
wanted_labels = ['First', 'Second', 'Third', 'Fourth', 'Tenth']
s5 = pd.Series(data=s_dic, index=wanted_labels)
print('s5: {}'.format(s5))
s5: First 1.0
Second 2.0
Third 3.0
Fourth 4.0
Tenth NaN
dtype: float64
# Boolean masks over s5: True where a value is missing, and the complementary mask.
missing_mask = s5.isnull()
present_mask = s5.notnull()
print('数据缺失:{}'.format(missing_mask))
print('数据不缺失:{}'.format(present_mask))
数据缺失:First False
Second False
Third False
Fourth False
Tenth True
dtype: bool
数据不缺失:First True
Second True
Third True
Fourth True
Tenth False
dtype: bool
# Series arithmetic: add s4 and s5 and print the resulting Series.
label_wise_sum = s4 + s5
print('s4+s5:{}'.format(label_wise_sum))
s4+s5:Fifth NaN
First 2.0
Fourth 8.0
Second 4.0
Tenth NaN
Third 6.0
dtype: float64
2 DataFrame
# Create a DataFrame from a dict of equal-length lists; each key becomes a column.
colors = ['red', 'yellow', 'blue', 'purple', 'pink']
sizes = ['medium', 'small', 'big', 'medium', 'small']
tastes = ['sweet', 'sour', 'salty', 'sweet', 'spicy']
df_dic = {'color': colors, 'size': sizes, 'taste': tastes}
df = pd.DataFrame(data=df_dic)
print('df: {}'.format(df))
df: color size taste
0 red medium sweet
1 yellow small sour
2 blue big salty
3 purple medium sweet
4 pink small spicy
# Pass columns= to control the order in which the dict's keys appear as columns.
df_dic = dict(
    color=['red', 'yellow', 'blue', 'purple', 'pink'],
    size=['medium', 'small', 'big', 'medium', 'small'],
    taste=['sweet', 'sour', 'salty', 'sweet', 'spicy'],
)
column_order = ['taste', 'color', 'size']
df1 = pd.DataFrame(df_dic, columns=column_order)
print('df1: {}'.format(df1))
df1: taste color size
0 sweet red medium
1 sour yellow small
2 salty blue big
3 sweet purple medium
4 spicy pink small
# Request a column ('category') that has no matching key in the dict.
df_dic = dict(
    color=['red', 'yellow', 'blue', 'purple', 'pink'],
    size=['medium', 'small', 'big', 'medium', 'small'],
    taste=['sweet', 'sour', 'salty', 'sweet', 'spicy'],
)
requested_columns = ['taste', 'color', 'size', 'category']
df2 = pd.DataFrame(df_dic, columns=requested_columns)
print('df2: {}'.format(df2))
df2: taste color size category
0 sweet red medium NaN
1 sour yellow small NaN
2 salty blue big NaN
3 sweet purple medium NaN
4 spicy pink small NaN
传入的columns中含有与原字典数据key值不匹配的列名称时,该列的所有值都会被记作NaN。
# Name the row axis 'sample' and the column axis 'feature'; both names show up in the printed header.
df2.index.name, df2.columns.name = 'sample', 'feature'
print('df2:{}'.format(df2))
df2:feature taste color size category
sample
0 sweet red medium NaN
1 sour yellow small NaN
2 salty blue big NaN
3 sweet purple medium NaN
4 spicy pink small NaN
# Fetch all of the DataFrame's data via its .values attribute and print it.
raw_values = df2.values
print('df2的values值为:{}'.format(raw_values))
df2的values值为:[['sweet' 'red' 'medium' nan]
['sour' 'yellow' 'small' nan]
['salty' 'blue' 'big' nan]
['sweet' 'purple' 'medium' nan]
['spicy' 'pink' 'small' nan]]
# Column selection: bracket access and attribute access both yield the 'color' column.
by_key = df2['color']
by_attribute = df2.color
print('df2中的color列:{}'.format(by_key))
print('df2中的color列:{}'.format(by_attribute))
df2中的color列:sample
0 red
1 yellow
2 blue
3 purple
4 pink
Name: color, dtype: object
df2中的color列:sample
0 red
1 yellow
2 blue
3 purple
4 pink
Name: color, dtype: object
# Row selection by index label.
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0, so it raises
# AttributeError on current versions; `.loc` is the label-based replacement.
print('df2中的行序号为3:{}'.format(df2.loc[3]))
df2中的行序号为3:feature
taste sweet
color purple
size medium
category NaN
Name: 3, dtype: object
import numpy as np
# Fill the 'category' column by assigning a NumPy array holding 0..4.
category_codes = np.arange(5)
df2['category'] = category_codes
print('df2:{}'.format(df2))
df2:feature taste color size category
sample
0 sweet red medium 0
1 sour yellow small 1
2 salty blue big 2
3 sweet purple medium 3
4 spicy pink small 4
# Assign a Series that only covers labels 0, 1 and 2; the remaining rows come out as NaN.
partial_categories = pd.Series(data=[2, 3, 4], index=[0, 1, 2])
df2['category'] = partial_categories
print('df2:{}'.format(df2))
df2:feature taste color size category
sample
0 sweet red medium 2.0
1 sour yellow small 3.0
2 salty blue big 4.0
3 sweet purple medium NaN
4 spicy pink small NaN
# Assigning to a column name that does not exist yet creates a new column.
countries = pd.Series(['China', 'UK', 'USA', 'Australia', 'Japan'])
df2['country'] = countries
print('df2: {}'.format(df2))
df2: feature taste color size category country
sample
0 sweet red medium 2.0 China
1 sour yellow small 3.0 UK
2 salty blue big 4.0 USA
3 sweet purple medium NaN Australia
4 spicy pink small NaN Japan
# Boolean filtering: keep only the rows whose 'category' value is at most 3.
low_category = df2['category'] <= 3
print('df2中category小于等于3的样本数据:{}'.format(df2[low_category]))
df2中category小于等于3的样本数据:feature taste color size category country
sample
0 sweet red medium 2.0 China
1 sour yellow small 3.0 UK
3 DataFrame数学统计运算
# Build a 4x4 integer DataFrame with row labels a..d and named columns.
rows = [
    [3, 2, 3, 1],
    [2, 5, 3, 6],
    [3, 4, 5, 2],
    [9, 5, 3, 1],
]
df3 = pd.DataFrame(
    data=rows,
    index=list('abcd'),
    columns=['one', 'two', 'three', 'four'],
)
print('df3: {}'.format(df3))
df3: one two three four
a 3 2 3 1
b 2 5 3 6
c 3 4 5 2
d 9 5 3 1
# Column totals (default axis) followed by row totals (axis=1).
column_totals = df3.sum()
row_totals = df3.sum(axis=1)
print('df3.sum 按列求和: {}'.format(column_totals))
print('df3.sum 按行求和: {}'.format(row_totals))
df3.sum 按列求和: one 17
two 16
three 14
four 10
dtype: int64
df3.sum 按行求和: a 9
b 16
c 14
d 18
dtype: int64
# Running totals: down each column (default axis), then across each row (axis=1).
running_down = df3.cumsum()
running_across = df3.cumsum(axis=1)
print('df3.sum 从上到下累计求和: {}'.format(running_down))
print('df3.sum 从左到右累计求和: {}'.format(running_across))
df3.sum 从上到下累计求和: one two three four
a 3 2 3 1
b 5 7 6 7
c 8 11 11 9
d 17 16 14 10
df3.sum 从左到右累计求和: one two three four
a 3 5 8 9
b 2 7 10 16
c 3 7 12 14
d 9 14 17 18
4 数据处理
# Locate missing values: build a DataFrame with one NaN in each of the first three rows.
nan = np.nan
df4 = pd.DataFrame(
    data=[[3, nan, 3, 1], [2, 5, nan, 6], [3, 4, 5, nan], [5, 3, 1, 3]],
    index=list('abcd'),
    columns=['one', 'two', 'three', 'four'],
)
# Print, for every cell, whether it holds a missing value.
missing_flags = df4.isnull()
print(missing_flags)
one two three four
a False True False False
b False False True False
c False False False True
d False False False False
# Print the rows that contain at least one missing value.
rows_with_nan = df4.isnull().any(axis=1)
print(df4[rows_with_nan])
one two three four
a 3 NaN 3.0 1.0
b 2 5.0 NaN 6.0
c 3 4.0 5.0 NaN
使用any函数并指定axis=1时按行判断:只要该行中包含True(即存在缺失值),那么该行就返回True。
# Filter missing values out of a Series.
# Create a Series containing one NaN.
arr = pd.Series(data=[1, 2, 3, np.nan, 5, 6])
print('arr: {}'.format(arr))
# dropna() yields the filtered result; printing arr afterwards shows it is unchanged.
without_nan = arr.dropna()
print('过滤缺失值:{}'.format(without_nan))
print('过滤缺失值之后的arr: {}'.format(arr))
arr: 0 1.0
1 2.0
2 3.0
3 NaN
4 5.0
5 6.0
dtype: float64
过滤缺失值:0 1.0
1 2.0
2 3.0
4 5.0
5 6.0
dtype: float64
过滤缺失值之后的arr: 0 1.0
1 2.0
2 3.0
3 NaN
4 5.0
5 6.0
dtype: float64
dropna函数返回的是一个执行了删除操作之后的新Series数组,删除操作不改变原数据
# Two ways to make the removal stick: rebind the name to the result, or drop in place.
arr = arr.dropna()
arr.dropna(inplace=True)  # arr was already filtered by the line above
# Compute df4 with every row containing a missing value removed.
complete_rows = df4.dropna()
print('过滤缺失值之后的arr: {}'.format(arr))
# Print the filtered df4.
print(complete_rows)
过滤缺失值之后的arr: 0 1.0
1 2.0
2 3.0
4 5.0
5 6.0
dtype: float64
one two three four
d 5 3.0 1.0 3.0
对DataFrame使用dropna函数的方法与过滤Series数组类似,默认删除所有含有缺失值NaN的行,并返回一个新的DataFrame。
# Add a column that is entirely missing. `np.nan` is the canonical spelling;
# the `np.NAN` alias was removed in NumPy 2.0 and raises AttributeError there.
df4['fifth'] = np.nan
# Print df4 with the new all-NaN column.
print('df4: {}'.format(df4))
# Drop every column (axis=1) whose values are all NaN (how='all').
# With inplace=True, dropna modifies df4 itself and returns None, so this line prints None.
print('过滤缺失值之后: {}'.format(df4.dropna(how = 'all',axis=1,inplace=True)))
df4: one two three four fifth
a 3 NaN 3.0 1.0 NaN
b 2 5.0 NaN 6.0 NaN
c 3 4.0 5.0 NaN NaN
d 5 3.0 1.0 3.0 NaN
过滤缺失值之后: None
dropna函数中传入how='all'可以删除全为缺失值NaN的行或者列(axis=1表示按列删除)。注意:设置inplace=True时会直接修改原数据并返回None,因此上面的print输出为None。
# Print df4 as it currently stands.
print(df4)
# Print df4 with every missing value replaced by 0.
zero_filled = df4.fillna(value=0)
print(zero_filled)
one two three four
a 3 NaN 3.0 1.0
b 2 5.0 NaN 6.0
c 3 4.0 5.0 NaN
d 5 3.0 1.0 3.0
one two three four
a 3 0.0 3.0 1.0
b 2 5.0 0.0 6.0
c 3 4.0 5.0 0.0
d 5 3.0 1.0 3.0
# Print df4 with missing values replaced by the result of df4.median().
column_medians = df4.median()
print(df4.fillna(column_medians))
one two three four
a 3 4.0 3.0 1.0
b 2 5.0 3.0 6.0
c 3 4.0 5.0 3.0
d 5 3.0 1.0 3.0
# Forward fill: a missing cell takes the value from the row above it.
forward_filled = df4.ffill()
# Backward fill: a missing cell takes the value from the row below it.
backward_filled = df4.bfill()
print(forward_filled)
print(backward_filled)
one two three four
a 3 NaN 3.0 1.0
b 2 5.0 3.0 6.0
c 3 4.0 5.0 6.0
d 5 3.0 1.0 3.0
one two three four
a 3 5.0 3.0 1.0
b 2 5.0 5.0 6.0
c 3 4.0 5.0 3.0
d 5 3.0 1.0 3.0
# Build a DataFrame in which row 'e' repeats row 'c'.
df4 = pd.DataFrame(
    data=[
        [3, 5, 3, 1],
        [2, 5, 5, 6],
        [3, 4, 5, 3],
        [5, 3, 1, 3],
        [3, 4, 5, 3],
        [3, 4, 6, 8],
    ],
    index=list('abcdef'),
    columns=['one', 'two', 'three', 'four'],
)
# Show the rows that repeat an earlier row in full.
fully_repeated = df4.duplicated()
print(df4[fully_repeated])
# Show the rows whose first two columns repeat an earlier row.
repeated_on_first_two = df4.duplicated(subset=['one', 'two'])
print(df4[repeated_on_first_two])
one two three four
e 3 4 5 3
one two three four
e 3 4 5 3
f 3 4 6 8
对DataFrame使用duplicated函数,返回一个布尔型Series,用于标记每一行是否为重复行;默认第一次出现的行不被标记为重复。将其作为布尔索引即可筛选出重复行。subset参数用于指定判断重复时所依据的列标签或列标签序列,默认为所有的列标签,因此可以只根据部分列筛选重复行。
# Drop rows that repeat an earlier row on columns 'one' and 'two', keeping the first occurrence.
key_columns = ['one', 'two']
deduplicated = df4.drop_duplicates(subset=key_columns, keep='first')
print(deduplicated)
one two three four
a 3 5 3 1
b 2 5 5 6
c 3 4 5 3
d 5 3 1 3
keep参数值设置为'first'表示在去除重复值的过程中保留第一次出现的重复值。keep参数还有另外两个取值,分别为'last'和False(布尔值),表示保留最后一次出现的重复值和去除所有的重复值行。
5 数据记录合并与分组
# Append the rows of df5 below the rows of df4.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so pd.concat is used to produce the same stacked result.
df5 = pd.DataFrame([[3,3,2,4],[5,4,3,3]],
index=['g','h'],columns=['one','two','three','four'])
# Print the combined DataFrame.
print(pd.concat([df4, df5]))
one two three four
a 3 5 3 1
b 2 5 5 6
c 3 4 5 3
d 5 3 1 3
e 3 4 5 3
f 3 4 6 8
g 3 3 2 4
h 5 4 3 3
# Combine records with concat.
frames = [df4, df5]
# Stack df4 and df5 vertically (default axis).
stacked = pd.concat(frames)
print(stacked)
# Join df4 and df5 side by side (axis=1).
side_by_side = pd.concat(frames, axis=1)
print(side_by_side)
one two three four
a 3 5 3 1
b 2 5 5 6
c 3 4 5 3
d 5 3 1 3
e 3 4 5 3
f 3 4 6 8
g 3 3 2 4
h 5 4 3 3
one two three four one two three four
a 3.0 5.0 3.0 1.0 NaN NaN NaN NaN
b 2.0 5.0 5.0 6.0 NaN NaN NaN NaN
c 3.0 4.0 5.0 3.0 NaN NaN NaN NaN
d 5.0 3.0 1.0 3.0 NaN NaN NaN NaN
e 3.0 4.0 5.0 3.0 NaN NaN NaN NaN
f 3.0 4.0 6.0 8.0 NaN NaN NaN NaN
g NaN NaN NaN NaN 3.0 3.0 2.0 4.0
h NaN NaN NaN NaN 5.0 4.0 3.0 3.0
concat函数上下连接时相同列索引的数据合并,左右连接时相同行索引的数据合并
# Prepare two DataFrames that share a 'category' column, to be combined with merge.
df_dic = dict(
    color=['red', 'yellow', 'blue', 'purple', 'pink'],
    size=['medium', 'small', 'big', 'medium', 'small'],
    taste=['sweet', 'sour', 'salty', 'sweet', 'spicy'],
    category=[2, 3, 4, 5, 6],
)
df6 = pd.DataFrame(data=df_dic, columns=['taste', 'color', 'size', 'category'])
print('df6:{}'.format(df6))
df_dic1 = dict(
    country=['China', 'UK', 'USA', 'Australia', 'Japan'],
    quality=['good', 'normal', 'excellent', 'good', 'bad'],
    category=[2, 3, 5, 6, 7],
)
df7 = pd.DataFrame(data=df_dic1, columns=['country', 'quality', 'category'])
print('df7:{}'.format(df7))
df6: taste color size category
0 sweet red medium 2
1 sour yellow small 3
2 salty blue big 4
3 sweet purple medium 5
4 spicy pink small 6
df7: country quality category
0 China good 2
1 UK normal 3
2 USA excellent 5
3 Australia good 6
4 Japan bad 7
# Left-join df7 onto df6 using 'category' as the key on both sides, then print the result.
merged = df6.merge(df7, how='left', left_on='category', right_on='category')
print(merged)
taste color size category country quality
0 sweet red medium 2 China good
1 sour yellow small 3 UK normal
2 salty blue big 4 NaN NaN
3 sweet purple medium 5 USA excellent
4 spicy pink small 6 Australia good
category作为一个主键分别对df6和df7进行匹配,并将两个数据集合并。
6 文件操作
# Write df to a CSV file, including the header row and the index column.
csv_path = 'df.csv'
df.to_csv(csv_path, sep=',', header=True, index=True)
# Read the same file back as UTF-8.
pd.read_csv(csv_path, encoding='utf-8')
| | Unnamed: 0 | color | size | taste |
|---|---|---|---|---|
| 0 | 0 | red | medium | sweet |
| 1 | 1 | yellow | small | sour |
| 2 | 2 | blue | big | salty |
| 3 | 3 | purple | medium | sweet |
| 4 | 4 | pink | small | spicy |
参考书籍:
Python3 快速入门与实战