pandas基础篇
我们打算从以下几个方面进行代码练习:
** 创建Series ** Series基本操作 ** 创建DataFrame ** DataFrame基本操作 ** DataFrame文件操作 ** Series,DataFrame和多索引 ** 透视表 ** 数据清洗 ** 数据预处理 ** 可视化
创建 Series 数据类型:
1)从列表创建 Series
import pandas as pd
print(pd.__version__)
arr=[0,1,2,3,4,5]
s1=pd.Series(arr)
print(s1)
'''
结果:如果不指定索引,默认从0开始
0 0
1 1
2 2
3 3
4 4
5 5
'''
2)从 Ndarray 创建 Series
import numpy as np
n=np.random.rand(5)
index=['a','b','c','d','e']
s2=pd.Series(n,index=index)
print(s2)
3)从字典创建 Series
d={'a':1,'b':2,'c':3,'d':4,'e':5}
s3=pd.Series(d)
print(s3)
Series基本操作
#修改 Series 索引
print(s1)
s1.index=['A','B','C','D','E','F']
print(s1)
#Series 纵向拼接
s4=s3.append(s1)
print(s4)
#Series 按指定索引删除元素
L=['A','B','C','D','E']
s5=pd.Series(L)
print(s5)
s5=s5.drop(3) #删除索引为3的值
print(s5)
#结果
'''
dtype: int64
0 A
1 B
2 C
3 D
4 E
dtype: object
0 A
1 B
2 C
4 E
dtype: object
'''
#Series 修改指定索引元素
L=['A','B','C','D','E']
s5=pd.Series(L)
print(s5)
s5[3]=8
print(s5)
"""
dtype: object
0 A
1 B
2 C
3 D
4 E
dtype: object
0 A
1 B
2 C
3 8
4 E
"""
#Series 按指定索引查找元素
print(s5[4])
'''
结果:E
'''
#Series 切片操作,例如对s5的前 3 个数据访问
print(s5[:3])
'''
0 A
1 B
2 C
dtype: object
'''
#Series 加法运算,Series 的加法运算是按照索引计算,如果索引不同则填充为 NaN(空值)。
L=['A','B','C']
#L=[1,2,3,4,6]
s5=pd.Series(L)
L1=['A','C','D']
#L1=[2,3,4,5,7]
s6=pd.Series(L1)
#print(s5)
#print(s6)
#print(s5.add(s6))
#print(s5.sub(s6))
#print(s5.mul(s6))
#print(s5.div(s6))
'''
0 AA
1 BC
2 CD
dtype: object
0 3
1 5
2 7
3 9
4 13
dtype: int64
'''
#print(s5.median())#求中位数
L=[1,2,3,4,5]
s5=pd.Series(L)
print(s5)
print(s5.sum())#求和
print(s5.max())#求最大值
print(s5.min())#求最小值
'''
0 1
1 2
2 3
3 4
4 5
dtype: int64
15
5
1
'''
创建DataFrame
#创建DataFrame数据类型,与 Sereis 不同,DataFrame 可以存在多列数据。
#通过 NumPy 数组创建 DataFrame
datas=pd.date_range('today',periods=6)
num_arr=np.random.rand(6,4)
columns=['A','B','C','D']
df1=pd.DataFrame(num_arr,index=datas,columns=columns)
print(df1)
'''
A B C D
2018-10-23 10:01:57.288209 0.318049 0.648244 0.824381 0.991212
2018-10-24 10:01:57.288209 0.749709 0.450722 0.421013 0.640053
2018-10-25 10:01:57.288209 0.091959 0.307871 0.833770 0.567152
2018-10-26 10:01:57.288209 0.481489 0.932971 0.085914 0.689568
2018-10-27 10:01:57.288209 0.071565 0.897287 0.062263 0.627351
2018-10-28 10:01:57.288209 0.594281 0.856175 0.110247 0.172222
'''
#通过字典数组创建 DataFrame
data={'animal':['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
'age':[2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
'visits':[1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
'priority':['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df2=pd.DataFrame(data,index=labels)
print(df2)
'''
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
'''
# 查看 DataFrame 的数据类型
print(df2.dtypes)
'''
animal object
age float64
visits int64
priority object
'''
DataFrame基本操作
#预览 DataFrame 的前 5 行数据,此方法对快速了解陌生数据集结构十分有用
print(df2.head(6))#默认为前五行
'''
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
'''
print(df2.tail(3))#查看后三行
'''
animal age visits priority
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
'''
#查看 DataFrame 的索引
print(df2.index)
#Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
#查看 DataFrame 的列名
print(df2.columns) #Index(['animal', 'age', 'visits', 'priority'], dtype='object')
#查看 DataFrame 的数值
print(df2.values)
'''
[['cat' 2.5 1 'yes']
['cat' 3.0 3 'yes']
['snake' 0.5 2 'no']
['dog' nan 3 'yes']
['dog' 5.0 2 'no']
['cat' 2.0 3 'no']
['snake' 4.5 1 'no']
['cat' nan 1 'yes']
['dog' 7.0 2 'no']
['dog' 3.0 1 'no']]
'''
#查看 DataFrame 的统计数据
print(df2.describe())
'''
age visits
count 8.000000 10.000000
mean 3.437500 1.900000
std 2.007797 0.875595
min 0.500000 1.000000
25% 2.375000 1.000000
50% 3.000000 2.000000
75% 4.625000 2.750000
max 7.000000 3.000000
'''
#DataFrame 转置操作
print(df2.T)
'''
a b c d e f g h i j
animal cat cat snake dog dog cat snake cat dog dog
age 2.5 3 0.5 NaN 5 2 4.5 NaN 7 3
visits 1 3 2 3 2 3 1 1 2 1
priority yes yes no yes no no no yes no no
'''
#对 DataFrame 进行按列排序
print(df2.sort_values(by='age')) #按照age升序排列
'''
animal age visits priority
c snake 0.5 2 no
f cat 2.0 3 no
a cat 2.5 1 yes
b cat 3.0 3 yes
j dog 3.0 1 no
g snake 4.5 1 no
e dog 5.0 2 no
i dog 7.0 2 no
d dog NaN 3 yes
h cat NaN 1 yes
'''
#对 DataFrame 数据切片
print(df2[1:3])
'''
animal age visits priority
b cat 3.0 3 yes
c snake 0.5 2 no
'''
# 对 DataFrame 通过标签查询(单列)
print(df2['age'])
'''
a 2.5
b 3.0
c 0.5
d NaN
e 5.0
f 2.0
g 4.5
h NaN
i 7.0
j 3.0
Name: age, dtype: float64
'''
#对 DataFrame 通过标签查询(多列)
print(df2[['age','animal']])
'''
age animal
a 2.5 cat
b 3.0 cat
c 0.5 snake
d NaN dog
e 5.0 dog
f 2.0 cat
g 4.5 snake
h NaN cat
i 7.0 dog
j 3.0 dog
'''
#对 DataFrame 通过位置查询
print(df2.iloc[1:3])
'''
animal age visits priority
b cat 3.0 3 yes
c snake 0.5 2 no
'''
#DataFrame 副本拷贝,生成 DataFrame 副本,方便数据集被多个不同流程使用
df3=df2.copy()
print(df3)
#判断 DataFrame 元素是否为空
print(df3.isnull())
'''
animal age visits priority
a False False False False
b False False False False
c False False False False
d False True False False
e False False False False
f False False False False
g False False False False
h False True False False
i False False False False
j False False False False
'''
#添加列数据
num=pd.Series([0,1,2,3,4,5,6,7,8,9],index=df3.index)
df3['No.']=num
print(df3)
'''
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 3.0 3 yes 1
c snake 0.5 2 no 2
d dog NaN 3 yes 3
e dog 5.0 2 no 4
f cat 2.0 3 no 5
g snake 4.5 1 no 6
h cat NaN 1 yes 7
i dog 7.0 2 no 8
j dog 3.0 1 no 9
'''
#根据 DataFrame 的下标值进行更改。
df3.iat[1,1]=4
print(df3)
'''
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 4.0 3 yes 1
c snake 0.5 2 no 2
d dog NaN 3 yes 3
e dog 5.0 2 no 4
f cat 2.0 3 no 5
g snake 4.5 1 no 6
h cat NaN 1 yes 7
i dog 7.0 2 no 8
j dog 3.0 1 no 9
'''
#根据 DataFrame 的标签对数据进行修改
df3.loc['f','age']=1.5
print(df3)
'''
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 4.0 3 yes 1
c snake 0.5 2 no 2
d dog NaN 3 yes 3
e dog 5.0 2 no 4
f cat 1.5 3 no 5
g snake 4.5 1 no 6
h cat NaN 1 yes 7
i dog 7.0 2 no 8
j dog 3.0 1 no 9
'''
#DataFrame 求平均值操作
print(df3.mean())
'''
age 3.5
visits 1.9
No. 4.5
dtype: float64
'''
#对 DataFrame 中任意列做求和操作
print(df3['visits'].sum()) #结果:19
#-----------------字符串操作----------------------------
# 将字符串转化为小写字母
string=pd.Series(['A', 'B', 'C', 'Aaba', 'Baca',
np.nan, 'CABA', 'dog', 'cat'])
print(string)
print(string.str.lower())
'''
0 A
1 B
2 C
3 Aaba
4 Baca
5 NaN
6 CABA
7 dog
8 cat
dtype: object
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
'''
print(string.str.upper())#转换为大写
#---------------------DataFrame 缺失值操作-------------------
#对缺失值进行填充
df4=df3.copy()
print(df4)
print(df4.fillna(value=3))
'''
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 4.0 3 yes 1
c snake 0.5 2 no 2
d dog NaN 3 yes 3
e dog 5.0 2 no 4
f cat 1.5 3 no 5
g snake 4.5 1 no 6
h cat NaN 1 yes 7
i dog 7.0 2 no 8
j dog 3.0 1 no 9
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 4.0 3 yes 1
c snake 0.5 2 no 2
d dog 3.0 3 yes 3
e dog 5.0 2 no 4
f cat 1.5 3 no 5
g snake 4.5 1 no 6
h cat 3.0 1 yes 7
i dog 7.0 2 no 8
j dog 3.0 1 no 9
'''
#删除存在缺失值的行
df5=df3.copy()
print(df5.dropna(how='any')) ## 任何存在 NaN 的行都将被删除
'''
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 4.0 3 yes 1
c snake 0.5 2 no 2
e dog 5.0 2 no 4
f cat 1.5 3 no 5
g snake 4.5 1 no 6
i dog 7.0 2 no 8
j dog 3.0 1 no 9
'''
#DataFrame 按指定列对齐
left=pd.DataFrame({'key':['foo1','foo2','foo3'],'one':[1,2,3]})
right=pd.DataFrame({'key':['foo2','foo3','foo4'],'one':[4,5,6]})
print(left)
print(right)
print(pd.merge(left,right,on='key'))
'''
key one
0 foo1 1
1 foo2 2
2 foo3 3
key one
0 foo2 4
1 foo3 5
2 foo4 6
key one_x one_y
0 foo2 2 4
1 foo3 3 5
'''
DataFrame 文件操作
#CSV文件写入
print(df3.to_csv('animal.csv'))
print("写入成功")
#CSV 文件读取
df_animal=pd.read_csv('animal.csv')
print(df_animal)
'''
写入成功
Unnamed: 0 animal age visits priority No.
0 a cat 2.5 1 yes 0
1 b cat 4.0 3 yes 1
2 c snake 0.5 2 no 2
3 d dog NaN 3 yes 3
4 e dog 5.0 2 no 4
5 f cat 1.5 3 no 5
6 g snake 4.5 1 no 6
7 h cat NaN 1 yes 7
8 i dog 7.0 2 no 8
9 j dog 3.0 1 no 9
'''
# Excel 写入操作
df3.to_excel('animal.xlsx', sheet_name='Sheet1')
print("写入成功.")
#Excel 读取操作
print(pd.read_excel('animal.xlsx', 'Sheet1', index_col=None, na_values=['NA']))
pandas进阶篇
时间序列索引
dti=pd.date_range(start='2018-01-01',end='2018-12-31',freq='D')
s=pd.Series(np.random.rand(len(dti)),index=dti)
print(s)
'''
2018-01-01 0.819138
2018-01-02 0.698994
2018-01-03 0.535303
2018-01-04 0.201711
2018-01-05 0.614942
2018-01-06 0.046608
2018-01-07 0.236259
2018-01-08 0.575685
2018-01-09 0.647902
2018-01-10 0.154899
2018-01-11 0.104855
2018-01-12 0.658812
2018-01-13 0.840483
2018-01-14 0.404469
2018-01-15 0.919582
2018-01-16 0.000066
2018-01-17 0.267015
2018-01-18 0.059417
2018-01-19 0.115044
2018-01-20 0.919654
2018-01-21 0.536861
2018-01-22 0.514372
2018-01-23 0.167115
2018-01-24 0.812080
2018-01-25 0.689845
2018-01-26 0.651800
2018-01-27 0.523809
2018-01-28 0.484807
2018-01-29 0.110739
2018-01-30 0.401936
...
2018-12-02 0.254287
2018-12-03 0.038438
2018-12-04 0.868603
2018-12-05 0.320990
2018-12-06 0.630131
2018-12-07 0.012450
2018-12-08 0.538951
2018-12-09 0.303097
2018-12-10 0.330134
2018-12-11 0.109834
2018-12-12 0.828355
2018-12-13 0.079942
2018-12-14 0.596775
2018-12-15 0.936687
2018-12-16 0.944666
2018-12-17 0.510053
2018-12-18 0.506758
2018-12-19 0.019668
2018-12-20 0.260052
2018-12-21 0.813267
2018-12-22 0.581939
2018-12-23 0.058081
2018-12-24 0.268361
2018-12-25 0.992682
2018-12-26 0.167770
2018-12-27 0.253564
2018-12-28 0.845280
2018-12-29 0.744601
2018-12-30 0.262743
2018-12-31 0.415526
Freq: D, Length: 365, dtype: float64
'''
#统计s中每一个周三对应值的和
print(s[s.index.weekday==2].sum()) #23.771697397833957
#统计s中每个月值的平均值
print(s.resample('M').mean())
'''
2018-01-31 0.559565
2018-02-28 0.515923
2018-03-31 0.599616
2018-04-30 0.465701
2018-05-31 0.613371
2018-06-30 0.477651
2018-07-31 0.510719
2018-08-31 0.495345
2018-09-30 0.583058
2018-10-31 0.584103
2018-11-30 0.599802
2018-12-31 0.562433
Freq: M, dtype: float64
'''
#UTC世界时间标准
s=pd.date_range('today',periods=1,freq='D')
ts=pd.Series(np.random.rand(len(s)),s)
ts_utc=ts.tz_localize('UTC')#转换为UTC时间
print(ts_utc)
'''
2018-10-23 14:24:42.755209+00:00 0.088392
Freq: D, dtype: float64
'''
#转换为上海所在的时区
ts_shanghai=ts_utc.tz_convert('Asia/Shanghai')
print(ts_shanghai)
'''
2018-10-23 22:28:33.716209+08:00 0.237161
Freq: D, dtype: float64
'''
#不同时间表示方式的转换
rng=pd.date_range('1/1/2018',periods=5,freq='M')
ts=pd.Series(np.random.rand(len(rng)),index=rng)
print(ts)
ps=ts.to_period()
print(ps)
print(ps.to_timestamp)
'''
2018-01-31 0.794304
2018-02-28 0.165253
2018-03-31 0.231160
2018-04-30 0.296905
2018-05-31 0.847177
Freq: M, dtype: float64
2018-01 0.794304
2018-02 0.165253
2018-03 0.231160
2018-04 0.296905
2018-05 0.847177
Freq: M, dtype: float64
<bound method Series.to_timestamp of 2018-01 0.794304
2018-02 0.165253
2018-03 0.231160
2018-04 0.296905
2018-05 0.847177
Freq: M, dtype: float64>
'''
创建多重索引
#构建一个letters=['A','B','C']和numbers=liat(range(10))为索引,值为随机数的多重索引Series
letters=['A','B','C']
numbers=list(range(10))
mi=pd.MultiIndex.from_product([letters,numbers])
s=pd.Series(np.random.rand(30),index=mi)
print(s)
'''
A 0 0.293706
1 0.887773
2 0.856729
3 0.435714
4 0.997529
5 0.732427
6 0.166934
7 0.717487
8 0.389486
9 0.679912
B 0 0.149687
1 0.805469
2 0.238372
3 0.908750
4 0.025010
5 0.109468
6 0.909549
7 0.132050
8 0.084095
9 0.186603
C 0 0.536121
1 0.191505
2 0.287627
3 0.729340
4 0.089618
5 0.265803
6 0.027589
7 0.056560
8 0.174068
9 0.021970
dtype: float64
'''
#多重索引Series查询
#查询索引为1,3,6的值
print(s.loc[:,[1,3,6]])
'''
A 1 0.705519
3 0.733832
6 0.311256
B 1 0.168374
3 0.204631
6 0.453191
C 1 0.935524
3 0.306979
6 0.597759
dtype: float64
'''
#多重索引切片
print(s.loc[pd.IndexSlice[:'B',5:]])
"""
A 5 0.745123
6 0.316538
7 0.647127
8 0.152854
9 0.726968
B 5 0.469216
6 0.586324
7 0.185740
8 0.704940
9 0.540358
dtype: float64
"""
#根据多重索引创建 DataFrame
#创建一个以 letters = ['A', 'B'] 和 numbers = list(range(6))为索引,值为随机数据的多重索引 DataFrame。
frame=pd.DataFrame(np.arange(12).reshape(6,2),index=[list('AAABBB'),list('123123')],columns=['hello','shiyanlou'])
print(frame)
'''
hello shiyanlou
A 1 0 1
2 2 3
3 4 5
B 1 6 7
2 8 9
3 10 11
'''
frame.index.names=['first','second']
print(frame)
'''
hello shiyanlou
first second
A 1 0 1
2 2 3
3 4 5
B 1 6 7
2 8 9
3 10 11
'''
print(frame.groupby('first').sum())
'''
hello shiyanlou
first
A 6 9
B 24 27
'''
print(frame.groupby('second').sum())
'''
hello shiyanlou
second
1 6 8
2 10 12
3 14 16
'''
# DataFrame 行列名称转换
print(frame)
print(frame.stack())
'''
first second
A 1 hello 0
shiyanlou 1
2 hello 2
shiyanlou 3
3 hello 4
shiyanlou 5
B 1 hello 6
shiyanlou 7
2 hello 8
shiyanlou 9
3 hello 10
shiyanlou 11
dtype: int32
'''
#DataFrame 索引转换
print(frame.unstack())
'''
hello shiyanlou
second 1 2 3 1 2 3
first
A 0 2 4 1 3 5
B 6 8 10 7 9 11
'''
#DataFrame 条件查找
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df=pd.DataFrame(data,index=labels)
print(df)
'''
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
'''
#查找 age 大于 3 的全部信息
print(df[df['age']>3])
'''
animal age visits priority
e dog 5.0 2 no
g snake 4.5 1 no
i dog 7.0 2 no
'''
#根据行列索引切片
print(df.iloc[2:4,1:3])
'''
age visits
c 0.5 2
d NaN 3
'''
#查找 age<3 且为 cat 的全部数据。
print(df[(df['animal']=='cat')&(df['age']<3)])
'''
animal age visits priority
a cat 2.5 1 yes
f cat 2.0 3 no
'''
print(df[df['animal'].isin(['cat','dog'])])
'''
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
'''
#DataFrame 按标签及列名查询
print(df.loc[df.index[[3,4,8]],['animal','age']])
'''
animal age
d dog NaN
e dog 5.0
i dog 7.0
'''
#按照 age 降序,visits 升序排列
print(df.sort_values(by=['age','visits'],ascending=[False,True]))
#将 priority 列的 yes 值替换为 True,no 值替换为 False。
print(df['priority'].map({'yes':True,'no':False}))
'''
a True
b True
c False
d True
e False
f False
g False
h True
i False
j False
'''
print(df.groupby('animal').sum())
'''
animal
cat 7.5 8
dog 15.0 8
snake 5.0 3
'''
#使用列表拼接多个 DataFrame
temp_df1=pd.DataFrame(np.random.randn(5,4))
temp_df2=pd.DataFrame(np.random.randn(5,4))
temp_df3=pd.DataFrame(np.random.randn(5,4))
print(temp_df1)
print(temp_df2)
print(temp_df3)
pieces=[temp_df1,temp_df2,temp_df3]
print(pd.concat(pieces))
#找出 DataFrame 表中和最小的列
df=pd.DataFrame(np.random.random(size=(5,10)),columns=list('abcdefghij'))
print(df)
print(df.sum().idxmin()) #[5 rows x 10 columns] b
#DataFrame 中每个元素减去每一行的平均值
df = pd.DataFrame(np.random.random(size=(5, 3)))
print(df)
print(df.sub(df.mean(axis=1), axis=0))