import numpy as np
import pandas as pd
s = pd.Series([1,3,6, np.nan,88,3])# series 系列print(s)
dates = pd.date_range('20160101', periods=6)# periods 周期,时期print(dates)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=['a','b','c','d'])print(df)'''
a b c d
2016-01-01 -0.558977 -0.741284 0.562322 1.212522
2016-01-02 0.236301 0.069895 -0.945983 1.328624
2016-01-03 -1.319606 -1.071127 1.013776 0.070183
2016-01-04 -0.725431 0.190953 1.538477 -0.212148
2016-01-05 -1.028601 0.302218 -0.857509 -0.630979
2016-01-06 0.309582 0.318177 0.787511 0.244988
'''print("如果index=, columns= 都是默认")
df1 = pd.DataFrame(np.random.randn(12).reshape(3,4))print(df1)'''
0 1 2 3
0 0.066480 1.976401 0.151181 1.668614
1 -0.491919 0.815604 1.305159 -0.263863
2 -0.258095 1.345336 1.333973 -1.313072
'''print("定义DataFrame的第二种方式")
df2 = pd.DataFrame({"A":1,"B": pd.Timestamp('20130102'),"C": pd.Series(1, index=list(range(4)), dtype='float32'),"D": np.array([3]*4, dtype='int32'),"E": pd.Categorical(['test','train','test','train']),"F":'foo'})print(df2)print(df2.dtypes)''' A B C D E F
0 1 2013-01-02 1.0 3 test foo
1 1 2013-01-02 1.0 3 train foo
2 1 2013-01-02 1.0 3 test foo
3 1 2013-01-02 1.0 3 train foo
A int64
B datetime64[ns]
C float32
D int32
E category
F object
'''print("所有行的序号")print(df2.index)print(df2.columns)'''
Int64Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
'''print("获取值信息")print(df2.values)'''
[[1 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
[1 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
[1 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
[1 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
'''print("describe(), 可以获得以下信息")print(df2.describe())''' A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0'''print("转置, transpose(), 也可以用df2.T")print(df2.transpose())''' 0 ... 3
A 1 ... 1
B 2013-01-02 00:00:00 ... 2013-01-02 00:00:00
C 1.0 ... 1.0
D 3 ... 3
E test ... train
F foo ... foo
[6 rows x 4 columns]'''print("排序, axis=0是对行排序,axis=1是对列进行排序")print(df2.sort_index(axis=1, ascending=False))''' F E D C B A
0 foo test 3 1.0 2013-01-02 1
1 foo train 3 1.0 2013-01-02 1
2 foo test 3 1.0 2013-01-02 1
3 foo train 3 1.0 2013-01-02 1
'''print("用值进行排序")print(df2.sort_values(by='E'))''' A B C D E F
0 1 2013-01-02 1.0 3 test foo
2 1 2013-01-02 1.0 3 test foo
1 1 2013-01-02 1.0 3 train foo
3 1 2013-01-02 1.0 3 train foo
'''
选择数据,数据筛选
import numpy as np
import pandas as pd
print("创建一个pandas日期列表")
dates = pd.date_range('20130101', periods=6)print(dates)'''DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')'''print("创建一个DataFrame")
df = pd.DataFrame(np.arange(24).reshape(6,4), index=dates, columns=['A','B','C','D'])print(df)'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
'''print("获取列")print(df['A'], df.A)'''
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32'''print("按照切片来取")print(df[:3],'\n', df['20130102':'20130104'])'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
A B C D
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
'''print("更具体一点的选择,用标签: select by label: loc")print(df.loc['20130102'])'''
A 4
B 5
C 6
D 7
'''print("选择所有行, 指定列的数据")print(df.loc[:,['A','C']])'''
A C
2013-01-01 0 2
2013-01-02 4 6
2013-01-03 8 10
2013-01-04 12 14
2013-01-05 16 18
2013-01-06 20 22'''print("也可以这样")print(df.loc['20130102',['A','C']])'''
A 4
C 6'''print("按照位置来选择: select by position: iloc")print(df.iloc[3:5,1:3])'''
B C
2013-01-04 13 14
2013-01-05 17 18
'''print("也可以这样")print(df.iloc[[1,3,5],1:3])'''
B C
2013-01-02 5 6
2013-01-04 13 14
2013-01-06 21 22'''# print("标签和位置混合筛选: mixed selection: ix")# print(df.ix[:3, ['A', 'C']])print("Boolean indexing方法")print(df)print(df[df.A >8])'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
A B C D
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
'''
修改数据值
import numpy as np
import pandas as pd
print("创建一个pandas日期列表")
dates = pd.date_range('20130101', periods=6)print(dates)'''DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')'''print("创建一个DataFrame")
df = pd.DataFrame(np.arange(24).reshape(6,4), index=dates, columns=['A','B','C','D'])print(df)'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
'''print("用位置iloc修改值")
df.iloc[2,2]=1111print(df)print("第二种方法")
df.loc['20130102','B']=2222print(df)'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 2222 6 7
2013-01-03 8 9 1111 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
Process finished with exit code 0
'''print("第三种方法, ")# df[df.A > 4] = 0# print(df)'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 2222 6 7
2013-01-03 0 0 0 0
2013-01-04 0 0 0 0
2013-01-05 0 0 0 0
2013-01-06 0 0 0 0'''
df.A[df.A >4]=0print(df)'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 2222 6 7
2013-01-03 0 9 1111 11
2013-01-04 0 13 14 15
2013-01-05 0 17 18 19
2013-01-06 0 21 22 23'''print("加一个空列")
df['F']= np.nan
print(df)'''
A B C D F
2013-01-01 0 1 2 3 NaN
2013-01-02 4 2222 6 7 NaN
2013-01-03 0 9 1111 11 NaN
2013-01-04 0 13 14 15 NaN
2013-01-05 0 17 18 19 NaN
2013-01-06 0 21 22 23 NaN'''print("加入一个有值的列")
df['E']= pd.Series(range(6), index=dates)print(df)'''
A B C D F E
2013-01-01 0 1 2 3 NaN 0
2013-01-02 4 2222 6 7 NaN 1
2013-01-03 0 9 1111 11 NaN 2
2013-01-04 0 13 14 15 NaN 3
2013-01-05 0 17 18 19 NaN 4
2013-01-06 0 21 22 23 NaN 5'''
处理丢失的数据
import numpy as np
import pandas as pd
print("创建一个pandas日期列表")
dates = pd.date_range('20130101', periods=6)print(dates)'''DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')'''print("创建一个DataFrame")
df = pd.DataFrame(np.arange(24).reshape(6,4), index=dates, columns=['A','B','C','D'])print(df)'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
'''
df.iloc[2,3]= np.nan
df.iloc[1,2]= np.nan
print(df)'''
A B C D
2013-01-01 0 1 2.0 3.0
2013-01-02 4 5 NaN 7.0
2013-01-03 8 9 10.0 NaN
2013-01-04 12 13 14.0 15.0
2013-01-05 16 17 18.0 19.0
2013-01-06 20 21 22.0 23.0'''print('axis=0 是按行丢掉数据, how=any是只要有nan的就都丢掉 ')print('axis=1是丢掉列','how=all是只有全是nan的时候才丢掉')print(df.dropna(axis=0, how='any'))'''
A B C D
2013-01-01 0 1 2.0 3.0
2013-01-04 12 13 14.0 15.0
2013-01-05 16 17 18.0 19.0
2013-01-06 20 21 22.0 23.0
'''print(df.dropna(axis=1, how='all'))# 不变print('填充数据follna, value=0')print(df.fillna(value=0))print('判断数据是否有缺失用df.isnull(),')print(df.isnull())print(np.any(df.isnull())==True)
数据的读取与写入
import numpy as np
import pandas as pd
print("读取数据")
data = pd.read_csv('')# csv, excel, hdf, sql, json, msgpack, html, gbq, stata, sas, clipboard, pickle等格式
data.to_pickle('')
连接数据1,concat append
import numpy as np
import pandas as pd
# concatenating 连接 合并print('创建一个3行4列用0填充的Frame')
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])print('上下合并, 0就是竖向合并, 1是横向')
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)# ignore 忽视print(res)
df4 = pd.DataFrame(np.ones((3,4))*4, columns=['b','c','d','e'], index=[2,3,4])
res1 = pd.concat([df1, df4], axis=0, ignore_index=True, join='outer')# 会用nan填充 outer, innerprint(res1)'''
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 NaN 4.0 4.0 4.0 4.0
4 NaN 4.0 4.0 4.0 4.0
5 NaN 4.0 4.0 4.0 4.0
'''
res2 = pd.concat([df1, df4], axis=0, ignore_index=True, join='inner')print(res2)'''
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 4.0 4.0 4.0
4 4.0 4.0 4.0
5 4.0 4.0 4.0'''print('join_axes参数')
res3 = pd.concat([df1, df4], axis=1)print(res3)'''
a b c d b c d e
0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 4.0 4.0 4.0 4.0
3 NaN NaN NaN NaN 4.0 4.0 4.0 4.0
4 NaN NaN NaN NaN 4.0 4.0 4.0 4.0
'''# res4 = pd.concat([df1, df4], axis=1, join_axes=[df1.index]) 已经没有这个参数了print('用append添加数据')
res5 = df1.append([df2, df3], ignore_index=True)print(res5)print('添加一行数据')
s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
res6 = df1.append(s1, ignore_index=True)print(res6)
合并数据 merge 相同key
import pandas as pd
# merge 合并
left = pd.DataFrame({'key':['k0','k1','k2','k3'],'A':['A0','A1','A2','A3'],'b':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['k0','k1','k2','k3'],'c':['c1','c2','c3','c4'],'d':['d1','d2','d3','d4']})print(left)print(right)
res = pd.merge(left, right, on='key')print(res)'''
key A b c d
0 k0 A0 B0 c1 d1
1 k1 A1 B1 c2 d2
2 k2 A2 B2 c3 d3
3 k3 A3 B3 c4 d4'''
merge 不同key
import numpy as np
import pandas as pd
# merge 合并
left = pd.DataFrame({'key':['k0','k1','k2','k3'],'key1':['k0','k1','k0','k1'],'A':['A0','A1','A2','A3'],'b':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['k0','k1','k2','k3'],'key1':['k0','k0','k0','k0'],'c':['c1','c2','c3','c4'],'d':['d1','d2','d3','d4']})print(left)print(right)
res = pd.merge(left, right, on=['key','key1'], how='inner')# how默认=innerprint(res)# how= [left, right, outer, inner]
res1 = pd.merge(left, right, on=['key','key1'], how='outer')print(res1)
merge 参数 indicator
import pandas as pd
# indicator 指示器 指示信号
df1 = pd.DataFrame({'col1':[0,1],'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':['b','c','d']})print(df1)print(df2)
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)print(res)'''
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b b both
2 2 NaN c right_only
3 2 NaN d right_only
'''print('give the indicator a custom name')
res1 = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')print(res1)'''
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b b both
2 2 NaN c right_only
3 2 NaN d right_only
'''
merge参数 left_index, right_index
import pandas as pd
left = pd.DataFrame({'a':['a0','a1','a2'],'b':['b0','b1','b2']},
index=['k0','k1','k2'])
right = pd.DataFrame({'c':['c0','c2','c3'],'d':['d0','d2','d3']},
index=['k0','k2','k3'])print(left)print(right)
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')print(res)'''
a b c d
k0 a0 b0 c0 d0
k1 a1 b1 NaN NaN
k2 a2 b2 c2 d2
k3 NaN NaN c3 d3'''
merge参数suffixes 后缀
import pandas as pd
boys = pd.DataFrame({'k':['ko','k1','k2'],'age':[1,2,3]})
girls = pd.DataFrame({'k':['ko','k0','k3'],'age':[4,5,6]})print(boys)print(girls)
res = pd.merge(boys, girls, on='k', how='outer', suffixes=['_boy','_girl'])# suffixes 后缀print(res)'''
k age_boy age_girl
0 ko 1.0 4.0
1 k1 2.0 NaN
2 k2 3.0 NaN
3 k0 NaN 5.0
4 k3 NaN 6.0
'''print('join的用法和merge差不多')