一、常用读入、导出数据方法
导入:
- read_csv
- read_excel
- read_hdf(一种灵活的数据格式,用于存储和传输科学和工程数据 )
- read_sql
- read_json
- read_msgpack(一种轻量级的二进制序列化格式;注意:该接口已在新版 pandas 中移除)
- read_html
- read_gbq(一种基于云端的数据仓库服务 )
- read_stata(一种用于数据分析和统计建模的软件 )
- read_sas(一种常用的数据分析和统计建模软件 )
- read_clipboard(剪贴板是操作系统中的一个功能,用于在应用程序之间传输文本和图像等数据 )
- read_pickle(序列化模块)
导出:
- to_csv
- to_excel
- to_hdf
- to_sql
- to_json
- to_msgpack
- to_html
- to_gbq
- to_stata
- to_clipboard
- to_pickle
二、Pandas 的一维数组 Series
s = pd.Series([1,3,6,np.nan,44,1])
print(s)
Result:
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
三、Pandas 的日期序列
dates = pd.date_range('20160101',periods=6)
print(dates)
Result:
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06'],
dtype='datetime64[ns]', freq='D')
四、Pandas 的二维矩阵操作 DataFrame
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d']) #第一个参数是矩阵,第二个参数是上面生成的行标签,第三个参数是列标签
print(df)
Result:
a b c d
2016-01-01 0.689988 -0.095512 -0.412387 0.924561
2016-01-02 1.143864 1.090012 -0.191735 -1.191397
2016-01-03 -0.128164 -2.430015 -0.472300 -1.765914
2016-01-04 -0.727960 0.374820 -1.165489 -0.769641
2016-01-05 -0.660326 1.101678 0.176635 0.803214
2016-01-06 -0.886476 0.979563 -0.758531 -0.992916
print(df.index) #打印行标签
Result:
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06'],
dtype='datetime64[ns]', freq='D')
print(df.columns) #打印列标签
Result:
Index(['a', 'b', 'c', 'd'], dtype='object')
print(df.values) #打印所有值
Result:
[[ 0.68998818 -0.0955118 -0.41238747 0.92456076]
[ 1.14386412 1.09001154 -0.19173546 -1.19139661]
[-0.12816412 -2.43001539 -0.47229969 -1.76591394]
[-0.72795951 0.3748198 -1.16548892 -0.76964143]
[-0.66032615 1.10167843 0.17663535 0.80321393]
[-0.88647561 0.97956266 -0.75853148 -0.99291591]]
print(df.describe()) #打印常用统计属性,比如计数、平均值、标准差、最大值、最小值等统计数据
Result:
a b c d
count 6.000000 6.000000 6.000000 6.000000
mean -0.094846 0.170091 -0.470635 -0.498682
std 0.836511 1.359586 0.461702 1.106704
min -0.886476 -2.430015 -1.165489 -1.765914
25% -0.711051 0.022071 -0.686974 -1.141776
50% -0.394245 0.677191 -0.442344 -0.881279
75% 0.485450 1.062399 -0.246898 0.410000
max 1.143864 1.101678 0.176635 0.924561
# 按列标签进行逆排序(axis=1 表示沿列方向,即对列标签排序)
print(df.sort_index(axis=1,ascending=False))
Result:
d c b a
2016-01-01 0.924561 -0.412387 -0.095512 0.689988
2016-01-02 -1.191397 -0.191735 1.090012 1.143864
2016-01-03 -1.765914 -0.472300 -2.430015 -0.128164
2016-01-04 -0.769641 -1.165489 0.374820 -0.727960
2016-01-05 0.803214 0.176635 1.101678 -0.660326
2016-01-06 -0.992916 -0.758531 0.979563 -0.886476
# 按行标签进行逆排序(axis=0 表示沿行方向,即对行标签排序)
print(df.sort_index(axis=0,ascending=False))
Result:
a b c d
2016-01-06 -0.886476 0.979563 -0.758531 -0.992916
2016-01-05 -0.660326 1.101678 0.176635 0.803214
2016-01-04 -0.727960 0.374820 -1.165489 -0.769641
2016-01-03 -0.128164 -2.430015 -0.472300 -1.765914
2016-01-02 1.143864 1.090012 -0.191735 -1.191397
2016-01-01 0.689988 -0.095512 -0.412387 0.924561
# 对数值里面某些行数据或列数据进行排序
print(df.sort_values(by='a',ascending=False))
Result:
a b c d
2016-01-02 1.143864 1.090012 -0.191735 -1.191397
2016-01-01 0.689988 -0.095512 -0.412387 0.924561
2016-01-03 -0.128164 -2.430015 -0.472300 -1.765914
2016-01-05 -0.660326 1.101678 0.176635 0.803214
2016-01-04 -0.727960 0.374820 -1.165489 -0.769641
2016-01-06 -0.886476 0.979563 -0.758531 -0.992916
print(df['a']) # 等价于 df.a
Result:
2016-01-01 1.902100
2016-01-02 -3.304422
2016-01-03 -0.660913
2016-01-04 -0.485151
2016-01-05 0.724529
2016-01-06 -1.029163
Freq: D, Name: a, dtype: float64
print(df[0:3]) # 取前3行,等价于 df['2016-01-01':'2016-01-03']
Result:
a b c d
2016-01-01 1.902100 0.588487 0.922994 0.181766
2016-01-02 -3.304422 0.279742 0.291598 -0.739634
2016-01-03 -0.660913 0.739118 -1.687635 -0.420376
#根据标签进行匹配
print(df.loc['2016-01-01'])
Result:
a 1.902100
b 0.588487
c 0.922994
d 0.181766
Name: 2016-01-01 00:00:00, dtype: float64
#根据位置进行匹配
print(df.iloc[3]) #匹配第4行数据(位置索引从0开始,3对应第4行)
Result:
a -0.485151
b 0.510899
c -1.214747
d -0.074381
Name: 2016-01-04 00:00:00, dtype: float64
print(df.iloc[3,1]) #匹配第4行第2列的数据(行索引3,列索引1)
Result:
0.5108988740473241
#筛选某些数据并找出位置
print(df[df.a > 0]) # a列大于0的所有数据
Result:
a b c d
2016-01-01 1.902100 0.588487 0.922994 0.181766
2016-01-05 0.724529 0.282068 -0.332142 2.532092
df.a[df.a>0] = 0 #将a列大于0的数全部设置为0
print(df)
Result:
a b c d
2016-01-01 -0.103724 0.469858 1.320164 0.818373
2016-01-02 0.000000 0.432307 0.596245 -0.120634
2016-01-03 -1.010943 0.616610 -0.607262 -0.087813
2016-01-04 -0.090025 1.018469 0.518396 1.705550
2016-01-05 0.000000 1.332888 0.830898 -0.878719
2016-01-06 -0.490326 0.600170 0.199292 -1.129577
df['e'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20130101',periods=6))
print(df)
Result:
a b c d e
2016-01-01 -0.103724 0.469858 1.320164 0.818373 NaN
2016-01-02 0.000000 0.432307 0.596245 -0.120634 NaN
2016-01-03 -1.010943 0.616610 -0.607262 -0.087813 NaN
2016-01-04 -0.090025 1.018469 0.518396 1.705550 NaN
2016-01-05 0.000000 1.332888 0.830898 -0.878719 NaN
2016-01-06 -0.490326 0.600170 0.199292 -1.129577 NaN
五、Pandas处理丢失数据
print(df.dropna(axis=1,how='any')) # 丢掉含有NaN的列;how='any' 指只要含一个NaN就丢弃该列,how='all' 指整列全为NaN时才丢弃
Result:
a b c d
2016-01-01 -0.112092 0.552210 -1.149139 -0.548805
2016-01-02 0.922602 -0.098273 0.325613 0.428681
2016-01-03 -0.483804 0.611667 1.092159 1.062395
2016-01-04 -1.052769 2.194841 1.963597 0.561053
2016-01-05 1.080100 -0.924017 1.357584 0.691858
2016-01-06 -1.073571 -0.644965 -1.279937 -1.950447
df['e'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20130101',periods=6))
print(df.fillna(value=0)) # 给所有Nan值填充为0
Result:
a b c d e
2016-01-01 -0.112092 0.552210 -1.149139 -0.548805 0.0
2016-01-02 0.922602 -0.098273 0.325613 0.428681 0.0
2016-01-03 -0.483804 0.611667 1.092159 1.062395 0.0
2016-01-04 -1.052769 2.194841 1.963597 0.561053 0.0
2016-01-05 1.080100 -0.924017 1.357584 0.691858 0.0
2016-01-06 -1.073571 -0.644965 -1.279937 -1.950447 0.0
print(df.isnull()) # 判断有没有缺失的值
Result:
a b c d e
2016-01-01 False False False False True
2016-01-02 False False False False True
2016-01-03 False False False False True
2016-01-04 False False False False True
2016-01-05 False False False False True
2016-01-06 False False False False True
六、读入、导出数据详细
data = pd.read_csv(r"D:\DeepLearning\jena_climate.csv") # Windows 路径建议使用原始字符串 r"..." 以避免反斜杠被转义
print(data)
data.to_pickle(r"D:\DeepLearning\play.pickle")
Result:
Date Time p (mbar) ... max. wv (m/s) wd (deg)
0 01.01.2009 00:10:00 996.52 ... 1.75 152.3
1 01.01.2009 00:20:00 996.57 ... 1.50 136.1
2 01.01.2009 00:30:00 996.53 ... 0.63 171.6
3 01.01.2009 00:40:00 996.51 ... 0.50 198.0
4 01.01.2009 00:50:00 996.51 ... 0.63 214.3
... ... ... ... ... ...
420546 31.12.2016 23:20:00 1000.07 ... 1.52 240.0
420547 31.12.2016 23:30:00 999.93 ... 1.92 234.3
420548 31.12.2016 23:40:00 999.82 ... 2.00 215.2
420549 31.12.2016 23:50:00 999.81 ... 2.16 225.8
420550 01.01.2017 00:00:00 999.82 ... 1.96 184.9
[420551 rows x 15 columns]
七、pandas合并
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True) # ignore_index 是忽略前面合并的序号
print(res)
Result:
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
# join,['inner','outer']
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
res = pd.concat([df1,df2],join='inner',ignore_index=True) # inner 只保留两个表中都存在的列
print(res)
Result:
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
res = pd.concat([df1,df2],join='outer',ignore_index=True) # outer 保留所有列,不存在的元素用 NaN 补充
print(res)
Result:
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
5 NaN 1.0 1.0 1.0 1.0
# append合并
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
res = df1.append([df2,df3],ignore_index=True) # 注意:append 已在 pandas 2.0 中移除,新版请用 pd.concat([df1,df2,df3],ignore_index=True)
print(res)
#merge合并
left = pd.DataFrame({'Key':['K0','K1','K2','K3'],'A':['A0','A1','A2','A3'],'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'Key':['K0','K1','K2','K3'],'C':['C0','C1','C2','C3'],'D':['D0','D1','D2','D3']})
res = pd.merge(left,right,on='Key') # 基于Key这一类,合并
print(res)
Result:
Key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
left = pd.DataFrame({'A':['A0','A1','A2'],'B':['B0','B1','B2']},index=['K0','K1','K2'])
right = pd.DataFrame({'C':['C0','C2','C3'],'D':['D0','D2','D3']},index=['K0','K2','K3'])
res = pd.merge(left,right,left_index=True,right_index=True,how='outer')
print(res)
Result:
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
boys = pd.DataFrame({'k':['K0','K1','K2'],'age':[1,2,3]})
girls = pd.DataFrame({'k':['K0','K0','K3'],'age':[4,5,6]})
res = pd.merge(boys,girls,on='k',suffixes=('_boys','_girls'),how='outer') # suffixes 必须是有序的元组或列表;原来写成集合{}是无序的,会导致后缀顺序不确定
print(res)
Result:
k age_boys age_girls
0 K0 1.0 4.0
1 K0 1.0 5.0
2 K1 2.0 NaN
3 K2 3.0 NaN
4 K3 NaN 6.0
八、Pandas画图
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()
data = pd.DataFrame(np.random.randn(1000,4),index = np.arange(1000),columns=list("ABCD"))
data = data.cumsum()
'''plot methods: bar、 hist、box、kde、area、scatter、hexbin、pie'''
ax = data.plot.scatter(x='A',y='B',color = 'DarkBlue',label="Class 1")
data.plot.scatter(x='A',y='C',color = 'DarkGreen',label="Class 2",ax=ax) # ax=ax,第一个是属性,第二个是图名
plt.show()