以下是我使用pansas时的纪录,没有过多的解释,只是自己的练习,如果想详细学习可以参考pandsd相关文档,想要快速了解的,我可以推荐一下另一位博主的介绍,很详细,可供参考学习 http://t.csdn.cn/awSTv import numpy as np import pandas as pd s = pd.Series([1,3,6,np.nan,44,1]) print(s) datas = pd.date_range('20220719',periods=6) print(datas) #第一种创建DataFrame df = pd.DataFrame(np.random.randn(6,4),index=datas,columns=['a','b','c','d']) print(df) #第二种创建DataFrame df2 = pd.DataFrame({'A':1, 'B':pd.Series(1,index=list(range(4)),dtype='float32'), 'C':np.array([3]*4,dtype='int32'), 'D':pd.Timestamp('20220722'), 'E':pd.Categorical(['test','train','test','train']), 'F':'foo' }) print(df2) #类型,索引,值,列 print(df2.dtypes,df2.index,df2.values,df2.columns) #详细信息 print(df2.describe) #转置 print(df2.T) #第排序 print(df2.sort_index(axis=1,ascending = False))
0 1.0 1 3.0 2 6.0 3 NaN 4 44.0 5 1.0 dtype: float64 DatetimeIndex(['2022-07-19', '2022-07-20', '2022-07-21', '2022-07-22', '2022-07-23', '2022-07-24'], dtype='datetime64[ns]', freq='D') a b c d 2022-07-19 0.289809 0.632396 -0.770436 -2.898336 2022-07-20 -0.617356 -0.557689 -1.017138 0.796157 2022-07-21 -2.524918 2.134619 -0.013117 -0.206382 2022-07-22 0.095292 1.616551 0.401501 0.917743 2022-07-23 0.194531 -1.924176 -0.375617 -2.735487 2022-07-24 -0.847691 0.874190 0.694338 -0.577505 A B C D E F 0 1 1.0 3 2022-07-22 test foo 1 1 1.0 3 2022-07-22 train foo 2 1 1.0 3 2022-07-22 test foo 3 1 1.0 3 2022-07-22 train foo A int64 B float32 C int32 D datetime64[ns] E category F object dtype: object Int64Index([0, 1, 2, 3], dtype='int64') [[1 1.0 3 Timestamp('2022-07-22 00:00:00') 'test' 'foo'] [1 1.0 3 Timestamp('2022-07-22 00:00:00') 'train' 'foo'] [1 1.0 3 Timestamp('2022-07-22 00:00:00') 'test' 'foo'] [1 1.0 3 Timestamp('2022-07-22 00:00:00') 'train' 'foo']] Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object') <bound method NDFrame.describe of A B C D E F 0 1 1.0 3 2022-07-22 test foo 1 1 1.0 3 2022-07-22 train foo 2 1 1.0 3 2022-07-22 test foo 3 1 1.0 3 2022-07-22 train foo> 0 1 2 \ A 1 1 1 B 1.0 1.0 1.0 C 3 3 3 D 2022-07-22 00:00:00 2022-07-22 00:00:00 2022-07-22 00:00:00 E test train test F foo foo foo 3 A 1 B 1.0 C 3 D 2022-07-22 00:00:00 E train F foo F E D C B A 0 foo test 2022-07-22 3 1.0 1 1 foo train 2022-07-22 3 1.0 1 2 foo test 2022-07-22 3 1.0 1 3 foo train 2022-07-22 3 1.0 1
In [24]:
''' pandas数据选择 ''' import numpy as np import pandas as pd datas = pd.date_range('20220719',periods=6) df = pd.DataFrame(np.arange(24).reshape((6,4)),index=datas,columns=['A','B','C','D']) print(df) print('________________________________________________') print(df['A'],df.A)#两种方法获取指定列 print('________________________________________________') print(df[0:3],df['20220719':'20220722']) print('________________________________________________') #select by lable:loc print(df.loc['20220719'])#select by lable:loc(选取日期为20220719的一行) print(df.loc['20220720',['A','B']])#select by lable:loc(选取日期为20220720,列为A,B的元素) print('________________________________________________') #select by possitions :iloc print(df.iloc[3])#第3行 print(df.iloc[3,2])#第3行第2位 print(df.iloc[1:3,2:4])#1-3行,2-4列 print(df.iloc[[1,2,5],1:4])#第3行 print('________________________________________________') print(df[df.A<8])
A B C D 2022-07-19 0 1 2 3 2022-07-20 4 5 6 7 2022-07-21 8 9 10 11 2022-07-22 12 13 14 15 2022-07-23 16 17 18 19 2022-07-24 20 21 22 23 ________________________________________________ 2022-07-19 0 2022-07-20 4 2022-07-21 8 2022-07-22 12 2022-07-23 16 2022-07-24 20 Freq: D, Name: A, dtype: int32 2022-07-19 0 2022-07-20 4 2022-07-21 8 2022-07-22 12 2022-07-23 16 2022-07-24 20 Freq: D, Name: A, dtype: int32 ________________________________________________ A B C D 2022-07-19 0 1 2 3 2022-07-20 4 5 6 7 2022-07-21 8 9 10 11 A B C D 2022-07-19 0 1 2 3 2022-07-20 4 5 6 7 2022-07-21 8 9 10 11 2022-07-22 12 13 14 15 ________________________________________________ A 0 B 1 C 2 D 3 Name: 2022-07-19 00:00:00, dtype: int32 A 4 B 5 Name: 2022-07-20 00:00:00, dtype: int32 ________________________________________________ A 12 B 13 C 14 D 15 Name: 2022-07-22 00:00:00, dtype: int32 14 C D 2022-07-20 6 7 2022-07-21 10 11 B C D 2022-07-20 5 6 7 2022-07-21 9 10 11 2022-07-24 21 22 23 ________________________________________________ A B C D 2022-07-19 0 1 2 3 2022-07-20 4 5 6 7
In [36]:
''' pandas设置值 ''' import numpy as np import pandas as pd datas = pd.date_range('20220719',periods=6) df = pd.DataFrame(np.arange(24).reshape((6,4)),index=datas,columns=['A','B','C','D']) df.iloc[2,2]=100 print(df) print('________________________________________________') df.loc['20220719','B']=200 print(df) print('________________________________________________') df[df.A>4]=0 #全部显示小于4的改变 print(df) print('________________________________________________') df.A[df.A>3]=0 #A中显示小于4的改变 print(df) print('________________________________________________') df['E']=np.nan #增加行 print(df) print('________________________________________________') df['F']=pd.Series([1,2,3,4,5,6],index=pd.date_range('20220719',periods=6)) print(df) #增加指定元素 print('________________________________________________')
A B C D 2022-07-19 0 1 2 3 2022-07-20 4 5 6 7 2022-07-21 8 9 100 11 2022-07-22 12 13 14 15 2022-07-23 16 17 18 19 2022-07-24 20 21 22 23 ________________________________________________ A B C D 2022-07-19 0 200 2 3 2022-07-20 4 5 6 7 2022-07-21 8 9 100 11 2022-07-22 12 13 14 15 2022-07-23 16 17 18 19 2022-07-24 20 21 22 23 ________________________________________________ A B C D 2022-07-19 0 200 2 3 2022-07-20 4 5 6 7 2022-07-21 0 0 0 0 2022-07-22 0 0 0 0 2022-07-23 0 0 0 0 2022-07-24 0 0 0 0 ________________________________________________ A B C D 2022-07-19 0 200 2 3 2022-07-20 0 5 6 7 2022-07-21 0 0 0 0 2022-07-22 0 0 0 0 2022-07-23 0 0 0 0 2022-07-24 0 0 0 0 ________________________________________________ A B C D E 2022-07-19 0 200 2 3 NaN 2022-07-20 0 5 6 7 NaN 2022-07-21 0 0 0 0 NaN 2022-07-22 0 0 0 0 NaN 2022-07-23 0 0 0 0 NaN 2022-07-24 0 0 0 0 NaN ________________________________________________ A B C D E F 2022-07-19 0 200 2 3 NaN 1 2022-07-20 0 5 6 7 NaN 2 2022-07-21 0 0 0 0 NaN 3 2022-07-22 0 0 0 0 NaN 4 2022-07-23 0 0 0 0 NaN 5 2022-07-24 0 0 0 0 NaN 6 ________________________________________________
In [56]:
''' pandas处理数据丢失 ''' import numpy as np import pandas as pd datas = pd.date_range('20220719',periods=6) df = pd.DataFrame(np.arange(24).reshape((6,4)),index=datas,columns=['A','B','C','D']) df.iloc[0,1]=np.nan df.iloc[1,2]=np.nan print(df) print('________________________________________________') print(df.fillna(value=100))#填充数据中为nan的值,赋值为100 print(df) print('________________________________________________') print(df.dropna(axis=0,how='any'))#处理数据:axis=0对行处理,how='any'只要有一个没有数据就处理,how='all'所有没有数据就操作 print(df) print('________________________________________________') print(np.any(df.isnull())==True) print('________________________________________________') print(df.isnull())
A B C D 2022-07-19 0 NaN 2.0 3 2022-07-20 4 5.0 NaN 7 2022-07-21 8 9.0 10.0 11 2022-07-22 12 13.0 14.0 15 2022-07-23 16 17.0 18.0 19 2022-07-24 20 21.0 22.0 23 ________________________________________________ A B C D 2022-07-19 0 100.0 2.0 3 2022-07-20 4 5.0 100.0 7 2022-07-21 8 9.0 10.0 11 2022-07-22 12 13.0 14.0 15 2022-07-23 16 17.0 18.0 19 2022-07-24 20 21.0 22.0 23 A B C D 2022-07-19 0 NaN 2.0 3 2022-07-20 4 5.0 NaN 7 2022-07-21 8 9.0 10.0 11 2022-07-22 12 13.0 14.0 15 2022-07-23 16 17.0 18.0 19 2022-07-24 20 21.0 22.0 23 ________________________________________________ A B C D 2022-07-21 8 9.0 10.0 11 2022-07-22 12 13.0 14.0 15 2022-07-23 16 17.0 18.0 19 2022-07-24 20 21.0 22.0 23 A B C D 2022-07-19 0 NaN 2.0 3 2022-07-20 4 5.0 NaN 7 2022-07-21 8 9.0 10.0 11 2022-07-22 12 13.0 14.0 15 2022-07-23 16 17.0 18.0 19 2022-07-24 20 21.0 22.0 23 ________________________________________________ True ________________________________________________ A B C D 2022-07-19 False True False False 2022-07-20 False False True False 2022-07-21 False False False False 2022-07-22 False False False False 2022-07-23 False False False False 2022-07-24 False False False False
In [ ]:
''' pandas导入导出数据 ''' import numpy as np import pandas as pd pd.read_csv(filename) # 导入csv格式文件中的数据 pd.read_table(filename) # 导入有分隔符的文本 (如TSV) 中的数据 pd.read_excel(filename) # 导入Excel格式文件中的数据 pd.read_sql(query, connection_object) # 导入SQL数据表/数据库中的数据 pd.read_json(json_string) # 导入JSON格式的字符,URL地址或者文件中的数据pd.read_html(url) # 导入经过解析的URL地址中包含的数据框 (DataFrame) 数据 pd.read_clipboard() # 导入系统粘贴板里面的数据 pd.DataFrame(dict) # 导入Python字典 (dict) 里面的数据,其中key是数据框的表头,value是数据框的内容。 df.to_csv(filename) # 将数据框 (DataFrame)中的数据导入csv格式的文件中 df.to_excel(filename) # 将数据框 (DataFrame)中的数据导入Excel格式的文件中 df.to_sql(table_name,connection_object) # 将数据框 (DataFrame)中的数据导入SQL数据表/数据库中 df.to_json(filename) # 将数据框 (DataFrame)中的数据导入JSON格式的文件中
In [61]:
''' pandas DataFrame合并 ''' import numpy as np import pandas as pd df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d']) df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d']) df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d']) '''concatenating''' print(pd.concat([df1,df2,df3],axis=0))#axis=0上下合并,而且index重复,不利于操作 print('________________________________________________') print(pd.concat([df1,df2,df3],axis=0,ignore_index=True))#axis=0上下合并,ignore_index忽略原来的index
a b c d 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 0 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 0 2.0 2.0 2.0 2.0 1 2.0 2.0 2.0 2.0 2 2.0 2.0 2.0 2.0 ________________________________________________ a b c d 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 5 1.0 1.0 1.0 1.0 6 2.0 2.0 2.0 2.0 7 2.0 2.0 2.0 2.0 8 2.0 2.0 2.0 2.0
In [64]:
''' pandas DataFrame合并 ''' import numpy as np import pandas as pd df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d']) df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e']) '''join,['inner','outer']''' print(pd.concat([df1,df2],axis=0,join='outer'))#axis=0上下合并,ignore_index忽略原来的index print('________________________________________________') print(pd.concat([df1,df2],axis=0,join='inner',ignore_index='True'))#axis=0上下合并,ignore_index忽略原来的index print('________________________________________________')
a b c d e 0 0.0 0.0 0.0 0.0 NaN 1 0.0 0.0 0.0 0.0 NaN 2 0.0 0.0 0.0 0.0 NaN 0 NaN 1.0 1.0 1.0 1.0 1 NaN 1.0 1.0 1.0 1.0 2 NaN 1.0 1.0 1.0 1.0 ________________________________________________ b c d 0 0.0 0.0 0.0 1 0.0 0.0 0.0 2 0.0 0.0 0.0 3 1.0 1.0 1.0 4 1.0 1.0 1.0 5 1.0 1.0 1.0
In [68]:
''' pandas DataFrame合并 ''' import numpy as np import pandas as pd df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d']) df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d']) df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'],index=[2,3,4]) '''append''' print(df1.append(df2,ignore_index=True)) print('________________________________________________') df4 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d']) s1 = pd.Series([1,2,3,4],index=['a','b','c','d']) res= df4.append(s1,ignore_index=True) print(res)
a b c d 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 5 1.0 1.0 1.0 1.0 ________________________________________________ a b c d 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 1.0 2.0 3.0 4.0
C:\Users\Double\AppData\Local\Temp\ipykernel_13356\7792660.py:10: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. print(df1.append(df2,ignore_index=True)) C:\Users\Double\AppData\Local\Temp\ipykernel_13356\7792660.py:14: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. res= df4.append(s1,ignore_index=True)
In [74]:
''' pandas DataFrame合并 ''' import numpy as np import pandas as pd left = pd.DataFrame({ 'key':['k1','k2','k3','k4'], 'A':['a1','a2','a3','a4'], 'B':['b1','b2','b3','b4'] }) right = pd.DataFrame({ 'key':['k1','k2','k3','k4'], 'C':['C1','C2','C3','C4'], 'D':['D1','D2','D3','D4'] }) print(left) print('____________________') print(right) print('____________________') print(pd.merge(left,right,on='key'))
key A B 0 k1 a1 b1 1 k2 a2 b2 2 k3 a3 b3 3 k4 a4 b4 ____________________ key C D 0 k1 C1 D1 1 k2 C2 D2 2 k3 C3 D3 3 k4 C4 D4 ____________________ key A B C D 0 k1 a1 b1 C1 D1 1 k2 a2 b2 C2 D2 2 k3 a3 b3 C3 D3 3 k4 a4 b4 C4 D4
In [84]:
''' pandas DataFrame合并 ''' import numpy as np import pandas as pd left = pd.DataFrame({ 'key1':['k1','k4','k2','k2'], 'key2':['k2','k1','k2','k1'], 'A':['a1','a2','a3','a4'], 'B':['b1','b2','b3','b4'] }) right = pd.DataFrame({ 'key1':['k2','k1','k2','k2'], 'key2':['k2','k2','k2','k1'], 'C':['C1','C2','C3','C4'], 'D':['D1','D2','D3','D4'] }) print(left) print('____________________') print(right) print('____________________') #how=['left','right','outer','inner'] print(pd.merge(left,right,on=['key1','key2'],how='right')) print('____________________') print(pd.merge(left,right,on=['key1','key2'],how='right',indicator=True)) print('____________________') print(pd.merge(left,right,left_index=True,right_index=True,how='right',indicator=True)) print('____________________')
key1 key2 A B 0 k1 k2 a1 b1 1 k4 k1 a2 b2 2 k2 k2 a3 b3 3 k2 k1 a4 b4 ____________________ key1 key2 C D 0 k2 k2 C1 D1 1 k1 k2 C2 D2 2 k2 k2 C3 D3 3 k2 k1 C4 D4 ____________________ key1 key2 A B C D 0 k2 k2 a3 b3 C1 D1 1 k1 k2 a1 b1 C2 D2 2 k2 k2 a3 b3 C3 D3 3 k2 k1 a4 b4 C4 D4 ____________________ key1 key2 A B C D _merge 0 k2 k2 a3 b3 C1 D1 both 1 k1 k2 a1 b1 C2 D2 both 2 k2 k2 a3 b3 C3 D3 both 3 k2 k1 a4 b4 C4 D4 both ____________________ key1_x key2_x A B key1_y key2_y C D _merge 0 k1 k2 a1 b1 k2 k2 C1 D1 both 1 k4 k1 a2 b2 k1 k2 C2 D2 both 2 k2 k2 a3 b3 k2 k2 C3 D3 both 3 k2 k1 a4 b4 k2 k1 C4 D4 both ____________________
In [88]:
''' pandas 图像 ''' import numpy as np import pandas as pd import matplotlib.pyplot as plt data =pd.Series(np.random.randn(1000),index=np.arange(1000)) data =data.cumsum() data.plot() plt.show()
____________________
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Input In [88], in <cell line: 14>() 12 plt.show() 13 print('____________________') ---> 14 data1 =pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),colmuns=list('ABCD')) 15 data1 =data.cumsum() 16 data1.plot() TypeError: __init__() got an unexpected keyword argument 'colmuns'
In [98]:
''' pandas 图像 ''' import numpy as np import pandas as pd import matplotlib.pyplot as plt data1 =pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('ABCD')) data1 =data1.cumsum() #ploat methoads: #'bar','hist','box','kde','area','scatter','hexbin','pie' data1.plot() plt.show()
In [100]:
''' pandas 图像 ''' import numpy as np import pandas as pd import matplotlib.pyplot as plt print('____________________') data =pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('ABCD')) data =data.cumsum() #ploat methoads: #'bar','hist','box','kde','area','scatter','hexbin','pie' ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class 1') data.plot.scatter(x='A',y='C',color='DarkGreen',label='Class 2',ax=ax) plt.show()
____________________
In [ ]: