Pandas的基础操作(一)——矩阵表的创建及其属性
(注:运行下面的代码之前,请先在文件开头导入所需的库:`import numpy as np` 和 `import pandas as pd`。)
# Pandas basics (part 1): creating Series / DataFrame objects and inspecting
# their most commonly used attributes and methods.
#
# The sample outputs shown in the comments were captured by the original
# author; exact reprs can differ between pandas versions, and anything built
# from np.random changes on every run.

import pandas as pd
import numpy as np


# --- Creating a pandas Series ------------------------------------------------
# The np.nan entry forces the whole Series to float64.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
# print(s)
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64

# --- Creating a DataFrame ----------------------------------------------------
# Six consecutive daily timestamps starting at 2019-07-10.
dates = pd.date_range('20190710', periods=6)
# print(dates)
# DatetimeIndex(['2019-07-10', '2019-07-11', '2019-07-12', '2019-07-13',
#                '2019-07-14', '2019-07-15'],
#               dtype='datetime64[ns]', freq='D')

# Row labels come from `dates`; column labels come from `columns`.
# The values are random, so the numbers below are only an example.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
# print(df)
#                    a         b         c         d
# 2019-07-10 -0.953463 -2.588401  1.680335  0.258889
# 2019-07-11 -2.183960 -1.559565 -0.119690  2.474845
# 2019-07-12  0.246754  0.237245  0.555891 -2.291064
# 2019-07-13  1.365473 -0.520804  2.351753 -0.650416
# 2019-07-14  0.160255 -0.665578 -1.330720 -0.502632
# 2019-07-15  1.427740 -0.386175 -0.102600  0.280338

# With no explicit labels, both rows and columns are numbered from 0.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
# print(df1)
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Building a DataFrame from a dict: each key becomes a column. Scalar values
# ('A', 'B', 'F') are repeated for every row of the index supplied by 'C'.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20190710'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
# print(df2)
#      A          B    C  D      E    F
# 0  1.0 2019-07-10  1.0  3   test  foo
# 1  1.0 2019-07-10  1.0  3  train  foo
# 2  1.0 2019-07-10  1.0  3   test  foo
# 3  1.0 2019-07-10  1.0  3  train  foo


# --- Common attributes and methods -------------------------------------------
# Items 1-4 and 6 are attributes (no parentheses); items 5, 7 and 8 are
# methods and must be called.

# 1. df.dtypes -- data type of each column.
print(df2.dtypes)
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

# 2. df.index -- the row labels.
print(df2.index)
# Int64Index([0, 1, 2, 3], dtype='int64')
# NOTE: pandas >= 2.0 no longer has Int64Index and prints
# Index([0, 1, 2, 3], dtype='int64') instead.

# 3. df.columns -- the column labels.
print(df2.columns)
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

# 4. df.values -- all values as a NumPy array (object dtype here because the
#    columns have mixed types). The pandas docs recommend df.to_numpy() for
#    new code; .values is kept because it is the attribute being demonstrated.
print(df2.values)
# [[1.0 Timestamp('2019-07-10 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2019-07-10 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2019-07-10 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2019-07-10 00:00:00') 1.0 3 'train' 'foo']]

# 5. df.describe() -- summary statistics (count, mean, standard deviation,
#    min, quartiles, max) for the numeric columns.
# NOTE(review): the output below was captured on an older pandas; newer
# releases may also summarise the datetime column 'B' -- confirm locally.
print(df2.describe())
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

# 6. df.T -- the transpose (rows become columns and vice versa).
print(df2.T)
# A                    1  ...                    1
# B  2019-07-10 00:00:00  ...  2019-07-10 00:00:00
# C                    1  ...                    1
# D                    3  ...                    3
# E                 test  ...                train
# F                  foo  ...                  foo

# 7. df.sort_index() -- sort by labels rather than by values.

# axis=1 sorts by column label: rows stay put, columns are reordered.
# ascending=False gives descending (reverse) order.
print(df2.sort_index(axis=1, ascending=False))
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2019-07-10  1.0
# 1  foo  train  3  1.0 2019-07-10  1.0
# 2  foo   test  3  1.0 2019-07-10  1.0
# 3  foo  train  3  1.0 2019-07-10  1.0

# axis=0 sorts by row label: columns stay put, rows are reordered.
print(df2.sort_index(axis=0, ascending=False))
#      A          B    C  D      E    F
# 3  1.0 2019-07-10  1.0  3  train  foo
# 2  1.0 2019-07-10  1.0  3   test  foo
# 1  1.0 2019-07-10  1.0  3  train  foo
# 0  1.0 2019-07-10  1.0  3   test  foo

# 8. df.sort_values() -- sort rows by the values of a column (here 'E').
print(df2.sort_values(by='E'))
#      A          B    C  D      E    F
# 0  1.0 2019-07-10  1.0  3   test  foo
# 2  1.0 2019-07-10  1.0  3   test  foo
# 1  1.0 2019-07-10  1.0  3  train  foo
# 3  1.0 2019-07-10  1.0  3  train  foo