1 #coding: utf-8
2
3 #In[1]:
4
5
6 importpandas as pd7 importnumpy as np8
9
10 #In[2]:
11
12
13 data = pd.DataFrame(pd.read_excel('originalData.xlsx'))14
15 #date hour pressure wind_direction temperature
16 #0 2016-07-01 0.0 1000.4 225.0 26.4
17 #1 2016-07-01 NaN NaN NaN NaN
18 #2 2016-07-01 6.0 998.9 212.0 31.7
19 #3 2016-07-01 235.0 998.7 244.0 NaN
20 #4 2016-07-01 12.0 999.7 222.0 NaN
21 #5 2016-07-01 15.0 1000.0 102.0 NaN
22 #6 2016-07-01 NaN 998.8 202.0 26.0
23 #7 2016-07-01 NaN 1000.2 334.0 25.5
24 #8 2016-07-01 NaN 1000.2 334.0 25.5
25 #9 2016-07-02 3.0 1002.4 46.0 30.0
26 #10 2016-07-02 6.0 1001.3 37.0 29.3
27 #11 2016-07-02 9.0 1001.9 345.0 25.9
28 #12 2016-07-02 12.0 1003.6 113.0 25.1
29 #13 2016-07-02 12.0 1003.6 113.0 25.1
30 #14 2016-07-02 15.0 1002.4 138.0 25.3
31 #hour pressure wind_direction temperature
32 #count 11.000000 14.000000 14.000000 11.000000
33 #mean 29.545455 1000.864286 190.500000 26.890909
34 #std 68.313049 1.685963 102.932951 2.311473
35 #min 0.000000 998.700000 37.000000 25.100000
36 #25% 6.000000 999.775000 113.000000 25.400000
37 #50% 12.000000 1000.300000 207.000000 25.900000
38 #75% 13.500000 1002.275000 239.250000 27.850000
39 #max 235.000000 1003.600000 345.000000 31.700000
40 print(data)41 print(data.describe())42
43
44 #In[3]:
45
46 #RangeIndex(start=0, stop=15, step=1)
47 #Index(['date', 'hour', 'pressure', 'wind_direction', 'temperature'], dtype='object')
48 print(data.index)49 print(data.columns)50
51
52 #In[4]:
53
54 #date hour pressure wind_direction temperature
55 #0 2016-07-01 0.0 1000.4 225.0 26.4
56 #1 2016-07-01 NaN NaN NaN NaN
57 #2 2016-07-01 6.0 998.9 212.0 31.7
58 #3 2016-07-01 235.0 998.7 244.0 NaN
59 #4 2016-07-01 12.0 999.7 222.0 NaN
60 #5 2016-07-01 15.0 1000.0 102.0 NaN
61 #date hour pressure wind_direction temperature
62 #9 2016-07-02 3.0 1002.4 46.0 30.0
63 #10 2016-07-02 6.0 1001.3 37.0 29.3
64 #11 2016-07-02 9.0 1001.9 345.0 25.9
65 #12 2016-07-02 12.0 1003.6 113.0 25.1
66 #13 2016-07-02 12.0 1003.6 113.0 25.1
67 #14 2016-07-02 15.0 1002.4 138.0 25.3
68 print(data.head(6))69 print(data.tail(6))70
71
72 #In[5]:
73
74
75 #1. 删掉空白值超过3的行
76 data.dropna(axis=0, thresh=3, inplace=True)77 data.reset_index(drop=True, inplace=True)78
79 #date hour pressure wind_direction temperature
80 #0 2016-07-01 0.0 1000.4 225.0 26.4
81 #1 2016-07-01 6.0 998.9 212.0 31.7
82 #2 2016-07-01 235.0 998.7 244.0 NaN
83 #3 2016-07-01 12.0 999.7 222.0 NaN
84 #4 2016-07-01 15.0 1000.0 102.0 NaN
85 #5 2016-07-01 NaN 998.8 202.0 26.0
86 #6 2016-07-01 NaN 1000.2 334.0 25.5
87 #7 2016-07-01 NaN 1000.2 334.0 25.5
88 #8 2016-07-02 3.0 1002.4 46.0 30.0
89 #9 2016-07-02 6.0 1001.3 37.0 29.3
90 #10 2016-07-02 9.0 1001.9 345.0 25.9
91 #11 2016-07-02 12.0 1003.6 113.0 25.1
92 #12 2016-07-02 12.0 1003.6 113.0 25.1
93 #13 2016-07-02 15.0 1002.4 138.0 25.3
94 print(data)95
96
97 #In[6]:
98
99
100 #2. 填充空白,hour填充10,temperature填充25.5
101 data.fillna({'hour':10, 'temperature':25.5}, inplace=True)102
103 #date hour pressure wind_direction temperature
104 #0 2016-07-01 0.0 1000.4 225.0 26.4
105 #1 2016-07-01 6.0 998.9 212.0 31.7
106 #2 2016-07-01 235.0 998.7 244.0 25.5
107 #3 2016-07-01 12.0 999.7 222.0 25.5
108 #4 2016-07-01 15.0 1000.0 102.0 25.5
109 #5 2016-07-01 10.0 998.8 202.0 26.0
110 #6 2016-07-01 10.0 1000.2 334.0 25.5
111 #7 2016-07-01 10.0 1000.2 334.0 25.5
112 #8 2016-07-02 3.0 1002.4 46.0 30.0
113 #9 2016-07-02 6.0 1001.3 37.0 29.3
114 #10 2016-07-02 9.0 1001.9 345.0 25.9
115 #11 2016-07-02 12.0 1003.6 113.0 25.1
116 #12 2016-07-02 12.0 1003.6 113.0 25.1
117 #13 2016-07-02 15.0 1002.4 138.0 25.3
118 print(data)119
120
121 #In[7]:
122
123
124 #3. 删掉hour>24的行
125 num =data.index.max()126
127 for i inrange(num):128 if data.loc[i, 'hour'] > 24:129 data.drop([i], inplace=True)130 print('hour > 24, deleted')131
132 data.reset_index(drop=True, inplace=True)133
134 #hour > 24, deleted
135 #date hour pressure wind_direction temperature
136 #0 2016-07-01 0.0 1000.4 225.0 26.4
137 #1 2016-07-01 6.0 998.9 212.0 31.7
138 #2 2016-07-01 12.0 999.7 222.0 25.5
139 #3 2016-07-01 15.0 1000.0 102.0 25.5
140 #4 2016-07-01 10.0 998.8 202.0 26.0
141 #5 2016-07-01 10.0 1000.2 334.0 25.5
142 #6 2016-07-01 10.0 1000.2 334.0 25.5
143 #7 2016-07-02 3.0 1002.4 46.0 30.0
144 #8 2016-07-02 6.0 1001.3 37.0 29.3
145 #9 2016-07-02 9.0 1001.9 345.0 25.9
146 #10 2016-07-02 12.0 1003.6 113.0 25.1
147 #11 2016-07-02 12.0 1003.6 113.0 25.1
148 #12 2016-07-02 15.0 1002.4 138.0 25.3
149 print(data)150
151
152 #In[8]:
153
154
155 #4. 删掉重复的数据行,保留出现的第一行(全部删掉?保留最后一行?)
156 data.drop_duplicates(keep='first', inplace=True)157 data.reset_index(drop=True, inplace=True)158
159 #date hour pressure wind_direction temperature
160 #0 2016-07-01 0.0 1000.4 225.0 26.4
161 #1 2016-07-01 6.0 998.9 212.0 31.7
162 #2 2016-07-01 12.0 999.7 222.0 25.5
163 #3 2016-07-01 15.0 1000.0 102.0 25.5
164 #4 2016-07-01 10.0 998.8 202.0 26.0
165 #5 2016-07-01 10.0 1000.2 334.0 25.5
166 #6 2016-07-02 3.0 1002.4 46.0 30.0
167 #7 2016-07-02 6.0 1001.3 37.0 29.3
168 #8 2016-07-02 9.0 1001.9 345.0 25.9
169 #9 2016-07-02 12.0 1003.6 113.0 25.1
170 #10 2016-07-02 15.0 1002.4 138.0 25.3
171 print(data)172
173
174 #In[9]:
175
176
177 #5. 数据重排
178 randnum =np.random.permutation(data.index.size)179
180 #[ 4 0 10 3 1 5 8 9 7 2 6]
181 print(randnum)182
183
184 #In[10]:
185
186
187 data2 =data.take(randnum)188
189 #date hour pressure wind_direction temperature
190 #4 2016-07-01 10.0 998.8 202.0 26.0
191 #0 2016-07-01 0.0 1000.4 225.0 26.4
192 #10 2016-07-02 15.0 1002.4 138.0 25.3
193 #3 2016-07-01 15.0 1000.0 102.0 25.5
194 #1 2016-07-01 6.0 998.9 212.0 31.7
195 #5 2016-07-01 10.0 1000.2 334.0 25.5
196 #8 2016-07-02 9.0 1001.9 345.0 25.9
197 #9 2016-07-02 12.0 1003.6 113.0 25.1
198 #7 2016-07-02 6.0 1001.3 37.0 29.3
199 #2 2016-07-01 12.0 999.7 222.0 25.5
200 #6 2016-07-02 3.0 1002.4 46.0 30.0
201 print(data2)202
203
204 #In[11]:
205
206
207 #6. 随机采样
208 data3 = data.sample(8)209
210 #date hour pressure wind_direction temperature
211 #4 2016-07-01 10.0 998.8 202.0 26.0
212 #0 2016-07-01 0.0 1000.4 225.0 26.4
213 #2 2016-07-01 12.0 999.7 222.0 25.5
214 #1 2016-07-01 6.0 998.9 212.0 31.7
215 #5 2016-07-01 10.0 1000.2 334.0 25.5
216 #9 2016-07-02 12.0 1003.6 113.0 25.1
217 #7 2016-07-02 6.0 1001.3 37.0 29.3
218 #8 2016-07-02 9.0 1001.9 345.0 25.9
219 print(data3)220 data3.to_csv('data3.csv')