Notes on problems encountered
- Full screen is F11, while single-step debugging (step over) is F10
###############
# Data Loader #
###############
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

# Arguments: data path, city, year, spatial level, input window length,
# number of prediction steps, and whether to normalize the features
def data_loader(data_path, city, year, level='district', length=12, n_steps=12, is_scale=False):
    def normalize(train, test):
        # Scale features to the range (0, 1) with MinMaxScaler.
        # test_shape[-1] is the size of the last dimension; reshape(-1, n)
        # flattens all leading dimensions into rows of n columns.
        # fit_transform = fit + transform: it learns the min/max of the
        # training data and scales it. For the test data we call transform
        # directly, because the scaler already stores the statistics
        # learned from the training data.
        if is_scale:
            scaler = MinMaxScaler()
            train_shape, test_shape = train.shape, test.shape
            train = scaler.fit_transform(train.reshape(-1, train_shape[-1]))
            test = scaler.transform(test.reshape(-1, test_shape[-1]))
            return train.reshape(train_shape), test.reshape(test_shape)
        else:
            return train, test

    # pd.read_csv() returns a tabular data structure (DataFrame) with row and
    # column indices. df.drop() removes rows or columns by label and axis;
    # here it drops the 'date' and 'time' columns. .columns returns an Index
    # of the remaining column names, used below as the list of selected areas.
    risk_data = pd.read_csv(f'{data_path}/risk_scores/{city}-{year}-{level}-hour-risk.csv')
    selected_areas = risk_data.drop(columns=['date', 'time']).columns
    n_districts = len(selected_areas)  # number of districts
    n_outputs = len(selected_areas)

    # Traffic Accident Risk
    # ~720 hourly rows (30 days); the first 21 days are training, the rest test
    risk_train, y_train = [], []
    risk_test, y_test = [], []
    for i in range(length, 721-n_steps):
        if i <= (21*24):  # before date 22nd
            y_train.append(risk_data.drop(columns=['date', 'time']).iloc[i:i+n_steps, :n_outputs].to_numpy())
            risk_train.append(risk_data.drop(columns=['date', 'time']).iloc[i-length:i, :n_districts].to_numpy())
        else:
            y_test.append(risk_data.drop(columns=['date', 'time']).iloc[i:i+n_steps, :n_outputs].to_numpy())
            risk_test.append(risk_data.drop(columns=['date', 'time']).iloc[i-length:i, :n_districts].to_numpy())
    risk_train, risk_test = normalize(np.array(risk_train), np.array(risk_test))
    y_train, y_test = np.array(y_train), np.array(y_test)

    # Weather & Air Quality
    weather_data = pd.read_csv(f'{data_path}/weather/{city}-{year}-count.csv').fillna(0)
    if level == 'district':
        weather_data['location'] = weather_data['location'].apply(lambda x: x.split('|')[0])
        weather_data = weather_data.groupby(by=['date', 'time', 'location'], as_index=False).mean()
    weather_train, weather_test = [], []
    location_weather = []
    for location in selected_areas:
        location_weather.append(weather_data[weather_data['location'] == location].iloc[:, 3:].to_numpy())
    location_weather = np.concatenate(location_weather, axis=1)
    for i in range(length, 721-n_steps):
        if i <= (21*24):
            weather_train.append(location_weather[i-length:i])
        else:
            weather_test.append(location_weather[i-length:i])
    weather_train, weather_test = normalize(np.array(weather_train).reshape(len(weather_train), length, n_districts, -1),
                                            np.array(weather_test).reshape(len(weather_test), length, n_districts, -1))

    # Dangerous Driving Behavior
    dtg_data = pd.read_csv(f'{data_path}/dangerous_cases/{city}-{year}-date-hour-{level}-new.csv')
    dtg_train, dtg_test = [], []
    location_dtg = []
    for location in selected_areas:
        if level == 'district':
            district = location.split('|')[0]
            location_dtg.append(dtg_data[dtg_data['district'] == district].iloc[:, 3:].to_numpy())
        else:
            district, subdistrict = location.split('|')[0], location.split('|')[1]
            location_dtg.append(dtg_data[(dtg_data['district'] == district) & (dtg_data['subdistrict'] == subdistrict)].iloc[:, 3:].to_numpy())
    location_dtg = np.concatenate(location_dtg, axis=1)
    for i in range(length, 721-n_steps):
        if i <= (21*24):
            dtg_train.append(location_dtg[i-length:i])
        else:
            dtg_test.append(location_dtg[i-length:i])
    dtg_train, dtg_test = normalize(np.array(dtg_train).reshape(len(dtg_train), length, n_districts, -1),
                                    np.array(dtg_test).reshape(len(dtg_test), length, n_districts, -1))

    # Road data (static per area, repeated across the time window)
    road_data = pd.read_csv(f'{data_path}/roads/{city}-{year}-{level}-road-count.csv').drop(columns=['attribute'])
    road_train, road_test = [], []
    location_road = []
    for location in selected_areas:
        location_road.append(road_data[location].to_numpy())
    for i in range(length, 721-n_steps):
        if i <= (21*24):
            road_train.append(np.array([location_road]*length))
        else:
            road_test.append(np.array([location_road]*length))
    road_train, road_test = normalize(np.array(road_train), np.array(road_test))

    # Demographics data
    demo_data = pd.read_csv(f'{data_path}/demographic/{city}-{year}-{level}.csv').drop(columns=['index'])
    demo_train, demo_test = [], []
    location_demo = []
    for location in selected_areas:
        location_demo.append(demo_data[location].to_numpy())
    for i in range(length, 721-n_steps):
        if i <= (21*24):
            demo_train.append(np.array([location_demo]*length))
        else:
            demo_test.append(np.array([location_demo]*length))
    demo_train, demo_test = normalize(np.array(demo_train), np.array(demo_test))

    # POI data
    poi_data = pd.read_csv(f'{data_path}/poi/{city}-{year}-{level}.csv').drop(columns=['location'])
    poi_train, poi_test = [], []
    location_poi = []
    for location in selected_areas:
        location_poi.append(poi_data[location].to_numpy())
    for i in range(length, 721-n_steps):
        if i <= (21*24):
            poi_train.append(np.array([location_poi]*length))
        else:
            poi_test.append(np.array([location_poi]*length))
    poi_train, poi_test = normalize(np.array(poi_train), np.array(poi_test))

    # Traffic volumes
    volume_data = pd.read_csv(f'{data_path}/traffic_volume/{city}-{level}-{year}.csv').drop(columns=['date', 'hour'])
    volume_train, volume_test = [], []
    for i in range(length, 721-n_steps):
        if i <= (21*24):
            volume_train.append(volume_data.iloc[i-length:i, :n_districts].to_numpy())
        else:
            volume_test.append(volume_data.iloc[i-length:i, :n_districts].to_numpy())
    volume_train, volume_test = normalize(np.array(volume_train), np.array(volume_test))

    # Calendar
    calendar_data = pd.read_csv(f'{data_path}/calendar/calendar-{city}-{year}-{level}.csv')
    calendar_train, calendar_test = [], []
    location_calendar = []
    for location in selected_areas:
        location_calendar.append(calendar_data[calendar_data['location'] == location].iloc[:, 1:].to_numpy())
    location_calendar = np.concatenate(location_calendar, axis=1)
    for i in range(length, 721-n_steps):
        if i <= (21*24):
            calendar_train.append(location_calendar[i-length:i])
        else:
            calendar_test.append(location_calendar[i-length:i])
    calendar_train, calendar_test = normalize(np.array(calendar_train).reshape(len(calendar_train), length, n_districts, -1),
                                              np.array(calendar_test).reshape(len(calendar_test), length, n_districts, -1))

    # Match shape: add a trailing feature dimension of size 1
    risk_train = risk_train[:, :, :, None]
    risk_test = risk_test[:, :, :, None]
    volume_train = volume_train[:, :, :, None]
    volume_test = volume_test[:, :, :, None]

    return {
        'risk': [risk_train, risk_test],
        'road': [road_train, road_test],
        'poi': [poi_train, poi_test],
        'demo': [demo_train, demo_test],
        'weather': [weather_train, weather_test],
        'calendar': [calendar_train, calendar_test],
        'volume': [volume_train, volume_test],
        'dtg': [dtg_train, dtg_test],
        'y': [y_train, y_test],
        'selected_areas': selected_areas,
    }
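For reference, a minimal sketch of how this loader might be called. The data directory, city name, and year below are hypothetical placeholders, not values taken from the project:

# Hypothetical paths/arguments for illustration only
data = data_loader('./data', city='seoul', year=2019, level='district',
                   length=12, n_steps=12, is_scale=True)
risk_train, risk_test = data['risk']  # shape: (samples, length, n_districts, 1)
y_train, y_test = data['y']           # shape: (samples, n_steps, n_outputs)
print(risk_train.shape, y_train.shape)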
Using the pandas library: 10 minutes to pandas — pandas 1.5.0 documentation
Pandas has two main data structures: Series (one-dimensional data) and DataFrame (two-dimensional data).
Series
>>> s = pd.Series(data, index=index)
data can be many different things (see the examples below):
- a Python dict
- an ndarray
- a scalar value (like 5)
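A short sketch of each construction path (the variable names are illustrative):

import pandas as pd
import numpy as np

# from a Python dict: keys become the index
s_dict = pd.Series({'a': 1, 'b': 2, 'c': 3})

# from an ndarray: an index can be supplied explicitly
s_arr = pd.Series(np.array([1.0, 2.0, 3.0]), index=['x', 'y', 'z'])

# from a scalar: the value is broadcast to every index label
s_scalar = pd.Series(5, index=['p', 'q', 'r'])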
DataFrame
iloc selects data by integer position (rows and columns by number, with an end-exclusive slice).
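A small sketch of position-based selection with iloc, mirroring the sliding-window slicing used in data_loader above (the DataFrame here is made up):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4],
                   'b': [5, 6, 7, 8],
                   'c': [9, 10, 11, 12]})

# rows 1..2 (end index exclusive), first two columns
window = df.iloc[1:3, :2]

# the loader uses the same pattern to cut one input window:
# risk_data.drop(columns=['date', 'time']).iloc[i-length:i, :n_districts]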