ipynb Programming in VS Code

A record of some problems I ran into.

  • Full screen is F11, and single-step debugging (step over) is F10
  • ###############
    # Data Loader #
    ###############
    
    import pandas as pd
    import numpy as np
    import json
    
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.metrics import mean_absolute_error as mae 
    from sklearn.metrics import mean_squared_error as mse
    
    # Arguments: data path, city, year, level (district/subdistrict), input window length, number of prediction steps, and whether to normalize
    def data_loader(data_path, city, year, level='district', length=12, n_steps=12, is_scale=False):
        
        def normalize(train, test):
            # scaling features to range (0, 1) by MinMaxScaler
            if is_scale:
                scaler = MinMaxScaler()  # min-max normalization of the data
                train_shape, test_shape = train.shape, test.shape
                train = scaler.fit_transform(train.reshape(-1, train_shape[-1]))
                test = scaler.transform(test.reshape(-1, test_shape[-1]))
                # test_shape[-1] is the size of the last dimension of the array
                # reshape(-1, n) flattens the leading dimensions into rows of n columns
                # fit_transform combines fit and transform: it learns the min/max of the
                # training data and scales the training data with them.
                # The test data is then scaled with transform only, because the scaler
                # already stores the statistics learned from the training data.
    
                return train.reshape(train_shape), test.reshape(test_shape)
            else:
                return train, test
    
        risk_data = pd.read_csv(f'{data_path}/risk_scores/{city}-{year}-{level}-hour-risk.csv')
        selected_areas = risk_data.drop(columns=['date', 'time']).columns
        n_districts = len(selected_areas) # number of districts
        n_outputs = len(selected_areas)
        # pd.read_csv() returns a tabular data structure (a DataFrame) with row and column indexes.
        # df.drop() removes rows or columns by label name and axis, or directly by index/column name;
        # here the 'date' and 'time' columns are dropped.
        # .columns returns an Index of the remaining column names (it describes names and dtype,
        # not the data itself); these remaining columns (25 districts here) form the vector selected_areas.
        # Traffic Accident Risk
        risk_train, y_train = [], []
        risk_test, y_test = [], []
        for i in range(length, 721-n_steps):
            if i <= (21*24): # before date 22nd
                y_train.append(risk_data.drop(columns=['date', 'time']).iloc[i:i+n_steps, :n_outputs].to_numpy())
                risk_train.append(risk_data.drop(columns=['date', 'time']).iloc[i-length:i, :n_districts].to_numpy())
            else:
                y_test.append(risk_data.drop(columns=['date', 'time']).iloc[i:i+n_steps, :n_outputs].to_numpy())
                risk_test.append(risk_data.drop(columns=['date', 'time']).iloc[i-length:i, :n_districts].to_numpy())
            
        risk_train, risk_test = normalize(np.array(risk_train), np.array(risk_test))
        y_train, y_test = np.array(y_train), np.array(y_test)
    
        # Weather & Air Quality  
        weather_data = pd.read_csv(f'{data_path}/weather/{city}-{year}-count.csv').fillna(0)
        if level == 'district':
            weather_data['location'] = weather_data['location'].apply(lambda x: x.split('|')[0])
            weather_data = weather_data.groupby(by=['date','time','location'], as_index=False).mean()                
        weather_train, weather_test = [], []
    
        location_weather = []
        for location in selected_areas:
            location_weather.append(weather_data[weather_data['location'] == location].iloc[:, 3:].to_numpy())
    
        location_weather = np.concatenate(location_weather, axis=1)
    
        for i in range(length, 721-n_steps):
            if i <= (21*24):
                weather_train.append(location_weather[i-length:i])
            else:
                weather_test.append(location_weather[i-length:i])
        
        weather_train, weather_test = normalize(np.array(weather_train).reshape(len(weather_train), length, n_districts, -1), np.array(weather_test).reshape(len(weather_test), length, n_districts, -1))
    
    
        # Dangerous Driving Behavior
        dtg_data = pd.read_csv(f'{data_path}/dangerous_cases/{city}-{year}-date-hour-{level}-new.csv')
        dtg_train, dtg_test = [], []
    
        location_dtg = []
        for location in selected_areas:
            if level == 'district':
                district = location.split('|')[0]
                location_dtg.append(dtg_data[dtg_data['district'] == district].iloc[:, 3:].to_numpy())
            else:
                district, subdistrict = location.split('|')[0], location.split('|')[1]
                location_dtg.append(dtg_data[(dtg_data['district'] == district) & (dtg_data['subdistrict'] == subdistrict)].iloc[:, 3:].to_numpy())
    
        location_dtg = np.concatenate(location_dtg, axis=1)
    
        for i in range(length, 721-n_steps):
            if i <= (21*24):
                dtg_train.append(location_dtg[i-length:i])
            else:
                dtg_test.append(location_dtg[i-length:i])
    
        dtg_train, dtg_test = normalize(np.array(dtg_train).reshape(len(dtg_train), length, n_districts, -1), np.array(dtg_test).reshape(len(dtg_test), length, n_districts, -1))
    
    
        # Road data
        road_data = pd.read_csv(f'{data_path}/roads/{city}-{year}-{level}-road-count.csv').drop(columns=['attribute'])
        road_train, road_test = [], []
    
        location_road = []
        for location in selected_areas:
            location_road.append(road_data[location].to_numpy())
    
        for i in range(length, 721-n_steps):
            if i <= (21*24):
                road_train.append(np.array([location_road]*length))
            else:
                road_test.append(np.array([location_road]*length))
                
        road_train, road_test = normalize(np.array(road_train), np.array(road_test))
    
    
        # Demographics data
        demo_data = pd.read_csv(f'{data_path}/demographic/{city}-{year}-{level}.csv').drop(columns=['index'])
        demo_train, demo_test = [], []
    
        location_demo = []
        for location in selected_areas:
            location_demo.append(demo_data[location].to_numpy())
    
        for i in range(length, 721-n_steps):
            if i <= (21*24):
                demo_train.append(np.array([location_demo]*length))
            else:
                demo_test.append(np.array([location_demo]*length))
        demo_train, demo_test = normalize(np.array(demo_train), np.array(demo_test))
    
    
        # POI data
        poi_data = pd.read_csv(f'{data_path}/poi/{city}-{year}-{level}.csv').drop(columns=['location'])
        poi_train, poi_test = [], []
    
        location_poi = []
        for location in selected_areas:
            location_poi.append(poi_data[location].to_numpy())
    
        for i in range(length, 721-n_steps):
            if i <= (21*24):
                poi_train.append(np.array([location_poi]*length))
            else:
                poi_test.append(np.array([location_poi]*length))
                
        poi_train, poi_test = normalize(np.array(poi_train), np.array(poi_test))
    
    
        # Traffic volumes
        volume_data = pd.read_csv(f'{data_path}/traffic_volume/{city}-{level}-{year}.csv').drop(columns=['date', 'hour'])
        volume_train, volume_test = [], []
    
        for i in range(length, 721-n_steps):
            if i <= (21*24):
                volume_train.append(volume_data.iloc[i-length:i, :n_districts].to_numpy())
            else:
                volume_test.append(volume_data.iloc[i-length:i, :n_districts].to_numpy())
    
        volume_train, volume_test = normalize(np.array(volume_train), np.array(volume_test))
    
    
        # Calendar
        calendar_data = pd.read_csv(f'{data_path}/calendar/calendar-{city}-{year}-{level}.csv')
        calendar_train, calendar_test = [], []
        
        location_calendar = []
        for location in selected_areas:
            location_calendar.append(calendar_data[calendar_data['location'] == location].iloc[:, 1:].to_numpy())
    
        location_calendar = np.concatenate(location_calendar, axis=1)
    
        for i in range(length, 721-n_steps):
            if i <= (21*24):
                calendar_train.append(location_calendar[i-length:i])
            else:
                calendar_test.append(location_calendar[i-length:i])
                
        calendar_train, calendar_test = normalize(np.array(calendar_train).reshape(len(calendar_train), length, n_districts, -1), np.array(calendar_test).reshape(len(calendar_test), length, n_districts, -1))
        
        # Match Shape
        risk_train = risk_train[:,:,:,None]
        risk_test = risk_test[:,:,:,None]
        volume_train = volume_train[:,:,:,None]
        volume_test = volume_test[:,:,:,None]
    
        return {
            'risk': [risk_train, risk_test],
            'road': [road_train, road_test],
            'poi': [poi_train, poi_test],
            'demo': [demo_train, demo_test],
            'weather': [weather_train, weather_test],
            'calendar': [calendar_train, calendar_test],
            'volume': [volume_train, volume_test],
            'dtg': [dtg_train, dtg_test],
            'y': [y_train, y_test],
            'selected_areas': selected_areas,
        }
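
    A minimal usage sketch (the data_path, city and year values below are placeholders for illustration; the real directory layout must match the CSV paths hard-coded in data_loader):

    data = data_loader(data_path='./data', city='seoul', year=2019,
                       level='district', length=12, n_steps=12, is_scale=True)

    risk_train, risk_test = data['risk']           # (samples, length, n_districts, 1)
    weather_train, weather_test = data['weather']  # (samples, length, n_districts, n_features)
    y_train, y_test = data['y']                    # (samples, n_steps, n_outputs)
    print(len(data['selected_areas']), risk_train.shape, y_train.shape)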

  • Using the pandas library: 10 minutes to pandas — pandas 1.5.0 documentation

 Two main data types: pandas' primary data structures are the Series (one-dimensional data) and the DataFrame (two-dimensional data).

 Series

>>> s = pd.Series(data, index=index)

data can be many different things:

  • a Python dict

  • an ndarray

  • a scalar value (like 5)
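
For example, a minimal sketch of the three cases listed above (the values are made up for illustration):

>>> import numpy as np
>>> # from a Python dict: the keys become the index
>>> pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> # from an ndarray: an index of matching length can be supplied
>>> pd.Series(np.random.randn(3), index=['x', 'y', 'z'])
>>> # from a scalar value (like 5): the value is repeated to match the index
>>> pd.Series(5.0, index=['a', 'b', 'c'])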

DataFrame

 Selecting data by position with iloc
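
 A small sketch of position-based selection (the DataFrame and the district names are made up; data_loader above slices its sliding windows the same way):

>>> df = pd.DataFrame({'date': ['2019-01-01']*3, 'time': [0, 1, 2],
...                    'gangnam': [0.1, 0.2, 0.3], 'mapo': [0.4, 0.5, 0.6]})
>>> # iloc is purely integer-position based: [row positions, column positions]
>>> df.iloc[0:2, 2:]             # first two rows, columns from position 2 onward
>>> df.iloc[0:2, 2:].to_numpy()  # as a 2x2 numpy array, like the window slices above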

 

 
