Python绘图的几种变化形式

1.概述

在使用matplotlib.pyplot画出图形窗口时,通常会有不同的变种,本文根据实际使用经验,特记录以下几种常见的情况。

2. plt.subplot 操作

from __future__ import division
import os
import sys
sys.path.append('../../')
import time
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools

def compare_analysis(trip_time, th, path):
    trip_time_idF = list(np.where(trip_time < trip_time_thresh)[0])
    trip_time_idF_M = list(np.where(trip_time >=trip_time_thresh)[0])
    '''give plot'''
    plt.figure(figsize=(23,16))
    plt.subplot(211)
    plt.scatter(trip_time[trip_time_idF], th[trip_time_idF],alpha=0.5)
    #plt.scatter(tripTime[tripTime_idF_M], th[tripTime_idF_M],c='r',marker='*',alpha=0.5)
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.xlim(trip_time.min()-1000, trip_time.max())
    plt.ylabel('Hist time with hist speed (s)',fontsize=16)
    plt.ylim(th.min()-1000, th.max())
    plt.xticks(np.arange(0, 25000, 2500))
    #plt.yticks(np.arange(-5, 5, 0.5))
    plt.grid(True)
    plt.title('Hist2Trip ScatterPlot',fontsize=16)

    plt.subplot(212)
    plt.scatter(trip_time[trip_time_idF], th[trip_time_idF],alpha=0.5)
    plt.scatter(trip_time[trip_time_idF_M], th[trip_time_idF_M],c='r',marker='*',alpha=0.5)
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.ylabel('Hist time with hist speed (s)',fontsize=16)
    plt.xticks(np.arange(0, 25000, 2500))
    plt.grid(True)
    plt.title('Hist2Trip ScatterPlot',fontsize=16)
    plt.savefig('{0}/compare.png'.format(path))  

""" actual and regression time scatter """
def scatterPlot(rates, th, tr, tp, ta, path):
    rates_h = rates[:,0]  # hist pass rate,1 or 0
    rates_r = rates[:,1]  # real pass rate,1 or 0
    rates_a = rates[:,2]  # actual pass rate,1 or 0
    #use_colours = {"T":"green","F":"red"} 
    colors = ['r','g']
    
    plt.figure(figsize=(22,7))
    plt.subplot(131)
    plt.scatter(ta, th, c=[colors[int(x)] for x in rates_h], alpha=0.5)
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.ylabel('Hist time with hist speed (s)',fontsize=16)
    plt.grid(True)
    plt.title('Hist2Trip ScatterPlot',fontsize=16)    
    
    plt.subplot(132)
    plt.scatter(ta, tr, c=[colors[int(x)] for x in rates_r], alpha=0.5)
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.ylabel('Real time with hist speed (s)',fontsize=16)
    plt.grid(True)
    plt.title('Real2Trip ScatterPlot',fontsize=16)     
    
    plt.subplot(133)
    plt.scatter(ta, tp, c=[colors[int(x)] for x in rates_a],alpha=0.5)
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.ylabel('Regre time with Regression coffe (s)',fontsize=16)
    plt.grid(True)
    plt.title('Regre2Trip ScatterPlot',fontsize=16)  
    plt.savefig('{0}/timeScatter.png'.format(path))

3.axes操作

def histPlot(th, tr, tp, ta, path):
    dth = th - ta
    dtr = tr - ta
    dtp = tp - ta
    
    fig,axes = plt.subplots(1,3, figsize=(23,8)) #get 1x1 subplots
    axes[0].set_xlabel(r'$\Delta$' + "t (s)",fontsize=16) #set the coordinate axis-x label
    axes[0].set_ylabel(u'Probability',fontsize=16) #set the coordinate axis-y label
    axes[0].set_title(u"Hist2trip time diff distribution",fontsize=16)
    axes[0].set_xlim(-3600, 3600)
    axes[0].hist(dth,bins=50,density=1,color='r',alpha=0.5,edgecolor='k') #th-tripTime
    axes[0].grid(True)
    #get 1x1 subplots
    axes[1].set_xlabel(r'$\Delta$' + "t (s)",fontsize=16) #set the coordinate axis-x label
    axes[1].set_ylabel(u'Probability',fontsize=16) #set the coordinate axis-y label
    axes[1].set_title(u"Real2trip time diff distribution",fontsize=16)
    axes[1].set_xlim(-3600, 3600)
    axes[1].hist(dtr,bins=50,density=1,color='r',alpha=0.5,edgecolor='k') # tr - tripTime
    axes[1].grid(True)
    
    axes[2].set_xlabel(r'$\Delta$' + "t (s)",fontsize=16) #set the coordinate axis-x label
    axes[2].set_ylabel(u'Probability',fontsize=16)  #set the coordinate axis-y label
    axes[2].set_title(u"Regre2trip time diff distribution",fontsize=16)
    axes[2].set_xlim(-3600, 3600)
    axes[2].hist(dtp,bins=50,density=1,color='r',alpha=0.5,edgecolor='k') #tp - tripTime
    axes[2].grid(True)
    plt.savefig('{0}/timeDiff.png'.format(path))

4.具体应用

4.1 应用1

# -*- coding: utf-8 -*-
from __future__ import division
import os
import sys
sys.path.append('../../')
import time
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Ridge,Lasso,ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import tensorflow as tf
#from sklearn.preprocessing import StandardScaler

""" peak:[06:30,07:30)-[07:30,08:30)-[08:30,09:30)-[16:30,17:30)-[17:30,18:30)-[18:30,19:30)-[19:30,20:30) """
""" from starting point:[0,10)-[10,20)-[20,40)-[40,60)-[60,90)-[90,inf) """
""" traffic state: smooth 3 ; slow 2 ; congestion 1; heavy congestion 0 """

link_scena0 = 6   # time from starting point
link_scena1 = 5   # link road class
link_scena2 = 8   # link time phase(peak or not)
link_scena3 = 2   # link state (congestion 0 or not 1)

node_scena0 = 1   # node type
node_scena1 = 4   # node turn type
node_scena2 = 8   # node time phase

p_am_begin_0 = 195  # Am rush hour begin  -period
p_am_begin_1 = 225  # Am rush hour begin  -period
p_am_begin_2 = 255  # Am rush hour begin  -period
p_am_end     = 285  # Am rusuh hour end   -period

p_pm_begin_0 = 495  # Pm rush hour begin  -period
p_pm_begin_1 = 525  # Pm rush hour begin  -period
p_pm_begin_2 = 555  # Pm rush hour begin  -period
p_pm_begin_3 = 585  # Pm rush hour begin  -period
p_pm_end     = 615  # Pm rush hour end    -period

t_begin_lower = "00:00:00"
t_begin_upper = "23:59:59"

"""trip time threshold when necessary"""
trip_time_thresh = 9000 
scenario_num_filter = 100   

""" generate map dict for link scenario """
scena_link = []
lc_ls = [0,1,3]     # highway roads, urban express roads, main roads
fs_ls = [0,1,2]     # 0~10min, 10~20min, 20~40min
for fs in range(link_scena0):  # time from starting point
    for lc in range(link_scena1): # link road class
        for lp in range(link_scena2): # link time phase
            for ls in range(link_scena3): # link state
                if fs in fs_ls and lc in lc_ls:
                    if (fs,lc,lp,ls) not in scena_link:
                        scena_link.append((fs,lc,lp,ls))   
                else:
                    if (fs,lc,lp,1) not in scena_link:
                        scena_link.append((fs,lc,lp,1))
num_link_scena = len(scena_link)                            
links_dict = dict(zip(scena_link,range(num_link_scena)))

"""generate map dict for node scenario """                            
scena_node = []
for ntt in range(node_scena1):
    for ntp in range(node_scena2):
        scena_node.append((node_scena0,ntt,ntp))   
num_node_scena = len(scena_node)               
nodes_dict = dict(zip(scena_node,range(num_node_scena)))  

""" get a generator for data stream """
def data_stream_trip(data_stream):
    for data in data_stream:
        yield data
       
""" load *.trip.etalr file and get trip time file for learning""" 
def load_data_eta(trips_file, is_filter = False): #get data 
    filter_thresh = trip_time_thresh # trip time threshold 
    # trips_file = ["310000.20190312.trip.etalr"]
    filterNum = 0
    tripData  = []
    tripData_ = []
    for trip_file in trips_file:
        print('{} read file {}'.format(time.strftime('%Y-%m-%d %X'), trip_file))
        trip_fd  = open(trip_file, 'r')
        ik = 0
        for line in trip_fd:
            line = line.strip().split('|')
            if len(line) < 20: #20
                continue
            tmc_index,time_hist,time_real,\
            from_start,time_phase,time_period,link_class,link_state,\
            node_phase,node_type,node_turntype,node_time,\
            begin_ts,end_ts,period,eta_act,th_,tr_,length,trip_id = line
            if is_filter:
                if float(eta_act) > filter_thresh:
                    filterNum +=1
                    continue
            if ik % 100000 == 0:
                print('{} read lines {}'.format(time.strftime('%Y-%m-%d %X'), ik))
            ik += 1
            if begin_ts < t_begin_lower or begin_ts >= t_begin_upper:
                continue
            tripData.append((
                [float(t)*1.0 for t in time_hist.split(',')],
                [float(t)*1.0 for t in time_real.split(',')],
                [int(t) for t in from_start.split(',')],
                [int(t) for t in time_phase.split(',')],
                #[int(t) for t in time_period.split(',')],
                [int(t) for t in link_class.split(',')],
                [int(t) for t in link_state.split(',')],
                [int(t) for t in node_phase.split(',') if t != ''],
                [int(t) for t in node_type.split(',') if t != ''],
                [int(t) for t in node_turntype.split(',') if t != ''],
                [float(t) for t in node_time.split(',') if t != ''],
                begin_ts,end_ts,int(period),float(eta_act),float(length),trip_id))
        trip_fd.close()
    """combine road class for lower level and combine link state"""
    for c, trip in enumerate(tripData):
        time_hist,time_real,\
        from_start,time_phase,link_class,link_state,\
        node_phase,node_type,node_turntype,node_time,\
        begin_ts,end_ts,period,eta_act,length,trip_id = trip

        for ilc,lc in enumerate(link_class):
            if lc == 6:  # urban express roads
                link_class[ilc] = 1  
            elif lc in [1,2,3]:  # national highway,provincial highways, county roads,
                link_class[ilc] = 2
            elif lc in [7]:  # main roads
                link_class[ilc] = 3 
            elif lc in [4,5,8,9,10]:  
            # township highways,county rural internal roads,secondary roads,ordinary roads,small roads
                link_class[ilc] = 4
            # link_class[ilc] = lc-5 if lc>=8 else lc
            if link_class[ilc] in lc_ls and from_start[ilc] in fs_ls:
                if link_state[ilc] in [0,1,2]:
                    link_state[ilc] = 0
                else:
                    link_state[ilc] = 1
            else:
                link_state[ilc] = 1 
        tripData_.append((time_hist,time_real,\
        from_start,time_phase,link_class,link_state,\
        node_phase,node_type,node_turntype,node_time,\
        begin_ts,end_ts,period,eta_act,length,trip_id))
    fmt = '{} filtered trip number is:{},rate is:{:.2f} owing to trip time'
    print(fmt.format(time.strftime('%Y-%m-%d %X'),filterNum,filterNum/(len(tripData)+filterNum))) 
    return tripData_

""" convert trip time file to training file"""         
def data_trip_processing(data, reg_flag = 2, is_jointReg = True):
    tps = len(data)
    t_links_ht = np.zeros((tps,num_link_scena))
    t_links_rt = np.zeros((tps,num_link_scena))
    t_nodes = np.zeros((tps,num_node_scena))
    tripInfo = np.zeros((tps,5))
    tripId = []
    skip_links = 0
    skip_nodes = 0
    for i, trip in enumerate(data):
        link_ht, link_rt, from_start, link_phase,link_class,link_state,\
        node_phase, node_type, node_turntype, node_time, \
        begin_ts, end_ts, trip_period, trip_time, trip_len, trip_id = trip
        if len(from_start) != len(link_class) or len(link_class) != len(link_phase) or len(link_phase)!=len(link_state):
            skip_links +=1
            continue
        links_map = []
        for fs,lc,lp,ls in zip(from_start,link_class,link_phase,link_state):
            links_map.append(links_dict[(fs,lc,lp,ls)])
        
        for j, link in enumerate(links_map):
            t_links_ht[i,link] +=link_ht[j]
            t_links_rt[i,link] +=link_rt[j]
            
        if node_type:
            if len(node_type)!= len(node_turntype) or len(node_turntype)!=len(node_phase):
                skip_nodes +=1
                continue     
            nodes_map = []
            for nt,ntt,ntp in zip(node_type,node_turntype,node_phase):
                nodes_map.append(nodes_dict[(nt,ntt,ntp)])
            for k, node in enumerate(nodes_map):
                t_nodes[i,node] +=node_time[k]
                
        tripInfo[i][0] = trip_period      # trip period
        tripInfo[i][1] = trip_len         # tripLength
        tripInfo[i][2] = len(link_phase)  # linksNum for a trip
        tripInfo[i][3] = len(node_phase)  # nodesNum for a trip
        tripInfo[i][4] = trip_time        # tripTime
        tripId.append(trip_id)
      
    ta = tripInfo[:,4]      # trip actual time 
    if is_jointReg == True: # weighted th & tr respectively
        t_links = np.concatenate((t_links_ht, t_links_rt), axis=1)
        Xdata = np.concatenate((t_links,t_nodes), axis=1)
        Ydata = ta 
    else:
        if reg_flag == 1:  # only th
            t_links = t_links_ht
            Xdata = np.concatenate((t_links,t_nodes), axis=1)
            Ydata = ta 
        elif reg_flag == 2:  # only tr 
            t_links = t_links_rt
            Xdata = np.concatenate((t_links,t_nodes), axis=1)
            Ydata = ta 
        else:  # weighted th & tr (k,1-k)
            t_links = t_links_ht - t_links_rt  # t_h - t_r
            Xdata = np.concatenate((t_links,t_nodes), axis=1)
            t_links_rt_x = np.sum(t_links_rt,axis=1)
            Ydata = ta  - t_links_rt_x

    t_ht = np.sum(t_links_ht, axis=1)
    t_rt = np.sum(t_links_rt, axis=1)
    t_nt = np.sum(t_nodes, axis=1)
    th = t_ht + t_nt    # trip time with hist
    tr = t_rt + t_nt    # trip time with real
    df_trip = pd.DataFrame(tripInfo[:,:4])
    df_trip[4] = tripId #trip id
    
    print('{} filtered trip number is:{} owing to links'.format(time.strftime('%Y-%m-%d %X'),skip_links))  
    print('{} filtered trip number is:{} owing to nodes'.format(time.strftime('%Y-%m-%d %X'),skip_nodes))   
    return Xdata,Ydata,t_ht,t_rt,t_nt,th,tr,ta,df_trip

""" ordinary least square(ols) linear/ridge regression model """
""" score:coefficient of determination R^2, the value range [0,1] """
"""       the stronger the interpretation ability of the model """
def test_regression_ols(data_iter, path, is_jointReg = True, is_ridge = True):
    """ models for comparison"""
    # RR = Ridge(alpha=0.5,fit_intercept = False)
    LR = LinearRegression(fit_intercept = False)
    # LA = Lasso(alpha=0.0001,precompute=True,max_iter=1000,
    #         positive=True, random_state=9999, selection='random',fit_intercept = False)
    LA = ElasticNet(alpha=0.001, l1_ratio=0.9, fit_intercept=False, precompute=True, 
                     max_iter=1000,tol=0.01,random_state=9999,positive=True)
    """prepare data"""
    train_trip = [item for item in data_iter] # invert iterator to list
    tripDataTrain = load_data_eta(train_trip)
    outPut = data_trip_processing(data = tripDataTrain, is_jointReg = is_jointReg)
    Xdata, Ydata = outPut[0], outPut[1]
    X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, test_size = 0.01, random_state = 0)
    if is_ridge:
        # RR.fit(X_train, y_train)
        LA.fit(X_train, y_train)
        print("%s Coefficients:%s, intercept:%.4f" % (time.strftime('%Y-%m-%d %X'),LA.coef_, LA.intercept_))
        print("{} RMSE:{:.4f}".format(time.strftime('%Y-%m-%d %X'),np.sqrt(np.mean((LA.predict(X_test) - y_test) ** 2))))
        print("{} score:{:.4f}".format(time.strftime('%Y-%m-%d %X'),LA.score(X_test, y_test)))
        coeff = np.around(LA.coef_, decimals=4)
        interc = np.around(LA.intercept_, decimals=4)
        rmse = np.sqrt(np.mean((LA.predict(X_test) - y_test) ** 2))
    else:
        LR.fit(X_train, y_train)
        print("%s Coefficients:%s, intercept:%.4f" % (time.strftime('%Y-%m-%d %X'),LR.coef_, LR.intercept_))
        print("{} RMSE:{:.4f}".format(time.strftime('%Y-%m-%d %X'),np.sqrt(np.mean((LR.predict(X_test) - y_test) ** 2))))
        print("{} score:{:.4f}".format(time.strftime('%Y-%m-%d %X'),LR.score(X_test, y_test)))
        coeff = np.around(LR.coef_, decimals=4)
        interc = np.around(LR.intercept_, decimals=4)
        rmse = np.sqrt(np.mean((LR.predict(X_test) - y_test) ** 2))
    link_states, node_states = num_scenario(tripDataTrain)
    scenarioPlot(link_states, node_states, path)
    save_params(link_states, node_states, coeff, interc, path, is_jointReg = is_jointReg)
    np.save(path+"/coeff_interc.npy", np.concatenate((coeff,np.array(interc).reshape(-1,)), axis=0)) 
    return coeff, interc, rmse

""" sgd linear/ridge regression model """        
def test_regression_sgd(data_iter, path, partial_fit_num =600, display_step=100, is_jointReg=True):
    sgdR = SGDRegressor(max_iter=5000, tol=0.01, epsilon=5, eta0=1e-6,fit_intercept=False,
                         loss="epsilon_insensitive", penalty="l2")
    link_states, node_states = None, None
    for ik, train_trip in enumerate(data_iter):
        # load training and test data and split for training and validation
        print("{} generate training data No.{}".format(time.strftime('%Y-%m-%d %X'),ik))
        tripDataTrain = load_data_eta([train_trip])
        outPut = data_trip_processing(data = tripDataTrain, is_jointReg = is_jointReg)
        Xdata, Ydata = outPut[0], outPut[1]
        link_states_, node_states_ = num_scenario(tripDataTrain)
        if link_states is None or node_states is None:
             link_states, node_states = link_states_, node_states_
        else:
            link_states[:,0] = link_states[:,0] + link_states_[:,0]
            node_states[:,0] = node_states[:,0] + node_states_[:,0]
        X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, test_size = 0.01, random_state = 0)
        print("{} start training and get result No.{}".format(time.strftime('%Y-%m-%d %X'),ik))
        # iter some times for each data in data iterator
        for n in range(partial_fit_num):
            sgdR.partial_fit(X_train, y_train)
            if n % display_step ==0:
                print('{} training times {}'.format(time.strftime('%Y-%m-%d %X'), n))
        print("%s Coefficients:%s, intercept:%.4f" % (time.strftime('%Y-%m-%d %X'),sgdR.coef_, sgdR.intercept_))
        print("{} RMSE:{:.4f}".format(time.strftime('%Y-%m-%d %X'),np.sqrt(np.mean((sgdR.predict(X_test) - y_test) ** 2))))
        print("{} score:{:.4f}".format(time.strftime('%Y-%m-%d %X'),sgdR.score(X_test, y_test)))
        outPut, Xdata, Ydata, X_train, y_train = None, None, None, None, None
    coeff = np.around(sgdR.coef_, decimals=4)
    interc = np.around(sgdR.intercept_[0], decimals=4)
    rmse = np.sqrt(np.mean((sgdR.predict(X_test) - y_test) ** 2))
    scenarioPlot(link_states, node_states, path)
    save_params(link_states, node_states, coeff, interc, path, is_jointReg = is_jointReg)
    np.save(path+"/coeff_interc.npy", np.concatenate((coeff,np.array(interc).reshape(-1,)), axis=0))  
    return coeff, interc, rmse

""" testing different lambda on prediction performance"""
def test_ridge_lmbda(*data):
    x_train, x_test, y_train, y_test = data
    lmbdas = [0, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
    scores = []
    for i, lmbda in enumerate(lmbdas):
        RR = Ridge(alpha = lmbda)
        RR.fit(x_train, y_train)
        scores.append(RR.score(x_test, y_test))
    return lmbdas, scores

""" get scenario number for different scenario"""    
def num_scenario(data):
    tripData = data
    link_dict = {}
    node_dict = {}
    for i, line in enumerate(tripData):
        link_ht, link_rt, from_start, link_phase, link_class, link_state,\
        node_phase, node_type, node_turntype, node_time,\
        begin_ts, end_ts, trip_period, trip_time, trip_len, trip_id= line
        for lkey in zip(from_start, link_class, link_phase, link_state):
            if not lkey in link_dict:
                link_dict[lkey] = 1
            else:
                link_dict[lkey] += 1
        for nkey in zip(node_type, node_turntype, node_phase):
            if not nkey in node_dict:
                node_dict[nkey] = 1
            else:
                node_dict[nkey] += 1             
    link_states = np.zeros((num_link_scena,5), dtype = np.int32)
    node_states = np.zeros((num_node_scena,4), dtype = np.int32)
    link_states[:,1:] = np.array(scena_link) 
    node_states[:,1:] = np.array(scena_node)         
    for lkey in link_dict:
        lk = links_dict[lkey] #row number
        link_states[lk,0] = link_dict[lkey]
    for nkey in node_dict:
        nk = nodes_dict[nkey] #row number
        node_states[nk,0] = node_dict[nkey]   
    return link_states, node_states

""" trip time with regression prediction""" 
def time_reg(Xdata, coeff, interc):
    tp = np.around(np.dot(Xdata,coeff) + interc, decimals=2)
    return tp

""" evaluate eta to get csv file""" 
def eta_evaluate_csv(t_hist_link, t_real_link, t_node, t_hist, t_tmc, t_reg, t_act, df_trip, path):
    rate_eta = np.zeros((t_hist_link.size,3))
    time_ = np.concatenate((t_act.reshape(-1,1), t_hist.reshape(-1,1)), axis=1)
    time_ = np.concatenate((time_, t_tmc.reshape(-1,1)), axis=1)
    time_ = np.concatenate((time_, t_reg.reshape(-1,1)), axis=1)
    for i, act in enumerate(t_act):
        rate = get_pass_rate(tuple(time_[i]))
        rate_eta[i,0] = rate[0]
        rate_eta[i,1] = rate[1]
        rate_eta[i,2] = rate[2]
    df_time = df_trip # tripPeriod,triplength,linksNum,nodesNum,tripId 
    df_time[5] = t_hist_link  # static_link_time
    df_time[6] = t_real_link  # tmc_link_time
    df_time[7] = t_node  # node_time
    df_time[8] = t_hist  # static_time
    df_time[9] = t_tmc   # tmc_time
    df_time[10] = t_reg  # reg_time
    df_time[11] = t_act  # actual_time
    df_time[12] = np.abs(t_act - t_reg) #absoluteDiff
    df_time[13] = np.around(np.abs(t_act - t_reg)/t_act *100,  decimals=2) #relativeDiff%
    df_time[14] = rate_eta[:,0] #static ok(0/1)
    df_time[15] = rate_eta[:,1] #tmc ok(0/1)
    df_time[16] = rate_eta[:,2] #reg ok(0/1)
    static_rate = round(np.mean(rate_eta[:,0]),4)
    tmc_rate = round(np.mean(rate_eta[:,1]),4)
    reg_rate = round(np.mean(rate_eta[:,2]),4)
    """" Am ok rate"""
    reg_rate_am_0 = df_time[(df_time.iloc[:,0] >=p_am_begin_0)&(df_time.iloc[:,0] <p_am_begin_1)].iloc[:,16].mean() # [06:30,07:30)
    reg_rate_am_1 = df_time[(df_time.iloc[:,0] >=p_am_begin_1)&(df_time.iloc[:,0] <p_am_begin_2)].iloc[:,16].mean() # [07:30-08:30)
    reg_rate_am_2 = df_time[(df_time.iloc[:,0] >=p_am_begin_2)&(df_time.iloc[:,0] <p_am_end)].iloc[:,16].mean() # [08:30-09:30)
    reg_rate_am = df_time[(df_time.iloc[:,0] >=p_am_begin_0)&(df_time.iloc[:,0] <p_am_end)].iloc[:,16].mean() # 06:30-09:30
    """" Pm/off-peak ok rate"""
    reg_rate_pm_0 = df_time[(df_time.iloc[:,0] >=p_pm_begin_0)&(df_time.iloc[:,0] <p_pm_begin_1)].iloc[:,16].mean() # [16:30,17:30)
    reg_rate_pm_1 = df_time[(df_time.iloc[:,0] >=p_pm_begin_1)&(df_time.iloc[:,0] <p_pm_begin_2)].iloc[:,16].mean() # [17:30-18:30)
    reg_rate_pm_2 = df_time[(df_time.iloc[:,0] >=p_pm_begin_2)&(df_time.iloc[:,0] <p_pm_begin_3)].iloc[:,16].mean() # [18:30-19:30)
    reg_rate_pm_3 = df_time[(df_time.iloc[:,0] >=p_pm_begin_3)&(df_time.iloc[:,0] <p_pm_end)].iloc[:,16].mean() # [19:30-20:30)
    reg_rate_pm = df_time[(df_time.iloc[:,0] >=p_pm_begin_0)&(df_time.iloc[:,0] <p_pm_end)].iloc[:,16].mean() # 16:30-20:30
    reg_rate_no = df_time[(df_time.iloc[:,0] <p_am_begin_0)|(df_time.iloc[:,0] >=p_pm_end)|(
                        (df_time.iloc[:,0] >=p_am_end)&(df_time.iloc[:,0] <p_pm_begin_0))].iloc[:,16].mean()
    """replace nan"""
    if np.isnan(reg_rate_am_0): reg_rate_am_0 = 0
    if np.isnan(reg_rate_am_1): reg_rate_am_1 = 0
    if np.isnan(reg_rate_am_2): reg_rate_am_2 = 0
    if np.isnan(reg_rate_am):   reg_rate_am = 0
    if np.isnan(reg_rate_pm_0): reg_rate_pm_0 = 0
    if np.isnan(reg_rate_pm_1): reg_rate_pm_1 = 0
    if np.isnan(reg_rate_pm_2): reg_rate_pm_2 = 0
    if np.isnan(reg_rate_pm_3): reg_rate_pm_3 = 0
    if np.isnan(reg_rate_pm):   reg_rate_pm = 0 
    if np.isnan(reg_rate_no):   reg_rate_no = 0
    
    print("{} static ok rate(in total)  : {:.3%}".format(time.strftime('%Y-%m-%d %X'),static_rate))
    print("{} tmc ok rate(in total)     : {:.3%}".format(time.strftime('%Y-%m-%d %X'),tmc_rate))
    print("{} reg ok rate(in total)     : {:.3%}".format(time.strftime('%Y-%m-%d %X'),reg_rate))
    print("{} reg ok rate(rush hour--am): {:.3%}({:.3%},{:.3%},{:.3%})".format(time.strftime('%Y-%m-%d %X'),
                                          reg_rate_am,reg_rate_am_0,reg_rate_am_1,reg_rate_am_2))
    print("{} reg ok rate(rush hour--pm): {:.3%}({:.3%},{:.3%},{:.3%},{:.3%})".format(time.strftime('%Y-%m-%d %X'),
                                  reg_rate_pm,reg_rate_pm_0,reg_rate_pm_1,reg_rate_pm_2,reg_rate_pm_3))
    print("{} reg ok rate(off-peak time): {:.3%}".format(time.strftime('%Y-%m-%d %X'),reg_rate_no))
    
    cols = ['tripPeriod', 'tripLength','linksNum','nodesNum','tripId','static_link_time', 'tmc_link_time',
            'node_time', 'static_time', 'tmc_time','reg_time','actual_time', 'absoluteDiff','relativeDiff%',
             'static ok(0/1)', 'tmc ok(0/1)','reg ok(0/1)']
    df_time.columns = cols
    df_time.to_csv(path + '/eta_result.csv', header=True, index=True)
    df_time_abs = df_time.sort_values(by = ['absoluteDiff'],axis = 0,ascending = False)
    df_time_abs.to_csv(path + '/eta_absolute.csv', header=True, index=True)
    df_time_re = df_time.sort_values(by = ['relativeDiff%'],axis = 0,ascending = False)
    df_time_re.to_csv(path + '/eta_relative.csv', header=True, index=True)
    return rate_eta

""" pass_rate calculate""" 
def get_pass_rate(trip_time):
    standard_time, static_time, tmc_time, regr_time = trip_time
    static_diff = abs(standard_time - static_time)/standard_time
    tmc_diff = abs(standard_time - tmc_time)/standard_time
    regr_diff = abs(standard_time - regr_time)/standard_time
    
    actual_time = standard_time
    tmc_diff_time = abs(standard_time - tmc_time)
    static_diff_time = abs(standard_time - static_time)
    regr_diff_time = abs(standard_time - regr_time)
    
    static_ok = 0
    tmc_ok = 0
    regr_ok = 0

    if actual_time <= 10 * 60:
        if static_diff <= 0.3:
            static_ok = 1
    elif actual_time <= 15 * 60:
        if static_diff_time <= 3 * 60:
            static_ok = 1
    elif actual_time <= 20 * 60:
        if static_diff <= 0.2:
            static_ok = 1
    elif actual_time <= 40 * 60:
        if static_diff_time <= 4 * 60:
            static_ok = 1
    else:
        if static_diff <= 0.1:
            static_ok = 1
            
    if actual_time <= 10 * 60:
        if tmc_diff <= 0.3:
            tmc_ok = 1
    elif actual_time <= 15 * 60:
        if tmc_diff_time <= 3 * 60:
            tmc_ok = 1
    elif actual_time <= 20 * 60:
        if tmc_diff <= 0.2:
            tmc_ok = 1
    elif actual_time <= 40 * 60:
        if tmc_diff_time <= 4 * 60:
            tmc_ok = 1
    else:
        if tmc_diff <= 0.1:
            tmc_ok = 1
            
    if actual_time <= 10 * 60:
        if regr_diff <= 0.3:
            regr_ok = 1
    elif actual_time <= 15 * 60:
        if regr_diff_time <= 3 * 60:
            regr_ok = 1
    elif actual_time <= 20 * 60:
        if regr_diff <= 0.2:
            regr_ok = 1
    elif actual_time <= 40 * 60:
        if regr_diff_time <= 4 * 60:
            regr_ok = 1
    else:
        if regr_diff <= 0.1:
            regr_ok = 1
            
    ret = (static_ok, tmc_ok, regr_ok)
    
    return ret

""" output link and node coeff params for different scenario""" 
def save_params(link_states, node_states, coeff, interc, path, is_jointReg = True):
    df_coeffLink = pd.DataFrame(link_states) # trip link scena
    df_coeffNode = pd.DataFrame(node_states) # trip node scena
    if is_jointReg:
        df_coeffLink[5] = coeff[:num_link_scena]  #static coeff
        df_coeffLink[6] = coeff[num_link_scena:num_link_scena*2] #tmc coeff
        df_coeffLink[7] = interc  #trip interc
        df_coeffNode[4] = coeff[num_link_scena*2:]
        df_coeffLink.columns = ['linkTotal','linkFromStp','linkClass','linkPhase','linkState','staticCoeff','tmcCoeff','interc']
        df_coeffNode.columns = ['nodeTotal','nodeType','nodeTurnType','nodePhase', 'nodeCoeff']
    else:
        df_coeffLink[5] = coeff[:num_link_scena]  #static or tmc coeff
        df_coeffLink[6] = interc  #trip interc
        df_coeffNode[4] = coeff[num_link_scena:]
        df_coeffLink.columns = ['linkTotal','linkFromStp','linkClass','linkPhase','linkState','Coeff','interc']
        df_coeffNode.columns = ['nodeTotal','nodeType','nodeTurnType','nodePhase', 'nodeCoeff']
    df_coeffLink_filter = df_coeffLink[(df_coeffLink.iloc[:,0] <scenario_num_filter)] # scenario filter
    df_coeffLink.to_csv(path + '/link_params.csv', header=True, index=True)  
    df_coeffLink_filter.to_csv(path + '/link_params_filter.csv', header=True, index=True) 
    df_coeffNode.to_csv(path + '/node_params.csv', header=True, index=True) 
    
"""compare trip time when necessary """     
def compare_analysis(trip_time, th, path):
    trip_time_idF = list(np.where(trip_time < trip_time_thresh)[0])
    trip_time_idF_M = list(np.where(trip_time >=trip_time_thresh)[0])
    '''give plot'''
    plt.figure(figsize=(23,16))
    plt.subplot(211)
    plt.scatter(trip_time[trip_time_idF], th[trip_time_idF],alpha=0.5)
    #plt.scatter(tripTime[tripTime_idF_M], th[tripTime_idF_M],c='r',marker='*',alpha=0.5)
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.xlim(trip_time.min()-1000, trip_time.max())
    plt.ylabel('Hist time with hist speed (s)',fontsize=16)
    plt.ylim(th.min()-1000, th.max())
    plt.xticks(np.arange(0, 25000, 2500))
    #plt.yticks(np.arange(-5, 5, 0.5))
    plt.grid(True)
    plt.title('Hist2Trip ScatterPlot',fontsize=16)

    plt.subplot(212)
    plt.scatter(trip_time[trip_time_idF], th[trip_time_idF],alpha=0.5)
    plt.scatter(trip_time[trip_time_idF_M], th[trip_time_idF_M],c='r',marker='*',alpha=0.5)
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.ylabel('Hist time with hist speed (s)',fontsize=16)
    plt.xticks(np.arange(0, 25000, 2500))
    plt.grid(True)
    plt.title('Hist2Trip ScatterPlot',fontsize=16)
    plt.savefig('{0}/compare.png'.format(path))  

""" plot fig for scenario number """     
def scenarioPlot(link_states, node_states, path):
    
    df_link = pd.DataFrame(link_states)
    df_node = pd.DataFrame(node_states)
    df_link.columns = ['linkTotal','linkFromStp','linkClass','linkPhase','linkState']
    df_node.columns = ['nodeTotal','nodeType','nodeTurnType','nodePhase']
    
    df_link_max = df_link.sort_values(by = ['linkTotal'],axis = 0,ascending = False).iloc[:10,:]
    df_link_min = df_link.sort_values(by = ['linkTotal'],axis = 0,ascending = False).iloc[-10:,:]
    df_link_max = df_link_max.set_index(['linkFromStp','linkClass','linkPhase','linkState'])
    df_link_min = df_link_min.set_index(['linkFromStp','linkClass','linkPhase','linkState'])
    
    df_node_max = df_node.sort_values(by = ['nodeTotal'],axis = 0,ascending = False).iloc[:10,:]
    df_node_min = df_node.sort_values(by = ['nodeTotal'],axis = 0,ascending = False).iloc[-10:,:]
    df_node_max = df_node_max.set_index(['nodeType','nodeTurnType','nodePhase'])
    df_node_min = df_node_min.set_index(['nodeType','nodeTurnType','nodePhase'])
    
    fig,axes = plt.subplots(2,2, figsize=(24,18)) #get 1x1 subplots
    
    df_link_max.plot(kind='barh',ax=axes[0,0],title=u" ",alpha=0.95,stacked=True)
    df_link_max.linkTotal.plot(kind='barh',ax=axes[0,0],title=u" ",alpha=0.95,stacked=True)
    axes[0,0].set_xlabel(r'Scenario number',fontsize=16) #set the coordinate axis-x label
    axes[0,0].set_ylabel('linkFromStp,linkClass,linkPhase,linkState',fontsize=16)
    axes[0,0].set_title(u"Link scenario(Max-10)",fontsize=16)
    
    df_link_min.plot(kind='barh',ax=axes[0,1],title=u" ",alpha=0.95,stacked=True)
    df_link_min.linkTotal.plot(kind='barh',ax=axes[0,1],title=u" ",alpha=0.95,stacked=True)
    axes[0,1].set_xlabel(r'Scenario number',fontsize=16) #set the coordinate axis-x label
    axes[0,1].set_ylabel('linkFromStp,linkClass,linkPhase,linkState',fontsize=16)
    axes[0,1].set_title(u"Link scenario(Min-10)",fontsize=16)
    
    df_node_max.plot(kind='barh',ax=axes[1,0],title=u" ",alpha=0.95,stacked=True)
    df_node_max.nodeTotal.plot(kind='barh',ax=axes[1,0],title=u" ",alpha=0.95,stacked=True)
    axes[1,0].set_xlabel(r'Scenario number',fontsize=16) #set the coordinate axis-x label
    axes[1,0].set_ylabel('nodeType,nodeTurnType,nodePhase',fontsize=16)
    axes[1,0].set_title(u"Node scenario(Max-10)",fontsize=16)
    
    df_node_min.plot(kind='barh',ax=axes[1,1],title=u" ",alpha=0.95,stacked=True)
    df_node_min.nodeTotal.plot(kind='barh',ax=axes[1,1],title=u" ",alpha=0.95,stacked=True)
    axes[1,1].set_xlabel(r'Scenario number',fontsize=16) #set the coordinate axis-x label
    axes[1,1].set_ylabel('nodeType,nodeTurnType,nodePhase',fontsize=16)
    axes[1,1].set_title(u"Node scenario(Min-10)",fontsize=16)
    plt.savefig('{0}/scenarioDistri.png'.format(path))   

"""time diff distribution"""
def histPlot(th, tr, tp, ta, path):
    dth = th - ta
    dtr = tr - ta
    dtp = tp - ta
    
    fig,axes = plt.subplots(1,3, figsize=(23,8)) #get 1x1 subplots
    axes[0].set_xlabel(r'$\Delta$' + "t (s)",fontsize=16) #set the coordinate axis-x label
    axes[0].set_ylabel(u'Probability',fontsize=16) #set the coordinate axis-y label
    axes[0].set_title(u"Hist2trip time diff distribution",fontsize=16)
    axes[0].set_xlim(-3600, 3600)
    axes[0].hist(dth,bins=50,density=1,color='r',alpha=0.5,edgecolor='k') #th-tripTime
    axes[0].grid(True)
    #get 1x1 subplots
    axes[1].set_xlabel(r'$\Delta$' + "t (s)",fontsize=16) #set the coordinate axis-x label
    axes[1].set_ylabel(u'Probability',fontsize=16) #set the coordinate axis-y label
    axes[1].set_title(u"Real2trip time diff distribution",fontsize=16)
    axes[1].set_xlim(-3600, 3600)
    axes[1].hist(dtr,bins=50,density=1,color='r',alpha=0.5,edgecolor='k') # tr - tripTime
    axes[1].grid(True)
    
    axes[2].set_xlabel(r'$\Delta$' + "t (s)",fontsize=16) #set the coordinate axis-x label
    axes[2].set_ylabel(u'Probability',fontsize=16)  #set the coordinate axis-y label
    axes[2].set_title(u"Regre2trip time diff distribution",fontsize=16)
    axes[2].set_xlim(-3600, 3600)
    axes[2].hist(dtp,bins=50,density=1,color='r',alpha=0.5,edgecolor='k') #tp - tripTime
    axes[2].grid(True)
    plt.savefig('{0}/timeDiff.png'.format(path))
     
""" actual and regression time scatter """
def scatterPlot(rates, th, tr, tp, ta, path):
    rates_h = rates[:,0]  # hist pass rate,1 or 0
    rates_r = rates[:,1]  # real pass rate,1 or 0
    rates_a = rates[:,2]  # actual pass rate,1 or 0
    #use_colours = {"T":"green","F":"red"} 
    colors = ['r','g']
    
    plt.figure(figsize=(22,7))
    plt.subplot(131)
    plt.scatter(ta, th, c=[colors[int(x)] for x in rates_h], alpha=0.5)
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.ylabel('Hist time with hist speed (s)',fontsize=16)
    plt.grid(True)
    plt.title('Hist2Trip ScatterPlot',fontsize=16)    
    
    plt.subplot(132)
    plt.scatter(ta, tr, c=[colors[int(x)] for x in rates_r], alpha=0.5)
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.ylabel('Real time with hist speed (s)',fontsize=16)
    plt.grid(True)
    plt.title('Real2Trip ScatterPlot',fontsize=16)     
    
    plt.subplot(133)
    plt.scatter(ta, tp, c=[colors[int(x)] for x in rates_a],alpha=0.5)
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.ylabel('Regre time with Regression coffe (s)',fontsize=16)
    plt.grid(True)
    plt.title('Regre2Trip ScatterPlot',fontsize=16)  
    plt.savefig('{0}/timeScatter.png'.format(path))

def linear_plot(X_train, y_train, coeff, interc):
    tp_train = np.dot(X_train,coeff) + interc
    ta_train = y_train
    
    plt.figure(figsize=(12,12))
    plt.scatter(tp_train, ta_train,alpha=0.5)
    # plt.scatter(tt[tt_idF_M], th[tt_idF_M],c='r',marker='*',alpha=0.5)
    
    plt.xlabel('Trip actual time (s)',fontsize=16)
    plt.ylabel('Hist time with hist speed (s)',fontsize=16)

    plt.grid(True)
    plt.title('Hist2Trip ScatterPlot',fontsize=16) 

def show_plot(alphas, scores):
    figure = plt.figure()
    ax = figure.add_subplot(1, 1, 1)
    ax.plot(alphas, scores)
    ax.set_xlabel(r"$\lambda$")
    ax.set_ylabel(r"score")
    ax.set_xscale("log")
    ax.set_title("Ridge")
    ax.set_gid
    plt.show()

class linear_model_tf(object):  # define a class for linear regression model
    def __init__(self, file_num=1, data_iter=None, path=None, is_jointReg=True):  # initialization
        self.num = file_num
        self.iter = data_iter
        self.path = path
        self.jointReg = is_jointReg
   
    def get_minibatch(self):
        X_train, X_test = np.asarray([],dtype=float),np.asarray([],dtype=float)
        y_train, y_test = np.asarray([],dtype=float),np.asarray([],dtype=float)
        for data_file in itertools.islice(self.iter, self.num):
            tripDataTrain = load_data_eta([data_file])
            outPut = data_trip_processing(data = tripDataTrain, is_jointReg = self.jointReg)
            Xdata, Ydata = outPut[0], outPut[1]
            X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, test_size = 0.01, random_state = 0)
        return X_train, y_train, X_test, y_test
    
    def iter_minibatches(self):
        """Generator of minibatches."""
        X_train, y_train, X_test, y_test = self.get_minibatch()
        while len(X_train):
            yield X_train, y_train,X_test, y_test
            X_train, y_train,X_test, y_test  =self.get_minibatch()

    def tf_linear_regression(self,epochs=2000,display_step=500,lmbda=1e-3,lr = 0.001):
        # create symbolic variables 
        if self.jointReg:
            cols = num_link_scena*2 + num_node_scena
        else:
            cols = num_link_scena + num_node_scena
            
        X = tf.placeholder(tf.float32,[None,cols],name='input-X')
        Y = tf.placeholder(tf.float32,[None,1],name='input-Y')
        # create shared variable 
        W = tf.Variable(tf.random_normal([cols,1],stddev=0.01),name='coefficients')
        b = tf.Variable(tf.random_normal([1],stddev=0.01),name='intercept')
        Y_pred = tf.add(tf.matmul(X,W),b)
        loss = tf.reduce_mean(tf.square(Y_pred-Y))
        loss = tf.add(loss, tf.contrib.layers.l2_regularizer(lmbda)(W))

        minibatch_iterators = self.iter_minibatches()

        # define Optimizer 
        optimizer = tf.train.AdamOptimizer(lr).minimize(loss)
        
        #define session
        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)
        # training...
        with tf.Session() as sess:
            # Run the initializer
            sess.run(init)
            # iter...
            for i, (x_train, y_train,x_test,y_test) in enumerate(minibatch_iterators):
                y_train,y_test = y_train.reshape(-1,1),y_test.reshape(-1,1)
                for epoch in range(epochs):
                    sess.run(optimizer, feed_dict={X: x_train, Y: y_train})
                    # display logs per epoch step
                    if (epoch+1) % display_step == 0:
                        c = sess.run(loss, feed_dict={X: x_train, Y:y_train})
                        print("{} Epoch: {:04d}, cost={:.6f}".format(time.strftime('%Y-%m-%d %X'),epoch+1, c))
                coeff,interc = sess.run(W),sess.run(b)
                coeff = np.around(coeff, decimals=4)
                interc = np.around(interc, decimals=4)
                y_pred = np.dot(x_test, coeff) + interc
                rmse = np.sqrt(np.mean((y_pred - y_test) ** 2))
                print("%s Coefficients:%s, intercept:%.4f" % (time.strftime('%Y-%m-%d %X'),coeff.reshape(-1,), interc))
                print("{} RMSE:{:.4f}".format(time.strftime('%Y-%m-%d %X'),rmse))
                print("{} score: {:.4f}".format(time.strftime('%Y-%m-%d %X'),r2_score(y_pred, y_test)))
            # scenarioPlot(link_states, node_states, self.path)
            # save_params(link_states, node_states, coeff, interc, path, is_jointReg = is_jointReg)
            params = np.concatenate((coeff.reshape(-1,),np.array(interc).reshape(-1,)),axis=0)
            np.save(self.path+"/coeff_interc.npy", params)
        return coeff.reshape(-1,), interc, rmse

if __name__ == '__main__':
    
    is_tf = False         # define regression framework,scikit-learn or tensorflow
    is_ols = True         # define regression model, ols or sgd.svm
    is_jointReg = True    # define the object of regression, if tr/th respectively or together
    sys_len = len(sys.argv)
    if sys_len < 5:
        print('usage: {} <data_dir> <citycode> <training_data date> <test_data date>'.format(sys.argv[0]))
        sys.exit(1)
        
    data_dir = sys.argv[1]   # data dir
    citycode = sys.argv[2]   # city code
    data_train = [os.path.join(data_dir, '{}.{}.trip.etalr'.format(citycode, sys.argv[i].replace('-',''))) 
                            for i in range(3,sys_len-1)] # training data date
    test_trip = [os.path.join(data_dir, '{}.{}.trip.etalr'.format(citycode, sys.argv[-1].replace('-','')))]
    data_stream = data_stream_trip(data_train) #iter(data_train)
    """ data file for training and test """
    """ train_trips = ["310000.20190313.trip.etalr"] """
    """ test_trips = ["310000.20190312.trip.etalr","310000.20190314.trip.etalr"]"""
    # train_trip = data_stream_trip(data_train)
    if is_tf:
        lm_tf = linear_model_tf(file_num=1, data_iter=data_stream, is_jointReg=is_jointReg,path=data_dir)
        coeff, interc, rmse = lm_tf.tf_linear_regression()
        lm_tf = None
    else:
        if is_ols:
            coeff, interc, rmse = test_regression_ols(data_stream, data_dir)
        else:
            coeff, interc, rmse = test_regression_sgd(data_stream, data_dir)
    # coeff, interc, rmse = test_regression_ols(data_stream, data_dir)
    print("{} generate test data".format(time.strftime('%Y-%m-%d %X')))
    tripDataTest = load_data_eta(test_trip)
    Xdata, Ydata1, t_ht, t_rt, t_nt, th, tr, ta, df_trip = data_trip_processing(data = tripDataTest, is_jointReg = is_jointReg)
    print("{} using result to evaluate test data".format(time.strftime('%Y-%m-%d %X')))
    tp = time_reg(Xdata, coeff, interc) #regression time
    rates = eta_evaluate_csv(t_ht, t_rt, t_nt, th, tr, tp, ta, df_trip, data_dir)
    histPlot(th, tr, tp, ta, data_dir)
    scatterPlot(rates, th, tr, tp, ta, data_dir)
    print('{} finished,anything is oK!'.format(time.strftime('%Y-%m-%d %X')))

4.2 应用2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
npg=np.genfromtxt
#import scipy
#fig=plt.figure()
#ax=fig.add_subplot(1,1,1)

"""b :blue,r:red,g: green,c:cyan,m: magenta,y:yellow,k:black, w:white"""


def data_getting():
    
    mr_=np.load(path1+'\\'+'mr_.npy','r+')
    mr_train=np.load(path1+'\\'+'mr_train.npy','r+')
    mr_test=np.load(path1+'\\'+'mr_test.npy','r+')
    
    params=npg(path1+'\\'+'params.txt',delimiter=':',dtype=None,encoding='utf-8')
    params=dict(params) #params dict
    rmse_interval=int(params['rmse_interval'])     #get time slot end
    
    T=np.load(path1+'\\'+'T.npy','r+')
    R=np.load(path1+'\\'+'R.npy','r+')
    G=np.load(path1+'\\'+'G.npy','r+')
    F=np.load(path1+'\\'+'F.npy','r+')
    mrPred=np.dot(T, R.T)
    
    rmse_train=np.loadtxt(path1+'\\'+'rmse_train.csv')
    rmse_test=np.loadtxt(path1+'\\'+'rmse_test.csv')
    loss=np.loadtxt(path1+'\\'+'loss.csv')
   
    return mr_,mr_train,mr_test,mrPred,T,R,G,F,rmse_train,rmse_test,loss,rmse_interval

def data_prcocessing():
    
    mr_,mr_train,mr_test,mrPred,T,R,G,F,rmse_train,rmse_test,loss,invl=data_getting()
    
    orig_train = mr_train[mr_train>0.1] # initial trainning value
    orig_test  = mr_test[mr_test>0.1]   # initial test value 
    orig_data  = np.hstack((orig_train,orig_test)) # initial value 
    pred_train = mrPred[mr_train>0.1]   # predicted trainning value 
    pred_test  = mrPred[mr_test>0.1]    # predicted test value
    
    pos_missing=np.where(mr_[:,:]<0.1)  # all missing position
    pred_missing=mrPred[pos_missing]    # predicted  value for missing position
 
    zero_columns=list(np.where(~mr_.any(axis=0))[0])# the columns all zeros(fiber missing)
    pred_fmissing=mrPred[:,zero_columns].flatten() #(fiber-like missing speed)

    emissing_mr_= np.delete(mr_,zero_columns, axis=1) 
    emissing_pos=np.where(emissing_mr_[:,:]<0.1)
    pred_emissing=mrPred[emissing_pos] #(element-like missing speed)

    return orig_data,orig_train,orig_test,pred_missing,pred_emissing,pred_fmissing,\
            pred_train,pred_test,rmse_train,rmse_test,loss,invl  

def speed_plot(orig_data, pred_train, pred_test, 
               pred_missing, pred_emissing,pred_fmissing,rmse_train,rmse_test):
    
    fig,axes = plt.subplots(2,3, figsize=(30,18)) #get 2x2 subplots
    #get figure for speed distribution 
    axes[0, 0].set_xlabel(u'Speed(km/h)',fontsize=16) #set the coordinate axis-x label
    axes[0, 0].set_ylabel(u'Probability',fontsize=16) #set the coordinate axis-y label
    axes[0, 0].set_title(u"Original No-missing(num:{})".format(len(orig_data)),fontsize=16)
    axes[0, 0].grid(True)
    axes[0, 0].set_xlim(-10, 130)
    axes[0, 0].hist(orig_data,bins=50,density=1,color='r',alpha=0.5,edgecolor='k') #initial speed
   
    axes[0, 1].set_xlabel(u'Speed(km/h)',fontsize=16) 
    axes[0, 1].set_ylabel(u'Probability',fontsize=16) 
    axes[0, 1].set_title(u"Predicted training(num:{}, RMSE:{})".format(len(pred_train),rmse_train[-1]),fontsize=16)
    axes[0, 1].grid(True)
    axes[0, 1].set_xlim(-10, 130)
    axes[0, 1].hist(pred_train,bins=50,density=1,color='g',alpha=0.5,edgecolor='k') #predicted training speed
    
    axes[0, 2].set_xlabel(u'Speed(km/h)',fontsize=16) 
    axes[0, 2].set_ylabel(u'Probability',fontsize=16) 
    axes[0, 2].set_title(u"Predicted test(num:{}, RMSE:{})".format(len(pred_test),rmse_test[-1]),fontsize=16)
    axes[0, 2].grid(True)
    axes[0, 2].set_xlim(-10, 130)
    axes[0, 2].hist(pred_test,bins=50,density=1,color='b',alpha=0.5,edgecolor='k') #predicted testing speed
        
    axes[1, 0].set_xlabel(u'Speed(km/h)',fontsize=16) 
    axes[1, 0].set_ylabel(u'Probability',fontsize=16) 
    axes[1, 0].set_title(u"Original missing predicted(num:{})".format(len(pred_missing)),fontsize=16)
    axes[1, 0].grid(True)
    axes[1, 0].set_xlim(-10, 130)
    axes[1, 0].hist(pred_missing,bins=60,density=1,color='c',alpha=0.5,edgecolor='k') #predicted missing speed
    
    axes[1, 1].set_xlabel(u'Speed(km/h)',fontsize=16) 
    axes[1, 1].set_ylabel(u'Probability',fontsize=16) 
    axes[1, 1].set_title(u"Element-like missing(num:{})".format(len(pred_emissing)),fontsize=16)
    axes[1, 1].grid(True)
    axes[1, 1].set_xlim(-10, 130)
    axes[1, 1].hist(pred_emissing,bins=4,density=1,color='m',alpha=0.5,edgecolor='k') #predicted Element-like missing speed
    
    axes[1, 2].set_xlabel(u'Speed(km/h)',fontsize=16) 
    axes[1, 2].set_ylabel(u'Probability',fontsize=16) 
    axes[1, 2].set_title(u"Fiber-like missing(num:{})".format(len(pred_fmissing)),fontsize=16)
    axes[1, 2].grid(True)
    axes[1, 2].set_xlim(-10, 130)
    axes[1, 2].hist(pred_fmissing,bins=60,density=1,color='y',alpha=0.5,edgecolor='k') #predicted Fiber-like missing speed
    
    plt.savefig('{0}\\Speed_predicted.png'.format(path1))
    
def speed_dist_plot(orig_train, pred_train, orig_test, pred_test):

    #  get figure for error with ±5km/h  
    bias_train=pred_train-orig_train
    bias_test=pred_test-orig_test
    names = ["training bias", "test bias"]  
    bins=[-np.inf,-20,-15,-10,-5,0,5,10,15,20,np.inf]
    cats_train=pd.value_counts(pd.cut(bias_train,bins))
    cats_train=cats_train/sum(cats_train)*100
    cats_test=pd.value_counts(pd.cut(bias_test,bins))
    cats_test=cats_test/sum(cats_test)*100
    cats=pd.concat([cats_train,cats_test],axis=1)
    cats.sort_index(inplace=True)
    cats.columns=pd.Index(names)

    fig,axes = plt.subplots(1,1, figsize=(15,10))
    axes.set_xlabel('Speed bias(km/h)',fontsize=16) 
    axes.set_ylabel('Percentage(%)',fontsize=16)
    axes.set_title('Speed bias distribution', fontsize=16)
    axes=cats.plot(kind='bar',ax=axes,color=['green','red'],\
              alpha=0.5,rot=30)
    for p in axes.patches:
        axes.annotate(np.round(p.get_height(),decimals=2),\
                       (p.get_x()+p.get_width()/2, p.get_height()+ 0.3),\
                       ha='center', va='center',fontsize=9)
    axes.legend(loc='best',fontsize=16) 
    plt.grid()
    plt.savefig(f'{path1}/ speed_error.png')
    
def loss_rmse_plot(loss,rmse_train,rmse_test,invl):
    #get figure for loss and rmse with steps
    plt.figure(figsize=(21,7))
    plt.subplot(121)
    plt.plot(list(range(len(loss))), loss, 'bo')
    plt.xlabel('Steps',fontsize=16)
    plt.ylabel('Loss of objective function',fontsize=16)
    plt.grid()
    plt.title('Loss of steps',fontsize=16)
    
    plt.subplot(122)
    plt.plot(list(range(invl,(len(rmse_train)+1)*invl,invl)), rmse_train, 'bo',\
             list(range(invl,(len(rmse_test)+1)*invl,invl)), rmse_test, 'ro')
    plt.xlabel('Steps',fontsize=16)
    plt.ylabel('RMSE',fontsize=16)
    plt.grid()
    plt.title('RMSE of steps',fontsize=16)
    plt.legend(['RMSE of training','RMSE of test'],fontsize=16)
    plt.savefig(f'{path1}\\loss and rmse.png')
    
def show_matplt():
    
    #get_data()
    orig_data,orig_train,orig_test,pred_missing,pred_emissing,pred_fmissing,\
        pred_train,pred_test,rmse_train,rmse_test,loss,invl=data_prcocessing()  
    
    #plot graph
    speed_plot(orig_data, pred_train, pred_test, 
               pred_missing, pred_emissing,pred_fmissing,rmse_train,rmse_test)
    
    speed_dist_plot(orig_train, pred_train, orig_test, pred_test)
    
    loss_rmse_plot(loss,rmse_train,rmse_test, invl)
    
def main():
    show_matplt()

if __name__=='__main__':
    main()

4.3 应用3

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
npg=np.genfromtxt
#import scipy
#fig=plt.figure()
#ax=fig.add_subplot(1,1,1)

"""b :blue,r:red,g: green,c:cyan,m: magenta,y:yellow,k:black, w:white"""


def data_getting():
    
    mr_=np.load(path1+'\\'+'mr_.npy','r+')
    mr_train=np.load(path1+'\\'+'mr_train.npy','r+')
    mr_test=np.load(path1+'\\'+'mr_test.npy','r+')
    
    params=npg(path1+'\\'+'params.txt',delimiter=':',dtype=None,encoding='utf-8')
    params=dict(params) #params dict
    rmse_interval=int(params['rmse_interval'])     #get time slot end
    
    T=np.load(path1+'\\'+'T.npy','r+')
    R=np.load(path1+'\\'+'R.npy','r+')
    G=np.load(path1+'\\'+'G.npy','r+')
    F=np.load(path1+'\\'+'F.npy','r+')
    mrPred=np.dot(T, R.T)
    
    rmse_train=np.loadtxt(path1+'\\'+'rmse_train.csv')
    rmse_test=np.loadtxt(path1+'\\'+'rmse_test.csv')
    loss=np.loadtxt(path1+'\\'+'loss.csv')
   
    return mr_,mr_train,mr_test,mrPred,T,R,G,F,rmse_train,rmse_test,loss,rmse_interval

def data_prcocessing():
    
    mr_,mr_train,mr_test,mrPred,T,R,G,F,rmse_train,rmse_test,loss,invl=data_getting()
    
    orig_train = mr_train[mr_train>0.1] # initial trainning value
    orig_test  = mr_test[mr_test>0.1]   # initial test value 
    orig_data  = np.hstack((orig_train,orig_test)) # initial value 
    pred_train = mrPred[mr_train>0.1]   # predicted trainning value 
    pred_test  = mrPred[mr_test>0.1]    # predicted test value
    
    pos_missing=np.where(mr_[:,:]<0.1)  # all missing position
    pred_missing=mrPred[pos_missing]    # predicted  value for missing position
 
    zero_columns=list(np.where(~mr_.any(axis=0))[0])# the columns all zeros(fiber missing)
    pred_fmissing=mrPred[:,zero_columns].flatten() #(fiber-like missing speed)

    emissing_mr_= np.delete(mr_,zero_columns, axis=1) 
    emissing_pos=np.where(emissing_mr_[:,:]<0.1)
    pred_emissing=mrPred[emissing_pos] #(element-like missing speed)

    return orig_data,orig_train,orig_test,pred_missing,pred_emissing,pred_fmissing,\
            pred_train,pred_test,rmse_train,rmse_test,loss,invl  

def speed_plot(orig_data, pred_train, pred_test, 
               pred_missing, pred_emissing,pred_fmissing,rmse_train,rmse_test):
    
    fig,axes = plt.subplots(2,3, figsize=(30,18)) #get 2x2 subplots
    #get figure for speed distribution 
    axes[0, 0].set_xlabel(u'Speed(km/h)',fontsize=16) #set the coordinate axis-x label
    axes[0, 0].set_ylabel(u'Probability',fontsize=16) #set the coordinate axis-y label
    axes[0, 0].set_title(u"Original No-missing(num:{})".format(len(orig_data)),fontsize=16)
    axes[0, 0].grid(True)
    axes[0, 0].set_xlim(-10, 130)
    axes[0, 0].hist(orig_data,bins=50,density=1,color='r',alpha=0.5,edgecolor='k') #initial speed
   
    axes[0, 1].set_xlabel(u'Speed(km/h)',fontsize=16) 
    axes[0, 1].set_ylabel(u'Probability',fontsize=16) 
    axes[0, 1].set_title(u"Predicted training(num:{}, RMSE:{})".format(len(pred_train),rmse_train[-1]),fontsize=16)
    axes[0, 1].grid(True)
    axes[0, 1].set_xlim(-10, 130)
    axes[0, 1].hist(pred_train,bins=50,density=1,color='g',alpha=0.5,edgecolor='k') #predicted training speed
    
    axes[0, 2].set_xlabel(u'Speed(km/h)',fontsize=16) 
    axes[0, 2].set_ylabel(u'Probability',fontsize=16) 
    axes[0, 2].set_title(u"Predicted test(num:{}, RMSE:{})".format(len(pred_test),rmse_test[-1]),fontsize=16)
    axes[0, 2].grid(True)
    axes[0, 2].set_xlim(-10, 130)
    axes[0, 2].hist(pred_test,bins=50,density=1,color='b',alpha=0.5,edgecolor='k') #predicted testing speed
        
    axes[1, 0].set_xlabel(u'Speed(km/h)',fontsize=16) 
    axes[1, 0].set_ylabel(u'Probability',fontsize=16) 
    axes[1, 0].set_title(u"Original missing predicted(num:{})".format(len(pred_missing)),fontsize=16)
    axes[1, 0].grid(True)
    axes[1, 0].set_xlim(-10, 130)
    axes[1, 0].hist(pred_missing,bins=60,density=1,color='c',alpha=0.5,edgecolor='k') #predicted missing speed
    
    axes[1, 1].set_xlabel(u'Speed(km/h)',fontsize=16) 
    axes[1, 1].set_ylabel(u'Probability',fontsize=16) 
    axes[1, 1].set_title(u"Element-like missing(num:{})".format(len(pred_emissing)),fontsize=16)
    axes[1, 1].grid(True)
    axes[1, 1].set_xlim(-10, 130)
    axes[1, 1].hist(pred_emissing,bins=4,density=1,color='m',alpha=0.5,edgecolor='k') #predicted Element-like missing speed
    
    axes[1, 2].set_xlabel(u'Speed(km/h)',fontsize=16) 
    axes[1, 2].set_ylabel(u'Probability',fontsize=16) 
    axes[1, 2].set_title(u"Fiber-like missing(num:{})".format(len(pred_fmissing)),fontsize=16)
    axes[1, 2].grid(True)
    axes[1, 2].set_xlim(-10, 130)
    axes[1, 2].hist(pred_fmissing,bins=60,density=1,color='y',alpha=0.5,edgecolor='k') #predicted Fiber-like missing speed
    
    plt.savefig('{0}\\Speed_predicted.png'.format(path1))
    
def speed_dist_plot(orig_train, pred_train, orig_test, pred_test):

    #  get figure for error with ±5km/h  
    bias_train=pred_train-orig_train
    bias_test=pred_test-orig_test
    names = ["training bias", "test bias"]  
    bins=[-np.inf,-20,-15,-10,-5,0,5,10,15,20,np.inf]
    cats_train=pd.value_counts(pd.cut(bias_train,bins))
    cats_train=cats_train/sum(cats_train)*100
    cats_test=pd.value_counts(pd.cut(bias_test,bins))
    cats_test=cats_test/sum(cats_test)*100
    cats=pd.concat([cats_train,cats_test],axis=1)
    cats.sort_index(inplace=True)
    cats.columns=pd.Index(names)

    fig,axes = plt.subplots(1,1, figsize=(15,10))
    axes.set_xlabel('Speed bias(km/h)',fontsize=16) 
    axes.set_ylabel('Percentage(%)',fontsize=16)
    axes.set_title('Speed bias distribution', fontsize=16)
    axes=cats.plot(kind='bar',ax=axes,color=['green','red'],\
              alpha=0.5,rot=30)
    for p in axes.patches:
        axes.annotate(np.round(p.get_height(),decimals=2),\
                       (p.get_x()+p.get_width()/2, p.get_height()+ 0.3),\
                       ha='center', va='center',fontsize=9)
    axes.legend(loc='best',fontsize=16) 
    plt.grid()
    plt.savefig(f'{path1}/ speed_error.png')
    
def loss_rmse_plot(loss,rmse_train,rmse_test,invl):
    #get figure for loss and rmse with steps
    plt.figure(figsize=(21,7))
    plt.subplot(121)
    plt.plot(list(range(len(loss))), loss, 'bo')
    plt.xlabel('Steps',fontsize=16)
    plt.ylabel('Loss of objective function',fontsize=16)
    plt.grid()
    plt.title('Loss of steps',fontsize=16)
    
    plt.subplot(122)
    plt.plot(list(range(invl,(len(rmse_train)+1)*invl,invl)), rmse_train, 'bo',\
             list(range(invl,(len(rmse_test)+1)*invl,invl)), rmse_test, 'ro')
    plt.xlabel('Steps',fontsize=16)
    plt.ylabel('RMSE',fontsize=16)
    plt.grid()
    plt.title('RMSE of steps',fontsize=16)
    plt.legend(['RMSE of training','RMSE of test'],fontsize=16)
    plt.savefig(f'{path1}\\loss and rmse.png')
    
def show_matplt():
    
    #get_data()
    orig_data,orig_train,orig_test,pred_missing,pred_emissing,pred_fmissing,\
        pred_train,pred_test,rmse_train,rmse_test,loss,invl=data_prcocessing()  
    
    #plot graph
    speed_plot(orig_data, pred_train, pred_test, 
               pred_missing, pred_emissing,pred_fmissing,rmse_train,rmse_test)
    
    speed_dist_plot(orig_train, pred_train, orig_test, pred_test)
    
    loss_rmse_plot(loss,rmse_train,rmse_test, invl)
    
def main():
    show_matplt()

if __name__=='__main__':
    main()

  • 7
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

scott198512

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值