import csv
import os
from datetime import datetime

import numpy as np
import pandas as pd
零、数据筛选
0-1 搭建grid号
from utils2019 import get_pos_lola
import pandas as pd

# Base paths (without the ".csv" suffix) of the raw monthly trip files.
filename = ['./data/datajun', './data/dataoct']

for fn in filename:
    df = pd.read_csv(fn + '.csv')

    # Walk the coordinate columns in lockstep instead of calling df.loc[i]
    # twice per row, which builds a fresh row Series on every access.
    df["start_grid"] = [
        get_pos_lola(lng, lat)
        for lng, lat in zip(df["starting_lng"], df["starting_lat"])
    ]
    df["end_grid"] = [
        get_pos_lola(lng, lat)
        for lng, lat in zip(df["dest_lng"], df["dest_lat"])
    ]

    # Ride duration in whole minutes (fraction truncated, as int() did before).
    ride = pd.to_datetime(df['finished_time']) - pd.to_datetime(df['begun_time'])
    df["ridetime"] = (ride / np.timedelta64(1, 'm')).astype("int64")

    print(df.head(3))
    # The index is written too, so the file gains an "Unnamed: 0" column
    # when it is read back.
    df.to_csv(fn + "_new.csv")
    # NOTE(review): ready_data() in this file reads "../data/<month>_grid.csv",
    # not this "_new.csv" file - confirm which name the next step expects.
0-2数据筛选
- 添加grid-后的数据
- 调用函数筛选数据
from utils2019 import ready_data

# Pre-process every month handled by the grid-tagging step. Previously the
# `month = "oct"` assignment was commented out, so the second call simply
# repeated the June run.
for month in ("jun", "oct"):
    ready_data(month)
- 结果展示
# Load the pre-processed June file ("<root><month>_ready.csv", the name
# ready_data() writes) to inspect the result.
month = "jun"
root = "../data/"
leaf = root+month +"_ready.csv"
# Bare expression: the frame is only displayed in an interactive session.
pd.read_csv(leaf)
文件的列名为:
Index(['starting_lng', 'starting_lat', 'dest_lng', 'dest_lat', 'start_grid',
'end_grid', 'ridetime', 'start_day', 'start_hour', 'start_minute',
'end_day', 'end_minute', 'end_hour'],
dtype='object')
文件的大小为:
(374944, 13)
一、以时间为轴的流量数据
对每个月的每日数据进行保存
1.1 数据结构搭建和保存
from tqdm import trange
#流量结构搭建以及保存
class Obtain_timeflow(object):
    """Build and cache per-grid flow counts for one month of trip records.

    The pre-processed table ``<root><month>_ready.csv`` is bucketed by day,
    hour and sub-hour interval. For each of the 500 grid nodes two counts
    are kept per bucket: rows whose ``start_*`` columns fall in the bucket
    (channel 0) and rows whose ``end_*`` columns do (channel 1).
    """

    def __init__(self, month):
        """Load the month's table.

        Args:
            month: month abbreviation such as ``"jun"`` (case-insensitive),
                or one of the numbers 2, 3, 6, 10 (any other number raises
                ``KeyError``).
        """
        self.root = "../data/"
        if isinstance(month, (int, float)):
            d = {2: "feb", 3: "mar", 6: "jun", 10: "oct"}
            self.month = d[int(month)]
        else:
            self.month = month.lower()
        self.DAY = self.repre_day(self.month)
        path = self.root + self.month + "_ready.csv"
        if os.path.exists(path):
            self.df = pd.read_csv(path, index_col=False)
            self.columns = self.df.columns
        else:
            print("没有该文件!")
            # Keep the attributes defined so later calls fail with a clear
            # error instead of an AttributeError; build_flow can still be
            # served entirely from cached per-day files.
            self.df = None
            self.columns = []

    def build_flow(self, interval=None):
        """Compute (or load from cache) the flow tensor for the whole month.

        Args:
            interval: bucket width in minutes; defaults to 15.

        Returns:
            Array of shape ``(DAY * 24 * Inval, 500, 2)`` where
            ``Inval = int(60 / interval)``. It is also saved to
            ``<root><month>_interval<interval>.npy``.
        """
        # The progress bar is cosmetic: fall back to range without tqdm.
        try:
            from tqdm import trange as day_range
        except ImportError:
            day_range = range

        self.remove_file()
        if interval is None:
            interval = 15
        Inval = int(60 / interval)

        day_dir = self.root + self.month + "/"
        os.makedirs(day_dir, exist_ok=True)
        expected_shape = (24, Inval, 500, 2)

        Monarr = []
        for day in day_range(1, self.DAY + 1):
            inter_path = day_dir + str(day) + ".npy"
            if os.path.exists(inter_path):
                cached = np.load(inter_path)
                # The cache file name does not encode the interval, so only
                # reuse it when its shape matches the requested interval.
                if cached.shape == expected_shape:
                    Monarr.append(cached)
                    continue
            Dayarr = np.zeros(expected_shape)
            for hour in range(24):
                flow_start = self.day_record(day, hour, "start", Inval)  # (Inval, 500, 1)
                flow_end = self.day_record(day, hour, "end", Inval)      # (Inval, 500, 1)
                # Stack the start-based and end-based counts as two channels.
                Dayarr[hour] = np.concatenate((flow_start, flow_end), axis=2)
            np.save(inter_path, Dayarr)
            Monarr.append(Dayarr)

        Monarr = np.array(Monarr)  # (DAY, 24, Inval, 500, 2)
        print("concate data shape:", Monarr.shape)
        print("every day sumflow")
        print(Monarr.mean(axis=(1, 2, 3, 4)))
        Monarr = Monarr.reshape(-1, 500, 2)
        print("flow feature data shape:", Monarr.shape)
        Mon_path = self.root + self.month + "_interval" + str(interval) + ".npy"
        np.save(Mon_path, Monarr)
        return Monarr

    def day_record(self, day, hour, point, Inval):
        """Count one hour of records per node and sub-hour bucket.

        Args:
            day: value matched against the ``<point>_day`` column.
            hour: value matched against the ``<point>_hour`` column.
            point: column prefix, ``"start"`` or ``"end"``.
            Inval: number of buckets the hour is split into.

        Returns:
            Array of shape ``(Inval, 500, 1)``.

        Raises:
            FileNotFoundError: if the month's CSV was not found at
                construction time.
        """
        d = str(point) + "_day"
        h = str(point) + "_hour"
        m = str(point) + "_minute"
        g = str(point) + "_grid"
        if getattr(self, "df", None) is None:
            raise FileNotFoundError(
                self.root + self.month + "_ready.csv is not loaded")
        if d not in self.columns:
            print("列名不准确")
        record = np.zeros((500, Inval))
        # Rows belonging to the requested day and hour.
        d1 = self.df[(self.df[d] == day) & (self.df[h] == hour)][[g, m]]
        # One pass over the groups instead of re-filtering d1 for every node.
        for node, minutes in d1.groupby(g)[m]:
            record[node] = self.node_flow_feature(minutes.tolist(), Inval)
        r = record.transpose(1, 0)
        r = np.expand_dims(r, axis=2)  # add the trailing channel axis
        return r

    def remove_file(self, filename=None):
        """Delete ``filename`` if it is an existing file.

        With no argument the path ``<root><month>/0.npy`` is used.
        """
        if filename is None:
            filename = self.root + self.month + "/" + str(0) + ".npy"
        if os.path.isfile(filename):
            os.remove(filename)

    def remove_dir(self,):
        """Delete every file in ``<root><month>/``; no-op if it is missing."""
        floot_dir = self.root + self.month + "/"
        if not os.path.isdir(floot_dir):
            return
        for p in os.listdir(floot_dir):
            self.remove_file(floot_dir + p)

    ## Class methods: callable without instantiating the class.
    @classmethod
    def repre_day(cls, month):
        """Return the number of days in ``month`` (name or abbreviation).

        Only the first three letters are used. February is fixed at 28 days.
        """
        thirty_or_less = {"feb": 28, "apr": 30, "jun": 30, "sep": 30, "nov": 30}
        return thirty_or_less.get(month.lower()[:3], 31)

    @classmethod
    def node_flow_feature(cls, L, Inval):
        """Histogram the minute values in ``L`` into ``Inval`` equal buckets.

        A minute value of 60 is counted in the last bucket.
        """
        R = np.zeros(shape=(Inval,))
        mod = 60 / Inval  # bucket width in minutes
        for m in L:
            if m == 60:
                m -= 1
            R[int(m // mod)] += 1
        return R
调用1:
## Build the June flow data on 15-minute intervals (build_flow's default).
month = "jun"
otf=Obtain_timeflow(month)
Monarr=otf.build_flow()
调用2:
## Build the June flow data on 5-minute intervals.
month = "jun"
interval = 5
otf=Obtain_timeflow(month)
# Delete the per-day files saved by an earlier run: their names do not
# include the interval, so they would otherwise be picked up again.
otf.remove_dir() # remove data saved by previous runs
Monarr=otf.build_flow(interval)
1.2 滑动窗口数据
def windows_slices(month, interval, in_window, out_window, root="../data/"):
    """Cut the monthly flow series into sliding input/target windows.

    Reads ``<root><month>_interval<interval>.npy`` (time on the first axis,
    then nodes, then features) and saves the two results next to it as
    ``..._feature.npy`` and ``..._target.npy``.

    Args:
        month: month abbreviation used in the file names.
        interval: interval in minutes used in the file names.
        in_window: number of time steps in each input sample.
        out_window: number of time steps in each target sample.
        root: directory prefix of the input and output files.

    Returns:
        ``(feature, target)`` with shapes ``[S, N, in_window, F]`` and
        ``[S, N, F, out_window]``, where ``S = T - (in_window + out_window) + 1``
        (0 when the series is shorter than one window).
    """
    ## Load the data
    prefix = root + month + "_interval" + str(interval)
    Marr = np.load(prefix + ".npy")
    Marr = np.transpose(Marr, (1, 2, 0))
    print(month + " data shape is:", Marr.shape)
    # (num_vertices, num_features, num_timesteps)

    n_nodes, n_feat, n_steps = Marr.shape
    window = in_window + out_window
    # "+ 1" keeps the final window, the one that ends on the last time step.
    n_slices = max(n_steps - window + 1, 0)

    feature = np.empty((n_slices, n_nodes, in_window, n_feat), dtype=Marr.dtype)
    target = np.empty((n_slices, n_nodes, n_feat, out_window), dtype=Marr.dtype)
    for i in range(n_slices):
        # Input: time moved in front of the feature axis -> [N, W, F]
        feature[i] = Marr[:, :, i:i + in_window].transpose((0, 2, 1))
        # Target keeps the stored layout -> [N, F, W]
        target[i] = Marr[:, :, i + in_window:i + window]

    print("feature shape [S, N,W,F] = ", feature.shape)
    np.save(prefix + "_feature.npy", feature)
    print( "target shape [S, N, F,W] = ", target.shape)
    np.save(prefix + "_target.npy", target)
    return feature, target
调用:
# Slice the June 15-minute flow data into samples of 6 input steps
# followed by 2 target steps.
month = "jun"
interval = 15
in_window = 6
out_window = 2
feature,target = windows_slices(month,interval, in_window,out_window)
out:
jun data shape is: (500, 2, 2880)
feature shape [S, N,W,F] = (2872, 500, 6, 2)
target shape [S, N, F,W] = (2872, 500, 2, 2)
二、空间矩阵的搭建
grid性质搭建ing
2.1 grid 的 4 个经纬度,顺序为【左经度、下纬度、右经度、上纬度】
# Bounding box of the study area: [lng_left, lat_bottom, lng_right, lat_top]
Grid = [118.750802, 32.019297, 118.818880, 32.063531]


def grid_longitude_latitude(Grid, num_jing=25, num_wei=20):
    """Split a bounding box into a regular grid of cells.

    Args:
        Grid: ``[lng_left, lat_bottom, lng_right, lat_top]``.
        num_jing: number of columns (splits along longitude).
        num_wei: number of rows (splits along latitude).

    Returns:
        ``defaultdict(list)`` mapping a node index (row-major, starting at
        the lower-left cell) to
        ``[lng_left, lat_bottom, lng_right, lat_top, row, col]``.
    """
    import collections
    import numpy as np

    # Split points along longitude and latitude.
    list_jing = np.linspace(Grid[0], Grid[2], num=num_jing + 1)
    list_wei = np.linspace(Grid[1], Grid[3], num=num_wei + 1)

    grids = collections.defaultdict(list)
    node = 0  # node index of the current cell
    for i in range(num_wei):        # row
        for j in range(num_jing):   # column
            # Lower-left corner, upper-right corner, then row and column.
            grids[node].extend([
                list_jing[j], list_wei[i],
                list_jing[j + 1], list_wei[i + 1],
                i, j,
            ])
            node += 1
    return grids
2.2grid 空间邻居矩阵
## 站点的地理位置相邻关系矩阵
def get_neighbor_matrix(grids, num_nodes=None):
    """Build the symmetric 0/1 spatial adjacency matrix of the grid cells.

    Two cells are linked when one is the upper, right or upper-right
    neighbour of the other (the link is stored in both directions).

    NOTE(review): the upper-left / lower-right diagonal pairs are not
    linked, exactly as in the original rules - confirm whether a full
    8-neighbourhood was intended.

    Args:
        grids: mapping ``node -> [..., row, col]`` with the row at index 4
            and the column at index 5, keyed ``0 .. num_nodes - 1``.
        num_nodes: number of nodes; defaults to ``len(grids)``.

    Returns:
        Array of shape ``(num_nodes, num_nodes)``.
    """
    import numpy as np

    if num_nodes is None:
        num_nodes = len(grids)
    NM = np.zeros((num_nodes, num_nodes))  # adjacency matrix

    # (row, col) -> node lookup replaces the scan over every node pair.
    cell_to_node = {
        (grids[node][4], grids[node][5]): node for node in range(num_nodes)
    }
    # Offsets of the upper, right and upper-right neighbours.
    offsets = ((1, 0), (0, 1), (1, 1))
    for (row, col), a in cell_to_node.items():
        for d_row, d_col in offsets:
            b = cell_to_node.get((row + d_row, col + d_col))
            if b is not None:
                NM[a][b] = 1
                NM[b][a] = 1
    return NM
数据保存
# Build the grid and its neighbour matrix, save the matrix, then reload it.
Grid = [118.750802,32.019297,118.818880,32.063531] # extent of the rectangular area
grids = grid_longitude_latitude(Grid)
NM = get_neighbor_matrix(grids)
fpath="../data/Spatial_neighbor.npy"
np.save(fpath,NM)
NM1=np.load(fpath)
# Bare expression: only displays the array in an interactive session.
NM1
三、文本矩阵的搭建
3.1 grid的pois特征搭建
数据样貌:
统计每个poi在每个grid区域的个数,并保存
from collections import Counter

# Suffixes of the per-category POI files; one row of `pops` per entry.
csvs = ["100000", "120000", "140000", "150000", "160000", "170000",
        "050000", "060000", "070000", "080000", "090000"]
pops = np.zeros(shape=(len(csvs), 500))
for k, code in enumerate(csvs):
    df = pd.read_csv("../poi_data/解析结果_118.750802,32.063531#118.818880,32.019297types_" + code + ".csv")
    # Grid index of every POI; get_pos_lola returns -1 outside the area.
    # Zipping the columns avoids building a row Series per df.loc[i] access.
    cells = (
        get_pos_lola(lon, lat)
        for lon, lat in zip(df["wgs84_lon"], df["wgs84_lat"])
    )
    Cter = Counter(cell for cell in cells if cell != -1)
    # A Counter yields 0 for grids that contain no POI of this category.
    pops[k] = [Cter[j] for j in range(500)]
poi = np.transpose(pops, (1, 0))  # (500 grids, number of categories)
# NOTE(review): dis_DTW() loads "../data/common//characteristic500.npy",
# not this path - confirm where the file is expected to live.
np.save("../poi_data/characteristic500.npy", poi)
结果展示:
3.2 pois特征关系提取–fastDTW
fast-DTW距离矩阵
def dis_DTW(feature=None):
    """Return the symmetric matrix of pairwise fastdtw distances.

    Args:
        feature: 2-D array with one row per node; when omitted it is loaded
            from the characteristic500.npy file under ../data/common/.

    Returns:
        Square array with a zero diagonal; it is also saved to
        ../data/common/dis_DTW.npy.
    """
    if feature is None:
        feature = np.load("../data/common/" + "/characteristic500.npy")
    print("feature data shape is ", feature.shape)

    n_nodes = feature.shape[0]
    dist = np.zeros(shape=(n_nodes, n_nodes))
    for row in trange(n_nodes):
        # Only the upper triangle is computed; each value is mirrored.
        for col in range(row + 1, n_nodes):
            pair_dist = fastdtw(feature[row], feature[col])[0]
            dist[row][col] = pair_dist
            dist[col][row] = pair_dist
    np.save("../data/common/dis_DTW.npy", dist)
    return dist
获得归一化后PA矩阵
def get_matrix(ADJ, D=None, Thre=None, root="../data/common/"):
    """Turn a distance matrix into a weighted or boolean adjacency matrix.

    Args:
        ADJ: ``"weighted"`` or ``"unweighted"``.
        D: square distance matrix; loaded from ``<root>dis_DTW.npy`` when
            omitted.
        Thre: distance threshold; defaults to
            ``int(min(mean(D), median(D)) / 5)``.
        root: directory prefix for the input and output ``.npy`` files.

    Returns:
        ``"weighted"``: min-max normalised weights rounded to 2 decimals
        (the unrounded matrix is what gets saved). Distances that round to
        0 get weight ``Thre``, distances above ``Thre`` get 0, all others
        get ``1 / distance``.
        ``"unweighted"``: boolean matrix ``D < Thre``.

    Raises:
        ValueError: if ``ADJ`` is neither of the two supported values.
    """
    if ADJ not in ("weighted", "unweighted"):
        raise ValueError("ADJ must be 'weighted' or 'unweighted'")
    if D is None:
        D = np.load(root + "dis_DTW.npy")
    D = np.asarray(D, dtype=float)
    ## Derive the upper threshold from D
    if Thre is None:
        Thre = int(min(np.mean(D), np.median(D)) / 5)
        print("median and mean limit the value is :", Thre)
    fname = root + "PN_" + ADJ + ".npy"

    if ADJ == "unweighted":
        # Compare the distances themselves; the previous code compared the
        # still all-zero weight matrix, so every entry came out the same.
        Au = D < Thre
        np.save(fname, Au)
        return Au

    Dr = np.round(D, 1)
    inverse = np.zeros_like(Dr)
    np.divide(1.0, Dr, out=inverse, where=Dr != 0)  # skip the zero entries
    A = np.where(
        Dr == 0,
        float(Thre),
        np.where(Dr > Thre, 0.0, np.round(inverse, 2)),
    )
    ## Min-max normalisation
    big_A = np.max(A)
    small_A = np.min(A)
    if big_A > small_A:
        A = (A - small_A) / (big_A - small_A)
    else:
        # All weights equal: avoid 0/0 and return an all-zero matrix.
        A = np.zeros_like(A)
    np.save(fname, A)
    return np.round(A, 2)
调用:
# Compute the DTW distance matrix from the saved features, then derive the
# weighted adjacency matrix with the default threshold.
ADJ ="weighted"
D = dis_DTW()
A=get_matrix(ADJ,D,Thre=None)
注:函数 fastdtw() 来自第三方库 fastdtw(安装:pip install fastdtw;导入:from fastdtw import fastdtw)。
验证矩阵:
utils.py
import numpy as np
def node_flow_feature(data, node):
    """Count one station's records in twelve 5-minute bins.

    Args:
        data: DataFrame whose first column is the station id and whose
            second column is the minute value. The frame is not modified
            (earlier versions overwrote its column labels).
        node: station id to select.

    Returns:
        Array of 12 counts: bin ``k`` covers minutes ``[5k, 5k + 5)``.
        Values below 0 go to the first bin; values of 55 or more go to the
        last bin.
    """
    import bisect

    stations = data.iloc[:, 0]
    minutes = data.iloc[:, 1]
    T = np.zeros(shape=(12,))
    # Upper edges of bins 0..10; anything at or past 55 lands in bin 11.
    edges = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55]
    for t in minutes[stations == node].tolist():
        T[bisect.bisect_right(edges, t)] += 1
    return T
import decimal
def get_pos_lola(lo, la):
    """Map a longitude/latitude pair to its grid index.

    The area [118.750802, 118.818880] x [32.019297, 32.063531] is split
    into 25 columns (west to east) and 20 rows (south to north). Cells are
    numbered row-major from the south-west corner: ``col + 25 * row``.

    Args:
        lo: longitude.
        la: latitude.

    Returns:
        Grid index in ``0..499``, or ``-1`` if the point lies outside the
        area or a coordinate is NaN/infinite. Points on the boundary
        (including the east and north edges) belong to the adjacent cell.
    """
    lo = decimal.Decimal(lo)
    la = decimal.Decimal(la)
    # Ordering comparisons against a Decimal NaN raise InvalidOperation.
    if not (lo.is_finite() and la.is_finite()):
        return -1

    # Extent of the area
    lo_l = decimal.Decimal(118.750802)
    la_l = decimal.Decimal(32.019297)
    lo_r = decimal.Decimal(118.818880)
    la_r = decimal.Decimal(32.063531)

    # Bounds are checked explicitly: Decimal "//" truncates toward zero, so
    # a point less than one cell west/south of the area would otherwise get
    # offset -0 and be accepted as column/row 0.
    if not (lo_l <= lo <= lo_r and la_l <= la <= la_r):
        return -1

    d_lo = (lo_r - lo_l) / 25  # cell width in longitude (25 columns)
    d_la = (la_r - la_l) / 20  # cell height in latitude (20 rows)
    # Clamp so points exactly on the east/north edge fall in the last cell.
    col = min(int((lo - lo_l) // d_lo), 24)
    row = min(int((la - la_l) // d_la), 19)
    return col + 25 * row
def ready_data(month, root="../data/"):
    """Filter a grid-tagged trip file and expand its timestamps.

    Reads ``<root><month>_grid.csv``, drops the unused columns and every
    row whose start or end grid is -1, replaces ``begun_time`` and
    ``finished_time`` with day/hour/minute columns, and writes the result
    to ``<root><month>_ready.csv``. The column names and shape are printed.

    Args:
        month: month abbreviation used in the file names.
        root: directory prefix of the input and output files.

    Returns:
        None. If the input file is missing, a message is printed and
        nothing is written.
    """
    import os
    import pandas as pd

    filename = root + month + "_grid.csv"
    if not os.path.exists(filename):
        print("文件不存在!")
        return

    df = pd.read_csv(filename)
    ## Drop unused columns; ones that are absent (e.g. "Unnamed: 0" when the
    ## file was saved without its index) are skipped instead of raising.
    df = df.drop(
        columns=['vehicle_id', 'is_mkt_card', 'birth_year', "Unnamed: 0"],
        errors="ignore",
    )
    ## Drop rows whose start or end point is marked -1
    df = df[(df["start_grid"] != -1) & (df["end_grid"] != -1)].copy()

    ## Time-derived columns (each timestamp column is parsed once)
    begun = pd.to_datetime(df['begun_time'])
    finished = pd.to_datetime(df['finished_time'])
    df['start_day'] = begun.dt.day
    df['start_hour'] = begun.dt.hour
    df['start_minute'] = begun.dt.minute
    df['end_day'] = finished.dt.day
    df['end_minute'] = finished.dt.minute
    df['end_hour'] = finished.dt.hour
    ## Remove the original timestamp columns
    del df["begun_time"]
    del df["finished_time"]

    ## Save; the index is omitted because it was not reset after filtering
    savefile = root + month + "_ready.csv"
    df.to_csv(savefile, index=False)
    print("文件的列名为:")
    print(df.columns)
    print("文件的大小为:")
    print(df.shape)