def build_reward(self):
    """Build the graph ops that score every tour in the batch.

    Reads ``self.input_``, ``self.positions``, ``self.batch_size``,
    ``self.max_length``, ``self.speed`` and ``self.beta``.

    Sets the following attributes:
        permutations: ``[batch_size, max_length+2, 2]`` gather indices
            (batch row, tour position).
        ordered_input_: input reordered along the tour and transposed to
            ``[features, max_length+2, batch_size]``.
        ordered_tw_mean_, ordered_tw_width_, ordered_tw_open_,
        ordered_tw_close_: time-window tensors, ``[batch_size, max_length+1]``.
        distances: summed leg lengths per tour, ``[batch_size]``.
        time_at_cities: arrival time at each stop when no waiting occurs.
        constrained_delivery_time: arrival time at each stop once waiting
            for windows to open is accounted for.
        delay: per tour, the number of stops reached after the window closed.
        reward: ``distances + beta * sqrt(delay)``.

    Three summaries ('tour_length', 'delay', 'reward') are registered through
    ``variable_summaries``.
    """
    stops_per_tour = self.max_length + 2  # positions gathered for each tour
    legs_per_tour = self.max_length + 1   # consecutive pairs of positions

    def squared_steps(coordinate):
        # coordinate: [max_length+2, batch_size]
        # result:     [batch_size, max_length+1], squared step between
        #             consecutive tour positions.
        return tf.transpose(tf.square(coordinate[1:] - coordinate[:-1]), [1, 0])

    with tf.name_scope('permutations'):
        # Pair each tour position with the index of its batch row so that
        # gather_nd can reorder the input along the tour.
        row_ids = tf.expand_dims(tf.range(self.batch_size, dtype=tf.int32), 1)
        row_ids = tf.tile(row_ids, [1, stops_per_tour])
        self.permutations = tf.stack([row_ids, self.positions], 2)

        # [batch_size, max_length+2, features] -> [features, max_length+2, batch_size]
        # (original author's note: the tour ends where it starts, at the depot).
        tour_major = tf.gather_nd(self.input_, self.permutations)
        self.ordered_input_ = tf.transpose(tour_major, [2, 1, 0])

        # Feature channels 0 and 1 are used as the x / y coordinates.
        delta_x2 = squared_steps(self.ordered_input_[0])
        delta_y2 = squared_steps(self.ordered_input_[1])

        # Feature channels 2 and 3 are used as time-window mean and width.
        # The last tour position is dropped, then the result is transposed
        # to [batch_size, max_length+1].
        self.ordered_tw_mean_ = tf.transpose(self.ordered_input_[2][:-1], [1, 0])
        self.ordered_tw_width_ = tf.transpose(self.ordered_input_[3][:-1], [1, 0])
        half_width = self.ordered_tw_width_/2
        self.ordered_tw_open_ = self.ordered_tw_mean_ - half_width
        self.ordered_tw_close_ = self.ordered_tw_mean_ + half_width

    with tf.name_scope('environment'):
        # Euclidean length of every leg, then the total per tour.
        leg_lengths = tf.sqrt(delta_x2 + delta_y2)  # [batch_size, max_length+1]
        self.distances = tf.reduce_sum(leg_lengths, axis=1)  # [batch_size]
        variable_summaries('tour_length', self.distances, with_max_min=True)

        # Arrival time at each stop if the vehicle never waits: distance
        # travelled before the stop (exclusive cumsum) scaled by 1/speed.
        # The -10 offset comes from the original code; its author notes it is
        # there to be on time at the depot (time-window means are centred).
        travel_time = (1/self.speed)*tf.cumsum(leg_lengths, axis=1, exclusive=True)
        self.time_at_cities = travel_time - 10

        # Walk the tour stop by stop. Arriving before a window opens forces a
        # wait, and that wait is carried over to every later stop.
        no_wait = tf.zeros([self.batch_size])
        opening_times = tf.unstack(self.ordered_tw_open_, axis=1)
        free_arrivals = tf.unstack(self.time_at_cities, axis=1)
        waited_so_far = 0
        served_at = []
        for opens_at, free_arrival in zip(opening_times, free_arrivals):
            arrival = free_arrival + waited_so_far
            waited_so_far = waited_so_far + tf.maximum(opens_at - arrival, no_wait)
            served_at.append(free_arrival + waited_so_far)
        self.constrained_delivery_time = tf.stack(served_at, 1)

        # A stop counts as late when it is served more than 1e-4 after its
        # window closes; waiting for a window to open is not penalised here.
        overshoot = self.constrained_delivery_time - self.ordered_tw_close_ - 0.0001
        lateness = tf.maximum(overshoot, tf.zeros([self.batch_size, legs_per_tour]))
        self.delay = tf.count_nonzero(lateness, 1)
        late_stops = tf.cast(self.delay, tf.float32)
        variable_summaries('delay', late_stops, with_max_min=True)

        # Score: tour length plus beta times the square root of the number
        # of late stops.
        self.reward = tf.cast(self.distances, tf.float32) + self.beta*tf.sqrt(late_stops)
        variable_summaries('reward', self.reward, with_max_min=True)