#######################################################################
# Copyright (C) #
# 2016 Shangtong Zhang(zhangshangtong.cpp@gmail.com) #
# 2016 Kenta Shimada(hyperkentakun@gmail.com) #
# 2017 Aja Rangaswamy (aja004@gmail.com) #
# Permission given to modify the code as long as you keep this #
# declaration at the top #
#######################################################################
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from math import *
# maximum # of cars in each location
MAX_CARS = 20
# maximum # of cars to move during the night
MAX_MOVE_OF_CARS = 5
# expected number of rental requests at the first location
RENTAL_REQUEST_FIRST_LOC = 3
# expected number of rental requests at the second location
RENTAL_REQUEST_SECOND_LOC = 4
# expected number of cars returned at the first location
RETURNS_FIRST_LOC = 3
# expected number of cars returned at the second location
RETURNS_SECOND_LOC = 2
# discount factor
DISCOUNT = 0.9
# credit earned by renting out one car
RENTAL_CREDIT = 10
# cost of moving one car overnight
MOVE_CAR_COST = 2
# current policy: entry [i, j] is the action taken in state (i, j)
policy = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
# current state-value estimate
stateValue = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
# all possible actions: net # of cars moved from the first to the second location
actions = np.arange(-MAX_MOVE_OF_CARS, MAX_MOVE_OF_CARS + 1)
# all possible states [cars at loc 1, cars at loc 2], plus matching
# x/y coordinate lists used by the 3-D scatter plots
states = [[firstLoc, secondLoc]
          for firstLoc in range(MAX_CARS + 1)
          for secondLoc in range(MAX_CARS + 1)]
AxisXPrint = [firstLoc for firstLoc, _ in states]
AxisYPrint = [secondLoc for _, secondLoc in states]
print(states)
print("\n")
print(actions)
# plot a policy/state-value matrix as a 3-D scatter plot
figureIndex = 0
def prettyPrint(data, labels):
    """Scatter-plot `data` over all states in a fresh 3-D figure.

    @data: (MAX_CARS + 1, MAX_CARS + 1) matrix indexed by state (i, j)
    @labels: [x-axis label, y-axis label, z-axis label]
    """
    global figureIndex
    figure = plt.figure(figureIndex)
    figureIndex += 1
    axes = figure.add_subplot(111, projection='3d')
    zValues = [data[i, j] for i, j in states]
    axes.scatter(AxisXPrint, AxisYPrint, zValues)
    axes.set_xlabel(labels[0])
    axes.set_ylabel(labels[1])
    axes.set_zlabel(labels[2])
# An upper bound for the poisson distribution:
# if n is greater than this value, the probability of getting n is truncated to 0
POISSON_UP_BOUND = 11
# cache of poisson pmf values, keyed by the (n, lam) pair
poissonBackup = dict()
def poisson(n, lam):
    """Return P(X == n) for X ~ Poisson(lam), memoized in `poissonBackup`.

    @n: number of events (non-negative int)
    @lam: the poisson mean

    The cache key is the (n, lam) tuple; the previous composite key
    `n * 100 + lam` silently collided unless lam was an integer < 100.
    """
    key = (n, lam)
    if key not in poissonBackup:
        poissonBackup[key] = exp(-lam) * lam ** n / factorial(n)
    return poissonBackup[key]
# @state: [# of cars at the first location, # of cars at the second location]
# @action: positive if moving cars from the first location to the second,
#          negative if moving cars from the second location to the first
# @stateValue: state-value matrix
# @constantReturnedCars: if True (default, backward compatible), the # of cars
#          returned during the day is treated as the constant poisson mean
#          instead of a random value, which greatly reduces computation time
#          and leaves the optimal policy/value matrix almost the same
def expectedReturn(state, action, stateValue, constantReturnedCars=True):
    """Return the expected one-step reward plus discounted next-state value
    of taking `action` in `state` under the truncated poisson rental model."""
    # initialize total return; the cost of moving cars is paid up front
    returns = 0.0
    returns -= MOVE_CAR_COST * abs(action)
    # cars available at each location after the overnight move (capped at
    # MAX_CARS); loop-invariant, so hoisted out of the rental-request loops
    baseCarsFirstLoc = int(min(state[0] - action, MAX_CARS))
    baseCarsSecondLoc = int(min(state[1] + action, MAX_CARS))
    # go through all (truncated) combinations of rental requests
    for rentalRequestFirstLoc in range(POISSON_UP_BOUND):
        # probability of the first-location request is invariant in the inner loop
        probFirst = poisson(rentalRequestFirstLoc, RENTAL_REQUEST_FIRST_LOC)
        for rentalRequestSecondLoc in range(POISSON_UP_BOUND):
            # valid rentals are limited by the cars actually on the lot
            realRentalFirstLoc = min(baseCarsFirstLoc, rentalRequestFirstLoc)
            realRentalSecondLoc = min(baseCarsSecondLoc, rentalRequestSecondLoc)
            # credit earned for renting
            reward = (realRentalFirstLoc + realRentalSecondLoc) * RENTAL_CREDIT
            carsFirstLoc = baseCarsFirstLoc - realRentalFirstLoc
            carsSecondLoc = baseCarsSecondLoc - realRentalSecondLoc
            # probability of this combination of rental requests
            prob = probFirst * poisson(rentalRequestSecondLoc, RENTAL_REQUEST_SECOND_LOC)
            if constantReturnedCars:
                # returned cars become available for renting tomorrow
                numOfCarsFirstLoc = min(carsFirstLoc + RETURNS_FIRST_LOC, MAX_CARS)
                numOfCarsSecondLoc = min(carsSecondLoc + RETURNS_SECOND_LOC, MAX_CARS)
                returns += prob * (reward + DISCOUNT * stateValue[numOfCarsFirstLoc, numOfCarsSecondLoc])
            else:
                # full model: returned cars are also poisson-distributed
                for returnedCarsFirstLoc in range(POISSON_UP_BOUND):
                    for returnedCarsSecondLoc in range(POISSON_UP_BOUND):
                        numOfCarsFirstLoc = min(carsFirstLoc + returnedCarsFirstLoc, MAX_CARS)
                        numOfCarsSecondLoc = min(carsSecondLoc + returnedCarsSecondLoc, MAX_CARS)
                        jointProb = prob * poisson(returnedCarsFirstLoc, RETURNS_FIRST_LOC) * \
                            poisson(returnedCarsSecondLoc, RETURNS_SECOND_LOC)
                        returns += jointProb * (reward + DISCOUNT * stateValue[numOfCarsFirstLoc, numOfCarsSecondLoc])
    return returns
# policy iteration: alternate policy evaluation and policy improvement until
# the greedy policy stops changing (Jack's Car Rental, Sutton & Barto ch. 4)
newStateValue = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
improvePolicy = False
policyImprovementInd = 0
while True:
    if improvePolicy:
        # policy improvement: act greedily with respect to the current values
        print('Policy improvement', policyImprovementInd)
        policyImprovementInd += 1
        newPolicy = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
        for i, j in states:
            actionReturns = []
            # go through all actions and select the best one
            for action in actions:
                # an action is feasible only if the source location actually
                # has that many cars to move
                if (action >= 0 and i >= action) or (action < 0 and j >= abs(action)):
                    actionReturns.append(expectedReturn([i, j], action, stateValue))
                else:
                    actionReturns.append(-float('inf'))
            bestAction = np.argmax(actionReturns)
            newPolicy[i, j] = actions[bestAction]
        # the policy is stable (optimal) once no state changes its action
        policyChanges = np.sum(newPolicy != policy)
        print('Policy for', policyChanges, 'states changed')
        if policyChanges == 0:
            policy = newPolicy
            break
        policy = newPolicy
        improvePolicy = False

    # policy evaluation: sweep all states until the value function converges
    print("start policy evaluation")
    for i, j in states:
        newStateValue[i, j] = expectedReturn([i, j], policy[i, j], stateValue)
    if np.sum(np.abs(newStateValue - stateValue)) < 1e-4:
        stateValue[:] = newStateValue
        improvePolicy = True
        continue
    stateValue[:] = newStateValue

prettyPrint(policy, ['# of cars in first location', '# of cars in second location', '# of cars to move during night'])
prettyPrint(stateValue, ['# of cars in first location', '# of cars in second location', 'expected returns'])
plt.show()