【深入浅出强化学习-编程实战】6 基于函数逼近的方法-鸳鸯系统
6.1 鸳鸯系统——基于函数逼近的方法
左上为雄鸟,右上为雌鸟,中间有两道障碍物。目标:雄鸟找到雌鸟。
环境类yuanyang_env.py不变
import pygame
from resource.load import *
import math
import time
import random
import numpy as np
class YuanYangEnv:
    """Grid-world environment for the mandarin-duck ("yuanyang") task.

    The male bird starts at the top-left of a 10x10 grid and must reach the
    female bird at the top-right; two obstacle walls stand in between.
    Rendering is done with pygame.  (Indentation of the original paste was
    lost; it is restored here without changing the logic.)
    """

    def __init__(self):
        # State space: one state per grid cell, numbered 0-99.
        self.states = list(range(100))
        self.actions = ['e', 'w', 'n', 's']
        # Model-free RL evaluates the action-value function; each cell keeps
        # one value per action.
        # Discount factor: Monte-Carlo uses whole-trajectory returns, so a
        # small gamma would make late rewards decay too quickly.
        self.gamma = 0.95
        # Action-value table: one row per state, one column per action.
        self.action_value = np.zeros((100, 4))
        # Rendering attributes.
        self.viewer = None  # pygame display surface, created lazily in render()
        # Frame-rate clock; render() ticks it at 30 FPS so the animation is
        # neither too fast nor stuttering.
        self.FPSCLOCK = pygame.time.Clock()
        self.screen_size = (1200, 900)
        self.bird_position = (0, 0)
        # One step moves 120 px in x and 90 px in y (the cell size).
        self.limit_distance_x = 120
        self.limit_distance_y = 90
        # Each obstacle tile is 120 x 90 px.
        self.obstacle_size = [120, 90]
        # Two obstacle walls, each built from 8 tiles with a gap to pass.
        self.obstacle1_x = []
        self.obstacle1_y = []
        self.obstacle2_x = []
        self.obstacle2_y = []
        self.path = []
        for i in range(8):
            # First wall: fixed x = 360, gap at rows 4-5.
            self.obstacle1_x.append(360)
            if i <= 3:
                self.obstacle1_y.append(90 * i)
            else:
                self.obstacle1_y.append(90 * (i + 2))
            # Second wall: fixed x = 720, gap at rows 5-6.
            self.obstacle2_x.append(720)
            if i <= 4:
                self.obstacle2_y.append(90 * i)
            else:
                self.obstacle2_y.append(90 * (i + 2))
        # Male bird initial / current position, female bird position (pixels).
        self.bird_male_init_position = [0, 0]
        self.bird_male_position = [0, 0]
        self.bird_female_init_position = [1080, 0]

    def collide(self, state_position):
        """Return 1 if `state_position` (pixel coords) hits an obstacle wall
        or leaves the board, 0 otherwise.

        A wall counts as "not hit" when the nearest tile is at least one full
        step away in x OR in y (the book's original test, kept as-is).
        """
        flag = 1
        flag1 = 1
        flag2 = 1
        # Wall 1: smallest |dx| and |dy| over all 8 tiles.
        dx = []
        dy = []
        for i in range(8):
            dx.append(abs(self.obstacle1_x[i] - state_position[0]))
            dy.append(abs(self.obstacle1_y[i] - state_position[1]))
        if min(dx) >= self.limit_distance_x or min(dy) >= self.limit_distance_y:
            flag1 = 0  # no collision with wall 1
        # Wall 2: same test.
        second_dx = []
        second_dy = []
        for i in range(8):
            second_dx.append(abs(self.obstacle2_x[i] - state_position[0]))
            second_dy.append(abs(self.obstacle2_y[i] - state_position[1]))
        if min(second_dx) >= self.limit_distance_x or min(second_dy) >= self.limit_distance_y:
            flag2 = 0  # no collision with wall 2
        if flag1 == 0 and flag2 == 0:
            flag = 0
        # Leaving the 10x10 board also counts as a collision.
        if state_position[0] > 1080 or state_position[0] < 0 or \
                state_position[1] > 810 or state_position[1] < 0:
            flag = 1
        return flag

    def find(self, state_position):
        """Return 1 if `state_position` is within one step of the female
        bird (goal reached), 0 otherwise."""
        flag = 0
        if abs(state_position[0] - self.bird_female_init_position[0]) < self.limit_distance_x and abs(
                state_position[1] - self.bird_female_init_position[1]) < self.limit_distance_y:
            flag = 1
        return flag

    def state_to_position(self, state):
        """Convert a state number (0-99) to [x, y] pixel coordinates."""
        i = int(state / 10)   # row
        j = state % 10        # column
        position = [0, 0]
        position[0] = 120 * j
        position[1] = 90 * i
        return position

    def position_to_state(self, position):
        """Convert [x, y] pixel coordinates back to a state number."""
        i = position[0] / 120
        j = position[1] / 90
        return int(i + 10 * j)

    def reset(self):
        """Sample a random start state that neither collides nor is the goal.

        random.random() yields a float in [0, 1), so the index covers 0-99.
        """
        flag1 = 1
        flag2 = 1
        # Fix: the original `while flag1 or flag2 == 1` parsed as
        # `flag1 or (flag2 == 1)`, which for 0/1 flags happens to equal the
        # intended condition written explicitly here.
        while flag1 == 1 or flag2 == 1:
            state = self.states[int(random.random() * len(self.states))]
            state_position = self.state_to_position(state)
            flag1 = self.collide(state_position)
            flag2 = self.find(state_position)
        return state

    def transform(self, state, action):
        """Environment step: return (next_state, reward, done).

        The original reward was sparse (only at the goal and on collision),
        which gives Monte-Carlo estimates very high variance; therefore every
        ordinary step also pays a small cost (-0.1), making the reward dense.
        """
        current_position = self.state_to_position(state)
        next_position = [0, 0]
        # Already colliding: stay, reward -10, episode over.
        if self.collide(current_position) == 1:
            return state, -10, True
        # Already at the goal: stay, reward +10, episode over.
        if self.find(current_position) == 1:
            return state, 10, True
        # Move one cell in the requested direction.
        if action == 'e':
            next_position[0] = current_position[0] + 120
            next_position[1] = current_position[1]
        if action == 's':
            next_position[0] = current_position[0]
            next_position[1] = current_position[1] + 90
        if action == 'w':
            next_position[0] = current_position[0] - 120
            next_position[1] = current_position[1]
        if action == 'n':
            next_position[0] = current_position[0]
            next_position[1] = current_position[1] - 90
        # Hitting a wall or leaving the board: reward -10, episode over.
        if self.collide(next_position) == 1:
            return self.position_to_state(current_position), -10, True
        # Found the female bird: reward +10, episode over.
        if self.find(next_position) == 1:
            return self.position_to_state(next_position), 10, True
        # Ordinary step: small negative reward to encourage short paths
        # (the original "-2" comment was stale; the value is -0.1).
        return self.position_to_state(next_position), -0.1, False

    def gameover(self):
        """Process window events; exit when the user closes the window."""
        for event in pygame.event.get():
            # Fix: bare `QUIT` relied on a star import; spell it explicitly.
            if event.type == pygame.QUIT:
                exit()

    def render(self):
        """Draw the grid, obstacle walls, both birds, the four action values
        of every cell, and the recorded path, then flip the display."""
        if self.viewer is None:
            # One-time initialisation (window, sprites, font).
            # NOTE(review): the paste lost indentation; grouping the set-up
            # under this guard follows the duplicated background blit below —
            # confirm against the book's original listing.
            pygame.init()
            self.viewer = pygame.display.set_mode(self.screen_size, 0, 32)
            pygame.display.set_caption("yuanyang")
            # Load the images.
            self.bird_male = load_bird_male()
            self.bird_female = load_bird_female()
            self.background = load_background()
            self.obstacle = load_obstacle()
            # self.viewer.blit(self.bird_male, self.bird_male_init_position)
            self.viewer.blit(self.bird_female, self.bird_female_init_position)
            self.viewer.blit(self.background, (0, 0))
            self.font = pygame.font.SysFont('times', 15)
        self.viewer.blit(self.background, (0, 0))
        # Grid lines.
        for i in range(11):
            pygame.draw.lines(self.viewer, (255, 255, 255), True, ((120 * i, 0), (120 * i, 900)), 1)
            pygame.draw.lines(self.viewer, (255, 255, 255), True, ((0, 90 * i), (1200, 90 * i)), 1)
        self.viewer.blit(self.bird_female, self.bird_female_init_position)
        # Obstacle walls.
        for i in range(8):
            self.viewer.blit(self.obstacle, (self.obstacle1_x[i], self.obstacle1_y[i]))
            self.viewer.blit(self.obstacle, (self.obstacle2_x[i], self.obstacle2_y[i]))
        # Male bird.
        self.viewer.blit(self.bird_male, self.bird_male_position)
        # Per-cell action values: east/south/west/north placed right/bottom/
        # left/top inside each cell.
        for i in range(100):
            y = int(i / 10)
            x = i % 10
            surface = self.font.render(str(round(float(self.action_value[i, 0]), 2)), True, (0, 0, 0))
            self.viewer.blit(surface, (120 * x + 80, 90 * y + 45))
            surface = self.font.render(str(round(float(self.action_value[i, 1]), 2)), True, (0, 0, 0))
            self.viewer.blit(surface, (120 * x + 50, 90 * y + 70))
            surface = self.font.render(str(round(float(self.action_value[i, 2]), 2)), True, (0, 0, 0))
            self.viewer.blit(surface, (120 * x + 10, 90 * y + 45))
            surface = self.font.render(str(round(float(self.action_value[i, 3]), 2)), True, (0, 0, 0))
            self.viewer.blit(surface, (120 * x + 50, 90 * y + 10))
        # Path cells, outlined in red and numbered by visit order.
        for i in range(len(self.path)):
            rec_position = self.state_to_position(self.path[i])
            pygame.draw.rect(self.viewer, [255, 0, 0], [rec_position[0], rec_position[1], 120, 90], 3)
            surface = self.font.render(str(i), True, (255, 0, 0))
            self.viewer.blit(surface, (rec_position[0] + 5, rec_position[1] + 5))
        pygame.display.update()
        self.gameover()
        # time.sleep(0.1)
        self.FPSCLOCK.tick(30)
if __name__ == "__main__":
    # Smoke test: open the window and keep it alive until the user closes it.
    yy = YuanYangEnv()
    yy.render()
    while True:
        for event in pygame.event.get():
            # Fix: bare `QUIT` relied on a star import; spell it explicitly.
            if event.type == pygame.QUIT:
                exit()
基于表格特征表示
LFA_RL.py
import numpy as np
from yuanyang_env_fa import *
from yuanyang_env_fa import YuanYangEnv
class LFA_RL:
    """Q-learning with linear function approximation Q(s, a) = phi(s, a) @ theta.

    Two feature schemes are provided:
      * table (one-hot) features `*_tr` — 100 states x 4 actions = 400 params;
      * fixed sparse representation `*_fsr` — (10 + 10) coordinates x 4
        actions = 80 params.
    (Indentation of the original paste was lost; it is restored here.)
    """

    def __init__(self, yuanyang):
        # Discount factor and environment handle.
        self.gamma = yuanyang.gamma
        self.yuanyang = yuanyang
        # Parameter vectors for both representations.
        # Fix: the original `np.zeros(...) * 0.1` is still all zeros — the
        # multiplication did nothing, so it is dropped.
        self.theta_tr = np.zeros((400, 1))
        self.theta_fsr = np.zeros((80, 1))

    def find_anum(self, a):
        """Map an action label ('e','w','n','s') to its index; None if unknown."""
        for i in range(len(self.yuanyang.actions)):
            if a == self.yuanyang.actions[i]:
                return i

    def feature_tr(self, s, a):
        """Table (one-hot) feature: a (1, 400) row with a single 1 at
        index 100*a + s — the blocks are laid out action-major."""
        phi_s_a = np.zeros((1, 400))
        phi_s_a[0, 100 * a + s] = 1
        return phi_s_a

    def greedy_policy_tr(self, state):
        """Greedy policy under the table-feature value function
        Q(s, a) = phi_tr(s, a) @ theta_tr."""
        qfun = np.zeros(4)
        for i in range(4):
            qfun[i] = np.dot(self.feature_tr(state, i), self.theta_tr)
        amax = qfun.argmax()
        return self.yuanyang.actions[amax]

    def epsilon_greedy_policy_tr(self, state, epsilon):
        """Epsilon-greedy behaviour policy for the table features: greedy
        with probability 1-epsilon, otherwise a uniform random action."""
        qfun = np.zeros(4)
        for i in range(4):
            qfun[i] = np.dot(self.feature_tr(state, i), self.theta_tr)
        amax = qfun.argmax()
        if np.random.uniform() < 1 - epsilon:
            return self.yuanyang.actions[amax]
        else:
            # Fix: use numpy's RNG instead of the bare `random` that only
            # existed via a star import; the distribution is unchanged.
            return self.yuanyang.actions[np.random.randint(len(self.yuanyang.actions))]

    def greedy_test_tr(self):
        """Probe the current greedy policy from state 0 for up to 30 steps.

        Returns 0 if the goal is not reached, 1 if reached, 2 if reached
        within 20 steps (shortest path)."""
        s = 0
        s_sample = []
        done = False
        flag = 0
        step_num = 0
        while not done and step_num < 30:
            a = self.greedy_policy_tr(s)
            s_next, r, done = self.yuanyang.transform(s, a)
            s_sample.append(s)
            s = s_next
            step_num += 1
        if s == 9:
            flag = 1
        if s == 9 and step_num < 21:
            flag = 2
        return flag

    def qlearning_lfa_tr(self, num_iter, alpha, epsilon):
        """Q-learning with table features.

        Args:
            num_iter: maximum number of training episodes.
            alpha: learning rate of the gradient step.
            epsilon: exploration rate of the behaviour policy.
        Returns:
            The learned parameter vector theta_tr, shape (400, 1).
        """
        iter_num = []
        self.theta_tr = np.zeros((400, 1))
        for it in range(num_iter):
            # Fixed start state (use self.yuanyang.reset() for a random one).
            s = 0
            # Before each episode, probe the greedy policy; stop training as
            # soon as it reaches the goal along a shortest path.
            flag = self.greedy_test_tr()
            if flag == 1:
                iter_num.append(it)
                if len(iter_num) < 2:
                    print("qlearning_tr第1次完成任务需要的迭代次数为:", iter_num[0])
            if flag == 2:
                print('qlearning_tr第1次实现最短路径需要迭代次数为:', it)
                break
            s_sample = []
            a = self.epsilon_greedy_policy_tr(s, epsilon)
            t = False
            count = 0
            # Interact with the environment for at most 30 steps per episode.
            while not t and count < 30:
                # Fix: the original called the module-global `yuanyang`
                # instead of the instance's environment.
                s_next, r, t = self.yuanyang.transform(s, a)
                a_num = self.find_anum(a)
                # Penalise revisiting a state within the episode.
                if s_next in s_sample:
                    r = -2
                s_sample.append(s)
                if t:
                    q_target = r
                else:
                    # Off-policy target: greedy action at the next state.
                    a1 = self.greedy_policy_tr(s_next)
                    a1_num = self.find_anum(a1)
                    q_target = r + self.gamma * np.dot(self.feature_tr(s_next, a1_num), self.theta_tr)
                # Gradient step on the TD error.
                td_error = (q_target - np.dot(self.feature_tr(s, a_num), self.theta_tr))[0, 0]
                self.theta_tr = self.theta_tr + alpha * td_error * np.transpose(self.feature_tr(s, a_num))
                s = s_next
                # Behaviour policy picks the next action.
                a = self.epsilon_greedy_policy_tr(s, epsilon)
                count += 1
        return self.theta_tr

    def feature_fsr(self, s, a):
        """Fixed sparse feature: x (10 bins) and y (10 bins) one-hots per
        action, dimension (10 + 10) * 4 = 80; exactly two entries are 1."""
        phi_s_a = np.zeros((1, 80))
        y = int(s / 10)
        x = s - 10 * y
        phi_s_a[0, 20 * a + x] = 1
        phi_s_a[0, 20 * a + 10 + y] = 1
        return phi_s_a

    def greedy_policy_fsr(self, state):
        """Greedy policy under the fixed-sparse value function
        Q(s, a) = phi_fsr(s, a) @ theta_fsr."""
        qfun = np.zeros(4)
        for i in range(4):
            qfun[i] = np.dot(self.feature_fsr(state, i), self.theta_fsr)
        amax = qfun.argmax()
        return self.yuanyang.actions[amax]

    def epsilon_greedy_policy_fsr(self, state, epsilon):
        """Epsilon-greedy behaviour policy for the fixed sparse features."""
        qfun = np.zeros(4)
        for i in range(4):
            qfun[i] = np.dot(self.feature_fsr(state, i), self.theta_fsr)
        amax = qfun.argmax()
        if np.random.uniform() < 1 - epsilon:
            return self.yuanyang.actions[amax]
        else:
            # Same RNG fix as the table-feature policy.
            return self.yuanyang.actions[np.random.randint(len(self.yuanyang.actions))]

    def greedy_test_fsr(self):
        """Probe the fixed-sparse greedy policy from state 0; same flag
        semantics as greedy_test_tr (0 / 1 / 2)."""
        s = 0
        s_sample = []
        done = False
        flag = 0
        step_num = 0
        while not done and step_num < 30:
            a = self.greedy_policy_fsr(s)
            s_next, r, done = self.yuanyang.transform(s, a)
            s_sample.append(s)
            s = s_next
            step_num += 1
        if s == 9:
            flag = 1
        if s == 9 and step_num < 21:
            flag = 2
        return flag

    def qlearning_lfa_fsr(self, num_iter, alpha, epsilon):
        """Q-learning with the fixed sparse representation; mirrors
        qlearning_lfa_tr but learns theta_fsr (shape (80, 1))."""
        iter_num = []
        self.theta_fsr = np.zeros((80, 1))
        for it in range(num_iter):
            s = 0
            # s = self.yuanyang.reset()
            flag = self.greedy_test_fsr()
            if flag == 1:
                iter_num.append(it)
                if len(iter_num) < 2:
                    print("qlearning_fsr第1次完成任务需要的迭代次数为:", iter_num[0])
            if flag == 2:
                print('qlearning_fsr第1次实现最短路径需要迭代次数为:', it)
                break
            s_sample = []
            a = self.epsilon_greedy_policy_fsr(s, epsilon)
            t = False
            count = 0
            while not t and count < 30:
                # Fix: was the module-global `yuanyang`.
                s_next, r, t = self.yuanyang.transform(s, a)
                a_num = self.find_anum(a)
                if s_next in s_sample:
                    r = -2
                s_sample.append(s)
                if t:
                    q_target = r
                else:
                    a1 = self.greedy_policy_fsr(s_next)
                    a1_num = self.find_anum(a1)
                    q_target = r + self.gamma * np.dot(self.feature_fsr(s_next, a1_num), self.theta_fsr)
                td_error = (q_target - np.dot(self.feature_fsr(s, a_num), self.theta_fsr))[0, 0]
                self.theta_fsr = self.theta_fsr + alpha * td_error * np.transpose(self.feature_fsr(s, a_num))
                s = s_next
                a = self.epsilon_greedy_policy_fsr(s, epsilon)
                count += 1
        return self.theta_fsr
# Main: train with table features, then visualise the learned greedy path.
if __name__ == "__main__":
    yuanyang = YuanYangEnv()
    brain = LFA_RL(yuanyang)
    # brain.qlearning_lfa_fsr(num_iter=5000, alpha=0.1, epsilon=0.1)
    brain.qlearning_lfa_tr(num_iter=5000, alpha=0.1, epsilon=0.1)
    # Copy the learned values into the env's table so render() can show them.
    qvalue2 = np.zeros((100, 4))
    # qvalue1 = np.zeros((100, 4))
    for i in range(400):
        y = int(i / 100)   # action index
        x = i - 100 * y    # state index
        qvalue2[x, y] = np.dot(brain.feature_tr(x, y), brain.theta_tr)
        # qvalue1[x, y] = np.dot(brain.feature_fsr(x, y), brain.theta_fsr)
    yuanyang.action_value = qvalue2
    # Roll out the learned greedy policy and render the path.
    flag = 1
    s = 0
    step_num = 0
    path = []
    while flag:
        path.append(s)
        yuanyang.path = path
        a = brain.greedy_policy_tr(s)
        # a = brain.greedy_policy_fsr(s)
        # Fix: the original printed qvalue2[s, 2] twice, showing five numbers
        # for four actions (visible in the logged results).
        print('%d->%s\t' % (s, a), qvalue2[s, 0], qvalue2[s, 1], qvalue2[s, 2], qvalue2[s, 3])
        yuanyang.bird_male_position = yuanyang.state_to_position(s)
        yuanyang.render()
        time.sleep(0.25)
        step_num += 1
        s_, r, t = yuanyang.transform(s, a)
        if t or step_num > 30:
            flag = 0
        s = s_
    # Render the final cell of the path.
    yuanyang.bird_male_position = yuanyang.state_to_position(s)
    path.append(s)
    yuanyang.render()
    while True:
        yuanyang.render()
结果:
qlearning_tr第1次完成任务需要的迭代次数为: 242
qlearning_tr第1次实现最短路径需要迭代次数为: 309
0->s -1.8762719346935661 -7.4581341716709995 -4.68559 -4.68559 -1.7391047223161402
10->s -1.737914028120795 -5.2229358999999995 -1.8442897947542878 -1.8442897947542878 -1.7224999384277635
20->s -1.9809123280821137 -4.0389751 -1.8734555549647016 -1.8734555549647016 -1.5930037164654705
30->s -1.7879015457966834 -4.7918782 -2.020381080454969 -2.020381080454969 -1.7117314942481403
40->e -1.7133662134057452 -4.4641279 -1.7854368266765694 -1.7854368266765694 -1.8227179416380896
41->e -1.7549918143145464 -1.8715871925648397 -2.068711025130093 -2.068711025130093 -1.8050472575842063
42->e -1.3290727851923108 -1.85669485492018 -1.8107941565739671 -1.8107941565739671 -1.4249570644229796
43->e -1.1948457940564916 -1.8942900734042145 -3.5119000000000002 -3.5119000000000002 -1.2750346991449586
44->e -0.9860628961853446 -1.261322472278998 -1.1820976379954864 -1.1820976379954864 -1.0600117422577118
45->s -3.2951 -1.0054522173471994 -1.1179616619625965 -1.1179616619625965 -0.8520403590443152
55->e -0.73466297536728 -1.3738722490708102 -0.9568615646168465 -0.9568615646168465 -0.8664491475133478
56->e -0.675873673050564 -0.7658614497125311 -4.68559 -4.68559 -0.7151223458034627
57->n -0.7831160084131752 -0.7895385053973183 -0.5704306727120169 -0.5704306727120169 -0.7381965939230033
47->n -0.4371881470625 -2.71 -0.2555873032652053 -0.2555873032652053 -0.5971829305074089
37->e 0.2628897354667868 -2.0620000000000003 -0.3951711722500001 -0.3951711722500001 -0.40219652666450006
38->n -0.23748735977500002 -0.38266475000000005 1.776031926412276 1.776031926412276 -0.407004453
28->n -0.03917222875000001 -0.029674500000000003 3.700901670178375 3.700901670178375 -0.46343614354335755
18->e 6.555489249042639 -0.010000000000000002 0.07600000000000001 0.07600000000000001 -0.2614938107952605
19->n -1.0 -0.010000000000000002 9.202335569231275 9.202335569231275 0.11817656816395794
基于固定稀疏表示
结果:
qlearning_fsr第1次实现最短路径需要迭代次数为: 239
0->s -6.611924700405824 -9.702810811870656 -8.718842831072866 -8.718842831072866 -4.839694122615929
10->s -6.133201497793067 -10.168700678662915 -6.534180865784207 -6.534180865784207 -4.780710791592935
20->s -6.151546011905196 -10.322750904664206 -8.107011856447757 -8.107011856447757 -4.983944249393128
30->s -6.909927773181762 -10.23523420998638 -8.198673644361513 -8.198673644361513 -5.191993129600697
40->s -7.045929953509107 -10.522642516857518 -8.30966676356952 -8.30966676356952 -5.257018108654018
50->e -4.946466133398729 -11.19357585152138 -8.78815223135361 -8.78815223135361 -5.681806552181788
51->e -4.928512712395584 -9.166619663576054 -8.423397044923387 -8.423397044923387 -8.252685685582126
52->e -4.910178235247238 -9.084141589044386 -8.113490840614087 -8.113490840614087 -8.69852881539418
53->e -3.954373661542544 -7.284011832645239 -7.269583585958317 -7.269583585958317 -7.420657751421419
54->e -4.111106619656713 -8.583637694598938 -7.919797729409126 -7.919797729409126 -8.821676630605907
55->e -4.362437990031488 -8.287683985943932 -7.883686388242249 -7.883686388242249 -8.950302477406607
56->e -4.442392528180897 -4.898339994476835 -4.968764220211245 -4.968764220211245 -4.937304628718843
57->e -5.716096250942643 -7.13662520618681 -6.4577894129838285 -6.4577894129838285 -6.154191039595808
58->e -5.882251123646437 -6.762252148541144 -6.224254812760385 -6.224254812760385 -6.391622203838924
59->n -6.174218284638121 -6.546789962963743 -4.220255145054704 -4.220255145054704 -6.261752862993307
49->n -8.273682104748499 -5.875856628299881 -3.741769677270615 -3.741769677270615 -5.836964419465535
39->n -8.137679924421152 -5.588448321428745 -3.6307765580626077 -3.6307765580626077 -5.771939440412215
29->n -7.379298163144586 -5.67596501610657 -3.539114770148853 -3.539114770148853 -5.563890560204646
19->n -7.360953649032458 -5.521914790105278 -1.9662837794853025 -1.9662837794853025 -5.360657102404453
代码困惑
- LFA_RL.py——line 21
phi_s_a[0,100*a+s] = 1
这里是想说这是个 one-hot 的矩阵,不应该是 phi_s_a[a, s] = 1 吗?
- LFA_RL.py——line 110
[0,0]*np.transpose(self.feature_tr(s,a_num))
使用 numpy.transpose()
进行变换,其实就是交换坐标轴:transpose(1, 2, 0) 表示新数组的第一维取原来的第二维、第二维取原来的第三维、第三维取原来的第一维。例如 x 的 shape 为 (2, 3, 2) 时,x.transpose(1, 2, 0) 的 shape 为 (3, 2, 2)。