Reward Model Implementation

Adapted from "大模型强化学习之奖励模型的训练" (Training a Reward Model for LLM Reinforcement Learning): https://blog.csdn.net/gzroy/article/details/132630418

import torch
from torch import nn
from torch.nn import functional as F
import math
import inspect

Multi-Head Attention

class MHA(nn.Module):
    def __init__(self, d_model, num_heads, attn_pdrop, resid_pdrop):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.attn_pdrop = attn_pdrop
        self.resid_dropout = nn.Dropout(resid_pdrop)
        self.c_attn = nn.Linear(d_model, d_model*3)
        self.c_proj = nn.Linear(d_model, d_model)
 
    def forward(self, x, attn_mask):
        B, T, C = x.size() # batch size, sequence length, embedding dimension
        x_qkv = self.c_attn(x)
        q, k, v = x_qkv.split(self.d_model, dim=2) # split along dim=2 (the feature dimension) into q, k, v
        q = q.view(B, T, self.num_heads, C//self.num_heads).transpose(1, 2) # reshape to [B, T, num_heads, C//num_heads], then swap dims 1 and 2 to get [B, num_heads, T, head_dim]
        k = k.view(B, T, self.num_heads, C//self.num_heads).transpose(1, 2)
        v = v.view(B, T, self.num_heads, C//self.num_heads).transpose(1, 2)
        # scaled dot-product attention: softmax(q @ k^T / sqrt(d_k)) @ v.
        # SDPA does not allow attn_mask and is_causal=True together, so the causal flag is only
        # used when no explicit mask is given; an explicit mask must already encode the causal
        # structure (plus any padding mask).
        y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=self.attn_pdrop if self.training else 0, is_causal=attn_mask is None)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # swap dims back, make memory contiguous, then merge heads into [B, T, C]
        y = self.c_proj(y)
        y = self.resid_dropout(y)
        return y
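
A quick, illustrative shape check (the hyperparameter values below are arbitrary assumptions, not settings from the original post): a random batch passed through MHA should come back with the same [B, T, C] shape.

mha = MHA(d_model=64, num_heads=4, attn_pdrop=0.1, resid_pdrop=0.1)
x = torch.randn(2, 16, 64)        # [B=2, T=16, C=64]
y = mha(x, attn_mask=None)        # with attn_mask=None the layer falls back to causal masking
print(y.shape)                    # torch.Size([2, 16, 64])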

Feed-Forward Network and Transformer Block

class FeedForward(nn.Module):
    def __init__(self, d_model, dff, dropout):
        super().__init__()
        self.c_fc = nn.Linear(d_model, dff)
        self.c_proj = nn.Linear(dff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.gelu = nn.GELU()
 
    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x
class Block(nn.Module):
    def __init__(self, d_model, num_heads, dff, attn_pdrop, resid_pdrop, dropout):
        super().__init__()
        self.ln_1 = nn.LayerNorm(d_model)
        self.attn = MHA(d_model, num_heads, attn_pdrop, resid_pdrop)
        self.ln_2 = nn.LayerNorm(d_model)
        self.mlp = FeedForward(d_model, dff, dropout)
 
    def forward(self, x, attn_mask):
        x = x + self.attn(self.ln_1(x), attn_mask)
        x = x + self.mlp(self.ln_2(x))
        return x
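
The feed-forward and block modules can be checked the same way. Setting dff = 4 * d_model is a common convention (an assumption here, not something fixed by the code above), and the residual connections mean a Block preserves the input shape.

block = Block(d_model=64, num_heads=4, dff=256, attn_pdrop=0.1, resid_pdrop=0.1, dropout=0.1)
x = torch.randn(2, 16, 64)
y = block(x, attn_mask=None)
print(y.shape)                    # torch.Size([2, 16, 64])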

Reward Model

class RewardModel(nn.Module):
    # initialization
    def __init__(self, vocab_size, d_model, block_size, embed_pdrop, num_heads, dff, attn_pdrop, resid_pdrop, dropout, num_layer):
        super().__init__()
        self.wte = nn.Embedding(vocab_size, d_model, sparse=False) # token embedding: maps each of the vocab_size tokens to a d_model-dim vector
        self.wpe = nn.Embedding(block_size, d_model, sparse=False) # position embedding: maps each of the block_size positions to a d_model-dim vector
        self.dropout_embed = nn.Dropout(embed_pdrop)
        self.h = nn.ModuleList([Block(d_model, num_heads, dff, attn_pdrop, resid_pdrop, dropout) for _ in range(num_layer)]) # stack of num_layer Transformer blocks
        self.num_layer = num_layer
        self.block_size = block_size
        #self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
        #self.wte.weight = self.lm_head.weight
        self.reward_head = nn.Linear(d_model, 1, bias=False)
        self.ln_f = nn.LayerNorm(d_model)
        self.PAD_ID = vocab_size - 1
 
        self.apply(self._init_weights)
 
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * num_layer))
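
Once the _init_weights, forward, and configure_optimizers methods from the sections below are added back into the class, it can be instantiated. For a sense of scale, here is an instantiation with GPT-2-small-like hyperparameters (these values are assumptions for illustration, not settings from the original post); with them the reward model has roughly 124M parameters.

model = RewardModel(vocab_size=50257, d_model=768, block_size=1024, embed_pdrop=0.1,
                    num_heads=12, dff=3072, attn_pdrop=0.1, resid_pdrop=0.1,
                    dropout=0.1, num_layer=12)
num_params = sum(p.numel() for p in model.parameters())
print(f"{num_params/1e6:.1f}M parameters")    # roughly 124M with these settings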

Weight Initialization

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

Forward Pass

    def forward(self, input_ids, reward_pos, attn_mask, return_loss=False): # reward_pos is currently unused (see the commented-out gather below)
        device = input_ids.device
        b, t = input_ids.size()
        pos = torch.arange(0, t, dtype=torch.long, device=device) 
        x = self.wte(input_ids) + self.wpe(pos)
        x = self.dropout_embed(x)
        for block in self.h: # pass x through each Transformer block in turn
            x = block(x, attn_mask)
        x = self.ln_f(x)
        rewards = self.reward_head(x).squeeze(-1) # reward_head maps every position to a scalar; squeeze(-1) turns [B, T, 1] into [B, T]
        #x = torch.reshape(x, [b,t])
        #scores = torch.gather(x, dim=-1, index=reward_pos)

        # initialize the score lists
        chosen_end_scores = []
        rejected_end_scores = []

        # the batch stacks chosen and rejected sequences; split it in half
        bs = input_ids.shape[0] // 2 # bs is half the batch size: input_ids has shape [2 * bs, seq_len]
        chosen = input_ids[:bs] # first half: chosen sequences, [bs, seq_len]
        rejected = input_ids[bs:] # second half: rejected sequences, [bs, seq_len]
        chosen_rewards = rewards[:bs] # per-position rewards for the chosen half, [bs, seq_len]
        rejected_rewards = rewards[bs:] 
 
        # loop over each chosen/rejected pair and compute its end-of-sequence scores
        loss = 0
        for i in range(bs): # iterate over the bs pairs
            # case 1: chosen and rejected are identical
            if torch.all(torch.eq(chosen[i], rejected[i])).item(): # identical sequences carry no preference signal
                c_inds = (chosen[i] == self.PAD_ID).nonzero()
                c_ind = c_inds[0].item() if len(c_inds) > 0 else chosen.shape[1]
                chosen_end_scores.append(chosen_rewards[i, c_ind - 1])
                # locate the first PAD token (or the sequence end) and record the reward at the
                # last real token; since both sequences are the same, one score is enough
                continue
            # Check if there is any padding, otherwise take the full sequence length
            c_inds = (chosen[i] == self.PAD_ID).nonzero() 
            c_ind = c_inds[0].item() if len(c_inds) > 0 else chosen.shape[1]
            r_inds = (rejected[i] == self.PAD_ID).nonzero()
            r_ind = r_inds[0].item() if len(r_inds) > 0 else rejected.shape[1]
            # record the reward at the last non-padded position of each sequence as its end score
            chosen_end_scores.append(chosen_rewards[i, c_ind - 1])
            rejected_end_scores.append(rejected_rewards[i, r_ind - 1])
 
            if return_loss:
                end_ind = max(c_ind, r_ind)
                # Retrieve first index where trajectories diverge
                divergence_ind = (chosen[i] != rejected[i]).nonzero()[0]
                assert divergence_ind > 0
 
                # Index into the correct rewards
                c_truncated_reward = chosen_rewards[i][divergence_ind:end_ind]
                r_truncated_reward = rejected_rewards[i][divergence_ind:end_ind]
 
                # Compute loss based on the truncated rewards (padding is excluded via end_ind);
                # the end scores for this pair were already appended above
                loss += -F.logsigmoid(c_truncated_reward - r_truncated_reward).mean()
                # alternative: compare only the rewards at the last non-padded positions
                #loss += -F.logsigmoid(chosen_rewards[i][c_ind-1]-rejected_rewards[i][r_ind-1])
        loss = loss / bs
        
        return chosen_end_scores, rejected_end_scores, loss
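
A minimal training-step sketch (every size and hyperparameter here is an illustrative assumption): the batch stacks bs chosen sequences on top of their bs rejected counterparts, and the pairwise loss -log(sigmoid(r_chosen - r_rejected)), averaged over the positions after the two sequences diverge, pushes the chosen rewards above the rejected ones.

vocab_size, seq_len, bs = 1000, 32, 4
model = RewardModel(vocab_size=vocab_size, d_model=64, block_size=seq_len, embed_pdrop=0.1,
                    num_heads=4, dff=256, attn_pdrop=0.1, resid_pdrop=0.1, dropout=0.1, num_layer=2)
chosen = torch.randint(0, vocab_size - 1, (bs, seq_len))    # vocab_size - 1 is reserved as PAD_ID
rejected = torch.randint(0, vocab_size - 1, (bs, seq_len))
rejected[:, :8] = chosen[:, :8]                             # share a common prompt prefix
input_ids = torch.cat([chosen, rejected], dim=0)            # [2 * bs, seq_len]
chosen_scores, rejected_scores, loss = model(input_ids, reward_pos=None, attn_mask=None, return_loss=True)
loss.backward()
print(f"loss: {loss.item():.4f}, pairs scored: {len(chosen_scores)}")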

Optimizer

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameter that is 2D will be weight decayed, others will not.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")
 
        return optimizer
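
Continuing the training sketch above, the optimizer is created from the model itself; the weight decay, learning rate, and betas here are assumed values, not tuned ones.

optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=3e-4,
                                       betas=(0.9, 0.95), device_type='cpu')
optimizer.step()        # apply the gradients accumulated by loss.backward() above
optimizer.zero_grad()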
