import tensorflow as tf
import numpy as np
import random
# Define the game environment
class DouDiZhuEnv:
    def __init__(self):
        self.state_size = 54    # number of cards
        self.action_size = 467  # placeholder for the number of possible play combinations
        self.state = np.zeros((self.state_size,), dtype=np.float32)
        self.cards = [i for i in range(54)]
        self.actions = []
        self.cards_to_values = {}
        self.values_to_cards = {}
        self.reset()

    def reset(self):
        self.state[:] = 0
        self.cards = [i for i in range(54)]
        self.actions = []
        self.cards_to_values = {}
        self.values_to_cards = {}
        random.shuffle(self.cards)
        # Encode the shuffled cards into the state vector
        for idx, card in enumerate(self.cards):
            self.cards_to_values[card] = idx
            self.values_to_cards[idx] = card
            self.state[idx] = 1
        return self.state

    def step(self, action):
        done = False
        reward = 0
        # Update the state and compute the reward for this play
        cards = self.get_card_list_from_action(action)
        if not self.is_valid_action(cards):
            # Illegal play: penalise and leave the state unchanged
            reward -= 5
            return self.state, reward, done
        for card in cards:
            self.state[self.cards_to_values[card]] = 0
        self.actions.append(action)
        if len(self.actions) == 3 and self.is_game_over():
            done = True
            reward += self.get_reward()
        return self.state, reward, done

    def get_card_list_from_action(self, action):
        # Decode a base-54 integer into the list of card indices it encodes
        cards = []
        while action > 0:
            idx = action % 54
            if idx >= 0 and idx <= 51:
                cards.append(idx)
            action //= 54
        return cards[:]

    def get_action_from_card_list(self, cards):
        # Encode a list of card indices as a single base-54 integer
        action = 0
        for card in reversed(cards):
            action = action * 54 + card
        return action

    def is_valid_action(self, cards):
        # An empty play is never legal
        if not cards:
            return False
        # The first play of a round is always allowed
        if not self.actions:
            return True
        last_cards = self.get_card_list_from_action(self.actions[-1])
        # A bomb beats anything
        if self.is_bomb(cards):
            return True
        if len(last_cards) != len(cards):
            return False
        if self.get_type(cards) != self.get_type(last_cards):
            return False
        if self.get_value(cards[0]) <= self.get_value(last_cards[0]):
            return False
        return True

    def is_bomb(self, cards):
        if len(cards) < 4:
            return False
        values = [self.get_value(card) for card in cards]
        if len(set(values)) == 1:
            return True
        return False

    def is_game_over(self):
        # The round ends when every card has been played or when each of the
        # three players has made a play
        if not np.any(self.state):
            return True
        if len(self.actions) == 3:
            return True
        return False

    def get_reward(self):
        reward = 0
        last_action = self.actions[-1]
        last_cards = self.get_card_list_from_action(last_action)
        if len(last_cards) == 1:
            reward += 1
        elif len(last_cards) == 2:
            reward += 3
        elif len(last_cards) == 3:
            reward += 6
        elif len(last_cards) == 4:
            if self.is_bomb(last_cards):
                reward += 10
            else:
                reward += 8
        return reward

    def get_type(self, cards):
        # Return a numeric code for the hand type, or -1 if the cards do not
        # form a recognised type
        if len(cards) == 1:
            return 0
        if len(cards) == 2:
            if self.get_value(cards[0]) == self.get_value(cards[1]):
                return 1
            else:
                return -1
        if len(cards) == 3:
            if self.get_value(cards[0]) == self.get_value(cards[1]) and self.get_value(cards[0]) == self.get_value(cards[2]):
                return 2
            else:
                return -1
        if len(cards) == 4:
            if self.get_value(cards[0]) == self.get_value(cards[1]) and \
                    self.get_value(cards[0]) == self.get_value(cards[2]) and \
                    self.get_value(cards[0]) == self.get_value(cards[3]):
                return 3
            elif self.is_bomb(cards):
                return 4
            else:
                return -1
        if len(cards) == 5:
            if self.get_type(cards[:2]) == 1 and self.get_type(cards[2:]) == 2:
                return 5
            else:
                return -1
        if len(cards) == 6:
            if self.get_type(cards[:3]) == 2 and self.get_type(cards[3:]) == 2:
                return 6
            else:
                return -1
        if len(cards) == 7:
            if self.get_type(cards[:4]) == 3 and self.get_type(cards[4:]) == 2:
                return 7
            else:
                return -1
        if len(cards) == 8:
            if self.get_type(cards[:4]) == 1 and self.get_type(cards[4:]) == 2:
                return 8
            elif self.get_type(cards[:5]) == 5 and self.get_type(cards[5:]) == 3:
                return 9
            else:
                return -1
        if len(cards) == 9:
            if self.get_type(cards[:6]) == 6 and self.get_type(cards[6:]) == 3:
                return 10
            else:
                return -1
        if len(cards) == 10:
            if self.get_type(cards[:5]) == 8 and self.get_type(cards[5:]) == 3:
                return 11
            elif self.get_type(cards[:6]) == 6 and self.get_type(cards[6:]) == 4:
                return 12
            else:
                return -1
        if len(cards) == 11:
            if self.get_type(cards[:6]) == 7 and self.get_type(cards[6:]) == 4:
                return 13
            else:
                return -1
        if len(cards) == 12:
            if self.get_type(cards[:6]) == 9 and self.get_type(cards[6:]) == 4:
                return 14
            elif self.get_type(cards[:8]) == 11 and self.get_type(cards[8:]) == 3:
                return 15
            else:
                return -1
        return -1

    def get_value(self, card):
        return card % 13

    def get_action_size(self):
        # Count legal plays by brute-force enumeration of base-54 encodings.
        # Note: this is combinatorially expensive for plays of more than a few
        # cards and is only meant to illustrate the idea.
        count = 0
        for i in range(1, 13):
            for j in range(54 ** (i - 1), 54 ** i):
                cards = self.get_card_list_from_action(j)
                if len(cards) != i:
                    continue
                if not self.is_valid_action(cards):
                    continue
                count += 1
        return count

# Define the policy network
class PolicyNetwork(tf.keras.Model):
    def __init__(self, state_size, action_size):
        super(PolicyNetwork, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.dense1 = tf.keras.layers.Dense(128, activation='relu')
        self.dense2 = tf.keras.layers.Dense(action_size, activation='softmax')

    def call(self, inputs):
        x = self.dense1(inputs)
        output = self.dense2(x)
        return output

# Define the training function
def train(env, policy_network, num_episodes=100):
    optimizer = tf.keras.optimizers.Adam()
    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        while True:
            # Sample an action from the policy and update the network on the
            # reward observed for that action
            with tf.GradientTape() as tape:
                probs = policy_network(np.expand_dims(state, axis=0))
                # The network outputs softmax probabilities, so take the log
                # before sampling with tf.random.categorical
                action = int(tf.random.categorical(tf.math.log(probs), num_samples=1)[0, 0])
                state, reward, done = env.step(action)
                episode_reward += reward
                # Policy-gradient style loss: scale the log-probability of the
                # chosen action by the reward it received
                loss = -reward * tf.reduce_sum(
                    tf.one_hot([action], depth=policy_network.action_size) * tf.math.log(probs + 1e-8))
            grads = tape.gradient(loss, policy_network.trainable_variables)
            optimizer.apply_gradients(zip(grads, policy_network.trainable_variables))
            if done:
                break
        print("Episode {}: Reward = {}".format(episode + 1, episode_reward))

# Create the game environment and the policy network, then start training
env = DouDiZhuEnv()
# env.get_action_size() enumerates every encoding and is far too slow to run in
# practice, so the placeholder count from the environment is used here instead
action_size = env.action_size
policy_network = PolicyNetwork(env.state_size, action_size)
train(env, policy_network, num_episodes=100)
Game environment

When defining the game environment, we need to consider the kinds and number of cards in Dou Dizhu, the number of possible play combinations, and how to represent the game state. Specifically:
- Card types: a Dou Dizhu deck has 54 cards, 52 ordinary cards plus the two jokers. We therefore give every card an index so it can be handled uniformly later on.
- Feasible actions: under the rules of Dou Dizhu, a player has to take card rank, hand type, and so on into account when playing, so the number of possible play combinations is very large and has to be computed algorithmically. We can enumerate every possible play encoding and keep only the legal ones to obtain the total number of feasible actions (a short sketch of this encoding follows this list).
- State representation: each player's hand and the cards already played on the table both influence the next decision, so we encode this information into a state vector that the model can learn from and predict on.
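As a rough, self-contained sketch of the action encoding used above, a play is packed into a single integer in base 54, one digit per card index. The helper names encode_play and decode_play are introduced here only for illustration and mirror get_action_from_card_list and get_card_list_from_action:

# Hypothetical standalone sketch of the base-54 action encoding
def encode_play(cards):
    action = 0
    for card in reversed(cards):
        action = action * 54 + card
    return action

def decode_play(action):
    cards = []
    while action > 0:
        cards.append(action % 54)
        action //= 54
    return cards

pair = [0, 13]                      # two card indices with the same rank (index % 13)
action = encode_play(pair)          # 702
assert decode_play(action) == pair  # round-trips back to the same card list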
In the code above, we define the DouDiZhuEnv class as the game environment and implement the following methods:
- reset(): resets the game state and returns the initial state vector.
- step(action): takes an action, updates the game state, and returns the new state vector, the reward, and a flag indicating whether the game is over.
- get_card_list_from_action(action): converts an integer-encoded action into the list of cards it represents.
- get_action_from_card_list(cards): converts a list of cards into an integer-encoded action.
- is_valid_action(cards): checks whether a given play is legal.
- is_bomb(cards): checks whether a given play is a bomb.
- is_game_over(): checks whether the game is over.
- get_reward(): computes the reward for the round based on the last play.
- get_type(cards): determines the hand type of a list of cards.
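To make the interface concrete, here is a minimal interaction sketch, assuming the DouDiZhuEnv class above; the card index 7 and the demo_env name are arbitrary choices for illustration:

# Reset the environment, encode a single-card play, and take one step
demo_env = DouDiZhuEnv()
state = demo_env.reset()                  # 54-dim vector, 1 = card still unplayed
single_card = [7]                         # play one card (index chosen arbitrarily)
action = demo_env.get_action_from_card_list(single_card)
next_state, reward, done = demo_env.step(action)
print(reward, done)                       # the first play is legal, so no penalty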
Policy network

In this example, we use a simple fully connected neural network as the policy network to predict the next play. The network takes the current game state as input and outputs a probability distribution over all feasible actions. Specifically, we define the PolicyNetwork class and implement the following methods:
- __init__(self, state_size, action_size): initializes the model structure.
- call(self, inputs): computes the model output from the input.
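A minimal sketch of a forward pass, assuming the DouDiZhuEnv and PolicyNetwork classes above; it uses the environment's placeholder action count rather than the enumerated one, and the demo_* names are only for illustration:

demo_env = DouDiZhuEnv()
demo_net = PolicyNetwork(demo_env.state_size, demo_env.action_size)  # 54 inputs, 467 outputs
state = demo_env.reset()
probs = demo_net(np.expand_dims(state, axis=0))                      # shape (1, 467)
print(probs.shape, float(tf.reduce_sum(probs)))                      # softmax output sums to 1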
Training function

In the training function, we first create the game environment and the policy network and specify the number of training episodes. In each episode, we sample a play action from the policy's output distribution and use the game state and reward to update the model. Specifically, we implement the train(env, policy_network, num_episodes=100) function, where:
- env: the game environment object.
- policy_network: the policy network object.
- num_episodes: the number of training episodes.
At the end of each episode, we print the total reward for that episode so we can monitor how the model is doing.
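As a rough follow-up sketch (not part of the training procedure above), assuming the env and policy_network objects created earlier, we can roll out a short episode with greedy action selection instead of sampling to inspect what the policy has learned:

# Hypothetical evaluation rollout with the trained policy_network
state = env.reset()
total_reward = 0
for _ in range(10):                              # cap the rollout length
    probs = policy_network(np.expand_dims(state, axis=0))
    action = int(tf.argmax(probs[0]))            # greedy choice instead of sampling
    state, reward, done = env.step(action)
    total_reward += reward
    if done:
        break
print("total reward:", total_reward)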