这是一个用动态规划(DP)求解21点最优策略的程序。实验环境是 OpenAI Gym 的 Blackjack 环境(https://github.com/openai/gym/blob/master/gym/envs/toy_text/blackjack.py),它和真实的21点规则有一点小区别,主要体现在 Blackjack 的大小计算上。
#include <bits/stdc++.h>
#pragma warning(disable:4996)
using namespace std;
// dpp[s][c][a][act]: value table filled by dpplayer().
//   s   - player sum index; dpplayer() uses 0..9, which stands for totals 12..21
//   c   - dealer card minus one (dealer cards 1..10 are stored at 0..9)
//   a   - ace flag of the player's hand (printed by main() as "without ace" / "with ace")
//   act - 0 holds (win probability - lose probability) when stopping on this total,
//         1 holds the accumulated value of drawing one more card
double dpp[11][11][2][2];
// maction[s][c][a]: action chosen by dpplayer() for the same state indices;
// 0 when dpp[s][c][a][0] >= dpp[s][c][a][1], otherwise 1.
int maction[11][11][2];
// dp[total][outcome][ace]: probability that the dealer, starting from the hard
// total `total` (every ace counted as 1) and holding an ace iff ace == 1, ends
// in `outcome`, where 0..4 means standing on 17..21 and 5 means going bust.
double dp[22][6][2];

/**
 * Fills dp[1..21][0..5][0..1] with the dealer's final-outcome distribution.
 *
 * Dealer behaviour encoded here:
 *   - hard total >= 17: stand on that total;
 *   - holding an ace with hard total 7..11: stand on total + 10 (17..21);
 *   - otherwise draw one card: values 1..9 with probability 1/13 each,
 *     value 10 with probability 4/13.
 *
 * Each state's distribution is built in a local array and then assigned, so
 * calling this function more than once yields the same table instead of
 * accumulating on top of the previous run.
 */
void dpdealer()
{
    const int kBust = 5;

    // States that hold an ace go first: a state without an ace that draws an
    // ace reads dp[...][1].
    for (int ace = 1; ace >= 0; ace--)
    {
        // Totals go downwards because drawing always leads to a higher total.
        for (int total = 21; total >= 1; total--)
        {
            double outcome[6] = {0, 0, 0, 0, 0, 0};

            if (total >= 17)
            {
                outcome[total - 17] = 1;
            }
            else if (ace == 1 && total >= 7 && total <= 11)
            {
                // The ace is worth 11 here, so the dealer stands on total + 10.
                outcome[total - 7] = 1;
            }
            else
            {
                for (int card = 1; card <= 10; card++)
                {
                    const double prob = (card == 10) ? 4.0 / 13 : 1.0 / 13;
                    const int nextTotal = total + card;
                    if (nextTotal > 21)
                    {
                        outcome[kBust] += prob;
                        continue;
                    }
                    // Drawing an ace sets the flag; any other card keeps it.
                    const int nextAce = (card == 1) ? 1 : ace;
                    for (int f = 0; f <= kBust; f++)
                    {
                        outcome[f] += prob * dp[nextTotal][f][nextAce];
                    }
                }
            }

            for (int f = 0; f <= kBust; f++)
            {
                dp[total][f][ace] = outcome[f];
            }
        }
    }
}
/**
 * Computes, for every player state, the value of stopping and of drawing one
 * more card, and records the better action.
 *
 * State indices:
 *   - sum index s in 0..9 stands for the player total s + 12 (12..21);
 *   - dealer card c in 1..10 is stored at column c - 1;
 *   - ace in {0, 1} is the player's ace flag; with ace == 1 a draw that would
 *     pass 21 lowers the total by 10 and clears the flag instead of busting.
 *
 * Writes dpp[s][c-1][ace][0] (stop), dpp[s][c-1][ace][1] (draw) and
 * maction[s][c-1][ace] (0 = stop, 1 = draw; a tie picks 0).
 * Reads the dealer table dp, so dpdealer() has to run before this function.
 *
 * Both values are computed in locals and then assigned, so running the
 * function again reproduces the same tables instead of adding to them.
 */
void dpplayer()
{
    // ace == 0 first: a hand with ace == 1 may fall back to an ace == 0 state.
    for (int ace = 0; ace < 2; ace++)
    {
        // Highest totals first: drawing from s only reads already-filled rows.
        for (int s = 9; s >= 0; s--)
        {
            for (int c = 1; c <= 10; c++)
            {
                const int dealerAce = (c == 1) ? 1 : 0;

                // Stopping: win against lower dealer totals or a dealer bust,
                // lose against higher dealer totals.
                double winp = 0;
                for (int f = 0; f < s - 5; f++)
                {
                    winp += dp[c][f][dealerAce];
                }
                winp += dp[c][5][dealerAce];
                double losep = 0;
                for (int f = std::max(s - 4, 0); f < 5; f++)
                {
                    losep += dp[c][f][dealerAce];
                }
                const double stopValue = winp - losep;

                // Drawing: average the best follow-up value over the next card.
                double drawValue = 0;
                for (int card = 1; card <= 10; card++)
                {
                    const double prob = (card == 10) ? 4.0 / 13 : 1.0 / 13;
                    int next = s + card;
                    int nextAce = ace;
                    if (next > 9)
                    {
                        if (ace == 0)
                        {
                            // Past 21 with nothing to fall back on: lose.
                            drawValue -= prob;
                            continue;
                        }
                        next -= 10;
                        nextAce = 0;
                    }
                    drawValue += prob * std::max(dpp[next][c - 1][nextAce][0],
                                                 dpp[next][c - 1][nextAce][1]);
                }

                dpp[s][c - 1][ace][0] = stopValue;
                dpp[s][c - 1][ace][1] = drawValue;
                maction[s][c - 1][ace] = (stopValue >= drawValue) ? 0 : 1;
            }
        }
    }
}
/**
 * Runs the dealer pass and prints its outcome table for dealer values 1..10,
 * first for ace flag 0 and then for ace flag 1; then runs the player pass and
 * prints the chosen action (0 or 1) for every state, again for ace flag 0 and
 * then 1, with player sum index 9 on the first row and 0 on the last.
 */
int main()
{
    const char* const probTitle[2] = {"without ace prob\n", "with ace prob\n"};
    const char* const policyTitle[2] = {"without ace policy\n", "with ace policy\n"};

    dpdealer();
    for (int ace = 0; ace < 2; ++ace)
    {
        printf("%s", probTitle[ace]);
        for (int start = 1; start <= 10; ++start)
        {
            // Columns: outcome indices 0..5 of the dealer table.
            for (int outcome = 0; outcome <= 5; ++outcome)
            {
                printf("%.2f ", dp[start][outcome][ace]);
            }
            printf("\n");
        }
    }

    dpplayer();
    printf("\n\n");
    for (int ace = 0; ace < 2; ++ace)
    {
        printf("%s", policyTitle[ace]);
        for (int sumIdx = 9; sumIdx >= 0; --sumIdx)
        {
            // Columns: dealer card columns 0..9.
            for (int col = 0; col <= 9; ++col)
            {
                printf("%d ", maction[sumIdx][col][ace]);
            }
            printf("\n");
        }
    }
    return 0;
}
运行结果如下:
和标准的解相同:
这个项目教会了大家怎么和机器人玩21点