原创: hxj7
本文介绍了如何利用后验概率进行解码,可称为后验解码。
前文《序列比对(12)计算后验概率》介绍了如何计算某一位置可能状态的后验概率。那么可以据此找到某一位置最有可能的状态。即
从上面的公式可以看出,有两种方法求解某一位置的最可能状态。如果是依据公式(1),先计算出后验概率,然后找到其中最大后验概率对应的状态;如果是依据公式(3),无需计算后验概率,比较简单。
本文所用代码采用了公式(1)的方法,虽然稍嫌麻烦,但是可以在原先的代码基础上稍加增改即可。运行效果如下:
具体代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
//#define MIN_LOG_VALUE -15
//#define SAFE_EXP(x) ((x) < MIN_LOG_VALUE ? 0 : exp(x))
typedef char State;
typedef char Result;
State state[] = {'F', 'L'}; // 所有的可能状态
Result result[] = {'1', '2', '3', '4', '5', '6'}; // 所有的可能符号
double init[] = {0.9, 0.1}; // 初始状态的概率向量
double emission[][6] = { // 发射矩阵:行对应着状态,列对应着符号
1.0/6, 1.0/6, 1.0/6, 1.0/6, 1.0/6, 1.0/6,
0.1, 0.1, 0.1, 0.1, 0.1, 0.5
};
double trans[][2] = { // 转移矩阵:行和列都是状态
0.95, 0.05,
0.1, 0.9
};
const int nstate = 2;
const int nresult = 6;
double** fscore; // 前向算法的得分矩阵
double** bscore; // 后向算法的得分矩阵
double* scale; // 缩放因子向量
double logScaleSum;
int random(double* prob, const int n);
void randSeq(State* st, Result* res, const int n);
int getResultIndex(Result r);
void printState(State* st, const int n);
void printResult(Result* res, const int n);
double forward(Result* res, const int n);
double backward(Result* res, const int n);
double** getPostProb(const int n);
void postDecode(double** prob, const int n);
int main(void) {
int i;
int n = 5;
State* rst; // 一串随机状态序列
Result* rres; // 一串随机符号序列
double** postProb;
if ((rst = (State*) malloc(sizeof(State) * n)) == NULL || \
(rres = (Result*) malloc(sizeof(Result) * n)) == NULL || \
(scale = (double*) malloc(sizeof(double) * n)) == NULL || \
(fscore = (double**) malloc(sizeof(double*) * nstate)) == NULL || \
(bscore = (double**) malloc(sizeof(double*) * nstate)) == NULL) {
fputs("Error: out of space!\n", stderr);
exit(1);
}
for (i = 0; i < nstate; i++) {
if ((fscore[i] = (double*) malloc(sizeof(double) * n)) == NULL || \
(bscore[i] = (double*) malloc(sizeof(double) * n)) == NULL) {
fputs("Error: out of space!\n", stderr);
exit(1);
}
}
randSeq(rst, rres, n);
printState(rst, n);
printResult(rres, n);
forward(rres, n);
backward(rres, n);
postProb = getPostProb(n);
postDecode(postProb, n);
free(rst);
free(rres);
free(scale);
free(fscore);
free(bscore);
for (i = 0; i < nstate; i++)
free(postProb[i]);
free(postProb);
}
// 根据一个概率向量从0到n-1随机抽取一个数
int random(double* prob, const int n) {
int i;
double p = rand() / 1.0 / (RAND_MAX + 1);
for (i = 0; i < n - 1; i++) {
if (p <= prob[i])
break;
p -= prob[i];
}
return i;
}
// 根据转移矩阵和发射矩阵生成一串随机状态和符号
void randSeq(State* st, Result* res, const int n) {
int i, ls, lr;
srand((unsigned int) time(NULL));
ls = random(init, nstate);
lr = random(emission[ls], nresult);
st[0] = state[ls];
res[0] = result[lr];
for (i = 1; i < n; i++) {
ls = random(trans[ls], nstate);
lr = random(emission[ls], nresult);
st[i] = state[ls];
res[i] = result[lr];
}
}
int getResultIndex(Result r) {
return r - result[0];
}
// 前向算法计算P(x)
double forward(Result* res, const int n) {
int i, l, k, idx;
double logpx;
// 缩放因子向量初始化
for (i = 0; i < n; i++)
scale[i] = 0;
// 计算第0列分值
idx = getResultIndex(res[0]);
for (l = 0; l < nstate; l++) {
fscore[l][0] = emission[l][idx] * init[l];
scale[0] += fscore[l][0];
}
for (l = 0; l < nstate; l++)
fscore[l][0] /= scale[0];
// 计算从第1列开始的各列分值
for (i = 1; i < n; i++) {
idx = getResultIndex(res[i]);
for (l = 0; l < nstate; l++) {
fscore[l][i] = 0;
for (k = 0; k < nstate; k++) {
fscore[l][i] += fscore[k][i - 1] * trans[k][l];
}
fscore[l][i] *= emission[l][idx];
scale[i] += fscore[l][i];
}
for (l = 0; l < nstate; l++)
fscore[l][i] /= scale[i];
}
// P(x) = product(scale)
// P(x)就是缩放因子向量所有元素的乘积
logpx = 0;
for (i = 0; i < n; i++)
logpx += log(scale[i]);
printf("forward: logP(x) = %f\n", logpx);
logScaleSum = logpx;
/*
for (l = 0; l < nstate; l++) {
for (i = 0; i < n; i++)
printf("%f ", fscore[l][i]);
printf("\n");
}
*/
return exp(logpx);
}
// 后向算法计算P(x)
// backward算法中使用的缩放因子和forward中的一样
double backward(Result* res, const int n) {
int i, l, k, idx;
double tx, logpx;
// 计算最后一列分值
for (l = 0; l < nstate; l++)
bscore[l][n - 1] = 1 / scale[n - 1];
// 计算从第n - 2列开始的各列分值
for (i = n - 2; i >= 0; i--) {
idx = getResultIndex(res[i + 1]);
for (k = 0; k < nstate; k++) {
bscore[k][i] = 0;
for (l = 0; l < nstate; l++) {
bscore[k][i] += bscore[l][i + 1] * trans[k][l] * emission[l][idx];
}
}
for (l = 0; l < nstate; l++)
bscore[l][i] /= scale[i];
}
// 计算P(x)
tx = 0;
idx = getResultIndex(res[0]);
for (l = 0; l < nstate; l++)
tx += init[l] * emission[l][idx] * bscore[l][0];
logpx = log(tx) + logScaleSum;
printf("backward: logP(x) = %f\n", logpx);
/*
for (l = 0; l < nstate; l++) {
for (i = 0; i < n; i++)
printf("%f ", bscore[l][i]);
printf("\n");
}
*/
return exp(logpx);
}
// 计算后验概率
double** getPostProb(const int n) {
int i, k;
double** postProb;
//double logdiff;
if ((postProb = (double**) malloc(sizeof(double*) * nstate)) == NULL) {
fputs("Error: out of space!\n", stderr);
exit(1);
}
for (k = 0; k < nstate; k++) {
if ((postProb[k] = (double*) malloc(sizeof(double) * n)) == NULL) {
fputs("Error: out of space!\n", stderr);
exit(1);
}
}
// 计算后验概率
for (i = 0; i < n; i++) {
for (k = 0; k < nstate; k++) {
postProb[k][i] = scale[i] * fscore[k][i] * bscore[k][i];
}
}
printf("\n");
printf("Posterior Probabilities:\n");
for (k = 0; k < nstate; k++) {
for (i = 0; i < n; i++)
printf("%f ", postProb[k][i]);
printf("\n");
}
return postProb;
}
void postDecode(double** prob, const int n) {
int i, k;
double maxCol;
int idx;
State* st;
if ((st = (State*) malloc(sizeof(State) * n)) == NULL) {
fputs("Error: out of space!\n", stderr);
exit(1);
}
for (i = 0; i < n; i++) {
idx = 0;
maxCol = prob[0][i];
for (k = 1; k < nstate; k++)
if (prob[k][i] > maxCol) {
maxCol = prob[k][i];
idx = k;
}
st[i] = state[idx];
}
printf("\n");
printf("Posterior Decode:\n");
printState(st, n);
free(st);
}
void printState(State* st, const int n) {
int i;
for (i = 0; i < n; i++)
printf("%c", st[i]);
printf("\n");
}
void printResult(Result* res, const int n) {
int i;
for (i = 0; i < n; i++)
printf("%c", res[i]);
printf("\n");
}
(公众号:生信了)