First, let's load some libraries.
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import requests
%matplotlib inline
A diagonal-covariance Gaussian hidden Markov model (an HMM whose observation probabilities are Gaussians with diagonal covariance matrices) is defined by four sets of parameters: $$\pi_i=\Pr\left\{q_0=i\right\}$$ $$a_{ij}=\Pr\left\{q_{t+1}=j|q_t=i\right\}$$ $$\mu_{di}=E\left[x_{dt}|q_t=i\right]$$ $$\sigma_{di}^2=E\left[(x_{dt}-\mu_{di})^2|q_t=i\right],\qquad \Sigma_i=\textrm{diag}\left(\sigma_{1i}^2,\ldots,\sigma_{Di}^2\right)$$ In this lecture I will generate data using the true values of each parameter, and then try to recognize the data using estimated values of each parameter. Let's use an HMM with three emitting states (0, 1, 2), plus a non-emitting final state (3) that ends the sequence.
states = [0,1,2,3]
N = len(states)-1 # Define N as the number of emitting states
pi_true = [1,0,0,0]
A_true = [[0.8,0.2,0,0],[0,0.8,0.2,0],[0,0,0.8,0.2]]
mu_true = [[2,2,2,2,0,0,0,0,0,0,0,0],[0,0,0,0,2,2,2,2,0,0,0,0],[0,0,0,0,0,0,0,0,2,2,2,2]]
sigsq_true = [[1,1,1,1,1,1,1,1,1,1,1,1],[2,2,2,2,2,2,2,2,2,2,2,2],[1,1,1,1,1,1,1,1,1,1,1,1]]
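As a quick sanity check on these numbers: each emitting state has self-loop probability $a_{ii}=0.8$, so its expected duration is $$E\left[\textrm{duration of state }i\right]=\frac{1}{1-a_{ii}}=\frac{1}{0.2}=5\textrm{ frames},$$ which means a generated file will be roughly $3\times 5=15$ frames long on average.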
Now let's run the HMM as a generator: draw an initial state $q$ from $\pi$, generate an observation vector $\vec{x}$ from that state, draw the next state, and repeat until we reach the non-emitting final state $q=3$, at which point we stop.
Q = list(np.random.choice(states,1,p=pi_true))   # draw the initial state from pi
q = Q[-1]
X = []
while q < 3:                                     # state 3 is the non-emitting final state
    xt = stats.multivariate_normal.rvs(mean=mu_true[q],cov=np.diag(sigsq_true[q]))
    X.append(xt)
    Q.extend(np.random.choice(states,1,p=A_true[q]))   # draw the next state
    q = Q[-1]
plt.figure(figsize=(15,5))
plt.subplot(211)
plt.imshow(np.transpose(X))
plt.title('Generated feature vectors')
plt.subplot(212)
plt.plot(Q)
plt.xlabel('Frames')
plt.title('True State Sequence')
Now let's create initial parameter estimates. We'll just divide the training example into thirds: frames in the first third initialize state 0's Gaussian, and so on. The initial transition matrix gives each state an exit probability of $1/d$, so that its expected duration is $d=T/3$ frames.
pi=[1,0,0,0]
T=len(X)
d=int(T/3)   # initial guess: each state lasts about one third of the file
a=1/d        # exit probability chosen so the expected state duration is d frames
A=[[1-a,a,0,0],[0,1-a,a,0],[0,0,1-a,a]]
mu=[np.average(X[0:d],axis=0),np.average(X[d:2*d],axis=0),np.average(X[2*d:T],axis=0)]
Sigma=[np.cov(X[0:d],rowvar=False)+0.2*np.identity(12),
       np.cov(X[d:2*d],rowvar=False)+0.2*np.identity(12),
       np.cov(X[2*d:T],rowvar=False)+0.2*np.identity(12)]   # regularize so each Sigma is invertible
plt.figure(figsize=(15,5))
plt.subplot(211)
plt.plot(np.transpose(mu))
plt.legend(['state 0','state 1','state 2'])
plt.title('Initial estimated HMM means')
plt.subplot(212)
plt.plot(np.transpose([ np.diag(S) for S in Sigma ]))
plt.title('Initial estimated HMM variances')
plt.xlabel('Feature dimension')
Now we'll compute the forward-backward algorithm using these initial estimates. First we compute B, a matrix whose $(i,t)^{\textrm{th}}$ element is $p(\vec{x}_t|q_t=i)$.
$$b_i(\vec{x}_t)=p(\vec{x}_t|q_t=i)=\frac{1}{(2\pi)^{D/2}|\Sigma_i|^{1/2}}e^{-\frac{1}{2}(\vec{x}_t-\vec\mu_{i})^T\Sigma_i^{-1}(\vec{x}_t-\vec\mu_i)}$$ For diagonal-covariance Gaussians, that factors into a product over dimensions: $$b_i(\vec{x}_t)=\prod_{d=1}^{12} \frac{1}{\sqrt{2\pi\sigma_{di}^2}}e^{-\frac{1}{2}\left(\frac{x_{dt}-\mu_{di}}{\sigma_{di}}\right)^2}$$ Actually, I'm not sure whether full-covariance or diagonal-covariance Gaussians give better results for this assignment. Full covariance is a more accurate model, but it sometimes over-trains.
B=np.zeros((N,T))
for t in range(0,T):
    for i in range(0,N):
        B[i,t]=stats.multivariate_normal(mu[i],Sigma[i]).pdf(X[t])
plt.figure(figsize=(15,5))
plt.imshow(np.log(B))
plt.title('Log probability of the observation given the state')
plt.xlabel('Frame number')
plt.ylabel('State number')
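For comparison, here is a minimal sketch of the diagonal-covariance version of the same computation, which keeps only the diagonal of each estimated covariance matrix (`B_diag` is just an illustrative name and isn't used below):
B_diag = np.zeros((N,T))
for t in range(0,T):
    for i in range(0,N):
        # Diagonal-covariance likelihood: product over the 12 dimensions of univariate Gaussian pdfs,
        # using only the per-dimension variances from the diagonal of Sigma[i].
        sigsq = np.diag(Sigma[i])
        B_diag[i,t] = np.prod(stats.norm.pdf(X[t],loc=mu[i],scale=np.sqrt(sigsq)))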
Now we compute alpha, beta, gamma, and xi. $$\alpha_0(i)=\Pr\left\{q_0=i,\vec{x}_0\right\}=\pi_i b_i(\vec{x}_0)$$ $$\alpha_t(i)=\Pr\left\{q_t=i,\vec{x}_0,\ldots,\vec{x}_t\right\}=b_i(\vec{x}_t)\sum_{j=0}^{N-1}\alpha_{t-1}(j)a_{ji}$$ $$\beta_{T-1}(i)=1$$ $$\beta_{t}(i)=\Pr\left\{\vec{x}_{t+1},\ldots,\vec{x}_{T-1}|q_t=i\right\}=\sum_{j=0}^{N-1}a_{ij}b_j(\vec{x}_{t+1})\beta_{t+1}(j)$$ $$\gamma_t(i)=\Pr\left\{q_t=i|\vec{x}_0,\ldots,\vec{x}_{T-1}\right\}=\frac{\alpha_t(i)\beta_t(i)}{\sum_{j=0}^{N-1}\alpha_t(j)\beta_t(j)}$$ $$\xi_t(i,j)=\Pr\left\{q_t=i,q_{t+1}=j|\vec{x}_0,\ldots,\vec{x}_{T-1}\right\}=\frac{\alpha_t(i)a_{ij}b_j(\vec{x}_{t+1})\beta_{t+1}(j)}{\sum_{i=0}^{N-1}\sum_{j=0}^{N-1}\alpha_t(i)a_{ij}b_j(\vec{x}_{t+1})\beta_{t+1}(j)}$$
Notice that, since $a_{ij}=0$ except for $j\in\left\{i,i+1\right\}$, we also have $\xi_t(i,j)=0$ except for $j\in\left\{i,i+1\right\}$. We can therefore save a little space by computing and storing $\xi_t(i,j)$ only for those two possibilities; in the code below, row $i+j$ of the `xi` array holds the transition $i\rightarrow j$.
alpha = np.zeros((N,T))
beta = np.zeros((N,T))
gamma = np.zeros((N,T))
xi = np.zeros((2*N,T))   # row i+j holds xi_t(i,j), for j in {i, i+1}
Amat = np.array(A)       # Convert to a NumPy array so we can compute inner products
for i in range(0,N):
    alpha[i,0]=pi[i]*B[i,0]
for t in range(1,T):
    for i in range(0,N):
        alpha[i,t]=B[i,t]*np.inner(alpha[:,t-1],Amat[:,i])
for i in range(0,N):
    beta[i,T-1]=1
for t in range(T-2,-1,-1):
    for i in range(0,N):
        beta[i,t]=np.inner(Amat[i,0:N],beta[:,t+1]*B[:,t+1])
for t in range(0,T):
    gamma[:,t]=alpha[:,t]*beta[:,t]
    gamma[:,t]=gamma[:,t]/np.sum(gamma[:,t])
for t in range(0,T):
    for i in range(0,N):
        for j in range(i,i+2):
            xi[i+j,t]=alpha[i,t]*Amat[i,j]
            if (t<T-1):
                if j==N:
                    xi[i+j,t]=0   # can't enter the final state before the last frame
                else:
                    xi[i+j,t] = xi[i+j,t]*B[j,t+1]*beta[j,t+1]
    xi[:,t]=xi[:,t]/np.sum(xi[:,t])
plt.figure(figsize=(15,5))
plt.subplot(311)
plt.plot(range(0,T),np.transpose(np.log(alpha)),range(0,T),np.transpose(np.log(beta)))
plt.title('Log Alpha and Log Beta')
plt.legend(['state 0','state 1','state 2'])
plt.subplot(312)
plt.plot(np.transpose(gamma))
plt.title('Gamma')
plt.subplot(313)
plt.plot(np.transpose(xi))
plt.legend(['0->0','0->1','1->1','1->2','2->2','2->3'])
plt.title('Xi')
plt.xlabel('Frame index, t')
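Before re-estimating, here's a quick sanity check (not needed for anything later): each column of gamma should sum to one, and for $t<T-1$ the two stored entries of xi for state $i$ should sum to $\gamma_t(i)$, since $\sum_j\xi_t(i,j)=\gamma_t(i)$.
# Sanity checks on gamma and xi: columns of gamma sum to 1, and for t < T-1 the two stored
# transitions out of state i (rows 2i and 2i+1 of xi) sum to gamma[i,t].
print('column sums of gamma:', np.sum(gamma,axis=0))
xi_rowsums = np.array([[xi[2*i,t]+xi[2*i+1,t] for t in range(0,T-1)] for i in range(0,N)])
print('max |xi row-sum minus gamma| for t<T-1:', np.max(np.abs(xi_rowsums-gamma[:,0:T-1])))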
If you have multiple training files, you compute $\gamma_{t\ell}(i)$ and $\xi_{t\ell}(i,j)$ for the $t^{\textrm{th}}$ frame of the $\ell^{\textrm{th}}$ file, and then accumulate them in the re-estimation formulas: $$a_{ij}=\frac{\sum_\ell\sum_t \xi_{t\ell}(i,j)}{\sum_\ell\sum_t\gamma_{t\ell}(i)}\approx \Pr\left\{q_{t+1}=j|q_t=i\right\}$$ $$\vec\mu_i=\frac{\sum_\ell\sum_t\gamma_{t\ell}(i)\vec{x}_{t\ell}}{\sum_\ell\sum_t\gamma_{t\ell}(i)}\approx E\left[\vec{x}_{t}|q_t=i\right]$$ $$\sigma_{di}^2=\frac{\sum_\ell\sum_t\gamma_{t\ell}(i)(x_{dt\ell}-\mu_{di})^2}{\sum_\ell\sum_t\gamma_{t\ell}(i)}\approx E\left[(x_{dt}-\mu_{di})^2|q_t=i\right]$$ Here we have only one training file, so the sums over $\ell$ disappear.
for i in range(0,N):
    for j in range(i,i+2):
        A[i][j]=np.sum(xi[i+j,:])/np.sum(gamma[i,:])
for i in range(0,N):
    mu[i] = np.inner(np.transpose(X),gamma[i,:])/np.sum(gamma[i,:])
for i in range(0,N):
    Sigma[i] = np.zeros((12,12))
    for t in range(0,len(X)):
        Sigma[i] += gamma[i,t]*np.outer(X[t]-mu[i],X[t]-mu[i])
    Sigma[i] = Sigma[i]/np.sum(gamma[i,:]) + 0.2*np.identity(12)   # normalize, then regularize
plt.figure(figsize=(15,5))
plt.subplot(311)
plt.plot(np.transpose(A))
plt.title('Re-estimated Transition Probabilities')
plt.subplot(312)
plt.plot(np.transpose(mu))
plt.title('Re-estimated Gaussian Means')
plt.legend(['state 0','state 1','state 2'])
plt.subplot(313)
plt.plot(np.transpose([np.diag(S) for S in Sigma]))
plt.title('Re-estimated Gaussian Variances')
plt.xlabel('Feature Dimension')
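If you had several training files, you would accumulate the numerators and denominators of the formulas above across all files before dividing. Here is a minimal sketch, assuming each file's `X`, `gamma`, and `xi` have already been computed as above; `files`, `A_multi`, and `mu_multi` are illustrative names, with our single file this reduces to the update we just did, and the variances would be accumulated the same way.
# Multi-file M-step sketch: sum numerators and denominators over files, then divide once.
files = [{'X': np.array(X), 'gamma': gamma, 'xi': xi}]   # here: just our one training example
num_A  = np.zeros((N,N+1))   # numerators of a_ij (one extra column for the final state)
num_mu = np.zeros((N,12))    # numerators of mu_i
den    = np.zeros(N)         # shared denominator: sum of gamma over frames and files
for f in files:
    den    += np.sum(f['gamma'],axis=1)
    num_mu += f['gamma'] @ f['X']            # (N,T) @ (T,12) -> (N,12)
    for i in range(0,N):
        for j in range(i,i+2):
            num_A[i,j] += np.sum(f['xi'][i+j,:])
A_multi  = num_A/den[:,None]
mu_multi = num_mu/den[:,None]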
In real life, we would iterate the algorithm several times, alternating between the E-step (computing gamma and xi) and the M-step (re-estimating A, mu, and Sigma) until the log likelihood stops improving. We won't do that today.
You might have noticed that $\alpha_t(i)$ keeps getting smaller and smaller as you go further to the right. That's because each frame multiplies in another Gaussian pdf value $b_i(\vec{x}_t)$, which can be either bigger or smaller than 1.0, but for a 12-dimensional observation vector it's usually much smaller.
Suppose that $b_j(\vec{x}_t)$ is typically around $0.001$. Then $\alpha_t(i)\sim 10^{-3t}\approx 2^{-10t}$. Standard IEEE double-precision floating point can represent normalized numbers only down to about $2^{-1022}$, which means that you can compute the forward-backward algorithm for an audio file of at most $T\le 1022/10\approx 102$ frames. The problem is much worse on fixed-point embedded processors (obviously).
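You can watch the underflow happen; this little loop just multiplies a typical per-frame likelihood into an accumulator until double precision gives up:
# Demonstrate underflow: repeatedly multiply by a typical per-frame likelihood of 0.001
# until the running product rounds to exactly 0.0 in double precision.
p = 1.0
t = 0
while p > 0.0:
    p *= 0.001
    t += 1
print('the product underflowed to 0.0 after', t, 'frames')   # a little over 100; subnormals buy a few extra frames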
The solution is to scale $\alpha_t(i)$ and $\beta_t(i)$. As long as we scale them by the same amount, we can compute $\gamma_t(i)$ without any special re-normalizing. It works like this: $$\tilde{\alpha}_t(i)=\frac{\alpha_t(i)}{\prod_{\tau=0}^t g_\tau}$$ $$\tilde{\beta}_t(i)=\frac{\beta_t(i)}{\prod_{\tau=t+1}^{T-1} g_\tau}$$ $$\gamma_t(i)=\frac{\alpha_t(i)\beta_t(i)}{\sum_{j=0}^{N-1}\alpha_t(j)\beta_t(j)}$$ $$=\frac{\frac{\alpha_t(i)\beta_t(i)}{\prod_{\tau=0}^{T-1} g_\tau}}{\frac{\sum_{j=0}^{N-1}\alpha_t(j)\beta_t(j)}{\prod_{\tau=0}^{T-1} g_\tau}}$$ $$=\frac{\tilde{\alpha}_t(i)\tilde{\beta}_t(i)}{\sum_{j=0}^{N-1}\tilde{\alpha}_t(j)\tilde{\beta}_t(j)}$$
So any scaling constant $g_t$ will work, AS LONG AS YOU USE THE SAME $g_t$ FOR BOTH ALPHA AND BETA!!!
Here's a pretty reasonable choice (starting the recursion with $\bar\alpha_0(i)=\pi_i b_i(\vec{x}_0)$): $$\bar\alpha_t(i)=b_i(\vec{x}_t)\sum_{j=0}^{N-1}\tilde{\alpha}_{t-1}(j)a_{ji}$$ $$g_t=\sum_{i=0}^{N-1}\bar\alpha_t(i)$$ $$\tilde\alpha_t(i) = \frac{1}{g_t}\bar\alpha_t(i)$$
If you work through it, you discover that with this choice $\sum_i\tilde\alpha_t(i)=1$ at every frame, so $\prod_{\tau=0}^{t}g_\tau=\sum_{j=0}^{N-1}\alpha_t(j)$ for every $t$; in particular, $$\prod_{t=0}^{T-1} g_t = \sum_{j=0}^{N-1}\alpha_{T-1}(j)$$
But you might remember that $\sum_{j=0}^{N-1}\alpha_{T-1}(j)$ is actually the probability of the data sequence, $X$, given the model parameters $\Lambda$. So $$p(X|\Lambda)=\sum_{j=0}^{N-1}\alpha_{T-1}(j)=\prod_{t=0}^{T-1} g_t$$ $$\ln p(X|\Lambda)=\sum_{t=0}^{T-1}\ln g_t$$
So we never need to form the product of the $g_t$ explicitly; we just store $\ln g_t$ and add them up.
tildealpha=np.zeros((N,T))
tildebeta=np.zeros((N,T))
log_g = np.zeros((T))
baralpha = np.zeros((N,T))
Amat = np.array(A)   # note: A was re-estimated above, while B still uses the initial mu and Sigma
for i in range(0,N):
    baralpha[i,0]=pi[i]*B[i,0]
log_g[0] = np.log(np.sum(baralpha[:,0]))
tildealpha[:,0]=baralpha[:,0]/np.exp(log_g[0])
for t in range(1,T):
    for i in range(0,N):
        baralpha[i,t]=B[i,t]*np.inner(tildealpha[:,t-1],Amat[:,i])
    log_g[t] = np.log(np.sum(baralpha[:,t]))
    tildealpha[:,t]=baralpha[:,t]/np.exp(log_g[t])
for i in range(0,N):
    tildebeta[i,T-1] = 1/np.exp(log_g[T-1])   # any per-frame constant works here; it cancels when gamma is normalized
for t in range(T-2,-1,-1):
    for i in range(0,N):
        tildebeta[i,t]=np.inner(Amat[i,0:N],tildebeta[:,t+1]*B[:,t+1])/np.exp(log_g[t+1])
for t in range(0,T):
    gamma[:,t] = tildealpha[:,t]*tildebeta[:,t]
    gamma[:,t] = gamma[:,t]/np.sum(gamma[:,t])
plt.figure(figsize=(15,5))
plt.subplot(311)
plt.plot(log_g)
plt.title('Log Gain at each Frame')
plt.subplot(312)
plt.plot(np.transpose(tildealpha))
plt.title('Tilde-Alpha')
plt.legend(['state 0','state 1','state 2'])
plt.subplot(313)
plt.plot(np.transpose(gamma))
plt.title('Gamma')
plt.xlabel('Frame Index')
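As a quick check of the identity $\ln p(X|\Lambda)=\sum_t\ln g_t$, we can re-run the plain (unscaled) forward recursion with the same `Amat` and `B` that the scaled recursion just used, and compare; for a file this short the unscaled version hasn't underflowed yet (`alpha_check` is just an illustrative name):
# ln p(X|Lambda) two ways: from the accumulated log gains, and from an unscaled forward pass
# run with exactly the same pi, Amat, and B as the scaled recursion above.
alpha_check = np.zeros((N,T))
alpha_check[:,0] = np.array(pi[0:N])*B[:,0]
for t in range(1,T):
    for i in range(0,N):
        alpha_check[i,t] = B[i,t]*np.inner(alpha_check[:,t-1],Amat[:,i])
print('sum of log g_t:          ', np.sum(log_g))
print('log sum_j alpha_{T-1}(j):', np.log(np.sum(alpha_check[:,T-1])))
The two printed numbers should agree to within floating-point precision.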