MP4: Hidden Markov Models

In this lab, you'll train and test hidden Markov models in order to find phoneme segment boundaries in a large speech corpus.

The data for this MP are drawn from the LJSpeech corpus. Transcriptions have been converted to phonemes using phonemizer. Waveforms have been converted to MFCC using librosa.feature.mfcc, with settings of hop_length=10ms, win_length=25ms, n_mfcc=16, fmax=8000.

Warning: This Jupyter notebook will use librosa extensively, but the autograder will not have librosa. All of the code in your submitted.py should use numpy only, not librosa.

  1. Transition Posterior Probabilities
  2. reestimate: find model parameters given transition posteriors
  3. Calculate the observation pdf
  4. Viterbi Training
  5. Show segmentations
  6. Extra Credit: Recognize Speech

Part 0: Loading the MFCC and transcripts.

The transcripts are provided to you in the file transcripts.json.

In [1]:
import json, random

with open('transcripts.json') as f:
    transcripts=json.load(f)

print(transcripts.keys())
dict_keys(['train', 'dev', 'eval', 'phoneset', 'senoneset'])
In [2]:
print('Training transcripts are available for %d files'%(len(transcripts['train'])))
k0 = list(transcripts['train'].keys())[0]
print('The first file is',k0)
print('Its word transcript is:',transcripts['train'][k0]['word'])
print('Its phone transcript is:',transcripts['train'][k0]['phone'])
print('Its senone transcript is:',transcripts['train'][k0]['senone'])
Training transcripts are available for 1000 files
The first file is LJ001-0001
Its word transcript is: Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition
Its phone transcript is: pɹɪntɪŋ, ɪnðɪ oʊnli sɛns wɪð wɪtʃ wi ɑɹ æt pɹɛzənt kənsɜnd, dɪfɚz fɹʌm moʊst ɪf nɑt fɹʌm ɔl ðɪ ɑɹts ænd kɹæfts ɹɛpɹɪzɛntᵻd ɪnðɪ ɛksɪbɪʃən
Its senone transcript is: $1 $2 $3 p1 p2 p3 ɹ1 ɹ2 ɹ3 ɪ1 ɪ2 ɪ3 n1 n2 n3 t1 t2 t3 ɪ1 ɪ2 ɪ3 ŋ1 ŋ2 ŋ3 $1 $2 $3 ɪ1 ɪ2 ɪ3 n1 n2 n3 ð1 ð2 ð3 ɪ1 ɪ2 ɪ3 # o1 o2 o3 ʊ1 ʊ2 ʊ3 n1 n2 n3 l1 l2 l3 i1 i2 i3 # s1 s2 s3 ɛ1 ɛ2 ɛ3 n1 n2 n3 s1 s2 s3 # w1 w2 w3 ɪ1 ɪ2 ɪ3 ð1 ð2 ð3 # w1 w2 w3 ɪ1 ɪ2 ɪ3 t1 t2 t3 ʃ1 ʃ2 ʃ3 # w1 w2 w3 i1 i2 i3 # ɑ1 ɑ2 ɑ3 ɹ1 ɹ2 ɹ3 # æ1 æ2 æ3 t1 t2 t3 # p1 p2 p3 ɹ1 ɹ2 ɹ3 ɛ1 ɛ2 ɛ3 z1 z2 z3 ə1 ə2 ə3 n1 n2 n3 t1 t2 t3 # k1 k2 k3 ə1 ə2 ə3 n1 n2 n3 s1 s2 s3 ɜ1 ɜ2 ɜ3 n1 n2 n3 d1 d2 d3 $1 $2 $3 d1 d2 d3 ɪ1 ɪ2 ɪ3 f1 f2 f3 ɚ1 ɚ2 ɚ3 z1 z2 z3 # f1 f2 f3 ɹ1 ɹ2 ɹ3 ʌ1 ʌ2 ʌ3 m1 m2 m3 # m1 m2 m3 o1 o2 o3 ʊ1 ʊ2 ʊ3 s1 s2 s3 t1 t2 t3 # ɪ1 ɪ2 ɪ3 f1 f2 f3 # n1 n2 n3 ɑ1 ɑ2 ɑ3 t1 t2 t3 # f1 f2 f3 ɹ1 ɹ2 ɹ3 ʌ1 ʌ2 ʌ3 m1 m2 m3 # ɔ1 ɔ2 ɔ3 l1 l2 l3 # ð1 ð2 ð3 ɪ1 ɪ2 ɪ3 # ɑ1 ɑ2 ɑ3 ɹ1 ɹ2 ɹ3 t1 t2 t3 s1 s2 s3 # æ1 æ2 æ3 n1 n2 n3 d1 d2 d3 # k1 k2 k3 ɹ1 ɹ2 ɹ3 æ1 æ2 æ3 f1 f2 f3 t1 t2 t3 s1 s2 s3 # ɹ1 ɹ2 ɹ3 ɛ1 ɛ2 ɛ3 p1 p2 p3 ɹ1 ɹ2 ɹ3 ɪ1 ɪ2 ɪ3 z1 z2 z3 ɛ1 ɛ2 ɛ3 n1 n2 n3 t1 t2 t3 ᵻ1 ᵻ2 ᵻ3 d1 d2 d3 # ɪ1 ɪ2 ɪ3 n1 n2 n3 ð1 ð2 ð3 ɪ1 ɪ2 ɪ3 # ɛ1 ɛ2 ɛ3 k1 k2 k3 s1 s2 s3 ɪ1 ɪ2 ɪ3 b1 b2 b3 ɪ1 ɪ2 ɪ3 ʃ1 ʃ2 ʃ3 ə1 ə2 ə3 n1 n2 n3 $1 $2 $3

The data provided for each file include:

  • An ID such as LJ001-0001. The MFCC for this file will have the same ID. The waveform name in the original corpus, if you want to listen, is LJ001-0001.wav. The original corpus contains 13,100 files, of which our data distribution contains only 1500: 1000 training files, 250 dev files, and 250 eval files.
  • A word transcript, giving the words the narrator read, all in a single string, separated by spaces and punctuation.
  • A phone transcript, showing the phonemes that make up those words, all in a single string. These are written using the symbols of the International Phonetic Alphabet (IPA), with punctuation and spacing copied from the text.
  • A senone transcript, showing the sequence of HMM states (senones) making up those phonemes, all in a single string, separated by spaces (so you need to use transcripts['train'][k0]['senone'].split() to get a list). In general, there are three senones per phoneme, modeling the beginning, middle, and end of the phoneme. Word boundaries are marked with #. The longer silences implied by punctuation, and by the beginning and ending of each file, are marked with the three-senone sequence $1,$2,$3.

The goal of this MP is to develop an HMM in which every senone is a state, train the senone models, and then use the trained models to find the time-alignment of phones to the waveform.

The list of all distinct phones is provided as transcripts['phoneset']. The list of all distinct senones is provided as transcripts['senoneset'].

In [7]:
phones = transcripts['phoneset'].split()
print('There are %d distinct phones:'%(len(phones)))
print(phones)
print('')
senones = transcripts['senoneset'].split()
print('There are %d distinct senones:'%(len(senones)))
print(senones)
There are 46 distinct phones:
['#', '$', 'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'z', 'æ', 'ð', 'ŋ', 'ɐ', 'ɑ', 'ɔ', 'ə', 'ɚ', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɫ', 'ɹ', 'ɾ', 'ʃ', 'ʊ', 'ʌ', 'ʒ', 'ʔ', 'θ', 'ᵻ']

There are 136 distinct senones:
['#', '$1', '$2', '$3', 'a1', 'a2', 'a3', 'b1', 'b2', 'b3', 'd1', 'd2', 'd3', 'e1', 'e2', 'e3', 'f1', 'f2', 'f3', 'h1', 'h2', 'h3', 'i1', 'i2', 'i3', 'j1', 'j2', 'j3', 'k1', 'k2', 'k3', 'l1', 'l2', 'l3', 'm1', 'm2', 'm3', 'n1', 'n2', 'n3', 'o1', 'o2', 'o3', 'p1', 'p2', 'p3', 'r1', 'r2', 'r3', 's1', 's2', 's3', 't1', 't2', 't3', 'u1', 'u2', 'u3', 'v1', 'v2', 'v3', 'w1', 'w2', 'w3', 'x1', 'x2', 'x3', 'z1', 'z2', 'z3', 'æ1', 'æ2', 'æ3', 'ð1', 'ð2', 'ð3', 'ŋ1', 'ŋ2', 'ŋ3', 'ɐ1', 'ɐ2', 'ɐ3', 'ɑ1', 'ɑ2', 'ɑ3', 'ɔ1', 'ɔ2', 'ɔ3', 'ə1', 'ə2', 'ə3', 'ɚ1', 'ɚ2', 'ɚ3', 'ɛ1', 'ɛ2', 'ɛ3', 'ɜ1', 'ɜ2', 'ɜ3', 'ɡ1', 'ɡ2', 'ɡ3', 'ɪ1', 'ɪ2', 'ɪ3', 'ɫ1', 'ɫ2', 'ɫ3', 'ɹ1', 'ɹ2', 'ɹ3', 'ɾ1', 'ɾ2', 'ɾ3', 'ʃ1', 'ʃ2', 'ʃ3', 'ʊ1', 'ʊ2', 'ʊ3', 'ʌ1', 'ʌ2', 'ʌ3', 'ʒ1', 'ʒ2', 'ʒ3', 'ʔ1', 'ʔ2', 'ʔ3', 'θ1', 'θ2', 'θ3', 'ᵻ1', 'ᵻ2', 'ᵻ3']

The audio waveforms are not provided to you (you can download them from https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2). Instead, mfcc.hdf5 provides you with mel-frequency cepstral coefficients for each audio file.

There are a total of 1000 training files (sampled from the 12,600 training files in the original LJSpeech distribution), plus 250 dev files, and 250 eval files.

In [8]:
import h5py
data = {}
with h5py.File('mfcc.hdf5','r') as f:
    for group in f.keys():
        data[group] = {}
        for id in list(f[group].keys())[:1000]:
            data[group][id] = f[group][id][:]
print(data.keys())
dict_keys(['dev', 'eval', 'train'])
In [9]:
print('Training MFCCs are provided for %d files'%(len(data['train'].keys())))
mfcc0 = data['train'][k0]
print('mfcc[%s]: %d frames, each w/%d MFCCs'%(k0,mfcc0.shape[1],mfcc0.shape[0]))
Training MFCCs are provided for 1000 files
mfcc[LJ001-0001]: 968 frames, each w/20 MFCCs

The MFCC (mel-frequency cepstral coefficients) are computed from an audio waveform using the following steps:

  1. Compute the spectrogram
  2. Warp the frequency axis, from linear frequency to mel frequency. The result of this step is called a melspectrogram
  3. Compute the discrete cosine transform (DCT) of the log (dB) melspectrogram

The DCT step is used to (approximately) decorrelate the features. Melspectrogram samples at neighboring frequencies (and spectrogram samples at neighboring frequencies) are highly correlated, which means that the ASR would need to waste parameters modeling the correlation. Taking the DCT helps with modeling, a lot.

However, taking the IDCT makes the features much harder for a human being to understand. For this reason, I recommend that, whenever you want to visualize the time alignment of any phoneme transcript to the MFCC, you should first:

  1. inverse-DCT to get back the melspectrogram. Librosa makes this easy: you can use the function mfcc_to_mel.
  2. Convert from amplitude to dB, with a dynamic range of 50 or 60dB
  3. Plot only a reasonable amount of time, e.g., 1.5 seconds (150 frames).
In [11]:
import librosa
import matplotlib.pyplot as plt

def invert_mfcc(mfcc):
    step1 = librosa.feature.inverse.mfcc_to_mel(mfcc)
    step2 = librosa.amplitude_to_db(step1,top_db=50)
    return step2

fs = 22050

fig, ax = plt.subplots(2,1,figsize=(14,8))
im0=librosa.display.specshow(
    invert_mfcc(data['train'][k0]),
    sr=22050,
    hop_length=int(0.01*fs),
    win_length=int(0.025*fs),
    y_axis='mel',
    x_axis='s',
    fmax=8000,
    ax=ax[0]
)
ax[0].set_title('Melspectrogram of %s: %s...'%(k0,transcripts['train'][k0]['word'][:21]),fontsize=18)
plt.colorbar(im0,ax=ax[0])
im1=ax[1].imshow(data['train'][k0][:,:150],aspect='auto')
ax[1].set_xlabel('Time (frames)',fontsize=18)
ax[1].set_ylabel('Quefrency (samples)',fontsize=18)
ax[1].set_title('MFCC of %s: %s...'%(k0,transcripts['train'][k0]['phone'][:19]),fontsize=18)
plt.colorbar(im1,ax=ax[1])
fig.tight_layout()

The MFCC has a lot less information than the original waveform (only 20 numbers every 10ms, as opposed to 220.5 samples every 10ms in the original waveform), so you might reasonably ask:

Is that enough information for speech recognition?

You can test this by resynthesizing speech audio from the MFCC. The function mfcc_to_audio inverts the MFCC to melspectrogram, then unwarps to a linear spectrogram, then uses Griffin-Lim to resynthesize speech. The result sounds horribly artificial, but is usually intelligible!

In [12]:
import IPython.display
wav = librosa.feature.inverse.mfcc_to_audio(
    data['train'][k0],
    sr=22050,
    hop_length=int(0.01*fs),
    win_length=int(0.025*fs),
    fmax=8000
)
IPython.display.Audio(data=wav,rate=fs)
Out[12]:

A warning about computation time

As you've seen, there are 1000 training files available. Every function in this MP will include a parameter ntrain that specifies how many of those files you want to process.

  • The more training files you use, the better your results will look.
  • You are not graded on how good your results are; you are only graded on whether or not your code produces correct results for one file at a time.
  • Test your code with ntrain=1. Debug that way until it works.
  • Then time your code using ntrain=1. Try different implementations to see if you can reduce the compute time while maintaining correct results. (This step is not required to get points for the MP; it only matters if you want to be able to train on more files.)
  • Once you've got a reasonable compute time, try training with a larger value of ntrain, perhaps something like ntrain=250. (This step is not required to get points for the MP; it only matters if you're trying to get better speech recognition performance.)
In [13]:
ntrain=250


Part 1: Transition Posterior Probabilities

Training a speech recognizer basically alternates among three steps:

  1. Use the current model to (hard- or soft-)resegment the audio
  2. Use the segmentation to compute the posterior probabilities of every possible transition in every frame of the training data
  3. Use the transition posteriors to retrain the model

Which comes first: the segmentation, the transition posteriors, or the model?

A reasonable place to start is by uniformly segmenting every utterance. Find the number of senones in each utterance, and then give each senone the same number of frames.

Let's write the alignment matrices for the $u^{\text{th}}$ utterance using the capital letters Alpha and Beta, which, unfortunately, look exactly like A and B, but in a non-italic font:

$$\mathrm{A}_u=\begin{bmatrix}\alpha_{u,1,1}&\cdots&\alpha_{u,1,T_u}\\ \vdots&\ddots&\vdots\\ \alpha_{u,M_u,1}&\cdots&\alpha_{u,M_u,T_u}\end{bmatrix},\qquad \mathrm{B}_u=\begin{bmatrix}\beta_{u,1,1}&\cdots&\beta_{u,1,T_u}\\ \vdots&\ddots&\vdots\\ \beta_{u,M_u,1}&\cdots&\beta_{u,M_u,T_u}\end{bmatrix},$$

where $M_u$ is the number of senones in the transcription of utterance $u$, $T_u$ is the number of MFCC frames, and $\alpha$ and $\beta$ are defined as

$$\alpha_{u,m,t}=c_1(u,t)\Pr\{x_1,\ldots,x_t,q_t=m\}$$
$$\beta_{u,m,t}=c_2(u,t)\Pr\{x_t,\ldots,x_{T_u}\mid q_t=m\}$$

where the constants $c_1$ and $c_2$ can be anything, as long as they are independent of $m$.

A "hard" alignment is one that allows $\alpha_{u,m,t}$ and $\beta_{u,m,t}$ to each be nonzero for only one value of $m$ for each $t$, which we can call $m^*(t)$. Since $c_1(u,t)$ and $c_2(u,t)$ can be anything, a "hard" alignment can be signaled by just setting $\alpha_{u,m^*(t),t}=\beta_{u,m^*(t),t}=1$, and $\alpha_{u,m,t}=\beta_{u,m,t}=0$ for $m\neq m^*(t)$.

When we don't yet have any useful segmentation information, a reasonable place to start is by giving every senone the same number of frames:

$$\alpha_{u,m,t}=\beta_{u,m,t}=\begin{cases}1 & \frac{(m-1)T_u}{M_u}\le t<\frac{mT_u}{M_u}\\ 0 & \text{otherwise}\end{cases}$$
In [14]:
import numpy as np
Alpha0 = {}
Beta0 = {}
for u in list(data['train'].keys())[:ntrain]:
    Y = transcripts['train'][u]['senone'].split()
    M = len(Y)
    T = data['train'][u].shape[1]
    Alpha = np.zeros((M,T))
    for m in range(M):
        Alpha[m,int(m*T/M):int((m+1)*T/M)]=1
    Alpha0[u]=Alpha
    Beta0[u]=Alpha
In [15]:
fig, ax = plt.subplots(2,1,figsize=(14,8))
im0=librosa.display.specshow(
    invert_mfcc(data['train'][k0])[:,:150],
    sr=fs,
    hop_length=int(0.01*fs),
    win_length=int(0.025*fs),
    y_axis='mel',
    x_axis='s',
    fmax=8000,
    ax=ax[0]
)
ax[0].set_title('Melspectrogram of %s: %s...'%(k0,transcripts['train'][k0]['word'][:21]),fontsize=18)
plt.colorbar(im0,ax=ax[0])
im1=ax[1].imshow(Alpha0[k0][:60,:150],aspect='auto')
ax[1].set_xlabel('Time (frames)',fontsize=18)
ax[1].set_ylabel('Senone index',fontsize=18)
N,T = Alpha0[k0].shape
ax[1].set_title('Uniform initial segmentation of %s: %d senones, %d frames'%(k0,N,T),fontsize=18)
fig.tight_layout()

Let $y_u=[y_1,\ldots,y_{M_u}]^T$ be the sequence of senone labels for the $u^{\text{th}}$ utterance, and let $q_u=[q_1,\ldots,q_{T_u}]^T$ be the state alignment, where $q_t$ is an integer in the range $1\le q_t\le M_u$.

HMM re-estimation is easiest (and much faster, computationally) if we first convert the alignment probabilities $\alpha$ and $\beta$ to a set of transition posterior probabilities, defined as

$$\xi_{u,m,t,d}=\Pr\{q_t=m,q_{t+1}=m+d\mid x_1,\ldots,x_{T_u}\}$$

Usually we assume that every senone in the transcription needs to claim at least one frame in the utterance. With that assumption, from any state $(m,t)$, it is only possible to transition to either $(m,t+1)$ or $(m+1,t+1)$, so the transition posterior probabilities are:

$$\xi_{u,m,t,d}=\begin{cases}\alpha_{u,m,t}\,a_{y_m,y_{m+d}}\,\beta_{u,m+d,t+1} & 0\le d\le 1,\ 1\le m,\ m+d\le M_u,\ 1\le t<T_u\\ \alpha_{u,m,T_u} & d=0,\ 1\le m\le M_u,\ t=T_u\\ 0 & \text{otherwise}\end{cases}$$
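Before writing submitted.transition_posteriors, it may help to see the formula as code. The sketch below handles one utterance at a time; the per-frame normalization (dividing so that the posteriors at each time sum to one) is an assumption about how the arbitrary constants $c_1$ and $c_2$ should be handled, and the helper name transition_posteriors_sketch is illustrative, not the reference solution.

import numpy as np

def transition_posteriors_sketch(Alphadict, Betadict, transcripts, A, ntrain):
    # Sketch only: turn the xi formula above into code, one utterance at a time.
    xi = {}
    for u in list(Alphadict.keys())[:ntrain]:
        Y = transcripts[u]['senone'].split()     # senone labels y_1 ... y_M
        Alpha, Beta = Alphadict[u], Betadict[u]
        M, T = Alpha.shape
        out = np.zeros((M, T, 2))
        for t in range(T - 1):
            for m in range(M):
                for d in (0, 1):
                    if m + d < M:
                        out[m, t, d] = Alpha[m, t] * A[Y[m]][Y[m + d]] * Beta[m + d, t + 1]
        out[:, T - 1, 0] = Alpha[:, T - 1]       # final frame: self-transition term only
        # assumed normalization: make the posteriors at each frame sum to one
        norm = out.sum(axis=(0, 2), keepdims=True)
        xi[u] = out / np.maximum(norm, 1e-300)
    return xi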
In [16]:
import importlib,submitted
importlib.reload(submitted)
help(submitted.transition_posteriors)
Help on function transition_posteriors in module submitted:

transition_posteriors(Alphadict, Betadict, transcripts, A, ntrain)
    Calculate the posterior probability of each
    transition given the training data and its
    transcript.
    
    @param:
    Alphadict[u][m,t] is the probability of the path 
     ending at the m'th senone at time t
    Betadict[u][m,t] is the probability of the path 
     starting at the m'th senone at time t
    transcripts[u]['senone'].split() is a 
     list of the senones in utterance u
    mfcc[u][:,t] is the t'th MFCC vector 
     in utterance u
    A[i][j] is probability of a transition from 
     senone i to senone j
    ntrain is the number of utterances to process
    
    @return:
    xi[u][m,t,d] is the posterior probability of a
     transition from the m'th to the (m+d)'th senone
     in utterance u at time t.  len(xi)==ntrain.

As a first step, debug using ntrain=1.

As an initial A matrix, we can use a matrix of all ones. This violates the laws of probability, but it makes the code easier to debug.

In [17]:
import numpy as np
import importlib, submitted, time
importlib.reload(submitted)

A_ones = {i:{j:1 for j in senones} for i in senones}

processing_start = time.time()
xi0=submitted.transition_posteriors(
    Alpha0,Beta0,
    transcripts['train'],
    A_ones,1
)
print('Processing required %g seconds'%(time.time()-processing_start))
Processing required 0.00750899 seconds
In [18]:
fig, ax = plt.subplots(2,1,figsize=(14,8))
im0=librosa.display.specshow(
    invert_mfcc(data['train'][k0])[:,:150],
    sr=22050,
    hop_length=int(0.01*fs),
    win_length=int(0.025*fs),
    y_axis='mel',
    x_axis='s',
    fmax=8000,
    ax=ax[0]
)
ax[0].set_title('Melspectrogram of %s: %s...'%(k0,transcripts['train'][k0]['word'][:21]),fontsize=18)
plt.colorbar(im0,ax=ax[0])
im1=ax[1].imshow(xi0[k0][:60,:150,0]+2*xi0[k0][:60,:150,1],aspect='auto')
plt.colorbar(im1,ax=ax[1])
ax[1].set_xlabel('Time (frames)',fontsize=18)
ax[1].set_ylabel('Senone index',fontsize=18)
ax[1].set_title('Transition posteriors (1=self,2=forward)',fontsize=18)
fig.tight_layout()

Once your code is working, try computing xi matrices for ntrain of the training files.

In [19]:
processing_start = time.time()
xi0=submitted.transition_posteriors(
    Alpha0,Beta0,
    transcripts['train'],
    A_ones,ntrain
)
print('Processing required %g seconds'%(time.time()-processing_start))
Processing required 1.32532 seconds


Part 2: reestimate: find model parameters given transition posteriors

The three types of model parameters for a Gaussian HMM are the transition probabilities, the state mean vectors, and the state variance vectors, defined as:

$$a_{i,j}=\Pr\{y_{q_t}=j\mid y_{q_{t-1}}=i\},\quad 1\le i,j\le N$$
$$\mu_i=E[x_t\mid y_{q_t}=i],\quad 1\le i\le N$$
$$\sigma_i^2=E[(x_t-\mu_i)^2\mid y_{q_t}=i],\quad 1\le i\le N,$$

where $N$ is the number of distinct senones (the size of the list transcripts['senoneset'].split()), and $(x_t-\mu_i)^2$ denotes squaring each scalar element of the vector. Given the transition posteriors $\xi_{u,m,t,d}$, the model re-estimation formulas are:

$$a_{i,j}=\frac{\sum_u\sum_{m:y_m=i}\sum_{d:y_{m+d}=j}\sum_{t=1}^{T_u}\xi_{u,m,t,d}}{\sum_u\sum_{m:y_m=i}\sum_{d=0}^{\min(M_u-m,1)}\sum_{t=1}^{T_u}\xi_{u,m,t,d}}$$
$$\mu_i=\frac{\sum_u\sum_{m:y_m=i}\sum_{d=0}^{\min(M_u-m,1)}\sum_{t=1}^{T_u}\xi_{u,m,t,d}\,x_t}{\sum_u\sum_{m:y_m=i}\sum_{d=0}^{\min(M_u-m,1)}\sum_{t=1}^{T_u}\xi_{u,m,t,d}}$$
$$\sigma_i^2=\max\left(\sigma_{\min}^2,\ \frac{\sum_u\sum_{m:y_m=i}\sum_{d=0}^{\min(M_u-m,1)}\sum_{t=1}^{T_u}\xi_{u,m,t,d}\,(x_t-\mu_i)^2}{\sum_u\sum_{m:y_m=i}\sum_{d=0}^{\min(M_u-m,1)}\sum_{t=1}^{T_u}\xi_{u,m,t,d}}\right),$$

where $\sigma_{\min}^2$ is a hyperparameter, used to make sure that the variance-weighted feature distances don't get too large.

Notice that each denominator is just the sum of the $a_{i,j}$ numerator, summed across $j$. For this reason, we only need accumulators for the numerators, not for the denominators.

BTW, to make all the denominators equal, we had to assume that there is a transition starting from the last time step. Can you see what extra transition was assumed by this formula?
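To make the accumulator idea concrete, here is a rough per-utterance sketch. The helper name accumulate_numerators_sketch and the exact loop organization are illustrative assumptions, not the reference reestimate; in particular, the variance numerator needs a second pass once the means are known.

import numpy as np

def accumulate_numerators_sketch(xi_u, Y, X):
    # xi_u[m,t,d]: transition posteriors for one utterance
    # Y: list of senone labels in the transcript; X: (nfeats, T) MFCC matrix
    A_num, mu_num, denom = {}, {}, {}
    M, T, _ = xi_u.shape
    for m in range(M):
        i = Y[m]
        gamma = xi_u[m, :, 0] + xi_u[m, :, 1]    # occupancy of the m'th senone at each frame
        A_num.setdefault(i, {})
        for d in (0, 1):
            if m + d < M:
                A_num[i][Y[m + d]] = A_num[i].get(Y[m + d], 0.0) + xi_u[m, :, d].sum()
        mu_num[i] = mu_num.get(i, 0.0) + X @ gamma            # sum_t gamma_t * x_t
        denom[i] = denom.get(i, 0.0) + gamma.sum()
    return A_num, mu_num, denom

# mu[i] would then be mu_num[i]/denom[i]; a second pass with the same gamma weights,
# accumulating ((X - mu[i][:, None])**2) @ gamma, gives the variance numerator.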

In [20]:
import importlib, submitted
importlib.reload(submitted)
help(submitted.reestimate)
Help on function reestimate in module submitted:

reestimate(xi, senones, transcripts, mfcc, minvar, ntrain)
    Maximum-likelihood re-estimation 
    of HMM transition probabilities.
    
    @param:
    xi[u][m,t,d] is the posterior probability of a
     transition from the m'th to the (m+d)'th senone
     in utterance u at time t.
    senones is a list of the senone labels
    transcripts[u]['senone'].split() is a 
     list of the senones in utterance u
    mfcc[u][:,t] is the t'th MFCC vector 
     in utterance u
    minvar (scalar): make sure all variances
     are greater than or equal to this number.
     Also, use this value for any variance terms
     whose calculation would cause a divide-by-zero error.
    ntrain is the number of utterances to process
    
    @return:
    A[i][j] is the probability of a transition
      from senone i to senone j (i,j strings).
      If calculating A[i][j] would divide by zero,
      set A[i][j]=0.
    mu[i] is the reestimated mean (an np.ndarray)
      vector for senone i (a string).
      If calculating mu[i] would divide by zero,
      set mu[i]=np.zeros(nfeats).
    var[i] is the reestimated variance vector
      (an ndarray) for senone i (a string).
      If calculating var[i] would divide by zero,
      set var[i]=np.tile(minvar, nfeats).
    A_numerator[i][j] is the reestimation numerator 
      for transitions from senone i to j, including
      the transition from time T to time T+1
    mu_numerator[i] is the reestimation numerator
     for the mean vector of senone i
    var_numerator[i] is the reestimation numerator
     for the variance vector of senone i

In [21]:
import numpy as np
import importlib, submitted, time
importlib.reload(submitted)

processing_start = time.time()
A0,mu0,var0,A_num,mu_num,var_num=submitted.reestimate(
    xi0,
    transcripts['senoneset'].split(),
    transcripts['train'],
    data['train'],
    1,
    1
)
print('Processing required %g seconds'%(time.time()-processing_start))
Processing required 0.102285 seconds

Now let's test some values.

  • A_num['$1']['$2'] should equal 4, because $1 occurs 4 times in this utterance
  • A_num['$1']['$1'] should be more than A_num['$1']['$2'], because there are more than 2 frames per senone in this file.
  • A_num['$1']['$3'] should be zero, because that transition should never occur.
  • A['o1']['o2'] should be A_num['o1']['o2']/sum(A_num['o1'].values())
  • sum(A_num['a1'].values()) should be zero, because the phoneme a does not occur in this training file
  • A['a1']['a2'] and mu['a1'][0] should both be zero, but var['a1'][0] should be minvar, because those are the default settings specified for variables whose computation would cause a divide-by-zero error.
  • mu['o1'] should be mu_num['o1']/sum(A_num['o1'].values())
  • var['o1'] should be var_num['o1']/sum(A_num['o1'].values())
  • If the model is well-trained, the mean vector for vowels like o should have higher energy (zeroth cepstral coefficient) than the mean vector for silent phonemes like $. It's not certain that this will happen for this initial training, because the starting segmentation is so bad.
In [22]:
print('A_num["$1"]["$2"] is',A_num['$1']['$2'])
print('A_num["$1"]["$1"] is',A_num['$1']['$1'])
print('A_num["$1"]["$3"] is',A_num['$1']['$3'])
print('A0["o1"]["o2"] is',A0['o1']['o2'])
print('A_num["o1"]["o2"]/denom is',A_num['o1']['o2']/sum(A_num['o1'].values()))
print('sum(A_num["a1"].values()) is',sum(A_num["a1"].values()))
print('A0["a1"]["a2"] is',A0["a1"]["a2"])
print('mu0["a1"][0] is',mu0["a1"][0])
print('var0["a1"][0] is',var0["a1"][0])
A_num["$1"]["$2"] is 4.0
A_num["$1"]["$1"] is 5.0
A_num["$1"]["$3"] is 0
A0["o1"]["o2"] is 0.3333333333333333
A_num["o1"]["o2"]/denom is 0.3333333333333333
sum(A_num["a1"].values()) is 0
A0["a1"]["a2"] is 0
mu0["a1"][0] is 0.0
var0["a1"][0] is 1
In [23]:
import matplotlib.pyplot as plt
nfeats = len(mu0['o1'])
n = np.arange(nfeats)
fig, ax = plt.subplots(2,2,figsize=(14,8))
ax[0,0].plot(n,mu0['o1'],n,mu_num['o1']/sum(A_num['o1'].values()))
ax[0,0].set_title('Mean of senone o1')
ax[1,0].plot(n,var0['o1'],n,var_num['o1']/sum(A_num['o1'].values()))
ax[1,0].set_title('Variance of senone o1')
ax[0,1].plot(n,mu0['$1'],n,mu_num['$1']/sum(A_num['$1'].values()))
ax[0,1].set_title('Mean of senone $1')
ax[1,1].plot(n,var0['$1'],n,var_num['$1']/sum(A_num['$1'].values()))
ax[1,1].set_title('Variance of senone $1')
Out[23]:
Text(0.5, 1.0, 'Variance of senone $1')

Once your code works well, and reasonably quickly, with one training file, try it with ntrain training files.

In [24]:
import numpy as np
import importlib, submitted, time
importlib.reload(submitted)

processing_start = time.time()
A0,mu0,var0,A_num,mu_num,var_num=submitted.reestimate(
    xi0,
    transcripts['senoneset'].split(),
    transcripts['train'],
    data['train'],
    1,
    ntrain
)
print('Processing required %g seconds'%(time.time()-processing_start))
Processing required 11.3008 seconds


Part 3: Calculate the observation pdf

The log probability of a vector $x_t$ given the $i^{\text{th}}$ senone is

$$\log b_i(x_t)=-\frac{1}{2}\sum_{d=1}^{D}\left(\frac{(x_{t,d}-\mu_{i,d})^2}{\sigma_{i,d}^2}+\log\left(2\pi\sigma_{i,d}^2\right)\right)$$
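As a sanity check on the formula, a vectorized numpy version for one utterance and one senone might look like the following. This is an illustration of the math, not the required implementation, and the helper name log_gaussian_sketch is made up for this example.

import numpy as np

def log_gaussian_sketch(X, mu_i, var_i):
    # X: (nfeats, T) MFCC matrix; mu_i, var_i: (nfeats,) vectors for senone i.
    # Returns a length-T array of log Pr{X[:,t] | senone i}.
    diff = X - mu_i[:, None]
    return -0.5 * np.sum(diff**2 / var_i[:, None] + np.log(2*np.pi*var_i[:, None]), axis=0)

# e.g. logB[u][i] could be filled with log_gaussian_sketch(mfcc[u], mu[i], var[i]) for each senone i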
In [25]:
importlib.reload(submitted)
help(submitted.logB)
Help on function logB in module submitted:

logB(mfcc, mu, var, ntrain)
    Calculate the log observation probability densities for 
    each state at each time assuming a diagonal-covariance Gaussian 
    probability density model.
    
    @param:
    mfcc[u][:,t] is the t'th MFCC feature vector in 
      utterance u (u is a string, t an int)
    mu[i] is the mean feature vector for senone i 
    var[i] is the vector of feature variances for senone i
    ntrain is the number of utterances to process
    
    @return:
    logB[u][i][t]=log Pr{mfcc[u][:,t] | senone=i}
      u and i are strings, t is an int.
      len(logB)==ntrain.

In [26]:
importlib.reload(submitted)

processing_start=time.time()
logB0 = submitted.logB(data['train'],mu0,var0,1)
print('len(logB0["LJ001-0001"]["$1"])=',len(logB0['LJ001-0001']['$1']))
print('Processing required %g seconds'%(time.time()-processing_start))
len(logB0["LJ001-0001"]["$1"])= 968
Processing required 0.0484869 seconds

We should find that the number of frames is 968.

  • This method of initialization (uniform segmentation) is so crude that we wind up with all senones having very similar parameters (basically, the average across all frames). However, ...
  • Silence should have slightly higher likelihood in the first few frames, and in other silences, e.g., frames 60-90
  • Other phonemes should have slightly higher likelihood in frames that contain speech
In [27]:
fig, ax = plt.subplots(1,1,figsize=(14,6))
for i in transcripts['train'][k0]['senone'].split()[:20:3]:
    ax.plot(logB0[k0][i][:150],label=i)
ax.legend(fontsize=18)
ax.set_title('Log likelihoods of a few different senones',fontsize=18)
ax.set_xlabel('Time (frame index)',fontsize=18)
Out[27]:
Text(0.5, 0, 'Time (frame index)')
In [28]:
fig, ax = plt.subplots(1,1,figsize=(14,6))
Y = transcripts['train'][k0]['senone'].split()
logB_image = np.empty((len(Y),968))
for m in range(len(Y)):
    logB_image[m,:]=logB0[k0][Y[m]]
im0=ax.imshow(logB_image)
ax.set_title('logB0 of "%s..."'%(transcripts['train'][k0]['word'][:20]),fontsize=18)
ax.set_ylabel('Senone index',fontsize=18)
ax.set_xlabel('Frame index',fontsize=18)
plt.colorbar(im0,ax=ax)
Out[28]:
<matplotlib.colorbar.Colorbar at 0x7fccbeeea0b0>

When your code works correctly and reasonably fast, try applying it to ntrain training utterances.

In [29]:
importlib.reload(submitted)

processing_start=time.time()
logB0 = submitted.logB(data['train'],mu0,var0,ntrain)
print('Processing required %g seconds'%(time.time()-processing_start))
Processing required 7.6602 seconds


Part 4: Viterbi Training

Now that we've initialized the model, and calculated the log observation probabilities, we need to re-align the senones to the utterances. This can be done using the Viterbi algorithm.

  1. Initialize: $\delta_1(1)=\log b_{y_1}(x_1)$, and $\delta_1(m)=-\infty$ for $m\neq 1$.

  2. Iterate: $\delta_t(n)=\max_{m\in\{n-1,n\}}\delta_{t-1}(m)+\log a_{y_m,y_n}+\log b_{y_n}(x_t)$ and $\psi_t(n)=\arg\max_{m\in\{n-1,n\}}\delta_{t-1}(m)+\log a_{y_m,y_n}+\log b_{y_n}(x_t)$.

  3. Terminate and backtrace: $m^*(T_u)=M_u$, $m^*(t)=\psi_{t+1}(m^*(t+1))$, and $\alpha_{u,m,t}=\begin{cases}1 & m=m^*(t)\\ 0 & \text{otherwise}\end{cases}$
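For orientation, here is a minimal numpy sketch of this forced-alignment recursion for a single utterance. It assumes you have already built per-transcript matrices logA[m,n] and logb[m,t] (similar to the small logA_test and logB_test matrices constructed in the checking cell further below); it is a sketch of the algorithm, not the reference viterbitrain.

import numpy as np

def viterbi_align_sketch(logA, logb):
    # logA[m,n]: log transition prob from the m'th to the n'th senone in the transcript
    # logb[m,t]: log observation prob of frame t under the m'th senone
    M, T = logb.shape
    Delta = np.full((M, T), -np.inf)
    Psi = np.zeros((M, T), dtype=int)
    Delta[0, 0] = logb[0, 0]
    for t in range(1, T):
        for n in range(M):
            cands = [(Delta[n, t-1] + logA[n, n], n)]          # self-loop
            if n > 0:
                cands.append((Delta[n-1, t-1] + logA[n-1, n], n-1))  # advance by one senone
            best, Psi[n, t] = max(cands)
            Delta[n, t] = best + logb[n, t]
    # backtrace: the path must end in the last senone at the last frame
    Alpha = np.zeros((M, T))
    m = M - 1
    for t in range(T - 1, -1, -1):
        Alpha[m, t] = 1
        m = Psi[m, t]
    return Alpha, Delta, Psi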

In [30]:
importlib.reload(submitted)
help(submitted.viterbitrain)
Help on function viterbitrain in module submitted:

viterbitrain(A, logB, transcripts, ntrain)
    Find the maximum likelihood time alignment of each transcript 
    to its MFCC matrix.
    
    @param:
    A[i][j] is probability of a transition from senone i 
      to senone j (i and j are strings)
    logB[u][i][t]=log Pr{mfcc[u][:,t] | senone=i}
      u and i are strings, t is an int
    transcripts[u]['senone'].split() is a list 
      of the senones in utterance u
    ntrain is the number of utterances to process
    
    @return:
    Alphadict[u][m,t]==1 if m'th senone at time t is 
      in the maximum likelihood path from (0,0) to (M-1,T-1),
      otherwise 0.  len(Alphadict)==ntrain.
    Delta[u][m,t] is the log probability of the best 
      path up to m'th senone at time t.  len(Delta)==ntrain.
    Backpointer[u][m,t] is either m-1 or m, the most 
      likely predecessor of m at time t.
      len(Backpointer)==ntrain

In [31]:
importlib.reload(submitted)

processing_start = time.time()
Alpha1,Delta1,Psi1 = submitted.viterbitrain(
    A0,
    logB0,
    transcripts['train'],
    1
)
print('Processing required %g seconds'%(time.time()-processing_start))
Processing required 0.644256 seconds

One way to check our code: $\delta_t(m)$ should be $-\infty$ for any $t<m$, because we only allow the HMM to advance by at most one senone per frame.

In [32]:
print(Delta1[k0][:4,:4])
[[ -77.3509607  -155.07789671 -242.35723103 -330.93734968]
 [         -inf -157.92764231 -241.83262821 -329.31514669]
 [         -inf          -inf -245.11691661 -329.81442507]
 [         -inf          -inf          -inf -336.02052783]]

We can also check the calculations, to make sure that $\delta_t(n)$ is either $\delta_{t-1}(n)+\log a_{y_n,y_n}+\log b_{y_n}(x_t)$ or $\delta_{t-1}(n-1)+\log a_{y_{n-1},y_n}+\log b_{y_n}(x_t)$, whichever is larger.

In [33]:
logA_test = np.tile(-np.inf,(4,4))
logB_test = np.empty((4,4))
Y = transcripts['train'][k0]['senone'].split()
for m in range(4):
    logB_test[m,:] = logB0[k0][Y[m]][:4]
    for n in range(4):
        if A0[Y[m]][Y[n]]>0:
            logA_test[m,n]=np.log(A0[Y[m]][Y[n]])
print('logA[m,n] is:')
print(logA_test)
print('')
print('logB[m,t] is:')
print(logB_test)
print('')
print('For example, delta[1,3] should be either')
print(Delta1[k0][1,2]+logA_test[1,1]+logB_test[1,3])
print('or')
print(Delta1[k0][0,2]+logA_test[0,1]+logB_test[1,3])
logA[m,n] is:
[[-0.46833654 -0.98361393        -inf        -inf]
 [       -inf -0.43201381 -1.04754018        -inf]
 [-2.58215916        -inf -0.30285175 -5.23010544]
 [       -inf        -inf        -inf -0.44995724]]

logB[m,t] is:
[[-77.3509607  -77.25859947 -86.81099779 -88.11178211]
 [-79.63115263 -79.59306768 -85.77111758 -87.05050467]
 [-78.93853412 -78.9200743  -86.14173412 -86.93425668]
 [-83.22614421 -83.09596919 -84.216702   -85.67350578]]

For example, delta[1,3] should be either
-329.3151466935237
or
-330.3913496280543

We should be able to see the same phenomena (a triangle of -np.inf values at the start of the file; otherwise, values of delta getting gradually more negative as we go from left to right through the utterance) in images of the whole file.

Also, notice that the Alpha1 trajectory should start at the top left, and end at the bottom right.

In [34]:
fig, ax = plt.subplots(3,1,figsize=(14,12))
im0=librosa.display.specshow(
    invert_mfcc(data['train'][k0]),
    sr=fs,
    hop_length=int(0.01*fs),
    win_length=int(0.025*fs),
    y_axis='mel',
    x_axis='s',
    fmax=8000,
    ax=ax[0]
)
ax[0].set_title('Melspectrogram of %s: %s...'%(k0,transcripts['train'][k0]['word'][:21]),fontsize=18)
plt.colorbar(im0,ax=ax[0])
im1=ax[1].imshow(Delta1[k0][:,:],aspect='auto')
plt.colorbar(im1,ax=ax[1])
ax[1].set_xlabel('Time (frames)',fontsize=18)
ax[1].set_ylabel('Senone index',fontsize=18)
M,T = Delta1[k0].shape
ax[1].set_title('logprob best path %s: %d senones, %d frames'%(k0,M,T),fontsize=18)
im2=ax[2].imshow(Alpha1[k0][:,:],aspect='auto')
plt.colorbar(im2,ax=ax[2])
ax[2].set_xlabel('Time (frames)',fontsize=18)
ax[2].set_ylabel('Senone index',fontsize=18)
M,T = Alpha1[k0].shape
ax[2].set_title('Viterbi segmentation of %s: %d senones, %d frames'%(k0,M,T),fontsize=18)
fig.tight_layout()

As you can see, different implementations of the same equation may lead to small differences (about 0.01%) in the values computed. The max operation is sensitive to such differences, so your Alpha1 may differ slightly from the reference Alpha1, but your Delta1 should not differ by more than about 0.01% per frame.

When your code seems to be working, and is fast enough to use on the whole dataset, try using ntrain training files.

In [35]:
importlib.reload(submitted)

processing_start = time.time()
Alpha1,Delta1,Psi1 = submitted.viterbitrain(
    A0,
    logB0,
    transcripts['train'],
    ntrain
)
print('Processing required %g seconds'%(time.time()-processing_start))
Processing required 71.3123 seconds

... and if that worked, you might want to try one more complete iteration of Viterbi training.

  • Use Alpha to recompute xi
  • Use xi to recompute the model
  • Use the model to recompute logB
  • Use logB to recompute the Viterbi alignment
In [36]:
importlib.reload(submitted)

processing_start = time.time()
xi1=submitted.transition_posteriors(
    Alpha1,
    Alpha1,
    transcripts['train'],
    A0,
    ntrain
)
A1,mu1,var1,A_num,mu_num,var_num=submitted.reestimate(
    xi1,
    transcripts['senoneset'].split(),
    transcripts['train'],
    data['train'],
    1,
    ntrain
)
logB1 = submitted.logB(
    data['train'],
    mu1,
    var1,
    ntrain
)
Alpha2,Delta2,Psi2 = submitted.viterbitrain(
    A1,
    logB1,
    transcripts['train'],
    ntrain
)
print('Processing required %g seconds'%(time.time()-processing_start))
Processing required 91.5484 seconds
In [38]:
fig, ax = plt.subplots(3,1,figsize=(14,12))
im0=librosa.display.specshow(
    invert_mfcc(data['train'][k0]),
    sr=fs,
    hop_length=int(0.01*fs),
    win_length=int(0.025*fs),
    y_axis='mel',
    x_axis='s',
    fmax=8000,
    ax=ax[0]
)
ax[0].set_title('Melspectrogram of %s: %s...'%(k0,transcripts['train'][k0]['word'][:21]),fontsize=18)
plt.colorbar(im0,ax=ax[0])
im1=ax[1].imshow(Delta2[k0][:,:],aspect='auto')
plt.colorbar(im1,ax=ax[1])
ax[1].set_xlabel('Time (frames)',fontsize=18)
ax[1].set_ylabel('Senone index',fontsize=18)
M,T = Delta2[k0].shape
ax[1].set_title('logprob best path %s: %d senones, %d frames'%(k0,M,T),fontsize=18)
im2=ax[2].imshow(Alpha2[k0][:,:],aspect='auto')
plt.colorbar(im2,ax=ax[2])
ax[2].set_xlabel('Time (frames)',fontsize=18)
ax[2].set_ylabel('Senone index',fontsize=18)
M,T = Alpha2[k0].shape
ax[2].set_title('Viterbi segmentation of %s: %d senones, %d frames'%(k0,M,T),fontsize=18)
fig.tight_layout()


Part 5: Show segmentations

For the last part of the main MP, let's just find a good way to show the force-aligned segmentations. For this purpose, we will use matplotlib's xticks and xticklabels.

In [39]:
importlib.reload(submitted)
help(submitted.show_segmentations)
Help on function show_segmentations in module submitted:

show_segmentations(Alphadict, transcripts, ntrain)
    Generate xticks and xticklabels to show phone segmentations.
    
    @param:
    Alphadict[u][m,t] is the probability of the path 
     ending at the m'th senone at time t
    transcripts[u]['senone'].split() is a 
     list of the senones in utterance u
    ntrain is the number of utterances to process
    
    @return:
    xticks[u] is a list of frame indices at which
      phones start, i.e., for Y[m] that is the
      first senone of the k'th phone, 
      xticks[u][k] should be the first value of 
      t such that Alphadict[u][m,t]>0.5.
      Include long-silence phones ($), but not
      word-boundary phones (#).
    xticklabels[u] is a list of the corresponding
      phone strings.
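Given that specification, a possible numpy-only sketch for one utterance might look like the following. The phone-grouping rule (the first senone of every phone ends in '1', and '#' is skipped) is inferred from the senone naming convention described in Part 0, so double-check it against your data; this is not the reference show_segmentations.

import numpy as np

# Sketch for a single utterance u, using the variables named in the docstring above.
Y = transcripts[u]['senone'].split()
Alpha = Alphadict[u]
xticks_u, xticklabels_u = [], []
for m, senone in enumerate(Y):
    if senone == '#':                 # word-boundary marker: not shown
        continue
    if senone.endswith('1'):          # first senone of a phone (e.g. 'o1', '$1')
        first_frame = int(np.argmax(Alpha[m, :] > 0.5))   # first t with Alpha[m,t] > 0.5
        xticks_u.append(first_frame)
        xticklabels_u.append(senone[:-1])                 # 'o1' -> 'o', '$1' -> '$'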

First, let's show the initial uniform segmentations:

In [40]:
importlib.reload(submitted)
xticks, xticklabels=submitted.show_segmentations(
    Alpha0,
    transcripts['train'],
    1
)
print(xticks[k0][:19])
print(xticklabels[k0][:19])
[0, 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 94, 105, 113, 121, 129, 137, 147]
['$', 'p', 'ɹ', 'ɪ', 'n', 't', 'ɪ', 'ŋ', '$', 'ɪ', 'n', 'ð', 'ɪ', 'o', 'ʊ', 'n', 'l', 'i', 's']
In [41]:
fig, ax = plt.subplots(1,1,figsize=(14,6))
im0=librosa.display.specshow(
    invert_mfcc(data['train'][k0])[:,:150],
    sr=fs,
    hop_length=int(0.01*fs),
    win_length=int(0.025*fs),
    y_axis='mel',
    x_axis='frames',
    fmax=8000,
    ax=ax
)
ax.set_xticks(xticks[k0][:19])
ax.set_xticklabels(xticklabels[k0][:19],fontsize=18)
ax.set_xlabel('Phone alignment times, iteration 0',fontsize=18)
ax.set_title(transcripts['train'][k0]['word'][:21]+'...',fontsize=18)
Out[41]:
Text(0.5, 1.0, 'Printing, in the only...')

Then, after the first round of Viterbi segmentation:

In [42]:
importlib.reload(submitted)
xticks, xticklabels=submitted.show_segmentations(
    Alpha1,
    transcripts['train'],
    1
)
fig, ax = plt.subplots(1,1,figsize=(14,6))
im0=librosa.display.specshow(
    invert_mfcc(data['train'][k0])[:,:150],
    sr=fs,
    hop_length=int(0.01*fs),
    win_length=int(0.025*fs),
    y_axis='mel',
    x_axis='frames',
    fmax=8000,
    ax=ax
)
ax.set_xticks(xticks[k0][:19])
ax.set_xticklabels(xticklabels[k0][:19],fontsize=18)
ax.set_xlabel('Phone alignment times, iteration 1',fontsize=18)
ax.set_title(transcripts['train'][k0]['word'][:21]+'...',fontsize=18)
Out[42]:
Text(0.5, 1.0, 'Printing, in the only...')

After two rounds:

In [43]:
importlib.reload(submitted)
xticks, xticklabels=submitted.show_segmentations(
    Alpha2,
    transcripts['train'],
    1
)
fig, ax = plt.subplots(1,1,figsize=(14,6))
im0=librosa.display.specshow(
    invert_mfcc(data['train'][k0])[:,:150],
    sr=fs,
    hop_length=int(0.01*fs),
    win_length=int(0.025*fs),
    y_axis='mel',
    x_axis='frames',
    fmax=8000,
    ax=ax
)
ax.set_xticks(xticks[k0][:19])
ax.set_xticklabels(xticklabels[k0][:19],fontsize=18)
ax.set_xlabel('Phone alignment times, iteration 2',fontsize=18)
ax.set_title(transcripts['train'][k0]['word'][:21]+'...',fontsize=18)
Out[43]:
Text(0.5, 1.0, 'Printing, in the only...')


Extra Credit: Recognize Speech!!

In order to transcribe an unknown speech file, the usual method is to apply the Viterbi algorithm, but with one twist: there is no fixed transcription that we need to match. Any senone, $j$, can follow any other senone, $i$, as long as $a_{i,j}>0$.

With this twist, the Viterbi variables need to be indexed by the senone label $i$ rather than the transcription index $m$, because there is no transcription.

  1. Initialize: $\delta_1(i)=\log b_i(x_1),\ \forall i$

  2. Iterate: $\delta_t(j)=\max_i\,\delta_{t-1}(i)+\log a_{i,j}+\log b_j(x_t),\ \forall t,\forall j$ and $\psi_t(j)=\arg\max_i\,\delta_{t-1}(i)+\log a_{i,j}+\log b_j(x_t),\ \forall t,\forall j$

  3. Terminate and backtrace: $i^*(T_u)=\arg\max_i\delta_{T_u}(i)$, $i^*(t)=\psi_{t+1}(i^*(t+1)),\ \forall t<T_u$, and $\alpha_{u,i,t}=\begin{cases}1 & i=i^*(t)\\ 0 & \text{otherwise}\end{cases}$

Although the form of this algorithm is just like Viterbi training, the computational complexity is about 34X higher, because, at each time, for each senone, the number of possible predecessors has increased from 2 to the set of all senones (136).
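To make the recursion concrete, here is a minimal sketch of the unconstrained recognition Viterbi for one utterance, using plain dictionaries keyed by senone strings. It ignores the Num bookkeeping mentioned in the docstring below, and the helper name recognize_sketch is made up here; it is not the reference extra.recognize.

import numpy as np

def recognize_sketch(A, logb_u, senones):
    # A[i][j]: transition probabilities; logb_u[i]: length-T array of log obs probs for senone i
    T = len(logb_u[senones[0]])
    logA = {i: {j: (np.log(A[i][j]) if A[i][j] > 0 else -np.inf) for j in senones} for i in senones}
    Delta = {i: np.full(T, -np.inf) for i in senones}
    Psi = {i: [''] * T for i in senones}
    for i in senones:
        Delta[i][0] = logb_u[i][0]
    for t in range(1, T):
        for j in senones:
            best_i = max(senones, key=lambda i: Delta[i][t-1] + logA[i][j])
            Delta[j][t] = Delta[best_i][t-1] + logA[best_i][j] + logb_u[j][t]
            Psi[j][t] = best_i
    # backtrace from the best final senone
    path = [max(senones, key=lambda i: Delta[i][T-1])]
    for t in range(T-1, 0, -1):
        path.append(Psi[path[-1]][t])
    return path[::-1], Delta, Psi

# e.g. path, Delta, Psi = recognize_sketch(A1, logB[u], transcripts['senoneset'].split())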

In [44]:
import importlib, extra
importlib.reload(extra)
help(extra.recognize)
Help on function recognize in module extra:

recognize(A, logB, ntest)
    Find the maximum likelihood transcription of test 
    utterances
    
    @param:
    A[i][j] is probability of a transition 
      from senone i to senone j (i and j are strings)
    logB[u][i][t]=log Pr{mfcc[u][:,t] | senone=i}, 
      u and i are strings, t is an int
    ntest: number of utterances to process
    
    @return:
    Alpha[u][i][t]==1 if i is senone in the best 
      path at time t, otherwise 0
    Delta[u][i][t] is the log probability of the best 
      path up to senone i (str) at time t (int)
    Psi[u][i][t] is a string, 
      specifying the most likely predecessor of senone i 
      at time t.
    Num[u][i][t] is the number of frames that senone i
      has been repeated on the best path at time t
      len(hyp)==len(Delta)==len(Psi)==len(Num)==ntest

Let's try to recognize just the first utterance in the train subcorpus.

In [46]:
importlib.reload(extra)
logB = submitted.logB(
    data['train'],
    mu1,
    var1,
    1
)
Alpha, Delta, Psi = extra.recognize(
    A1,
    logB,
    1
)
In [47]:
bestpath = []
u = list(Alpha.keys())[0]
for t in range(len(Alpha[u]['$1'])):
    for i in transcripts['senoneset'].split():
        if Alpha[u][i][t]==1:
            bestpath.append(i)

print('There were %d frames in the best path'%(len(bestpath)))

recognized = '$'
for s in bestpath:
    if s[0] != recognized[-1]:
        recognized += s[0]
print(recognized.replace('$',' ').replace('#',' '))
There were 968 frames in the best path
 θ ɐ tɪn ɪndeɪlɪsɛnz ðə wɪʃ ɪ oɹɛpɛz ns ðɚnt sɛfɹɛz fɑmɑm stɪʃənaʊfɑmɔl ɪ ɑɹ ksᵻkæf kstɹɛz ɛntᵻtɪni ɛksɪʃn 

Well! It's not correct, but some of the strings sound similar to the correct utterance (e.g., the last word is "exhition" instead of "exhibition"). With more training iterations we should converge to 100% accuracy on the training corpus. Then, with more training data, we might start to get good performance on a test corpus.

The tests tests/test_visible.py and tests/test_extra.py are both run by the grade.py code, so you can test both by running python grade.py. Once you confirm that you pass the tests on your own machine, upload your code to Gradescope!

In [ ]: