MP1: LPC

In this lab, you'll use linear predictive coding (LPC) to analyze and then resynthesize a speech sound.

In order to make sure everything works, you might want to go to the command line, and run

pip install -r requirements.txt

This will install the modules that are used on the autograder, including soundfile, numpy, h5py, and the gradescope utilities.


Part 1: Plotting and understanding the spectrogram

First, let's load a speech waveform. This is an extract from Nicholas James Bridgewater's reading of the Universal Declaration of Human Rights (https://librivox.org/universal-declaration-of-human-rights-volume-03-by-united-nations/).

LPC-10 synthesis was designed for an 8kHz sampling rate, so this file has been resampled to 8kHz.

Let's zoom in on part of it.

Let's also look at a spectrogram of the first 1.5 seconds or so, using a window length that's much shorter than one pitch period, so you can see the vertical striations corresponding to glottal closure instants. For this we'll use librosa. Note: librosa is not available on the autograder, so don't get too dependent on it.
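
For example, here's a rough sketch of how you might load the file and plot that short-window spectrogram. The filename and window length below are illustrative assumptions, not necessarily what the notebook uses:

import soundfile as sf
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

speech, fs = sf.read('humanrights.wav')     # hypothetical filename; fs should be 8000
segment = speech[:int(1.5 * fs)]            # first 1.5 seconds

# Use a window much shorter than one pitch period (~10ms), e.g. 2ms = 16 samples at 8kHz
S = librosa.stft(segment, n_fft=256, win_length=16, hop_length=8)
librosa.display.specshow(librosa.amplitude_to_db(np.abs(S)), sr=fs, hop_length=8,
                         x_axis='time', y_axis='hz')
plt.colorbar(format='%+2.0f dB')
plt.show()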

Just to make sure we know what's going on here, let's listen to just that same part of the utterance.

Notice how the vowels show up as strongly voiced regions (with vertical striations once per glottal pulse). The /sh/ in the middle shows up as a loud fricative burst at around 0.8 seconds. The /k/, /g/, and the pause between words show up as silences at 0.4, 0.6, and 1.25 seconds, respectively.


Part 2: Chop it into frames

At this point, we'll load the file submitted.py.

The file submitted.py is the only part of your work that the autograder will see. The only purpose of this notebook is to help you debug submitted.py. Once you have revised submitted.py enough to make this notebook work, go to the command line and type python grade.py. Once that command runs without errors, you can submit your file submitted.py to the autograder. You can submit to the autograder as often as you want, but it will save you trouble if you debug as much as you can on your local machine before you submit.

We will use importlib in order to reload your submitted.py over and over again. That way, every time you make a modification in submitted.py, you can just re-run the corresponding block of this notebook, and it will reload submitted.py with your modified code.

Since the file is called submitted.py, python considers it to contain a module called submitted. As shown, you can read the module's docstring by printing submitted.__doc__. You can also type help(submitted) to get a lot of information about the module, including its docstring, a list of all the functions it defines, and all of their docstrings. For more about docstrings, see, for example, https://www.python.org/dev/peps/pep-0257/.

Now it's time for you to open submitted.py, and start editing it. You can open it in another Jupyter window by choosing "Open from Path" from the "File" menu, and then typing submitted.py.

Once you have it open, try editing the function make_frames so that its functionality matches its docstring. Here is what its docstring says:

If this is working, then dividing the speech signal into 30ms frames (240 samples), with a 15ms hop (120 samples), should yield 1189 frames.
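
To give you an idea of what's being asked, here is a minimal sketch of one way make_frames could work; the exact signature, and how any final partial frame is handled, are determined by the docstring, not by this sketch:

import numpy as np

def make_frames(speech, hop_length, win_length):
    # Frame t contains samples t*hop_length through t*hop_length + win_length - 1
    nframes = 1 + (len(speech) - win_length) // hop_length
    frames = np.zeros((nframes, win_length))
    for t in range(nframes):
        frames[t, :] = speech[t * hop_length : t * hop_length + win_length]
    return frames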


Part 3: Calculating autocorrelation of each frame

Now write the function submitted.correlate, to satisfy its docstring:

The autocorrelation of frame $x[n]$ is given by

$$r_{xx}[m] = \sum_n x[n] x[n+m]$$

You can compute this efficiently using https://numpy.org/doc/stable/reference/generated/numpy.correlate.html, but be sure to specify mode='full'.

Notice that the autocorrelation coefficient $R[0]$ is right at the center of each row (at autocor[:,win_length-1]), and the autocorrelation is symmetric. So for example, autocor[0,win_length-3:win_length+2] should equal

[0.000736 0.00078295 0.00081344 0.00078295 0.000736 ]
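
Here is a minimal sketch of correlate, assuming frames is an nframes-by-win_length array and the output is nframes-by-(2*win_length-1); check the docstring for the real conventions:

import numpy as np

def correlate(frames):
    nframes, win_length = frames.shape
    autocor = np.zeros((nframes, 2 * win_length - 1))
    for t in range(nframes):
        # mode='full' returns all lags, with R[0] at index win_length-1
        autocor[t, :] = np.correlate(frames[t, :], frames[t, :], mode='full')
    return autocor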

Part 4: Make matrices

Now that you've computed the autocorrelation of each frame, you need to re-arrange it into matrices, so that you can find the LPC coefficients.

Remember how these matrices are defined:

$$R = \left[\begin{array}{cccc} R[0] & R[1] & \cdots & R[p-1]\\ R[1] & R[0] & \cdots & R[p-2] \\ \vdots & \vdots & \ddots & \vdots \\ R[p-1] & R[p-2] & \cdots & R[0]\end{array}\right],~~~ \gamma = \left[\begin{array}{c}R[1]\\R[2]\\\vdots\\R[p]\end{array}\right]$$

The only thing you have to be careful about is that, in the $t^{\textrm{th}}$ frame, $R[0]$ is at the center of autocor[t,:]. So you have to figure out where the center is, and align things accordingly. If you get everything working, then with $p=10$, R[0,:6,:6] and gamma[0,:6] should look like this:

In frame 0, R is 
[[0.00081344 0.00078295 0.000736   0.00069581 0.00066528 0.00063886]
 [0.00078295 0.00081344 0.00078295 0.000736   0.00069581 0.00066528]
 [0.000736   0.00078295 0.00081344 0.00078295 0.000736   0.00069581]
 [0.00069581 0.000736   0.00078295 0.00081344 0.00078295 0.000736  ]
 [0.00066528 0.00069581 0.000736   0.00078295 0.00081344 0.00078295]
 [0.00063886 0.00066528 0.00069581 0.000736   0.00078295 0.00081344]]

and gamma is
[0.00078295 0.000736   0.00069581 0.00066528 0.00063886 0.00060859]
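
Here is a sketch of one way to build those matrices from autocor. The shapes shown (R is nframes-by-p-by-p, gamma is nframes-by-p) are assumptions, so defer to the docstring:

import numpy as np

def make_matrices(autocor, p):
    nframes, ncols = autocor.shape
    c = (ncols - 1) // 2                      # column index of R[0] in each row
    R = np.zeros((nframes, p, p))
    gamma = np.zeros((nframes, p))
    for t in range(nframes):
        for i in range(p):
            for j in range(p):
                R[t, i, j] = autocor[t, c + abs(i - j)]   # Toeplitz: R[i,j] = R[|i-j|]
            gamma[t, i] = autocor[t, c + i + 1]           # gamma[i] = R[i+1]
    return R, gamma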

Part 5: Calculating the LPC coefficients

Now you should calculate the LPC coefficients in each frame!

Remember that the normal equations result in the following easy form:

$$a = R^{-1}\gamma$$
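
In code, this is just a linear solve in each frame. A sketch, assuming R and gamma have the shapes from the previous sketch (np.linalg.solve is preferable to forming the inverse explicitly):

import numpy as np

def lpc(R, gamma):                            # hypothetical name; check the docstring
    nframes, p, _ = R.shape
    a = np.zeros((nframes, p))
    for t in range(nframes):
        try:
            a[t, :] = np.linalg.solve(R[t], gamma[t])
        except np.linalg.LinAlgError:
            pass                              # silent frames can make R singular; leave a[t]=0
    return a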

Here's a neat trick we can use, in order to see if the LPC coefficients we've calculated are reasonable. Remember that the LPC synthesis filter has the following frequency response:

$$H(\omega) = \frac{G}{1-\sum_{k=1}^p a_k e^{-jk\omega}}$$

If we calculate $H(\omega)$ explicitly, for every frame, then we can show it as if it were a spectrogram. It should look similar to the spectrogram of the original speech signal. It won't have the right amplitude (because we don't know $G$ yet), but it should have the right general shape.

Remember that we're using a 15ms hop_length, so, to match the 1.5 seconds that we plotted earlier, we'll just do this for the first 100 frames.
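
Here's a sketch of that visualization, assuming a is the nframes-by-p coefficient array from the previous sketch, an 8kHz sampling rate, and a 15ms hop:

import numpy as np
import matplotlib.pyplot as plt

nfft = 256
H = np.zeros((nfft // 2 + 1, 100))
for t in range(100):
    denom = np.concatenate(([1.0], -a[t, :]))     # 1 - a_1 z^{-1} - ... - a_p z^{-p}
    H[:, t] = 1.0 / np.abs(np.fft.rfft(denom, nfft))
plt.imshow(20 * np.log10(H), origin='lower', aspect='auto',
           extent=[0, 100 * 0.015, 0, 4000])      # 1.5 seconds, 0-4kHz
plt.xlabel('Time (s)')
plt.ylabel('Frequency (Hz)')
plt.show()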


Part 6: Calculating the pitch period

In order to synthesize speech, we need to calculate the pitch period in each frame. In frames that are not voiced, we will signal that by setting framepitch to zero.

Let's see how this will work. If you look at the spectrogram above, you can see that there is a vowel at about $t=0.3$ seconds, i.e., frame number 20. There is a loud unvoiced fricative just before $t=0.9$s, i.e., frame numbers just before 60. Let's plot those autocorrelations, to compare them. In order to make the comparison easier, we'll normalize each of the two autocorrelations by their $R[0]$ values:

$$C[m] = \frac{R[m]}{R[0]}$$

You can see that the voiced frame has a strong peak at about 80 samples (about $80/8000=10$ ms), which seems likely to be the pitch period. The unvoiced frame has a lot of energy at very high frequencies, but at lags longer than about 20 samples ($20/8000=2.5$ ms), its normalized autocorrelation stays below about $|C[m]|=0.3$. So we'll use the following strategy:

Here is what the result looks like, for the first 100 frames. You will notice that the voiced/unvoiced (V/UV) decision is actually not very good. We would do better if we used more features for the V/UV decision, but this will be good enough for the purposes of this MP.
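
For reference, here is a rough sketch of a voicing/pitch detector in the spirit of the discussion above. The threshold of 0.3 and the 4ms-13ms search range are assumptions for illustration only; the graded version must follow the docstring in submitted.py:

import numpy as np

def framepitch(autocor, Fs):
    nframes, ncols = autocor.shape
    c = (ncols - 1) // 2                          # column index of R[0]
    lo, hi = int(0.004 * Fs), int(0.013 * Fs)     # assumed pitch-period search range
    pitch = np.zeros(nframes, dtype=int)
    for t in range(nframes):
        if autocor[t, c] > 0:
            C = autocor[t, c + lo : c + hi + 1] / autocor[t, c]
            if np.amax(C) > 0.3:                  # voiced if the peak is strong enough
                pitch[t] = lo + int(np.argmax(C))
    return pitch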


Part 7: Calculating the spectral level in every frame

Spectral level is defined to be the average spectral power, expressed in decibels:

$$L = 10\log_{10} \left(\frac{1}{N} \sum_{n=0}^{N-1} x^2[n]\right)$$

Let's calculate that for each frame.
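
A minimal sketch, assuming frames is the nframes-by-win_length array from Part 2 (the small floor is just to avoid log(0) in silent frames, and may or may not match the docstring):

import numpy as np

def framelevel(frames):
    power = np.mean(np.square(frames), axis=1)        # (1/N) * sum of x^2[n]
    return 10.0 * np.log10(np.maximum(power, 1e-12))  # level in dB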

If we plot this for the first 100 frames, we should see levels that match the brightness of the corresponding frames in the spectrogram.


Part 8: Linear interpolation of the pitch and level

OK, now we're finally done with the analysis! We've converted the input speech into a small set of parameters per frame:

Given those three parameters per frame, now we want to start the process of resynthesizing the speech signal.

The first thing we need to do is to linearly interpolate the pitch period and the spectral level between frames.

If we plot the first 100 frames of framelevel and framepitch, and compare it to the first 100*hop_length samples of samplelevel and samplepitch, they should look basically the same.

Notice that there is one difference between them: the frame pitch (top plot) appears to interpolate linearly between zero and large values (because pyplot interpolates linearly between plotted points by default), whereas the sample pitch does not.
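
Here is a sketch of the kind of interpolation being described. The alignment of frame t with sample t*hop_length, and the behavior at voiced/unvoiced boundaries, are assumptions; the docstring in submitted.py is authoritative:

import numpy as np

def interpolate(framelevel, framepitch, hop_length):
    nframes = len(framelevel)
    nsamples = nframes * hop_length
    t_frame = np.arange(nframes) * hop_length          # sample index assigned to each frame
    samplelevel = np.interp(np.arange(nsamples), t_frame, framelevel)

    samplepitch = np.zeros(nsamples)
    for t in range(nframes - 1):
        seg = slice(t * hop_length, (t + 1) * hop_length)
        if framepitch[t] > 0 and framepitch[t + 1] > 0:
            w = np.arange(hop_length) / hop_length     # interpolate only between voiced frames
            samplepitch[seg] = (1 - w) * framepitch[t] + w * framepitch[t + 1]
        else:
            samplepitch[seg] = framepitch[t]           # hold across voiced/unvoiced boundaries
    samplepitch[(nframes - 1) * hop_length:] = framepitch[-1]
    return samplelevel, samplepitch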


Part 9: Create the LPC Excitation Signal

The LPC excitation signal is

$$e[n] = \left\{\begin{array}{ll} \mbox{Zero-mean Gaussian white noise}& \mbox{unvoiced}\\\mbox{Impulse train}&\mbox{voiced}\end{array}\right.$$

There are two difficult points here:

  1. First, in order to keep track of when you should generate an impulse, you need to keep track of the phase of the fundamental frequency. Remember that phase is just the integral of frequency, so it's given by
$$\phi[n] = \phi[n-1] + 2\pi F_0[n]$$

where $F_0[n]=1/T_0[n]$ is the pitch frequency in cycles/sample (or $F_0[n]=0$ if $T_0[n]=0$). Every time $\phi[n]$ passes $2\pi$, we do two things: (1) Subtract $2\pi$, and (2) output an impulse in the excitation signal.

  2. Second: what scalar should you multiply each sample of $e[n]$ by, so that the local average level is always samplelevel[n]? Notice that your answer is different for white noise (where every sample is nonzero) versus an impulse train (where very few samples are nonzero). For white noise, the answer is the RMS signal amplitude, $a_{RMS}$, which is related to the level by just inverting the dB formula: $$a_{RMS} = \sqrt{10^{L/10}}$$ For an impulse train, you also need to account for the fact that only $1/T_0$ of the samples are nonzero. This gives you a delta-function amplitude, $a_{delta}$, of $$a_{delta}=a_{RMS}\sqrt{T_0}$$
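
Putting those two points together, here is a sketch of a phase-accumulator excitation generator. The function name and conventions are assumptions; follow the docstring in submitted.py:

import numpy as np

def excitation(samplepitch, samplelevel):
    e = np.zeros(len(samplepitch))
    phase = 0.0
    for n in range(len(samplepitch)):
        a_rms = np.sqrt(10.0 ** (samplelevel[n] / 10.0))
        if samplepitch[n] > 0:                       # voiced: impulse train
            phase += 2 * np.pi / samplepitch[n]      # phi[n] = phi[n-1] + 2*pi*F0[n]
            if phase >= 2 * np.pi:
                phase -= 2 * np.pi
                e[n] = a_rms * np.sqrt(samplepitch[n])   # a_delta = a_rms * sqrt(T0)
        else:                                        # unvoiced: Gaussian white noise
            e[n] = a_rms * np.random.randn()
    return e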

Here's what the phase looks like. Notice that during unvoiced regions, it remains constant. During voiced regions, it rotates from $\phi=0$ to $\phi=2\pi$ once every pitch period.

And here's what the excitation signal looks like. Notice that there's an impulse every time the phase returns to zero. During unvoiced frames, it is low-amplitude white noise.


Part 10: LPC Synthesis

Now we have the excitation signal $e[n]$, and we have the LPC coefficients $a_k$, so we can use those things to resynthesize the speech signal!

The synthesis formula is really just the standard direct-form IIR filter formula:

$$y[n] = e[n] + \sum_{k=1}^p a_k y[n-k]$$
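
A sketch of that filter, assuming e comes from Part 9, a is the nframes-by-p coefficient array, and sample n uses the coefficients of the frame it falls in (that last convention is an assumption; check the docstring):

import numpy as np

def synthesize(e, a, hop_length):
    nframes, p = a.shape
    y = np.zeros(len(e))
    for n in range(len(e)):
        t = min(n // hop_length, nframes - 1)    # frame containing sample n
        y[n] = e[n]
        for k in range(1, p + 1):
            if n - k >= 0:
                y[n] += a[t, k - 1] * y[n - k]   # y[n] = e[n] + sum_k a_k y[n-k]
    return y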

Grade your code on your own machine before submitting it!

If you reached this point in the notebook, then probably your code is working well, but before you run the autograder on the server, you should first run it on your own machine.

You can do that by going to a terminal, and running the following command line:

python grade.py

If everything worked perfectly, you'll get a message that looks like this:

.........
----------------------------------------------------------------------
Ran 9 tests in 6.192s

OK

But suppose that something didn't work well. For example, suppose you run python grade.py, and you get the following:

........F
======================================================================
FAIL: test_synthesize (test_visible.TestStep)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/Users/jhasegaw/Dropbox/mark/teaching/ece417/ece417labs/21fall/mp1/src/tests/test_visible.py", line 159, in test_synthesize
    self.assertAlmostEqual(
AssertionError: 0.01161102437459162 != 0.02322204874918324 within 7 places (0.01161102437459162 difference) : *** y[8252] should be 0.01161102437459162
----------------------------------------------------------------------
Ran 9 tests in 6.533s

FAILED (failures=1)

This error message means that the function test_synthesize, in the file tests/test_visible.py, failed (all of the other tests succeeded -- only that one failed).

The error message specifies sample $y[8252]$ should have been $0.01161102437459162$, but instead, it was $0.02322204874918324$.

How can you debug this?

The autograder works as follows:

  1. Load reference inputs and outputs of each function from the file solutions.hdf5
  2. Put in the reference inputs, and get a hypothesis output.
  3. Compare the reference and the hypothesis at a few sample points. If they differ by more than one part in $10^7$, then call it an error.

You are strongly encouraged to load the file solutions.hdf5, and use it to debug your code. Here's how to do it:

As you can see, this file contains a lot of objects, created during a sample run of the solution code with random parameters. Let's load the reference inputs and outputs of synthesize, compute a hypothesis output, and compare them in the neighborhood of sample $8252$:
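
Here is a sketch of that debugging workflow. The dataset names 'excitation', 'a', and 'y' below are placeholders, and the synthesize signature is an assumption; print the keys to see what solutions.hdf5 really contains:

import h5py
import numpy as np
import matplotlib.pyplot as plt
import submitted

with h5py.File('solutions.hdf5', 'r') as h5:
    print(list(h5.keys()))                  # list the stored objects
    e_ref = h5['excitation'][:]             # placeholder name: reference input
    a_ref = h5['a'][:]                      # placeholder name: reference input
    y_ref = h5['y'][:]                      # placeholder name: reference output

y_hyp = submitted.synthesize(e_ref, a_ref)  # hypothesis output (signature assumed)
n = np.arange(8200, 8300)
plt.plot(n, y_ref[n], '-', label='reference (solid)')
plt.plot(n, y_hyp[n], '--', label='hypothesis (dashed)')
plt.legend()
plt.show()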

Right away, we notice that the expected solution (solid) is half the amplitude of the solution being generated by my file! Upon inspection, I discover that I accidentally multiplied the excitation by $2$ at the input to the filter. Fixing that line in submitted.py fixes all of the results.

When gradescope grades your code, it will run grade.py. It will test your code using the solutions in solutions.hdf5, and using the test code tests/test_visible.py. It will also test your code using some hidden tests. The hidden tests are actually exactly the same as those in tests/test_visible.py, but with different input parameters.


Extra Credit

You can earn up to 10% extra credit on this MP by finishing the file called extra.py, and submitting it to the autograder.

When you unpack the file mp1_extra.zip, it will give you the following files:

If you look in the file tests/test_extra.py, you find the following grading scheme:

Hint: if you just implement the same algorithm that was used in submitted.framepitch, you won't get full credit, but you'll do pretty well.

Other features to think about include:

If you have a large external data source with labeled voiced/unvoiced frames, you could try training a neural net using the spectrogram. I don't think that's necessary, though; with a three-dimensional feature vector, you should be able to tweak a VAD on extra_train.wav so that it gets low error on extra_test.wav.

When you think you have it working, you can test it by running:

python grade.py

in your terminal. Yes, indeed, this is the same code you ran for the regular MP! The only difference is that, when you unzipped mp1_extra.zip, it gave you the test file tests/test_extra.py. So now, when you run grade.py, it will grade both your regular MP and the extra credit part.

Having implemented only the default algorithm from submitted.framepitch, here's what I get already:

..F..........
======================================================================
FAIL: test_extra_better_than_6_percent (test_extra.TestStep)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/Users/jhasegaw/Dropbox/mark/teaching/ece417/ece417labs/21fall/mp1/src/tests/test_extra.py", line 33, in test_extra_better_than_6_percent
    self.assertLess(errorrate, 0.06)
AssertionError: 0.07996223039459298 not less than 0.06
----------------------------------------------------------------------
Ran 13 tests in 6.234s

FAILED (failures=1)

... so in case you didn't notice, that's already pretty good: the default algorithm passes every test except the 6% error-rate threshold.

Congratulations! That's the end of MP1. Good luck!