In this MP, you will construct neural network layers using PyTorch for use with the system described in "AUTOVC: Zero-Shot Voice Style Transfer with Only Autoencoder Loss" (ICML 2019) by Qian, Zhang, Chang, Yang, and Hasegawa-Johnson. The file you actually need to complete is submitted.py. The unit tests, provided in tests/test_visible.py, may be run using grade.py.
import numpy as np
import matplotlib.figure
import matplotlib.pyplot as plt
%matplotlib inline
import torch
torch.manual_seed(417)
np.random.seed(417)
import importlib
import submitted
importlib.reload(submitted)
AutoVC, put very simply, is a zero-shot style transfer autoencoder for voice conversion.
(There's a lot to unpack in that sentence, so read on...)
The primary assumption that AutoVC makes is that any given speech utterance is dependent on two parts, each separately distributed (Sec. 3.1):
1) a content-specific component, corresponding roughly to the information about a sentence that would be captured in a textual transcription, and 2) a speaker-specific component, imparting information about how a given individual vocally produces that sentence.
It is important that an utterance converted to use the speaker-specific information of a target speaker sound as much like that target speaker as possible, while maintaining constant content-specific information (Eq. 2). For this to be achieved, therefore, it must be possible to disentangle both of these components readily.
An autoencoder (Fig. 1) is a combination of an encoder network and a decoder network, the output of the former serving as the input of the latter. It is often used to learn a lower-dimensional representation or 'embedding' of a given piece of data; because some information is lost in this dimension reduction, the reduction may be considered an information 'bottleneck'. An autoencoder is often trained to approximate its input as closely as possible, improving the quality of the embedding in the process.
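For instance, a bare-bones autoencoder in PyTorch might look something like the following sketch (the class, names, and dimensions here are purely illustrative and are not part of the MP):

import torch
from torch import nn

class TinyAutoencoder(nn.Module):
    # Illustrative only: compress an 80-dimensional input to an 8-dimensional bottleneck,
    # then try to reconstruct the input from that bottleneck.
    def __init__(self, dim_in=80, dim_bottleneck=8):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(dim_in, dim_bottleneck), nn.ReLU())
        self.decoder = nn.Linear(dim_bottleneck, dim_in)

    def forward(self, x):
        embedding = self.encoder(x)      # the low-dimensional 'bottleneck' representation
        return self.decoder(embedding)   # trained to match x, e.g. with an MSE loss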
AutoVC's encoder attempts to output an embedding for the content-specific component of an utterance by one speaker. This output, together with a similarly produced speaker-specific embedding from an utterance by another speaker, is then fed into AutoVC's decoder to yield a converted utterance. It is the size of the bottleneck (Sec. 3.3) that is tuned to ensure that the content embedding contains as little residual information about the first speaker as possible.
Most, if not all, prior voice conversion attempts require that the source and the target be known to the system in the training process. AutoVC, however, is able to handle speakers that it did not encounter in training. This ability stems largely from the speaker embedding being more than just a one-hot vector, since there is a separate encoder trained to generate it.
Provided for you are two files, source_utterance.flac and target_utterance.flac. Once you have completed the main MP, you can try AutoVC out, attempting to convert the voice in the source utterance into the voice in the target utterance, by running python _main.py and viewing the file converted_utterance.flac. You may also specify the files to use manually (for instance, transferring the voice in a.wav into b.wav and saving the result in c.wav) by running python _main.py b.wav a.wav c.wav.
This MP primarily consists of the implementation of different PyTorch modules which are direct re-implementations of four of the AutoVC components shown in the figure below: (a) the content encoder, (b) the style encoder, (c.1) the decoder, and (c.2) the postnet. Most of the code you don't have to write (e.g., the spectrogram inverter) is adapted from Kaizhi's original code, or from an adjusted version thereof, and is available in the file _main.py; you will only need to run that file if you want to test the whole voice conversion pipeline, e.g., if you want to try converting your voice to somebody else's.
Figure 3 from Qian et al., 2019. AUTOVC architecture. The number above each block represents the cell/output dimension of the structure. ConvNorm denotes convolution followed by batch normalization. BLSTM denotes bi-directional LSTM, whose white block denotes the forward direction, and grey block the backward direction. (a) The content encoder. The $E_s()$ module is of the same architecture as in (b). (b) The style encoder. (c) The decoder. (d) The spectrogram inverter. (e) and (f) demonstrate the downsampling and upsampling of the forward and backward outputs of the bi-directional LSTM, using an up/downsampling factor of 3 as an example. The real up/downsampling factor is 32. The lightened features are those that are removed; the arrows denote copying the feature at the arrow origin to the destination.
The key idea of AutoVC is the information bottleneck. An information bottleneck is a general idea in autoencoder architectures: the bottleneck layer is the layer with the fewest nodes, where the information from the input signals is compressed as far as it can be compressed, before it must be expanded to reconstruct the original signal.
In AutoVC, the bottleneck layer consists of two sequences of dim_neck-dimensional vectors, each occurring at a frame rate of 1/downsample. During training, the content and style codes come from the same waveform. During testing, however, they come from different waveforms: the style code is supposed to encode the information that would be useful for speaker verification, and the content code encodes everything else. Here, "everything else" means the text of the utterance, plus information about the rhythm and the emphasis that is applied to that text.
In order to train effectively, the content code needs to be of a relatively low dimension. If its dimension is too wide, then the content code will include some information about the speaker ID, and voice conversion will fail. If its dimension is too narrow, then the content code will not be large enough to encode all of the text of the utterance, so the converted utterance will not be intelligible. Balancing between the two extremes is done in two ways: (1) by choosing an appropriate value of dim_neck, and (2) by downsampling at the encoder, then upsampling again at the decoder, as shown in subfigures (e) and (f) of the figure above. The problem of correctly balancing between too wide and too narrow is schematized in the following figure from the paper by Qian et al.:
In this MP, we will implement everything except the downsampling and upsampling operations shown in subfigures (e)-(f). In other words, we will implement an LSTM that has an output dimension of dim_neck, but we will not downsample it, so the frame rate of $C_{1\rightarrow}$ and $C_{1\leftarrow}$ will be the same as the frame rate of the spectrogram.
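For reference only (the MP does not ask you to implement this), the downsampling and upsampling of subfigures (e)-(f) amount to keeping one code vector out of every 32 frames, and then copying it back across the frames it represents. A rough sketch of that idea follows; the exact offsets at which the forward and backward outputs are sampled differ in the real system, as the subfigures show, but the basic operation is the same:

import torch

freq = 32                                  # up/downsampling factor used in the paper
codes = torch.randn(1, 128, 16)            # (batch, n_frames, dim_neck) from one LSTM direction

# Downsample: keep one code vector out of every freq frames.
down = codes[:, ::freq, :]                 # (batch, n_frames // freq, dim_neck)

# Upsample: copy each kept vector back across the freq frames it represents.
up = down.repeat_interleave(freq, dim=1)   # (batch, n_frames, dim_neck)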
How submitted.py is structured

Each function to write has type hints in its signature for both inputs and output. In the line def f(x: int, y: float) -> str:, x is an integer, y is a floating-point number, and the output from calling f(x,y) is a string.
The hints used for individual tensors are supplied by the torchtyping package, which can make the PyTorch code you write somewhat easier to understand, since it allows you to specify information about dimensions. Here's a brief summary of what you will encounter in the hints: each tensor hint has the form TensorType[...], where ... is a comma-separated list of named dimensions. Note that these names are not exposed to you in the function itself, so you will still need to extract the actual sizes from the inputs. For example, def f(x: TensorType["batch", "length"]) -> TensorType["length", "batch"]: takes tensors of two dimensions as input and outputs tensors of a shape transposed from that of the input.
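To make that concrete, a hypothetical function (not part of submitted.py) carrying that hint might recover the actual sizes like this:

import torch
from torchtyping import TensorType

def f(x: TensorType["batch", "length"]) -> TensorType["length", "batch"]:
    batch, length = x.shape    # the named dimensions must be read off the tensor itself
    return x.transpose(0, 1)   # output shape is (length, batch)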
The first of the AutoVC modules you will be implementing is the content encoder (Fig. 3(a)), appropriately named Encoder.
Note that you do not need to handle the concatenation of the speaker embedding and the spectrogram at the beginning, nor the dimensionality reduction at the end, as these are performed for you in _main.py.
import submitted, importlib
importlib.reload(submitted)
help(submitted.Encoder.__init__)
help(submitted.Encoder.forward)
Help on function __init__ in module submitted:

__init__(self, dim_neck: int, dim_emb: int, freq: int)
    Sets up the following:
    self.convolutions - the 1-D convolution layers. The first should have 80 + dim_emb input channels and 512 output channels, while each following convolution layer should have 512 input and 512 output channels. All such layers should have a length-5 kernel, with a stride of 1, a dilation of 1, and a padding of 2. The output of each convolution layer should be fed into a BatchNorm1d layer of 512 input features, and the output of each BatchNorm1d should be fed into a ReLU layer.
    self.recurrent - a bidirectional LSTM with two layers, an input size of 512, and an output size of dim_neck.

Help on function forward in module submitted:

forward(self, x: TensorType["batch", "input_dim", "length"]) -> Tuple[TensorType["batch", "length", "dim_neck"], TensorType["batch", "length", "dim_neck"]]
    Performs the forward propagation of the AutoVC encoder. After passing the input through the convolution layers, the last two dimensions should be transposed before passing those layers' output through the EllEssTeeEmm. The output from the EllEssTeeEmm should then be split *along the last dimension* into two chunks, one for the forward direction (the first self.recurrent_hidden_size columns) and one for the backward direction (the last self.recurrent_hidden_size columns).
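To make the docstring concrete, here is a minimal sketch of an architecture consistent with it. The number of ConvNorm blocks (three, following Fig. 3(a)) and details such as batch_first are assumptions of this sketch; your own implementation is free to organize things differently, as long as the tests pass.

import torch
from torch import nn

class EncoderSketch(nn.Module):
    # Illustrative sketch only; submitted.Encoder must follow the docstring above.
    def __init__(self, dim_neck, dim_emb, freq):
        super().__init__()
        layers = []
        in_channels = 80 + dim_emb
        for _ in range(3):  # three ConvNorm blocks, as drawn in Fig. 3(a)
            layers += [nn.Conv1d(in_channels, 512, kernel_size=5, stride=1, padding=2, dilation=1),
                       nn.BatchNorm1d(512),
                       nn.ReLU()]
            in_channels = 512
        self.convolutions = nn.Sequential(*layers)
        self.recurrent = nn.LSTM(512, dim_neck, num_layers=2, batch_first=True, bidirectional=True)
        self.recurrent_hidden_size = dim_neck

    def forward(self, x):
        # x: (batch, input_dim, length)
        y = self.convolutions(x)           # (batch, 512, length)
        y = y.transpose(1, 2)              # (batch, length, 512)
        outputs, _ = self.recurrent(y)     # (batch, length, 2*dim_neck)
        forward_code = outputs[:, :, :self.recurrent_hidden_size]
        backward_code = outputs[:, :, self.recurrent_hidden_size:]
        return forward_code, backward_code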
Note that this encoder network makes use of a number of PyTorch modules, including nn.Conv1d, nn.BatchNorm1d, nn.ReLU, and a bidirectional nn.LSTM.
The second module you will implement is a speaker embedding encoder, appropriately named SpeakerEmbedderGeeArrYou. This is not exactly the same speaker encoder used in the original AutoVC, but a somewhat simplified version that uses GRUs.
importlib.reload(submitted)
help(submitted.SpeakerEmbedderGeeArrYou.__init__)
help(submitted.SpeakerEmbedderGeeArrYou.forward)
Help on function __init__ in module submitted:

__init__(self, n_hid: int, n_mels: int, n_layers: int, fc_dim: int, hidden_p: float) -> None
    Sets up the following:
    self.rnn_stack - an n_layers-layer GRU with n_mels input features, n_hid hidden features, and a dropout of hidden_p.
    self.projection - a Linear layer with an input size of n_hid and an output size of fc_dim.

Help on function forward in module submitted:

forward(self, x: TensorType["batch", "frames", "n_mels"]) -> TensorType["batch", "fc_dim"]
    Performs the forward propagation of the SpeakerEmbedderGeeArrYou. After passing the input through the RNN, the last frame of the output should be taken and passed through the fully connected layer. Each of the frames should then be normalized so that its Euclidean norm is 1.
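A minimal sketch consistent with that docstring follows; batch_first and the exact normalization call are assumptions of this sketch:

import torch
from torch import nn

class SpeakerEmbedderSketch(nn.Module):
    # Illustrative sketch only; see the docstring above for the required attributes.
    def __init__(self, n_hid, n_mels, n_layers, fc_dim, hidden_p):
        super().__init__()
        self.rnn_stack = nn.GRU(n_mels, n_hid, num_layers=n_layers, dropout=hidden_p, batch_first=True)
        self.projection = nn.Linear(n_hid, fc_dim)

    def forward(self, x):
        # x: (batch, frames, n_mels)
        outputs, _ = self.rnn_stack(x)                  # (batch, frames, n_hid)
        last = outputs[:, -1, :]                        # keep only the last frame: (batch, n_hid)
        emb = self.projection(last)                     # (batch, fc_dim)
        return emb / emb.norm(p=2, dim=1, keepdim=True) # unit Euclidean norm per utterance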
The third of the AutoVC modules you will be implementing is the decoder, which is the portion of Fig. 3(c) up to and including the $1\times 1$-conv. A $1\times 1$ convolution is nothing more or less than a fully-connected linear projection layer, applied separately at every time step in the input, so we will use the term self.fc_projection to mean the $1\times 1$-conv layer in Fig. 3(c). In submitted.py, the decoder network is called Decoder.
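As a quick illustration of that equivalence (this snippet is not part of the MP), a Conv1d with a length-1 kernel and a Linear layer sharing the same weights produce identical outputs, up to a transposition of the time and channel dimensions:

import torch
from torch import nn

torch.manual_seed(0)
conv = nn.Conv1d(1024, 80, kernel_size=1)      # 1x1 convolution over time
fc = nn.Linear(1024, 80)                       # equivalent per-frame projection
with torch.no_grad():
    fc.weight.copy_(conv.weight.squeeze(-1))   # Conv1d weight has shape (out, in, 1)
    fc.bias.copy_(conv.bias)

x = torch.randn(2, 1024, 100)                  # (batch, channels, time)
y_conv = conv(x)                               # (batch, 80, time)
y_fc = fc(x.transpose(1, 2)).transpose(1, 2)   # apply the Linear layer at every time step
print(torch.allclose(y_conv, y_fc, atol=1e-5)) # True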
importlib.reload(submitted)
help(submitted.Decoder.__init__)
help(submitted.Decoder.forward)
Help on function __init__ in module submitted:

__init__(self, dim_neck: int, dim_emb: int, dim_pre: int) -> None
    Sets up the following:
    self.recurrent1 - a unidirectional LSTM with one layer, an input size of 2*dim_neck + dim_emb and an output size of dim_pre.
    self.convolutions - the 1-D convolution layers. Each convolution layer should have dim_pre input and dim_pre output channels. All such layers should have a length-5 kernel, with a stride of 1, a dilation of 1, and a padding of 2. The output of each convolution layer should be fed into a BatchNorm1d layer of dim_pre input features, and the output of that BatchNorm1d should be fed into a ReLU.
    self.recurrent2 - a unidirectional LSTM with two layers, an input size of dim_pre and an output size of 1024.
    self.fc_projection - a Linear layer with an input size of 1024 and an output size of 80.

Help on function forward in module submitted:

forward(self, x: TensorType["batch", "input_length", "input_dim"]) -> TensorType["batch", "input_length", "output_dim"]
    Performs the forward propagation of the AutoVC decoder. It should be enough to pass the input through the first EllEssTeeEmm, the convolution layers, the second EllEssTeeEmm, and the final LineEar layer in that order--except that the "input_length" and "input_dim" dimensions should be transposed before input to the convolution layers, and this transposition should be undone before input to the second EllEssTeeEmm.
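A minimal sketch consistent with that docstring is given below; the number of ConvNorm blocks (three, following Fig. 3(c)) and the use of batch_first are assumptions of this sketch:

import torch
from torch import nn

class DecoderSketch(nn.Module):
    # Illustrative sketch only; submitted.Decoder must follow the docstring above.
    def __init__(self, dim_neck, dim_emb, dim_pre):
        super().__init__()
        self.recurrent1 = nn.LSTM(2 * dim_neck + dim_emb, dim_pre, num_layers=1, batch_first=True)
        layers = []
        for _ in range(3):  # three ConvNorm blocks, as drawn in Fig. 3(c)
            layers += [nn.Conv1d(dim_pre, dim_pre, kernel_size=5, stride=1, padding=2, dilation=1),
                       nn.BatchNorm1d(dim_pre),
                       nn.ReLU()]
        self.convolutions = nn.Sequential(*layers)
        self.recurrent2 = nn.LSTM(dim_pre, 1024, num_layers=2, batch_first=True)
        self.fc_projection = nn.Linear(1024, 80)

    def forward(self, x):
        # x: (batch, input_length, input_dim)
        y, _ = self.recurrent1(x)                                  # (batch, input_length, dim_pre)
        y = self.convolutions(y.transpose(1, 2)).transpose(1, 2)   # convolve over time, then undo the transpose
        y, _ = self.recurrent2(y)                                  # (batch, input_length, 1024)
        return self.fc_projection(y)                               # (batch, input_length, 80)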
The last of the AutoVC modules you will be implementing is the decoder post-network, which consists of the final Conv1d layers in Fig. 3(c). As shown there, we will use five $5\times 1$ ConvNorm layers, with 512-dimensional hidden vectors and an 80-dimensional output vector on the last layer. In submitted.py, this network is called Postnet.
importlib.reload(submitted)
help(submitted.Postnet.__init__)
help(submitted.Postnet.forward)
Help on function __init__ in module submitted:

__init__(self) -> None
    Sets up the following:
    self.convolutions - a Sequential object with five Conv1d layers, each with length-5 kernels, a stride of 1, a padding of 2, and a dilation of 1:
        The first should take an 80-channel input and yield a 512-channel output.
        The next three should take 512-channel inputs and yield 512-channel outputs.
        The last should take a 512-channel input and yield an 80-channel output.
    Each layer's output should be passed into a BatchNorm1d, and (except for the last layer) from there through a Tanh, before being sent to the next layer.

Help on function forward in module submitted:

forward(self, x: TensorType["batch", "input_channels", "n_mels"]) -> TensorType["batch", "input_channels", "n_mels"]
    Performs the forward propagation of the AutoVC decoder. If you initialized this module properly, passing the input through self.convolutions here should suffice.
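A minimal sketch consistent with that docstring follows; exactly how the layers are grouped inside the Sequential is an assumption of this sketch:

import torch
from torch import nn

class PostnetSketch(nn.Module):
    # Illustrative sketch only; submitted.Postnet must follow the docstring above.
    def __init__(self):
        super().__init__()
        layers = [nn.Conv1d(80, 512, kernel_size=5, stride=1, padding=2, dilation=1),
                  nn.BatchNorm1d(512), nn.Tanh()]
        for _ in range(3):
            layers += [nn.Conv1d(512, 512, kernel_size=5, stride=1, padding=2, dilation=1),
                       nn.BatchNorm1d(512), nn.Tanh()]
        layers += [nn.Conv1d(512, 80, kernel_size=5, stride=1, padding=2, dilation=1),
                   nn.BatchNorm1d(80)]   # no Tanh after the last layer
        self.convolutions = nn.Sequential(*layers)

    def forward(self, x):
        # x and the output share the same (batch, input_channels, n_mels) shape
        return self.convolutions(x)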
If you think you have the right architecture for each of the four modules, you can test them by running python _main.py and viewing the file converted_utterance.flac, or by running python grade.py. Once your code seems to be working, try submitting it to Gradescope.
As an extra credit option, record an utterance in your own voice (about five to seven seconds in length, saved as a 16 kHz WAV file), and convert it so that it sounds like it was spoken in the voice of one of the speakers in the VCTK corpus.
7.5 points (10% extra credit) will be awarded if the voice (1) sounds like the target speaker, and (2) is intelligible in reproducing what you originally said. Partial credit will be given if one or the other criterion is only partially true.
Upload a zipfile containing three waveforms: