MP4: Transformer¶
In this MP, you will learn how to test and train a transformer, which is a sequence-to-sequence neural network in which the relationship between input and output is defined exclusively using dot-product attention. You will test and train the transformer on some super-simplified natural-language tasks that would be hard to learn using a fully-connected network, though some types of recurrent networks would work as well as a transformer.
If you're reading this webpage online, then the first thing you should do is download the template package.
Data¶
The template includes data, vocabulary, and a pre-trained model for a couple of different natural language tasks. All of these files are plaintext, so you can read them with a text editor. In order to help you understand their format, though, we have provided a reader.py file with some load and save functions.
Vocabulary¶
For example, let's try loading the vocabulary for task 1:
import importlib, reader
help(reader.load_vocabulary)
Help on function load_vocabulary in module reader:

load_vocabulary(vocabularyfile)
    @param:
    vocabularyfile (str) - name of a file that contains a list of unique words
    @return:
    vocabulary (list) - list of those words, in the same order they had in the file
vocabulary = reader.load_vocabulary('data/task1_vocabulary.txt')
print(vocabulary)
['cat', 'swims', 'happily', 'noun', 'verb', 'adverb', 'is', 'in', 'the', 'string', 'of', 'words']
Dev Data¶
Task 1 doesn't have any training data, because you will be using dev data to test the provided model. Try loading the dev data:
help(reader.load_data)
Help on function load_data in module reader:

load_data(datafile, vocabulary)
    Load data from a datafile.
    @param:
    datafile (str) - the input filename, one sentence per line
    vocabulary (list) - a list of words in the vocabulary, length V
    @return:
    sentences - a list of N sentences, each of which is a list of T words.
    embeddings - a list of N numpy arrays, each of size T[n]-by-V
        embeddings[n][t,:] is one-hot embedding of the t'th word in the n'th sentence.
importlib.reload(reader)
sentences, embeddings = reader.load_data('data/task1_dev_data.txt', vocabulary)
print('\n'.join([' '.join(sentence) for sentence in sentences]))
the noun in the happily swims cat is cat
the adverb in the string of happily cat swims is happily
in the string happily swims cat the verb is swims
the noun in happily cat swims is cat
the noun in the string of swims happily cat is cat
in the string of words swims cat happily the adverb is happily
the verb in the string of words swims happily cat is swims
the adverb in the string of swims cat happily is happily
the noun in the string of words cat happily swims is cat
in the string of swims happily cat the noun is cat
the noun in the string happily cat swims is cat
the adverb in the string of happily swims cat is happily
the adverb in the string swims cat happily is happily
in swims cat happily the verb is swims
the verb in the string cat happily swims is swims
the adverb in the string of cat swims happily is happily
the verb in the string of happily cat swims is swims
the noun in the string swims cat happily is cat
the adverb in the string of happily swims cat is happily
the adverb in the string of cat happily swims is happily
As you can see, task 1 is the task of identifying the noun, verb, or adverb in a short three-word sentence. The target POS (noun, verb, or adverb) is named somewhere in the input. The three-word sentence always contains the words "cat swims happily," but in randomized order.
Let's look at the embeddings for just the first of those sentences.
print(embeddings[0])
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
As you can see, there is one word embedding per row (9 words in sentences[0], 9 rows in embeddings[0]). Each word embedding is a one-hot vector, labeling the identity of the corresponding word. For example, word 0 in the sentence is the, which is vocabulary[8], so embeddings[0][0,8]==1. Likewise, word 1 in the sentence is noun, which is vocabulary[3], so embeddings[0][1,3]==1. And so on, down to the last word, cat.
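In case it helps to see where those one-hot rows come from, here is a minimal sketch of how a sentence could be turned into an embedding matrix like the one above. This is only an illustration: reader.load_data already does this conversion for you, and the helper name one_hot_embeddings is made up for this example.

import numpy as np

def one_hot_embeddings(sentence, vocabulary):
    # Build a T-by-V matrix with one one-hot row per word in the sentence.
    T, V = len(sentence), len(vocabulary)
    X = np.zeros((T, V))
    for t, word in enumerate(sentence):
        X[t, vocabulary.index(word)] = 1.0   # column = index of the word in the vocabulary
    return X

# This should reproduce embeddings[0] from above:
# one_hot_embeddings('the noun in the happily swims cat is cat'.split(), vocabulary)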
Task Definition¶
In order to train and test a transformer, we need to know which words are the prompt text (from which we compute key and value vectors), which words are used to generate query vectors, and which words are the outputs (that the transformer tries to generate).
In this MP, we will use a somewhat simple definition. If a sentence contains $T$ words, we assume that:
- The first $T-2$ words are the prompt text, from which keys and values are generated.
- The last $2$ words are the target output that the transformer should generate.
- In order to generate the second-to-last word of the sentence, the transformer uses a query based on the average of the embeddings of the prompt words.
- In order to generate the last word, the transformer uses a query based on the average of the prompt words and the second-to-last word, i.e., the average of the first $T-1$ words in the sentence.
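Concretely, the splitting rule above could be computed as in the following minimal sketch. The function define_task is already provided for you in reader.py, so this sketch (with the made-up name define_task_sketch) is only meant to clarify the four rules:

import numpy as np

def define_task_sketch(embedding):
    # embedding is T-by-V: the first T-2 rows are the prompt, the last 2 rows are the targets.
    XK = embedding[:-2, :]                     # prompt words: used for keys and values
    Y = embedding[-2:, :]                      # last two words: what the transformer should generate
    XQ = np.stack([
        np.mean(embedding[:-2, :], axis=0),    # query 1: average of the T-2 prompt embeddings
        np.mean(embedding[:-1, :], axis=0),    # query 2: average of the first T-1 embeddings
    ])
    return XK, XQ, Y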
importlib.reload(reader)
help(reader.define_task)
Help on function define_task in module reader:

define_task(embeddings)
    Split the lexical embeddings into XK, XQ, and Y.
    @param:
    embeddings - a T-by-V array, where T is length of the sentence, and V is size of vocabulary
    @return:
    XK - a (T-2)-by-V array, with embeddings that are used to generate key and value vectors
    XQ - a 2-by-V array, with embeddings that are used to generate query vectors
    Y - a 2-by-V array, with embeddings that are the output targets for the transformer
XK, XQ, Y = reader.define_task(embeddings[0])
print('XK is:\n',XK)
print('\nXQ is:\n','\n'.join([' '.join(['%.3f'%(x) for x in row]) for row in XQ]))
print('\nY is:\n',Y)
XK is:
 [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

XQ is:
 0.143 0.143 0.143 0.143 0.000 0.000 0.000 0.143 0.286 0.000 0.000 0.000
0.125 0.125 0.125 0.125 0.000 0.000 0.125 0.125 0.250 0.000 0.000 0.000

Y is:
 [[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
As you can see, XK is just the first 7 words of the sentence, and Y is just the last two words. XQ has two rows:
- XQ[0,:] is the average of the first 7 words of the sentence, including two occurrences of the word the (vocabulary[8]).
- XQ[1,:] is the average of the first 8 words of the sentence, including the 7 words of the prompt (the noun in the happily swims cat), plus the first word that the transformer is supposed to generate, the word is (vocabulary[6]).
Model¶
For the first task, you will not train a model. Instead, you will test a model that has been designed by hand to be capable of solving this task. A one-layer, single-headed transformer model consists of four matrices: $\mathbf{W}_K$, $\mathbf{W}_O$, $\mathbf{W}_Q$, and $\mathbf{W}_V$. These are used to create the keys, outputs, queries, and value vectors, respectively, according to the equations:
$$\mathbf{K}=\left[\begin{array}{c}\mathbf{k}_1^T\\\vdots\\\mathbf{k}_{T-2}^T\end{array}\right]=\mathbf{X}_K\mathbf{W}_K$$
$$\mathbf{O}=\left[\begin{array}{c}\mathbf{o}_1^T\\\mathbf{o}_2^T\end{array}\right]=\mathbf{C}\mathbf{W}_O$$
$$\mathbf{Q}=\left[\begin{array}{c}\mathbf{q}_1^T\\\mathbf{q}_2^T\end{array}\right]=\mathbf{X}_Q\mathbf{W}_Q$$
$$\mathbf{V}=\left[\begin{array}{c}\mathbf{v}_1^T\\\vdots\\\mathbf{v}_{T-2}^T\end{array}\right]=\mathbf{X}_K\mathbf{W}_V$$
importlib.reload(reader)
help(reader.load_model)
Help on function load_model in module reader:

load_model(modelfile)
    Load a model from a text file.
    @param:
    modelfile (str) - name of the file to which model should be saved
    @return:
    WK - numpy array, size V-by-d where V is vocabulary size, d is embedding dimension
    WO - numpy array, size d-by-V where V is vocabulary size, d is embedding dimension
    WQ - numpy array, size V-by-d where V is vocabulary size, d is embedding dimension
    WV - numpy array, size V-by-d where V is vocabulary size, d is embedding dimension
importlib.reload(reader)
WK, WO, WQ, WV = reader.load_model('data/task1_model.txt')
print('WK is\n',WK)
print('WO is\n',WO)
print('WV is\n',WV)
WK is
 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
WO is
 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
WV is
 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
As you can see, for this simple model, $\mathbf{W}_K$ and $\mathbf{W}_V$ are identity matrices, meaning that the key and value vectors are exactly equal to the input word embeddings.
$\mathbf{W}_O$ is almost an identity matrix, except that it maps from the input word in (index 7) to the output word is (index 6). Thus, if it finds that the context vector is the word in, then the output should be the word is.
The matrix that is significantly non-trivial is $\mathbf{W}_Q$, which is set up so that:
- If the prompt contains noun but not is, then it generates a query vector that gives greatest weight to the word in (WQ[3,7]==2.5 is the largest entry in its row).
- If the prompt plus the generated first word contains the sequence noun ... is, then it generates a query vector that gives greatest weight to the word cat (WQ[3,0]+WQ[6,0]==3, which is the largest entry in the sum of these two rows).
- If the prompt contains adverb but not is, then it generates a query vector that gives greatest weight to the word in (WQ[4,7]==2.5 is the largest entry in its row).
- If the prompt plus the generated first word contains the sequence adverb ... is, then it generates a query vector that gives greatest weight to the word happily (WQ[4,1]+WQ[6,1]==3, which is the largest entry in the sum of these two rows).
- If the prompt contains verb but not is, then it generates a query vector that gives greatest weight to the word in (WQ[5,7]==2.5 is the largest entry in its row).
- If the prompt plus the generated first word contains the sequence verb ... is, then it generates a query vector that gives greatest weight to the word swims (WQ[5,2]+WQ[6,2]==3, which is the largest entry in the sum of these two rows).
print('WQ is\n',WQ)
WQ is
 [[-99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99. ]
 [-99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99. ]
 [-99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99. ]
 [  2.  -99.  -99.  -99.  -99.  -99.  -99.    2.5 -99.  -99.  -99.  -99. ]
 [-99.    2.  -99.  -99.  -99.  -99.  -99.    2.5 -99.  -99.  -99.  -99. ]
 [-99.  -99.    2.  -99.  -99.  -99.  -99.    2.5 -99.  -99.  -99.  -99. ]
 [  1.    1.    1.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99. ]
 [-99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99. ]
 [-99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99. ]
 [-99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99. ]
 [-99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99. ]
 [-99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99.  -99. ]]
Task 1: Test an existing transformer¶
For task 1, you will test the existing transformer model that we just loaded. In order to do that, you'll need to modify three functions in transformer.py: softmax, forward, and generate.
softmax¶
The softmax of a row vector $\mathbf{v}^T$ is a vector of the same dimensions as $\mathbf{v}^T$ whose $j^{\text{th}}$ element is:
$$\text{softmax}_j(\mathbf{v})=\frac{\exp(v_j)}{\sum_k \exp(v_k)}$$
For example, if the vector is $\mathbf{v}=[0,1,0,0]^T$, then its softmax is
$$\text{softmax}([0,1,0,0]^T)=\left[\frac{1}{3+e},\frac{e}{3+e},\frac{1}{3+e},\frac{1}{3+e}\right]^T$$
import numpy as np
import transformer
v = np.array([0,1,0,0])
print(np.exp(v)/sum(np.exp(v)))
[0.1748777 0.47536689 0.1748777 0.1748777 ]
Write the function transformer.softmax so it computes the result shown above.
importlib.reload(transformer)
transformer.softmax([0,1,0,0])
array([0.1748777 , 0.47536689, 0.1748777 , 0.1748777 ])
That's all there is to it, except... except that the np.exp function can quickly lead to numerical overflow if its input is even moderately large. For example, consider the vector $\mathbf{v}=[990,991,990,990]^T$. Its softmax is
$$\text{softmax}([990,991,990,990]^T)=\left[
\frac{e^{990}}{e^{990}+e^{991}+e^{990}+e^{990}},
\frac{e^{991}}{e^{990}+e^{991}+e^{990}+e^{990}},\ldots\right]^T$$
$$=\left[
\frac{e^{990}}{e^{990}}\times\left(\frac{1}{3+e}\right),
\frac{e^{990}}{e^{990}}\times\left(\frac{e}{3+e}\right),\ldots\right]^T$$
$$=\left[\frac{1}{3+e},\frac{e}{3+e},\frac{1}{3+e},\frac{1}{3+e}\right]^T$$
When we try to compute it, though, we get the following error:
v = [990,991,990,990]
print(np.exp(v)/np.sum(np.exp(v)))
[nan nan nan nan]
/var/folders/y5/kdcw0kfn2xl71bwrvl50bj1m0000gn/T/ipykernel_76533/3156646497.py:2: RuntimeWarning: overflow encountered in exp
  print(np.exp(v)/np.sum(np.exp(v)))
/var/folders/y5/kdcw0kfn2xl71bwrvl50bj1m0000gn/T/ipykernel_76533/3156646497.py:2: RuntimeWarning: invalid value encountered in divide
  print(np.exp(v)/np.sum(np.exp(v)))
We have encountered the following problem: $e^{990}$ is too large to be computed.
A good solution is to subtract the maximum element from $\mathbf{v}$ before computing the softmax. That way, the largest element in the sum is always $e^0=1$, which is easy to represent on any computer.
numerator = np.exp(v - np.amax(v))
print('The numerator of the softmax is\n',numerator)
prob = numerator / np.sum(numerator)
print('\nand the softmax output is\n',prob)
The numerator of the softmax is
 [0.36787944 1.         0.36787944 0.36787944]

and the softmax output is
 [0.1748777  0.47536689 0.1748777  0.1748777 ]
Revise your code in transformer.softmax so that it works well even with very large logit vectors.
importlib.reload(transformer)
print(transformer.softmax([990,991,990,990]))
[0.1748777 0.47536689 0.1748777 0.1748777 ]
Finally, in order to compute attention, we will find it very useful to compute the softmax normalization on every row of a matrix. Revise your code in transformer.softmax so that it performs the normalization separately on every row of an input matrix.
importlib.reload(transformer)
print(transformer.softmax([[990,991,990,990],[0,0,1,0]]))
[[0.1748777  0.47536689 0.1748777  0.1748777 ]
 [0.1748777  0.1748777  0.47536689 0.1748777 ]]
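Putting those three requirements together, one possible shape for transformer.softmax is sketched below: accept either a vector or a matrix of logits, subtract each row's maximum for numerical stability, exponentiate, and normalize each row separately. This is a sketch, not the only correct implementation.

import numpy as np

def softmax_sketch(logits):
    # Accept a list, a 1-D vector, or a 2-D matrix of logits.
    Z = np.atleast_2d(np.asarray(logits, dtype=float))
    # Subtracting the row maximum makes the largest exponent exp(0)=1, which avoids overflow.
    Z = Z - np.amax(Z, axis=1, keepdims=True)
    P = np.exp(Z)
    P = P / np.sum(P, axis=1, keepdims=True)
    # Return a 1-D array if the input was 1-D, otherwise a matrix with one distribution per row.
    return P[0] if np.ndim(logits) == 1 else P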
forward¶
Now we are ready to implement the forward pass of a transformer.
help(transformer.forward)
Help on function forward in module transformer: forward(XK, XQ, WK, WO, WQ, WV) Perform one layer of transformer inference, using trained model, on given data. @param: XK - (T-2)-by-V array containing embeddings of words to be used for keys and values XQ - 2-by-V array containing embeddings of words to be used for queries WK - V-by-d array mapping X to K WO - d-by-V array mapping C to O WQ - V-by-d array mapping X to Q WV - V-by-d array mapping X to V @return: A - 2-by-(T-2) array, A[i,j] is attention the i'th query pays to the j'th key C - 2-by-d array, context vectors from which P is computed K - (T-2)-by-d array, key vectors computed from XK O - 2-by-V array, O[i,j] is probability that i'th output word should be j Q - 2-by-d array, query vectors computed from XQ V - (T-2)-by-d array, value vectors computed from XK
After computing the matrices $\mathbf{K}$, $\mathbf{Q}$, and $\mathbf{V}$, you need to compute the attention matrix $\mathbf{A}$, the context matrix $\mathbf{C}$, and the output probabilities $\mathbf{O}$ according to:
$$\mathbf{A}=\text{softmax}(\mathbf{Q}\mathbf{K}^T)$$
$$\mathbf{C}=\mathbf{A}\mathbf{V}$$
$$\mathbf{O}=\text{softmax}(\mathbf{C}\mathbf{W}_O)$$
If you try this using the task1 model and the first sentence of the task1 data, you should find that:
- $\mathbf{K}$ and $\mathbf{V}$ are equal to the prompt embedding $\mathbf{X}_K$
importlib.reload(transformer)
A, C, K, O, Q, V = transformer.forward(XK, XQ, WK, WO, WQ, WV)
print('K is\n',K)
print('V is\n',V)
K is
 [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
V is
 [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
You should also find that:
- $\mathbf{Q}$ is a two-row matrix whose maximum elements are the vocabulary indices of the word in in the first row (element 7), and the word cat in the second row (element 0)
print('Q is\n','\n'.join([' '.join(['%.2f'%(q) for q in row]) for row in Q]))
Q is
 -84.57 -99.00 -99.00 -99.00 -99.00 -99.00 -99.00 -84.50 -99.00 -99.00 -99.00 -99.00
-73.88 -86.50 -86.50 -99.00 -99.00 -99.00 -99.00 -86.31 -99.00 -99.00 -99.00 -99.00
- $\mathbf{A}$ is a two-row matrix whose maximum elements are pointers to the positions, in the input sentence, of the word in in the first row (element 2), and of the word cat in the second row (element 6)
print('A is\n','\n'.join([' '.join(['%.2f'%(q) for q in row]) for row in A]))
A is
 0.00 0.00 0.52 0.00 0.00 0.00 0.48
0.00 0.00 0.00 0.00 0.00 0.00 1.00
- $\mathbf{C}$ is a two-row matrix whose maximum elements are the same as those of $\mathbf{Q}$
print('C is\n','\n'.join([' '.join(['%.2f'%(q) for q in row]) for row in C]))
C is
 0.48 0.00 0.00 0.00 0.00 0.00 0.00 0.52 0.00 0.00 0.00 0.00
1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
- $\mathbf{O}$ is a two-row matrix. Each of its rows is a probability vector (non-negative, sums to one). Its maximum elements are the vocabulary indices of the word is in the first row (element 6) and the word cat in the second row (element 0)
print('O is\n','\n'.join([' '.join(['%.2f'%(q) for q in row]) for row in O]))
O is
 0.12 0.08 0.08 0.08 0.08 0.08 0.13 0.08 0.08 0.08 0.08 0.08
0.20 0.07 0.07 0.07 0.07 0.07 0.07 0.07 0.07 0.07 0.07 0.07
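Putting the equations above together, the forward pass can be only a few lines. The following is a rough sketch (the name forward_sketch is made up, and it assumes a row-wise, numerically stable softmax such as the softmax_sketch shown earlier or your own transformer.softmax):

import numpy as np

def forward_sketch(XK, XQ, WK, WO, WQ, WV):
    K = XK @ WK                     # (T-2)-by-d key vectors
    Q = XQ @ WQ                     # 2-by-d query vectors
    V = XK @ WV                     # (T-2)-by-d value vectors
    A = softmax_sketch(Q @ K.T)     # 2-by-(T-2): each query's attention over the keys
    C = A @ V                       # 2-by-d context vectors (attention-weighted averages of the values)
    O = softmax_sketch(C @ WO)      # 2-by-V output probabilities
    return A, C, K, O, Q, V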
generate¶
importlib.reload(transformer)
help(transformer.generate)
Help on function generate in module transformer:

generate(embeddings, vocabulary, WK, WO, WQ, WV)
    Perform inference on the provided embeddings, and report the generated sentences.
    @param:
    embeddings - a list of one-hot embedding matrices, one per sentence
    vocabulary - a list of words in the vocabulary
    WK - V-by-d array mapping X to K
    WO - d-by-V array mapping C to O
    WQ - V-by-d array mapping X to Q
    WV - V-by-d array mapping X to V
    @return:
    generated - a list of generated sentences, each as a list of space-separated words.
        The first T-2 words of each sentence should be vocabulary items indexed by the argmax
        of the first T-2 embeddings.  The last 2 words of each sentence should be vocabulary
        items indexed by the argmax of the two outputs computed by running the transformer
        with the provided WK, WO, WQ, and WV.
The generate function should do the following things for each sentence:
- Use reader.define_task to generate XK, XQ, and Y
- Look up the vocabulary items corresponding to the argmax in each row of XK
- Use forward to generate transformer outputs
- Look up the vocabulary items corresponding to the argmax in each row of O
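In other words, generate just glues together pieces you already have. A rough sketch (using made-up names, and calling forward_sketch in place of your own forward) might look like this:

import numpy as np
import reader

def generate_sketch(embeddings, vocabulary, WK, WO, WQ, WV):
    generated = []
    for embedding in embeddings:
        XK, XQ, Y = reader.define_task(embedding)
        # The prompt words are read directly off the argmax of each prompt embedding.
        words = [vocabulary[np.argmax(row)] for row in XK]
        # The last two words are whatever the transformer assigns the highest probability.
        A, C, K, O, Q, V = forward_sketch(XK, XQ, WK, WO, WQ, WV)
        words += [vocabulary[np.argmax(row)] for row in O]
        generated.append(words)
    return generated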
If your code is working, it should be able to generate correct answers for all of the sentences in task1:
importlib.reload(transformer)
generated = transformer.generate(embeddings,vocabulary,WK,WO,WQ,WV)
for ref,hyp in zip(sentences,generated):
print('Hypothesis:',' '.join(hyp))
print('\tReference:',' '.join(ref),' --- ',sum([x!=y for x,y in zip(ref,hyp)]),'errors')
Hypothesis: the noun in the happily swims cat is cat
    Reference: the noun in the happily swims cat is cat --- 0 errors
Hypothesis: the adverb in the string of happily cat swims is happily
    Reference: the adverb in the string of happily cat swims is happily --- 0 errors
Hypothesis: in the string happily swims cat the verb is swims
    Reference: in the string happily swims cat the verb is swims --- 0 errors
Hypothesis: the noun in happily cat swims is cat
    Reference: the noun in happily cat swims is cat --- 0 errors
Hypothesis: the noun in the string of swims happily cat is cat
    Reference: the noun in the string of swims happily cat is cat --- 0 errors
Hypothesis: in the string of words swims cat happily the adverb is happily
    Reference: in the string of words swims cat happily the adverb is happily --- 0 errors
Hypothesis: the verb in the string of words swims happily cat is swims
    Reference: the verb in the string of words swims happily cat is swims --- 0 errors
Hypothesis: the adverb in the string of swims cat happily is happily
    Reference: the adverb in the string of swims cat happily is happily --- 0 errors
Hypothesis: the noun in the string of words cat happily swims is cat
    Reference: the noun in the string of words cat happily swims is cat --- 0 errors
Hypothesis: in the string of swims happily cat the noun is cat
    Reference: in the string of swims happily cat the noun is cat --- 0 errors
Hypothesis: the noun in the string happily cat swims is cat
    Reference: the noun in the string happily cat swims is cat --- 0 errors
Hypothesis: the adverb in the string of happily swims cat is happily
    Reference: the adverb in the string of happily swims cat is happily --- 0 errors
Hypothesis: the adverb in the string swims cat happily is happily
    Reference: the adverb in the string swims cat happily is happily --- 0 errors
Hypothesis: in swims cat happily the verb is swims
    Reference: in swims cat happily the verb is swims --- 0 errors
Hypothesis: the verb in the string cat happily swims is swims
    Reference: the verb in the string cat happily swims is swims --- 0 errors
Hypothesis: the adverb in the string of cat swims happily is happily
    Reference: the adverb in the string of cat swims happily is happily --- 0 errors
Hypothesis: the verb in the string of happily cat swims is swims
    Reference: the verb in the string of happily cat swims is swims --- 0 errors
Hypothesis: the noun in the string swims cat happily is cat
    Reference: the noun in the string swims cat happily is cat --- 0 errors
Hypothesis: the adverb in the string of happily swims cat is happily
    Reference: the adverb in the string of happily swims cat is happily --- 0 errors
Hypothesis: the adverb in the string of cat happily swims is happily
    Reference: the adverb in the string of cat happily swims is happily --- 0 errors
Just to make sure that your code is actually calculating outputs, try running it with completely random weight matrices. You should get 2 errors in almost every sentence.
importlib.reload(transformer)
V = len(vocabulary)
generated = transformer.generate(
embeddings,
vocabulary,
np.random.uniform(0,1,(V,V)),
np.random.uniform(0,1,(V,V)),
np.random.uniform(0,1,(V,V)),
np.random.uniform(0,1,(V,V))
)
for ref,hyp in zip(sentences,generated):
print('Hypothesis:',' '.join(hyp))
print('\tReference:',' '.join(ref),' --- ',sum([x!=y for x,y in zip(ref,hyp)]),'errors')
Hypothesis: the noun in the happily swims cat swims swims
    Reference: the noun in the happily swims cat is cat --- 2 errors
Hypothesis: the adverb in the string of happily cat swims swims swims
    Reference: the adverb in the string of happily cat swims is happily --- 2 errors
Hypothesis: in the string happily swims cat the verb swims swims
    Reference: in the string happily swims cat the verb is swims --- 1 errors
Hypothesis: the noun in happily cat swims swims swims
    Reference: the noun in happily cat swims is cat --- 2 errors
Hypothesis: the noun in the string of swims happily cat swims swims
    Reference: the noun in the string of swims happily cat is cat --- 2 errors
Hypothesis: in the string of words swims cat happily the adverb swims swims
    Reference: in the string of words swims cat happily the adverb is happily --- 2 errors
Hypothesis: the verb in the string of words swims happily cat swims swims
    Reference: the verb in the string of words swims happily cat is swims --- 1 errors
Hypothesis: the adverb in the string of swims cat happily swims swims
    Reference: the adverb in the string of swims cat happily is happily --- 2 errors
Hypothesis: the noun in the string of words cat happily swims swims swims
    Reference: the noun in the string of words cat happily swims is cat --- 2 errors
Hypothesis: in the string of swims happily cat the noun swims swims
    Reference: in the string of swims happily cat the noun is cat --- 2 errors
Hypothesis: the noun in the string happily cat swims swims swims
    Reference: the noun in the string happily cat swims is cat --- 2 errors
Hypothesis: the adverb in the string of happily swims cat swims swims
    Reference: the adverb in the string of happily swims cat is happily --- 2 errors
Hypothesis: the adverb in the string swims cat happily swims swims
    Reference: the adverb in the string swims cat happily is happily --- 2 errors
Hypothesis: in swims cat happily the verb swims swims
    Reference: in swims cat happily the verb is swims --- 1 errors
Hypothesis: the verb in the string cat happily swims swims swims
    Reference: the verb in the string cat happily swims is swims --- 1 errors
Hypothesis: the adverb in the string of cat swims happily swims swims
    Reference: the adverb in the string of cat swims happily is happily --- 2 errors
Hypothesis: the verb in the string of happily cat swims swims swims
    Reference: the verb in the string of happily cat swims is swims --- 1 errors
Hypothesis: the noun in the string swims cat happily swims swims
    Reference: the noun in the string swims cat happily is cat --- 2 errors
Hypothesis: the adverb in the string of happily swims cat swims swims
    Reference: the adverb in the string of happily swims cat is happily --- 2 errors
Hypothesis: the adverb in the string of cat happily swims swims swims
    Reference: the adverb in the string of cat happily swims is happily --- 2 errors
Task 2: Train a Transformer¶
For task 2, you are not provided with a transformer. However, you do have 100 sentences of training data! The data are similar to those of task 1, but with a greater variety of nouns, verbs, and adverbs:
importlib.reload(reader)
vocabulary = reader.load_vocabulary('data/task2_vocabulary.txt')
sentences, embeddings = reader.load_data('data/task2_train_data.txt', vocabulary)
print('\n'.join([' '.join(s) for s in sentences]))
the noun in the string of happily cat programs is cat the adverb in the string happily dog swims is happily the verb in happily cat programs is programs in the string of words programs capybara strongly the adverb is strongly the noun in quickly programs dog is dog in the capybara paints strongly the verb is paints in the string strongly capybara programs the verb is programs in the strongly swims cat the noun is cat the noun in the string of words quickly capybara swims is capybara the verb in the swims quickly capybara is swims the verb in happily swims cat is swims the noun in the string of words strongly cat programs is cat the adverb in swims dog quickly is quickly the noun in the string of words programs happily dog is dog the adverb in the string cat swims strongly is strongly in capybara strongly programs the adverb is strongly in the string of quickly capybara swims the adverb is quickly the adverb in the dog paints happily is happily the noun in programs capybara quickly is capybara the verb in dog strongly programs is programs the noun in the swims quickly cat is cat the verb in the happily swims capybara is swims in the programs quickly capybara the adverb is quickly in the string happily programs capybara the adverb is happily the adverb in the string dog paints happily is happily in the string of dog happily programs the verb is programs the noun in the string of words cat happily swims is cat the adverb in the string of swims dog happily is happily the adverb in the string of words cat swims quickly is quickly in the string quickly paints cat the noun is cat in the string of words programs happily cat the adverb is happily the verb in the string of programs happily capybara is programs the noun in dog strongly programs is dog in the string quickly cat swims the verb is swims the verb in the string of strongly dog programs is programs the verb in the string paints quickly capybara is paints the adverb in the string of words programs cat strongly is strongly the noun in happily swims cat is cat in the string of words quickly cat programs the verb is programs the adverb in swims cat strongly is strongly in the string of words swims capybara quickly the verb is swims the verb in the string cat paints happily is paints in the string of quickly swims capybara the noun is capybara the verb in the string happily programs dog is programs in the string of capybara strongly paints the noun is capybara the adverb in the string of words swims cat happily is happily the adverb in the string of capybara happily swims is happily in the string quickly swims cat the verb is swims the adverb in the dog programs happily is happily the noun in the string of words swims capybara happily is capybara in the string of words cat programs happily the noun is cat the adverb in the string happily dog swims is happily in the string capybara strongly swims the noun is capybara the adverb in the programs capybara quickly is quickly the verb in the string strongly paints capybara is paints in the string of words swims cat happily the noun is cat the adverb in the string of strongly swims capybara is strongly the verb in the cat programs quickly is programs the verb in the string of swims quickly capybara is swims the adverb in the swims strongly capybara is strongly the verb in the string of words capybara programs happily is programs the verb in the string of swims strongly capybara is swims the adverb in the string of words dog strongly programs is strongly the verb in paints quickly capybara is paints 
the adverb in dog programs strongly is strongly in cat paints happily the adverb is happily the verb in the string cat paints happily is paints in the string of words paints dog quickly the verb is paints the noun in the string of words cat programs quickly is cat the noun in the string dog quickly swims is dog the noun in the string dog happily swims is dog the adverb in the paints happily dog is happily the verb in the programs happily dog is programs in the string of words dog programs strongly the verb is programs the noun in the string swims strongly dog is dog in the string of words paints capybara quickly the verb is paints the adverb in the string of programs capybara strongly is strongly in programs capybara quickly the adverb is quickly the noun in dog strongly programs is dog in the paints capybara happily the noun is capybara the adverb in the string of programs happily cat is happily the noun in the string of words dog programs quickly is dog the noun in the string of cat happily programs is cat in the string of words paints cat quickly the verb is paints in the string cat programs happily the adverb is happily the adverb in paints happily dog is happily the verb in the happily swims dog is swims the noun in the string happily swims cat is cat the noun in the string programs cat happily is cat in the quickly swims cat the noun is cat the verb in quickly dog paints is paints in the string of cat paints strongly the noun is cat in dog quickly paints the adverb is quickly in the string of words strongly capybara paints the verb is paints the noun in the strongly capybara programs is capybara the adverb in happily capybara swims is happily in the string of words cat paints quickly the verb is paints in the dog paints strongly the noun is dog the adverb in the string of paints strongly cat is strongly in the strongly programs cat the verb is programs
In order to perform training, you'll need to write three functions: cross_entropy_loss, gradient, and train.
Cross Entropy Loss¶
First, write a function called cross_entropy_loss that calculates the loss and the derivative of the loss. Given a transformer output $\mathbf{O}$ whose elements are $o_{i,j}\in[0,1]$ and a corresponding set of one-hot target vectors $\mathbf{Y}$ whose elements are $y_{i,j}\in\{0,1\}$, the cross-entropy loss is
$$\mathcal{L}=-\sum_i\sum_j y_{i,j}\ln\left(o_{i,j}+\epsilon\right)$$
where $\epsilon$ is the python constant sys.float_info.min, and is used here just to make sure we don't get floating-point underflow.
The derivatives of cross-entropy with respect to $o_{i,j}$ and $\mathbf{O}$ are $$ \frac{\partial\mathcal{L}}{\partial o_{i,j}}=-\frac{y_{i,j}}{o_{i,j}},~~~ \frac{\partial\mathcal{L}}{\partial\mathbf{O}}=-\frac{\mathbf{Y}}{\mathbf{O}}, $$ where the division is an element-wise or "array" division (https://numpy.org/doc/2.2/reference/generated/numpy.divide.html).
importlib.reload(transformer)
help(transformer.cross_entropy_loss)
Help on function cross_entropy_loss in module transformer:

cross_entropy_loss(O, Y)
    Calculate losses from network outputs O if target one-hot vectors are Y.
    @param:
    O - NQ-by-V array.  O[n,v]=probability that n'th output is v.
    Y - NQ-by-V array.  Y[n,v]=1 if n'th target is v, else Y[n,v]=0.
    @return:
    L - cross-entropy loss, summed over all rows
    dO - NQ-by-V array.  Derivatives of the loss with respect to the elements of O.
For example, suppose we look up $\mathbf{Y}$ for the first train embedding, randomly generate a (correctly-normalized) probability matrix of the same size, then find the resulting loss and loss gradient:
XK, XQ, Y = reader.define_task(embeddings[0])
print('Y is\n',Y)
O = np.random.uniform(0, 100, size=Y.shape)
O /= np.expand_dims(np.sum(O,axis=1), axis=1)
print('O is')
print('\n'.join([' '.join(['%.2g'%(o) for o in row]) for row in O]))
Y is
 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
O is
0.094 0.098 0.0091 0.069 0.032 0.015 0.081 0.074 0.06 0.074 0.018 0.11 0.043 0.04 0.043 0.027 0.051 0.059
0.039 0.099 0.098 0.046 0.11 0.033 0.02 0.005 0.12 0.0085 0.024 0.087 0.1 0.011 0.016 0.059 0.067 0.058
importlib.reload(transformer)
L, dO = transformer.cross_entropy_loss(O,Y)
print('Total loss is',L, 'which should equal', -np.log(O[0,12])-np.log(O[1,0]))
print('Its derivative dL/dO is')
print('\n'.join([' '.join(['%.2g'%(o) for o in row]) for row in dO]))
Total loss is 6.411922807596774 which should equal 6.411922807596774
Its derivative dL/dO is
-0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -23 -0 -0 -0 -0 -0
-26 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0
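A minimal sketch of cross_entropy_loss along these lines is shown below. The epsilon guard is placed inside both the logarithm and the division to avoid underflow and division by zero; the handout's derivative formula is -Y/O, which this matches up to the tiny epsilon.

import sys
import numpy as np

def cross_entropy_loss_sketch(O, Y):
    eps = sys.float_info.min
    # Sum of -log(probability assigned to the correct word), over all output rows.
    L = -np.sum(Y * np.log(O + eps))
    # Element-wise derivative of the loss: zero wherever Y is zero, -1/O where Y is one.
    dO = -Y / (O + eps)
    return L, dO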
Gradient¶
Now that we know the loss (and its derivative with respect to $O$), we need to find the derivative of the loss with respect to all of the model parameters: $\frac{\partial\mathcal{L}}{\partial\mathbf{W}_K}$, $\frac{\partial\mathcal{L}}{\partial\mathbf{W}_O}$, $\frac{\partial\mathcal{L}}{\partial\mathbf{W}_Q}$, and $\frac{\partial\mathcal{L}}{\partial\mathbf{W}_V}$.
In order to calculate those derivatives, you will want to use the chain rule. In order to do that, you might find it useful to think about the two types of layer that are used repeatedly in the transformer: matrix multiplication, and the softmax function.
Matrix Multiplication¶
Suppose you have three matrices, $\mathbf{A}$, $\mathbf{B}$, and $\mathbf{C}$, whose elements are $a_{i,j}$, $b_{j,k}$, and $c_{i,k}$, and suppose that
$$\mathbf{C}=\mathbf{A}\mathbf{B} ~~~\Leftrightarrow~~~ c_{i,k}=\sum_j a_{i,j}b_{j,k}$$
Suppose you know $\frac{\partial\mathcal{L}}{\partial\mathbf{C}}$, whose elements are $\frac{\partial\mathcal{L}}{\partial c_{i,k}}$. Then
$$\frac{\partial\mathcal{L}}{\partial a_{i,j}}=\sum_k\frac{\partial\mathcal{L}}{\partial c_{i,k}}b_{j,k}~~~\Leftrightarrow~~~ \frac{\partial\mathcal{L}}{\partial\mathbf{A}}=\frac{\partial\mathcal{L}}{\partial\mathbf{C}}\mathbf{B}^T$$
$$\frac{\partial\mathcal{L}}{\partial b_{j,k}}=\sum_i a_{i,j}\frac{\partial\mathcal{L}}{\partial c_{i,k}}~~~\Leftrightarrow~~~ \frac{\partial\mathcal{L}}{\partial\mathbf{B}}=\mathbf{A}^T\frac{\partial\mathcal{L}}{\partial\mathbf{C}}$$
Softmax¶
Suppose you have two matrices, $\mathbf{A}$ and $\mathbf{Z}$, whose elements are $a_{i,j}$ and $z_{i,j}$, and suppose that
$$\mathbf{A}=\text{softmax}\left(\mathbf{Z}\right) ~~~\Leftrightarrow~~~ a_{i,j}=\frac{e^{z_{i,j}}}{\sum_\ell e^{z_{i,\ell}}}$$
Suppose you know $\frac{\partial\mathcal{L}}{\partial\mathbf{A}}$, whose elements are $\frac{\partial\mathcal{L}}{\partial a_{i,j}}$. Then
$$\frac{\partial\mathcal{L}}{\partial z_{m,n}}=\sum_k\frac{\partial\mathcal{L}}{\partial a_{m,k}}\frac{\partial a_{m,k}}{\partial z_{m,n}}=a_{m,n}\left(\frac{\partial\mathcal{L}}{\partial a_{m,n}}-\sum_k a_{m,k}\frac{\partial\mathcal{L}}{\partial a_{m,k}}\right)$$
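Written as code, the two backward rules above are only a few lines each. The sketch below packages them as helper functions that a gradient implementation could call; the names matmul_backward and softmax_backward are made up for this illustration.

import numpy as np

def matmul_backward(dC, A, B):
    # Forward: C = A @ B.  Given dL/dC, return dL/dA and dL/dB.
    dA = dC @ B.T
    dB = A.T @ dC
    return dA, dB

def softmax_backward(dA, A):
    # Forward: A = softmax(Z), applied row by row.  Given dL/dA, return dL/dZ using
    # dL/dZ[m,n] = A[m,n] * (dL/dA[m,n] - sum_k A[m,k] * dL/dA[m,k]).
    row_dot = np.sum(A * dA, axis=1, keepdims=True)
    return A * (dA - row_dot)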
The idea of the gradient function is to apply the above formulas, one by one, backward through the transformer, until you are able to calculate the desired derivatives $\frac{\partial\mathcal{L}}{\partial\mathbf{W}_K}$, $\frac{\partial\mathcal{L}}{\partial\mathbf{W}_O}$, $\frac{\partial\mathcal{L}}{\partial\mathbf{W}_Q}$, and $\frac{\partial\mathcal{L}}{\partial\mathbf{W}_V}$:
importlib.reload(transformer)
help(transformer.gradient)
Help on function gradient in module transformer:

gradient(XK, XQ, Y, WK, WO, WQ, WV, A, C, K, O, Q, V)
    Compute gradient of cross-entropy loss with respect to WK, WO, WQ, and WV
    given the input data in K, Q, and V, and the target outputs in Y.
    @param:
    XK - one embedding per row, first n-2 words in the sentence
    XQ - one embedding per row, 3rd-from-last and 2nd-from-last words in the sentence
    Y - one embedding per row, last two words in the sentence
    O - 2-by-V array, O[i,j] is probability that i'th output word should be j
    C - 2-by-d array, context vectors from which O is computed
    V - (T-2)-by-d array, value vectors of which each row of C is a weighted average
    A - 2-by-(T-2) array, A[i,j] is attention the i'th query pays to the j'th key
    K - (T-2)-by-d array, key vectors computed from XK
    Q - 2-by-d array, query vectors computed from XQ
    @return:
    dWK - gradient of cross-entropy with respect to WK
    dWO - gradient of cross-entropy with respect to WO
    dWQ - gradient of cross-entropy with respect to WQ
    dWV - gradient of cross-entropy with respect to WV
Without writing the train function, the only way to test your gradient function is by calculating the desired values (e.g., by hand, or using some other code), and then comparing them to the values your code produces. In order to help you out, we've provided some examples of desired values in the file tests/test_visible.py, which you can access using python's unittest facility:
import unittest
unittest.TextTestRunner().run(unittest.defaultTestLoader.discover('tests'))
................
----------------------------------------------------------------------
Ran 16 tests in 0.016s

OK
<unittest.runner.TextTestResult run=16 errors=0 failures=0>
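Another useful sanity check is a finite-difference comparison: perturb one weight by a small amount, recompute the loss, and check that the change matches what your analytic gradient predicts. Here is a rough sketch (the helper name and the choice of which WQ element to perturb are arbitrary):

import numpy as np
import reader, transformer

def finite_difference_check(embedding, WK, WO, WQ, WV, delta=1e-6, i=3, j=7):
    XK, XQ, Y = reader.define_task(embedding)
    A, C, K, O, Q, V = transformer.forward(XK, XQ, WK, WO, WQ, WV)
    L0, _ = transformer.cross_entropy_loss(O, Y)
    dWK, dWO, dWQ, dWV = transformer.gradient(XK, XQ, Y, WK, WO, WQ, WV, A, C, K, O, Q, V)
    # Nudge one element of WQ and measure how much the loss actually changes.
    WQ2 = WQ.copy()
    WQ2[i, j] += delta
    A2, C2, K2, O2, Q2, V2 = transformer.forward(XK, XQ, WK, WO, WQ2, WV)
    L1, _ = transformer.cross_entropy_loss(O2, Y)
    print('numerical:', (L1 - L0) / delta, '  analytic:', dWQ[i, j])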
Train the transformer¶
The train function should use stochastic gradient descent to train the transformer.
importlib.reload(transformer)
help(transformer.train)
Help on function train in module transformer:

train(embeddings, WK, WO, WQ, WV, learningrate, num_iters)
    Train a transformer using stochastic gradient descent (SGD).
    Each iteration of SGD should choose one training sentence, uniformly at random,
    compute the loss and loss gradient for that one sentence, then adjust the
    parameters WK, WO, WQ and WV in the direction of the negative gradient
    scaled by the learningrate.
    @param:
    embeddings - embeddings[i][j,:] is one-hot vector of the j'th word in the i'th training sentence
    WK - the matrix that multiplies each embedding to produce a key
    WO - the matrix that multiplies the context vector to produce an output logit vector
    WQ - the matrix that multiplies each embedding to produce a query
    WV - the matrix that multiplies each embedding to produce a value
    learningrate - scalar learning rate
    num_iters - number of iterations of SGD to perform
    @return:
    losses - losses[t]=cross-entropy loss of t'th iteration
    WK - what WK has become after num_iters of training
    WO - what WO has become after num_iters of training
    WQ - what WQ has become after num_iters of training
    WV - what WV has become after num_iters of training
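One possible structure for train, consistent with the docstring above, is sketched here. It is only a sketch: it assumes your forward, cross_entropy_loss, and gradient functions are already working, and it does not do anything clever about learning-rate schedules or parameter copies.

import numpy as np
import reader, transformer

def train_sketch(embeddings, WK, WO, WQ, WV, learningrate, num_iters):
    losses = []
    for t in range(num_iters):
        # Pick one training sentence uniformly at random (this is what makes it stochastic).
        n = np.random.randint(len(embeddings))
        XK, XQ, Y = reader.define_task(embeddings[n])
        A, C, K, O, Q, V = transformer.forward(XK, XQ, WK, WO, WQ, WV)
        L, dO = transformer.cross_entropy_loss(O, Y)
        dWK, dWO, dWQ, dWV = transformer.gradient(XK, XQ, Y, WK, WO, WQ, WV, A, C, K, O, Q, V)
        # Step each parameter matrix in the direction of the negative gradient.
        WK = WK - learningrate * dWK
        WO = WO - learningrate * dWO
        WQ = WQ - learningrate * dWQ
        WV = WV - learningrate * dWV
        losses.append(L)
    return losses, WK, WO, WQ, WV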
The parameters can be initialized randomly, e.g., using np.random.normal with a small standard deviation.
V = len(vocabulary)
WK0 = np.random.normal(loc=0,scale=0.001,size=(V,V))
WO0 = np.random.normal(loc=0,scale=0.001,size=(V,V))
WQ0 = np.random.normal(loc=0,scale=0.001,size=(V,V))
WV0 = np.random.normal(loc=0,scale=0.001,size=(V,V))
importlib.reload(transformer)
losses, WK, WO, WQ, WV = transformer.train(embeddings,WK0,WO0,WQ0,WV0,0.01,50000)
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(14,4))
ax = fig.subplots(1)
ax.plot(losses)
ax.set_title('Training losses, transformer MP task 2')
Text(0.5, 1.0, 'Training losses, transformer MP task 2')
Training a transformer is kind of hard. For this task, there tend to be three quite distinct training periods:
- In the first training period, the transformer is assigning roughly equal probability to all words in the vocabulary, so the average loss is around $-2\log(1/V)\approx 6$.
- In the second training period, the transformer has learned to produce the word is at the end of each sentence, but has not yet learned to produce the correct noun, verb, or adverb. During this period the average loss is around $-\log(1/V)\approx 3$.
- In the third training period, the transformer learns how to usually produce the correct noun, verb, or adverb. At this point the average training loss drops to 0 on good iterations, with bad iterations that have much larger loss. Over time, the frequency of bad iterations continues to drop.
If you don't see a training curve with all three periods, try adjusting your learning rate or num_iters until you get a curve with this behavior.
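Because individual SGD iterations are noisy, the three periods are much easier to see if you smooth the loss curve before plotting it, for example with a moving average (the window size here is arbitrary):

import numpy as np
import matplotlib.pyplot as plt

# Smooth the per-iteration losses with a simple moving average before plotting.
window = 500
smoothed = np.convolve(losses, np.ones(window) / window, mode='valid')
fig = plt.figure(figsize=(14,4))
ax = fig.subplots(1)
ax.plot(smoothed)
ax.set_title('Smoothed training losses (moving average, window=%d)' % window)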
dev, devX = reader.load_data('data/task2_dev_data.txt', vocabulary)
generated = transformer.generate(devX,vocabulary,WK,WO,WQ,WV)
for ref,hyp in zip(dev,generated):
print('Hypothesis:',' '.join(hyp))
print('\tReference:',' '.join(ref),' --- ',sum([x!=y for x,y in zip(ref,hyp)]),'errors')
Hypothesis: the noun in the happily swims cat is cat
    Reference: the noun in the happily swims cat is cat --- 0 errors
Hypothesis: the verb in the string paints cat strongly is paints
    Reference: the verb in the string paints cat strongly is paints --- 0 errors
Hypothesis: the verb in the string quickly programs capybara is swims
    Reference: the verb in the string quickly programs capybara is programs --- 1 errors
Hypothesis: the adverb in the string of words capybara programs strongly is strongly
    Reference: the adverb in the string of words capybara programs strongly is strongly --- 0 errors
Hypothesis: the adverb in the string swims quickly capybara is quickly
    Reference: the adverb in the string swims quickly capybara is quickly --- 0 errors
Hypothesis: the adverb in the swims happily capybara is happily
    Reference: the adverb in the swims happily capybara is happily --- 0 errors
Hypothesis: the noun in the string capybara strongly programs is capybara
    Reference: the noun in the string capybara strongly programs is capybara --- 0 errors
Hypothesis: in the string of paints happily capybara the noun is capybara
    Reference: in the string of paints happily capybara the noun is capybara --- 0 errors
Hypothesis: in the string strongly programs capybara the verb is programs
    Reference: in the string strongly programs capybara the verb is programs --- 0 errors
Hypothesis: the verb in the capybara quickly programs is swims
    Reference: the verb in the capybara quickly programs is programs --- 1 errors
Hypothesis: in the dog happily swims the noun is dog
    Reference: in the dog happily swims the noun is dog --- 0 errors
Hypothesis: in the string cat happily swims the noun is cat
    Reference: in the string cat happily swims the noun is cat --- 0 errors
Hypothesis: the noun in the swims dog quickly is dog
    Reference: the noun in the swims dog quickly is dog --- 0 errors
Hypothesis: in the string happily cat swims the noun is cat
    Reference: in the string happily cat swims the noun is cat --- 0 errors
Hypothesis: the adverb in paints cat strongly is strongly
    Reference: the adverb in paints cat strongly is strongly --- 0 errors
Hypothesis: the noun in the string happily paints capybara is capybara
    Reference: the noun in the string happily paints capybara is capybara --- 0 errors
Hypothesis: in the programs dog quickly the noun is dog
    Reference: in the programs dog quickly the noun is dog --- 0 errors
Hypothesis: in the string paints happily cat the noun is cat
    Reference: in the string paints happily cat the noun is cat --- 0 errors
Hypothesis: in the string of words capybara strongly swims the noun is capybara
    Reference: in the string of words capybara strongly swims the noun is capybara --- 0 errors
Hypothesis: in the string of quickly swims dog the noun is dog
    Reference: in the string of quickly swims dog the noun is dog --- 0 errors
Grading your work¶
To grade your code, upload transformer.py to gradescope. The autograder will run three types of tests:
- test_visible.py is exactly the same as the code available to you in the template
- test_hidden.py runs the same tests, but with different parameter values
- test_training.py runs transformer.train for 50,000 iterations on a task that is similar to task 2, but with different vocabulary