# One import per line, standard library first.
import importlib
import reader
# Re-execute the reader module so recent edits are picked up, then show the
# documented contract of loadTrain.
importlib.reload(reader)
help(reader.loadTrain)

Help on function loadTrain in module reader:

loadTrain(dirname, stemming, lower_case, use_tqdm=True)
    Loads a training dataset.
    
    Parameters:
    dirname (str): the directory containing the data
        - dirname/y should contain training examples from class y
    
    stemming (bool): if True, use NLTK's stemmer to remove suffixes
    lower_case (bool): if True, convert letters to lowercase
    use_tqdm (bool, default:True): if True, use tqdm to show status bar
    
    Output:
    train (dict of list of lists): 
        - train[y][i][k] = k'th token of i'th text of class y


importlib.reload(reader)

# Load the training texts with stemming disabled and tokens lowercased.
train = reader.loadTrain('data/train', stemming=False, lower_case=True)

100%|█████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 8261.67it/s]
100%|█████████████████████████████████████████████████████████| 6000/6000 [00:00<00:00, 8222.23it/s]


# train maps each class label to the list of tokenized texts of that class.
for y, class_texts in train.items():
    print("There were",len(class_texts),"texts loaded for class",y)

There were 2000 texts loaded for class neg
There were 6000 texts loaded for class pos


# train['pos'][0] is the token list of the first text loaded for class 'pos'.
print("The first positive review is:",train['pos'][0])

The first positive review is: ['i', 'went', 'and', 'saw', 'this', 'movie', 'last', 'night', 'after', 'being', 'coaxed', 'to', 'by', 'a', 'few', 'friends', 'of', 'mine', 'i', 'll', 'admit', 'that', 'i', 'was', 'reluctant', 'to', 'see', 'it', 'because', 'from', 'what', 'i', 'knew', 'of', 'ashton', 'kutcher', 'he', 'was', 'only', 'able', 'to', 'do', 'comedy', 'i', 'was', 'wrong', 'kutcher', 'played', 'the', 'character', 'of', 'jake', 'fischer', 'very', 'well', 'and', 'kevin', 'costner', 'played', 'ben', 'randall', 'with', 'such', 'professionalism', 'the', 'sign', 'of', 'a', 'good', 'movie', 'is', 'that', 'it', 'can', 'toy', 'with', 'our', 'emotions', 'this', 'one', 'did', 'exactly', 'that', 'the', 'entire', 'theater', 'which', 'was', 'sold', 'out', 'was', 'overcome', 'by', 'laughter', 'during', 'the', 'first', 'half', 'of', 'the', 'movie', 'and', 'were', 'moved', 'to', 'tears', 'during', 'the', 'second', 'half', 'while', 'exiting', 'the', 'theater', 'i', 'not', 'only', 'saw', 'many', 'women', 'in', 'tears', 'but', 'many', 'full', 'grown', 'men', 'as', 'well', 'trying', 'desperately', 'not', 'to', 'let', 'anyone', 'see', 'them', 'crying', 'this', 'movie', 'was', 'great', 'and', 'i', 'suggest', 'that', 'you', 'go', 'see', 'it', 'before', 'you', 'judge']


# One import per line, standard library first.
import importlib
import submitted
# Re-execute the submitted module so recent edits are picked up, then show the
# documented contract of create_frequency_table.
importlib.reload(submitted)
help(submitted.create_frequency_table)

Help on function create_frequency_table in module submitted:

create_frequency_table(train)
    Parameters:
    train (dict of list of lists) 
        - train[y][i][k] = k'th token of i'th text of class y
    
    Output:
    frequency (dict of Counters) 
        - frequency[y][x] = number of tokens of word x in texts of class y


importlib.reload(submitted)
# frequency[y][x] = number of tokens of word x in the texts of class y.
frequency = submitted.create_frequency_table(train)

# Spot-check a single word in both classes.
print("frequency['pos']['excellent']=", frequency['pos']['excellent'])
print("frequency['neg']['excellent']=", frequency['neg']['excellent'])
print("\n")

# Token totals: every occurrence of every word counts.
print("Total # tokens in pos texts is", sum(frequency['pos'].values()))
print("Total # tokens in neg texts is", sum(frequency['neg'].values()))
print("\n")

# Type totals: each distinct word counts once (len of the mapping itself).
print("Total # types in pos texts is", len(frequency['pos']))
print("Total # types in neg texts is", len(frequency['neg']))

frequency['pos']['excellent']= 810
frequency['neg']['excellent']= 61


Total # tokens in pos texts is 1427513
Total # tokens in neg texts is 470194


Total # types in pos texts is 40829
Total # types in neg texts is 23901


importlib.reload(submitted)
# Display the stopword collection defined in the submitted module, sorted
# alphabetically so the listing is stable from run to run.
print(sorted(submitted.stopwords))

["'d", "'ll", "'m", "'re", "'s", "'t", "'ve", 'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'cannot', 'could', 'couldn', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', 'has', 'hasn', 'have', 'haven', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', 'it', 'its', 'itself', 'let', 'll', 'me', 'more', 'most', 'mustn', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'shan', 'she', 'should', 'shouldn', 'so', 'some', 'such', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn', 'we', 'were', 'weren', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'with', 'won', 'would', 'wouldn', 'you', 'your', 'yours', 'yourself', 'yourselves']


# Reload the submitted module, then show the documented contract of
# remove_stopwords.
importlib.reload(submitted)
help(submitted.remove_stopwords)

Help on function remove_stopwords in module submitted:

remove_stopwords(frequency)
    Parameters:
    frequency (dict of Counters) 
        - frequency[y][x] = number of tokens of word x in texts of class y
    
    Output:
    nonstop (dict of Counters) 
        - nonstop[y][x] = frequency of word x in texts of class y,
          but only if x is not a stopword.


importlib.reload(submitted)
# nonstop has the same layout as frequency, minus the stopword entries.
nonstop = submitted.remove_stopwords(frequency)

# A non-stopword keeps its count after filtering.
print("frequency['pos']['excellent']=", frequency['pos']['excellent'])
print("nonstop['pos']['excellent']=", nonstop['pos']['excellent'])
print("\n")

# 'you' is in the stopword list, so its count is expected to drop to zero.
print("frequency['pos']['you']=", frequency['pos']['you'])
print("nonstop['pos']['you']=", nonstop['pos']['you'])
print("\n")

# Token totals before and after stopword removal.
print("Total pos frequency:", sum(frequency['pos'].values()))
print("Total pos non-stopwords", sum(nonstop['pos'].values()))
print("\n")

# Distinct word types before and after stopword removal.
print("Total # types in pos texts is", len(frequency['pos']))
print("Total # non-stopwords in pos is", len(nonstop['pos']))

print("Length of the stopwords set is:", len(submitted.stopwords))

frequency['pos']['excellent']= 810
nonstop['pos']['excellent']= 810


frequency['pos']['you']= 7917
nonstop['pos']['you']= 0


Total pos frequency: 1427513
Total pos non-stopwords 769662


Total # types in pos texts is 40829
Total # non-stopwords in pos is 40687
Length of the stopwords set is: 150


# Reload the submitted module, then show the documented contract of
# laplace_smoothing.
importlib.reload(submitted)
help(submitted.laplace_smoothing)

Help on function laplace_smoothing in module submitted:

laplace_smoothing(nonstop, smoothness)
    Parameters:
    nonstop (dict of Counters) 
        - nonstop[y][x] = frequency of x in y, if x not a stopword
    smoothness (float)
        - smoothness = Laplace smoothing hyperparameter
    
    Output:
    likelihood (dict of dicts) 
        - likelihood[y][x] = Laplace-smoothed likelihood of x given y
        - likelihood[y]['OOV'] = likelihood of an out-of-vocabulary word given y
    
    Be careful that your vocabulary only counts words that occurred at least once
    in the training data for class y.


importlib.reload(submitted)
# NOTE(review): laplace_smoothing documents its first argument as `nonstop`
# (counts with stopwords removed), but the unfiltered `frequency` table is
# passed here, so stopwords take part in the smoothed likelihoods.
# Confirm this is intentional before relying on the numbers below.
# The second argument is the Laplace smoothing hyperparameter (smoothness = 1).
likelihood = submitted.laplace_smoothing(frequency, 1)

# Smoothed likelihood of one in-vocabulary word in each class.
print("likelihood['pos']['excellent']=",likelihood['pos']['excellent'])
print("likelihood['neg']['excellent']=",likelihood['neg']['excellent'])
print("\n")

# 'OOV' holds the likelihood assigned to an out-of-vocabulary word.
print("likelihood['pos']['OOV']=",likelihood['pos']['OOV'])
print("likelihood['neg']['OOV']=",likelihood['neg']['OOV'])
print("\n")

# Sanity check: each class's likelihoods (including 'OOV') should sum to ~1.
print("likelihood['pos'] sums to",sum(likelihood['pos'].values()))
print("Likelihood['neg'] sums to",sum(likelihood['neg'].values()))

likelihood['pos']['excellent']= 0.0005523232650681755
likelihood['neg']['excellent']= 0.00012548168776917846


likelihood['pos']['OOV']= 6.810397843010795e-07
likelihood['neg']['OOV']= 2.023898189825459e-06


likelihood['pos'] sums to 0.9999999999996005
Likelihood['neg'] sums to 0.9999999999996396


# Reload the submitted module, then show the documented contract of
# naive_bayes.
importlib.reload(submitted)
help(submitted.naive_bayes)

Help on function naive_bayes in module submitted:

naive_bayes(texts, likelihood, prior)
    Parameters:
    texts (list of lists) -
        - texts[i][k] = k'th token of i'th text
    likelihood (dict of dicts) 
        - likelihood[y][x] = Laplace-smoothed likelihood of x given y
    prior (float)
        - prior = the prior probability of the class called "pos"
    
    Output:
    hypotheses (list)
        - hypotheses[i] = class label for the i'th text


importlib.reload(reader)
# Load the dev set: tokenized texts plus one reference label per text.
texts, labels = reader.loadDev('data/dev', False, True, True)

# Report how many dev examples carry each class label.
for y in ('neg', 'pos'):
    print("There are", labels.count(y), 'examples of class', y)

100%|█████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 8077.74it/s]
100%|█████████████████████████████████████████████████████████| 4000/4000 [00:00<00:00, 8203.04it/s]

There are 1000 examples of class neg
There are 4000 examples of class pos


importlib.reload(submitted)
# Classify every dev text; prior is the prior probability of class "pos".
hypotheses = submitted.naive_bayes(texts, likelihood, prior=0.5)

# Report how many dev texts were assigned to each class.
for y in ('neg', 'pos'):
    print("There are", hypotheses.count(y), 'examples that were labeled with class', y)

There are 2016 examples that were labeled with class neg
There are 2984 examples that were labeled with class pos


print("The accuracy of the classifier on the dev set is:")

# Count the dev texts whose predicted label equals the reference label.
count_correct = sum(1 for y, yhat in zip(labels, hypotheses) if y == yhat)

# Accuracy = fraction of dev texts classified correctly.
print(count_correct / len(labels))

The accuracy of the classifier on the dev set is:
0.7744


# Reload the submitted module, then show the documented contract of
# optimize_hyperparameters.
importlib.reload(submitted)
help(submitted.optimize_hyperparameters)

Help on function optimize_hyperparameters in module submitted:

optimize_hyperparameters(texts, labels, nonstop, priors, smoothnesses)
    Parameters:
    texts (list of lists) - dev set texts
        - texts[i][k] = k'th token of i'th text
    labels (list) - dev set labels
        - labels[i] = class label of i'th text
    nonstop (dict of Counters) 
        - nonstop[y][x] = frequency of word x in class y, x not stopword
    priors (list)
        - a list of different possible values of the prior
    smoothnesses (list)
        - a list of different possible values of the smoothness
    
    Output:
    accuracies (numpy array, shape = len(priors) x len(smoothnesses))
        - accuracies[m,n] = dev set accuracy achieved using the
          m'th candidate prior and the n'th candidate smoothness


importlib.reload(submitted)
import numpy as np

# Candidate grid: accuracies[m, n] pairs priors[m] with smoothnesses[n].
priors = [0.65, 0.75, 0.85]
smoothnesses = [0.001, 0.01, 0.1]
accuracies = submitted.optimize_hyperparameters(
    texts, labels, nonstop, priors, smoothnesses
)

# argmax indexes the flattened grid; convert it back to (prior, smoothness)
# indices before looking up the winning hyperparameters.
m, n = np.unravel_index(accuracies.argmax(), accuracies.shape)
print("The best accuracy achieved was", accuracies[m, n])
print("It was achieved for a prior of", priors[m])
print("  and a smoothness of", smoothnesses[n])

The best accuracy achieved was 0.8732
It was achieved for a prior of 0.85
  and a smoothness of 0.01


import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots()
plt.xscale('log')
contours = ax.contour(smoothnesses, priors, accuracies)
ax.clabel(contours, inline=True, fontsize=10)
ax.set_title('Devset accuracy versus smoothness and prior')

Text(0.5, 1.0, 'Devset accuracy versus smoothness and prior')


# Run grade.py with the Python interpreter through IPython's `!` shell escape.
!python grade.py

..........
----------------------------------------------------------------------
Ran 10 tests in 51.523s

OK

CS440/ECE448 Spring 2023

MP02: Naive Bayes

Table of Contents

Reading the data

Learning a Naive Bayes Model: Maximum Likelihood

Learning a Naive Bayes model: Stop words

Learning a Naive Bayes model: Laplace Smoothing

Decisions using a Naive Bayes model

Implementation Details

Implementation

Optimizing Hyperparameters

Grade your homework