The first thing you need to do is download mp02.zip. Its content is organized similarly to MP01.
This file (mp02_notebook.ipynb) will walk you through the whole MP, giving you instructions and debugging tips as you go.
Table of Contents¶
Reading the data¶
The dataset in your template package consists of paired human and LLM texts, where each LLM text was generated by prompting an LLM to continue a human text in the same style, across academic, news, fiction, podcast, blog, and TV/movie contexts. It is a subset of the HAP-E dataset, which was originally introduced in this paper.
In order to help you load the data, we provide you with a utility module called reader.py. This module contains the loadDataset function, which loads texts from the provided dataset for the subsets you specify.
# First, install requirements
# You can reuse your environment from MP01
!python -m pip install pandas -q
!python -m pip install fastparquet -q
!python -m pip install tqdm -q
!python -m pip install nltk -q
import reader, importlib
importlib.reload(reader)
help(reader.loadDataset)
Help on function loadDataset in module reader:
loadDataset(filename, subsets, stemming=False)
Loads a file and returns a list of texts.
Parameters:
filename (str): the .parquet file containing the data
subsets (list of str): List of subsets
(in 'acad', 'blog', 'news', 'fic', 'spok', 'tvm') to load
stemming (bool, optional): if True, use NLTK's stemmer to remove suffixes
Output:
texts (list of lists): texts[m][n] is the n'th word in the m'th text in the dataset
count (int): number of files loaded
For the initial part of this MP, we will work on distinguishing among academic, fiction, and news texts.
importlib.reload(reader)
train = {
'acad': reader.loadDataset('data/train/human.parquet', ['acad']),
'fic': reader.loadDataset('data/train/human.parquet', ['fic']),
'news': reader.loadDataset('data/train/human.parquet', ['news']),
}
for y in train.keys():
print("There were",len(train[y]),"texts loaded for class",y)
There were 1104 texts loaded for class acad
There were 1255 texts loaded for class fic
There were 1189 texts loaded for class news
print("The first fiction text is:",train['fic'][0])
The first fiction text is: ['he', 'was', 'a', 'wiry', 'little', 'chap', 'with', 'bright', 'eyes', 'for', 'ever', 'on', 'the', 'twinkle', 'and', 'black', 'hair', 'pasted', 'down', 'upon', 'his', 'head', 'so', 'as', 'not', 'to', 'show', 'the', 'slightest', 'vestige', 'of', 'curl', 'while', 'the', 'sharp', 'mischievous', 'look', 'on', 'his', 'face', 'and', 'the', 'quick', 'comical', 'movements', 'of', 'his', 'body', 'suggested', 'something', 'between', 'a', 'terrier', 'and', 'a', 'monkey', 'there', 'was', 'never', 'very', 'much', 'going', 'on', 'in', 'the', 'way', 'of', 'regular', 'sports', 'or', 'pastimes', 'at', 'the', 'birches', 'the', 'smallness', 'of', 'numbers', 'made', 'it', 'difficult', 'to', 'attempt', 'proper', 'games', 'of', 'cricket', 'or', 'football', 'and', 'the', 'boys', 'were', 'forced', 'to', 'content', 'themselves', 'with', 'such', 'substitutes', 'as', 'prisoner', 's', 'base', 'cross', 'tag', 'etc', 'or', 'in', 'carrying', 'out', 'the', 'projects', 'of', 'fred', 'acton', 'who', 'was', 'constantly', 'making', 'suggestions', 'for', 'the', 'employment', 'of', 'their', 'time', 'and', 'compelling', 'everybody', 'to', 'conform', 'to', 'his', 'wishes', 'mr', 'welsby', 'had', 'been', 'a', 'widower', 'for', 'many', 'years', 'he', 'was', 'a', 'grave', 'scholarly', 'man', 'who', 'spent', 'most', 'of', 'his', 'spare', 'time', 'in', 'his', 'own', 'library', 'mr', 'blake', 'was', 'supposed', 'to', 'take', 'charge', 'out', 'of', 'school', 'hours', 'he', 'was', 'as', 'every', 'one', 'said', 'a', 'jolly', 'fellow', 'and', 'the', 'fact', 'that', 'his', 'popularity', 'extended', 'far', 'and', 'wide', 'among', 'a', 'large', 'circle', 'of', 'friends', 'and', 'acquaintances', 'caused', 'him', 'to', 'have', 'a', 'good', 'many', 'irons', 'in', 'the', 'fire', 'of', 'one', 'sort', 'and', 'another', 'during', 'their', 'hours', 'of', 'leisure', 'therefore', 'the', 'birchites', 'were', 'left', 'pretty', 'much', 'to', 'their', 'own', 'devices', 'or', 'more', 'often', 'to', 'those', 'of', 'master', 'fred', 'acton', 'who', 'liked', 'as', 'has', 'already', 'been', 'stated', 'to', 'assume', 'the', 'office', 'of', 'bellwether', 'to', 'the', 'little', 'flock', 'at', 'the', 'time', 'when', 'our', 'story', 'commences', 'the', 'ground', 'was', 'covered', 'with', 'snow', 'but', 'acton', 'was', 'equal', 'to', 'the', 'occasion', 'and', 'as', 'soon', 'as', 'dinner', 'was', 'over', 'ordered', 'all', 'hands', 'to', 'come', 'outside', 'and', 'make', 'a', 'slide', 'the', 'garden', 'was', 'on', 'a', 'steep', 'slope', 'along', 'the', 'bottom', 'of', 'which', 'ran', 'the', 'brick', 'wall', 'bounding', 'one', 'side', 'of', 'the', 'playground', 'a', 'straight', 'steep', 'path', 'lay', 'between', 'this', 'and', 'the', 'house', 'and', 'the', 'youthful', 'dux', 'with', 'his', 'usual', 'disregard', 'of', 'life', 'and', 'limb', 'insisted', 'on', 'choosing', 'this', 'as', 'the', 'scene', 'of', 'operations', 'what', 'he', 'cried', 'in', 'answer', 'to', 'a', 'feeble', 'protest', 'on', 'the', 'part', 'of', 'mugford', 'make', 'it', 'on', 'level', 'ground', 'of', 'course', 'not', 'when', 'we', 've', 'got', 'this', 'jolly', 'hill', 'to', 'go', 'down', 'not', 'if', 'i', 'know', 'it', 'we', 'll', 'open', 'the', 'door', 'at', 'the', 'bottom', 'and', 'go', 'right', 'on', 'into', 'the', 'playground', 'but', 'how', 'if', 'any', 'one', 'goes', 'a', 'bit', 'crooked', 'and', 'runs', 'up', 'against', 'the', 'bricks', 'well', 'they', 'll', 'get', 'pretty', 'well', 'smashed', 'or', 'he', 'will', 'you', 'must', 'go', 'straight', 'that', 's', 'half', 
'the', 'fun', 'of', 'the', 'thing', 'it', 'll', 'make', 'it', 'all', 'the', 'more', 'exciting', 'come', 'on', 'and', 'begin', 'to', 'tread', 'down', 'the', 'snow', 'without', 'daring', 'to', 'show', 'any', 'outward', 'signs', 'of', 'reluctance', 'but', 'with', 'feelings', 'very', 'much', 'akin', 'to', 'those', 'of', 'men', 'digging', 'their', 'own', 'graves', 'before', 'being', 'shot', 'the', 'company', 'set', 'about', 'putting', 'this', 'fearful', 'project', 'into', 'execution']
Learning a Naive Bayes Model: Maximum Likelihood¶
Naive Bayes is a classification strategy based on estimating the conditional probability of a class given the evidence. For text classification, it is often more useful to use bigrams (consecutive word pairs) as evidence instead of single words. In our implementation, we will consider both individual bigram tokens and the prevalence of bigram types:
- Bigram token: A bigram token consists of two consecutive word tokens from the text. For grading purposes, all bigrams are represented as word1*-*-*-*word2, i.e., with *-*-*-* as a separator, not as a tuple. In the context of bigrams, the tokens are these pairs of words. To emphasize, note that our definition of bigram is two "consecutive" word tokens. Consecutive!
- Bigram type: Bigram types are the set of unique pairs of consecutive words that occurred in a text. The number of bigram types in the $n^{\text{th}}$ text can be found by first generating all bigram tokens from the text and then counting the unique bigrams (see the sketch after this list).
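To make the bigram format concrete, here is a minimal sketch (not the graded implementation; make_bigrams is a name chosen just for illustration), using the first few words of the fiction text printed above:
def make_bigrams(words):
    # pair each word with its successor, joined by the required separator
    return [f"{w1}*-*-*-*{w2}" for w1, w2 in zip(words, words[1:])]

words = ['he', 'was', 'a', 'wiry', 'little', 'chap']
tokens = make_bigrams(words)    # ['he*-*-*-*was', 'was*-*-*-*a', ...]
types = set(tokens)             # the unique bigrams are the bigram types
print(len(tokens), len(types))  # 5 5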
A Naive Bayes model consists of two types of probability distributions:
- The prior is the distribution over classes, $P(\text{Class})$.
- The likelihood is the probability of a bigram token given a particular class, $P(\text{Bigram}|\text{Class})$.
The prior can be estimated from the training data. In the training data we've loaded so far, $P(\text{Class}=\text{acad})=1104/(1104+1255+1189)\approx 0.311$.
Often, though, the testing data will have a different class distribution than the training data. If you don't know the testing priors, then it's sometimes best to just assume a uniform distribution, i.e., in this case with three classes $P(\text{Class}=\text{acad})=P(\text{Class}=\text{fic})=P(\text{Class}=\text{news})\approx 0.333$.
The likelihood is the informative part of a Naive Bayes model: it tells you which bigrams are used more often in one class than another.
There are many ways in which you can estimate the likelihood. The following formula is called the maximum likelihood estimate, because it maximizes the likelihood of the words in your training dataset:
$$P(\text{Bigram}=x|\text{Class}=y)=\frac{\text{\# tokens of bigram}~x~\text{in texts of class}~y}{\text{\# tokens of any bigram in texts of class}~y}$$
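For example, using counts you will compute below: the bigram 'of the' occurs 3944 times among the 530,053 bigram tokens of the academic training texts, so its maximum likelihood estimate is
$$P(\text{Bigram}=\text{of*-*-*-*the}|\text{Class}=\text{acad})=\frac{3944}{530053}\approx 0.0074$$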
In this part of the MP, you will first estimate what are called frequency tables. The frequency of $x$ given $y$ is the number of times that bigram $x$ occurred in texts of class $y$. The relevant method in submitted.py is the one called create_frequency_table:
import submitted, importlib
importlib.reload(submitted)
help(submitted.create_frequency_table)
Help on function create_frequency_table in module submitted:
create_frequency_table(train)
Parameters:
train (dict of list of lists)
- train[y][i][k] = k'th token of i'th text of class y
Output:
frequency (dict of Counters):
- frequency[y][x] = number of occurrences of bigram x in texts of class y,
where x is in the format 'word1*-*-*-*word2'
Edit create_frequency_table so that it does what its docstring says it should do.
Hint: your code will be shorter if you use the Python data structure called a Counter.
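For instance, here is an illustration of why Counter is convenient (toy data, not the graded code): missing keys behave as zero, so you can increment without initializing.
from collections import Counter

counts = Counter()
for bigram in ['a*-*-*-*b', 'a*-*-*-*b', 'b*-*-*-*c']:
    counts[bigram] += 1        # missing keys start at 0, no initialization needed
print(counts['a*-*-*-*b'])     # 2
print(counts['x*-*-*-*y'])     # 0 (no KeyError for unseen bigrams)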
When your code works, you should get the following results:
importlib.reload(submitted)
frequency = submitted.create_frequency_table(train)
print("Frequency of 'machine learning' in academic text: ",frequency['acad']['machine*-*-*-*learning'])
print("Frequency of 'machine learning' in fiction text: ",frequency['fic']['machine*-*-*-*learning'])
print("Frequency of 'machine learning' in news text: ",frequency['news']['machine*-*-*-*learning'])
print("\n")
print("Frequency of 'years ago' in academic text: ",frequency['acad']['years*-*-*-*ago'])
print("Frequency of 'years ago' in fiction text: ",frequency['fic']['years*-*-*-*ago'])
print("Frequency of 'years ago' in news text: ",frequency['news']['years*-*-*-*ago'])
print("\n")
print("Frequency of 'of the' in academic text: ",frequency['acad']['of*-*-*-*the'])
print("Frequency of 'of the' in fiction text: ",frequency['fic']['of*-*-*-*the'])
print("Frequency of 'of the' in news text: ",frequency['news']['of*-*-*-*the'])
print("\n")
print("Frequency of 'to be' in academic text: ",frequency['acad']['to*-*-*-*be'])
print("Frequency of 'to be' in fiction text: ",frequency['fic']['to*-*-*-*be'])
print("Frequency of 'of the' in news text: ",frequency['news']['to*-*-*-*be'])
print("\n")
print("Frequency of 'and the' in academic text: ",frequency['acad']['and*-*-*-*the'])
print("Frequency of 'and the' in fiction text: ",frequency['fic']['and*-*-*-*the'])
print("Frequency of 'of the' in news text: ",frequency['news']['to*-*-*-*be'])
print("\n")
print("--------------------------------------\n")
print("Total # tokens in acad texts is",sum(frequency['acad'].values()))
print("Total # tokens in fic texts is",sum(frequency['fic'].values()))
print("\n")
print("Total # types in acad texts is",len(frequency['acad'].keys()))
print("Total # types in fic texts is",len(frequency['fic'].keys()))
Frequency of 'machine learning' in academic text:  29
Frequency of 'machine learning' in fiction text:  0
Frequency of 'machine learning' in news text:  0

Frequency of 'years ago' in academic text:  7
Frequency of 'years ago' in fiction text:  50
Frequency of 'years ago' in news text:  101

Frequency of 'of the' in academic text:  3944
Frequency of 'of the' in fiction text:  4522
Frequency of 'of the' in news text:  3165

Frequency of 'to be' in academic text:  714
Frequency of 'to be' in fiction text:  989
Frequency of 'to be' in news text:  955

Frequency of 'and the' in academic text:  995
Frequency of 'and the' in fiction text:  1363

--------------------------------------

Total # tokens in acad texts is 530053
Total # tokens in fic texts is 606614

Total # types in acad texts is 262370
Total # types in fic texts is 260166
Learning a Naive Bayes model: Stop words¶
There are many common word pairs (bigrams) that may seem unrelated to the type of a text. Due to the nature of the training data, it's possible that some bigrams, consisting entirely of common words like "is", "of", and "and", are more frequent in one part of the training data than another. This can be problematic, as it means a test text might be wrongly classified just because it contains many instances of innocuous bigrams like ("and", "the").
A "stopword list" is a list of words that should not be considered when you classify a test text. In the context of bigrams, we consider a bigram as a stopword if both of its constituent words are in the stopword list. There are many candidate stopword lists available on the internet. The stopword list that we've provided for you is based on this one: https://www.ranks.nl/stopwords.
To emphasize, we consider a bigram as a stopword if "both" of its constituent words are in the stopword list. Both!
Here is our stopword list:
importlib.reload(submitted)
print(sorted(submitted.stopwords))
["'d", "'ll", "'m", "'re", "'s", "'t", "'ve", 'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'cannot', 'could', 'couldn', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', 'has', 'hasn', 'have', 'haven', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', 'it', 'its', 'itself', 'let', 'll', 'me', 'more', 'most', 'mustn', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'shan', 'she', 'should', 'shouldn', 'so', 'some', 'such', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn', 'we', 'were', 'weren', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'with', 'won', 'would', 'wouldn', 'you', 'your', 'yours', 'yourself', 'yourselves']
To effectively avoid counting bigrams that are considered stopwords, two steps are necessary:
- Pretend that bigrams in which both words are stopwords have a frequency of zero in the training corpus.
- Ignore such bigrams if they occur in testing data.
In this part of the MP, you should set the frequencies of those bigram stopwords to zero. Use the del statement (see the documentation for Counter), so that these bigrams don't get counted among either the bigram types or the bigram tokens.
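Here is a minimal sketch of the idea, assuming the frequency table from above and the submitted.stopwords set (illustrative only, not necessarily how your graded remove_stopwords must be written):
import copy

nonstop = copy.deepcopy(frequency)           # leave the original table intact
for y in nonstop:
    for bigram in list(nonstop[y].keys()):   # list() so we can delete while iterating
        w1, w2 = bigram.split('*-*-*-*')
        if w1 in submitted.stopwords and w2 in submitted.stopwords:
            del nonstop[y][bigram]           # removes both the tokens and the type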
importlib.reload(submitted)
help(submitted.remove_stopwords)
Help on function remove_stopwords in module submitted:
remove_stopwords(frequency)
Parameters:
frequency (dict of Counters):
- frequency[y][x] = number of occurrences of bigram x in texts of class y,
where x is in the format 'word1*-*-*-*word2'
stopwords (set of str):
- Set of stopwords to be excluded
Output:
nonstop (dict of Counters):
- nonstop[y][x] = frequency of bigram x in texts of class y,
but only if at least one of the two words in x is not a stopword. x is in the format 'word1*-*-*-*word2'
importlib.reload(submitted)
nonstop = submitted.remove_stopwords(frequency)
print("Frequency of 'machine learning' in academic text before stopword removal: ",frequency['acad']['machine*-*-*-*learning'])
print("Frequency of 'machine learning' in academic text after stopword removal: ",nonstop['acad']['machine*-*-*-*learning'])
print("\n")
print("Frequency of 'of the' in academic text before stopword removal: ",frequency['acad']['of*-*-*-*the'])
print("Frequency of 'of the' in academic text after stopword removal: ",nonstop['acad']['of*-*-*-*the'])
print("\n")
print("Frequency of 'to be' in academic text before stopword removal: ",frequency['acad']['to*-*-*-*be'])
print("Frequency of 'to be' in academic text after stopword removal: ",nonstop['acad']['to*-*-*-*be'])
print("\n")
print("Frequency of 'and the' in academic text before stopword removal: ",frequency['acad']['and*-*-*-*the'])
print("Frequency of 'and the' in academic text after stopword removal: ",nonstop['acad']['and*-*-*-*the'])
print("\n")
print("--------------------------------------\n")
print("Total acad frequency:",sum(frequency['acad'].values()))
print("Total acad non-stopwords",sum(nonstop['acad'].values()))
print("\n")
print("Total # types in acad texts is",len(frequency['acad'].keys()))
print("Total # non-stopwords in acad is",len(nonstop['acad'].keys()))
print("Length of the stopwords set is:",len(submitted.stopwords))
Frequency of 'machine learning' in academic text before stopword removal:  29
Frequency of 'machine learning' in academic text after stopword removal:  29

Frequency of 'of the' in academic text before stopword removal:  3944
Frequency of 'of the' in academic text after stopword removal:  0

Frequency of 'to be' in academic text before stopword removal:  714
Frequency of 'to be' in academic text after stopword removal:  0

Frequency of 'and the' in academic text before stopword removal:  995
Frequency of 'and the' in academic text after stopword removal:  0

--------------------------------------

Total acad frequency: 530053
Total acad non-stopwords 481950

Total # types in acad texts is 262370
Total # non-stopwords in acad is 259798
Length of the stopwords set is: 150
Learning a Naive Bayes model: Laplace Smoothing¶
The maximum likelihood formula results in some bigram types having zero probability, just because they were not contained in your training data. This is a major weakness of smoothing-free Naive Bayes, as a single zero-probability bigram will cause the probability of a text to be zero!
A better formula is given by Laplace smoothing, according to which
$$P(\text{Bigram}=x|\text{Class}=y)=\frac{\left(\text{\# tokens of bigram}~x~\text{in texts of class}~y\right)+k}{\left(\text{\# tokens of any bigram in texts of class}~y\right)+k\times\left(\text{\# of bigram types}+1\right)}$$
...where $k$ is a hyperparameter that is usually chosen by trying several different values, and choosing the value that gives you the best accuracy on your development dataset.
The +1 in the denominator is used to account for bigrams that were never seen in the training dataset for class $y$. All such bigrams are mapped to the type OOV (out of vocabulary), which has the likelihood
$$P(\text{Token}=\text{OOV}|\text{Class}=y)=\frac{k}{\left(\text{\# tokens of any bigram in texts of class}~y\right)+k\times\left(\text{\# of bigram types}+1\right)}$$
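As a sanity check, you can reproduce the likelihoods shown below by hand from the acad counts printed earlier (481,950 non-stopword bigram tokens, 259,798 bigram types, and 7 occurrences of 'years ago'):
k = 0.001
count_x = 7      # tokens of 'years*-*-*-*ago' in acad texts (from above)
total = 481950   # tokens of any non-stopword bigram in acad texts
types = 259798   # non-stopword bigram types in acad texts
print((count_x + k) / (total + k * (types + 1)))  # approx 1.4519e-05
print(k / (total + k * (types + 1)))              # approx 2.0738e-09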
In this part of the MP, the method you'll create in submitted.py is called laplace_smoothing.
importlib.reload(submitted)
help(submitted.laplace_smoothing)
Help on function laplace_smoothing in module submitted:
laplace_smoothing(nonstop, smoothness)
Parameters:
nonstop (dict of Counters)
- nonstop[y][x] = frequency of bigram x in y, where x is in the format 'word1*-*-*-*word2'
and neither word1 nor word2 is a stopword
smoothness (float)
- smoothness = Laplace smoothing hyperparameter
Output:
likelihood (dict of dicts)
- likelihood[y][x] = Laplace-smoothed likelihood of bigram x given y,
where x is in the format 'word1*-*-*-*word2'
- likelihood[y]['OOV'] = likelihood of an out-of-vocabulary bigram given y
Important:
Be careful that your vocabulary only counts bigrams that occurred at least once
in the training data for class y.
importlib.reload(submitted)
likelihood = submitted.laplace_smoothing(nonstop, 0.001)
print("Likelihood of 'years ago' in academic text: ",likelihood['acad']['years*-*-*-*ago'])
print("Likelihood of 'years ago' in fiction text: ",likelihood['fic']['years*-*-*-*ago'])
print("Likelihood of 'years ago' in news text: ",likelihood['news']['years*-*-*-*ago'])
print("\n")
print("Likelihood of OOV bigram type in academic text: ",likelihood['acad']['OOV'])
print("Likelihood of OOV bigram type in fiction text: ",likelihood['fic']['OOV'])
print("Likelihood of OOV bigram type in news text: ",likelihood['news']['OOV'])
print("\n")
print("(should be approx. 1): Likelihood['acad'] sums to",sum(likelihood['acad'].values()))
print("(should be approx. 1): Likelihood['fic'] sums to",sum(likelihood['fic'].values()))
print("(should be approx. 1): Likelihood['news'] sums to",sum(likelihood['news'].values()))
Likelihood of 'years ago' in academic text:  1.451857679897542e-05
Likelihood of 'years ago' in fiction text:  0.0001045642072571625
Likelihood of 'years ago' in news text:  0.0002047836458885437

Likelihood of OOV bigram type in academic text:  2.073786144690104e-09
Likelihood of OOV bigram type in fiction text:  2.0912423202968444e-09
Likelihood of OOV bigram type in news text:  2.0275407757204748e-09

(should be approx. 1): Likelihood['acad'] sums to 0.9999999999999999
(should be approx. 1): Likelihood['fic'] sums to 0.9999999999999999
(should be approx. 1): Likelihood['news'] sums to 0.9999999999999999
Decisions using a Naive Bayes model¶
Suppose you are given a text, which is just a list of word tokens, $x=[x_1,\ldots,x_n]$. You want to decide whether this text is academic, fiction, or news. According to decision theory, the probability of error is minimized by the following rule:
$$\text{Estimated Class}=\begin{cases} \text{acad} & \text{if}~P(\text{Class}=\text{acad}|\text{Text}=x) > P(\text{Class}=\text{fic}|\text{Text}=x)~\text{and}~P(\text{Class}=\text{acad}|\text{Text}=x) > P(\text{Class}=\text{news}|\text{Text}=x)\\ \text{fic} & \text{if}~P(\text{Class}=\text{fic}|\text{Text}=x) > P(\text{Class}=\text{acad}|\text{Text}=x)~\text{and}~P(\text{Class}=\text{fic}|\text{Text}=x) > P(\text{Class}=\text{news}|\text{Text}=x)\\ \text{news} & \text{if}~P(\text{Class}=\text{news}|\text{Text}=x) > P(\text{Class}=\text{acad}|\text{Text}=x)~\text{and}~P(\text{Class}=\text{news}|\text{Text}=x) > P(\text{Class}=\text{fic}|\text{Text}=x)\\ \text{undecided} & \text{if}~P(\text{Class}=\text{acad}|\text{Text}=x) = P(\text{Class}=\text{fic}|\text{Text}=x)~\text{and}~P(\text{Class}=\text{fic}|\text{Text}=x) > P(\text{Class}=\text{news}|\text{Text}=x)\\ \text{undecided} & \text{if}~P(\text{Class}=\text{fic}|\text{Text}=x) = P(\text{Class}=\text{news}|\text{Text}=x)~\text{and}~P(\text{Class}=\text{news}|\text{Text}=x) > P(\text{Class}=\text{acad}|\text{Text}=x)\\ \end{cases}$$
Note that in the latter two cases the decision is narrowed down to two equally likely classes, but for this MP we just treat both as undecided.
For a bigram model, the text $x$ is considered as a sequence of bigram tokens $[x_1, x_2, x_3, ... , x_n]$. Note that each $x_i$ here is not a single word, but a bigram! The posterior probabilities $P(\text{Class}|\text{Text})$ can be estimated using the Naive Bayes model:
$$P(\text{Class}=y|\text{Text}=x)=\frac{P(\text{Class}=y)}{P(\text{Text}=x)}\prod_{\substack{i=1\\ x_i~\text{not a stopword bigram}}}^{n}P(\text{Token}=x_i|\text{Class}=y)$$
Implementation Details¶
Notice some details:
- The term $P(\text{Text}=x)$ doesn't depend on $y$. If you're trying to figure out which is bigger, $P(\text{acad}|x)$ or $P(\text{fic}|x)$, then you don't need to calculate it.
- Multiplying together $n$ probabilities will result in a number that your computer might round down to 0. In order to prevent that, take the logarithm of both sides of the equation above.
- If $x_i$ is a stopword bigram, don't calculate its likelihood. If it isn't a stopword bigram but it doesn't have an entry in likelihood[y], then you should use likelihood[y]['OOV'] (see the sketch after this list).
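Putting these details together, a log-space classifier for a single text might look like this sketch (it assumes the make_bigrams helper from the earlier sketch and submitted.stopwords; it is not the required structure of submitted.naive_bayes):
import math

def classify_one(words, likelihood, prior, classes):
    # illustrative sketch, not the graded implementation
    logposts = []
    for y, p in zip(classes, prior):
        logp = math.log(p)                     # log prior replaces the prior factor
        for bigram in make_bigrams(words):
            w1, w2 = bigram.split('*-*-*-*')
            if w1 in submitted.stopwords and w2 in submitted.stopwords:
                continue                       # stopword bigrams contribute nothing
            # unseen bigrams fall back to the OOV likelihood
            logp += math.log(likelihood[y].get(bigram, likelihood[y]['OOV']))
        logposts.append(logp)
    best = max(logposts)
    return classes[logposts.index(best)] if logposts.count(best) == 1 else 'undecided'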
Implementation¶
For this part of the MP, finish the method called submitted.naive_bayes:
importlib.reload(submitted)
help(submitted.naive_bayes)
Help on function naive_bayes in module submitted:
naive_bayes(texts, likelihood, prior, classes)
Parameters:
texts (list of lists) -
- texts[i][k] = k'th token of i'th text
likelihood (dict of dicts)
- likelihood[y][x] = Laplace-smoothed likelihood of bigram x given y,
where x is in the format 'word1*-*-*-*word2'
prior (list of floats)
- The prior probability of each class, in the same order as classes
classes (list of strings)
- the classes represented in the likelihood dict
Output:
hypotheses (list)
- hypotheses[i] = class label for the i'th text
Let's load the validation set, then try classifying it with, say, a uniform prior:
importlib.reload(reader)
validation = {
'acad': reader.loadDataset('data/dev/human.parquet', ['acad']),
'fic': reader.loadDataset('data/dev/human.parquet', ['fic']),
'news': reader.loadDataset('data/dev/human.parquet', ['news']),
}
labels, texts = reader.val_set_helper(validation)
importlib.reload(submitted)
hypotheses = submitted.naive_bayes(texts, likelihood, [1./3, 1./3, 1./3], ['acad', 'fic', 'news'])
for y in ['acad','fic','news','undecided']:
print("There are",hypotheses.count(y),'examples that were labeled with class',y)
There are 123 examples that were labeled with class acad
There are 141 examples that were labeled with class fic
There are 132 examples that were labeled with class news
There are 0 examples that were labeled with class undecided
print(len(hypotheses))
print(len(labels))
396
396
print("The accuracy of the classifier on the validation set is:")
count_correct = 0
for (y,yhat) in zip(labels, hypotheses):
if y==yhat:
count_correct += 1
print(count_correct / len(labels))
The accuracy of the classifier on the validation set is:
0.9974747474747475
Optimizing Hyperparameters¶
Your classifier should be working extremely well so far! Let's now see if we can use it to determine whether text was generated by an LLM:
importlib.reload(reader)
train = {
'human': reader.loadDataset('data/train/human.parquet', ['acad', 'blog', 'news', 'fic', 'spok', 'tvm']),
'llm': reader.loadDataset('data/train/llm.parquet', ['acad', 'blog', 'news', 'fic', 'spok', 'tvm']),
}
validation = {
'human': reader.loadDataset('data/dev/human.parquet', ['acad', 'blog', 'news', 'fic', 'spok', 'tvm']),
'llm': reader.loadDataset('data/dev/llm.parquet', ['acad', 'blog', 'news', 'fic', 'spok', 'tvm']),
}
labels, texts = reader.val_set_helper(validation)
print ("Loading complete")
Loading complete
print("Making frequency table")
frequency = submitted.create_frequency_table(train)
print("Removing stopwords")
nonstop = submitted.remove_stopwords(frequency)
print("Calculating likelihood")
likelihood = submitted.laplace_smoothing(nonstop, 0.001)
print("Generating hypotheses")
hypotheses = submitted.naive_bayes(texts, likelihood, [1./2, 1./2], ['human', 'llm'])
for y in ['human','llm','undecided']:
print("There are",hypotheses.count(y),'examples that were labeled with class',y)
Making frequency table
Removing stopwords
Calculating likelihood
Generating hypotheses
There are 1497 examples that were labeled with class human
There are 167 examples that were labeled with class llm
There are 0 examples that were labeled with class undecided
print("The accuracy of the classifier on the validation set is:")
count_correct = 0
for (y,yhat) in zip(labels, hypotheses):
if y==yhat:
count_correct += 1
print(count_correct / len(labels))
The accuracy of the classifier on the validation set is:
0.5751201923076923
This is considerably worse than before. Let's see if we can improve it! The performance of the model is heavily influenced by two parameters that can't be measured from the training data:
- The prior, $P(\text{Class}=\text{human})$. The training and testing data might have different priors, so estimating this from the training data is suboptimal.
- The Laplace smoothing parameter, $k$.
Since these two parameters can't be (correctly) estimated from the training data, they are called hyperparameters. Hyperparameters are usually determined based on your knowledge about the problem, or by running a lot of experiments to see which values give the best result on the development test data.
The function you'll write in this part of the MP is called optimize_hyperparameters.
importlib.reload(submitted)
help(submitted.optimize_hyperparameters)
Help on function optimize_hyperparameters in module submitted:
optimize_hyperparameters(texts, labels, classes, nonstop, priors, smoothnesses)
Parameters:
texts (list of lists) - dev set texts
- texts[i][k] = k'th token of i'th text
labels (list) - dev set labels
- labels[i] = class label of i'th text
classes (list of strings)
- the classes represented in the likelihood dict
nonstop (dict of Counters)
- nonstop[y][x] = frequency of bigram x in class y, where x is not a stopword bigram
priors (list of lists)
- a list of different possible sets of values of the prior
smoothnesses (list)
- a list of different possible values of the smoothness
Output:
accuracies (numpy array, shape = len(priors) x len(smoothnesses))
- accuracies[m,n] = dev set accuracy achieved using the
m'th candidate prior and the n'th candidate smoothness
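A straightforward way to implement this is a grid search over the candidate values, as in this sketch (it reuses the functions above; your graded version may differ in details):
import numpy as np

def grid_search(texts, labels, classes, nonstop, priors, smoothnesses):
    # illustrative sketch, not the graded implementation
    accuracies = np.zeros((len(priors), len(smoothnesses)))
    for m, prior in enumerate(priors):
        for n, k in enumerate(smoothnesses):
            likelihood = submitted.laplace_smoothing(nonstop, k)   # re-smooth for each k
            hypotheses = submitted.naive_bayes(texts, likelihood, list(prior), classes)
            accuracies[m, n] = sum(y == yhat for y, yhat in zip(labels, hypotheses)) / len(labels)
    return accuracies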
Let's use this function to test some different candidate values for the prior and the smoothness. The values we test are a little arbitrary, but let's try the following:
importlib.reload(submitted)
import numpy as np
priors = [(0.5,0.5), (0.65,0.35), (0.75,0.25)]
smoothnesses = [0.0001,0.001,0.01,0.1]
accuracies = submitted.optimize_hyperparameters(texts,labels,['human', 'llm'],nonstop,priors,smoothnesses)
(m,n) = np.unravel_index(np.argmax(accuracies), accuracies.shape)
print("The best accuracy achieved was",accuracies[m,n])
print("It was achieved for a prior of", priors[m], "and a smoothness of", smoothnesses[n])
The best accuracy achieved was 0.6814903846153846
It was achieved for a prior of (0.75, 0.25) and a smoothness of 0.01
Grade your homework¶
If you've reached this point, and all of the above sections work, then you're ready to try grading your homework! Before you submit it to Gradescope, try grading it on your own machine. This will run some visible test cases (which you can read in tests/test_visible.py), and compare the results to the solutions (which you can read in solution.json).
The exclamation point (!) tells Jupyter to run the following as a shell command. Obviously you don't need to run the code this way; this usage is here just to remind you that you can also, if you wish, run this command in a terminal window.
!python grade.py
.....
----------------------------------------------------------------------
Ran 5 tests in 40.366s

OK
If you got any 'E' marks, it means that your code generated some runtime errors, and you need to debug those.
If you got any 'F' marks, it means that your code ran without errors, but that it generated results that are different from the solutions in solution.json. Try debugging those differences.
If neither of those things happened, and your result was a series of dots, then your code works perfectly.
If you're not sure, you can try running grade.py with the -j option. This will produce a JSON results file, in which the best score you can get is 50.
Now you should try uploading submitted.py to Gradescope.
Gradescope will run the same visible tests that you just ran on your own machine, plus some additional hidden tests. It's possible that your code passes all the visible tests, but fails the hidden tests. If that happens, then it probably means that you hard-coded a number into your function definition, instead of using the input parameter that you were supposed to use. Debug by running your function with a variety of different input parameters, and see if you can get it to respond correctly in all cases.
Once your code works perfectly on Gradescope, with no errors, then you are done with the MP. Congratulations!