The first thing you need to do is download this file: mp01.zip. It contains the following:

submitted.py: Your homework. Edit it, and then submit it to Gradescope.

mp01_notebook.ipynb: A Jupyter notebook to help you debug. You can completely ignore it if you want, although you might find that it gives you useful instructions.

grade.py: Once your homework seems to be working, you can test it by typing python grade.py, which will run the tests in tests/test_visible.py.

tests/test_visible.py: This file contains about half of the unit tests that Gradescope will run in order to grade your homework. If you can get a perfect score on these tests, then you should also get a perfect score on the additional hidden tests that Gradescope uses.

solution.json: This file contains the solutions for the visible test cases, in JSON format. If the instructions are confusing you, look at this file to see if it can help to clear up your confusion.

data: This directory contains the data.

reader.py: An auxiliary program that you can use to read the data.

requirements.txt: This lists the python packages you need to have installed in order to run grade.py. You can install all of them by typing pip install -r requirements.txt or pip3 install -r requirements.txt.

This file (mp01_notebook.ipynb) will walk you through the whole MP, giving you instructions and debugging tips as you go.
There are two types of data: visible data (provided to you), and hidden data (available only to the autograder on Gradescope). If you get your code working for the visible data, it should also work for the hidden data.
The visible dataset consists of 500 emails, a subset of the Enron-Spam dataset provided by Ion Androutsopoulos. MP02 will use a larger portion of the same dataset.
In order to help you load the data, we provide you with a utility module called reader.py. Since its functions are documented by docstrings, you can find information about each one by using help:
import reader
help(reader)
Help on module reader:

NAME
    reader - This file is responsible for providing functions for reading the files

FUNCTIONS
    loadDir(dirname, stemming, lower_case, use_tqdm=True)
        Loads the files in the folder and returns a list of lists of words from the text in each file.

        Parameters:
        name (str): the directory containing the data
        stemming (bool): if True, use NLTK's stemmer to remove suffixes
        lower_case (bool): if True, convert letters to lowercase
        use_tqdm (bool, default:True): if True, use tqdm to show status bar

        Output:
        texts (list of lists): texts[m][n] is the n'th word in the m'th email
        count (int): number of files loaded

    loadFile(filename, stemming, lower_case)
        Load a file, and returns a list of words.

        Parameters:
        filename (str): the directory containing the data
        stemming (bool): if True, use NLTK's stemmer to remove suffixes
        lower_case (bool): if True, convert letters to lowercase

        Output:
        x (list): x[n] is the n'th word in the file

DATA
    bad_words = {'aed', 'eed', 'oed'}
    porter_stemmer = <PorterStemmer>
    tokenizer = RegexpTokenizer(pattern='\\w+', gaps=False, disc...ty=True...

FILE
    /Users/jhasegaw/Dropbox/mark/teaching/ece448/ece448labs/spring23/mp01/src/reader.py
Well, that's pretty straightforward. Let's use it to load the data directory.
import importlib
importlib.reload(reader)
texts, count = reader.loadDir('data',False,False)
100%|██████████████████████████████████████████████████| 500/500 [00:00<00:00, 6554.26it/s]
print("There were",count,"files loaded")
There were 500 files loaded
print("The first file contained the following words:",texts[0])
The first file contained the following words: ['Subject', 'done', 'new', 'sitara', 'desk', 'request', 'ref', 'cc', '20000813', 'carey', 'per', 'scott', 's', 'request', 'below', 'the', 'following', 'business', 'unit', 'aka', 'desk', 'id', 'portfolio', 'was', 'added', 'to', 'global', 'production', 'and', 'unify', 'development', 'test', 'production', 'and', 'stage', 'please', 'copy', 'to', 'the', 'other', 'global', 'environments', 'thanks', 'dick', 'x', '3', '1489', 'updated', 'in', 'global', 'production', 'environment', 'gcc', 'code', 'desc', 'p', 'ent', 'subenti', 'data', '_', 'cd', 'ap', 'data', '_', 'desc', 'code', '_', 'id', 'a', 'sit', 'deskid', 'imcl', 'a', 'ena', 'im', 'cleburne', '9273', 'from', 'scott', 'mills', '08', '30', '2000', '08', '27', 'am', 'to', 'samuel', 'schott', 'hou', 'ect', 'ect', 'richard', 'elwood', 'hou', 'ect', 'ect', 'debbie', 'r', 'brackett', 'hou', 'ect', 'ect', 'judy', 'rose', 'hou', 'ect', 'ect', 'vanessa', 'schulte', 'corp', 'enron', 'enron', 'david', 'baumbach', 'hou', 'ect', 'ect', 'daren', 'j', 'farmer', 'hou', 'ect', 'ect', 'dave', 'nommensen', 'hou', 'ect', 'ect', 'donna', 'greif', 'hou', 'ect', 'ect', 'shawna', 'johnson', 'corp', 'enron', 'enron', 'russ', 'severson', 'hou', 'ect', 'ect', 'cc', 'subject', 'new', 'sitara', 'desk', 'request', 'this', 'needs', 'to', 'be', 'available', 'in', 'production', 'by', 'early', 'afternoon', 'sorry', 'for', 'the', 'short', 'notice', 'srm', 'x', '33548']
In this week's MP, we will work with the following two random variables:
... where you can specify word1 and word2 as parameters of the function. In this section, we will compute the joint, conditional, and marginal distributions of $X_1$ and $X_2$. These will be estimated, from the available data, using the following formulas, where $N(X_1=x_1,X_2=x_2)$ is the number of texts in the dataset that contain $x_1$ instances of word1, and $x_2$ instances of word2:
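The formulas themselves are not reproduced in this text. For reference, the standard counting (maximum-likelihood) estimates that this kind of assignment typically uses, stated here as a plausible reconstruction rather than a quote of the original, are:

$$P(X_1=x_1,X_2=x_2)=\frac{N(X_1=x_1,X_2=x_2)}{\sum_{x_1'}\sum_{x_2'}N(X_1=x_1',X_2=x_2')}$$

$$P(X_1=x_1)=\sum_{x_2}P(X_1=x_1,X_2=x_2)$$

$$P(X_2=x_2|X_1=x_1)=\frac{P(X_1=x_1,X_2=x_2)}{P(X_1=x_1)}$$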
At this point, we'll load the file submitted.py.
The file submitted.py is the only part of your work that the autograder will see. The only purpose of this notebook is to help you debug submitted.py. Once you have revised submitted.py enough to make this notebook work, you should go to the command line and type python grade.py. Once that command returns without errors, you can go ahead and submit your file submitted.py to the autograder. You can submit to the autograder as often as you want, but it will save you trouble if you debug as much as you can on your local machine before you submit to the autograder.
We will use importlib in order to reload your submitted.py over and over again. That way, every time you make a modification in submitted.py, you can just re-run the corresponding block of this notebook, and it will reload submitted.py with your modified code.
Since the file is called submitted.py, python considers it to contain a module called submitted. As shown, you can read the module's docstring by printing submitted.__doc__. You can also type help(submitted) to get a lot of information about the module, including its docstring, a list of all the functions it defines, and all of their docstrings. For more about docstrings, see, for example, https://www.python.org/dev/peps/pep-0257/.
import submitted
import importlib
importlib.reload(submitted)
print(submitted.__doc__)
This is the module you'll submit to the autograder. There are several function definitions, here, that raise RuntimeErrors. You should replace each "raise RuntimeError" line with a line that performs the function specified in the function's docstring.
Now it's time for you to open submitted.py, and start editing it. You can open it in another Jupyter window by choosing "Open from Path" from the "File" menu, and then typing submitted.py. Alternatively, you can use any text editor.
Once you have it open, try editing the function joint_distribution_of_word_counts so that its functionality matches its docstring. Here is what its docstring says:
help(submitted.joint_distribution_of_word_counts)
Help on function joint_distribution_of_word_counts in module submitted:

joint_distribution_of_word_counts(texts, word0, word1)
    Parameters:
    texts (list of lists) - a list of texts; each text is a list of words
    word0 (str) - the first word to count
    word1 (str) - the second word to count

    Output:
    Pjoint (numpy array) - Pjoint[m,n] = P(X0=m,X1=n), where
      X0 is the number of times that word0 occurs in a given text,
      X1 is the number of times that word1 occurs in the same text.
Edit joint_distribution_of_word_counts
so that it does the task specified in its docstring. When you get the code working, you can count the number of times that the words "Mr." and "company" co-occur. It turns out that 96.4% of all texts contain neither word. 2.4% of texts contain the word "company" just once, 0.2% contain it twice, 0.2% contain it four times. 0.6% contain the word "Mr." just once, 0.2% contain it four times. There are no files in the whole database that contain both words together!
importlib.reload(submitted)
Pjoint = submitted.joint_distribution_of_word_counts(texts, 'mr', 'company')
print(Pjoint)
[[0.964 0.024 0.002 0.    0.002]
 [0.006 0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.   ]
 [0.002 0.    0.    0.    0.   ]]
Now, edit the functions marginal_distribution_of_word_counts and conditional_distribution_of_word_counts. The results you should get are shown below, and are also available to you in the file solution.json.
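A sketch of these two functions, under the assumption (consistent with the calls below) that the marginal index selects which variable to keep and that conditioning divides each row of the joint table by the corresponding marginal:

```python
import numpy as np

def marginal_distribution_of_word_counts(Pjoint, index):
    """Sketch: P(X_index) obtained by summing the joint over the other axis."""
    # For a 2-D joint table, summing over axis 1 keeps X0, axis 0 keeps X1.
    return Pjoint.sum(axis=1 - index)

def conditional_distribution_of_word_counts(Pjoint, Pmarginal):
    """Sketch: Pcond[m,n] = P(X1=n | X0=m) = Pjoint[m,n] / Pmarginal[m]."""
    # Rows whose marginal is zero become nan (0/0), as in the notebook output.
    with np.errstate(divide='ignore', invalid='ignore'):
        return Pjoint / Pmarginal[:, np.newaxis]
```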
importlib.reload(submitted)
P0 = submitted.marginal_distribution_of_word_counts(Pjoint, 0)
print(P0)
[0.992 0.006 0. 0. 0.002]
importlib.reload(submitted)
P1 = submitted.marginal_distribution_of_word_counts(Pjoint, 1)
print(P1)
[0.972 0.024 0.002 0. 0.002]
import numpy as np
importlib.reload(submitted)
Pcond = submitted.conditional_distribution_of_word_counts(Pjoint, P0)
print("Conditional distribution table:")
print(Pcond)
print("\nSums of the rows:")
print(np.sum(Pcond, axis=1))
Conditional distribution table:
[[0.97177419 0.02419355 0.00201613 0.         0.00201613]
 [1.         0.         0.         0.         0.        ]
 [       nan        nan        nan        nan        nan]
 [       nan        nan        nan        nan        nan]
 [1.         0.         0.         0.         0.        ]]

Sums of the rows:
[ 1.  1. nan nan  1.]
In order to study mean, variance, and covariance, let's first find the joint distribution of a pair of words that occur more frequently. How about "a" and "the"? Amazingly, as the following code shows, there is a small nonzero probability that "a" occurs 19 times and "the" occurs 58 times in the same text!
importlib.reload(submitted)
Pathe = submitted.joint_distribution_of_word_counts(texts, 'a', 'the')
print("Here is the joint distribution:")
print(Pathe)
print("\n It has size", Pathe.shape)
Here is the joint distribution:
[[0.248 0.078 0.056 ... 0.    0.    0.   ]
 [0.036 0.028 0.026 ... 0.    0.    0.   ]
 [0.006 0.006 0.014 ... 0.    0.    0.   ]
 ...
 [0.    0.    0.    ... 0.    0.    0.   ]
 [0.    0.    0.    ... 0.    0.    0.   ]
 [0.    0.    0.    ... 0.    0.    0.002]]

 It has size (20, 59)
importlib.reload(submitted)
Pthe = submitted.marginal_distribution_of_word_counts(Pathe, 1)
print("Counts of the word /the/ have the following distribution:")
print(Pthe)
Counts of the word /the/ have the following distribution: [0.296 0.122 0.106 0.09 0.076 0.056 0.026 0.04 0.032 0.026 0.016 0.01 0.014 0.008 0.014 0.006 0.008 0.004 0.008 0.002 0.004 0.002 0. 0.002 0. 0.008 0.01 0.002 0. 0.006 0. 0. 0. 0. 0. 0.004 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.002]
Now let's calculate mean, variance, and covariance. First, look at their docstrings:
importlib.reload(submitted)
help(submitted.mean_from_distribution)
Help on function mean_from_distribution in module submitted:

mean_from_distribution(P)
    Parameters:
    P (numpy array) - P[n] = P(X=n)

    Outputs:
    mu (float) - the mean of X
importlib.reload(submitted)
help(submitted.variance_from_distribution)
Help on function variance_from_distribution in module submitted:

variance_from_distribution(P)
    Parameters:
    P (numpy array) - P[n] = P(X=n)

    Outputs:
    var (float) - the variance of X
importlib.reload(submitted)
help(submitted.covariance_from_distribution)
Help on function covariance_from_distribution in module submitted:

covariance_from_distribution(P)
    Parameters:
    P (numpy array) - P[m,n] = P(X0=m,X1=n)

    Outputs:
    covar (float) - the covariance of X0 and X1
Now that you understand them, try editing submitted.py so that these functions perform the specified tasks. You should get the following results (which are also provided to you in the file solution.json):
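One way these three functions can be written, shown as a sketch using the standard identities $E[X]=\sum_n nP(X{=}n)$, $\mathrm{Var}[X]=E[X^2]-E[X]^2$, and $\mathrm{Cov}[X_0,X_1]=E[X_0X_1]-E[X_0]E[X_1]$:

```python
import numpy as np

def mean_from_distribution(P):
    # E[X] = sum_n n * P(X=n)
    return float(np.sum(np.arange(len(P)) * P))

def variance_from_distribution(P):
    n = np.arange(len(P))
    mu = np.sum(n * P)
    # Var[X] = E[X^2] - (E[X])^2
    return float(np.sum(n**2 * P) - mu**2)

def covariance_from_distribution(P):
    # Index grids for the two variables, shaped for broadcasting.
    m = np.arange(P.shape[0])[:, np.newaxis]
    n = np.arange(P.shape[1])[np.newaxis, :]
    mu0 = np.sum(m * P)
    mu1 = np.sum(n * P)
    # Cov[X0,X1] = E[X0*X1] - E[X0]*E[X1]
    return float(np.sum(m * n * P) - mu0 * mu1)
```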
importlib.reload(submitted)
mu_the = submitted.mean_from_distribution(Pthe)
print(mu_the)
4.432
importlib.reload(submitted)
var_the = submitted.variance_from_distribution(Pthe)
print(var_the)
41.601376
importlib.reload(submitted)
covar_a_the = submitted.covariance_from_distribution(Pathe)
print(covar_a_the)
9.244752
Now, let's calculate the expected value of an arbitrary function of a random variable. If $f(x_0,x_1)$ is some real-valued function of variables $x_0$ and $x_1$, then its expected value is:
$$E\left[f(X_0,X_1)\right]=\sum_{x_0,x_1} f(x_0,x_1) P(X_0=x_0,X_1=x_1)$$

Let's read the docstring:
importlib.reload(submitted)
help(submitted.expectation_of_a_function)
Help on function expectation_of_a_function in module submitted:

expectation_of_a_function(P, f)
    Parameters:
    P (numpy array) - joint distribution, P[m,n] = P(X0=m,X1=n)
    f (function) - f should be a function that takes two real-valued inputs, x0 and x1.
      The output, z=f(x0,x1), must be a real number for all values of (x0,x1)
      such that P(X0=x0,X1=x1) is nonzero.

    Output:
    expected (float) - the expected value, E[f(X0,X1)]
The function needs to produce real-valued outputs for all allowable (x0,x1) pairs, but otherwise, it can be as weird as we like. For example, let's define it as follows:
import numpy as np
def f(x0,x1):
    return np.log(x0+1) + np.log(x1+1)
print("f(0,0) is",f(0,0))
print("f(0,15) is",f(0,15))
print("f(1,1) is",f(1,1))
print("f(19,58) is",f(19,58))
f(0,0) is 0.0
f(0,15) is 2.772588722239781
f(1,1) is 1.3862943611198906
f(19,58) is 7.073269717459711
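Consistent with the docstring above, a sketch of expectation_of_a_function simply weights f by the joint probability at every cell, skipping zero-probability cells so that f is only ever evaluated where it is required to be real-valued:

```python
import numpy as np

def expectation_of_a_function(P, f):
    """Sketch: E[f(X0,X1)] = sum over all (x0,x1) of f(x0,x1)*P(X0=x0,X1=x1)."""
    expected = 0.0
    for x0 in range(P.shape[0]):
        for x1 in range(P.shape[1]):
            # Only evaluate f where the joint probability is nonzero.
            if P[x0, x1] > 0:
                expected += f(x0, x1) * P[x0, x1]
    return expected
```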
importlib.reload(submitted)
expected = submitted.expectation_of_a_function(Pathe, f)
print(expected)
1.7722821489053828
If you've reached this point, and all of the above sections work, then you're ready to try grading your homework! Before you submit it to Gradescope, try grading it on your own machine. This will run some visible test cases (which you can read in tests/test_visible.py), and compare the results to the solutions (which you can read in solution.json).
The exclamation point (!) tells Jupyter to run the rest of the line as a shell command. Obviously you don't need to run the code this way -- this usage is here just to remind you that you can also, if you wish, run this command in a terminal window.
!python grade.py
/Users/jhasegaw/Dropbox/mark/teaching/ece448/ece448labs/spring23/mp01/src/submitted.py:67: RuntimeWarning: invalid value encountered in true_divide
  Pcond[m,:] = Pjoint[m,:] / Pmarginal[m]
......
----------------------------------------------------------------------
Ran 6 tests in 0.076s

OK
If you got any 'E' marks, it means that your code generated some runtime errors, and you need to debug those.
If you got any 'F' marks, it means that your code ran without errors, but that it generated results that are different from the solutions in solution.json. Try debugging those differences.
If neither of those things happened, and your result was a series of dots, then your code works perfectly.
If you're not sure, you can try running grade.py with the -j option. This will produce a JSON results file, in which the best score you can get is 50.
!python grade.py -j
{
  "tests": [
    {
      "name": "test_cond (test_visible.TestStep)",
      "score": 8,
      "max_score": 8,
      "output": "\n/Users/jhasegaw/Dropbox/mark/teaching/ece448/ece448labs/spring23/mp01/src/submitted.py:67: RuntimeWarning: invalid value encountered in true_divide\n  Pcond[m,:] = Pjoint[m,:] / Pmarginal[m]\n"
    },
    { "name": "test_covariance (test_visible.TestStep)", "score": 8, "max_score": 8 },
    { "name": "test_expected (test_visible.TestStep)", "score": 8, "max_score": 8 },
    { "name": "test_joint (test_visible.TestStep)", "score": 9, "max_score": 9 },
    { "name": "test_marginal (test_visible.TestStep)", "score": 9, "max_score": 9 },
    { "name": "test_mean (test_visible.TestStep)", "score": 8, "max_score": 8 }
  ],
  "leaderboard": [],
  "visibility": "visible",
  "execution_time": "0.06",
  "score": 50
}
Now you should try uploading submitted.py to Gradescope.
Gradescope will run the same visible tests that you just ran on your own machine, plus some additional hidden tests. It's possible that your code passes all the visible tests, but fails the hidden tests. If that happens, then it probably means that you hard-coded a number into your function definition, instead of using the input parameter that you were supposed to use. Debug by running your function with a variety of different input parameters, and see if you can get it to respond correctly in all cases.
Once your code works perfectly on Gradescope, with no errors, then you are done with the MP. Congratulations!