import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import logit
import matplotlib.pyplot as plt
import math
psych = pd.read_table("psych.txt", sep=" ")
print (psych)
endog = psych.iloc[:,0] #y: ill
exog = sm.add_constant(psych.iloc[:,1:6]) # x1-x5 with the interception
psychfit1 = sm.GLM(endog, exog, family=sm.families.Binomial())
result1 = psychfit1.fit()
print(result1.summary())
# In another way
logistic_model = logit('ill ~ x1+x2+x3+x4+x5',psych)
result = logistic_model.fit()
print(result.summary())
# Try using the total score only.
endog = psych.iloc[:,0] #y: ill
np.sum(psych.iloc[:,1:6], axis=1) # calcuate the row sum
exog = sm.add_constant(np.sum(psych.iloc[:,1:6], axis=1))
psychfit2 = sm.GLM(endog, exog, family=sm.families.Binomial())
result2 = psychfit2.fit()
print(result2.summary())
# Plot fitted probabilities of illness versus the total score:
def p(x): # Calcuate the fitted probabilities
e = result2.params["const"]+result2.params[0]*x
return(math.exp(e)/(1+math.exp(e)))
fitprob = list(map(p, np.sum(psych.iloc[:,1:6], axis=1)))
# plotting the points
plt.scatter(np.sum(psych.iloc[:,1:6], axis=1), fitprob)
# naming the x axis
plt.xlabel('Total Score')
# naming the y axis
plt.ylabel('Fitted Probability of Illness')
# function to show the plot
plt.show()
# Plot fitted probabilities of illness versus the total score:
fitprob = result2.predict(sm.add_constant(np.sum(psych.iloc[:,1:6], axis=1)))
# plotting the points
plt.scatter(np.sum(psych.iloc[:,1:6], axis=1), fitprob)
# naming the x axis
plt.xlabel('Total Score')
# naming the y axis
plt.ylabel('Fitted Probability of Illness')
# function to show the plot
plt.show()
data = np.array([(24, 1355, 0),(35, 603, 2),(21,192, 4), (30, 224, 5)])
snoreheart = pd.DataFrame(data, columns=['Disease', 'NoDisease', 'Snoring'])
snoreheart
endog = snoreheart.iloc[:,:2] #y: Disease and NoDisease
exog = sm.add_constant(snoreheart.Snoring) # Snoring with the interception
snorefit = sm.GLM(endog, exog, family=sm.families.Binomial())
result3 = snorefit.fit()
print(result3.summary())
plt.scatter(snoreheart.Snoring, snoreheart.Disease/(snoreheart.Disease+snoreheart.NoDisease))
plt.xlabel('Snoring Level')
plt.ylabel('Prob. Heart Disease')
x = np.linspace(0, 5, 200)
plt.plot(x, result3.predict(sm.add_constant(x)))
plt.show()
horseshoe = pd.read_table("horseshoe.txt", sep = " ")
horseshoe = horseshoe.iloc[:,:6]
horseshoe
# plotting the points
plt.scatter(horseshoe.width, horseshoe.satell)
# naming the x axis
plt.xlabel('Width')
# naming the y axis
plt.ylabel('Number of Satellites')
# function to show the plot
plt.show()
endog = horseshoe.satell
exog = sm.add_constant(horseshoe.width)
snorefit = sm.GLM(endog, exog, family=sm.families.Poisson())
result4 = snorefit.fit()
print(result4.summary())
tc = pd.read_table("traincollisions.txt", sep = " ")
tc
endog = tc.TrRd
exog = sm.add_constant(tc.Year-1975)
tcfit = sm.GLM(endog, exog, offset = np.log(tc.KM), family=sm.families.Poisson()) #offset = log(KM)
result5 = tcfit.fit()
print(result5.summary())
plt.scatter(tc.Year, 1000*tc.TrRd/tc.KM)
plt.ylabel('Collisions per Billion Train-Kilometers')
plt.xlabel('Year')
newyear = np.linspace(1975, 2005, 200)
KM = [1]*200
result5.predict(sm.add_constant(newyear-1975), offset = np.log(KM)) # predict when KM=1, log(1)=0
# result5.predict(sm.add_constant(newyear-1975), offset = [0]*200)
plt.plot(newyear, 1000 * result5.predict(sm.add_constant(newyear-1975), offset = np.log(KM)))
plt.show()