import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
# Read the BodyFat CSV into a DataFrame.
csv_path = "/Users/hongyeliu/Desktop/CS361JupyterNotebook/Lecture3/BodyFat.csv"
df = pd.read_csv(csv_path)
# Bare expression: the notebook renders the table when this ends a cell.
df
# Restrict the frame to the two columns studied below.
df = df.filter(items=['WEIGHT', 'HEIGHT'])
df.head()
# Pairwise correlation matrix of WEIGHT and HEIGHT (before any row filtering).
df.corr()
# Scatter plot of the unfiltered data: HEIGHT on the x-axis, WEIGHT on the y-axis.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df['HEIGHT'], df['WEIGHT'])
ax.set_xlabel("HEIGHT")
ax.set_ylabel("WEIGHT")
ax.set_title("Scatter plot of WEIGHT vs. HEIGHT")
# Step 1: keep only rows whose HEIGHT is strictly greater than 40
# (presumably removing an implausible outlier -- units are not shown here).
height_ok = df["HEIGHT"].gt(40)
df_clean = df.loc[height_ok]
df_clean
# Step 2: of those, keep only rows whose WEIGHT is strictly less than 350.
weight_ok = df_clean["WEIGHT"].lt(350)
df_clean = df_clean.loc[weight_ok]
df_clean
df_clean.head()
# Same scatter plot as before, now drawn from the filtered rows in df_clean.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_clean['HEIGHT'], df_clean['WEIGHT'])
ax.set_xlabel("HEIGHT")
ax.set_ylabel("WEIGHT")
ax.set_title("Scatter plot of WEIGHT vs. HEIGHT")
# Correlation matrix recomputed on the filtered rows.
df_clean.corr()
# Build the prediction line for WEIGHT from HEIGHT by working in standard
# units: predicted z_WEIGHT = r * z_HEIGHT, then convert back to WEIGHT units.
height = df_clean["HEIGHT"]
weight = df_clean["WEIGHT"]

# A straight line needs only two points: the smallest and largest HEIGHT.
x_vals = np.array([height.min(), height.max()])

# Standardize the two x endpoints with the population std (ddof=0).
x_vals_standardized = (x_vals - height.mean()) / height.std(ddof=0)

# Correlation coefficient between HEIGHT and WEIGHT, read from the matrix.
r_height_weight = df_clean.corr().loc["WEIGHT", "HEIGHT"]
y_predictions_standardized = r_height_weight * x_vals_standardized

# Undo the standardization on the y side (same ddof=0 convention).
y_predictions = y_predictions_standardized * weight.std(ddof=0) + weight.mean()

# Scatter of the filtered data with the prediction line drawn on top in red.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(height, weight)
ax.set_xlabel("HEIGHT")
ax.set_ylabel("WEIGHT")
ax.set_title("Scatter plot of WEIGHT vs. HEIGHT with prediction line")
ax.plot(x_vals, y_predictions, 'r', linewidth=5)