In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
df = pd.read_csv("/Users/hongyeliu/Desktop/CS361JupyterNotebook/Lecture3/BodyFat.csv")
df
Out[1]:
IDNO BODYFAT DENSITY AGE WEIGHT HEIGHT ADIPOSITY NECK CHEST ABDOMEN HIP THIGH KNEE ANKLE BICEPS FOREARM WRIST
0 1 12.6 1.0708 23 154.25 67.75 23.7 36.2 93.1 85.2 94.5 59.0 37.3 21.9 32.0 27.4 17.1
1 2 6.9 1.0853 22 173.25 72.25 23.4 38.5 93.6 83.0 98.7 58.7 37.3 23.4 30.5 28.9 18.2
2 3 24.6 1.0414 22 154.00 66.25 24.7 34.0 95.8 87.9 99.2 59.6 38.9 24.0 28.8 25.2 16.6
3 4 10.9 1.0751 26 184.75 72.25 24.9 37.4 101.8 86.4 101.2 60.1 37.3 22.8 32.4 29.4 18.2
4 5 27.8 1.0340 24 184.25 71.25 25.6 34.4 97.3 100.0 101.9 63.2 42.2 24.0 32.2 27.7 17.7
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
247 248 11.5 1.0736 70 134.25 67.00 21.1 34.9 89.2 83.6 88.8 49.6 34.8 21.5 25.6 25.7 18.5
248 249 32.3 1.0236 72 201.00 69.75 29.1 40.9 108.5 105.0 104.5 59.6 40.8 23.2 35.2 28.6 20.1
249 250 28.3 1.0328 72 186.75 66.00 30.2 38.9 111.1 111.5 101.7 60.3 37.3 21.5 31.3 27.2 18.0
250 251 25.3 1.0399 72 190.75 70.50 27.0 38.9 108.3 101.3 97.8 56.0 41.6 22.7 30.5 29.4 19.8
251 252 30.7 1.0271 74 207.50 70.00 29.8 40.8 112.4 108.5 107.1 59.3 42.2 24.6 33.7 30.0 20.9

252 rows × 17 columns

Correlation

In [2]:
# Narrow the frame to the two variables studied in the rest of the notebook.
# NOTE: this rebinds `df`, so the other columns are no longer reachable
# through that name after this cell has run.
columns_of_interest = ['WEIGHT', 'HEIGHT']
df = df.filter(items=columns_of_interest)
df.head()
Out[2]:
WEIGHT HEIGHT
0 154.25 67.75
1 173.25 72.25
2 154.00 66.25
3 184.75 72.25
4 184.25 71.25
In [3]:
# Pairwise Pearson correlation (the pandas default) between WEIGHT and HEIGHT.
df.corr(method="pearson")
Out[3]:
WEIGHT HEIGHT
WEIGHT 1.000000 0.308279
HEIGHT 0.308279 1.000000

Scatter plot

In [4]:
# Scatter of the unfiltered data: HEIGHT on the x-axis, WEIGHT on the y-axis.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df['HEIGHT'], df['WEIGHT'])
ax.set_xlabel("HEIGHT")
ax.set_ylabel("WEIGHT")
ax.set_title("Scatter plot of WEIGHT vs. HEIGHT")
Out[4]:
Text(0.5, 1.0, 'Scatter plot of WEIGHT vs. HEIGHT')

Remove outliers

In [7]:
# Keep only the rows whose HEIGHT is strictly greater than 40; the recorded
# output shows 251 of the 252 rows surviving this filter.
height_above_cutoff = df["HEIGHT"].gt(40)
df_clean = df.loc[height_above_cutoff]
df_clean
Out[7]:
WEIGHT HEIGHT
0 154.25 67.75
1 173.25 72.25
2 154.00 66.25
3 184.75 72.25
4 184.25 71.25
... ... ...
247 134.25 67.00
248 201.00 69.75
249 186.75 66.00
250 190.75 70.50
251 207.50 70.00

251 rows × 2 columns

In [8]:
# Keep only the rows whose WEIGHT is strictly below 350; the recorded output
# shows 250 rows remaining afterwards.
weight_below_cutoff = df_clean["WEIGHT"].lt(350)
df_clean = df_clean.loc[weight_below_cutoff]
df_clean
Out[8]:
WEIGHT HEIGHT
0 154.25 67.75
1 173.25 72.25
2 154.00 66.25
3 184.75 72.25
4 184.25 71.25
... ... ...
247 134.25 67.00
248 201.00 69.75
249 186.75 66.00
250 190.75 70.50
251 207.50 70.00

250 rows × 2 columns

In [9]:
# Preview the first five rows of the filtered frame.
df_clean.head(n=5)
Out[9]:
WEIGHT HEIGHT
0 154.25 67.75
1 173.25 72.25
2 154.00 66.25
3 184.75 72.25
4 184.25 71.25
In [10]:
# Same scatter as before, drawn from the filtered frame `df_clean`.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_clean['HEIGHT'], df_clean['WEIGHT'])
ax.set_xlabel("HEIGHT")
ax.set_ylabel("WEIGHT")
ax.set_title("Scatter plot of WEIGHT vs. HEIGHT")
Out[10]:
Text(0.5, 1.0, 'Scatter plot of WEIGHT vs. HEIGHT')

Correlation after removing outliers

In [12]:
# Pearson correlation (the pandas default) recomputed on the filtered frame.
df_clean.corr(method="pearson")
Out[12]:
WEIGHT HEIGHT
WEIGHT 1.000000 0.512913
HEIGHT 0.512913 1.000000

Prediction

In [13]:
# Predict WEIGHT from HEIGHT using the correlation coefficient:
#   1. standardize the x values (population std, ddof=0),
#   2. multiply by r to get the predicted y in standard units,
#   3. convert back to WEIGHT units with WEIGHT's population std and mean.
height = df_clean["HEIGHT"]
weight = df_clean["WEIGHT"]
r = df_clean.corr().loc["WEIGHT", "HEIGHT"]

# Two endpoints are enough to draw a straight line across the observed range.
x_vals = np.array([height.min(), height.max()])
x_vals_standardized = (x_vals - height.mean()) / height.std(ddof=0)
y_predictions_standardized = r * x_vals_standardized
y_predictions = y_predictions_standardized * weight.std(ddof=0) + weight.mean()

# Overlay the prediction line (red) on the scatter of the filtered data.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(height, weight)
ax.set_xlabel("HEIGHT")
ax.set_ylabel("WEIGHT")
ax.set_title("Scatter plot of WEIGHT vs. HEIGHT with prediction line")
ax.plot(x_vals, y_predictions, 'r', linewidth=5)
Out[13]:
[<matplotlib.lines.Line2D at 0x1a1be57490>]
In [ ]:
 
In [ ]: