import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
# Load the body-fat dataset from CSV into `df`.
# NOTE(review): this is an absolute path on the original author's machine;
# point it at your own copy of BodyFat.csv before re-running.
csv_path = "/Users/hongyeliu/Desktop/CS361JupyterNotebook/Lecture3/BodyFat.csv"
df = pd.read_csv(csv_path)
df  # last expression of the cell: shows the loaded frame

Correlation

# Narrow the frame to the two columns analysed below. `df` is rebound, so the
# other columns are no longer reachable through it after this line.
columns_of_interest = ['WEIGHT', 'HEIGHT']
df = df.filter(items=columns_of_interest)
df.head(n=5)

# Pairwise Pearson correlation between the remaining columns.
df.corr(method="pearson")

Scatter plot

# Scatter of WEIGHT (y axis) against HEIGHT (x axis) for every row in `df`,
# drawn on a single 8x8 figure through the Axes-level API.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df['HEIGHT'], df['WEIGHT'])
ax.set_xlabel("HEIGHT")
ax.set_ylabel("WEIGHT")
ax.set_title("Scatter plot of WEIGHT vs. HEIGHT")

Text(0.5, 1.0, 'Scatter plot of WEIGHT vs. HEIGHT')

Remove outliers

# Build `df_clean` by dropping rows in two passes. The cut-offs are hard-coded;
# presumably read off the scatter plot of the raw data -- TODO confirm.

# Pass 1: keep only rows whose HEIGHT is strictly greater than 40.
height_mask = df["HEIGHT"] > 40
df_clean = df.loc[height_mask]
df_clean

# Pass 2: of those, keep only rows whose WEIGHT is strictly less than 350.
weight_mask = df_clean["WEIGHT"] < 350
df_clean = df_clean.loc[weight_mask]
df_clean

df_clean.head(n=5)

# Same scatter as for the raw data, but restricted to the rows kept in
# `df_clean`; the title text is intentionally unchanged.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_clean['HEIGHT'], df_clean['WEIGHT'])
ax.set_xlabel("HEIGHT")
ax.set_ylabel("WEIGHT")
ax.set_title("Scatter plot of WEIGHT vs. HEIGHT")

Text(0.5, 1.0, 'Scatter plot of WEIGHT vs. HEIGHT')

Correlation

# Pearson correlation matrix recomputed on the rows kept in `df_clean`.
df_clean.corr(method="pearson")

Prediction

# Prediction line for WEIGHT from HEIGHT, built from the correlation
# coefficient: standardize x, multiply by r, then map the result back onto the
# WEIGHT scale using WEIGHT's mean and (population, ddof=0) standard deviation.
height_col = df_clean["HEIGHT"]
weight_col = df_clean["WEIGHT"]
r = df_clean.corr().loc["WEIGHT", "HEIGHT"]

# Only the two extreme HEIGHT values are needed to draw a straight line
# spanning the data.
x_vals = np.array([height_col.min(), height_col.max()])
x_vals_standardized = (x_vals - height_col.mean()) / height_col.std(ddof=0)
y_predictions_standardized = r * x_vals_standardized
y_predictions = weight_col.mean() + y_predictions_standardized * weight_col.std(ddof=0)

# Cleaned scatter with the prediction line drawn over it in red.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(height_col, weight_col)
ax.set_xlabel("HEIGHT")
ax.set_ylabel("WEIGHT")
ax.set_title("Scatter plot of WEIGHT vs. HEIGHT with prediction line")
ax.plot(x_vals, y_predictions, 'r', linewidth=5)

[<matplotlib.lines.Line2D at 0x1a1be57490>]

	IDNO	BODYFAT	DENSITY	AGE	WEIGHT	HEIGHT	ADIPOSITY	NECK	CHEST	ABDOMEN	HIP	THIGH	KNEE	ANKLE	BICEPS	FOREARM	WRIST
0	1	12.6	1.0708	23	154.25	67.75	23.7	36.2	93.1	85.2	94.5	59.0	37.3	21.9	32.0	27.4	17.1
1	2	6.9	1.0853	22	173.25	72.25	23.4	38.5	93.6	83.0	98.7	58.7	37.3	23.4	30.5	28.9	18.2
2	3	24.6	1.0414	22	154.00	66.25	24.7	34.0	95.8	87.9	99.2	59.6	38.9	24.0	28.8	25.2	16.6
3	4	10.9	1.0751	26	184.75	72.25	24.9	37.4	101.8	86.4	101.2	60.1	37.3	22.8	32.4	29.4	18.2
4	5	27.8	1.0340	24	184.25	71.25	25.6	34.4	97.3	100.0	101.9	63.2	42.2	24.0	32.2	27.7	17.7
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
247	248	11.5	1.0736	70	134.25	67.00	21.1	34.9	89.2	83.6	88.8	49.6	34.8	21.5	25.6	25.7	18.5
248	249	32.3	1.0236	72	201.00	69.75	29.1	40.9	108.5	105.0	104.5	59.6	40.8	23.2	35.2	28.6	20.1
249	250	28.3	1.0328	72	186.75	66.00	30.2	38.9	111.1	111.5	101.7	60.3	37.3	21.5	31.3	27.2	18.0
250	251	25.3	1.0399	72	190.75	70.50	27.0	38.9	108.3	101.3	97.8	56.0	41.6	22.7	30.5	29.4	19.8
251	252	30.7	1.0271	74	207.50	70.00	29.8	40.8	112.4	108.5	107.1	59.3	42.2	24.6	33.7	30.0	20.9

	WEIGHT	HEIGHT
0	154.25	67.75
1	173.25	72.25
2	154.00	66.25
3	184.75	72.25
4	184.25	71.25

	WEIGHT	HEIGHT
0	154.25	67.75
1	173.25	72.25
2	154.00	66.25
3	184.75	72.25
4	184.25	71.25
...	...	...
247	134.25	67.00
248	201.00	69.75
249	186.75	66.00
250	190.75	70.50
251	207.50	70.00

	WEIGHT	HEIGHT
0	154.25	67.75
1	173.25	72.25
2	154.00	66.25
3	184.75	72.25
4	184.25	71.25
...	...	...
247	134.25	67.00
248	201.00	69.75
249	186.75	66.00
250	190.75	70.50
251	207.50	70.00

	WEIGHT	HEIGHT
0	154.25	67.75
1	173.25	72.25
2	154.00	66.25
3	184.75	72.25
4	184.25	71.25

	WEIGHT	HEIGHT
WEIGHT	1.000000	0.308279
HEIGHT	0.308279	1.000000

	WEIGHT	HEIGHT
WEIGHT	1.000000	0.512913
HEIGHT	0.512913	1.000000