QUESTION
%matplotlib inline
import random
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Select seaborn's "talk" plotting context for the figures in this notebook.
sns.set_context("talk")
Anscombe’s quartet
Anscombe’s quartet comprises four datasets and is rather famous. Why? You’ll find out in this exercise.
# Load the Anscombe data from the local CSV file.
# NOTE(review): the variable is spelled `anascombe` here, while the answer
# code below uses `anscombe` -- confirm which spelling later cells expect.
anascombe = pd.read_csv('data/anscombe.csv')
# Preview the first rows; as the last expression of the cell it is what the
# notebook displays.
anascombe.head()
- 1
- 2
- 3
dataset | x | y |
---|---|---|
0 | 10 | 8.04 |
1 | 8 | 6.95 |
2 | 13 | 7.58 |
3 | 9 | 8.81 |
4 | 11 | 8.33 |
Part 1
For each of the four datasets…
- Compute the mean and variance of both x and y
- Compute the correlation coefficient between x and y
- Compute the linear regression line: $y = \beta_0 + \beta_1 x + \epsilon$ (hint: use statsmodels and look at the Statsmodels notebook)
Part 2
Using Seaborn, visualize all four datasets.
hint: use sns.FacetGrid combined with plt.scatter
ANSWER
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statistics as sta
import scipy.stats.stats as stats
def getValues():
    """Split seaborn's Anscombe table into per-dataset x and y arrays.

    Returns
    -------
    tuple of (dict, dict)
        ``(X, Y)``, each keyed by the dataset label ``'I'``, ``'II'``,
        ``'III'`` and ``'IV'`` (in that order) and holding the NumPy array of
        the corresponding ``x`` / ``y`` column values.
    """
    anscombe = sns.load_dataset("anscombe")
    dataset = ['I', 'II', 'III', 'IV']
    X = {}
    Y = {}
    for name in dataset:
        # Select rows by their dataset label rather than by a fixed positional
        # slice, so every observation of the group is included and the result
        # does not depend on the row order of the table.
        rows = anscombe[anscombe.dataset == name]
        X[name] = rows.x.values
        Y[name] = rows.y.values
    return X, Y
def getMandV(array):
    """Return the mean and the sample variance of a data set.

    Parameters
    ----------
    array : array-like of numbers
        The observations; at least two are required.

    Returns
    -------
    tuple
        ``(mean, variance)`` where ``variance`` is the sample variance
        (``n - 1`` in the denominator).

    Raises
    ------
    statistics.StatisticsError
        If fewer than two data points are supplied.
    """
    values = np.asarray(array)
    if values.size < 2:
        raise sta.StatisticsError('variance requires at least two data points')
    mean = np.mean(values)
    # NumPy computes the sample variance in floating point for integer and
    # float input alike, which `statistics.variance` does not guarantee for
    # NumPy integer arrays (its result is converted back to the element type).
    variance = np.var(values, ddof=1)
    return mean, variance
def getC(X, Y):
    """Return the Pearson correlation coefficient between X and Y.

    Only the coefficient is returned; the second value produced by
    ``pearsonr`` (the p-value) is discarded.
    """
    coefficient, _p_value = stats.pearsonr(X, Y)
    return coefficient
# Shared inputs for the steps below: the dataset labels in display order,
# the full table (used for plotting), and the per-dataset x / y arrays.
keys = ['I', 'II', 'III', 'IV']
anscombe = sns.load_dataset("anscombe")
X, Y = getValues()
# Part 1: Compute the mean and variance of both x and y.
# All x results are printed first, then all y results. The loop variable is
# named `name` so that the built-in `set` is not shadowed.
for label, groups in (('x', X), ('y', Y)):
    for name, array in groups.items():
        mean, variance = getMandV(array)
        print(name + ': mean of ' + label + ' is ' + str(mean))
        print(name + ': variance of ' + label + ' is ' + str(variance))
# Part 1: Pearson correlation coefficient between x and y, one line per dataset.
for key in keys:
    coefficient = getC(X[key], Y[key])
    message = key + ': correlation coefficient between x and y is ' + str(coefficient)
    print(message)
# Part 1: Fit the linear regression line y = β0 + β1·x + ϵ for each dataset
# with statsmodels OLS and print the fitted coefficients.
for key in keys:
    # OLS has no implicit intercept, so the x values are passed through
    # add_constant before fitting.
    design = sm.add_constant(X[key])
    fitted = sm.OLS(Y[key], design).fit()
    coefficients = fitted.params
    print(key + ": y =", coefficients[0], "+", coefficients[1], "* x")
# Part 2: Using Seaborn, visualize all four datasets.
sns.set(style='whitegrid')
# One scatter panel per dataset, coloured by dataset. The per-facet size is
# passed as `height`: FacetGrid's older `size` keyword was renamed to
# `height` in seaborn 0.9 and is rejected by later releases.
g = sns.FacetGrid(anscombe, col="dataset", hue="dataset", height=3)
g.map(plt.scatter, 'x', 'y')
plt.show()
Result:
Part 1:
1.
1.
2.
3.
Part 2: