题目地址
import random
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import statsmodels.api as sm
import statsmodels.formula.api as smf
# Anscombe's quartet exercise: for every dataset in Anscombe.csv, report the
# mean/variance of x and y, the x-y correlation matrix, an OLS fit of y on x,
# and finally draw one scatter panel per dataset.
sns.set_context("talk")
anascombe = pd.read_csv('Anscombe.csv')

# Labels of the four datasets, in the order they are reported.
dataset_labels = ('I', 'II', 'III', 'IV')

# Mean and variance of x and y within each dataset.
x = anascombe.groupby('dataset')['x']
y = anascombe.groupby('dataset')['y']
print("x mean:", x.mean())
print("x variance:", x.var())
print("y mean:", y.mean())
print("y variance:", y.var())
print()

# Rows belonging to each dataset, selected once and reused below.
subsets = {
    label: anascombe[anascombe['dataset'] == label]
    for label in dataset_labels
}

# Correlation matrix per dataset. Only the numeric columns are handed to
# corr(): the string 'dataset' column makes DataFrame.corr() raise on
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
for label in dataset_labels:
    print(subsets[label].select_dtypes(include='number').corr())
print()

# Ordinary least squares fit of y on x per dataset. lin_model is rebound on
# every iteration, so after the loop it holds the fit for the last dataset.
for label in dataset_labels:
    lin_model = smf.ols('y ~ x', subsets[label]).fit()
    print(lin_model.summary())

# One scatter panel of y against x per dataset.
g = sns.FacetGrid(anascombe, col="dataset")
g.map(plt.scatter, "x", "y")
plt.show()
因为是练习题,所以有可以直接参考的资料。我觉得这种教pandas的方式很好,因为pandas的方法实在太多了。