题目:
代码:
import random
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statistics as sta
import scipy.stats as stats
# Summary statistics for Anscombe's quartet: for each of the four datasets,
# print the mean and variance of x and y, the Pearson correlation between
# x and y, and the ordinary-least-squares regression line.
anscombe = sns.load_dataset("anscombe")

# Labels found in the ``dataset`` column, in the order they are reported.
DATASET_LABELS = ('I', 'II', 'III', 'IV')

# Pick each subset by its label rather than by hard-coded row positions, so
# the result does not depend on how the rows happen to be ordered.
# Maps label -> (x values, y values) as NumPy arrays.
subsets = {}
for label in DATASET_LABELS:
    rows = anscombe[anscombe['dataset'] == label]
    subsets[label] = (rows['x'].values, rows['y'].values)

# Mean and variance, first for x and then for y. ``sta.variance`` is the
# sample variance (n - 1 denominator) from the standard library.
for var_index, var_name in enumerate(('x', 'y')):
    for stat_name, stat_func in (('mean', np.mean), ('variance', sta.variance)):
        for label in DATASET_LABELS:
            value = stat_func(subsets[label][var_index])
            print('the {} of {} of dataset {}: {}'.format(
                stat_name, var_name, label, value))
        print('')

# Pearson correlation coefficient; ``pearsonr`` returns (r, p-value) and only
# r is reported.
for label in DATASET_LABELS:
    xs, ys = subsets[label]
    coefficient = stats.pearsonr(xs, ys)[0]
    print('the correlation coefficient of dataset {}: {}'.format(
        label, coefficient))
print('')

# Ordinary least squares fit of y on x. ``add_constant`` prepends a column of
# ones, so params[0] is the intercept and params[1] is the slope.
for label in DATASET_LABELS:
    xs, ys = subsets[label]
    fit = sm.OLS(ys, sm.add_constant(xs)).fit()
    print('the linear regression line of dataset {}:'.format(label))
    print('y = {} + {} * x'.format(fit.params[0], fit.params[1]))
# Scatter plot of y against x, one panel per dataset, each in its own colour.
sns.set(style='whitegrid')
# ``height`` is the per-facet height in inches. seaborn 0.9 renamed this
# keyword from ``size``, and later releases no longer accept the old name.
g = sns.FacetGrid(anscombe, col='dataset', hue='dataset', height=3)
g.map(plt.scatter, 'x', 'y')
plt.show()
结果:
参考资料:
求相关系数:https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html
求线性拟合:http://www.statsmodels.org/devel/generated/statsmodels.regression.linear_model.OLS.html