import numpy as np
import pandas as pd
# Load the diabetes dataset from a local CSV file.
df = pd.read_csv(r'C:\Users\David\Desktop\上课\健康数据挖掘\第三次作业\diabetes.csv')
# A bare `df.head()` is only rendered inside a notebook; print it so that
# running the file as a script also shows the preview.
print(df.head())
# Show column dtypes and non-null counts (info() prints by itself).
df.info()
# Show summary statistics for every numeric column.
tmp = df.describe()
print(tmp)
# matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling used by the figures that follow.
sns.set_context('talk')
plt.style.use('fivethirtyeight')

# Pairwise scatter matrix of the first eight columns, coloured by 'Outcome',
# with kernel-density estimates on the diagonal.
feature_columns = df.columns[:8]
sns.pairplot(data=df, vars=feature_columns, hue='Outcome', diag_kind='kde')
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of 'Pregnancies' against 'Glucose', restricted to the first two
# columns of the frame.
sns.relplot(x='Pregnancies', y='Glucose', data=df.iloc[:, :2], kind='scatter')

# Plain matplotlib scatter of the second column against the third one.
plt.figure()
plt.title("相关关系图")
# xticks()/yticks() set tick *locations*; the axis captions belong in
# xlabel()/ylabel().
plt.xlabel("x")
plt.ylabel("y")
plt.scatter(df.iloc[:, 1], df.iloc[:, 2])
plt.show()
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

np.random.seed(123)

# Cross-tabulate 'SkinThickness' against 'Insulin' and run a chi-square test
# of independence on the resulting contingency table.
contingency = pd.crosstab(index=df['SkinThickness'], columns=df['Insulin'])
chi2_result = chi2_contingency(contingency)
print(chi2_result)
# Normality test for small samples
from scipy import stats
import numpy as np

np.random.seed(456)
# Synthetic normal sample; it is not used by the test below.
x = stats.norm.rvs(loc=5, scale=10, size=80)

# Shapiro-Wilk test on the 'Glucose' column.
glucose_shapiro = stats.shapiro(df['Glucose'])
print(glucose_shapiro)
from scipy import stats
import numpy as np

np.random.seed(456)

# Kolmogorov-Smirnov test of 'Glucose' against a normal distribution.
# Without `args`, 'norm' means the *standard* normal N(0, 1), which raw
# (unstandardised) measurements cannot match; compare against a normal with
# the sample's own mean and standard deviation instead.
glucose = df['Glucose']
print(stats.kstest(glucose, 'norm', args=(glucose.mean(), glucose.std())))
from scipy import stats
import numpy as np

np.random.seed(456)

# Levene's test compares the variances of several samples of the same
# quantity. 'Outcome' is a grouping label, not a second BMI sample, so split
# 'BMI' by 'Outcome' and test homogeneity of variance across those groups.
bmi_by_outcome = [group.to_numpy() for _, group in df.groupby('Outcome')['BMI']]
print(stats.levene(*bmi_by_outcome))
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from scipy import stats

# Scatter of INCOMESQ against INCOME from the statsmodels `ccard` dataset.
data = sm.datasets.ccard.load_pandas().data
plt.scatter(x=data['INCOMESQ'], y=data['INCOME'])

# Scatter of 'Glucose' against 'DiabetesPedigreeFunction', drawn in red.
plt.scatter(x=df['Glucose'], y=df['DiabetesPedigreeFunction'], c='r')

# Pearson correlation between the same two `ccard` columns.
np.random.seed(456)
data = sm.datasets.ccard.load_pandas().data
income_sq, income = data['INCOMESQ'], data['INCOME']
pearson_result = stats.pearsonr(income_sq, income)
print(pearson_result)
from scipy import stats
import numpy as np

# One-sample t-test: two columns of N(5, 10) draws tested against the
# hypothesised means 1 and 5 respectively.
np.random.seed(456)
rvs = stats.norm.rvs(loc=5, scale=10, size=(100, 2))
one_sample_result = stats.ttest_1samp(rvs, [1, 5])
print(one_sample_result)

# Independent two-sample t-test: N(5, 10) draws against N(6, 10) draws.
np.random.seed(456)
rvs1 = stats.norm.rvs(loc=5, scale=10, size=500)
rvs2 = stats.norm.rvs(loc=6, scale=10, size=500)
independent_result = stats.ttest_ind(rvs1, rvs2)
print(independent_result)

# Paired t-test: the second sample is N(6, 10) plus small N(0, 0.2) noise.
# The draws are made in the same order as a single combined expression would.
np.random.seed(456)
rvs1 = stats.norm.rvs(loc=5, scale=10, size=500)
shifted_sample = stats.norm.rvs(loc=6, scale=10, size=500)
small_noise = stats.norm.rvs(scale=0.2, size=500)
rvs2 = shifted_sample + small_noise
paired_result = stats.ttest_rel(rvs1, rvs2)
print(paired_result)
from scipy import stats

# One-way ANOVA across three columns of the frame.
a, b, c = (
    df[column]
    for column in ('Glucose', 'DiabetesPedigreeFunction', 'Insulin')
)
anova_result = stats.f_oneway(a, b, c)
print(anova_result)
# Multi-factor analysis of variance
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import pandas as pd

# Two factors with levels 3 and 4 (X1, X2) and a response (Y), 40 rows.
X1 = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
X2 = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
Y = [77, 71, 74, 70, 72, 79, 72, 79, 72, 70, 74, 75, 78, 74, 74, 72, 73, 77, 73, 73, 76, 76, 70, 70, 76, 78, 79, 76, 74, 77, 70, 71, 71, 74, 78, 73, 72, 74, 71, 72]
data = {'T': X1, 'G': X2, 'L': Y}

# Keep this toy table under its own name: rebinding `df` would discard the
# diabetes frame that later statements still index by 'Glucose' and
# 'DiabetesPedigreeFunction'.
anova_df = pd.DataFrame(data)

# Main effects of T and G plus their interaction.
formula = 'L~T+G+T:G'
model = ols(formula, data=anova_df).fit()
print(anova_lm(model))
# Chi-square test
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

# 40 rows of three random binary variables, seeded for reproducibility.
np.random.seed(123)
binary_draws = np.random.randint(2, size=(40, 3))
data = pd.DataFrame(binary_draws, columns=list('ABC'))

# Chi-square test of independence between columns A and B.
contingency = pd.crosstab(index=data['A'], columns=data['B'])
chi2_result = chi2_contingency(contingency)
print(chi2_result)
# Print the structure of `df`, scatter two of its columns in red, then print
# the structure again.
# NOTE(review): the scatter indexes `df` by 'Glucose' and
# 'DiabetesPedigreeFunction', so `df` must hold those columns at this point.
df.info()
glucose, pedigree = df['Glucose'], df['DiabetesPedigreeFunction']
plt.scatter(x=glucose, y=pedigree, c='r')
df.info()
import random

# Draw 40 random integers from 70..79 (inclusive) and show them.
L = [random.choice(range(70, 80)) for _ in range(40)]
print(L)
################################################
import numpy as py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the diabetes dataset and preview the first rows.
diabetes_data = pd.read_csv(r'C:\Users\David\Desktop\上课\健康数据挖掘\第三次作业\diabetes.csv')
print(diabetes_data.head())
# Show dataset information. verbose=True asks for the full per-column summary.
# info() writes its report itself and returns None, so wrapping it in print()
# would only add a stray "None" line.
diabetes_data.info(verbose=True)
# Descriptive statistics: count, mean, standard deviation, min, max, etc.
print(diabetes_data.describe())
# Shape of the data
print("dimension of diabetes data: {}".format(diabetes_data.shape))
# Distribution of the label column
print(diabetes_data.Outcome.value_counts())
# Draw the label counts as bar charts.
plt.figure()
diabetes_data.Outcome.value_counts().plot(kind="bar")
plt.figure()
# Name the column via x= and the frame via data=: how a positionally passed
# Series is interpreted differs between seaborn releases.
sns.countplot(x='Outcome', data=diabetes_data, label="Count")
plt.savefig("0_1_graph")
# Visualise the data distribution.
# The diagonal shows the distribution of each attribute,
# the off-diagonal cells show the relationship between two attributes.
# pairplot() is a figure-level function that creates its own figure, so no
# plt.figure() call is made first (it would only leave an empty figure behind).
sns.pairplot(diabetes_data)
# Heat map whose values are the pairwise correlation coefficients.
# annot: False by default; True writes the number into each cell.
plt.figure()
sns.heatmap(diabetes_data.corr(), annot=True)
##################################
# Plot KDE, histogram and box plot of the data, with 3-sigma bounds marked
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Only the 'BMI' column of the dataset is analysed here.
data = pd.read_csv(r'C:\Users\David\Desktop\上课\健康数据挖掘\第三次作业\diabetes.csv')
data = data['BMI']

# Outlier analysis
# (1) 3-sigma rule: if the data are normally distributed, an outlier is a
# value that deviates from the mean by more than three standard deviations
# → p(|x - μ| > 3σ) ≤ 0.003
u = data.mean()  # mean
std = data.std()  # standard deviation
# K-S test against a normal with the sample's mean and standard deviation;
# print the result so it is not silently discarded.
print(stats.kstest(data, 'norm', (u, std)))
print('均值为:%.3f,标准差为:%.3f' % (u,std))

# Make Chinese text and the minus sign render correctly.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

fig = plt.figure(figsize=(20, 12))
# A figure-level title: plt.title() at this point would first create an
# extra full-figure axes underneath the subplots added below.
fig.suptitle("糖尿病数据集BMI数据分析")
ax1 = fig.add_subplot(3, 1, 1)
ax2 = fig.add_subplot(3, 2, 3)
ax3 = fig.add_subplot(3, 2, 4)

# plot.density() and plot.kde() are identical apart from the name.
data.plot(kind='kde', ax=ax1, grid=True, style='-k', title='kde')
# Mark the mean ± 3 standard deviation bounds on the density plot.
ax1.axvline(3*std+u, color='r', linestyle="--", alpha=0.8)
ax1.axvline(-3*std+u, color='r', linestyle="--", alpha=0.8)
data.plot(kind='hist', ax=ax2, grid=True, style='-k', title='hist')
data.plot(kind='box', ax=ax3, vert=False, grid=True, style='-k', title='box')

# Adjust subplot spacing once the axes exist; calling it on an empty figure
# has nothing to lay out.
fig.tight_layout(h_pad=2, w_pad=2)
# 绘制数据离群点
error = data[np.abs(data - u) > 3*std]
data_c = data[np.abs(data - u) <= 3*std]
print('异常值共%i条' % len(error)