from sklearn import linear_model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
# Load the tips dataset. read_csv already returns a DataFrame, so wrapping
# the result in pd.DataFrame(...) again is unnecessary.
data = pd.read_csv('d:/tips.csv')
print(data)
# A bare `data.head()` only renders inside a notebook; print it so the
# preview is also visible when this file runs as a plain script.
print(data.head())
# Axis labels for the first six columns of `data`, in positional order.
# NOTE(review): presumably the remaining (last) column is the target `tip`
# -- confirm against the CSV header.
head = ['total_bill', 'sex', 'smoker', 'day', 'meal', 'people_num']
# Scatter plots: one figure per feature column, each plotted against the tip.
# head[i] is used as the x-axis label of the i-th column of `data`.
for i, feature_name in enumerate(head):
    # The target is read directly from `data`: the array `Y` is only created
    # later, in the data-processing section, so it is not available here.
    plt.scatter(data.iloc[:, i], data['tip'], color='red', marker='*', alpha=0.8)
    # Axis labels
    plt.xlabel(feature_name)
    plt.ylabel('tip')
    plt.show()
# Distribution of the target variable.
# NOTE(review): distplot is marked deprecated in recent seaborn releases;
# consider histplot/displot once the minimum seaborn version is confirmed.
sns.distplot(data['tip'])
plt.show()

# Pearson correlation coefficient
# Columns are selected by name so that each coefficient is computed on the
# same pair of variables as the joint plot next to it, and so that pearsonr
# receives 1-D inputs. The results are printed because a bare expression
# shows nothing outside a notebook.
sns.jointplot(x='total_bill', y='tip', data=data)
plt.show()
print(pearsonr(data['total_bill'], data['tip']))

sns.jointplot(x='people_num', y='tip', data=data)
plt.show()
print(pearsonr(data['people_num'], data['tip']))
# Heatmap of the pairwise correlations between columns.
# Only numeric columns are correlated: if any column holds text, recent
# pandas versions raise instead of silently skipping it.
corrmatrix = data.select_dtypes(include='number').corr()
# Each heatmap gets its own figure; otherwise the second one would be drawn
# on top of the first.
plt.figure()
sns.heatmap(corrmatrix, square=True, vmax=1, vmin=-1, center=0.0, cmap='coolwarm')
plt.show()

### Keep the k columns most correlated with tip ###
k = 3
# Names of the k rows with the largest value in the 'tip' column of the
# correlation matrix. tip's own row (self-correlation) takes part in the
# ranking, so it counts toward k.
cols = corrmatrix.nlargest(k, 'tip')['tip'].index
# Font scale for the axis labels and annotations
sns.set(font_scale=1.25)
# Correlation matrix restricted to the selected columns
cm = data[cols].corr()
plt.figure()
# cmap: colour map; annot: write each value into its cell (off by default);
# annot_kws: text formatting for those annotations
hm = sns.heatmap(cm, square=True, annot=True, cmap='RdPu', fmt='.2f', annot_kws={'size': 10})
plt.show()
# Scatter-plot matrix of the target and the two selected features
cols1 = ['tip', 'total_bill', 'people_num']
# seaborn renamed the per-facet size argument from `size` to `height`;
# the old spelling is rejected by current releases.
sns.pairplot(data[cols1], height=2.5)
plt.show()
# Data processing
# Drop the weakly correlated features
data1 = data.drop(["sex", "smoker", "meal", "day"], axis=1)
# Every column except the last is a predictor; the last column is the target.
# The `-1:` slice keeps Y two-dimensional (one column).
X = np.array(data1.iloc[:, :-1])  # independent variables X
Y = np.array(data1.iloc[:, -1:])  # dependent variable Y
# Split into training (80%) and test sets.
# NOTE(review): no random_state is passed, so the split differs on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)
# Linear regression
lr = linear_model.LinearRegression()
lr.fit(X_train, Y_train)
# Predictions for the held-out test set, printed next to the true values
y_lr = lr.predict(X_test)
print(y_lr)
print(Y_test)
# For a regressor, score() is the coefficient of determination (R^2),
# not a classification accuracy.
score1 = lr.score(X_test, Y_test)
print(score1)
# Ridge regression (default regularisation strength)
ri = linear_model.Ridge()
ri.fit(X_train, Y_train)
# Predictions for the held-out test set, printed next to the true values
y_ri = ri.predict(X_test)
print(y_ri)
print(Y_test)
# R^2 on the test set; kept for inspection but intentionally not printed
score2 = ri.score(X_test, Y_test)
# Lasso regression (default regularisation strength)
ls = linear_model.Lasso()
ls.fit(X_train, Y_train)
# Predict with the lasso model itself. The previous code called ri.predict,
# so the "lasso" predictions were really a copy of the ridge ones.
y_ls = ls.predict(X_test)
# Print predictions and true values; a bare tuple expression shows nothing
# outside a notebook.
print(y_ls)
print(Y_test)
# Plots of true vs. predicted values: one figure per model, so the three
# comparisons are not stacked on the same axes with the 'real' curve
# repeated three times.
for predicted, model_label, line_color in (
    (y_lr, 'lr', 'blue'),
    (y_ri, 'ri', 'green'),
    (y_ls, 'ls', 'black'),
):
    plt.figure()
    plt.plot(Y_test, label='real', color='red')
    plt.plot(predicted, label=model_label, color=line_color)
    plt.legend()
    plt.show()