预测西雅图降雨情况
1. 问题描述
除了咖啡、垃圾摇滚(grunge)和科技公司,西雅图最出名的就是经常下雨。本问题利用西雅图的历史天气数据,通过逻辑回归模型来预测西雅图某一天是否下雨。
2. 数据集介绍
数据集来源:https://github.com/BATCEO/kaggle_PredictRainUsingLogisticRegression
数据集名称:seattleWeather_1948-2017.csv
数据集完整记录了从1948年1月1日到2017年12月12日每一天的降雨情况。下图展示了该数据集的头10条记录和末10条记录:
数据集包含五列信息,各列代表含义如下:
DATE:日期
PRCP:降水量,单位为英寸
TMAX:最高温度
TMIN:最低温度
RAIN:是否下雨
3. 实现
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
# Load the daily weather records and echo them for a quick visual check.
data = pd.read_csv("seattleWeather_1948-2017.csv")
print(data)

# Remove the rows whose RAIN label is missing.  The null mask is used
# directly instead of writing a '1' placeholder string into the column and
# then dropping every row equal to '1': that placeholder could not be told
# apart from a genuine '1' label and would silently delete valid rows.
missing_rain = data['RAIN'].isnull()
data = data[~missing_rain]

# Model input and target.
# NOTE(review): the feature is the *sum* of TMAX and TMIN, i.e. a single
# column; the downstream reshape(-1, 1) relies on that. Confirm that
# collapsing the two temperatures into one feature is intended.
data_X = data.TMAX + data.TMIN
data_Y = data.RAIN
# Hold out 10% of the samples for evaluation; the fixed seed makes the split
# reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    data_X,
    data_Y,
    test_size=0.1,
    random_state=0,
)

# Standardize the feature. The scaler is fitted on the training split only
# and then re-used for the test split; each Series is reshaped into the
# (n_samples, 1) column layout the scaler works on.
transfer = StandardScaler()
train_x = transfer.fit_transform(train_x.values.reshape(-1, 1))
test_x = transfer.transform(test_x.values.reshape(-1, 1))

# Cast the labels to integers before they are handed to the classifier.
train_y = train_y.astype("int")
test_y = test_y.astype("int")

# Train the logistic-regression model.
model = LogisticRegression()
fit = model.fit(train_x, train_y)

# Hard class predictions for the test split.
pred = model.predict(test_x)

# Class probabilities: entry [i][j] is the predicted probability that test
# sample i carries label j.
pred_array = model.predict_proba(test_x)
test_num = len(pred_array)

# Mean accuracy of the classifier on the test split (this is a
# classification score, not an R^2 coefficient).
y_score = model.score(test_x, test_y)
# Tally the confusion matrix at a 0.5 decision threshold.
# Counter naming: count_<predicted><actual>, where "t" means class 1 and
# "f" means any other label.
test_yy = np.array(test_y)

# A sample is predicted as class 1 when its class-0 probability (column 0 of
# pred_array) is strictly below 0.5; otherwise it is predicted as class 0.
predicted_one = pred_array[:, 0] < 0.5
actual_one = test_yy == 1

# Hard 0/1 predictions, stored with the same dtype as the true labels.
pre_y = predicted_one.astype(test_yy.dtype)

# int(...) keeps the counters as plain Python integers.
count_tt = int(np.count_nonzero(predicted_one & actual_one))
count_tf = int(np.count_nonzero(predicted_one & ~actual_one))
count_ft = int(np.count_nonzero(~predicted_one & actual_one))
count_ff = int(np.count_nonzero(~predicted_one & ~actual_one))
# Print the confusion matrix: one row per predicted class, one column per
# actual class. print() joins its arguments with a single space, so each row
# is emitted as "<label> <count> <count>".
print("\t actual_values")
print("pred_class 1 \t 0")
print("\t 1", count_tt, count_tf)
print("\t 0", count_ft, count_ff)
# Summary metrics derived from the confusion-matrix counters, where
# count_<predicted><actual> uses "t" for class 1 and "f" for class 0:
#   count_tt = true positives    count_tf = false positives
#   count_ft = false negatives   count_ff = true negatives
summ = test_num

# Share of all test samples that were classified correctly.
accuracy = (count_tt + count_ff) / summ if summ else float("nan")
# The misspelled name is kept as an alias in case later code refers to it.
accuarcy = accuracy

# Sensitivity (true positive rate) = TP / (TP + FN).
# The previous formula, TN / (TN + FN), is the negative predictive value.
actual_positives = count_tt + count_ft
sensitivity = count_tt / actual_positives if actual_positives else float("nan")

# Specificity (true negative rate) = TN / (TN + FP).
# The previous formula, TP / (TP + FP), is the precision.
actual_negatives = count_ff + count_tf
specificity = count_ff / actual_negatives if actual_negatives else float("nan")

print("准确性:",accuracy)
print("灵敏度:",sensitivity)
print("特异性:",specificity)
# ROC analysis.
# roc_curve needs a continuous score per sample so that it can sweep the
# decision threshold. Column 1 of pred_array is the predicted probability of
# class 1. Passing the already-thresholded 0/1 predictions instead collapses
# the curve to a single operating point and distorts the AUC.
fpr, tpr, threshold = roc_curve(test_yy, pred_array[:, 1])
roc_auc = auc(fpr, tpr)  # area under the ROC curve

# Draw the ROC curve on a single figure (the extra bare plt.figure() call
# only produced an additional empty figure).
lw = 2
plt.figure(figsize=(10, 10))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# The x-axis carries the false positive rate, i.e. 1 - specificity.
plt.xlabel("1 - Specificity")
plt.ylabel("Sensitivity")
plt.title("ROC")
# Without a legend the curve label (and the AUC it reports) is never shown.
plt.legend(loc="lower right")