sklearn机器学习之svm案例（预测明天是否会下雨）

最新推荐文章于 2024-07-02 09:00:00 发布

yueyuebushihuai

最新推荐文章于 2024-07-02 09:00:00 发布

阅读量2.3k

点赞数 5

文章标签：数据挖掘 机器学习 svm 支持向量机

本文链接：https://blog.csdn.net/m0_45184077/article/details/114869829

版权

1.导入相应包

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

2.读取数据集并分析

# Load the weather sample; the first CSV column is used as the row index.
weather = pd.read_csv(
    r"D:\download\sklearnjqxx_jb51\【机器学习】菜菜的sklearn课堂(1-12全课)\08支持向量机下\weatherAUS5000.csv",
    index_col=0,
)
weather.head()

# Features are every column except the last one; the last column is the label.
X, Y = weather.iloc[:, :-1], weather.iloc[:, -1]
# Fraction of missing values in each feature column.
X.isna().mean()

3.划分训练集和测试集

# Hold out 30% of the rows as the test set (fixed seed for reproducibility).
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=420,
)
# The split keeps the original row labels, so renumber every part from 0.
for part in (Xtrain, Xtest, Ytrain, Ytest):
    part.index = range(len(part))
# Class counts of the training labels, to check for class imbalance.
Ytrain.value_counts()

4.将标签编码

from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping from the training labels only, then
# apply the same mapping to both splits; each result is a one-column frame.
encoder = LabelEncoder()
encoder.fit(Ytrain)
Ytrain, Ytest = (
    pd.DataFrame(encoder.transform(labels)) for labels in (Ytrain, Ytest)
)

5.异常值处理（此处略去）

6.日期处理

这里根据当天的 Rainfall 是否不小于 1 重新生成“今天是否下雨”（RainToday）变量，并从日期中只保留月份，作为季节性指标来代替原始日期。

# Rebuild RainToday from Rainfall (a value >= 1 counts as rain) and reduce
# Date to its month, applying the same steps to both feature splits.
for frame in (Xtrain, Xtest):
    frame.loc[frame["Rainfall"] >= 1, "RainToday"] = "Yes"
    frame.loc[frame["Rainfall"] < 1, "RainToday"] = "No"
    # NaN never compares equal to anything (not even itself), so a missing
    # Rainfall must be detected with isnull(); `== np.nan` matches no rows.
    frame.loc[frame["Rainfall"].isnull(), "RainToday"] = np.nan
    # Keep the second "-"-separated field of the date as an integer
    # (presumably the dates are formatted YYYY-MM-DD -- confirm).
    frame["Date"] = frame["Date"].apply(lambda x: int(x.split("-")[1]))

Xtrain.head()
# The column now holds the month, so rename it accordingly.
Xtrain = Xtrain.rename(columns={"Date": "Month"})
Xtest = Xtest.rename(columns={"Date": "Month"})

7.读取位置和气候信息（位置处理）

def _coordinate_to_float(text):
    """Drop the last character of *text* and parse the remainder as a float."""
    return float(text[:-1])


# City coordinates; the first CSV column is used as the row index.
cityll = pd.read_csv(
    r"D:\download\sklearnjqxx_jb51\【机器学习】菜菜的sklearn课堂(1-12全课)\08支持向量机下\cityll.csv",
    index_col=0,
)
cityll

# Climate of each city.
city_climate = pd.read_csv(
    r"D:\download\sklearnjqxx_jb51\【机器学习】菜菜的sklearn课堂(1-12全课)\08支持向量机下\Cityclimate.csv"
)
city_climate
# Strip the trailing degree sign from latitude and longitude.
cityll['Latitudenum'] = cityll['Latitude'].apply(_coordinate_to_float)
cityll['Longitudenum'] = cityll['Longitude'].apply(_coordinate_to_float)

# Keep column 0 plus the two numeric coordinate columns just added
# (positions 5 and 6), as an independent copy.
citylld = cityll.iloc[:, [0, 5, 6]].copy()
citylld.head()
# Attach the last column of the climate table; the assignment aligns on the
# row index (assumes both tables share the same row labels -- TODO confirm).
citylld['climate'] = city_climate.iloc[:, -1]
# Read the latitude/longitude of the sample locations; the first CSV column
# is used as the row index.
samplecity = pd.read_csv("D://download//sklearnjqxx_jb51//【机器学习】菜菜的sklearn课堂(1-12全课)//08支持向量机下//samplecity.csv",index_col=0)
samplecity.head()

# Strip the trailing degree sign from latitude and longitude.
samplecity["Latitudenum"] = samplecity["Latitude"].apply(lambda x:float(x[:-1]))
samplecity["Longitudenum"] = samplecity["Longitude"].apply(lambda x:float(x[:-1]))
# Take an explicit copy (as is done for citylld): new columns are assigned to
# samplecityd later, and writing to a bare iloc selection triggers pandas'
# SettingWithCopyWarning.
samplecityd = samplecity.iloc[:, [0,5,6]].copy()
samplecityd.head()

# Convert the numeric latitude/longitude columns (positions 1 and 2) of both
# tables from degrees to radians, for the distance computation that follows.
from math import radians, sin, cos, acos
for table, source_pos, target in (
    (citylld, 1, "slat"),
    (citylld, 2, "slon"),
    (samplecityd, 1, "elat"),
    (samplecityd, 2, "elon"),
):
    table.loc[:, target] = table.iloc[:, source_pos].apply(radians)

import sys
# For every sample location, find the closest city by great-circle distance
# and copy that city's name and climate onto the sample row.
city_lat = citylld["slat"].to_numpy()
city_lon = citylld["slon"].to_numpy()
# The city-side trigonometry does not depend on the sample, so compute it once.
sin_city_lat = np.sin(city_lat)
cos_city_lat = np.cos(city_lat)
for i in samplecityd.index:
    elat = samplecityd.loc[i, "elat"]
    elon = samplecityd.loc[i, "elon"]
    cos_angle = sin_city_lat * np.sin(elat) + cos_city_lat * np.cos(elat) * np.cos(city_lon - elon)
    # Rounding can push the cosine marginally outside [-1, 1] (typically when
    # the sample coincides with a city); arccos would then return NaN and the
    # true nearest city would be skipped, so clamp before taking arccos.
    # 6371.01 scales the central angle to a distance (presumably the Earth's
    # radius in km).
    dist = 6371.01 * np.arccos(np.clip(cos_angle, -1.0, 1.0))
    # Position (not label) of the smallest distance, ignoring NaN entries.
    city_pos = int(np.nanargmin(dist))
    samplecityd.loc[i, "closest_city"] = citylld["City"].iloc[city_pos]
    samplecityd.loc[i, "climate"] = citylld["climate"].iloc[city_pos]
samplecityd.head()

# Lookup table from sample location to climate: take the first and last
# columns of samplecityd, rename them, and index the table by location.
location_climate = samplecityd.iloc[:, [0, -1]]
location_climate.columns = ['Location', 'Climate']
localfinal = location_climate.set_index(keys='Location')
localfinal.head()

import re
# Replace each Location with its climate (looked up in localfinal, which is
# indexed by location), then strip surrounding whitespace and remove commas.
# The .str accessors propagate NaN for locations missing from the lookup
# instead of raising AttributeError on float.strip(), so such rows stay
# missing and can be handled with the other missing categorical values.
climate_by_location = localfinal.iloc[:, 0]
Xtrain['Location'] = (
    Xtrain['Location']
    .map(climate_by_location)
    .str.strip()
    .str.replace(",", "", regex=False)
)
Xtest['Location'] = (
    Xtest['Location']
    .map(climate_by_location)
    .str.strip()
    .str.replace(",", "", regex=False)
)
# The column now holds a climate label, so rename it accordingly.
Xtrain = Xtrain.rename(columns={"Location": "Climate"})
Xtest = Xtest.rename(columns={"Location": "Climate"})
Xtrain.head()

8.用众数替换分类型变量缺失值并编码

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Categorical features: every object-typed column, plus Cloud9am and
# Cloud3pm, which are treated as categorical here.
cate = list(Xtrain.columns[Xtrain.dtypes == 'object'])
cate += ['Cloud9am', 'Cloud3pm']

# Fill missing categorical values with the most frequent value, learned from
# the training split only and applied to both splits.
si = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit(Xtrain.loc[:, cate])
for part in (Xtrain, Xtest):
    part.loc[:, cate] = si.transform(part.loc[:, cate])
Xtrain.head()

# Encode the categories as ordinal numbers, again fitted on the training split.
oe = OrdinalEncoder().fit(Xtrain.loc[:, cate])
for part in (Xtrain, Xtest):
    part.loc[:, cate] = oe.transform(part.loc[:, cate])
Xtrain.loc[:, cate].head()

9.用平均值替换连续型变量缺失值

# Continuous features are all columns that were not handled as categorical.
col = [name for name in Xtrain.columns if name not in cate]
# Fill missing continuous values with the column mean, learned from the
# training split only and applied to both splits.
impmean = SimpleImputer(missing_values=np.nan, strategy="mean").fit(Xtrain.loc[:, col])
for part in (Xtrain, Xtest):
    part.loc[:, col] = impmean.transform(part.loc[:, col])
Xtrain.head()
# Drop Month from the list so that later steps using `col` leave it untouched.
col.remove("Month")

10.去量纲化

from sklearn.preprocessing import StandardScaler
# Standardise the continuous columns with statistics learned from the
# training split only, then apply the same scaling to both splits.
ss = StandardScaler().fit(Xtrain.loc[:, col])
for part in (Xtrain, Xtest):
    part.loc[:, col] = ss.transform(part.loc[:, col])
Xtrain

11.建模评估

# Modelling and evaluation: fit an SVC with each kernel and report test
# accuracy, recall and AUC.
from time import time
import datetime
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, recall_score
# Flatten the one-column label frames into 1-D arrays. Series.ravel() is
# deprecated in recent pandas, so to_numpy() is used instead.
Ytrain = Ytrain.iloc[:, 0].to_numpy()
Ytest = Ytest.iloc[:, 0].to_numpy()
Ytrain.shape
# Started once before the loop, so each printed time is cumulative.
times = time()
# Try each kernel in turn.
for kernel in ["linear", 'poly', 'rbf', 'sigmoid']:
    clf = SVC(kernel=kernel, gamma='auto', degree=1, cache_size=5000).fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest, Ytest)
    recall = recall_score(Ytest, result)
    auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
    print("%s 's testing accuracy %f, recall is %f, auc is %f" % (kernel, score, recall, auc))
    # Format the elapsed seconds directly as MM:SS:microseconds. Going through
    # datetime.fromtimestamp() converts to local time, which shifts the
    # minutes in half-hour-offset time zones and can raise OSError for such
    # small timestamps on Windows.
    elapsed = time() - times
    minutes, seconds = divmod(elapsed, 60)
    print("%02d:%02d:%06d" % (int(minutes), int(seconds), int((seconds % 1) * 1000000)))

结果显示线性核的效果较好。下面在线性核上调整 class_weight，寻找召回率最高的设置：

# Try ten slightly increased weights (1.01 to 1.05) for class 1 on the linear
# kernel and report test accuracy, recall and AUC for each.
irange = np.linspace(0.01, 0.05, 10)
for i in irange:
    times = time()
    clf = SVC(kernel = "linear"
             ,gamma="auto"
             ,cache_size = 5000
             ,class_weight = {1: 1+i}
             ).fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest, Ytest)
    recall = recall_score(Ytest, result)
    auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
    # The stray apostrophe after the recall value has been removed from the message.
    print("under ratio 1:%f testing accuracy %f, recall is %f, auc is %f" %
          (1+i, score, recall, auc))
    # Format the elapsed seconds directly as MM:SS:microseconds. Going through
    # datetime.fromtimestamp() converts to local time, which shifts the
    # minutes in half-hour-offset time zones and can raise OSError for such
    # small timestamps on Windows.
    elapsed = time() - times
    minutes, seconds = divmod(elapsed, 60)
    print("%02d:%02d:%06d" % (int(minutes), int(seconds), int((seconds % 1) * 1000000)))

利用不同的C值查看模型指标：

# Sweep the regularisation parameter C for a class-balanced linear SVC,
# recording test accuracy, recall and AUC for each value, then plot them.
C_range = np.linspace(0.01, 20, 20)
recallall = []
aucall = []
scoreall = []
for C in C_range:
    times = time()
    clf = SVC(kernel='linear', C=C, cache_size=5000, class_weight='balanced').fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest, Ytest)
    recall = recall_score(Ytest, result)
    auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
    recallall.append(recall)
    aucall.append(auc)
    scoreall.append(score)
    print("under C %f, testing accuracy is %f, recall is %f, auc is %f" % (C, score, recall, auc))
    # Format the elapsed seconds directly as MM:SS:microseconds. Going through
    # datetime.fromtimestamp() converts to local time, which shifts the
    # minutes in half-hour-offset time zones and can raise OSError for such
    # small timestamps on Windows.
    elapsed = time() - times
    minutes, seconds = divmod(elapsed, 60)
    print("%02d:%02d:%06d" % (int(minutes), int(seconds), int((seconds % 1) * 1000000)))
# Best AUC and the C that produced it (first occurrence on ties).
best = int(np.argmax(aucall))
print(aucall[best], C_range[best])
plt.figure()
plt.plot(C_range, recallall, c='red', label='recall')
plt.plot(C_range, aucall, c='black', label='auc')
plt.plot(C_range, scoreall, c='orange', label='accuracy')
plt.legend()
plt.show()