sklearn机器学习之svm案例(预测明天是否会下雨)

1.导入相应包

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

2.读取数据集并分析

# Load the weather sample; the first CSV column becomes the row index.
weather = pd.read_csv(
    r"D:\download\sklearnjqxx_jb51\【机器学习】菜菜的sklearn课堂(1-12全课)\08支持向量机下\weatherAUS5000.csv",
    index_col=0,
)
weather.head()

# Features are every column except the last one; the last column is the label.
X, Y = weather.iloc[:, :-1], weather.iloc[:, -1]
# Fraction of missing values in each feature column.
X.isna().mean()

3.划分训练集和测试集

# Hold out 30% of the rows as the test set; the seed is fixed so the split is repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.3, random_state=420
)
# After the shuffle the row labels are no longer consecutive, so renumber
# every part from 0; later steps address rows by position-like labels.
for part in (Xtrain, Xtest, Ytrain, Ytest):
    part.index = range(len(part))
# Class counts of the training labels, to check for class imbalance.
Ytrain.value_counts()

4.将标签编码

from sklearn.preprocessing import LabelEncoder
# Learn the label classes from the training labels only, then encode both
# splits with that same mapping. Each result is wrapped in a one-column frame.
encoder = LabelEncoder()
encoder.fit(Ytrain)
Ytrain = pd.DataFrame(encoder.transform(Ytrain))
Ytest = pd.DataFrame(encoder.transform(Ytest))

5.异常值处理这里略去

6.日期处理

这里根据当天降雨量(Rainfall ≥ 1)重新生成“今天是否下雨”(RainToday)这一特征,并从日期中只提取月份,作为季节性指标来代替完整日期。

# Rebuild RainToday from Rainfall and reduce Date to its month, identically
# for the training and the test features.
for frame in (Xtrain, Xtest):
    frame.loc[frame["Rainfall"] >= 1, "RainToday"] = "Yes"
    frame.loc[frame["Rainfall"] < 1, "RainToday"] = "No"
    # NaN never compares equal to anything (not even itself), so `== np.nan`
    # selects no rows at all; missing rainfall must be detected with isnull().
    frame.loc[frame["Rainfall"].isnull(), "RainToday"] = np.nan
    # Dates are split on "-" and the second field (the month) is kept as an int.
    frame["Date"] = frame["Date"].apply(lambda x: int(x.split("-")[1]))

Xtrain.head()
# The column now holds months, so rename it accordingly.
Xtrain = Xtrain.rename(columns={'Date': 'Month'})
Xtest = Xtest.rename(columns={'Date': 'Month'})

7.读取位置和气候信息(位置处理)

# Reference cities with their coordinates (first CSV column used as the index).
cityll = pd.read_csv(r"D:\download\sklearnjqxx_jb51\【机器学习】菜菜的sklearn课堂(1-12全课)\08支持向量机下\cityll.csv", index_col=0)
cityll

# Climate type of each reference city.
city_climate = pd.read_csv(r"D:\download\sklearnjqxx_jb51\【机器学习】菜菜的sklearn课堂(1-12全课)\08支持向量机下\Cityclimate.csv")
city_climate
# Drop the trailing character (the degree sign) and parse the rest as a float.
cityll['Latitudenum'] = cityll['Latitude'].apply(lambda x: float(x[: -1]))
cityll['Longitudenum'] = cityll['Longitude'].apply(lambda x: float(x[: -1]))

# Keep column 0 plus columns 5 and 6.
# NOTE(review): 5 and 6 are presumably the two numeric columns added above — confirm against the CSV layout.
citylld = cityll.iloc[:, [0, 5, 6]].copy()
citylld.head()
# Attach the climate type: the last column of city_climate, aligned on the row index.
# NOTE(review): assumes both tables list the cities under the same index labels — confirm.
citylld['climate'] = city_climate.iloc[:, -1]
# Coordinates of the weather stations that occur in the samples.
samplecity = pd.read_csv("D://download//sklearnjqxx_jb51//【机器学习】菜菜的sklearn课堂(1-12全课)//08支持向量机下//samplecity.csv",index_col=0)
samplecity.head()

# Same degree-sign stripping for the station coordinates.
samplecity["Latitudenum"] = samplecity["Latitude"].apply(lambda x:float(x[:-1]))
samplecity["Longitudenum"] = samplecity["Longitude"].apply(lambda x:float(x[:-1]))
# Take an explicit copy (as done for citylld): new columns are written into
# this frame later, and writing into a bare slice of samplecity triggers
# pandas' SettingWithCopyWarning.
samplecityd = samplecity.iloc[:, [0,5,6]].copy()
samplecityd.head()

# Convert the latitude/longitude columns (positions 1 and 2) from degrees to
# radians, as required by the distance computation that follows.
from math import radians, sin, cos, acos
for frame, lat_name, lon_name in (
    (citylld, "slat", "slon"),
    (samplecityd, "elat", "elon"),
):
    frame.loc[:, lat_name] = frame.iloc[:, 1].apply(radians)
    frame.loc[:, lon_name] = frame.iloc[:, 2].apply(radians)

import sys
# For every weather station, find the nearest reference city and use that
# city's name and climate type for the station.
slat = citylld.loc[:, "slat"]
slon = citylld.loc[:, "slon"]
# Iterate over the actual row labels instead of assuming they are 0..n-1.
for i in samplecityd.index:
    elat = samplecityd.loc[i, "elat"]
    elon = samplecityd.loc[i, "elon"]
    # Spherical law of cosines; 6371.01 is the sphere radius used (Earth, km).
    cos_angle = np.sin(slat) * np.sin(elat) + np.cos(slat) * np.cos(elat) * np.cos(slon - elon)
    # Rounding can push the cosine marginally outside [-1, 1] (e.g. for
    # coincident points), which would make arccos return NaN — clip first.
    dist = 6371.01 * np.arccos(cos_angle.clip(-1.0, 1.0))
    # idxmin() returns the row *label* of the smallest distance, which is what
    # .loc expects; a positional argsort result is only valid for a 0..n-1 index.
    city_index = dist.idxmin()
    samplecityd.loc[i, "closest_city"] = citylld.loc[city_index, "City"]
    samplecityd.loc[i, "climate"] = citylld.loc[city_index, "climate"]
samplecityd.head()

# Build the station -> climate lookup table from the first and last columns of
# samplecityd, renamed to 'Location' / 'Climate' and indexed by 'Location'.
station_col, climate_col = samplecityd.columns[[0, -1]]
localfinal = (
    samplecityd[[station_col, climate_col]]
    .rename(columns={station_col: 'Location', climate_col: 'Climate'})
    .set_index('Location')
)
localfinal.head()

import re
# Replace every station name by the climate of its nearest city (looked up in
# localfinal), then strip surrounding whitespace and remove commas.
# The vectorised .str methods leave NaN untouched, so a station missing from
# localfinal no longer crashes on `x.strip()`.
for frame in (Xtrain, Xtest):
    frame['Location'] = (
        frame['Location']
        .map(localfinal.iloc[:, 0])
        .str.strip()
        .str.replace(",", "", regex=False)
    )
# The column now holds climate types, so rename it accordingly.
Xtrain = Xtrain.rename(columns={"Location": "Climate"})
Xtest = Xtest.rename(columns={"Location": "Climate"})
Xtrain.head()

8.用众数替换分类型变量缺失值并编码

# Categorical features: every object-typed column plus the two cloud-cover columns.
cate = [name for name, dtype in Xtrain.dtypes.items() if dtype == 'object']
cate += ['Cloud9am', 'Cloud3pm']
from sklearn.impute import SimpleImputer
# Fill missing categorical values with the most frequent value, learned from
# the training set only and then applied unchanged to the test set.
si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Xtrain.loc[:, cate] = si.fit_transform(Xtrain[cate])
Xtest.loc[:, cate] = si.transform(Xtest[cate])
Xtrain.head()
# Encode the categories as ordinal numbers, again fitted on the training set only.
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder().fit(Xtrain[cate])

Xtrain.loc[:, cate] = oe.transform(Xtrain[cate])
Xtest.loc[:, cate] = oe.transform(Xtest[cate])
Xtrain[cate].head()

9.用平均值替换连续型变量缺失值

# Continuous features are all columns that were not treated as categorical.
col = [name for name in Xtrain.columns if name not in cate]
# Fill missing values with the column mean, learned from the training set only.
impmean = SimpleImputer(missing_values=np.nan, strategy="mean").fit(Xtrain[col])
Xtrain.loc[:, col] = impmean.transform(Xtrain[col])
Xtest.loc[:, col] = impmean.transform(Xtest[col])
Xtrain.head()
# Take Month out of the list so the scaling step that uses `col` skips it.
col.remove("Month")

10.去量纲化

from sklearn.preprocessing import StandardScaler
# Standardise the continuous columns; the scaling statistics come from the
# training set only and are reused for the test set.
ss = StandardScaler().fit(Xtrain[col])
Xtrain.loc[:, col] = ss.transform(Xtrain[col])
Xtest.loc[:, col] = ss.transform(Xtest[col])
Xtrain

11.建模评估

# Model building and evaluation
from time import time
import datetime
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, recall_score
# Turn the one-column label frames into 1-D arrays. to_numpy() is used because
# Series.ravel() is deprecated in recent pandas releases.
Ytrain = Ytrain.iloc[:, 0].to_numpy()
Ytest = Ytest.iloc[:, 0].to_numpy()
Ytrain.shape
# The timer is started once, so each printed time is cumulative since this point.
times = time()
# Fit one SVC per kernel and report accuracy, recall and AUC on the test set.
for kernel in ["linear", 'poly', 'rbf', 'sigmoid']:
    clf = SVC(kernel=kernel, gamma='auto', degree=1, cache_size=5000).fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest, Ytest)
    recall = recall_score(Ytest, result)
    # AUC is computed from the signed distances to the decision boundary.
    auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
    print("%s 's testing accuracy %f, recall is %f, auc is %f" % (kernel, score, recall, auc))
    # Format the elapsed seconds as MM:SS:microseconds. The conversion is pinned
    # to UTC: a local-time conversion adds the UTC offset, which corrupts the
    # minutes field in time zones with a half-hour offset.
    elapsed = datetime.datetime.fromtimestamp(time() - times, tz=datetime.timezone.utc)
    print(elapsed.strftime("%M:%S:%f"))

得到线性核较好。设置class_weight求最高召回率:

# Sweep a small extra weight for class 1 with the linear kernel and report
# accuracy, recall and AUC on the test set for each setting.
irange = np.linspace(0.01, 0.05, 10)
for i in irange:
    times = time()
    clf = SVC(kernel = "linear"
             ,gamma="auto"
             ,cache_size = 5000
             ,class_weight = {1: 1+i}  # class 1 weighted 1+i; other classes keep the default weight
             ).fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest, Ytest)
    recall = recall_score(Ytest, result)
    auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
    # The stray apostrophe after the recall value has been removed from the message.
    print("under ratio 1:%f testing accuracy %f, recall is %f, auc is %f" %
          (1+i, score, recall, auc))
    # Elapsed time of this fit as MM:SS:microseconds. Pinned to UTC so the
    # local UTC offset (e.g. +9:30) cannot leak into the minutes field.
    elapsed = datetime.datetime.fromtimestamp(time() - times, tz=datetime.timezone.utc)
    print(elapsed.strftime("%M:%S:%f"))

利用不同的C值查看模型指标:

# Sweep the regularisation strength C for a class-balanced linear SVC, collect
# accuracy / recall / AUC on the test set, and plot the three curves against C.
C_range = np.linspace(0.01, 20, 20)
recallall = []
aucall = []
scoreall = []
for C in C_range:
    times = time()
    clf = SVC(kernel='linear', C=C, cache_size=5000, class_weight='balanced').fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest, Ytest)
    recall = recall_score(Ytest, result)
    auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
    recallall.append(recall)
    aucall.append(auc)
    scoreall.append(score)
    print("under C %f, testing accuracy is %f, recall is %f, auc is %f" % (C, score, recall, auc))
    # Elapsed time of this fit as MM:SS:microseconds. Pinned to UTC so the
    # local UTC offset (e.g. +9:30) cannot leak into the minutes field.
    elapsed = datetime.datetime.fromtimestamp(time() - times, tz=datetime.timezone.utc)
    print(elapsed.strftime("%M:%S:%f"))
# Best AUC and the C value that produced it (first one in case of ties).
best = int(np.argmax(aucall))
print(aucall[best], C_range[best])
plt.figure()
plt.plot(C_range, recallall, c='red', label='recall')
plt.plot(C_range, aucall, c='black', label='auc')
plt.plot(C_range, scoreall, c='orange', label='accuracy')
plt.legend()
plt.show()

绘制图像如下:
在这里插入图片描述
需要进一步调参可以深入实践,或者变换模型。

菜菜首先需要安装Python的机器学习库scikit-learn(sklearn)。她可以在终端窗口运行以下命令来安装该库:

```
pip install scikit-learn
```

安装完成后,菜菜可以开始编写代码来进行下雨预测。首先,她需要导入必要的库和模块:

```python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
```

然后,菜菜需要下载降雨数据文件。她可以在一些公开的数据集网站上找到相关的数据集。一旦找到了合适的数据集,菜菜可以使用pandas库中的read_csv函数加载数据:

```python
data = pd.read_csv("rainfall_data.csv")
```

接下来,菜菜需要将数据拆分为特征数据(X)和目标变量(y),并将其分为训练集和测试集:

```python
X = data.drop('Rain', axis=1)
y = data['Rain']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
```

然后,菜菜可以定义一个决策树分类器模型,并使用训练集来拟合模型:

```python
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
```

最后,菜菜可以使用测试集来评估模型的性能并计算准确度:

```python
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("准确度:", accuracy)
```

这样,菜菜就可以利用scikit-learn来预测是否会下雨了。请注意,上述代码只是一个简单的示例,实际应用中还可能需要进行数据预处理、模型调参等操作。
评论 8
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值