基于澳大利亚气象数据集可视化分析及降雨预测

岳不谢

已于 2022-10-10 08:27:29 修改

阅读量1.2k

点赞数 2

文章标签： python 机器学习 开发语言 逻辑回归 随机森林

于 2022-10-04 00:22:59 首次发布

本文链接：https://blog.csdn.net/m0_62909438/article/details/127158166

版权

本文介绍了一种基于Python的数据分析流程，包括数据预处理、特征工程、模型训练等环节，通过对气象数据的深入分析，实现了对次日是否降雨的预测。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1：探索性分析代码流程

2：数字特征处理

3：非数字类型的特征处理

4：特征拼接

5：标签处理

6：数据集划分

7：模型评价

8：数据可视化

本文仅供大家参考学习，如有不足之处，欢迎批评指正；请勿完全照搬采用，需要源码请下载压缩包。

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from matplotlib import style, test
import seaborn as sns
from sklearn import preprocessing, __all__
from sklearn.model_selection import train_test_split
from sklearn.linear_model import  LogisticRegression # 逻辑回归

from sklearn.ensemble import RandomForestClassifier # 随机森林
from sklearn import tree # 决策树
from sklearn import metrics # 评价的指标

# Select the ggplot theme; it is applied before the font overrides below.
style.use('ggplot')
# Let matplotlib display Chinese text in figures.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # default sans-serif font
    'axes.unicode_minus': False,    # avoid the minus sign '-' being drawn as a box
})



# 1.探索性数据分析的流程
def inspect_data(df_data):
    """Print an exploratory overview of ``df_data`` to stdout.

    The report contains, in order: the first and last five rows, the
    ``info()`` summary, ``describe()`` statistics, which columns contain
    missing values, the rows that contain missing values, and the frame
    with those rows dropped. ``df_data`` itself is not modified.

    Parameters
    ----------
    df_data : pandas.DataFrame
        The data set to inspect.

    Returns
    -------
    None
    """
    separator = "*" * 60

    # 1.1 First five rows.
    print("查看数据的前５行")
    print(df_data.head())
    print(separator)

    # 1.2 Last five rows.
    print("查看数据的后5行")
    print(df_data.tail())
    print(separator)

    # 1.3 Basic information about the frame.
    print("显示数据的基本信息")
    # info() writes its report to stdout itself and returns None, so it must
    # not be wrapped in print() -- that would add a stray "None" line.
    df_data.info()
    print(separator)

    # 1.4 Descriptive statistics.
    print("显示数据的统计信息")
    print(df_data.describe())
    print(separator)

    # 1.5 Missing values: first per column, then the affected rows.
    print("判断哪些“列”存在缺失值")
    print(df_data.isnull().any())
    print(separator)
    print("找出含有nan的所有行")
    print(df_data[df_data.isnull().any(axis=1)])
    print(separator)

    # 1.6 Show the data with every row that contains a missing value removed.
    # dropna() is called with its defaults here:
    #   axis=0        -> operate on rows (1 would operate on columns)
    #   how='any'     -> drop when any value is missing ('all': only when all are)
    #   inplace=False -> return a new frame; the original is left untouched
    print("空值处理，删除空值所在的行")
    print(df_data.dropna())
    print(separator)


# 2. 数据的分析及画图（pandas）
def analysis_data(df_data):
    """Print a preview of selected columns and scatter-plot rainfall by month.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain every column listed in ``use_cols`` below; ``Date``
        must be parseable by ``pandas.to_datetime``.

    Returns
    -------
    None
        The function prints to stdout and shows a matplotlib figure.
    """
    use_cols = ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall',
                'Sunshine', 'Evaporation', 'RainToday', 'Pressure9am', 'Pressure3pm',
                'WindGustDir', 'WindGustSpeed', 'RISK_MM', 'RainTomorrow']
    # Work on an explicit copy: the 'Date' column is overwritten below, and
    # assigning into a plain column selection raises SettingWithCopyWarning.
    use_data = df_data[use_cols].copy()
    print("数据分析总览，查看使用列数据的前10行")
    print(use_data.head(10))
    print("*" * 60)
    print("*" * 60)
    print("时间类型转换.....")
    # 1. Reduce the date to its month number so rainfall can be grouped by month.
    use_data['Date'] = pd.to_datetime(use_data['Date']).dt.month
    print(use_data.head())
    print("*" * 60)
    # 2. Visualise month vs. rainfall.
    fig1 = plt.figure()
    ax = fig1.add_subplot(1, 1, 1)
    ax.scatter(use_data['Date'], use_data['Rainfall'])
    ax.set_ylabel('Rainfall')
    ax.set_xlabel('Month')
    plt.show()

def analysis_data1(df_data):
    """Show a pie chart of how often each ``RainToday`` value occurs."""
    rain_today_counts = df_data['RainToday'].value_counts()
    rain_today_counts.plot(kind='pie', autopct='%.2f%%')
    plt.title('RainToday样本比例')
    plt.tight_layout()
    plt.show()

def analysis_data2(df_data):
    """Draw a 4x2 colour grid labelled with weather feature names.

    NOTE(review): the cell values are drawn from ``np.random.rand`` and
    ``df_data`` is never read, so the colours do not reflect the data set.
    Confirm which statistic was meant to be plotted here.
    """
    row_labels = ['MinTemp', 'MaxTemp', 'Rainfall', 'RISK_MM']  # y-axis categories
    col_labels = ['RainToday', 'RainTomorrow']  # x-axis categories
    cell_values = np.random.rand(4, 2)

    _, axes = plt.subplots()
    # Red colour map with black cell borders.
    axes.pcolor(cell_values, cmap=plt.cm.Reds, edgecolors='k')

    # Put one tick at the centre of every cell.
    axes.set_xticks(np.arange(len(col_labels)) + 0.5)
    axes.set_yticks(np.arange(len(row_labels)) + 0.5)
    # Keep the ticks on the bottom and left edges.
    axes.xaxis.tick_bottom()
    axes.yaxis.tick_left()
    # Label each tick with its category name.
    axes.set_xticklabels(col_labels, minor=False, fontsize=20)
    axes.set_yticklabels(row_labels, minor=False, fontsize=10)
    plt.show()

def analysis_data3(df_data):
    """Show stacked counts of ``RainTomorrow`` coloured by ``RainToday``."""
    displot_options = {
        'x': 'RainTomorrow',
        'hue': 'RainToday',
        'multiple': "stack",
        'height': 6,
        'aspect': 1,
    }
    sns.displot(data=df_data, **displot_options)
    plt.show()
def analysis_data4(df_data):
    """Scatter-plot MinTemp against MaxTemp for a random sample of rows.

    At most 2022 rows are drawn at random from ``df_data`` and coloured by
    their ``RainTomorrow`` value.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain the ``MinTemp``, ``MaxTemp`` and ``RainTomorrow`` columns.

    Returns
    -------
    None
    """
    # DataFrame.sample() raises ValueError when asked for more rows than the
    # frame holds (sampling without replacement), so cap the request.
    sample_size = min(2022, len(df_data))

    plt.figure(figsize=(12, 6))
    sns.scatterplot(data=df_data.sample(sample_size),
                    x='MinTemp',
                    y='MaxTemp',
                    hue='RainTomorrow')

    plt.show()


# 处理标签数据
def create_label(RainTomorrow_val):
    """Convert a ``RainTomorrow`` value into a binary label.

    Returns 0 when the value equals ``'Yes'`` and 1 for every other value
    (1 is the label used for "no rain").
    """
    return 0 if RainTomorrow_val == 'Yes' else 1

#　处理数据
def process_data(df_data):
    """Build the labelled training table and write it to ``proc_data.csv``.

    Steps: keep rows whose ``RainTomorrow`` is 'Yes' or 'No', add a binary
    ``label`` column via ``create_label`` ('Yes' -> 0, 'No' -> 1), show a pie
    chart of the label distribution, keep only the feature and label columns,
    drop rows with missing values and save the result as CSV.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Raw weather data containing ``RainTomorrow`` and every column listed
        in ``numeric_cols`` / ``category_cols`` below.

    Returns
    -------
    None
        The result is written to ``proc_data.csv`` in the working directory.
    """
    filter_data = df_data[df_data['RainTomorrow'].isin(['Yes', 'No'])]

    print(filter_data['RainTomorrow'].value_counts())
    print("*" * 60)

    # Add the 0/1 label on a copy so the caller's frame is left untouched.
    proc_filter_data = filter_data.copy()
    proc_filter_data['label'] = filter_data['RainTomorrow'].apply(create_label)
    print(proc_filter_data.head())
    print("*" * 60)

    # Visualise the share of each label value.
    proc_filter_data['label'].value_counts().plot(kind='pie', autopct='%.2f%%')
    plt.title('RainTomorrow正负样本比例')
    plt.tight_layout()
    plt.show()

    numeric_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
                    'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
                    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
                    'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RISK_MM']
    category_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
    label_col = ['label']
    user_cols = numeric_cols + category_cols + label_col

    # Drop rows with missing values. dropna() returns a new frame; calling
    # dropna(inplace=True) on a column selection raises SettingWithCopyWarning.
    final_samples = proc_filter_data[user_cols].dropna()

    proc_data_filepath = 'proc_data.csv'
    final_samples.to_csv(proc_data_filepath, index=False)

def perform_machine_learning(data_filepath, numeric_cols, category_cols, label_col):
    """Train a logistic-regression classifier and print its test metrics.

    The CSV is loaded, the numeric columns are combined with a one-hot
    encoding of the first categorical column, the data is split 80/20, the
    training part is balanced with SMOTE, and accuracy / precision / recall
    on the untouched test part are printed (label 1 is the positive class).

    Parameters
    ----------
    data_filepath : str
        Path of the processed data set (CSV).
    numeric_cols : list of str
        Names of the numeric feature columns.
    category_cols : list of str
        Names of the categorical feature columns.
    label_col : list of str
        Name of the label column, given as a one-element list.

    Returns
    -------
    None
    """
    data = pd.read_csv(data_filepath)
    numeric_feat = data[numeric_cols].values
    # NOTE(review): only the FIRST entry of category_cols is encoded; the
    # remaining categorical columns are currently not used as features.
    category_val = data[category_cols].values[:, 0]

    # Categorical handling: map categories to integers, then one-hot encode.
    label_enc = preprocessing.LabelEncoder()
    label_val = label_enc.fit_transform(category_val).reshape(-1, 1)

    onehot_enc = preprocessing.OneHotEncoder()
    category_feat = onehot_enc.fit_transform(label_val).toarray()

    # Final feature matrix and target vector.
    X = np.hstack((numeric_feat, category_feat))
    # label_col is a list, so .values is an (n, 1) column; the estimators
    # expect a 1-D target (assumes a single label column, as main passes).
    y = data[label_col].values.ravel()

    # Data set summary.
    _print_class_balance(y)
    print('特征维数：', X.shape[1])

    # Split BEFORE oversampling: resampling the whole data set first would put
    # synthetic points derived from test rows into training (data leakage)
    # and would evaluate the model on artificial samples.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Balance the classes of the training part only.
    oversample = SMOTE()
    X_train, y_train = oversample.fit_resample(X_train, y_train)
    print('通过SMOTE方法平衡正负样本后')
    _print_class_balance(y_train)

    # C is a hyper-parameter of LogisticRegression; cross-validation could be
    # used to choose it. tree.DecisionTreeClassifier() or
    # RandomForestClassifier() can be swapped in here as alternatives.
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred); with the arguments swapped
    # the reported precision and recall are exchanged.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, pos_label=1)
    recall = metrics.recall_score(y_test, y_pred, pos_label=1)

    print('准确率为：', accuracy)
    print('精确率为：', precision)
    print('召回率：', recall)


def _print_class_balance(y):
    """Print the sample count and the share of label 1 and label 0 in ``y``."""
    n_sample = y.shape[0]
    n_pos_sample = y[y == 1].shape[0]
    n_neg_sample = y[y == 0].shape[0]
    print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                   n_pos_sample / n_sample,
                                                   n_neg_sample / n_sample))


def main():
    """Run the whole pipeline: inspect, visualise, preprocess and train.

    Reads ``weatherAUS.csv`` from the working directory, prints the
    exploratory report, shows the figures, writes ``proc_data.csv`` and
    finally trains and evaluates the classifier on that file.
    """
    csvfile = "weatherAUS.csv"
    raw_data = pd.read_csv(csvfile)
    inspect_data(raw_data)
    analysis_data(raw_data)
    analysis_data1(raw_data)
    analysis_data2(raw_data)
    analysis_data3(raw_data)
    analysis_data4(raw_data)
    process_data(raw_data)

    # Numeric feature columns. RISK_MM is deliberately NOT used for training:
    # it is the next day's rainfall amount, from which RainTomorrow is
    # derived, so keeping it would leak the answer to the model.
    numeric_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
                    'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
                    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
                    'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']

    # Non-numeric (categorical) feature columns.
    category_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

    # Label column (the value to predict).
    label_col = ['label']
    data_filepath = 'proc_data.csv'
    perform_machine_learning(data_filepath, numeric_cols, category_cols, label_col)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()





# 小谢编写，有不足请指正与包涵