1:探索性分析代码流程
2:数字特征处理
3:非数字类型的特征处理
4:特征拼接
5:标签处理
6:数据集划分
7:模型评价
8:数据可视化
仅供大家参考学习,如有不足之处,请多多包涵并批评指正;切勿完全照搬,需要源码请下载压缩包。
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from matplotlib import style, test
import seaborn as sns
from sklearn import preprocessing, __all__
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.ensemble import RandomForestClassifier # 随机森林
from sklearn import tree # 决策树
from sklearn import metrics # 评价的指标
# Apply the ggplot theme to every figure drawn by this script.
style.use('ggplot')
# Let matplotlib render the Chinese titles used below: select the SimHei font
# and keep the minus sign as an ASCII '-' so it is not drawn as an empty box.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# 1.探索性数据分析的流程
def inspect_data(df_data):
    """Print an exploratory overview of ``df_data`` to stdout.

    The report contains, in order: the first five rows, the last five rows,
    the ``DataFrame.info()`` summary, the ``describe()`` statistics, a
    per-column missing-value flag, every row holding at least one missing
    value, and the frame with those rows dropped. Each section is followed
    by a line of 60 asterisks.

    ``df_data`` itself is never modified: the ``dropna()`` result is only
    printed, not stored.

    Parameters
    ----------
    df_data : pandas.DataFrame
        The data set to inspect.

    Returns
    -------
    None
    """
    separator = "*" * 60

    # 1.1 first five rows
    print("查看数据的前5行")
    print(df_data.head())
    print(separator)

    # 1.2 last five rows
    print("查看数据的后5行")
    print(df_data.tail())
    print(separator)

    # 1.3 basic information
    print("显示数据的基本信息")
    # info() writes the report itself and returns None; wrapping it in
    # print() would append a stray "None" line to the output.
    df_data.info()
    print(separator)

    # 1.4 summary statistics
    print("显示数据的统计信息")
    print(df_data.describe())
    print(separator)

    # 1.5 missing values, first per column and then per row
    is_missing = df_data.isnull()
    print("判断哪些“列”存在缺失值")
    print(is_missing.any())
    print(separator)
    print("找出含有nan的所有行")
    print(df_data[is_missing.any(axis=1)])
    print(separator)

    # 1.6 preview of the data with incomplete rows removed.
    # dropna() defaults: axis=0 drops rows (1 would drop columns),
    # how='any' drops on a single missing value ('all' needs every value
    # missing), inplace=False returns a new frame instead of editing this one.
    print("空值处理,删除空值所在的行")
    print(df_data.dropna())
    print(separator)
# 2. 数据的分析及画图(pandas)
def analysis_data(df_data):
    """Preview a subset of columns and scatter-plot rainfall against month.

    Selects a fixed list of columns from ``df_data``, prints the first ten
    rows, replaces the ``Date`` column by its calendar month number, prints
    the first five rows again and finally shows a scatter plot of
    ``Rainfall`` (y axis) against the month (x axis).

    The work is done on a copy, so ``df_data`` is left unchanged.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain every column listed in ``use_cols`` below; indexing
        with a missing column raises ``KeyError``.

    Returns
    -------
    None
    """
    use_cols = ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall',
                'Sunshine', 'Evaporation', 'RainToday', 'Pressure9am', 'Pressure3pm',
                'WindGustDir', 'WindGustSpeed', 'RISK_MM', 'RainTomorrow']
    # Take an explicit copy: assigning a column on the bare selection
    # triggers pandas' SettingWithCopyWarning.
    use_data = df_data[use_cols].copy()
    print("数据分析总览,查看使用列数据的前10行")
    print(use_data.head(10))
    print("*" * 60)

    print("*" * 60)
    print("时间类型转换.....")
    # 1. Turn the date into its month number (1-12).
    use_data['Date'] = pd.to_datetime(use_data['Date']).dt.month
    print(use_data.head())
    print("*" * 60)

    # 2. Visualise month versus rainfall.
    fig1 = plt.figure()
    ax = fig1.add_subplot(1, 1, 1)
    ax.scatter(use_data['Date'], use_data['Rainfall'])
    ax.set_ylabel('Rainfall')
    ax.set_xlabel('Month')
    plt.show()
def analysis_data1(df_data):
    """Show a pie chart with the share of each ``RainToday`` value.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain a ``RainToday`` column.
    """
    rain_today_counts = df_data['RainToday'].value_counts()
    # autopct prints each slice's percentage with two decimals.
    rain_today_counts.plot(kind='pie', autopct='%.2f%%')
    plt.title('RainToday样本比例')
    plt.tight_layout()
    plt.show()
def analysis_data2(df_data):
    """Draw a 4x2 red-shaded grid labelled with feature and rain-flag names.

    NOTE(review): the cell values come from ``np.random.rand`` and
    ``df_data`` is never read, so the figure does not reflect the data set.
    Presumably a placeholder for a real feature/target heat map - confirm.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Currently unused.
    """
    row_labels = ['MinTemp', 'MaxTemp', 'Rainfall', 'RISK_MM']  # y-axis categories
    col_labels = ['RainToday', 'RainTomorrow']  # x-axis categories
    cell_values = np.random.rand(len(row_labels), len(col_labels))

    fig, ax = plt.subplots()
    # Red colour map with black cell borders.
    ax.pcolor(cell_values, cmap=plt.cm.Reds, edgecolors='k')

    # Put each tick in the middle of its cell (cells span whole units).
    ax.set_xticks(np.arange(len(col_labels)) + 0.5)
    ax.set_yticks(np.arange(len(row_labels)) + 0.5)

    # Ticks go on the bottom and left edges.
    ax.xaxis.tick_bottom()
    ax.yaxis.tick_left()

    # Name the ticks after the columns / rows.
    ax.set_xticklabels(col_labels, minor=False, fontsize=20)
    ax.set_yticklabels(row_labels, minor=False, fontsize=10)
    plt.show()
def analysis_data3(df_data):
    """Show a seaborn ``displot`` of ``RainTomorrow`` coloured by ``RainToday``.

    The ``RainToday`` groups are stacked on top of each other
    (``multiple="stack"``).

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain ``RainTomorrow`` and ``RainToday`` columns.
    """
    plot_options = {
        'x': 'RainTomorrow',
        'hue': 'RainToday',
        'multiple': "stack",
        'height': 6,
        'aspect': 1,
    }
    sns.displot(data=df_data, **plot_options)
    plt.show()
def analysis_data4(df_data):
    """Scatter-plot ``MinTemp`` against ``MaxTemp`` coloured by ``RainTomorrow``.

    A random sample of at most 2022 rows is plotted. No random seed is set,
    so the sampled rows differ between runs.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain ``MinTemp``, ``MaxTemp`` and ``RainTomorrow`` columns.
    """
    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds (sampling without replacement), so cap the request.
    sample_size = min(2022, len(df_data))
    plt.figure(figsize=(12, 6))
    sns.scatterplot(data=df_data.sample(sample_size),
                    x='MinTemp',
                    y='MaxTemp',
                    hue='RainTomorrow')
    plt.show()
# 处理标签数据
def create_label(RainTomorrow_val):
    """Map a ``RainTomorrow`` value to the binary training label.

    Returns 0 when the value equals ``'Yes'`` (rain) and 1 for anything
    else, so label 1 stands for "no rain".
    """
    return 0 if RainTomorrow_val == 'Yes' else 1
# 处理数据
def process_data(df_data):
    """Build the cleaned training table and write it to ``proc_data.csv``.

    Steps:
    1. keep only rows whose ``RainTomorrow`` is ``'Yes'`` or ``'No'``;
    2. add a ``label`` column via ``create_label`` ('Yes' -> 0, else 1);
    3. show a pie chart of the label distribution;
    4. keep the numeric, categorical and label columns listed below;
    5. drop every row with a missing value and save the result as CSV
       (without the index) in the current working directory.

    ``df_data`` is not modified.

    NOTE(review): ``RISK_MM`` is kept as a feature. If it records the next
    day's rainfall amount it leaks the target into the features - confirm
    against the data set description.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Raw data containing ``RainTomorrow`` and all columns listed below.

    Returns
    -------
    None
    """
    filter_mask = df_data['RainTomorrow'].isin(['Yes', 'No'])
    # Copy so that adding the label column never touches the caller's frame.
    proc_filter_data = df_data[filter_mask].copy()
    print(proc_filter_data['RainTomorrow'].value_counts())
    print("*" * 60)

    # Attach the 0/1 label: 'Yes' -> 0, 'No' -> 1.
    proc_filter_data['label'] = proc_filter_data['RainTomorrow'].apply(create_label)
    print(proc_filter_data.head())
    print("*" * 60)

    # Visualise the ratio between the two label values.
    proc_filter_data['label'].value_counts().plot(kind='pie', autopct='%.2f%%')
    plt.title('RainTomorrow正负样本比例')
    plt.tight_layout()
    plt.show()

    numeric_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
                    'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
                    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
                    'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RISK_MM']
    category_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
    label_col = ['label']
    user_cols = numeric_cols + category_cols + label_col

    # Drop incomplete rows with the non-mutating form: calling
    # dropna(inplace=True) on a column selection triggers pandas'
    # SettingWithCopyWarning.
    final_samples = proc_filter_data[user_cols].dropna()

    proc_data_filepath = 'proc_data.csv'
    final_samples.to_csv(proc_data_filepath, index=False)
def _print_class_balance(y):
    """Print the sample count and the share of label-1 and label-0 entries in ``y``."""
    n_sample = y.shape[0]
    n_pos_sample = y[y == 1].shape[0]
    n_neg_sample = y[y == 0].shape[0]
    print('样本个数:{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                   n_pos_sample / n_sample,
                                                   n_neg_sample / n_sample))


def perform_machine_learning(data_filepath, numeric_cols, category_cols, label_col):
    """Train a logistic-regression classifier and print its test metrics.

    The CSV at ``data_filepath`` is loaded, the numeric columns are used
    as-is and every categorical column is one-hot encoded. The data is split
    80/20 (``random_state=42``), the training part is balanced with SMOTE,
    a ``LogisticRegression`` is fitted and accuracy, precision and recall
    (positive class = label 1) are printed for the untouched test part.

    Parameters
    ----------
    data_filepath : str
        Path of the processed CSV file.
    numeric_cols : list of str
        Names of the numeric feature columns.
    category_cols : list of str
        Names of the categorical feature columns; all of them are encoded.
    label_col : list of str
        Name of the label column, given as a one-element list. The values
        are flattened to one dimension.

    Returns
    -------
    None
    """
    data = pd.read_csv(data_filepath)
    numeric_feat = data[numeric_cols].values

    # One-hot encode every categorical column at once. The earlier code
    # label-encoded only the first column and silently ignored the others.
    onehot_enc = preprocessing.OneHotEncoder()
    category_feat = onehot_enc.fit_transform(data[category_cols]).toarray()

    # Final feature matrix and label vector.
    X = np.hstack((numeric_feat, category_feat))
    # Selecting with a list yields an (n, 1) array; the estimators expect 1-D.
    y = np.ravel(data[label_col].values)

    # Data set summary.
    _print_class_balance(y)
    print('特征维数:', X.shape[1])

    # Split BEFORE oversampling so no synthetic sample derived from a test
    # row reaches the training set and the test set holds only real rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Balance the training data only.
    oversample = SMOTE()
    X_train, y_train = oversample.fit_resample(X_train, y_train)
    print('通过SMOTE方法平衡正负样本后')
    _print_class_balance(y_train)

    # Logistic regression with default hyper-parameters; C could be tuned by
    # cross-validation. tree.DecisionTreeClassifier() or
    # RandomForestClassifier() can be swapped in here.
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred); the reversed order swaps
    # precision and recall.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, pos_label=1)
    recall = metrics.recall_score(y_test, y_pred, pos_label=1)
    print('准确率为:', accuracy)
    print('精确率为:', precision)
    print('召回率:', recall)
def main():
    """Run the whole pipeline on ``weatherAUS.csv``.

    Loads the raw CSV, prints the exploratory report, shows the five
    analysis plots, writes the processed table through ``process_data`` and
    finally trains and evaluates the model on ``proc_data.csv``.
    """
    raw_data = pd.read_csv("weatherAUS.csv")

    inspect_data(raw_data)
    plot_steps = (analysis_data, analysis_data1, analysis_data2,
                  analysis_data3, analysis_data4)
    for plot_step in plot_steps:
        plot_step(raw_data)
    process_data(raw_data)

    # Numeric feature columns.
    numeric_features = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
                        'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
                        'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
                        'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RISK_MM']
    # Non-numeric feature columns.
    categorical_features = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
    # Label column, i.e. the value to predict.
    target = ['label']

    perform_machine_learning('proc_data.csv', numeric_features,
                             categorical_features, target)


if __name__ == '__main__':
    main()
# 小谢编写,有不足请指正与包涵