# Wine-dataset analysis: download, standardize, PCA(5), visualize, and fit
# three models (random forest, SVR, Bayesian ridge) on the reduced features.
#
# Consolidated imports (the original repeated several of these).
from decimal import Decimal  # FIX: was "from _pydecimal import Decimal" —
                             # _pydecimal is a private implementation module;
                             # the public API is "decimal". (Unused below,
                             # kept in case later file sections need it.)
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import (accuracy_score, mean_absolute_error,
                             mean_squared_error, r2_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# ---------------------------------------------------------------------------
# 1. Download the UCI Wine dataset and cache it locally as CSV.
# ---------------------------------------------------------------------------
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

# Column names, per the dataset documentation (first column is the class label).
column_names = [
    'Class', 'Alcohol', 'Malic Acid', 'Ash', 'Alcalinity of Ash', 'Magnesium',
    'Total Phenols', 'Flavanoids', 'Nonflavanoid Phenols', 'Proanthocyanins',
    'Color Intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline'
]

wine_data = pd.read_csv(url, header=None, names=column_names)
wine_data.to_csv('wine_data.csv', index=False)
print("Data saved to 'wine_data.csv'")

# Re-read from the cached CSV so the rest of the script works off the local copy.
wine_data = pd.read_csv('wine_data.csv')

# Drop rows with missing values (this dataset has none, but stay defensive).
wine_data = wine_data.dropna()

# ---------------------------------------------------------------------------
# 2. Standardize the features and reduce to 5 principal components.
# ---------------------------------------------------------------------------
scaler = StandardScaler()
wine_data_scaled = scaler.fit_transform(wine_data.drop('Class', axis=1))

pca = PCA(n_components=5)  # number of principal components to keep
reduced_wine_data = pca.fit_transform(wine_data_scaled)

# Fraction of variance explained by each retained component.
explained_variance_ratio = pca.explained_variance_ratio_

# Loading vectors of the five retained components.
top_five_components = pca.components_[:5]

# Feature names, excluding the 'Class' label column.
feature_names = wine_data.columns[1:]

# For each component, report the single feature with the largest |loading|.
for component, var_ratio in zip(top_five_components, explained_variance_ratio):
    top_feature_indices = abs(component).argsort()[::-1][:1]
    top_feature_name = feature_names[top_feature_indices][0]
    print(f"保留的特征:{top_feature_name},方差比例:{var_ratio}")

# Persist the reduced data (plus the class label) to CSV.
reduced_wine_data_df = pd.DataFrame(
    reduced_wine_data, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
reduced_wine_data_df['Class'] = wine_data['Class']
reduced_wine_data_df.to_csv('reduced_wine_data.csv', index=False)
print("降维后的数据保存到 'reduced_wine_data.csv'")

# ---------------------------------------------------------------------------
# 3. Train/test split on the reduced features.
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    reduced_wine_data, wine_data['Class'], test_size=0.2, random_state=42)
print("训练集样本数:", len(X_train))
print("测试集样本数:", len(X_test))

# Wrap the reduced scores in a DataFrame for the summary statistics below.
reduced_wine_data = pd.DataFrame(
    reduced_wine_data, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
print(reduced_wine_data.describe())
print(reduced_wine_data.isnull().sum())
print("训练集形状:", X_train.shape)
print("测试集形状:", X_test.shape)

# ---------------------------------------------------------------------------
# 4. Box plots of PC1..PC5 with summary statistics.
# ---------------------------------------------------------------------------
reduced_wine_data_df = pd.read_csv('reduced_wine_data.csv')
pc_data = reduced_wine_data_df[['PC1', 'PC2', 'PC3', 'PC4', 'PC5']]

fig, ax = plt.subplots(1, 5, figsize=(20, 6))
for i, col in enumerate(pc_data.columns):
    box_data = pc_data[col].values
    median = np.median(box_data)
    q1, q3 = np.percentile(box_data, [25, 75])
    iqr = q3 - q1
    # Whiskers: most extreme data points within 1.5*IQR of the quartiles.
    upper_whisker = box_data[box_data <= q3 + 1.5 * iqr].max()
    lower_whisker = box_data[box_data >= q1 - 1.5 * iqr].min()

    ax[i].boxplot(box_data, labels=[col])
    ax[i].set_title(f'Statistics of {col}')
    ax[i].set_ylabel('Values')

    # Annotate each subplot with the computed statistics.
    textstr = (f'Median={median:.2f}\nQ1={q1:.2f}\nQ3={q3:.2f}\nIQR={iqr:.2f}'
               f'\nUpper Whisker={upper_whisker:.2f}'
               f'\nLower Whisker={lower_whisker:.2f}')
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    ax[i].text(0.05, 0.95, textstr, transform=ax[i].transAxes, fontsize=10,
               verticalalignment='top', bbox=props)

plt.suptitle('Boxplot of PC1 to PC5')
plt.show()

# ---------------------------------------------------------------------------
# 5. Scatter plots of each principal component against the class label.
# ---------------------------------------------------------------------------
for pc in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']:
    plt.scatter(reduced_wine_data[pc], wine_data['Class'])
    plt.xlabel(pc)
    plt.ylabel('Class')
    plt.title(f'{pc} vs Class')
    plt.show()

# Pairplot of the principal components (no class coloring).
plt.figure(figsize=(10, 8))
sns.pairplot(reduced_wine_data)
plt.title('Pairplot of Wine Data')
plt.show()

# ---------------------------------------------------------------------------
# 6. Correlation heatmaps.
# ---------------------------------------------------------------------------
# Correlation matrix of the standardized original features.
wine_data_scaled_df = pd.DataFrame(wine_data_scaled,
                                   columns=wine_data.columns[1:])
correlation_matrix = wine_data_scaled_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix of Wine Dataset Features")
plt.show()

# NOTE(review): this correlates the first five *loading columns* of the
# component matrix, not the component scores themselves (PCA scores are
# uncorrelated by construction). Kept as-is to preserve the original output;
# confirm the intent.
component_corr = pd.DataFrame(top_five_components, columns=feature_names)
corr_matrix = component_corr.iloc[:5, :5].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm',
            xticklabels=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'],
            yticklabels=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
plt.title("Correlation Matrix of Top Five Principal Components")
plt.show()

# Pairplot colored by class, with a custom 3-color palette.
reduced_wine_data['Class'] = wine_data['Class']
custom_palette = ['red', 'green', 'blue']
sns.pairplot(reduced_wine_data, hue='Class', palette=custom_palette)
plt.show()

# ---------------------------------------------------------------------------
# 7. Random-forest classifier on the reduced features.
#    (The original trained this identical model twice; once is enough.)
# ---------------------------------------------------------------------------
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("模型准确率:", accuracy)

y_test_arr = np.array(y_test)
y_pred_arr = np.array(y_pred)

# Persist the trained model for later reuse.
joblib.dump(model, 'random_forest_model.pkl')
# loaded_model = joblib.load('random_forest_model.pkl')

# Bar chart: actual vs predicted class per test sample.
x = np.arange(1, len(y_test_arr) + 1)
width = 0.5
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(x - width / 2, y_test_arr, width, label='Actual')
ax.bar(x + width / 2, y_pred_arr, width, label='Predicted')
ax.set_ylabel('Class')
ax.set_title('Actual vs Predicted(random forest)')
ax.set_xticks(x)
ax.set_xticklabels(x, fontsize=10)
plt.yticks(fontsize=10)
ax.legend()
plt.tight_layout()
plt.show()

# NOTE(review): MSE/MAE/RMSE/R2 treat the class labels (1-3) as a continuous
# target; for a classifier, accuracy / confusion matrix are the usual metrics.
print("随机森林")
mse = mean_squared_error(y_test, y_pred)
print("均方误差 (MSE):", mse)
mae = mean_absolute_error(y_test, y_pred)
print("平均绝对误差 (MAE):", mae)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("均方根误差 (RMSE):", rmse)
r2 = r2_score(y_test, y_pred)
print("决定系数 (R2 Score):", r2)

# ---------------------------------------------------------------------------
# 8. Support-vector regression on the class labels.
# ---------------------------------------------------------------------------
print("支持向量机")
svm_model = SVR()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

mse_svm = mean_squared_error(y_test, y_pred_svm)
print("均方误差 (MSE):", mse_svm)
mae_svm = mean_absolute_error(y_test, y_pred_svm)
print("平均绝对误差 (MAE):", mae_svm)
rmse_svm = np.sqrt(mean_squared_error(y_test, y_pred_svm))
print("均方根误差 (RMSE):", rmse_svm)
r2_svm = r2_score(y_test, y_pred_svm)
print("决定系数 (R2 Score):", r2_svm)

# BUG FIX: the original re-ran model.predict (the random forest) here and
# plotted those values under the 'Predicted (SVM)' label; plot the SVM
# predictions instead.
y_pred_svm_arr = np.array(y_pred_svm)
x = np.arange(1, len(y_test_arr) + 1)
width = 0.5
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(x - width / 2, y_test_arr, width, label='Actual')
ax.bar(x + width / 2, y_pred_svm_arr, width, label='Predicted (SVM)')
ax.set_ylabel('Class')
ax.set_title('Actual vs Predicted (SVM)')
ax.set_xticks(x)
ax.set_xticklabels(x, fontsize=10)
plt.yticks(fontsize=10)
ax.legend()
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# 9. Bayesian ridge regression on the class labels.
# ---------------------------------------------------------------------------
print("贝叶斯线性回归")
bayesian_model = BayesianRidge()
bayesian_model.fit(X_train, y_train)
y_pred_bayesian = bayesian_model.predict(X_test)

mse_bayesian = mean_squared_error(y_test, y_pred_bayesian)
print("均方误差 (MSE):", mse_bayesian)
mae_bayesian = mean_absolute_error(y_test, y_pred_bayesian)
print("平均绝对误差 (MAE):", mae_bayesian)
rmse_bayesian = np.sqrt(mean_squared_error(y_test, y_pred_bayesian))
print("均方根误差 (RMSE):", rmse_bayesian)
r2_bayesian = r2_score(y_test, y_pred_bayesian)
print("决定系数 (R2 Score):", r2_bayesian)

# Bar chart: actual vs Bayesian-ridge predictions per test sample.
y_pred_bayesian_arr = np.array(y_pred_bayesian)
x = np.arange(1, len(y_test_arr) + 1)
width = 0.5
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(x - width / 2, y_test_arr, width, label='Actual')
ax.bar(x + width / 2, y_pred_bayesian_arr, width, label='Predicted (Bayesian)')
ax.set_ylabel('Class')
ax.set_title('Actual vs Predicted (Bayesian Linear Regression)')
ax.set_xticks(x)
ax.set_xticklabels(x, fontsize=10)
plt.yticks(fontsize=10)
ax.legend()
plt.tight_layout()
plt.show()
# 模型应用 (Model application)
# 导入所需的库 from sklearn.datasets import load_iris from sklearn.ensemble import RandomForestClassifier import joblib # 加载鸢花数据集 iris = load_iris() # X, y = iris.data, iris.target # # # # # 创建随机森林分类器模型 # model = RandomForestClassifier() # # # # # 拟合模型 # model.fit(X, y) # # # # # 保存模型到文件 # joblib.dump(model, 'random_forest_model.pkl') # # # # 加载保存的模型 loaded_model = joblib.load('random_forest_model.pkl') # 使用加载的模型进行预测 new_data = [[5.1, 3.5, 1.4, 0.2], [6.2, 2.9, 4.3, 1.3], [7.3, 2.9, 6.3, 1.8]] predictions = loaded_model.predict(new_data) # 打印预测结果 print(predictions)