# Import required libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.datasets import fetch_20newsgroups,load_boston
from sklearn.metrics import ConfusionMatrixDisplay,accuracy_score,roc_curve, auc,mean_squared_error,silhouette_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge,RidgeCV,LogisticRegression
from sklearn.neural_network import MLPRegressor,MLPClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tensorflow.keras import models,Model,layers,losses,metrics,callbacks
import tensorflow as tf
import jieba as jb
import pandas as pd
import numpy as np
import joblib  # model persistence (used in logisticregression_demo)
from random import shuffle
import matplotlib.pyplot as plt
# KNN algorithm
def KNN_Algorithm():
    """
    K-nearest neighbors: predict the check-in place category.
    :return:
    """
    # 1. Data handling and feature engineering
    # (1) Read the data and narrow its range
    data = pd.read_csv("D:/xueixixiangmu/Machine_Learning/resources/FBlocation/train.csv")
    # Logical row filtering with df.query()
    data = data.query("x > 1.0 & x < 1.25 & y > 2.5 & y < 2.75")
    # Drop the 'time' feature column
    data = data.drop(['time'], axis=1)
    # Drop places with fewer than three check-ins
    place_count = data.groupby('place_id').count()
    frequent = place_count[place_count.row_id > 3].reset_index()
    data = data[data['place_id'].isin(frequent.place_id)]
    # (2) Extract the features and the target
    y = data['place_id']
    x = data.drop(['place_id', 'row_id'], axis=1)
    # (3) Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    # (4) Standardize: fit the scaler on the training set only,
    # then reuse the same scaler on the test set
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 2. Train and predict.
    # K value: theoretically k ≈ sqrt(n_samples); in practice, grid search
    # tries several candidates and keeps the best-performing one.
    knn = KNeighborsClassifier()
    param = {"n_neighbors": [3, 5, 10]}
    knn = GridSearchCV(knn, param_grid=param, cv=2)
    knn.fit(x_train, y_train)
    y_predict = knn.predict(x_test)
    print("Accuracy of the selected model on the test set:", knn.score(x_test, y_test))
    print("Best score during cross-validation:", knn.best_score_)
    print("Estimator (K value) chosen by grid search:", knn.best_estimator_)
    print("Full cross-validation results:", knn.cv_results_)
    print("Predicted test-set categories:", y_predict)
    return None
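# A minimal sketch of the sqrt-N heuristic mentioned above: derive a starting
# K from the training-set size before grid search narrows it down. Keeping K
# odd (to avoid voting ties) is an added assumption, not from the original.
def sqrt_k(n_samples):
    k = int(np.sqrt(n_samples))
    return k if k % 2 == 1 else k + 1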
# Naive Bayes
def nbcls_Algorithm():
    """
    Naive Bayes classification of the 20-newsgroups dataset.
    :return:
    """
    # Fetch the news data (20 categories)
    news = fetch_20newsgroups(subset='all')
    # Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.3)
    # Feature extraction for the text data
    tfidf = TfidfVectorizer()
    x_train = tfidf.fit_transform(x_train)
    # This would list every distinct word seen in the training set:
    # print(tfidf.get_feature_names())
    # Note: toarray() would densify a very large sparse matrix, so it stays commented out
    # print(x_train.toarray())
    # The test set must reuse the fitted vocabulary: call transform, not fit_transform
    x_test = tfidf.transform(x_test)
    # Estimator workflow
    mlb = MultinomialNB(alpha=1.0)
    mlb.fit(x_train, y_train)
    # Predict
    y_predict = mlb.predict(x_test)
    print("Predicted category of each article:", y_predict[:100])
    print("True categories:", y_test[:100])
    print("Accuracy:", mlb.score(x_test, y_test))
    return None
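# A minimal sketch (not called above) showing how the ConfusionMatrixDisplay
# imported at the top can visualize a classifier's errors; y_true and y_pred
# stand in for y_test and y_predict from any of the classifiers in this file.
def show_confusion_matrix(y_true, y_pred):
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_true, y_pred)
    ConfusionMatrixDisplay(cm, display_labels=np.unique(y_true)).plot()
    plt.show()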
# Decision tree
def decisioncls_Algorithm():
    """
    Decision tree: Titanic passenger survival prediction.
    :return:
    """
    # 1. Load the data
    titan = pd.read_csv("D:/xueixixiangmu/Machine_Learning/resources/titanic/titanic.csv")
    # 2. Prepare the data
    x = titan[['pclass', 'age', 'sex']]
    y = titan['survived']
    # Fill missing ages, then one-hot encode the categorical features
    # via dictionary feature extraction
    x['age'].fillna(x['age'].mean(), inplace=True)
    # Convert x to a list of dicts, e.g.
    # [{"pclass": "1st", "age": 29.00, "sex": "female"}, ...]
    dv = DictVectorizer(sparse=False)
    x = dv.fit_transform(x.to_dict(orient="records"))
    # print(dv.get_feature_names())
    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    # Build and evaluate the decision tree
    dc = DecisionTreeClassifier(max_depth=5)
    dc.fit(x_train, y_train)
    print("Accuracy:", dc.score(x_test, y_test))
return None
def RF__Algorithm():
    # Random forest for Titanic survival prediction
    titan = pd.read_csv("D:/xueixixiangmu/Machine_Learning/resources/titanic/titanic.csv")
    # 2. Prepare the data
    x = titan[['pclass', 'age', 'sex']]
    y = titan['survived']
    # Fill missing ages, then one-hot encode the categorical features
    x['age'].fillna(x['age'].mean(), inplace=True)
    # Convert x to a list of dicts, e.g.
    # [{"pclass": "1st", "age": 29.00, "sex": "female"}, ...]
    dv = DictVectorizer(sparse=False)
    x = dv.fit_transform(x.to_dict(orient="records"))
    # print(dv.get_feature_names())
    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    # Build the random forest and tune its hyper-parameters
    rf = RandomForestClassifier()
    param = {"n_estimators": [120, 200, 300, 500, 800, 1200], "max_depth": [5, 8, 15, 25, 30]}
    # Hyper-parameter search
    gc = GridSearchCV(rf, param_grid=param, cv=2)
    gc.fit(x_train, y_train)
    print("Random-forest accuracy:", gc.score(x_test, y_test))
return None
def SVM_Algorithm():
    """
    Support vector machine: Titanic passenger survival prediction, with a ROC curve.
    :return:
    """
    # 1. Load the data
    titan = pd.read_csv("D:/xueixixiangmu/Machine_Learning/resources/titanic/titanic.csv")
    # 2. Prepare the data
    x = titan[['pclass', 'age', 'sex']]
    y = titan['survived']
    # Fill missing ages, then one-hot encode the categorical features
    x['age'].fillna(x['age'].mean(), inplace=True)
    # Convert x to a list of dicts, e.g.
    # [{"pclass": "1st", "age": 29.00, "sex": "female"}, ...]
    dv = DictVectorizer(sparse=False)
    x = dv.fit_transform(x.to_dict(orient="records"))
    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    # Build the SVM estimator; fit once, then take the continuous decision
    # scores that roc_curve expects
    random_state = np.random.RandomState(0)
    classifier = OneVsRestClassifier(SVC(kernel="linear", probability=True, random_state=random_state))
    classifier.fit(x_train, y_train)
    y_score = classifier.decision_function(x_test)
    # ROC curve and area under the curve from the decision scores
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    lw = 2
    plt.plot(
        fpr,
        tpr,
        color="darkorange",
        lw=lw,
        label="ROC curve (area = %0.2f)" % roc_auc,
    )
    plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic example")
    plt.legend(loc="lower right")
    plt.show()
    print("Accuracy:", classifier.score(x_test, y_test))
return None
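# A related sketch using the LinearSVC imported at the top: it fits a linear
# SVM directly and is usually much faster than SVC(kernel="linear") on larger
# datasets, while still exposing decision_function for ROC analysis. This
# helper and its argument names are illustrative, not from the original script.
def linear_svc_scores(x_train, y_train, x_test, y_test):
    clf = LinearSVC()
    clf.fit(x_train, y_train)
    print("LinearSVC accuracy:", clf.score(x_test, y_test))
    return clf.decision_function(x_test)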
# Normal equation vs. stochastic gradient descent
def mylinearregression():
    lb = load_boston()
    # print(lb.data)
    # print(lb.target)
    # Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.3, random_state=24)
    # Standardize the features: fit on the training set, reuse on the test set
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)
    # Optionally the target could be standardized as well:
    # std_y = StandardScaler()
    # y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    # y_test = std_y.transform(y_test.reshape(-1, 1))
    # Solve the linear model by the normal equation
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    y_lr_predict = lr.predict(x_test)
    print(lr.coef_)
    print("Normal-equation predictions:", y_lr_predict)
    print("Normal-equation MSE:", mean_squared_error(y_test, y_lr_predict))
    # Predict with stochastic gradient descent
    sgd = SGDRegressor()
    sgd.fit(x_train, y_train)
    print("SGD weights:", sgd.coef_)
    y_sgd_predict = sgd.predict(x_test)
    print("SGD predictions:", y_sgd_predict)
    # Compare the two solvers by their mean squared error
    print("SGD MSE:", mean_squared_error(y_test, y_sgd_predict))
    # L2 regularization (ridge regression) to reduce overfitting
    rd = Ridge(alpha=1.0)
    rd.fit(x_train, y_train)
    print("Ridge weights:", rd.coef_)
    y_rd_predict = rd.predict(x_test)
    print("Ridge predictions:", y_rd_predict)
    print("Ridge MSE:", mean_squared_error(y_test, y_rd_predict))
def logisticregression_demo():
"""
逻辑回归进行癌症预测
:return: None
"""
# 1、读取数据,处理缺失值以及标准化
column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
'Normal Nucleoli', 'Mitoses', 'Class']
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
names=column_name)
# 删除缺失值
data = data.replace(to_replace='?', value=np.nan)
data = data.dropna()
# 取出特征值
x = data[column_name[1:10]]
y = data[column_name[10]]
# 分割数据集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# 进行标准化
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)
# 使用逻辑回归
lr = LogisticRegression()
lr.fit(x_train, y_train)
print("得出来的权重:", lr.coef_)
# 预测类别
print("预测的类别:", lr.predict(x_test))
# 得出准确率
print("预测的准确率:", lr.score(x_test, y_test))
#模型保存
joblib.dump(lr, "path/保存路径")
#模型加载
model = joblib.load("test.pkl/模型")
print("从文件加载进来的模型预测的结果:", std_y.inverse_transform(model.predict(x_test)))
return None
def pca_demo():
    """
    PCA dimensionality reduction of a small matrix.
    :return: None
    """
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    # 1. Instantiate PCA; a float keeps that fraction of the variance
    transfer = PCA(n_components=0.9)
    # 2. Call fit_transform
    data1 = transfer.fit_transform(data)
    print("Result keeping 90% of the variance:\n", pd.DataFrame(data1))
    # 1. Instantiate PCA; an integer fixes the output dimensionality
    transfer2 = PCA(n_components=3)
    # 2. Call fit_transform
    data2 = transfer2.fit_transform(data)
    print("Result reduced to 3 dimensions:\n", data2)
return None
def Kmeans_demo():
    # K-means clustering of a small matrix, scored with the silhouette coefficient
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    km = KMeans(n_clusters=2)
    km.fit(data)
    pre = km.predict(data)
    print(pre)
    print(silhouette_score(data, pre))
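# A minimal sketch (not called above) for choosing n_clusters by scanning a
# few candidates and comparing silhouette scores. `X` and the candidate list
# are assumptions; with the 3-sample toy data above only n_clusters=2 is valid.
def choose_k_by_silhouette(X, candidates=(2, 3, 4)):
    best_k, best_score = None, -1.0
    for k in candidates:
        labels = KMeans(n_clusters=k).fit_predict(X)
        score = silhouette_score(X, labels)
        print("n_clusters=%d -> silhouette=%.3f" % (k, score))
        if score > best_score:
            best_k, best_score = k, score
    return best_k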
def ANN_demo():
    # Neural-network (MLP) prediction on the breast-cancer data
    column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']
    data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                       names=column_name)
    # Drop rows with missing values ('?' markers)
    data = data.replace(to_replace='?', value=np.nan)
    data = data.dropna()
    # Extract features and target
    x = data[column_name[1:10]]
    y = data[column_name[10]]
    # Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    # Min-max normalization
    std = MinMaxScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)
    mlp = MLPClassifier()
    mlp.fit(x_train, y_train)
    print("Predicted classes:", mlp.predict(x_test))
    # Accuracy
    print("Accuracy:", mlp.score(x_test, y_test))
    return None
def DNN_Algorithm():
    """
    Deep neural network (DNN): Titanic passenger survival prediction.
    :return:
    """
    # 1. Load the data
    titan = pd.read_csv("D:/xueixixiangmu/Machine_Learning/resources/titanic/titanic.csv")
    # 2. Prepare the data
    x = titan[['pclass', 'age', 'sex']]
    y = titan['survived']
    # Fill missing ages, then one-hot encode the categorical features
    x['age'].fillna(x['age'].mean(), inplace=True)
    # Convert x to a list of dicts, e.g.
    # [{"pclass": "1st", "age": 29.00, "sex": "female"}, ...]
    dv = DictVectorizer(sparse=False)
    x = dv.fit_transform(x.to_dict(orient="records"))
    # print(dv.get_feature_names())
    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    # Build the deep neural network.
    # Alternative functional-API draft (unused); an LSTM expects sequence input
    # of shape (timesteps, features), which this tabular data is not:
    # x_input = layers.Input(shape=(None, 6))
    # h = layers.LSTM(32)(x_input)
    # h = layers.Dense(20, activation='relu')(h)
    # h = layers.Dense(10, activation='relu')(h)
    # out = layers.Dense(1, activation='sigmoid')(h)
    # model = models.Model(inputs=x_input, outputs=out)
    # model.summary()
    model = models.Sequential()
    model.add(layers.Dense(20, input_shape=(6,), activation='relu'))
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.summary()
    # Binary classification, so use the binary cross-entropy loss; the
    # "accuracy" string lets Keras pick the matching binary-accuracy metric
    model.compile(optimizer="adam", loss="binary_crossentropy",
                  metrics=["accuracy"])
    # Train once, keeping the History object for later inspection
    history = model.fit(x_train, y_train,
                        batch_size=64,
                        epochs=30,
                        validation_split=0.2)  # hold out part of the training data for validation
    print("Test-set loss and accuracy:", model.evaluate(x_test, y_test))
    # Class predictions: threshold the sigmoid output at 0.5
    # y_pred = (model.predict(x_test) > 0.5).astype(int)
    return None
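# A minimal sketch using the callbacks module imported at the top: early
# stopping halts training once validation loss stops improving. The patience
# value is an assumption for illustration, not from the original script.
# Usage: model.fit(..., validation_split=0.2, callbacks=[make_early_stopping()])
def make_early_stopping():
    return callbacks.EarlyStopping(monitor="val_loss", patience=3,
                                   restore_best_weights=True)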
if __name__ == "__main__":
    # Uncomment one of the demos to run it:
    # KNN_Algorithm()
    # RF__Algorithm()
    # mylinearregression()
    # ANN_demo()
    pass