案例一:数据预测
数据:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostRegressor, VotingClassifier, BaggingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd
import numpy as np
# Load the training and test tables from Excel.
train = pd.read_excel("DATABASE.xlsx")
test = pd.read_excel("testdatabase.xlsx")

# The first five columns are features, the sixth is the regression target.
x_train, y_train = train.iloc[:, :5], train.iloc[:, 5]  # training set
x_test, y_test = test.iloc[:, :5], test.iloc[:, 5]      # test set
# Feature engineering: min-max normalisation.
# BUG FIX: the test set must be scaled with the TRAINING set's min/max.
# The original scaled the test set with its own statistics, which leaks
# test information and puts train/test features on different scales,
# invalidating every model score computed below.
train_min = x_train.min()
train_max = x_train.max()
new_x_train = (x_train - train_min) / (train_max - train_min)
new_x_test = (x_test - train_min) / (train_max - train_min)
new_x_train.info()
print(new_x_test.describe())
# K-nearest-neighbours regression, k = 8.
knn = KNeighborsRegressor(8)
knn.fit(new_x_train, y_train)
y_pred = knn.predict(new_x_test)
# R^2 on the held-out test set.
print(knn.score(new_x_test, y_test))
#
# a=pd.DataFrame([[0.0010256,475,200,1.5,0]],columns=["ua_values","ta_values","tb_values","n1_values","n2_values"])
# a.columns=a.columns.astype(str)
# new=(a-train_min)/(train_max-test_min)
# print(a)
#
# 总结=knn.predict(new)
# print(总结)
# Ordinary least-squares linear regression.
model = LinearRegression()
model.fit(new_x_train, y_train)          # train on the training set
y_preds = model.predict(new_x_test)      # predict on the test set
# R^2 on the held-out test set.
print(model.score(new_x_test, y_test))
# Random-forest regression with 200 trees.
random_forest = RandomForestRegressor(n_estimators=200)
random_forest.fit(new_x_train, y_train)
print(random_forest.score(new_x_test, y_test))
# Ridge (L2-regularised) linear regression.
ridge = Ridge()
ridge.fit(new_x_train, y_train)
print(r2_score(y_test, ridge.predict(new_x_test)))
# Average the predictions of the three regressors fitted above.
voting = VotingRegressor(
    estimators=[("knn", knn), ("lr", model), ("rd", random_forest)])
voting.fit(new_x_train, y_train)
y_preds = voting.predict(new_x_test)
print(r2_score(y_test, y_preds))
# Bagging ensemble built on top of the fitted random forest.
bagging = BaggingRegressor(random_forest, n_estimators=10, random_state=42)
bagging.fit(new_x_train, y_train)
y_pred = bagging.predict(new_x_test)
print(r2_score(y_test, y_pred))
# AdaBoost ensemble using the random forest as its base estimator.
ada_boost = AdaBoostRegressor(estimator=random_forest, n_estimators=10)
ada_boost.fit(new_x_train, y_train)
ada_score = r2_score(y_test, ada_boost.predict(new_x_test))
print(ada_score)
结果:
案例二:文本分类
数据:
# coding=utf-8
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, GradientBoostingClassifier, \
StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
# Load the SMS data.  Rows without a label are dropped from BOTH splits:
# the original only cleaned the training set, so a NaN in the test
# labels would make every accuracy_score call below fail.
# (Also: the original wrote subset=("Label") — parentheses around a lone
# string do not make a tuple; use an explicit list.)
train = pd.read_csv("train.csv", encoding="utf-8")
train = train.dropna(subset=["Label"])
test = pd.read_csv("test.csv")
test = test.dropna(subset=["Label"])
x_train = train["Message_body"]
y_train = train["Label"]
x_test = test["Message_body"]
y_test = test["Label"]
# Lower-case the message text.
x_train = x_train.str.lower()
x_test = x_test.str.lower()
# TF-IDF features: the vocabulary is learned from the training texts
# only, then applied unchanged to the test texts.
vector = TfidfVectorizer()
new_x_train = vector.fit_transform(x_train)
new_x_test = vector.transform(x_test)
print(new_x_train)
# Multinomial naive Bayes — the classic baseline for TF-IDF features.
nb = MultinomialNB()
nb.fit(new_x_train, y_train)
y_pred = nb.predict(new_x_test)
print("NB", accuracy_score(y_test, y_pred))
# Logistic-regression classifier.
logist = LogisticRegression()
logist.fit(new_x_train, y_train)
print("logist", accuracy_score(y_test, logist.predict(new_x_test)))
# Single decision tree.
decision = DecisionTreeClassifier()
decision.fit(new_x_train, y_train)
y_pred = decision.predict(new_x_test)
decision_score = accuracy_score(y_test, y_pred)
# Fixed the misspelled output label ("deciosin" -> "decision").
print("decision", decision_score)
# Random-forest classifier.  (Named `forest` rather than `random`, which
# would shadow the stdlib `random` module.)
forest = RandomForestClassifier()
forest.fit(new_x_train, y_train)
print("random", accuracy_score(y_test, forest.predict(new_x_test)))
# Gradient-boosting classifier.
# NOTE: the non-PEP8 name `Gradient` is kept because later statements
# (bagging / voting / stacking) reference it.
Gradient = GradientBoostingClassifier()
Gradient.fit(new_x_train, y_train)
y_pred = Gradient.predict(new_x_test)
gradient_score = accuracy_score(y_test, y_pred)
# Fixed the misspelled output label ("grandient" -> "gradient").
print("gradient", gradient_score)
# Bagging ensemble whose base estimator is the gradient-boosting model.
bagging = BaggingClassifier(Gradient, n_estimators=10, random_state=42)
bagging.fit(new_x_train, y_train)
bagging_pred = bagging.predict(new_x_test)
print("bagging", accuracy_score(y_test, bagging_pred))
# Hard (majority-vote) ensemble over three of the fitted classifiers.
vote = VotingClassifier(
    estimators=[("decision", decision), ("random", bagging), ("b", Gradient)],
    voting="hard",
)
vote.fit(new_x_train, y_train)
print("vote", accuracy_score(y_test, vote.predict(new_x_test)))
# Stacking: the base models' predictions become the input features of
# the final (voting) estimator.
stacking = StackingClassifier(
    [("decision", decision), ("random", bagging), ("b", Gradient)],
    final_estimator=vote,
)
stacking.fit(new_x_train, y_train)
print("stack", accuracy_score(y_test, stacking.predict(new_x_test)))
# Classify a brand-new sentence with the stacked model.
sentence = "what are you doing man?"
arr = pd.Series([sentence], dtype="object")
print(arr)
# BUG FIX: the original called vector.transform(test) — transforming the
# whole test DataFrame (effectively its column names) instead of the new
# sentence.  The Series `arr` built above is what must be vectorised.
new_test = vector.transform(arr)
y_pred = stacking.predict(new_test)
print(y_pred)
结果:
案例三:图像(手写数字识别)
数据:
# coding=utf-8
import numpy as np
from matplotlib import pyplot as plt
# Preview one sample image to confirm the data can be read.
img = plt.imread('./手写数字识别/0/0_1.bmp')
plt.imshow(img, cmap='gray')
plt.show()
# Batch-load the 5000 images: 10 digit classes x 500 samples each.
# (Loop indentation was lost in the original paste and is restored here.)
data = []    # classifier inputs: one flattened image per row
target = []  # classifier outputs: the digit label of each image
for digit in range(10):
    for j in range(1, 501):
        img = plt.imread(f'./手写数字识别/{digit}/{digit}_{j}.bmp')
        data.append(img)
        target.append(digit)
# Lists are slow and memory-hungry for numeric work; convert to arrays.
# Each image is flattened into one feature row; len(data) replaces the
# hard-coded 5000 so the cell still works if the sample count changes.
data = np.array(data).reshape(len(data), -1)
# Keep the target 1-D: a (n, 1) column vector makes sklearn emit a
# DataConversionWarning when fitting.
target = np.array(target)
print('data的形状:', data.shape, 'target的形状:', target.shape)
# Split the data into an 80% training set and a 20% test set.
from sklearn.model_selection import train_test_split
# ravel() flattens a (n, 1) target column into the 1-D array sklearn
# expects, avoiding the DataConversionWarning (no-op if already 1-D).
x_train, x_test, y_train, y_test = train_test_split(
    data, target.ravel(), test_size=0.2)
# Import and fit the model.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
# Accuracy on the held-out test set.
print(knn.score(x_test, y_test))
y_pred = knn.predict(x_test)
# Randomly pick 15 test samples and plot prediction vs ground truth.
# Fixes vs the original: the comment said 10 but 15 were drawn; the
# indices were sampled from randint(1, 1000), which both skips index 0
# and hard-codes the test-set size — sample the actual index range.
choice = np.random.randint(0, len(x_test), 15).tolist()
# Size the canvas for a 3 x 5 grid of sub-plots.
plt.figure(figsize=(5 * 10, 3 * 10))
for i in range(15):
    ax = plt.subplot(3, 5, i + 1)
    # Each flattened row is reshaped back into a 28-pixel-high image
    # (assumes 28xW source bitmaps — TODO confirm the image size).
    ax.imshow(x_test[choice[i]].reshape(28, -1), cmap='gray')
    # Title in black when the prediction is correct, red otherwise.
    ax.set_title(
        f'real:{y_test[choice[i]]},\npredict:{y_pred[choice[i]]}',
        fontsize=40,
        color='k' if y_test[choice[i]] == y_pred[choice[i]] else 'r')
plt.show()
结果: