Linear Regression Model Implementation
import numpy as np
from sklearn import datasets
# Note: load_boston was removed in scikit-learn 1.2; on newer versions,
# swap in datasets.fetch_california_housing() as the regression dataset.
boston = datasets.load_boston()
X = boston.data
y = boston.target
def mse_score(y_predict, y_test):
    '''
    input:  y_predict(ndarray): predicted values
            y_test(ndarray):    true values
    output: mse(float):         mean squared error
    '''
    mse = np.mean((y_predict - y_test) ** 2)
    return mse
class LinearRegression:
    def __init__(self):
        '''Initialize the linear regression model.'''
        self.theta = None

    def fit_normal(self, train_data, train_label):
        '''
        Fit theta in closed form via the normal equation:
        theta = (X^T X)^(-1) X^T y
        input: train_data(ndarray):  training samples
               train_label(ndarray): training labels
        '''
        # prepend a column of ones so theta[0] acts as the intercept
        ones = np.ones((train_data.shape[0], 1))
        X = np.hstack((ones, train_data))
        XT = X.T
        XTX = np.dot(XT, X)
        # np.linalg.pinv would be more robust if X^T X is singular
        XTX_1 = np.linalg.inv(XTX)
        XTX_1XT = np.dot(XTX_1, XT)
        self.theta = np.dot(XTX_1XT, train_label)
        return self.theta

    def predict(self, test_data):
        '''
        input: test_data(ndarray): test samples
        '''
        ones = np.ones((test_data.shape[0], 1))
        X = np.hstack((ones, test_data))
        y_predict = np.dot(X, self.theta)
        return y_predict
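A minimal usage sketch of the class above, assuming the X and y loaded at the start of this section (the split ratio and random_state are arbitrary choices):

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LinearRegression()
lr.fit_normal(X_train, y_train)              # solve for theta in closed form
y_predict = lr.predict(X_test)
print("MSE:", mse_score(y_predict, y_test))  # lower is better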
Naive Bayes Model Implementation
import numpy as np
class NaiveBayesClassifier(object):
    def __init__(self):
        '''
        self.label_prob stores the probability of each class in the data.
        For example, {0: 0.333, 1: 0.667} means class 0 appears with
        probability 0.333 and class 1 with probability 0.667.
        '''
        self.label_prob = {}
        '''
        self.condition_prob stores, for each class, the probability of each
        feature value in each column given that class.
        For example, with training features [[2, 1, 1],
                                             [1, 2, 2],
                                             [2, 2, 2],
                                             [2, 1, 2],
                                             [1, 2, 3]]
        and labels [1, 0, 1, 0, 1]:
        given label 0, column 0 takes value 1 with probability 0.5 and value 2 with probability 0.5;
        given label 0, column 1 takes value 1 with probability 0.5 and value 2 with probability 0.5;
        given label 0, column 2 takes value 1 with probability 0, value 2 with probability 1, value 3 with probability 0;
        given label 1, column 0 takes value 1 with probability 0.333 and value 2 with probability 0.667;
        given label 1, column 1 takes value 1 with probability 0.333 and value 2 with probability 0.667;
        given label 1, column 2 takes values 1, 2 and 3 each with probability 0.333.
        So self.condition_prob looks like:
        {
            0: {
                0: {1: 0.5, 2: 0.5},
                1: {1: 0.5, 2: 0.5},
                2: {1: 0, 2: 1, 3: 0}
            },
            1: {
                0: {1: 0.333, 2: 0.667},
                1: {1: 0.333, 2: 0.667},
                2: {1: 0.333, 2: 0.333, 3: 0.333}
            }
        }
        (fit below applies Laplace smoothing, so the stored values differ
        slightly from these raw frequencies.)
        '''
        self.condition_prob = {}
    def fit(self, feature, label):
        '''
        Train the model, storing the class priors in self.label_prob and the
        conditional probabilities in self.condition_prob, both with Laplace
        smoothing.
        :param feature: ndarray of all training features
        :param label:   ndarray of all training labels
        :return: None
        '''
        row_num = len(feature)
        col_num = len(feature[0])
        unique_label_count = len(set(label))
        # count the occurrences of each class
        for c in label:
            if c in self.label_prob:
                self.label_prob[c] += 1
            else:
                self.label_prob[c] = 1
        for key in self.label_prob.keys():
            # Laplace-smoothed prior: (count + 1) / (sample count + class count)
            self.label_prob[key] += 1
            self.label_prob[key] /= (unique_label_count + row_num)
            # initialize every (column, value) count to 1 (the smoothing term)
            self.condition_prob[key] = {}
            for i in range(col_num):
                self.condition_prob[key][i] = {}
                for k in np.unique(feature[:, i], axis=0):
                    self.condition_prob[key][i][k] = 1
        # add the observed counts per class, column and value
        for i in range(row_num):
            for j in range(col_num):
                self.condition_prob[label[i]][j][feature[i][j]] += 1
        # normalize each column's counts into conditional probabilities;
        # the stored counts already include the +1 smoothing terms, so
        # their sum is exactly the smoothed denominator
        for label_key in self.condition_prob.keys():
            for k in self.condition_prob[label_key].keys():
                total = sum(self.condition_prob[label_key][k].values())
                for kk in self.condition_prob[label_key][k].keys():
                    self.condition_prob[label_key][k][kk] /= total
    def predict(self, feature):
        '''
        Predict a label for every row of the given data.
        :param feature: ndarray of all test features
        :return: ndarray of predicted labels
        '''
        result = []
        for f in feature:
            # posterior score = prior * product of per-column conditional probabilities
            prob = np.zeros(len(self.label_prob))
            ii = 0
            for label, label_prob in self.label_prob.items():
                prob[ii] = label_prob
                for j in range(len(f)):
                    prob[ii] *= self.condition_prob[label][j][f[j]]
                ii += 1
            result.append(list(self.label_prob.keys())[np.argmax(prob)])
        return np.array(result)
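A quick sanity check on the toy data from the docstring above; the expected values in the comments follow from the Laplace-smoothed counts:

feature = np.array([[2, 1, 1],
                    [1, 2, 2],
                    [2, 2, 2],
                    [2, 1, 2],
                    [1, 2, 3]])
label = np.array([1, 0, 1, 0, 1])
nb = NaiveBayesClassifier()
nb.fit(feature, label)
print(nb.label_prob)                      # smoothed priors: 1 -> 4/7, 0 -> 3/7
print(nb.predict(np.array([[2, 1, 1]])))  # [1]: label 1 scores 4/7*0.6*0.4*(1/3), beating label 0's 3/7*0.5*0.5*0.2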
K-Means Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])  # six 2-D points forming two clear groups
k = 2
kmeans = KMeans(n_clusters=k, n_init=10)  # explicit n_init avoids the default-change FutureWarning in scikit-learn >= 1.2
y_pred = kmeans.fit_predict(X)            # fit and return a cluster index per sample
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title('KMeans Clustering')
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
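The fitted estimator also exposes the learned centroids and assignments; a small inspection sketch (cluster_centers_, labels_, inertia_ and predict are standard scikit-learn API):

print("centroids:\n", kmeans.cluster_centers_)     # one (x1, x2) row per cluster
print("labels:", kmeans.labels_)                   # cluster index assigned to each training sample
print("inertia:", kmeans.inertia_)                 # sum of squared distances to the nearest centroid
print(kmeans.predict(np.array([[0, 0], [5, 3]])))  # assign new points to the nearest centroid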
PCA Implementation
import numpy as np
from sklearn.decomposition import PCA
X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])  # three collinear samples: effectively one-dimensional data
pca = PCA(n_components=2)     # keep the top two principal components
X_pca = pca.fit_transform(X)  # center the data, then project it onto the components
print(X_pca)
'''
result (the sign of each component is an arbitrary SVD convention and may
flip between versions; the second column is numerical noise near zero):
[[-5.19615242e+00  ~0.0]
 [ 0.00000000e+00  ~0.0]
 [ 5.19615242e+00  ~0.0]]
'''
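Because the three samples are collinear, the first component should explain essentially all of the variance, which explained_variance_ratio_ (a standard scikit-learn attribute) makes visible:

print(pca.explained_variance_ratio_)  # ≈ [1.0, 0.0]: all variance lies on the first axis
print(pca.components_)                # the principal axes, one unit-length row per component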
Decision Tree Implementation
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("训练集得分:", clf.score(X_train, y_train))
print("测试集得分:", clf.score(X_test, y_test))
'''
训练集得分: 1.0
测试集得分: 0.9555555555555556
'''
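For a reproducible run and a look at what the tree actually learned, a sketch of a variant (the random_state values are arbitrary; export_text is scikit-learn's plain-text dump of the decision rules):

from sklearn.tree import export_text

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
print(clf.feature_importances_)                                  # how much each feature contributes to the splits
print(export_text(clf, feature_names=list(iris.feature_names)))  # human-readable if/else rules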