knn:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Toy training set: five 2-D points; the first three are labelled 'A',
# the last two 'B'.
X = np.array([
    [1, 1],
    [1, 1.5],
    [2, 2],
    [4, 3],
    [4, 4],
])
y = np.array(['A'] * 3 + ['B'] * 2)
# Build a 3-nearest-neighbour classifier and train it (fit() returns the
# estimator itself, so construction and training can be chained).
knn = KNeighborsClassifier(n_neighbors=3).fit(X, y)
# Predict the label of a single query point, then its per-class probabilities.
query_point = [[3, 2]]
pred = knn.predict(query_point)
pred_proba = knn.predict_proba(query_point)
LinearRegression:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Simulated data: the line y = 5x + 6 perturbed by uniform noise in [-2, 2).
x = np.linspace(0, 10, 50)
noise = np.random.uniform(-2, 2, size=50)
y = 5 * x + 6 + noise
# Reshape both series into single-column 2-D arrays once, up front
# (scikit-learn requires a 2-D feature matrix).
x_column = x.reshape(-1, 1)
y_column = y.reshape(-1, 1)
# Create the model and fit it.
liner = LinearRegression()
liner.fit(x_column, y_column)
# Predict on the same inputs the model was trained on.
y_pred = liner.predict(x_column)
logistic regression:
# Logistic regression on polynomial features, with a decision-region plot.
# NOTE(review): x_data (indexed below as a 2-D array with at least two
# columns) and y_data (the labels) are not defined in this snippet; they are
# assumed to be loaded beforehand -- confirm against the full script.
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

# Polynomial feature expansion; tune `degree` to change how many polynomial
# terms are generated.
poly_reg = PolynomialFeatures(degree=5)
# Expand the training features.
x_poly = poly_reg.fit_transform(x_data)
# Define the logistic regression model.
logistic = linear_model.LogisticRegression()
# Train the model on the expanded features.
logistic.fit(x_poly, y_data)
# Range covered by the data, padded by 1 on each side.
x_min, x_max = x_data[:, 0].min() - 1, x_data[:, 0].max() + 1
y_min, y_max = x_data[:, 1].min() - 1, x_data[:, 1].max() + 1
# Build a dense grid over that range (step 0.02).
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
# ravel() flattens to 1-D like flatten(), but returns a view when possible
# instead of always copying. The grid is only read here, so a view is fine.
grid_points = np.c_[xx.ravel(), yy.ravel()]
# Use transform(), not fit_transform(): the expansion was already fitted on
# the training data and must not be refitted on prediction inputs.
z = logistic.predict(poly_reg.transform(grid_points))
z = z.reshape(xx.shape)
# Filled contour plot of the predicted class over the grid.
cs = plt.contourf(xx, yy, z)
# Scatter plot of the samples, coloured by label.
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()
# Score on the training data itself (the original note says it is very high).
print('score:', logistic.score(x_poly, y_data))
Decision Tree:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
# Regression tree limited to depth 3. The split criterion is left at its
# default (mean squared error): the explicit spelling 'mse' was renamed to
# 'squared_error' in scikit-learn 1.0 and is rejected by 1.2+, whereas the
# default means MSE on every version.
dt_reg = DecisionTreeRegressor(max_depth=3)
# NOTE(review): x and y are not defined in this snippet; x presumably has the
# same single-column 2-D layout as x_test below -- confirm against the caller.
dt_reg.fit(x, y)
# 50 evenly spaced test inputs in [-3, 3], as a single-column 2-D array.
x_test = np.linspace(-3, 3, 50).reshape(-1, 1)
y_hat = dt_reg.predict(x_test)
RandomForestClassifier:
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
# These two names were used below without being imported anywhere in the file.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

iris = load_iris()
X = iris.data[:, :2]  # first two features: sepal length and sepal width
y = iris.target
# Hold out 33% of the samples for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Small forest (15 trees, at most 16 leaves each); oob_score=True makes the
# out-of-bag estimate available after fitting.
rnd_clf = RandomForestClassifier(n_estimators=15, max_leaf_nodes=16, n_jobs=1, oob_score=True)
rnd_clf.fit(X_train, y_train)
# Out-of-bag score computed on the training set.
print(rnd_clf.oob_score_)
kmeans:
# These two names were used below without being imported anywhere in the file.
import time

from sklearn.cluster import KMeans

# K-Means with k-means++ initialisation, 3 clusters, 10 restarts.
# NOTE(review): X is not defined in this snippet; it is assumed to be a
# feature matrix prepared earlier -- confirm against the full script.
k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
begin_time = time.time()  # record the training start time
k_means.fit(X)  # fit the clustering model
dbscan:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

data_path = 'data.csv'
# Read the data file (the CSV is decoded as GBK).
data_frame = pd.read_csv(data_path, encoding='gbk')


def dbscan_cluster(x_label, y_label):
    """Cluster two columns of ``data_frame`` with DBSCAN and plot the result.

    Prints the cluster label assigned to every row, then shows a scatter
    plot of the two columns coloured by cluster label.

    Args:
        x_label: Name of the ``data_frame`` column used as the first feature
            and as the x axis of the plot.
        y_label: Name of the ``data_frame`` column used as the second feature
            and as the y axis of the plot.
    """
    # Build the DBSCAN model.
    clu = DBSCAN(eps=4, min_samples=5)
    X_value = data_frame[[x_label, y_label]].values
    # Run the DBSCAN clustering.
    clu.fit(X_value)
    # Print the cluster label of each sample.
    print('样本所属簇编号:', clu.labels_)
    # Plot settings for the scatter plot of the clustering result.
    plt.rcParams['font.sans-serif'] = ['SimHei']  # lets Chinese labels render
    plt.rcParams['axes.unicode_minus'] = False  # lets the minus sign render
    # Colour each point by its cluster label.
    plt.scatter(data_frame[x_label], data_frame[y_label], c=clu.labels_)
    plt.title('DBSCAN聚类结果')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()


if __name__ == '__main__':
    dbscan_cluster('当月MOU', '当月DOU')
数据标准化:
from sklearn.preprocessing import StandardScaler
# Standardisation (per-feature centring and scaling).
data = [
    [0, 0],
    [0, 0],
    [1, 1],
    [1, 1],
]
scaler = StandardScaler()
# fit() returns the scaler itself, so this prints the estimator's repr.
fitted_scaler = scaler.fit(data)
print(fitted_scaler)
# Per-feature mean learned during fit.
print(scaler.mean_)
# The same data after standardisation.
standardized = scaler.transform(data)
print(standardized)
from sklearn.preprocessing import MinMaxScaler
# Min-max normalisation.
data = [
    [-1, 2],
    [-0.5, 6],
    [1, 10],
    [0, 18],
]
scaler = MinMaxScaler()
# Fit and rescale in one step, then show the rescaled data.
rescaled = scaler.fit_transform(data)
print(rescaled)
# Fitting again returns the scaler itself, so this prints the estimator's repr.
refitted_scaler = scaler.fit(data)
print(refitted_scaler)
# Per-feature maximum seen during fit.
print(scaler.data_max_)
梯度下降SGDRegressor、BGDRegressor、MBGDRegressor:
from sklearn.linear_model import SGDRegressor
# Linear regressor trained with stochastic gradient descent, using the
# estimator's default hyper-parameters.
# NOTE(review): x_train_standard, y_train1, x_test_standard and y_test1 are
# not defined in this snippet; the names suggest standardised train/test
# splits prepared earlier -- confirm against the full script.
sgd1 = SGDRegressor().fit(x_train_standard, y_train1)
# Print the learned coefficients, then the intercept.
for learned_param in (sgd1.coef_, sgd1.intercept_):
    print(learned_param)
# Score of the fitted model on the test split.
print(sgd1.score(x_test_standard, y_test1))