机器学习之回归算法2
模型的保存和加载
逻辑回归
公式
$$h_\theta(x) = g(\theta^{T}x) = \frac{1}{1 + e^{-\theta^{T}x}}$$
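损失函数
$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h_\theta(x^{(i)}) + \left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right]$$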
代码实现
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
def logistic():
    """Train and evaluate a logistic-regression tumour classifier.

    Downloads the UCI Wisconsin breast-cancer table, drops rows containing
    the missing-value marker '?', standardises the feature columns, fits
    ``LogisticRegression`` and prints the learned coefficients, the test-set
    accuracy and a per-class precision/recall report.

    Returns:
        None. All results are written to stdout.
    """
    # The data file has no header row. The first column is a sample id, the
    # last is the class label (2 = benign, 4 = malignant), and the nine
    # columns in between are the features.
    column = [
        'Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
        'Uniformity of Cell Shape', 'Marginal Adhesion',
        'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
        'Normal Nucleoli', 'Mitoses', 'Class',
    ]
    # Read the '.data' file; the '.names' file is only a text description.
    data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/'
        'breast-cancer-wisconsin/breast-cancer-wisconsin.data',
        names=column,
    )
    print(data)

    # replace() returns a new frame, so the result must be kept; the keyword
    # is `to_replace`.
    data = data.replace(to_replace='?', value=np.nan)
    data = data.dropna()

    # A column that contained '?' was parsed as strings; cast so the scaler
    # receives numeric input.
    features = data[column[1:10]].astype(float)
    target = data[column[10]]
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25)

    # Fit the scaler on the training split only and reuse its statistics on
    # the test split.
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    # The inverse-regularisation parameter is spelled with a capital `C`.
    lg = LogisticRegression(C=1.0)
    lg.fit(x_train, y_train)
    print(lg.coef_)

    y_predict = lg.predict(x_test)
    print('准确率', lg.score(x_test, y_test))
    print('召回率', classification_report(
        y_test, y_predict, labels=[2, 4], target_names=['良性', '恶性']))
    return
# Run the logistic-regression demo when this file is executed.
logistic()
优缺点
逻辑回归与朴素贝叶斯对比
聚类评估准则
Kmeans优缺点
代码实现
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
# Cluster users by the aisles they buy from: join the four tables, build a
# user x aisle count matrix, reduce it with PCA, run K-means on a sample
# and report the silhouette score.

# NOTE(review): all four paths are the same placeholder '~.csv' -- point
# each read at its real file before running.
prior = pd.read_csv('~.csv')
products = pd.read_csv('~.csv')
orders = pd.read_csv('~.csv')
aisles = pd.read_csv('~.csv')

# Join everything into one table; each merge uses a single shared key
# (the key name only needs to be given once).
_mg = pd.merge(prior, products, on='product_id')
_mg = pd.merge(_mg, orders, on='order_id')
mt = pd.merge(_mg, aisles, on='aisle_id')

# Rows are users, columns are aisles, cells are purchase counts.
cross = pd.crosstab(mt['user_id'], mt['aisle'])

# Keep enough principal components to explain 90% of the variance.
pca = PCA(n_components=0.9)
data = pca.fit_transform(cross)

# Cluster only the first 500 users to keep the demo fast.
km = KMeans(n_clusters=4)
x = data[:500]
km.fit(x)
predict = km.predict(x)

# Plot two of the principal components, coloured by cluster label.
plt.figure(figsize=(10, 10))
colored = ['orange', 'blue', 'purple', 'green']
colr = [colored[i] for i in predict]
# Select COLUMNS 1 and 20 for every sample; `x[:1]` / `x[:20]` would slice
# rows and yield arrays of mismatched shape.
# NOTE(review): assumes PCA kept at least 21 components -- confirm.
plt.scatter(x[:, 1], x[:, 20], color=colr)
plt.xlabel('1')
plt.ylabel('20')
plt.show()

# Silhouette coefficient of the clustering; print it so the value is
# visible when run as a script.
print(silhouette_score(x, predict))