机器学习-逻辑回归
预测乳腺癌案例
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['simHei']
mpl.rcParams['axes.unicode_minus'] = False
# Breast-cancer (Wisconsin) classification with logistic regression:
# load, clean, train/test split, fit, visualize predictions and the ROC curve.
names = ['id',
         'Clump Thickness',
         'Uniformity of Cell Size',
         'Uniformity of Cell Shape',
         'Marginal Adhesion',
         'Single Epithelial Cell Size',
         'Bare Nuclei',
         'Bland Chromatin',
         'Normal Nucleoli',
         'Mitoses',
         'Class']
data = pd.read_csv('./data/乳腺癌分类/breast-cancer-wisconsin.data', names=names)
# '?' marks missing values in the raw file; drop any incomplete row.
data = data.replace('?', np.nan).dropna(how='any')
# The '?' placeholders force 'Bare Nuclei' to be read as strings —
# convert the whole feature matrix to float explicitly.
X = data.iloc[:, 1:10].astype(float)
Y = data.iloc[:, -1:]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
lr = LogisticRegression()
# .values.ravel() hands fit() a 1-D label vector (avoids DataConversionWarning).
result = lr.fit(X_train, Y_train.values.ravel())
print("训练集合上的准确率", result.score(X_train, Y_train))
print("测试集合上的准确率", result.score(X_test, Y_test))
y_hat = lr.predict(X_test)
Y_ = Y_test.values.reshape(-1)
print(y_hat == Y_)
# Per-sample hit indicator (1 = correct); its mean is the test accuracy.
Y_arr = np.array(y_hat == Y_, dtype=int)
print(Y_arr)
print(np.mean(Y_arr))
# True values (red) over predictions (green): overlapping dots mean a hit.
x_len = range(len(X_test))
plt.figure()
plt.ylim(0, 6)
plt.plot(x_len, Y_test, 'ro', markersize=8, label='真实值', zorder=3)
plt.plot(x_len, y_hat, 'go', markersize=14, label='预测值', zorder=2)
plt.legend(loc='upper left')  # fix: labels were set but never displayed
from sklearn import metrics
from sklearn.preprocessing import label_binarize
# Binarize labels (benign=2, malignant=4) for the ROC computation.
y_t = label_binarize(Y_test, classes=(2, 4))
print(y_t.reshape(-1))
Y_score = lr.decision_function(X_test)
fpr, tpr, th = metrics.roc_curve(y_t.reshape(-1), Y_score)
auc = metrics.auc(fpr, tpr)
print('auc面积', auc)  # fix: auc was computed but never reported
plt.figure()
plt.plot(fpr, tpr, c='r', lw=2)
plt.xlim(-0.01, 1.01)
plt.ylim(-0.01, 1.01)
plt.xticks(np.arange(0, 1.1, 0.1))
plt.yticks(np.arange(0, 1.1, 0.1))
plt.plot((0, 1), (0, 1), c='#a0a0a0', ls='--')
plt.xlabel('假正例率', fontsize=16)
plt.ylabel('真正例率', fontsize=16)
# fix: grid(b=True, ...) — the 'b' keyword was removed in matplotlib >= 3.6;
# the positional form works on every version.
plt.grid(True, ls=':')
plt.show()
泰坦尼克号案例
创建泰坦尼克号GUI.py
import tkinter as tk  # fix: this line was duplicated
from Titanic import Titanic
def on_start_button_click():
    """Read the form fields, run the survival model, and show the verdict.

    Reads the module-level Entry widgets, calls ``t.predict`` and writes the
    returned message into ``result_textarea``.
    """
    sex_input = sex_entry.get()
    try:
        # fix: bare float() used to crash the callback on non-numeric input.
        age_input = float(age_entry.get())
        bro_input = float(bro_entry.get())
        parents_input = float(parents_entry.get())
        fare_input = float(fare_entry.get())
        p_class_input = float(p_class_entry.get())
    except ValueError:
        result_textarea.delete('1.0', 'end')
        result_textarea.insert('insert', '输入有误,请在数值栏中填入数字')
        return
    result_str = t.predict(sex_input, age_input, bro_input, parents_input, fare_input, p_class_input)
    result_textarea.delete('1.0', 'end')
    result_textarea.insert('insert', result_str)
# Window wiring for the Titanic survival predictor GUI.
root = tk.Tk()
t = Titanic()
root.title('泰坦尼克号预测')

tk.Label(root, padx=20, pady=10,
         text='输入下面各个数值,来预测你在泰坦尼克号事故中是否存活下来').grid(row=0, columnspan=2)

# Field labels, one per row, right-aligned against their entry boxes.
# NOTE(review): '恐惧程度(0-1)' sits next to fare_entry — presumably it
# means the normalized fare (票价); confirm the wording with the author.
_prompts = ['性别(男/女)', '年龄', '船上的兄弟姐妹(0)', '父母亲(0)',
            '恐惧程度(0-1)', '仓位(1-3)']
for _row, _text in enumerate(_prompts, start=1):
    tk.Label(root, text=_text).grid(row=_row, sticky=tk.E)
tk.Label(root, text='Create By AzurLane').grid(row=7, sticky=tk.W)

# Input entries share the rows of their labels; keep the module-level names
# because on_start_button_click() reads them.
_boxes = []
for _row in range(1, 7):
    _entry = tk.Entry(root)
    _entry.grid(row=_row, column=1, padx=10, pady=10)
    _boxes.append(_entry)
sex_entry, age_entry, bro_entry, parents_entry, fare_entry, p_class_entry = _boxes

photo = tk.PhotoImage(file='./data/siki.png')
tk.Label(image=photo).grid(row=1, column=2, rowspan=3, padx=20)
result_textarea = tk.Text(root, height=6, width=30)
result_textarea.grid(row=4, column=2, rowspan=2)
tk.Button(root, text='开始预测你的生死',
          command=on_start_button_click).grid(row=6, column=2, rowspan=2, ipadx=50)
root.mainloop()
Titanic.py
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
def split_data(data, train_size=0.8, random_state=0):
    """Shuffle *data* and split it into train/test parts.

    The first column is treated as the label, the rest as features.
    Returns (x_train, y_train, x_test, y_test).
    """
    shuffled = data.sample(frac=1, random_state=random_state)
    cut = int(len(shuffled) * train_size)
    features = shuffled.iloc[:, 1:]
    labels = shuffled.iloc[:, :1]
    return (features.iloc[:cut, :], labels.iloc[:cut, :],
            features.iloc[cut:, :], labels.iloc[cut:, :])
class Titanic:
    """Titanic survival predictor trained from ./data/titanic.csv."""

    def __init__(self):
        data = pd.read_csv('./data/titanic.csv')
        names = ['Survived', 'Pclass', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']
        data = data[names]
        # One-hot encode the categoricals. get_dummies orders columns
        # alphabetically: [Sex_female, Sex_male] and [Pclass_1, Pclass_2,
        # Pclass_3] — predict() must build its row in exactly this order.
        sex_one_hot = pd.get_dummies(data['Sex'], prefix='Sex')
        pclass_one_hot = pd.get_dummies(data['Pclass'], prefix='Pclass')
        names = ['Survived', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']
        data = data[names]
        data = pd.concat([data, sex_one_hot, pclass_one_hot], axis=1)
        # Min-max normalize Fare into [0, 1]; hoist min/max out of the lambda
        # (they were recomputed per row) and drop the duplicated assignment.
        fare_min, fare_max = data['Fare'].min(), data['Fare'].max()
        data['Fare'] = data['Fare'].apply(lambda v: (v - fare_min) / (fare_max - fare_min))
        x_train, y_train, x_test, y_test = split_data(data)
        self.lr = LogisticRegression(solver='lbfgs', max_iter=1000000)
        # .values.ravel(): 1-D label vector, avoids DataConversionWarning.
        self.result = self.lr.fit(x_train, y_train.values.ravel())
        print('训练集上准确率', self.result.score(x_train, y_train))
        print('测试集上准确率', self.result.score(x_test, y_test))

    def predict(self, sex_input, age_input, bro_input, parents_input, fare_input, p_class_input):
        """Encode one passenger the same way as training data and predict.

        Returns a Chinese message: the survival verdict, or an error string
        when sex (男/女) or cabin class (1-3) is invalid.
        """
        print(self.lr.coef_)
        print('预测')
        x = [age_input, bro_input, parents_input, fare_input]
        # Sex one-hot: [Sex_female, Sex_male]. (The duplicated pclass blocks
        # inside each sex branch were merged into one check below.)
        if sex_input == '男':
            x.extend([0, 1])
        elif sex_input == '女':
            x.extend([1, 0])
        else:
            print("性别输入有误")
            return '性别输入有误,请输入男或女'
        # Pclass one-hot: [Pclass_1, Pclass_2, Pclass_3].
        # Bug fix: class 2 used to be encoded as [0, 1, 1].
        if p_class_input == 1:
            x.extend([1, 0, 0])
        elif p_class_input == 2:
            x.extend([0, 1, 0])
        elif p_class_input == 3:
            x.extend([0, 0, 1])
        else:
            print("仓位输入有误")
            return '仓位输入有误,请输入1到3任意数字,1为头等仓,3为经济舱'
        result = self.lr.predict([x])
        outcome = '死' if result[0] == 0 else '活'
        return '在这次泰坦尼克号中,你' + outcome + '了'
葡萄酒分类
葡萄酒分类GUI.py
import tkinter as tk
from Winequality import Winequality
def on_start_button_click():
    """Collect the 11 numeric inputs, classify the wine, show the result.

    Reads the module-level Entry widgets, calls ``w.perdict`` and writes the
    predicted quality into ``result_textarea``.
    """
    try:
        # fix: bare float() used to crash the callback on non-numeric input.
        values = [float(widget.get()) for widget in (
            fixed_acidity, volatile_acidity, citric_acid, residual_sugar,
            chlorides, free_sulfur_dioxide, total_sulfur_dioxide,
            density, pH, sulphates, alcohol)]
    except ValueError:
        result_textarea.delete('1.0', 'end')
        result_textarea.insert('insert', '输入有误,请在每个输入框中填入数字')
        return
    result = w.perdict(*values)
    result_textarea.delete('1.0', 'end')
    result_textarea.insert('insert', "分类的结果为:" + str(result))
# Window wiring for the wine-quality classifier GUI.
w = Winequality()
root = tk.Tk()
root.title('葡萄酒分类质量')
tk.Label(root, padx=20, pady=10, text='输入下面各个数值').grid(row=0, columnspan=2)

# Build each label/entry pair in row order, then bind the entries to the
# module-level names that on_start_button_click() reads.
_prompts = ['非挥发性酸', '挥发性酸', '柠檬酸', '残糖', '氯化物', '无氯二氧化硫',
            '二氧化硫总量', '密度', '酸碱度', '硫酸盐', '酒精']
_boxes = []
for _row, _text in enumerate(_prompts, start=1):
    tk.Label(root, text=_text).grid(row=_row, sticky=tk.E)
    _entry = tk.Entry(root)
    _entry.grid(row=_row, column=1, padx=10, pady=10)
    _boxes.append(_entry)
(fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides,
 free_sulfur_dioxide, total_sulfur_dioxide, density, pH, sulphates, alcohol) = _boxes

tk.Label(root, text='Create By AzurLane').grid(row=12, sticky=tk.W)
photo = tk.PhotoImage(file='./data/siki.png')
tk.Label(root, image=photo).grid(row=1, column=2, rowspan=2, padx=10, pady=5)
result_textarea = tk.Text(root, height=7, width=40)
result_textarea.grid(row=3, column=2, rowspan=2, padx=10, pady=0)
tk.Button(root, text='开始预测',
          command=on_start_button_click).grid(row=11, rowspan=2, column=2, ipadx=120)
root.mainloop()
Winequality.py
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
mpl.rcParams['font.sans-serif'] = ['simHei']
mpl.rcParams['axes.unicode_minus'] = False
from sklearn.preprocessing import label_binarize
from sklearn import metrics
class Winequality:
    """Wine-quality classifier over the combined red + white datasets."""

    def __init__(self):
        # Red and white files share the same schema; stack them row-wise.
        red_data = pd.read_csv('./data/winequality-red.csv', sep=';')
        white_data = pd.read_csv('./data/winequality-white.csv', sep=';')
        data = pd.concat([red_data, white_data], axis=0)
        print(data['quality'].value_counts())
        data = data.dropna(how='any')
        x_train, y_train, x_test, y_test = self.split_data(data)
        # Keep the test split around for visualization() and roc().
        self.test_num = len(x_test)
        self.test_y = y_test
        self.test_x = x_test
        self.lr = LogisticRegression(max_iter=10000, solver='sag')
        # .values.ravel(): 1-D label vector, avoids DataConversionWarning.
        self.lr.fit(x_train, y_train.values.ravel())
        print('训练集准确率', self.lr.score(x_train, y_train))
        print('测试集', self.lr.score(x_test, y_test))

    def split_data(self, data, train_size=0.8, random_state=0):
        """Shuffle and split; the last column ('quality') is the label.

        Returns (x_train, y_train, x_test, y_test).
        """
        data_upset = data.sample(frac=1, random_state=random_state)
        x = data_upset.iloc[:, :-1]
        y = data_upset.iloc[:, -1:]
        num_train = int(len(data_upset) * train_size)
        return (x.iloc[:num_train, :], y.iloc[:num_train, :],
                x.iloc[num_train:, :], y.iloc[num_train:, :])

    def perdict(self, fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_sulfur_dioxide, total_sulfur_dioxide, density, pH, sulphates, alcohol):
        """Predict the quality score (as a string) for one wine sample.

        Kept under its original misspelled name because the GUI calls
        ``w.perdict(...)``; new code should use :meth:`predict`.
        """
        result = self.lr.predict([[fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_sulfur_dioxide, total_sulfur_dioxide, density, pH, sulphates, alcohol]])
        return str(result[0])

    def predict(self, *args, **kwargs):
        """Correctly-spelled alias for :meth:`perdict`."""
        return self.perdict(*args, **kwargs)

    def visualization(self):
        """Plot true (red) vs predicted (green) quality on the test split."""
        x_len = range(self.test_num)
        plt.figure()
        plt.plot(x_len, self.test_y, 'ro', markersize=8, zorder=3)
        plt.plot(x_len, self.lr.predict(self.test_x), 'go', markersize=14, zorder=2)
        plt.show()

    def roc(self):
        """Micro-averaged ROC/AUC over the 7 quality classes (3..9)."""
        y_ = label_binarize(self.test_y, classes=(3, 4, 5, 6, 7, 8, 9))
        y_score = self.lr.decision_function(self.test_x)
        # Flattening the binarized labels and scores micro-averages the curve.
        fpr, tpr, th = metrics.roc_curve(y_.reshape(-1), y_score.reshape(-1))
        auc = metrics.auc(fpr, tpr)
        print('auc面积', auc)
        plt.figure()
        plt.plot(fpr, tpr, c='r', lw=1)
        plt.show()
花分类案例
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['simHei']
mpl.rcParams['axes.unicode_minus'] = False
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import label_binarize
from sklearn.neighbors import KNeighborsClassifier
def change_data(x):
    """Map an iris species name to its integer class id (-1 for unknown)."""
    class_ids = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return class_ids.get(x, -1)
def split_data(data, train_size=0.8, random_state=2):
    """Shuffle *data* (no replacement) and split into train/test parts.

    The last column is the label, everything before it the features.
    Returns (x_train, x_test, y_train, y_test).
    """
    shuffled = data.sample(frac=1, random_state=random_state, replace=False)
    cut = int(len(shuffled) * train_size)
    features = shuffled.iloc[:, :-1]
    labels = shuffled.iloc[:, -1:]
    return (features.iloc[:cut, :], features.iloc[cut:, :],
            labels.iloc[:cut, :], labels.iloc[cut:, :])
# Iris classification: logistic regression vs. k-nearest neighbours,
# with a prediction scatter plot and micro-averaged ROC curves.
names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'cla']
data = pd.read_csv('./data/鸢尾花数据分类/iris.data', names=names)
data['cla'] = data['cla'].apply(change_data)
x_train, x_test, y_train, y_test = split_data(data, 0.7)
lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=3)
# .values.ravel(): 1-D labels avoid sklearn's DataConversionWarning.
result = lr.fit(x_train, y_train.values.ravel())
result_knn = knn.fit(x_train, y_train.values.ravel())
print(result.score(x_train, y_train))
print('逻辑回归', result.score(x_test, y_test))
print('knn', result_knn.score(x_train, y_train))
print(result_knn.score(x_test, y_test))
# True values (red) over LR predictions (green): overlapping dots are hits.
plt.figure()
x_len = range(len(x_test))
plt.plot(x_len, y_test, 'ro', markersize=8, zorder=3, label='真实值')
plt.plot(x_len, lr.predict(x_test), 'go', markersize=14, zorder=2, label='预测值')
plt.ylim(-1, 3)
plt.legend(loc='upper left')
plt.show()
# Micro-averaged ROC: binarize the 3 classes and flatten labels/scores.
y_ = label_binarize(y_test, classes=(0, 1, 2))
y_score = lr.decision_function(x_test)
y_score_knn = knn.predict_proba(x_test)
fpr, tpr, th = metrics.roc_curve(y_.reshape(-1), y_score.reshape(-1))
fpr_knn, tpr_knn, th_knn = metrics.roc_curve(y_.reshape(-1), y_score_knn.reshape(-1))
auc = metrics.auc(fpr, tpr)
print('面积{}'.format(auc))
# Bug fix: the KNN AUC was computed from the LR curve (fpr, tpr).
auc_knn = metrics.auc(fpr_knn, tpr_knn)
print('knn面积{}'.format(auc_knn))
plt.figure()
plt.plot(fpr, tpr, c='r', lw=2)
plt.plot(fpr_knn, tpr_knn, c='b', lw=1)
plt.xlim(-0.01, 1.01)
plt.ylim(-0.01, 1.01)
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xticks(np.arange(0, 1.1, 0.1))
plt.plot((0, 1), (0, 1), c='#a0a0a0', ls='--')
plt.grid(ls=':')
plt.show()
总结
逻辑回归
逻辑回归的过程
处理逻辑回归
项目处理过程的API
读取数据集|查看数据集|数据的预处理
切割训练集和测试集|创建和训练模型|模型预测
模型评估与可视化
python——GUI界面