import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
from sklearn import metrics
df = pd.read_csv(filepath_or_buffer='./data/iris/iris.csv',header=None)
df.columns = ['sepal_len', 'sepal_width', 'petal_len', 'petal_width', 'class']
df.tail()
|     | sepal_len | sepal_width | petal_len | petal_width | class |
| --- | --- | --- | --- | --- | --- |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
df.loc[df['class'] == 'Iris-setosa','class'] = 0
df.loc[df['class'] == 'Iris-versicolor','class'] = 1
df.loc[df['class'] == 'Iris-virginica','class'] = 2
df.tail()
|     | sepal_len | sepal_width | petal_len | petal_width | class |
| --- | --- | --- | --- | --- | --- |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | 2 |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | 2 |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | 2 |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | 2 |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | 2 |
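The three `.loc` assignments above do the mapping by hand. An equivalent alternative, sketched below under the assumption that `df['class']` still holds the raw string labels, is scikit-learn's `LabelEncoder`, which keeps the column numeric from the start:

```python
from sklearn.preprocessing import LabelEncoder

# LabelEncoder assigns integer codes in sorted label order, which here
# reproduces the manual mapping: Iris-setosa=0, Iris-versicolor=1,
# Iris-virginica=2.
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])
print(dict(zip(le.classes_, range(len(le.classes_)))))
```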
df.head()
|   | sepal_len | sepal_width | petal_len | petal_width | class |
| --- | --- | --- | --- | --- | --- |
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
labels = df.groupby('class').size().index
values = df.groupby('class').size()
trace = go.Pie(labels=labels, values=values)
layout = go.Layout(width=350, height=350)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
(Plotly output: a pie chart of the class distribution, 50 samples for each of the three classes.)
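If Plotly's notebook mode is unavailable, matplotlib (already imported as `plt`) can draw the same class-distribution pie; a minimal sketch:

```python
# Class distribution as a pie chart; df['class'] holds the integer
# labels 0, 1, 2 at this point, so render the index as strings.
counts = df.groupby('class').size()
plt.figure(figsize=(3.5, 3.5))
plt.pie(counts.values, labels=counts.index.astype(str), autopct='%1.1f%%')
plt.title('Iris class distribution')
plt.show()
```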
X = df.iloc[:, :4].values
Y = df['class'].values.astype('int')
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
x_train.shape,y_train.shape,x_test.shape, y_test.shape
((105, 4), (105,), (45, 4), (45,))
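One caveat: a fixed `random_state` alone does not guarantee that the class proportions are preserved in the two splits. Passing `stratify=Y` keeps the 50/50/50 balance exact; a minimal variant:

```python
import numpy as np

# Stratified variant: each split keeps the per-class balance exactly.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0, stratify=Y)
print(np.bincount(y_train), np.bincount(y_test))  # [35 35 35] [15 15 15]
```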
Model building and classifier training
1. Import the model and call the LogisticRegression() constructor.
- penalty: the type of regularization (penalty term); the default is L2.
- C: the inverse of the regularization strength.
- solver: for multiclass tasks, 'newton-cg', 'sag', 'saga', and 'lbfgs' can optimize the multinomial loss.
- multi_class: the default 'ovr' fits one binary (one-vs-rest) problem per class; for multiclass problems, 'multinomial' minimizes the loss over the full probability distribution.
2. Train the LogisticRegression classifier.
- Call fit(x, y) to train the model, where x holds the feature values and y the class labels.
3. Use the trained model to make predictions on the dataset with predict(), which returns the predicted labels.
lr = LogisticRegression(penalty='l2', solver='newton-cg', multi_class='multinomial')
lr.fit(x_train, y_train)
LogisticRegression(multi_class='multinomial', solver='newton-cg')
print("Logistic Regression模型训练集的准确率:%.3f" %lr.score(x_train, y_train))
Logistic Regression模型训练集的准确率:0.981
y_hat = lr.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_hat)
print("Logistic Regression test-set accuracy: %.3f" % accuracy)
Logistic Regression test-set accuracy: 0.978
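A single accuracy number hides where the remaining test errors fall. The `metrics` module imported above can break the 45 test predictions down per class; a minimal sketch (the `target_names` follow the 0/1/2 mapping defined earlier):

```python
# Rows = true classes, columns = predicted classes.
print(metrics.confusion_matrix(y_test, y_hat))

# Per-class precision, recall, and F1.
print(metrics.classification_report(
    y_test, y_hat,
    target_names=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']))
```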