0.前言
本文于2016年7月编写。
1.介绍
sklearn,Python的机器学习算法库,目前是Python做机器学习最常用的库。它基于NumPy、SciPy和matplotlib等实现的。本文主要用到它的逻辑回归模型和评价报表功能。
pandas,Python的数据解析工具,功能强大使用广泛。它基于NumPy实现。本文主要用到它的csv文件载入功能。
iris,一种花,有较多种类。有人提供了关于这种花的数据用于分类模型的学习,这份数据已经成为非常经典且简单的多分类实验数据。
2.使用sklearn库自带的数据进行逻辑回归分析
使用红色字体标注的为关键代码
# iris.py
# Copyright (c) 2016 WU PENG
"""Train and evaluate a logistic-regression classifier on the built-in iris dataset."""
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# load_iris() provides 150 labelled records about the iris flower
# (4 numeric features, 3 classes).
print("\nLoading data...")
iris = load_iris()
print("Data shape: ", iris.data.shape, iris.target.shape)

# train_test_split() splits the data.
# By default 75% of the data is used to train, and 25% of the data is used to test.
print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)
print("Train data shape: ", X_train.shape, y_train.shape)
print("Test data shape: ", X_test.shape, y_test.shape)

print("\nTraining...")
clf = LogisticRegression()
clf.fit(X_train, y_train)
print("intercept: \n", clf.intercept_)
print("coef: \n", clf.coef_)

# Evaluate on the held-out test split and show per-class precision/recall/F1.
print("\nTesting...")
y_true = y_test
y_pred = clf.predict(X_test)
print("true: ", y_true)
print("pred: ", y_pred)
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_true, y_pred, target_names=target_names))

print("\nPredict...")
# predict() expects a 2-D array-like: one row per sample. Passing the bare
# 1-D list raises ValueError in modern scikit-learn.
X_pred = [[5, 3, 5, 2.5]]
y_pred = clf.predict(X_pred)
print(X_pred, " = ", y_pred)
3.使用csv格式数据进行逻辑回归分析
3.1.文件
lr/
lr/data.csv
lr/logistic.py
3.2.数据文件
文件名为data.csv,数据与load_iris()相同。
x1、x2、x3、x4是特征,y是目标。
x1,x2,x3,x4,y
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0
4.7,3.2,1.3,0.2,0
4.6,3.1,1.5,0.2,0
5.0,3.6,1.4,0.2,0
5.4,3.9,1.7,0.4,0
4.6,3.4,1.4,0.3,0
5.0,3.4,1.5,0.2,0
4.4,2.9,1.4,0.2,0
4.9,3.1,1.5,0.1,0
5.4,3.7,1.5,0.2,0
4.8,3.4,1.6,0.2,0
4.8,3.0,1.4,0.1,0
4.3,3.0,1.1,0.1,0
5.8,4.0,1.2,0.2,0
5.7,4.4,1.5,0.4,0
5.4,3.9,1.3,0.4,0
5.1,3.5,1.4,0.3,0
5.7,3.8,1.7,0.3,0
5.1,3.8,1.5,0.3,0
5.4,3.4,1.7,0.2,0
5.1,3.7,1.5,0.4,0
4.6,3.6,1.0,0.2,0
5.1,3.3,1.7,0.5,0
4.8,3.4,1.9,0.2,0
5.0,3.0,1.6,0.2,0
5.0,3.4,1.6,0.4,0
5.2,3.5,1.5,0.2,0
5.2,3.4,1.4,0.2,0
4.7,3.2,1.6,0.2,0
4.8,3.1,1.6,0.2,0
5.4,3.4,1.5,0.4,0
5.2,4.1,1.5,0.1,0
5.5,4.2,1.4,0.2,0
4.9,3.1,1.5,0.1,0
5.0,3.2,1.2,0.2,0
5.5,3.5,1.3,0.2,0
4.9,3.1,1.5,0.1,0
4.4,3.0,1.3,0.2,0
5.1,3.4,1.5,0.2,0
5.0,3.5,1.3,0.3,0
4.5,2.3,1.3,0.3,0
4.4,3.2,1.3,0.2,0
5.0,3.5,1.6,0.6,0
5.1,3.8,1.9,0.4,0
4.8,3.0,1.4,0.3,0
5.1,3.8,1.6,0.2,0
4.6,3.2,1.4,0.2,0
5.3,3.7,1.5,0.2,0
5.0,3.3,1.4,0.2,0
7.0,3.2,4.7,1.4,1
6.4,3.2,4.5,1.5,1
6.9,3.1,4.9,1.5,1
5.5,2.3,4.0,1.3,1
6.5,2.8,4.6,1.5,1
5.7,2.8,4.5,1.3,1
6.3,3.3,4.7,1.6,1
4.9,2.4,3.3,1.0,1
6.6,2.9,4.6,1.3,1
5.2,2.7,3.9,1.4,1
5.0,2.0,3.5,1.0,1
5.9,3.0,4.2,1.5,1
6.0,2.2,4.0,1.0,1
6.1,2.9,4.7,1.4,1
5.6,2.9,3.6,1.3,1
6.7,3.1,4.4,1.4,1
5.6,3.0,4.5,1.5,1
5.8,2.7,4.1,1.0,1
6.2,2.2,4.5,1.5,1
5.6,2.5,3.9,1.1,1
5.9,3.2,4.8,1.8,1
6.1,2.8,4.0,1.3,1
6.3,2.5,4.9,1.5,1
6.1,2.8,4.7,1.2,1
6.4,2.9,4.3,1.3,1
6.6,3.0,4.4,1.4,1
6.8,2.8,4.8,1.4,1
6.7,3.0,5.0,1.7,1
6.0,2.9,4.5,1.5,1
5.7,2.6,3.5,1.0,1
5.5,2.4,3.8,1.1,1
5.5,2.4,3.7,1.0,1
5.8,2.7,3.9,1.2,1
6.0,2.7,5.1,1.6,1
5.4,3.0,4.5,1.5,1
6.0,3.4,4.5,1.6,1
6.7,3.1,4.7,1.5,1
6.3,2.3,4.4,1.3,1
5.6,3.0,4.1,1.3,1
5.5,2.5,4.0,1.3,1
5.5,2.6,4.4,1.2,1
6.1,3.0,4.6,1.4,1
5.8,2.6,4.0,1.2,1
5.0,2.3,3.3,1.0,1
5.6,2.7,4.2,1.3,1
5.7,3.0,4.2,1.2,1
5.7,2.9,4.2,1.3,1
6.2,2.9,4.3,1.3,1
5.1,2.5,3.0,1.1,1
5.7,2.8,4.1,1.3,1
6.3,3.3,6.0,2.5,2
5.8,2.7,5.1,1.9,2
7.1,3.0,5.9,2.1,2
6.3,2.9,5.6,1.8,2
6.5,3.0,5.8,2.2,2
7.6,3.0,6.6,2.1,2
4.9,2.5,4.5,1.7,2
7.3,2.9,6.3,1.8,2
6.7,2.5,5.8,1.8,2
7.2,3.6,6.1,2.5,2
6.5,3.2,5.1,2.0,2
6.4,2.7,5.3,1.9,2
6.8,3.0,5.5,2.1,2
5.7,2.5,5.0,2.0,2
5.8,2.8,5.1,2.4,2
6.4,3.2,5.3,2.3,2
6.5,3.0,5.5,1.8,2
7.7,3.8,6.7,2.2,2
7.7,2.6,6.9,2.3,2
6.0,2.2,5.0,1.5,2
6.9,3.2,5.7,2.3,2
5.6,2.8,4.9,2.0,2
7.7,2.8,6.7,2.0,2
6.3,2.7,4.9,1.8,2
6.7,3.3,5.7,2.1,2
7.2,3.2,6.0,1.8,2
6.2,2.8,4.8,1.8,2
6.1,3.0,4.9,1.8,2
6.4,2.8,5.6,2.1,2
7.2,3.0,5.8,1.6,2
7.4,2.8,6.1,1.9,2
7.9,3.8,6.4,2.0,2
6.4,2.8,5.6,2.2,2
6.3,2.8,5.1,1.5,2
6.1,2.6,5.6,1.4,2
7.7,3.0,6.1,2.3,2
6.3,3.4,5.6,2.4,2
6.4,3.1,5.5,1.8,2
6.0,3.0,4.8,1.8,2
6.9,3.1,5.4,2.1,2
6.7,3.1,5.6,2.4,2
6.9,3.1,5.1,2.3,2
5.8,2.7,5.1,1.9,2
6.8,3.2,5.9,2.3,2
6.7,3.3,5.7,2.5,2
6.7,3.0,5.2,2.3,2
6.3,2.5,5.0,1.9,2
6.5,3.0,5.2,2.0,2
6.2,3.4,5.4,2.3,2
5.9,3.0,5.1,1.8,2
3.3.代码文件
使用红色字体标注的为关键代码
# logistic.py
# Copyright (c) 2016 WU PENG
"""Train and evaluate a logistic-regression classifier on iris data loaded from a CSV file."""
import pandas as pd
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# data.csv: columns x1..x4 are features, y is the target (same data as load_iris()).
print("\nLoading data...")
data = pd.read_csv('./data.csv', header=0)
feature = ['x1', 'x2', 'x3', 'x4']
target = ['y']

# train_test_split() splits the data (75% train / 25% test by default).
print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(data[feature], data[target])
# data[target] is a single-column DataFrame, so the split targets come back as
# DataFrames. DataFrame has no ravel(); flatten the underlying numpy array via
# .values.ravel() to a 1-D vector, otherwise fit() raises DataConversionWarning.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()
print("Train data shape: ", X_train.shape, y_train.shape)
print("Test data shape: ", X_test.shape, y_test.shape)

print("\nTraining...")
clf = LogisticRegression()
clf.fit(X_train, y_train)
print("intercept: \n", clf.intercept_)
print("coef: \n", clf.coef_)

# Evaluate on the held-out test split and show per-class precision/recall/F1.
print("\nTesting...")
y_true = y_test
y_pred = clf.predict(X_test)
print("true: ", y_true)
print("pred: ", y_pred)
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_true, y_pred, target_names=target_names))

print("\nPredict...")
# predict() expects a 2-D array-like: one row per sample. Passing the bare
# 1-D list raises ValueError in modern scikit-learn.
X_pred = [[5, 3, 5, 2.5]]
y_pred = clf.predict(X_pred)
print(X_pred, " = ", y_pred)