用Python开始机器学习（7：逻辑回归分类）

1、逻辑函数

sigmoid函数就出现了。这个函数的定义如下：

$$\sigma(x) = \frac{1}{1 + e^{-x}}$$

sigmoid函数具有我们需要的一切优美特性，其定义域为全体实数，值域为(0, 1)，并且在0点处的值为0.5。

3、代码与分析

# -*- coding: utf-8 -*-
from matplotlib import pyplot
import scipy as sp
import numpy as np
from matplotlib import pylab
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import  CountVectorizer
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import time

# Wall-clock start of the run; the script prints time.time() - start_time
# as "time spent" once the evaluation is done.
start_time = time.time()

def plot_pr(auc_score, precision, recall, label=None):
    """Draw a precision/recall (P/R) curve in a new figure and show it.

    Args:
        auc_score: Number shown in the figure title, formatted to two
            decimals.
        precision: Precision values, plotted on the y axis.
        recall: Recall values, plotted on the x axis.
        label: Text appended to the title after the AUC value.
    """
    pylab.figure(num=None, figsize=(6, 5))
    # Both axes are fixed to the [0, 1] range.
    pylab.xlim([0.0, 1.0])
    pylab.ylim([0.0, 1.0])
    pylab.xlabel('Recall')
    pylab.ylabel('Precision')
    pylab.title('P/R (AUC=%0.2f) / %s' % (auc_score, label))
    # Shade the area under the curve, then draw the curve itself on top.
    pylab.fill_between(recall, precision, alpha=0.5)
    pylab.grid(True, linestyle='-', color='0.75')
    pylab.plot(recall, precision, lw=1)
    pylab.show()

# Input data.
# NOTE(review): movie_data and movie_target are not defined anywhere in the
# visible file; presumably they are the review texts and their labels and
# must be loaded before this point -- confirm.
x = movie_data
y = movie_target

# TF-IDF weighted vector space model (binary=False, so features are term
# weights rather than boolean presence flags), English stop words removed.
count_vec = TfidfVectorizer(
    binary=False, decode_error='ignore', stop_words='english')
average = 0
testNum = 10
for i in range(testNum):
    # New split on every round: 80% training, 20% test.
    # NOTE(review): train_test_split is imported from
    # sklearn.cross_validation at the top of the file; that module was
    # removed in scikit-learn 0.20 -- on current versions import it from
    # sklearn.model_selection instead.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2)
    # The vectorizer is fitted on the training texts only; the test texts
    # go through transform() so they reuse the fitted vectorizer.
    x_train = count_vec.fit_transform(x_train)
    x_test = count_vec.transform(x_test)

    # Train the logistic-regression classifier.
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Fraction of test samples predicted correctly in this round.
    p = np.mean(y_pred == y_test)
    print(p)
    average += p

# Precision and recall, computed on the last split only.
# precision_recall_curve needs a continuous score per sample, so use the
# predicted probability of the second class in clf.classes_.
# NOTE(review): assumes that second class is the positive ('pos') label and
# that the labels are 0/1 -- confirm against the data.
answer = clf.predict_proba(x_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, answer)
# The report needs hard class labels; reuse the last round's predictions.
print(classification_report(y_test, y_pred, target_names=['neg', 'pos']))
# The printed label says "precision", but the value is the mean of the
# per-round accuracies accumulated above.
print("average precision:", average / testNum)
print("time spent:", time.time() - start_time)

# Show the actual area under the P/R curve in the title instead of a
# hard-coded 0.5.
plot_pr(auc(recall, precision), precision, recall, "pos")


0.8
0.817857142857
0.775
0.825
0.807142857143
0.789285714286
0.839285714286
0.846428571429
0.764285714286
0.771428571429
             precision    recall  f1-score   support

        neg       0.74      0.80      0.77       132
        pos       0.81      0.74      0.77       148

avg / total       0.77      0.77      0.77       280
average precision: 0.803571428571
time spent: 9.651551961898804

• 广告
• 抄袭
• 版权
• 政治
• 色情
• 无意义
• 其他

120