





LR分类器,即Logistic Regression Classifier。


按照线性加和得到 $z=\theta_0+\theta_1x_1+\theta_2x_2+\dots+\theta_nx_n=\theta^Tx$,这里 $x_1,x_2,\dots,x_n$ 是每个样本的 $n$ 个特征。

按照sigmoid函数的形式求出 $h_\theta(x)=g(z)=\dfrac{1}{1+e^{-z}}$,其中sigmoid函数的定义域为 $(-\infty,+\infty)$,值域为 $(0,1)$,因此最基本的LR分类器适合对两类目标进行分类。













事件发生的概率 $p$ 与不发生的概率 $1-p$ 的比值 $\dfrac{p}{1-p}$ 称为事件的发生比(the odds of experiencing an event),简记为odds。






     logistic回归的假设函数为 $h_\theta(x)=g(\theta^Tx)=\dfrac{1}{1+e^{-\theta^Tx}}$,而线性回归的假设函数只是 $h_\theta(x)=\theta^Tx$。






  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. # Author:ZhengzhengLiu
  4. #乳腺癌分类案例
  5. import sklearn
  6. from sklearn.linear_model import LogisticRegressionCV,LinearRegression
  7. from sklearn.model_selection import train_test_split
  8. from sklearn.preprocessing import StandardScaler
  9. from sklearn.linear_model.coordinate_descent import ConvergenceWarning
  10. import numpy as np
  11. import pandas as pd
  12. import matplotlib as mpl
  13. import matplotlib.pyplot as plt
  14. import warnings
  # Breast-cancer classification with cross-validated logistic regression.
  # Reads the Wisconsin data file, fits the model, persists it, then plots
  # predicted versus actual classes for the held-out split.

  # Make Chinese labels and the minus sign render correctly in matplotlib.
  mpl.rcParams["font.sans-serif"] = [u"SimHei"]
  mpl.rcParams["axes.unicode_minus"] = False
  # Silence convergence warnings emitted while the CV search fits its models.
  warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

  # Load the data. "?" marks a missing value: turn it into NaN and drop every
  # row that contains one.
  path = "datas/breast-cancer-wisconsin.data"
  names = [
      "id", "Clump Thickness", "Uniformity of Cell Size",
      "Uniformity of Cell Shape", "Marginal Adhesion",
      "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin",
      "Normal Nucleoli", "Mitoses", "Class",
  ]
  df = pd.read_csv(path, header=None, names=names)
  datas = df.replace("?", np.nan).dropna(how="any")

  # Columns 1..9 are the features; the last column ("Class") is the label.
  X = datas[names[1:10]]
  Y = datas[names[10]]

  # Hold out 10% of the rows for testing.
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=0)

  # Fit the scaler on the training split only, then apply the SAME transform to
  # the test split right away. The model is trained on standardized features,
  # so every later call that receives X_test (predict_proba, predict, score)
  # must see standardized values too.
  ss = StandardScaler()
  X_train = ss.fit_transform(X_train)
  X_test = ss.transform(X_test)

  # Build and fit the model.
  #   multi_class: "ovr" or "multinomial"; identical for a binary problem.
  #   cv:          number of cross-validation folds.
  #   Cs:          candidate inverse regularization strengths to search.
  #   solver:      optimizer. penalty="l1" requires "liblinear"; with
  #                penalty="l2", "lbfgs" (quasi-Newton), "newton-cg" and
  #                "sag" (stochastic average gradient) are available.
  #   penalty:     regularization type ("l1" or "l2"), used against overfitting.
  #   tol:         stopping tolerance of the optimizer.
  lr = LogisticRegressionCV(multi_class="ovr", fit_intercept=True, Cs=np.logspace(-2, 2, 20),
                            cv=2, penalty="l2", solver="lbfgs", tol=0.01)
  lr.fit(X_train, Y_train)

  # Report training results. For a classifier, score() is the mean accuracy.
  r = lr.score(X_train, Y_train)
  print("R值(准确率):", r)
  print("参数:", lr.coef_)
  print("截距:", lr.intercept_)
  # Share of coefficients that are exactly zero.
  print("稀疏化特征比率:%.2f%%" % (np.mean(lr.coef_.ravel() == 0) * 100))
  print("=========sigmoid函数转化的值,即:概率p=========")
  # Class probabilities for the (already standardized) test split.
  print(lr.predict_proba(X_test))

  # Persist the scaler and the model, then reload them.
  try:
      import joblib  # standalone package used by current scikit-learn
  except ImportError:
      from sklearn.externals import joblib  # vendored copy in old scikit-learn
  joblib.dump(ss, "logistic_ss.model")
  joblib.dump(lr, "logistic_lr.model")
  # Bind the reloaded objects so the prediction below uses the persisted model.
  ss = joblib.load("logistic_ss.model")
  lr = joblib.load("logistic_lr.model")

  # Predict on the test split (X_test was standardized above).
  Y_predict = lr.predict(X_test)

  # Plot predicted against actual classes; the legend value is test accuracy.
  x = range(len(X_test))
  plt.figure(figsize=(14, 7), facecolor="w")
  plt.ylim(0, 6)
  plt.plot(x, Y_test, "ro", markersize=8, zorder=3, label=u"真实值")
  plt.plot(x, Y_predict, "go", markersize=14, zorder=2,
           label=u"预测值,$R^2$=%.3f" % lr.score(X_test, Y_test))
  plt.legend(loc="upper left")
  plt.xlabel(u"数据编号", fontsize=18)
  plt.ylabel(u"乳癌类型", fontsize=18)
  plt.title(u"Logistic算法对数据进行分类", fontsize=20)
  plt.savefig("Logistic算法对数据进行分类.png")
  plt.show()

  print("=============Y_test==============")
  print(Y_test.values)
  print("============Y_predict============")
  print(Y_predict)
  80. #运行结果(注意:下面的概率矩阵是在 X_test 尚未标准化时调用 predict_proba 得到的,所以每一行都接近 [0, 1];它不代表模型在标准化数据上的真实预测概率):
  81. R值(准确率): 0.970684039088
  82. 参数: [[ 1.3926311 0.17397478 0.65749877 0.8929026 0.36507062 1.36092964
  83. 0.91444624 0.63198866 0.75459326]]
  84. 截距: [ -1.02717163]
  85. 稀疏化特征比率: 0.00%
  86. =========sigmoid函数转化的值,即:概率p=========
  87. [[ 6.61838068e-06 9.99993382e-01]
  88. [ 3.78575185e-05 9.99962142e-01]
  89. [ 2.44249065e-15 1.00000000e+00]
  90. [ 0.00000000e+00 1.00000000e+00]
  91. [ 1.52850624e-03 9.98471494e-01]
  92. [ 6.67061684e-05 9.99933294e-01]
  93. [ 6.75536843e-07 9.99999324e-01]
  94. [ 0.00000000e+00 1.00000000e+00]
  95. [ 2.43117004e-05 9.99975688e-01]
  96. [ 6.13092842e-04 9.99386907e-01]
  97. [ 0.00000000e+00 1.00000000e+00]
  98. [ 2.00330728e-06 9.99997997e-01]
  99. [ 0.00000000e+00 1.00000000e+00]
  100. [ 3.78575185e-05 9.99962142e-01]
  101. [ 4.65824155e-08 9.99999953e-01]
  102. [ 5.47788703e-10 9.99999999e-01]
  103. [ 0.00000000e+00 1.00000000e+00]
  104. [ 0.00000000e+00 1.00000000e+00]
  105. [ 0.00000000e+00 1.00000000e+00]
  106. [ 6.27260778e-07 9.99999373e-01]
  107. [ 3.78575185e-05 9.99962142e-01]
  108. [ 3.85098865e-06 9.99996149e-01]
  109. [ 1.80189197e-12 1.00000000e+00]
  110. [ 9.44640398e-05 9.99905536e-01]
  111. [ 0.00000000e+00 1.00000000e+00]
  112. [ 0.00000000e+00 1.00000000e+00]
  113. [ 4.11688915e-06 9.99995883e-01]
  114. [ 1.85886872e-05 9.99981411e-01]
  115. [ 5.83016713e-06 9.99994170e-01]
  116. [ 0.00000000e+00 1.00000000e+00]
  117. [ 1.52850624e-03 9.98471494e-01]
  118. [ 0.00000000e+00 1.00000000e+00]
  119. [ 0.00000000e+00 1.00000000e+00]
  120. [ 1.51713085e-05 9.99984829e-01]
  121. [ 2.34685008e-05 9.99976531e-01]
  122. [ 1.51713085e-05 9.99984829e-01]
  123. [ 0.00000000e+00 1.00000000e+00]
  124. [ 0.00000000e+00 1.00000000e+00]
  125. [ 2.34685008e-05 9.99976531e-01]
  126. [ 0.00000000e+00 1.00000000e+00]
  127. [ 9.97563915e-07 9.99999002e-01]
  128. [ 1.70686321e-07 9.99999829e-01]
  129. [ 1.38382134e-04 9.99861618e-01]
  130. [ 1.36080718e-04 9.99863919e-01]
  131. [ 1.52850624e-03 9.98471494e-01]
  132. [ 1.68154251e-05 9.99983185e-01]
  133. [ 6.66097483e-04 9.99333903e-01]
  134. [ 0.00000000e+00 1.00000000e+00]
  135. [ 9.77502258e-07 9.99999022e-01]
  136. [ 5.83016713e-06 9.99994170e-01]
  137. [ 0.00000000e+00 1.00000000e+00]
  138. [ 4.09496721e-06 9.99995905e-01]
  139. [ 0.00000000e+00 1.00000000e+00]
  140. [ 1.37819117e-06 9.99998622e-01]
  141. [ 6.27260778e-07 9.99999373e-01]
  142. [ 4.52734741e-07 9.99999547e-01]
  143. [ 0.00000000e+00 1.00000000e+00]
  144. [ 8.88178420e-16 1.00000000e+00]
  145. [ 1.06976766e-08 9.99999989e-01]
  146. [ 0.00000000e+00 1.00000000e+00]
  147. [ 2.45780192e-04 9.99754220e-01]
  148. [ 3.92389040e-04 9.99607611e-01]
  149. [ 6.10681985e-05 9.99938932e-01]
  150. [ 9.44640398e-05 9.99905536e-01]
  151. [ 1.51713085e-05 9.99984829e-01]
  152. [ 2.45780192e-04 9.99754220e-01]
  153. [ 2.45780192e-04 9.99754220e-01]
  154. [ 1.51713085e-05 9.99984829e-01]
  155. [ 0.00000000e+00 1.00000000e+00]]
  156. =============Y_test==============
  157. [ 2 2 4 4 2 2 2 4 2 2 4 2 4 2 2 2 4 4 4 2 2 2 4 2 4 4 2 2 2 4 2 4 4 2 2 2 4
  158. 4 2 4 2 2 2 2 2 2 2 4 2 2 4 2 4 2 2 2 4 2 2 4 2 2 2 2 2 2 2 2 4]
  159. ============Y_predict============
  160. [ 2 2 4 4 2 2 2 4 2 2 4 2 4 2 2 2 4 4 4 2 2 2 4 2 4 4 2 2 2 4 2 4 4 2 2 2 4
  161. 4 2 4 2 2 2 2 2 2 2 4 2 2 4 2 4 2 2 2 4 4 2 4 2 2 2 2 2 2 2 2 4]




  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. # Author:ZhengzhengLiu
  4. #葡萄酒质量预测模型
  5. import numpy as np
  6. import matplotlib as mpl
  7. import matplotlib.pyplot as plt
  8. import pandas as pd
  9. import warnings
  10. import sklearn
  11. from sklearn.linear_model import LogisticRegressionCV
  12. from sklearn.linear_model.coordinate_descent import ConvergenceWarning
  13. from sklearn.model_selection import train_test_split
  14. from sklearn.preprocessing import StandardScaler
  15. from sklearn.preprocessing import MinMaxScaler
  16. from sklearn.preprocessing import label_binarize
  17. from sklearn import metrics
  # Wine-quality prediction with multinomial logistic regression.
  # Merges the red and white wine files, fits the model, and plots predicted
  # versus actual quality for the held-out split.

  # Make Chinese labels and the minus sign render correctly in matplotlib.
  mpl.rcParams["font.sans-serif"] = [u"simHei"]
  mpl.rcParams["axes.unicode_minus"] = False
  # Silence convergence warnings emitted while the CV search fits its models.
  warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

  # Load both files and tag each row with its wine type (1 = red, 2 = white).
  path1 = "datas/winequality-red.csv"
  df1 = pd.read_csv(path1, sep=";")
  df1["type"] = 1
  path2 = "datas/winequality-white.csv"
  df2 = pd.read_csv(path2, sep=";")
  df2["type"] = 2
  df = pd.concat([df1, df2], axis=0)

  # Feature columns and the target column.
  names = [
      "fixed acidity", "volatile acidity", "citric acid",
      "residual sugar", "chlorides", "free sulfur dioxide",
      "total sulfur dioxide", "density", "pH", "sulphates",
      "alcohol", "type",
  ]
  quality = "quality"

  # Treat "?" as missing and drop every row that contains a missing value.
  new_df = df.replace("?", np.nan)
  datas = new_df.dropna(how="any")
  print("原始数据条数:%d;异常数据处理后数据条数:%d;异常数据条数:%d" % (len(df), len(datas), len(df) - len(datas)))

  # Separate features from the target and hold out 25% of the rows.
  X = datas[names]
  Y = datas[quality]
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
  print("训练数据条数:%d;数据特征个数:%d;测试数据条数:%d" % (X_train.shape[0], X_train.shape[1], X_test.shape[0]))

  # Min-max scale the features; the scaler is fitted on the training split only.
  mms = MinMaxScaler()
  X_train = mms.fit_transform(X_train)

  # Build and fit the model; Cs is the grid of inverse regularization strengths.
  lr = LogisticRegressionCV(fit_intercept=True, Cs=np.logspace(-5, 1, 100),
                            multi_class="multinomial", penalty="l2", solver="lbfgs")
  lr.fit(X_train, Y_train)

  # Report training results. For a classifier, score() is the mean accuracy.
  r = lr.score(X_train, Y_train)
  print("R值:", r)
  # Share of coefficients that are exactly zero.
  print("特征稀疏化比率:%.2f%%" % (np.mean(lr.coef_.ravel() == 0) * 100))
  print("参数:", lr.coef_)
  print("截距:", lr.intercept_)

  # Apply the training-set scaling to the test split, then predict.
  X_test = mms.transform(X_test)
  Y_predict = lr.predict(X_test)

  # Plot predicted against actual quality. The points shown are test-set
  # predictions, so the legend reports the test score, not the training score.
  x_len = range(len(X_test))
  plt.figure(figsize=(14, 7), facecolor="w")
  plt.ylim(-1, 11)
  plt.plot(x_len, Y_test, "ro", markersize=8, zorder=3, label=u"真实值")
  plt.plot(x_len, Y_predict, "go", markersize=12, zorder=2,
           label=u"预测值,$R^2$=%.3f" % lr.score(X_test, Y_test))
  plt.legend(loc="upper left")
  plt.xlabel(u"数据编号", fontsize=18)
  plt.ylabel(u"葡萄酒质量", fontsize=18)
  plt.title(u"葡萄酒质量预测统计", fontsize=20)
  plt.savefig("葡萄酒质量预测统计.png")
  plt.show()
  75. #运行结果:
  76. 原始数据条数: 6497;异常数据处理后数据条数: 6497;异常数据条数: 0
  77. 训练数据条数: 4872;数据特征个数: 12;测试数据条数: 1625
  78. R值: 0.549466338259
  79. 特征稀疏化比率: 0.00%
  80. 参数: [[ 0.97934119 2.16608604 -0.41710039 -0.49330657 0.90621136 1.44813439
  81. 0.75463562 0.2311527 0.01015772 -0.69598672 -0.71473577 -0.2907567 ]
  82. [ 0.62487587 5.11612885 -0.38168837 -2.16145905 1.21149753 -3.71928146
  83. -1.45623362 1.34125165 0.33725355 -0.86655787 -2.7469681 2.02850838]
  84. [ -1.73828753 1.96024965 0.48775556 -1.91223567 0.64365084 -1.67821019
  85. 2.20322661 1.49086179 -1.36192671 -2.2337436 -5.01452059 -0.75501299]
  86. [ -1.19975858 -2.60860814 -0.34557812 0.17579494 -0.04388969 0.81453743
  87. -0.28250319 0.51716692 -0.67756552 0.18480087 0.01838834 -0.71392084]
  88. [ 1.15641271 -4.6636028 -0.30902483 2.21225522 -2.00298042 1.66691445
  89. -1.02831849 -2.15017982 0.80529532 2.68270545 3.36326129 -0.73635195]
  90. [ -0.07892353 -1.82724304 0.69405191 2.07681409 -0.6247279 1.49244742
  91. -0.16115782 -1.3671237 0.72694885 1.06878382 4.68718155 0.04669067]
  92. [ 0.25633987 -0.14301056 0.27158425 0.10213705 -0.08976172 -0.02454203
  93. -0.02964911 -0.06312954 0.15983679 -0.14000195 0.40739327 0.42084343]]
  94. 截距: [ -2.34176729 -1.1649153 4.91027564 4.3206539 1.30164164 -2.25841567
  95. -4.76747291]






(4)知识点——所涉及到的几种 sklearn 的二值化编码函数:

OneHotEncoder(), LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()



  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. # Author:ZhengzhengLiu
  4. #分类综合问题——鸢尾花分类案例(ROC/AUC)
  5. import numpy as np
  6. import matplotlib as mpl
  7. import matplotlib.pyplot as plt
  8. import pandas as pd
  9. import warnings
  10. import sklearn
  11. from sklearn.linear_model import LogisticRegressionCV
  12. from sklearn.linear_model.coordinate_descent import ConvergenceWarning
  13. from sklearn.model_selection import train_test_split
  14. from sklearn.preprocessing import StandardScaler
  15. from sklearn.neighbors import KNeighborsClassifier
  16. from sklearn.preprocessing import label_binarize
  17. from sklearn import metrics
  # Plot configuration: a font with Chinese glyphs and a plain minus sign.
  mpl.rcParams.update({
      "font.sans-serif": [u"simHei"],
      "axes.unicode_minus": False,
  })
  # Convergence warnings from the model search are not of interest here.
  warnings.filterwarnings(category=ConvergenceWarning, action="ignore")

  # Read the iris file; it has no header row, so column names are supplied.
  names = ["sepal length", "sepal width", "petal length", "petal width", "cla"]
  path = "datas/iris.data"
  df = pd.read_csv(path, names=names, header=None)
  # Show how many rows each class has, followed by the first rows.
  class_counts = df["cla"].value_counts()
  print(class_counts)
  print(df.head())
  # Encoder: turn one raw record into a list of numbers.
  def parseRecord(record):
      """Return the numeric form of one iris record.

      ``record`` is walked in parallel with the module-level ``names`` list.
      The value paired with the "cla" column is mapped to 1, 2 or 3 for
      "Iris-setosa", "Iris-versicolor" and "Iris-virginica"; any other value
      becomes NaN. Every other value is converted with ``float``.
      """
      species_codes = {
          "Iris-setosa": 1,
          "Iris-versicolor": 2,
          "Iris-virginica": 3,
      }
      return [
          species_codes.get(value, np.nan) if column == "cla" else float(value)
          for column, value in zip(names, record)
      ]
  # Convert every row to numbers. Each converted row is wrapped in a Series
  # indexed by the column names so that apply() expands it back into a
  # DataFrame; a bare list comes back as a single Series of lists on
  # pandas >= 0.23, which would break the column selection below.
  datas = df.apply(lambda r: pd.Series(parseRecord(r), index=names), axis=1)
  print(datas.head())
  # Drop rows whose class label could not be encoded (NaN).
  datas = datas.dropna(how="any")

  # Features are all columns but the last; the label is the last column.
  X = datas[names[0:-1]]
  Y = datas[names[-1]]

  # Hold out 40% of the rows for testing.
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
  print("原始数据条数:%d;训练数据条数:%d;特征个数:%d;测试样本条数:%d" % (len(X), len(X_train), X_train.shape[1], len(X_test)))

  # Standardize: fit on the training split, apply the same transform to both.
  ss = StandardScaler()
  X_train = ss.fit_transform(X_train)
  X_test = ss.transform(X_test)

  # Build and fit the multinomial model; Cs is the grid of inverse
  # regularization strengths searched with 3-fold cross-validation.
  lr = LogisticRegressionCV(Cs=np.logspace(-4, 1, 50), cv=3, fit_intercept=True, penalty="l2",
                            solver="lbfgs", tol=0.01, multi_class="multinomial")
  lr.fit(X_train, Y_train)

  # Evaluate.
  # One-hot encode the test labels so each class forms a binary problem.
  y_test_hot = label_binarize(Y_test, classes=(1, 2, 3))
  # Per-class decision scores (confidence values, not a loss).
  lr_y_score = lr.decision_function(X_test)
  # ROC over all (sample, class) pairs flattened together; lr_threasholds
  # holds the decision thresholds.
  lr_fpr, lr_tpr, lr_threasholds = metrics.roc_curve(y_test_hot.ravel(), lr_y_score.ravel())
  # Area under that ROC curve.
  lr_auc = metrics.auc(lr_fpr, lr_tpr)
  print("Logistic算法R值:", lr.score(X_train, Y_train))
  print("Logistic算法AUC值:", lr_auc)

  # Predict the test split.
  lr_y_predict = lr.predict(X_test)

  # Figure 1: ROC curve with the chance diagonal.
  plt.figure(figsize=(8, 6), facecolor="w")
  plt.plot(lr_fpr, lr_tpr, c="r", lw=2, label=u"Logistic算法,AUC=%.3f" % lr_auc)
  plt.plot((0, 1), (0, 1), c="#a0a0a0", lw=2, ls="--")
  plt.xlim(-0.01, 1.02)
  plt.ylim(-0.01, 1.02)
  plt.xticks(np.arange(0, 1.1, 0.1))
  plt.yticks(np.arange(0, 1.1, 0.1))
  plt.xlabel("False Positive Rate(FPR)", fontsize=16)
  plt.ylabel("True Positive Rate(TPR)", fontsize=16)
  # The on/off flag is passed positionally: its keyword was renamed from
  # `b` to `visible` in newer matplotlib, so the positional form is portable.
  plt.grid(True, ls=":")
  plt.legend(loc="lower right", fancybox=True, framealpha=0.8, fontsize=12)
  plt.title(u"鸢尾花数据Logistic算法的ROC/AUC", fontsize=18)
  plt.savefig("鸢尾花数据Logistic算法的ROC和AUC.png")
  plt.show()

  # Figure 2: predicted against actual class; the legend value is test accuracy.
  len_x_test = range(len(X_test))
  plt.figure(figsize=(12, 9), facecolor="w")
  plt.ylim(0.5, 3.5)
  plt.plot(len_x_test, Y_test, "ro", markersize=6, zorder=3, label=u"真实值")
  plt.plot(len_x_test, lr_y_predict, "go", markersize=10, zorder=2,
           label=u"Logis算法预测值,$R^2=%.3f$" % lr.score(X_test, Y_test))
  plt.legend(loc="lower right")
  plt.xlabel(u"数据编号", fontsize=18)
  plt.ylabel(u"种类", fontsize=18)
  plt.title(u"鸢尾花数据分类", fontsize=20)
  plt.savefig("鸢尾花数据分类.png")
  plt.show()
  107. #运行结果:
  108. Iris-versicolor 50
  109. Iris-setosa 50
  110. Iris-virginica 50
  111. Name: cla, dtype: int64
  112. sepal length sepal width petal length petal width cla
  113. 0 5.1 3.5 1.4 0.2 Iris-setosa
  114. 1 4.9 3.0 1.4 0.2 Iris-setosa
  115. 2 4.7 3.2 1.3 0.2 Iris-setosa
  116. 3 4.6 3.1 1.5 0.2 Iris-setosa
  117. 4 5.0 3.6 1.4 0.2 Iris-setosa
  118. sepal length sepal width petal length petal width cla
  119. 0 5.1 3.5 1.4 0.2 1.0
  120. 1 4.9 3.0 1.4 0.2 1.0
  121. 2 4.7 3.2 1.3 0.2 1.0
  122. 3 4.6 3.1 1.5 0.2 1.0
  123. 4 5.0 3.6 1.4 0.2 1.0
  124. 原始数据条数: 150;训练数据条数: 90;特征个数: 4;测试样本条数: 60
  125. Logistic算法R值: 0.977777777778
  126. Logistic算法AUC值: 0.926944444444

