Supervised Learning Models: Classification

1. Logistic Regression

# Import the pandas and numpy packages
import pandas as pd
import numpy as np

# Build the list of feature names
column_names = ['sample code number','clump thickness','uniformity of cell size','uniformity of cell shape',
                'marginal adhesion','single epithelial cell size','bare nuclei','bland chromatin','normal nucleoli',
                'mitoses','class']

'''
Read the specified dataset from the Internet with pandas.read_csv
'''
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
                   , names = column_names)

# Print the number of rows and columns of the complete dataset
print(data.shape)

# Replace '?' with the standard missing-value marker
data = data.replace(to_replace='?', value=np.nan)
# Drop every row that has a missing value in any column
data = data.dropna(how='any')
# Print the shape of the cleaned data
print(data.shape)

'''
Prepare the training and test data; conventionally, 25% of the data is held out
as the test set and the remaining 75% is used for training
'''
# Import train_test_split for splitting the data (in scikit-learn 0.18+ it lives
# in sklearn.model_selection rather than the removed sklearn.cross_validation)
from sklearn.model_selection import train_test_split
# Randomly sample 25% of the data for testing; the remaining 75% forms the training set
X_train, X_test, y_train, y_test = train_test_split(data[column_names[1:10]], data[column_names[10]], test_size=0.25, random_state=33)

# Inspect the number and class distribution of the training samples
print(y_train.value_counts())
# Inspect the number and class distribution of the test samples
print(y_test.value_counts())


# Import StandardScaler from sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
# Import LogisticRegression and SGDClassifier from sklearn.linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

'''
Standardize the data so that every feature has zero mean and unit variance;
this keeps features with large values from dominating the prediction
'''
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
# Scale the test data with the statistics learned from the training data
X_test = ss.transform(X_test)

'''
Initialize LogisticRegression and SGDClassifier
'''
lr = LogisticRegression()
sgdc = SGDClassifier()
# Call fit on LogisticRegression to train the model parameters
lr.fit(X_train, y_train)
# Predict on X_test with the trained model lr; store the result in lr_y_predict
lr_y_predict = lr.predict(X_test)
# Call fit on SGDClassifier to train the model parameters
sgdc.fit(X_train, y_train)
# Predict on X_test with the trained model sgdc; store the result in sgdc_y_predict
sgdc_y_predict = sgdc.predict(X_test)

'''
Analyze the performance of the two linear classifiers on the prediction task
'''
# Import the classification_report module from sklearn.metrics
from sklearn.metrics import classification_report

# Use the score method built into LogisticRegression to get its accuracy on the test set
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
# Use classification_report to get the remaining three metrics for LogisticRegression
print(classification_report(y_test, lr_y_predict, target_names=['benign', 'malignant']))

# Use the score method built into SGDClassifier to get its accuracy on the test set
print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
# Use classification_report to get the remaining three metrics for the SGD classifier
print(classification_report(y_test, sgdc_y_predict, target_names=['benign', 'malignant']))
(699, 11)
(683, 11)
2    344
4    168
Name: class, dtype: int64
2    100
4     71
Name: class, dtype: int64
Accuracy of LR Classifier: 0.970760233918
             precision    recall  f1-score   support

     benign       0.96      0.99      0.98       100
  malignant       0.99      0.94      0.96        71

avg / total       0.97      0.97      0.97       171

Accuracy of SGD Classifier: 0.976608187135
             precision    recall  f1-score   support

     benign       0.97      0.99      0.98       100
  malignant       0.99      0.96      0.97        71

avg / total       0.98      0.98      0.98       171
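
The precision, recall, and F1 numbers that classification_report prints can be reproduced by hand from the confusion matrix. A minimal sketch, assuming the y_test and lr_y_predict variables from above (class label 4, malignant, is treated as the positive class):

from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predictions; label order: [2 (benign), 4 (malignant)]
cm = confusion_matrix(y_test, lr_y_predict, labels=[2, 4])
tn, fp, fn, tp = cm.ravel()

# Metrics for the positive (malignant) class
precision = tp / (tp + fp)  # of the samples predicted malignant, the share that truly are
recall = tp / (tp + fn)     # of the truly malignant samples, the share that were found
f1 = 2 * precision * recall / (precision + recall)
print(precision, recall, f1)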

2. Support Vector Classification

# Import the handwritten digits loader from sklearn.datasets
from sklearn.datasets import load_digits
# Store the loaded image data in the variable digit
digit = load_digits()
# Inspect the scale and feature dimensionality of the data
print(digit.data.shape)

'''
Splitting the handwritten digits data
'''
# Import train_test_split from sklearn.model_selection for data splitting
from sklearn.model_selection import train_test_split
# Randomly select 75% of the data for training and the remaining 25% for testing
X_train, X_test, y_train, y_test = train_test_split(digit.data, digit.target, test_size=0.25, random_state=33)
# Check the sizes of the training and test sets
print(y_train.shape)
print(y_test.shape)

'''
Recognizing handwritten digit images with a support vector classifier
'''
# Import the standardization module from sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
# Import LinearSVC, the support vector classifier built on a linear assumption, from sklearn.svm
from sklearn.svm import LinearSVC

'''
Standardize the training and test features
'''
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

'''
Initialize and train the LinearSVC support vector classifier
'''
# Initialize the linear support vector classifier LinearSVC
lsvc = LinearSVC()
# Train the model
lsvc.fit(X_train, y_train)
# Predict the digit classes of the test samples with the trained model; store the result in y_predict
y_predict = lsvc.predict(X_test)

'''
Measure accuracy with the model's built-in score method
'''
print('The Accuracy of Linear SVC is', lsvc.score(X_test, y_test))
# Again use classification_report from sklearn.metrics for a more detailed analysis
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict, target_names=digit.target_names.astype(str)))
(1797, 64)
(1347,)
(450,)
The Accuracy of Linear SVC is 0.953333333333
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        35
          1       0.96      0.98      0.97        54
          2       0.98      1.00      0.99        44
          3       0.93      0.93      0.93        46
          4       0.97      1.00      0.99        35
          5       0.94      0.94      0.94        48
          6       0.96      0.98      0.97        51
          7       0.92      1.00      0.96        35
          8       0.98      0.84      0.91        58
          9       0.95      0.91      0.93        44

avg / total       0.95      0.95      0.95       450
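
LinearSVC is limited to linear decision boundaries. A hedged variation worth trying (not part of the original walkthrough) is sklearn.svm.SVC with a kernel, which can separate classes that are not linearly separable. This sketch reuses the standardized X_train/X_test from above and leaves C and gamma at their defaults:

from sklearn.svm import SVC

# An RBF-kernel support vector classifier as a drop-in alternative to LinearSVC
svc_rbf = SVC(kernel='rbf')
svc_rbf.fit(X_train, y_train)
print('The Accuracy of RBF-kernel SVC is', svc_rbf.score(X_test, y_test))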

3. Naive Bayes

# Import the news fetcher fetch_20newsgroups from sklearn.datasets
from sklearn.datasets import fetch_20newsgroups
# Unlike the bundled datasets used earlier, fetch_20newsgroups downloads the data from the Internet on demand
news = fetch_20newsgroups(subset='all')

# Inspect the scale and a sample of the data
print(len(news.data))
print(news.data[1])

'''
Splitting the 20-class news text data
'''
# Import train_test_split from sklearn.model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=33)

'''
Predicting news categories with a naive Bayes classifier
'''
# Import CountVectorizer from sklearn.feature_extraction.text to convert raw text into feature vectors
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

# Import the naive Bayes model from sklearn.naive_bayes
from sklearn.naive_bayes import MultinomialNB
# Initialize the naive Bayes model with its default configuration
mnb = MultinomialNB()
# Estimate the model parameters from the training data
mnb.fit(X_train, y_train)
# Predict the categories of the test samples; store the result in y_predict
y_predict = mnb.predict(X_test)


'''
Evaluate the naive Bayes classifier's performance on the news text data
'''
# Import classification_report from sklearn.metrics for a detailed classification report
from sklearn.metrics import classification_report
print('The accuracy of naive bayes classifier is', mnb.score(X_test, y_test))
print(classification_report(y_test, y_predict, target_names=news.target_names))
18846
From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson)
Subject: Which high-performance VLB video card?
Summary: Seek recommendations for VLB video card
Nntp-Posting-Host: midway.ecn.uoknor.edu
Organization: Engineering Computer Network, University of Oklahoma, Norman, OK, USA
Keywords: orchid, stealth, vlb
Lines: 21

  My brother is in the market for a high-performance video card that supports
VESA local bus with 1-2MB RAM.  Does anyone have suggestions/ideas on:

  - Diamond Stealth Pro Local Bus

  - Orchid Farenheit 1280

  - ATI Graphics Ultra Pro

  - Any other high-performance VLB card


Please post or email.  Thank you!

  - Matt

-- 
    |  Matthew B. Lawson <------------> (mblawson@essex.ecn.uoknor.edu)  |   
  --+-- "Now I, Nebuchadnezzar, praise and exalt and glorify the King  --+-- 
    |   of heaven, because everything he does is right and all his ways  |   
    |   are just." - Nebuchadnezzar, king of Babylon, 562 B.C.           |   

The accuracy of naive bayes classifier is 0.839770797963
                          precision    recall  f1-score   support

             alt.atheism       0.86      0.86      0.86       201
           comp.graphics       0.59      0.86      0.70       250
 comp.os.ms-windows.misc       0.89      0.10      0.17       248
comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240
   comp.sys.mac.hardware       0.93      0.78      0.85       242
          comp.windows.x       0.82      0.84      0.83       263
            misc.forsale       0.91      0.70      0.79       257
               rec.autos       0.89      0.89      0.89       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.98      0.91      0.95       251
        rec.sport.hockey       0.93      0.99      0.96       233
               sci.crypt       0.86      0.98      0.91       238
         sci.electronics       0.85      0.88      0.86       249
                 sci.med       0.92      0.94      0.93       245
               sci.space       0.89      0.96      0.92       221
  soc.religion.christian       0.78      0.96      0.86       232
      talk.politics.guns       0.88      0.96      0.92       251
   talk.politics.mideast       0.90      0.98      0.94       231
      talk.politics.misc       0.79      0.89      0.84       188
      talk.religion.misc       0.93      0.44      0.60       158

             avg / total       0.86      0.84      0.82      4712
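
CountVectorizer feeds the classifier raw term counts, so very common words carry a lot of weight. A common refinement is tf-idf weighting, which downweights terms that occur in most documents. A minimal sketch, assuming X_train_raw and X_test_raw hold the raw text splits from train_test_split before CountVectorizer was applied (these names are mine; the code above overwrote X_train and X_test with the vectorized versions):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Vectorize the raw news text with tf-idf weights instead of raw counts
tfidf_vec = TfidfVectorizer()
X_train_tfidf = tfidf_vec.fit_transform(X_train_raw)
X_test_tfidf = tfidf_vec.transform(X_test_raw)

mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_train_tfidf, y_train)
print('The accuracy of tf-idf naive bayes is', mnb_tfidf.score(X_test_tfidf, y_test))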

4. KNN

# Inspect the Iris dataset
# Import the iris loader from sklearn.datasets
from sklearn.datasets import load_iris

# Use the loader to read the data into the variable iris
iris = load_iris()
# Check the scale of the data
print(iris.data.shape)

# Read the dataset description; for a machine learning practitioner this is a good habit
print(iris.DESCR)

'''
Splitting the iris dataset
'''
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=33)

'''
Predicting iris classes with KNN
'''
# Import the standardization module from sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
# Import KNeighborsClassifier, the k-nearest neighbors classifier, from sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier

# Standardize the training and test sets
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Use the k-nearest neighbors classifier to predict the classes of the test data; store the result in y_predict
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)
y_predict = knc.predict(X_test)

'''
Evaluate KNN's predictive performance on the iris data
'''
# Use the model's built-in score method to measure accuracy on the test set
print('the accuracy of K-Nearest Neighbor Classifier is', knc.score(X_test, y_test))
# Again use classification_report from sklearn.metrics for a more detailed analysis of the predictions
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict, target_names=iris.target_names))
(150, 4)
Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

    ============== ==== ==== ======= ===== ====================
                    Min  Max   Mean    SD   Class Correlation
    ============== ==== ==== ======= ===== ====================
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)
    ============== ==== ==== ======= ===== ====================

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris datasets.
http://archive.ics.uci.edu/ml/datasets/Iris

The famous Iris database, first used by Sir R.A Fisher

This is perhaps the best known database to be found in the
pattern recognition literature.  Fisher's paper is a classic in the field and
is referenced frequently to this day.  (See Duda & Hart, for example.)  The
data set contains 3 classes of 50 instances each, where each class refers to a
type of iris plant.  One class is linearly separable from the other 2; the
latter are NOT linearly separable from each other.

References
----------
   - Fisher,R.A. "The use of multiple measurements in taxonomic problems"
     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
     Mathematical Statistics" (John Wiley, NY, 1950).
   - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
     Structure and Classification Rule for Recognition in Partially Exposed
     Environments".  IEEE Transactions on Pattern Analysis and Machine
     Intelligence, Vol. PAMI-2, No. 1, 67-71.
   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions
     on Information Theory, May 1972, 431-433.
   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II
     conceptual clustering system finds 3 classes in the data.
   - Many, many more ...

the accuracy of K-Nearest Neighbor Classifier is 0.973214285714
             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00         8
 versicolor       0.73      1.00      0.85        11
  virginica       1.00      0.79      0.88        19

avg / total       0.92      0.89      0.90        38
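
KNeighborsClassifier defaults to k=5 neighbors. Small k makes the model sensitive to noise, while large k over-smooths the decision boundary, and on a dataset this small a scan over candidates is cheap. A hedged sketch reusing the standardized X_train/X_test from above (the candidate values are my own choice):

from sklearn.neighbors import KNeighborsClassifier

# Try several neighborhood sizes and report test accuracy for each
for k in [1, 3, 5, 7, 9, 11]:
    knc_k = KNeighborsClassifier(n_neighbors=k)
    knc_k.fit(X_train, y_train)
    print('k =', k, 'test accuracy =', knc_k.score(X_test, y_test))

In practice k should be chosen by cross-validation on the training data rather than by comparing test scores directly.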

5. Decision Tree

# Inspect the Titanic passenger data
import pandas as pd
# Use pandas' read_csv to fetch the Titanic passenger data directly from the Internet
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# Look at the first few rows: the columns are of mixed kinds, with numeric values, categorical values, and even missing data
print(titanic.head())
# pandas loads the data into its DataFrame format (a two-dimensional table); call info() directly to see its summary statistics
print(titanic.info())

# Feature selection is a crucial step; based on background knowledge of the disaster,
# sex, age, and pclass are all likely to be key factors in who survived
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']
# Inspect the currently selected features
print(X.info())

'''
Based on the output above, we set up the following data-processing tasks:
(1) the age column has only 633 values and must be completed; we fill it with the mean
(2) the sex and pclass columns are categorical and must be converted to numeric (0/1) features
'''
# First complete the age column; filling with the mean (or the median) is the strategy
# that distorts the model the least
X['age'].fillna(X['age'].mean(), inplace=True)
# Re-inspect the completed data
print(X.info())
print(X['age'])
print(X['sex'])
print(X['pclass'])

'''
Split the data and extract features
'''
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

# Use the feature transformer from sklearn.feature_extraction
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
# After the transformation, every categorical feature is split out into its own set of
# 0/1 columns, while numeric features are left unchanged
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
print(vec.feature_names_)
# The test features must be transformed in the same way
X_test = vec.transform(X_test.to_dict(orient='records'))
print(X_train)
print('training samples is', X_train.shape)
print('test samples is', X_test.shape)
'''
Import the decision tree classifier, fit it on the training split,
and use the trained model to predict on the test features
'''
# Import the decision tree classifier from sklearn.tree
from sklearn.tree import DecisionTreeClassifier
# Initialize the decision tree classifier with its default configuration
dtc = DecisionTreeClassifier()
# Learn the model from the training split
dtc.fit(X_train, y_train)
# Predict on the test features with the trained decision tree
y_predict = dtc.predict(X_test)
print(y_predict)
print(y_test)

'''
Evaluate the decision tree's predictions of Titanic passenger survival
'''
# Import classification_report from sklearn.metrics
from sklearn.metrics import classification_report
# Print the prediction accuracy
print('the accuracy of Decision Tree is', dtc.score(X_test, y_test))
# Print a more detailed classification performance report
print(classification_report(y_test, y_predict, target_names=['died', 'survived']))
   row.names pclass  survived  \
0          1    1st         1   
1          2    1st         0   
2          3    1st         0   
3          4    1st         0   
4          5    1st         1   

                                              name      age     embarked  \
0                     Allen, Miss Elisabeth Walton  29.0000  Southampton   
1                      Allison, Miss Helen Loraine   2.0000  Southampton   
2              Allison, Mr Hudson Joshua Creighton  30.0000  Southampton   
3  Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)  25.0000  Southampton   
4                    Allison, Master Hudson Trevor   0.9167  Southampton   

                         home.dest room      ticket   boat     sex  
0                     St Louis, MO  B-5  24160 L221      2  female  
1  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female  
2  Montreal, PQ / Chesterville, ON  C26         NaN  (135)    male  
3  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female  
4  Montreal, PQ / Chesterville, ON  C22         NaN     11    male  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names    1313 non-null int64
pclass       1313 non-null object
survived     1313 non-null int64
name         1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 112.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       633 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.9+ KB
None


C:\Users\xxz\Anaconda3\lib\site-packages\pandas\core\generic.py:3191: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       1313 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.9+ KB
None
0       29.000000
1        2.000000
2       30.000000
3       25.000000
4        0.916700
5       47.000000
6       63.000000
7       39.000000
8       58.000000
9       71.000000
10      47.000000
11      19.000000
12      31.194181
13      31.194181
14      31.194181
15      50.000000
16      24.000000
17      36.000000
18      37.000000
19      47.000000
20      26.000000
21      25.000000
22      25.000000
23      19.000000
24      28.000000
25      45.000000
26      39.000000
27      30.000000
28      58.000000
29      31.194181
          ...    
1283    31.194181
1284    31.194181
1285    31.194181
1286    31.194181
1287    31.194181
1288    31.194181
1289    31.194181
1290    31.194181
1291    31.194181
1292    31.194181
1293    31.194181
1294    31.194181
1295    31.194181
1296    31.194181
1297    31.194181
1298    31.194181
1299    31.194181
1300    31.194181
1301    31.194181
1302    31.194181
1303    31.194181
1304    31.194181
1305    31.194181
1306    31.194181
1307    31.194181
1308    31.194181
1309    31.194181
1310    31.194181
1311    31.194181
1312    31.194181
Name: age, dtype: float64
0       female
1       female
2         male
3       female
4         male
5         male
6       female
7         male
8       female
9         male
10        male
11      female
12      female
13        male
14        male
15      female
16        male
17        male
18        male
19      female
20        male
21        male
22        male
23      female
24        male
25        male
26        male
27      female
28      female
29        male
         ...  
1283    female
1284      male
1285      male
1286      male
1287      male
1288      male
1289      male
1290      male
1291      male
1292      male
1293    female
1294      male
1295      male
1296      male
1297      male
1298      male
1299      male
1300      male
1301      male
1302      male
1303      male
1304    female
1305      male
1306    female
1307    female
1308      male
1309      male
1310      male
1311    female
1312      male
Name: sex, dtype: object
0       1st
1       1st
2       1st
3       1st
4       1st
5       1st
6       1st
7       1st
8       1st
9       1st
10      1st
11      1st
12      1st
13      1st
14      1st
15      1st
16      1st
17      1st
18      1st
19      1st
20      1st
21      1st
22      1st
23      1st
24      1st
25      1st
26      1st
27      1st
28      1st
29      1st
       ... 
1283    3rd
1284    3rd
1285    3rd
1286    3rd
1287    3rd
1288    3rd
1289    3rd
1290    3rd
1291    3rd
1292    3rd
1293    3rd
1294    3rd
1295    3rd
1296    3rd
1297    3rd
1298    3rd
1299    3rd
1300    3rd
1301    3rd
1302    3rd
1303    3rd
1304    3rd
1305    3rd
1306    3rd
1307    3rd
1308    3rd
1309    3rd
1310    3rd
1311    3rd
1312    3rd
Name: pclass, dtype: object
['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
[[ 31.19418104   0.           0.           1.           0.           1.        ]
 [ 31.19418104   1.           0.           0.           1.           0.        ]
 [ 31.19418104   0.           0.           1.           0.           1.        ]
 ..., 
 [ 12.           0.           1.           0.           1.           0.        ]
 [ 18.           0.           1.           0.           0.           1.        ]
 [ 31.19418104   0.           0.           1.           1.           0.        ]]
training samples is (984, 6)
test samples is (329, 6)
[0 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 1 1 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 1 0
 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 1 1 0 0
 1 1 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0
 1 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0]
386     0
89      1
183     1
746     0
1211    0
384     1
729     1
1076    0
1290    0
385     1
722     1
169     1
275     0
485     0
790     0
159     1
302     0
296     1
1208    0
428     1
745     0
109     1
1300    0
906     1
1291    0
32      1
584     1
585     1
1250    0
44      0
       ..
414     0
241     1
31      1
764     0
932     0
1226    1
657     1
256     0
803     0
677     0
1279    1
396     1
462     1
693     0
927     0
628     1
506     0
371     0
1198    0
232     1
709     0
508     0
282     1
698     0
1049    0
1048    0
106     1
618     0
175     0
937     0
Name: survived, dtype: int64
the accuracy of Decision Tree is 0.781155015198
             precision    recall  f1-score   support

       died       0.91      0.78      0.84       236
   survived       0.58      0.80      0.67        93

avg / total       0.81      0.78      0.79       329
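
To make the DictVectorizer expansion above concrete, here is a tiny standalone sketch (the two toy records are my own, for illustration only) showing how categorical keys become one-hot columns while numeric keys pass through unchanged:

from sklearn.feature_extraction import DictVectorizer

# Two hypothetical passenger records
records = [{'pclass': '1st', 'age': 29.0, 'sex': 'female'},
           {'pclass': '3rd', 'age': 2.0, 'sex': 'male'}]

vec = DictVectorizer(sparse=False)
features = vec.fit_transform(records)
print(vec.feature_names_)
# ['age', 'pclass=1st', 'pclass=3rd', 'sex=female', 'sex=male']
print(features)
# [[ 29.   1.   0.   1.   0.]
#  [  2.   0.   1.   0.   1.]]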

6. Ensemble Classification

import pandas as pd

'''
Read the Titanic passenger data from the Internet and store it in the variable titanic
'''
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

'''
Manually select pclass, age, and sex as the features for predicting passenger survival
'''
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']

'''
Fill missing ages with the mean age of all passengers; this lets training proceed
smoothly while disturbing the prediction task as little as possible
'''
X['age'].fillna(X['age'].mean(), inplace=True)

'''
Split the raw data; 25% of the passenger data is held out for testing
'''
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

'''
Convert the categorical features into feature vectors
'''
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))

'''
Train a single decision tree and run prediction with it
'''
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc_y_pred = dtc.predict(X_test)

'''
Train a random forest classifier (an ensemble model) and run prediction with it
'''
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)

'''
Train gradient boosted decision trees (an ensemble model) and run prediction with them
'''
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

'''
Compare how well the ensemble models predict Titanic passenger survival
'''
# Import classification_report from sklearn.metrics
from sklearn.metrics import classification_report

# Print the single decision tree's test-set accuracy along with its precision, recall, and F1
print('the accuracy of decision tree is', dtc.score(X_test, y_test))
print(classification_report(y_test, dtc_y_pred, target_names=['died', 'survived']))

# Print the random forest classifier's test-set accuracy along with its precision, recall, and F1
print('the accuracy of random forest classifier is', rfc.score(X_test, y_test))
print(classification_report(y_test, rfc_y_pred, target_names=['died', 'survived']))

# Print the gradient boosting classifier's test-set accuracy along with its precision, recall, and F1
print('the accuracy of gradient boosting classifier is', gbc.score(X_test, y_test))
print(classification_report(y_test, gbc_y_pred, target_names=['died', 'survived']))

C:\Users\xxz\Anaconda3\lib\site-packages\pandas\core\generic.py:3191: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


the accuracy of decision tree is 0.781155015198
             precision    recall  f1-score   support

       died       0.91      0.78      0.84       236
   survived       0.58      0.80      0.67        93

avg / total       0.81      0.78      0.79       329

the accuracy of random forest classifier is 0.775075987842
             precision    recall  f1-score   support

       died       0.90      0.77      0.83       236
   survived       0.57      0.78      0.66        93

avg / total       0.81      0.78      0.78       329

the accuracy of gradient boosting classifier is 0.790273556231
             precision    recall  f1-score   support

       died       0.92      0.78      0.84       239
   survived       0.58      0.82      0.68        90

avg / total       0.83      0.79      0.80       329
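
Both ensemble models above run with their default settings. Their most influential knob is usually n_estimators, the number of trees; a hedged sketch that scans a few forest sizes, reusing the X_train/X_test splits from above (the candidate values are my own choice):

from sklearn.ensemble import RandomForestClassifier

# Scan a few forest sizes and compare test accuracy
for n in [10, 50, 100, 200]:
    rfc_n = RandomForestClassifier(n_estimators=n, random_state=33)
    rfc_n.fit(X_train, y_train)
    print('n_estimators =', n, 'test accuracy =', rfc_n.score(X_test, y_test))

As with the KNN sweep earlier, n_estimators should really be chosen by cross-validation on the training data rather than by comparing test scores.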