python pandas 基本使用,sklearn的10种分类器实践

由于最近代码任务不多,就想把pandas的日常使用和sklearn的建模使用更加规范和熟练,因此就以泰坦尼克的相关数据集为测试数据进行分析。

数据集在 https://codeload.github.com/fayduan/Kaggle_Titanic/zip/master


#!/usr/bin/python
# -*- coding: utf-8 -*-
__author__ = 'Wind'
"""Wish No Bug"""
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
# sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score lives in sklearn.model_selection since 0.18.
from sklearn.model_selection import cross_val_score

os.chdir('C:/Users/CC/PycharmProjects/work/titanc')  # set the working directory
data = pd.read_csv('train.csv')  # read the training data
data.keys()  # column labels (result discarded; kept from the original demo)
# Overview: dtypes and non-null counts (info() prints itself and returns None)
print(data.info())
# First rows
print(data.head())
# Last rows
print(data.tail())
# Row index
print(data.index)
# Column labels
print(data.columns)
# Basic descriptive statistics
print(data.describe())

# Sort by Pclass then Age, descending, with NaNs placed first
temp_data = data.sort_values(by=['Pclass', 'Age'], na_position='first', ascending=False)
print(temp_data.head())

# Sort by the row index, descending
temp_data = data.sort_index(axis=0, ascending=False)
print(temp_data.head())

# Transpose the frame
temp_data = data.T
print(temp_data.head())

# Select a column
print(data['Age'].head())
# Select rows by slice
print(data[1:5])

# loc indexes by label only
# pick row label 1, columns Age and Sex
print(data.loc[1, ['Age', 'Sex']])

# iloc indexes by integer position (array-like)
# row 1, columns 1..2
print(data.iloc[1, 1:3])
# whole column by position
print(data.iloc[:, 1])
# specific rows and columns by position
print(data.iloc[[1, 3], [2, 3]])

# Fetch a single scalar value (iat is the fast scalar accessor)
print(data.iat[1, 1])
print(data.iloc[1, 1])

# Boolean indexing
print(data[data['Pclass'] == 1].iloc[1])

# Boolean selection with isin
print(data[data['Pclass'].isin([1, 3])].head())

# Replace a whole column's values (on a copy, so `data` stays intact)
temp_data = data.copy()
temp_data.loc[:, 'Age'] = np.array([1] * len(data))
print(temp_data.loc[1])

# Detect missing values (boolean frame)
print(temp_data.isnull())
# Drop any row that contains a missing value
print(temp_data.dropna(axis=0, how='any').info())
# Fill missing values; inplace=False (the default) leaves the original untouched.
# dropna's `thresh` parameter keeps only rows with at least that many non-NA values.
print(temp_data.fillna(value=5))

# Column means; numeric_only=True matches the old silent skip of string
# columns (pandas >= 2.0 raises TypeError on mixed dtypes without it)
print(data.mean(axis=0, numeric_only=True))

# apply works column by column
print(data.apply(np.max))

# Histogram of Age (print shows the matplotlib Axes repr, as the original did)
print(data[['Age']].plot.hist())
plt.show()

# Bar chart of the Pclass x Survived cross tabulation
temp_data = pd.crosstab(data['Pclass'], data['Survived'])
temp_data.plot.bar()
plt.show()

# Box plot of Age
data['Age'].plot.box()
plt.show()

# Concatenate several row slices back together
pieces = [data[:3], data[5:10], data[11:20]]
temp_data = pd.concat(pieces)
print(temp_data)

# Append one row; DataFrame.append was removed in pandas 2.0, so
# concatenate the row as a one-row frame instead
temp_row = temp_data.iloc[1]
print(pd.concat([temp_data, temp_row.to_frame().T], ignore_index=True))

# Group-by aggregations
print(data.groupby(['Pclass']).sum())
print(data.groupby(['Pclass', 'Survived']).sum())

# Pivot table: mean Age by Survived x Pclass
print(pd.pivot_table(data=data, values='Age', index='Survived', columns='Pclass'))

# Write out with data.to_csv(file)

# Modeling with 10 sklearn classifiers.
# Drop the columns with many missing values, then drop remaining NaN rows.
print(data.info())
data_1 = data.drop(['Age', 'Cabin'], axis=1)
data_1 = data_1.dropna(axis=0, how='any')
print(data_1.info())

# One-hot encode the multi-level categorical variable Pclass
data_1.head()
Pclass = pd.get_dummies(data_1['Pclass'], prefix='Pclass')
print(Pclass.head())

# Attach the dummy columns to the frame
data_2 = pd.concat([data_1, Pclass], axis=1)
print(data_2.head())

# metric_all collects the 10-fold f1 scores of all 10 supervised models
metric_all = pd.DataFrame()
# Build the feature matrix from data_2
X = data_2.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Embarked'], axis=1)
# Encode Sex numerically. The original chained indexing
# (X['Sex'][mask] = v) raises SettingWithCopyWarning and can silently
# fail under pandas copy-on-write; a vectorized map is the safe form.
X['Sex'] = X['Sex'].map({'male': 1, 'female': 0})
Y = data_2['Survived']
# Logistic regression (GLM), scored by 10-fold cross-validated f1
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1000, random_state=0)
metric = cross_val_score(lr, X, Y, cv=10, scoring='f1')
metric.sort()
metric_all['glm'] = metric[::-1]  # store fold scores best-first

# Decision tree. Bind the estimator to a new name: the original
# `tree = tree.DecisionTreeClassifier(...)` shadowed the imported module.
from sklearn import tree

dt = tree.DecisionTreeClassifier(criterion='gini')
metric = cross_val_score(dt, X, Y, cv=10, scoring='f1')
metric.sort()
metric_all['tree'] = metric[::-1]  # store fold scores best-first

# Support vector machine with an RBF kernel
from sklearn import svm

svc = svm.SVC(C=1.0, kernel='rbf', gamma='auto')
fold_scores = cross_val_score(svc, X, Y, cv=10, scoring='f1')
# record the 10 fold scores in descending order
metric_all['svm'] = np.sort(fold_scores)[::-1]

# Random forest: 30 gini-split trees, fixed seed for reproducibility
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=30, criterion='gini', random_state=10)
fold_scores = cross_val_score(RF, X, Y, cv=10, scoring='f1')
# record the 10 fold scores in descending order
metric_all['RandomForest'] = np.sort(fold_scores)[::-1]

# k-nearest neighbours (k = 9, kd-tree search)
from sklearn import neighbors

knn = neighbors.KNeighborsClassifier(n_neighbors=9, algorithm='kd_tree')
fold_scores = cross_val_score(estimator=knn, X=X, y=Y, cv=10, scoring='f1')
# record the 10 fold scores in descending order
metric_all['knn'] = np.sort(fold_scores)[::-1]

# Linear discriminant analysis, least-squares solver, no shrinkage
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=None, priors=None)
fold_scores = cross_val_score(estimator=lda, X=X, y=Y, cv=10, scoring='f1')
# record the 10 fold scores in descending order
metric_all['lda'] = np.sort(fold_scores)[::-1]

# Quadratic discriminant analysis with default settings
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda = QuadraticDiscriminantAnalysis()
fold_scores = cross_val_score(estimator=qda, X=X, y=Y, cv=10, scoring='f1')
# record the 10 fold scores in descending order
metric_all['qda'] = np.sort(fold_scores)[::-1]

# Gaussian naive Bayes. (MultinomialNB suits text counts; BernoulliNB
# requires all-binary features, so GaussianNB fits this numeric data.)
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
fold_scores = cross_val_score(estimator=gnb, X=X, y=Y, cv=10, scoring='f1')
# record the 10 fold scores in descending order
metric_all['gnb'] = np.sort(fold_scores)[::-1]

# Multilayer perceptron with hidden layers (4, 3, 2), LBFGS solver
from sklearn.neural_network import MLPClassifier

mbp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(4, 3, 2), random_state=1)
fold_scores = cross_val_score(estimator=mbp, X=X, y=Y, cv=10, scoring='f1')
# record the 10 fold scores in descending order
metric_all['mbp'] = np.sort(fold_scores)[::-1]

# AdaBoost with 100 weak learners
from sklearn.ensemble import AdaBoostClassifier

# base_estimator defaults to a decision tree stump
ada = AdaBoostClassifier(n_estimators=100)
fold_scores = cross_val_score(estimator=ada, X=X, y=Y, cv=10, scoring='f1')
# record the 10 fold scores in descending order
metric_all['ada'] = np.sort(fold_scores)[::-1]

# Compare the 10 models: all fold scores, then mean f1 per model, best first
print(metric_all)
print("按照F1-score进行排序")
metric_mean = metric_all.mean()
print(metric_mean.sort_values(ascending=False))

最终的结果如下:

        glm      tree       svm  RandomForest       knn       lda       qda  
0  0.788732  0.811594  0.756757      0.826667  0.760563  0.800000  0.733333   
1  0.761905  0.800000  0.753623      0.800000  0.757576  0.761905  0.693333   
2  0.732394  0.769231  0.705882      0.789474  0.750000  0.742857  0.685714   
3  0.730159  0.760563  0.695652      0.782609  0.727273  0.735294  0.677419   
4  0.716418  0.730159  0.677419      0.781250  0.724638  0.718750  0.676056   
5  0.698413  0.730159  0.676056      0.741935  0.716418  0.687500  0.675000   
6  0.677419  0.711864  0.675325      0.666667  0.714286  0.666667  0.649351   
7  0.677419  0.707692  0.656250      0.634921  0.634921  0.666667  0.563636   
8  0.666667  0.646154  0.622951      0.634921  0.579710  0.655738  0.530612   
9  0.666667  0.634921  0.593750      0.606061  0.562500  0.645161  0.449438   


        gnb       mbp       ada  
0  0.707692  0.786885  0.760563  
1  0.704225  0.776119  0.760563  
2  0.696970  0.742857  0.741935  
3  0.688525  0.711864  0.738462  
4  0.687500  0.701754  0.718750  
5  0.677966  0.692308  0.709677  
6  0.657534  0.690909  0.707692  
7  0.586207  0.675325  0.676471  
8  0.500000  0.654545  0.666667  
9  0.466667  0.620690  0.666667  

按照F1-score进行排序
tree            0.730234
RandomForest    0.726450
ada             0.714745
glm             0.711619
lda             0.708054
mbp             0.705326
knn             0.692788
svm             0.681367
gnb             0.637329
qda             0.633389


  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值