4. Machine Learning with sklearn: Kaggle Examples (Titanic, IMDB, MNIST)

Reading notes for the book "Python机器学习及实践" (Python Machine Learning and Practice).

1. Predicting Titanic Passenger Survival

Download train.csv and test.csv from Kaggle, build a model, and submit the predictions.

import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV

train = pd.read_csv('../Datasets/Titanic/train.csv')
test = pd.read_csv('../Datasets/Titanic/test.csv')

#print(train.info())
#print(test.info())

selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']
# .copy() avoids pandas' SettingWithCopyWarning when filling missing values below
X_train, X_test = train[selected_features].copy(), test[selected_features].copy()

y_train = train['Survived']

#print(X_train['Embarked'].value_counts())
# For a categorical feature such as Embarked, fill missing values with the most frequent value; this keeps the error introduced by imputation relatively small
X_train['Embarked'].fillna('S', inplace=True)
X_test['Embarked'].fillna('S', inplace=True)
# For numeric features, fill missing values with the mean (or median)
X_train['Age'].fillna(X_train['Age'].mean(), inplace = True)
X_test['Age'].fillna(X_test['Age'].mean(), inplace = True)
X_test['Fare'].fillna(X_test['Fare'].mean(), inplace=True)
# Vectorize the features with DictVectorizer
dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))
#print(dict_vec.feature_names_)
X_test = dict_vec.transform(X_test.to_dict(orient='records'))
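# Note: DictVectorizer one-hot encodes the string-valued columns (Sex, Embarked) into
# indicator features such as 'Sex=male' / 'Embarked=S', while numeric columns like Age
# and Fare pass through unchanged, so every passenger becomes a purely numeric vector.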

rfc = RandomForestClassifier()
xgbc = XGBClassifier()
# Evaluate both classifiers with their default settings via 5-fold cross-validation on the training set and report the mean accuracy
score_5_rfc = cross_val_score(rfc,X_train, y_train,cv=5).mean()
score_5_xgbc = cross_val_score(xgbc,X_train, y_train,cv=5).mean()
print('rfc={}, xgbc={}'.format(score_5_rfc,score_5_xgbc))
# Predict with the default RandomForestClassifier
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)
rfc_submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':rfc_y_pred})
rfc_submission.to_csv('../Datasets/Titanic/rfc_submission.csv', index=False)
# Predict with the default XGBClassifier
xgbc.fit(X_train, y_train)
xgbc_y_pred = xgbc.predict(X_test)
xgbc_submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':xgbc_y_pred})
xgbc_submission.to_csv('../Datasets/Titanic/xgbc_submission.csv', index=False)
# Use a parallel grid search to look for a better hyperparameter combination, hoping to further improve the XGBClassifier's performance
params = {'max_depth':range(2,7), 'n_estimators':range(100,1100,200), 'learning_rate':[0.05,0.1,0.25,0.5,1.0]}
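# 5 depths x 5 n_estimators values x 5 learning rates = 125 combinations; with cv=5 that is
# 625 model fits, so the parallel search (n_jobs=-1) can take a while.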
xgbc_best = XGBClassifier()
gs = GridSearchCV(xgbc_best,params,n_jobs=-1,cv=5,verbose=1)
gs.fit(X_train, y_train)
print(gs.best_score_, gs.best_params_)
# Predict on the test set with the tuned XGBClassifier and save the submission
xgbc_best_y_pred = gs.predict(X_test)
xgbc_best_submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':xgbc_best_y_pred})
xgbc_best_submission.to_csv('../Datasets/Titanic/xgbc_best_submission.csv', index=False)
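As an optional follow-up (not in the book's code), the tuned model can be inspected to see which vectorized features it relies on most; a minimal sketch, assuming gs and dict_vec from above are still in scope:

# Hypothetical inspection of feature importances in the tuned XGBClassifier
best_xgbc = gs.best_estimator_
for name, score in sorted(zip(dict_vec.feature_names_, best_xgbc.feature_importances_),
                          key=lambda pair: pair[1], reverse=True):
    print('{:<12} {:.4f}'.format(name, score))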

2. IMDB Movie Review Sentiment Estimation

import pandas as pd
from bs4 import BeautifulSoup
import re
# Import the stop word list
from nltk.corpus import stopwords
# Import the text feature extractors
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# Import Pipeline to make it easy to chain the processing steps
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

train = pd.read_csv('../Datasets/IMDB/labeledTrainData.tsv', delimiter='\t')
test = pd.read_csv('../Datasets/IMDB/testData.tsv', delimiter='\t')
# Preprocessing: clean one raw review into a list of words
def review_to_text(review, remove_stopwords):
    # Strip HTML tags
    raw_text = BeautifulSoup(review,'html.parser').get_text()
    # Remove non-letter characters
    letters = re.sub('[^a-zA-Z]',' ',raw_text)
    words = letters.lower().split()
    # Optionally remove stop words
    if remove_stopwords:
        all_stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in all_stop_words]

    return words
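# Illustrative example (not from the book): review_to_text('<br />This was not a great movie!', True)
# strips the HTML tag and punctuation, lowercases, drops the stop words ('this', 'was', 'not', 'a'),
# and returns ['great', 'movie'].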

X_train = [' '.join(review_to_text(review,True)) for review in train['review']]
X_test = [' '.join(review_to_text(review,True)) for review in test['review']]

y_train = train['sentiment']
# Use Pipeline to build two naive Bayes classifiers; they differ only in the feature extractor (CountVectorizer vs. TfidfVectorizer)
pip_count = Pipeline([('count_vec', CountVectorizer(analyzer='word')), ('mnb', MultinomialNB())])
pip_tfidf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer='word')), ('mnb', MultinomialNB())])
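# The step names above ('count_vec' / 'tfidf_vec' and 'mnb') are what the '<step>__<param>'
# keys in the grids below refer to.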

# Configure the hyperparameter grid for each model
params_count = {'count_vec__binary':[True, False], 'count_vec__ngram_range':[(1, 1), (1, 2)], 'mnb__alpha':[0.1, 1.0, 10.0]}
params_tfidf = {'tfidf_vec__binary':[True, False], 'tfidf_vec__ngram_range':[(1, 1), (1, 2)], 'mnb__alpha':[0.1, 1.0, 10.0]}

# 4-fold cross-validated grid search
gs_count = GridSearchCV(pip_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_tfidf = GridSearchCV(pip_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)

gs_count.fit(X_train, y_train)
print(gs_count.best_score_)
print(gs_count.best_params_)
# Predict
count_y_predict = gs_count.predict(X_test)

gs_tfidf.fit(X_train, y_train)

print(gs_tfidf.best_score_)
print(gs_tfidf.best_params_)

tfidf_y_predict = gs_tfidf.predict(X_test)

# Format the submission files with pandas
submission_count = pd.DataFrame({'id': test['id'], 'sentiment': count_y_predict})
submission_tfidf= pd.DataFrame({'id': test['id'], 'sentiment': tfidf_y_predict})
submission_count.to_csv('../Datasets/IMDB/submission_count.csv', index=False)
submission_tfidf.to_csv('../Datasets/IMDB/submission_tfidf.csv', index=False)
-------------------------------------------------------
0.88216
{'count_vec__binary': True, 'count_vec__ngram_range': (1, 2), 'mnb__alpha': 1.0}
0.88712
{'mnb__alpha': 0.1, 'tfidf_vec__binary': True, 'tfidf_vec__ngram_range': (1, 2)}
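To sanity-check what the count-based model has learned, one can look at the words with the highest class-conditional log-probabilities for the positive class; a minimal sketch, assuming the fitted gs_count from above (the vectorizer method name differs slightly across scikit-learn versions):

# Hypothetical inspection of the fitted count-based pipeline (not in the original notes)
import numpy as np
best_pipe = gs_count.best_estimator_
vec, mnb = best_pipe.named_steps['count_vec'], best_pipe.named_steps['mnb']
feature_names = np.asarray(vec.get_feature_names_out())   # older sklearn: vec.get_feature_names()
top_positive = np.argsort(mnb.feature_log_prob_[1])[-10:]  # class index 1 = positive sentiment
print(feature_names[top_positive])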

3. MNIST Handwritten Digit Recognition

from tensorflow.examples.tutorials.mnist import input_data

import tensorflow as tf
# Load the data
mnist = input_data.read_data_sets('./MNIST_data', one_hot=True)
sess=tf.InteractiveSession()
# Build the CNN
# Convolution helper (saves repetition for the conv layers below)
def conv2d(x,w):
    return tf.nn.conv2d(x,w,strides=[1,1,1,1],padding='SAME')
# Max-pooling helper
def max_pool_2x2(x):
    return tf.nn.max_pool(x,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME')
# Placeholders sized to the input images and the output labels
x=tf.placeholder(tf.float32,[None,784])
y_=tf.placeholder(tf.float32,[None,10])
x_img=tf.reshape(x,[-1,28,28,1])

# First convolution + pooling layer
w_conv1=tf.Variable(tf.truncated_normal([3,3,1,32],stddev=0.1))
b_conv1=tf.Variable(tf.constant(0.1,shape=[32]))
h_conv1=tf.nn.relu(conv2d(x_img,w_conv1)+b_conv1)
h_pool1=max_pool_2x2(h_conv1)

# Second convolution + pooling layer
w_conv2=tf.Variable(tf.truncated_normal([3,3,32,50],stddev=0.1))
b_conv2=tf.Variable(tf.constant(0.1,shape=[50]))
h_conv2=tf.nn.relu(conv2d(h_pool1,w_conv2)+b_conv2)
h_pool2=max_pool_2x2(h_conv2)
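# Each 2x2 max pooling halves the spatial size (28x28 -> 14x14 -> 7x7), so h_pool2 has shape
# [batch, 7, 7, 50] and the fully connected layer below expects 7*7*50 inputs per image.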

# First fully connected layer
w_fc1=tf.Variable(tf.truncated_normal([7*7*50,1024],stddev=0.1))
b_fc1=tf.Variable(tf.constant(0.1,shape=[1024]))
h_pool2_flat=tf.reshape(h_pool2,[-1,7*7*50])
h_fc1=tf.nn.relu(tf.matmul(h_pool2_flat,w_fc1)+b_fc1)

# Dropout (randomly deactivates units during training)
keep_prob=tf.placeholder(tf.float32)
h_fc1_drop=tf.nn.dropout(h_fc1,keep_prob)

# Second fully connected (output) layer
w_fc2=tf.Variable(tf.truncated_normal([1024,10],stddev=0.1))
b_fc2=tf.Variable(tf.constant(0.1,shape=[10]))
y_out=tf.nn.softmax(tf.matmul(h_fc1_drop,w_fc2)+b_fc2)

# Loss function: cross-entropy
loss=tf.reduce_mean(-tf.reduce_sum(y_*tf.log(y_out),reduction_indices=[1]))
# Adam optimizer with learning rate 1e-4
train_step=tf.train.AdamOptimizer(1e-4).minimize(loss)

# Accuracy expression
correct_prediction=tf.equal(tf.argmax(y_out,1),tf.argmax(y_,1))
accuracy=tf.reduce_mean(tf.cast(correct_prediction,tf.float32))

# Feed the data and train
tf.global_variables_initializer().run()
for i in range(20000):
    batch=mnist.train.next_batch(50)
    if i%100==0:
        train_accuracy=accuracy.eval(feed_dict={x:batch[0],y_:batch[1],keep_prob:1})
        print("step %d,train_accuracy= %g"%(i,train_accuracy))
    train_step.run(feed_dict={x:batch[0],y_:batch[1],keep_prob:0.5})

# After training, evaluate on the test set and print the final result
print("test_accuracy= %g"%accuracy.eval(feed_dict={x:mnist.test.images,y_:mnist.test.labels,keep_prob:1}))
------------------------------------------------------------------
step 0,train_accuracy= 0.14
step 100,train_accuracy= 0.76
step 200,train_accuracy= 0.9
step 300,train_accuracy= 0.96
step 400,train_accuracy= 0.88
step 500,train_accuracy= 0.92
step 600,train_accuracy= 0.98
step 700,train_accuracy= 0.92
step 800,train_accuracy= 0.9
......
step 19400,train_accuracy= 1
step 19500,train_accuracy= 0.984375
step 19600,train_accuracy= 1
step 19700,train_accuracy= 1
step 19800,train_accuracy= 1
step 19900,train_accuracy= 1
test_accuracy= 0.9925
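A common refinement over the hand-written cross-entropy above is to compute the loss directly from the raw logits, which is numerically more stable than taking the log of a softmax output; a minimal sketch under the same TF 1.x setup, reusing the variables defined above (not part of the original notes):

# Hypothetical alternative loss, numerically stabler than -sum(y_ * log(softmax))
logits = tf.matmul(h_fc1_drop, w_fc2) + b_fc2
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=logits))
y_out = tf.nn.softmax(logits)   # softmax is still needed for the accuracy/prediction step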

 
