初体验
path = untar_data(URLs.PETS) # download and extract the Oxford-IIIT Pets dataset; returns the local Path
fnames = get_image_files(path_img) # all image files under path_img — NOTE(review): path_img is not defined in this snippet (the fastai tutorial sets path_img = path/'images'); confirm before running
data.show_batch(rows=3, figsize=(7,6)) # preview a 3-row grid of samples at figure size 7x6 — NOTE(review): `data` is also not defined here; this is an incomplete teaser snippet
print(data.classes) # the class labels discovered in the data
手写数据集识别:
import torch
from fastai import *
from fastai.vision import *
# MNIST_SAMPLE is a small MNIST subset containing only the digits 3 and 7;
# from_folder infers the labels from the directory structure.
path = untar_data(URLs.MNIST_SAMPLE)
data = ImageDataBunch.from_folder(path)
# Transfer learning: pretrained ResNet-18 backbone with a new head, tracking accuracy.
learn = cnn_learner(data, models.resnet18, metrics=accuracy)
learn.fit_one_cycle(1)
learn.save('stage-1') # save the trained weights under the model's path
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_top_losses(9,figsize = (7,7)) # show the 9 samples with the highest loss
interp.plot_confusion_matrix(figsize = (7,7)) # plot the confusion matrix
interp.most_confused(min_val=2) # e.g. [('3', '7', 50), ('7', '3', 37)]
'''
most_confused
will simply grab, out of the confusion matrix, the particular
combinations of predicted and actual that were gotten wrong the most often.
简单地从混淆矩阵中找出预测和实际的特定组合,这些组合是最容易出错的。
每个元组是 (实际的, 预测的, 这种情况发生的次数)。上面第一个表示实际是3,
被预测成7,错了50次;第二个表示实际是7,被预测成3,错了37次。
'''
# 微调
'''
我们有时候需要微调:重新加了一些层进去后,如果你不解冻,它只会训练新加的那些层,
而不会训练整个网络,这是为了让训练更快;而你要想训练所有层的话,就要先解冻再训练。
'''
learn.unfreeze() # unfreeze the backbone so every layer becomes trainable
learn.fit_one_cycle(1)
# NOTE(review): lr_find is usually run *before* fitting; here it is run afterwards
# to pick the learning-rate range for the next cycle — confirm this is intended.
learn.lr_find() # sweep learning rates and record the loss at each
learn.recorder.plot() # plot loss vs. learning rate from the sweep
learn.unfreeze() # appears redundant — the model was already unfrozen above
learn.fit_one_cycle(2, max_lr=slice(1e-6,1e-4)) # discriminative LRs read off the plot: earliest layers 1e-6, head 1e-4
下面我们用Titanic号这个例子来说:
from fastai import *
from fastai.tabular import *
import pandas as pd
import numpy as np

# Kaggle Titanic data: train.csv carries the Survived label, test.csv does not.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv('test.csv')

# Engineer two categorical features on both frames.
# (Fix: the loop body below had lost its indentation, which is a SyntaxError.)
for df in [train_df, test_df]:
    # Honorific extracted from the name, e.g. "Braund, Mr. Owen Harris" -> "Mr."
    # (note: the trailing period is kept by this split).
    df['Title'] = df['Name'].str.split(',').str[1].str.split(' ').str[1]
    # Deck letter from the cabin number, e.g. "C103" -> "C" (NaN stays NaN).
    df['Deck'] = df['Cabin'].str[0]
# Mean age per honorific across train+test combined, used to impute missing ages.
# Fix: select the Age column *before* aggregating — the original
# .mean()['Age'] averaged every column first, which is wasteful and raises
# on non-numeric columns in modern pandas.
all_df = pd.concat([train_df, test_df], sort=False)
mean_age_by_title = all_df.groupby('Title')['Age'].mean()

# Fill each missing Age with the mean age of passengers sharing the same
# title ("Mr.", "Mrs.", "Master." ...), a proxy for sex and age group.
# (Fix: loop bodies re-indented; Series.iteritems() was removed in pandas 2.0,
# so iterate with .items() instead.)
for df in [train_df, test_df]:
    for title, age in mean_age_by_title.items():
        df.loc[df['Age'].isnull() & (df['Title'] == title), 'Age'] = age

test_df.Fare.fillna(0, inplace=True)  # the test set has a missing Fare; fill with 0
# Column roles for the fastai tabular pipeline.
dep_var = 'Survived'
cat_names = ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck']
cont_names = ['Age', 'Fare', 'SibSp', 'Parch']
# Preprocessing: fill remaining missing values, encode categoricals, normalize continuous.
procs = [FillMissing, Categorify, Normalize]
test = TabularList.from_df(test_df, cat_names=cat_names, cont_names=cont_names, procs=procs)
data = (TabularList.from_df(train_df, path='.', cat_names=cat_names, cont_names=cont_names, procs=procs)
.split_by_idx(list(range(0,200))) # first 200 rows become the validation set
#.split_by_idx(valid_idx=range(200,400)) # alternative validation split, kept for reference
.label_from_df(cols=dep_var) # label each row from the Survived column
.add_test(test, label=0) # attach the unlabeled test set (dummy label 0)
.databunch())
# Earlier experiments, kept for reference:
#learn = tabular_learner(data, layers=[60, 20], metrics=accuracy)
#learn = tabular_learner(data, layers=[ 300, 150], metrics=accuracy, emb_drop=0.1)
#learn.fit(10)
# Two fully-connected hidden layers of 1000 and 500 units.
learn = tabular_learner(data, layers=[1000,500], metrics=accuracy)
learn.fit_one_cycle(5, 2.5e-2)
learn.lr_find() # sweep learning rates to pick one for the longer run below
learn.recorder.plot()
learn.unfreeze()
learn.fit_one_cycle(20, slice(1e-3))
# Predictions on the attached test set; preds holds per-class probabilities.
preds, _ = learn.get_preds(ds_type=DatasetType.Test)
# max over dim 1 yields (probability, class index) per row; the index is the predicted class.
pred_prob, pred_class = preds.max(1)
submission = pd.DataFrame({'PassengerId':test_df['PassengerId'],'Survived':pred_class})
submission.to_csv('submission-fastai.csv', index=False)