处理表格数据
导入所需的包,并加载数据集
from fastai.tabular import *
# Locate the Adult sample dataset; `path` is the directory the CSV is read from.
# NOTE(review): untar_data presumably downloads/extracts the archive on first use — confirm.
path = untar_data(URLs.ADULT_SAMPLE)
# Load the whole table into a DataFrame; later cells slice it by row position.
df = pd.read_csv(path/'adult.csv')
- 指定哪些列是类别变量(cat_names),哪些列是连续变量(cont_names),以及要预测的目标列(dep_var)
- 指定数据预处理步骤(procs)
# Column the model is trained to predict.
dep_var = 'salary'

# Feature columns, split by how they are handed to TabularList.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]
cont_names = ['age', 'fnlwgt', 'education-num']

# Preprocessing steps passed to the training TabularList via `procs`.
procs = [FillMissing, Categorify, Normalize]

# Rows 800..999 are used twice: as the validation split and (copied) as the test set.
holdout_rows = list(range(800, 1000))

# Keyword arguments shared by the test list and the training list.
list_kwargs = dict(path=path, cat_names=cat_names, cont_names=cont_names)

# The test list is built without `procs`.
# NOTE(review): add_test presumably applies the training processors to it — confirm.
test = TabularList.from_df(df.iloc[800:1000].copy(), **list_kwargs)

data = (
    TabularList.from_df(df, procs=procs, **list_kwargs)
    .split_by_idx(holdout_rows)
    .label_from_df(cols=dep_var)
    .add_test(test)
    .databunch()
)

# Display ten rows of a batch as a sanity check.
data.show_batch(rows=10)
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | target |
---|---|---|---|---|---|---|---|---|---|---|
Local-gov | Bachelors | Never-married | Prof-specialty | Not-in-family | White | False | -0.1896 | -0.7476 | 1.1422 | <50k |
Self-emp-not-inc | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | -0.0430 | 0.0063 | -0.4224 | <50k |
Private | HS-grad | Married-civ-spouse | Sales | Husband | White | False | -1.1425 | -1.4272 | -0.4224 | <50k |
Private | HS-grad | Divorced | Sales | Own-child | White | False | -0.2629 | 2.4893 | -0.4224 | <50k |
Private | Some-college | Never-married | Tech-support | Own-child | White | False | -1.4357 | -0.2975 | -0.0312 | <50k |
Self-emp-not-inc | Bachelors | Never-married | Sales | Own-child | Asian-Pac-Islander | False | 0.0303 | 1.1938 | 1.1422 | <50k |
Private | HS-grad | Never-married | Adm-clerical | Unmarried | White | False | -0.9959 | -0.1439 | -0.4224 | <50k |
Private | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | False | 0.0303 | -0.2085 | 1.1422 | >=50k |
? | 7th-8th | Divorced | ? | Unmarried | White | False | 1.2030 | 0.0396 | -2.3781 | <50k |
? | Some-college | Never-married | ? | Own-child | White | False | -1.3624 | 0.4335 | -0.0312 | <50k |
# Build a tabular learner on `data` with layer sizes 200 and 100, reporting accuracy.
learn = tabular_learner(
    data,
    layers=[200, 100],
    metrics=accuracy,
)

# Train for a single epoch at learning rate 1e-2.
learn.fit(epochs=1, lr=1e-2)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 0.359038 | 0.390357 | 0.805000 | 00:52 |
# Use the first row of the raw DataFrame as a single example.
row = df.iloc[0]
# Run the example through the learner; as the last expression of the cell its
# result is displayed (the recorded output is a 3-tuple: a Category, a tensor
# index and a tensor of two scores).
learn.predict(row)
(Category <50k, tensor(0), tensor([0.5156, 0.4844]))