Keras(十八)预定义estimator使用、交叉特征实战

38 篇文章 2 订阅
35 篇文章 11 订阅

本文将介绍:

  • 预定义estimator.LinearClassifier模型
  • 预定义estimator.DNNClassifier模型
  • 交叉特征实战

一,加载Titanic数据集

1,下载Titanic数据集,使用pandas读取并解析数据集

下载数据的网址如下:

https://storage.googleapis.com/tf-datasets/titanic/train.csv
https://storage.googleapis.com/tf-datasets/titanic/eval.csv

将下载的数据读取到pandas中,代码如下:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# 打印使用的python库的版本信息
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)

print(train_df.head())
print(eval_df.head())
2,分离出特征值和目标值
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')

print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())
3,使用panda对数值型数据的字段进行统计
train_df.describe()

二,使用feature_column做数据处理,并转化为tf.data.dataset类型数据

1,将"离散特征"和"连续特征"整合为one-hot编码
# 将特征分为"离散特征"和"连续特征"两个列表
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []
# 使用tf.feature_column对"离散特征"做处理
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))

# 使用tf.feature_column对"连续特征"做处理
for categorical_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            categorical_column, dtype=tf.float32))
2,将ndarray数据转化为tf.data.dataset中的BatchDataset类型数据
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,
                 batch_size = 32):
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset

三,LinearClassifier

# 创建-LinearClassifier预定义estimator模型
linear_output_dir = 'linear_model'
if not os.path.exists(linear_output_dir):
    os.mkdir(linear_output_dir)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    feature_columns = feature_columns)
# 训练-LinearClassifier预定义estimator模型
linear_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# 评估-LinearClassifier预定义estimator模型结果
linear_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))

四,DNNClassifier

# 创建-DNNClassifier预定义estimator模型
dnn_output_dir = './dnn_model'
if not os.path.exists(dnn_output_dir):
    os.mkdir(dnn_output_dir)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns=feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu,
    optimizer = 'Adam')
# 训练-DNNClassifier预定义estimator模型
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# 评估-DNNClassifier预定义estimator模型结果
dnn_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))

五,交叉特征实战

1,交叉特征的代码实现
# 使用tf.feature_column.indicator_column对多个特征做<交叉特征>处理
# cross feature: age: [1,2,3,4,5], gender:[male, female]
# age_x_gender: [(1, male), (2, male), ..., (5, male), ..., (5, female)]
# 100000: 100 -> hash(100000 values) % 100 
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(
            ['age', 'sex'], hash_bucket_size = 100)))
2,总结代码如下
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 28 14:03:56 2020

@author: nijiahui
"""
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# 打印使用的python库的版本信息
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

### 一,加载Titanic数据集 ##########################################
# 1,下载Titanic数据集,使用pandas读取并解析数据集
# https://storage.googleapis.com/tf-datasets/titanic/train.csv
# https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)

print(train_df.head())
print(eval_df.head())

# 2,分离出特征值和目标值
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')

print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())

# 3,使用panda对数值型数据的字段进行统计
train_df.describe()

### 二,使用feature_column做数据处理,并转化为tf.data.dataset类型数据 ##############
# 1,将"离散特征"和"连续特征"整合为one-hot编码
# 将特征分为"离散特征"和"连续特征"两个列表
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class','deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []
# 使用tf.feature_column对"离散特征"做处理
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))

# 使用tf.feature_column对"连续特征"做处理
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            numeric_column, dtype=tf.float32))

# 使用tf.feature_column.indicator_column对多个特征做<交叉特征>处理
# cross feature: age: [1,2,3,4,5], gender:[male, female]
# age_x_gender: [(1, male), (2, male), ..., (5, male), ..., (5, female)]
# 100000: 100 -> hash(100000 values) % 100 
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(
            ['age', 'sex'], hash_bucket_size = 100)))

# 2,将ndarray数据转化为tf.data.dataset中的BatchDataset类型数据
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,batch_size = 32):
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset

### 三,LinearClassifier
# 创建-LinearClassifier预定义estimator模型
linear_output_dir = 'linear_model_new_features'
if not os.path.exists(linear_output_dir):
    os.mkdir(linear_output_dir)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    feature_columns = feature_columns)
# 训练-LinearClassifier预定义estimator模型
linear_estimator.train(input_fn = lambda : make_dataset(train_df, y_train, epochs = 100))

# 评估-LinearClassifier预定义estimator模型结果
linear_estimator.evaluate(input_fn = lambda : make_dataset(eval_df, y_eval, epochs = 1, shuffle = False))


### 四,DNNClassifier
# 创建-DNNClassifier预定义estimator模型
dnn_output_dir = './dnn_model_new_features'
if not os.path.exists(dnn_output_dir):
    os.mkdir(dnn_output_dir)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns=feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu,
    # optimizer = 'Adam'
    optimizer = 'SGD'
    )
# 训练-DNNClassifier预定义estimator模型
dnn_estimator.train(input_fn = lambda : make_dataset(train_df, y_train, epochs = 100))

# 评估-DNNClassifier预定义estimator模型结果
dnn_estimator.evaluate(input_fn = lambda : make_dataset(eval_df, y_eval, epochs = 1, shuffle = False))

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值