1. The Titanic Problem: Introduction and Data Analysis
Dataset download URLs:
https://storage.googleapis.com/tf-datasets/titanic/train.csv
https://storage.googleapis.com/tf-datasets/titanic/eval.csv
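If the two CSV files are not already on disk, a minimal sketch for fetching them with tf.keras.utils.get_file looks like the following; the file names titanic_train.csv / titanic_eval.csv and the caching location under ~/.keras/datasets are choices of this sketch, while the code below instead assumes local copies under ./data/titanic/.

import tensorflow as tf

# Download both CSVs once (cached under ~/.keras/datasets) and print the local paths.
train_path = tf.keras.utils.get_file(
    "titanic_train.csv",
    "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
eval_path = tf.keras.utils.get_file(
    "titanic_eval.csv",
    "https://storage.googleapis.com/tf-datasets/titanic/eval.csv")
print(train_path, eval_path)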
Parse the datasets and show the first 5 rows:
import os
import pandas as pd
import tensorflow as tf
from tensorflow import keras

train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"
train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
print(train_df.head())  # first 5 rows
print(eval_df.head())
   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  embark_town alone
0         0    male  22.0                   1      0   7.2500  Third  unknown  Southampton     n
1         1  female  38.0                   1      0  71.2833  First        C    Cherbourg     n
2         1  female  26.0                   0      0   7.9250  Third  unknown  Southampton     y
3         1  female  35.0                   1      0  53.1000  First        C  Southampton     n
4         0    male  28.0                   0      0   8.4583  Third  unknown   Queenstown     y

   survived     sex   age  n_siblings_spouses  parch     fare   class     deck  embark_town alone
0         0    male  35.0                   0      0   8.0500   Third  unknown  Southampton     y
1         0    male  54.0                   0      0  51.8625   First        E  Southampton     y
2         1  female  58.0                   0      0  26.5500   First        C  Southampton     y
3         1  female  55.0                   0      0  16.0000  Second  unknown  Southampton     y
4         1    male  34.0                   0      0  13.0000  Second        D  Southampton     y
Remove the label (the survived column) from the DataFrames:
# pop removes the given column from the DataFrame and returns it
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')
print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())
      sex   age  n_siblings_spouses  parch     fare  class     deck  embark_town alone
0    male  22.0                   1      0   7.2500  Third  unknown  Southampton     n
1  female  38.0                   1      0  71.2833  First        C    Cherbourg     n
2  female  26.0                   0      0   7.9250  Third  unknown  Southampton     y
3  female  35.0                   1      0  53.1000  First        C  Southampton     n
4    male  28.0                   0      0   8.4583  Third  unknown   Queenstown     y

      sex   age  n_siblings_spouses  parch     fare   class     deck  embark_town alone
0    male  35.0                   0      0   8.0500   Third  unknown  Southampton     y
1    male  54.0                   0      0  51.8625   First        E  Southampton     y
2  female  58.0                   0      0  26.5500   First        C  Southampton     y
3  female  55.0                   0      0  16.0000  Second  unknown  Southampton     y
4    male  34.0                   0      0  13.0000  Second        D  Southampton     y

0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64

0    0
1    0
2    1
3    1
4    1
Name: survived, dtype: int64
Look at the summary statistics of the training set:
train_df.describe()  # summary statistics; only these four columns are numeric, the rest are categorical
              age  n_siblings_spouses       parch        fare
count  627.000000          627.000000  627.000000  627.000000
mean    29.631308            0.545455    0.379585   34.385399
std     12.511818            1.151090    0.792999   54.597730
min      0.750000            0.000000    0.000000    0.000000
25%     23.000000            0.000000    0.000000    7.895800
50%     28.000000            0.000000    0.000000   15.045800
75%     35.000000            1.000000    0.000000   31.387500
max     80.000000            8.000000    5.000000  512.329200
print(train_df.shape, eval_df.shape)
(627, 9) (264, 9)
Distribution of passenger ages on the Titanic:
train_df.age.hist(bins = 20)
# .age selects the age column; .hist draws a histogram; bins is the number of buckets
How many passengers of each sex:
train_df.sex.value_counts().plot(kind = 'barh')
# .value_counts() counts how often each value occurs; .plot(kind = 'barh') draws a horizontal bar chart
How many passengers in each cabin class:
train_df['class'].value_counts().plot(kind = 'barh')
# use bracket indexing here: train_df.class is invalid because class is a Python keyword
Statistics: what percentage of males survived, and what percentage of females:
pd.concat([train_df, y_train], axis = 1).groupby('sex').survived.mean().plot(kind='barh')
2. Using feature_column
Categorical features -> one-hot encoding -> vectors (e.g. with vocabulary ['male', 'female'], 'male' becomes [1, 0] and 'female' becomes [0, 1])
Numeric features are fed in directly
# categorical features
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
# numeric features
numeric_columns = ['age', 'fare']
feature_columns = []

# categorical features: define a feature_column for each and append it to feature_columns
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()  # all possible values of this column
    print(categorical_column, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(  # one-hot encoding
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))

# numeric features: fed in directly
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            numeric_column, dtype=tf.float32))
sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']
Build the dataset:
Note: data_df is a multi-column pandas DataFrame; it must be converted to a dict (keys are column names, values are the column data) before being passed to from_tensor_slices.
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,
                 batch_size = 32):
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)  # buffer_size = 10000
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset
train_dataset = make_dataset(train_df, y_train, batch_size = 5)
# each element is one batch; within a batch the values of each field are grouped together,
# e.g. all five sex values are stored under the dict key 'sex'
for x, y in train_dataset.take(1):
    print(x, y)
{'sex': <tf.Tensor: id=82, shape=(5,), dtype=string, numpy=array([b'female', b'male', b'male', b'male', b'female'], dtype=object)>,
 'age': <tf.Tensor: id=74, shape=(5,), dtype=float64, numpy=array([41., 30., 28., 71., 28.])>,
 'n_siblings_spouses': <tf.Tensor: id=80, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 1])>,
 'parch': <tf.Tensor: id=81, shape=(5,), dtype=int32, numpy=array([2, 0, 0, 0, 0])>,
 'fare': <tf.Tensor: id=79, shape=(5,), dtype=float64, numpy=array([20.2125, 7.8958, 8.05, 34.6542, 15.5])>,
 'class': <tf.Tensor: id=76, shape=(5,), dtype=string, numpy=array([b'Third', b'Third', b'Third', b'First', b'Third'], dtype=object)>,
 'deck': <tf.Tensor: id=77, shape=(5,), dtype=string, numpy=array([b'unknown', b'unknown', b'unknown', b'A', b'unknown'], dtype=object)>,
 'embark_town': <tf.Tensor: id=78, shape=(5,), dtype=string, numpy=array([b'Southampton', b'Southampton', b'Southampton', b'Cherbourg', b'Queenstown'], dtype=object)>,
 'alone': <tf.Tensor: id=75, shape=(5,), dtype=string, numpy=array([b'n', b'y', b'y', b'y', b'n'], dtype=object)>}
tf.Tensor([0 0 0 0 1], shape=(5,), dtype=int32)
Combining the dataset with feature_columns:
keras.layers.DenseFeatures: DenseFeatures applies the feature_columns defined above to the dataset.
feature_columns are essentially a set of rules for transforming features,
and DenseFeatures applies those rules to every example in the dataset.
1)
for x, y in train_dataset.take(1):
    age_column = feature_columns[7]     # the 'age' numeric column
    gender_column = feature_columns[0]  # the 'sex' indicator column
    print(keras.layers.DenseFeatures(age_column)(x).numpy())
    print(keras.layers.DenseFeatures(gender_column)(x).numpy())
[[ 2. ]
 [33. ]
 [28. ]
 [50. ]
 [70.5]]
[[0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]
2)
# keras.layers.DenseFeatures applied with all feature_columns at once
for x, y in train_dataset.take(1):
    print(keras.layers.DenseFeatures(feature_columns)(x).numpy())
[[50.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  55.9    1.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.]
 [45.  1.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  26.25   1.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.]
 [28.  0.  1.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.   9.5    0.  1.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.]
 [38.  0.  1.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.   7.05   0.  1.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.]
 [30.  0.  1.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  12.475  0.  1.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  1.]]
3. keras_to_estimator
Build the keras model:
model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(2, activation='softmax'),
])
model.compile(loss='sparse_categorical_crossentropy',
              optimizer = keras.optimizers.SGD(lr=0.01),
              metrics = ['accuracy'])
1) model.fit
train_dataset = make_dataset(train_df, y_train, epochs = 100)
eval_dataset = make_dataset(eval_df, y_eval, epochs = 1, shuffle = False)
history = model.fit(train_dataset,
                    validation_data = eval_dataset,
                    steps_per_epoch = 19,   # number of training samples / batch_size = 627 / 32 ≈ 19
                    validation_steps = 8,
                    epochs = 100)
2) model -> estimator -> train
estimator = keras.estimator.model_to_estimator(model)
# input_fn:
# 1. must be a function
# 2. must return either a. a (features, labels) tuple, or b. a dataset that yields (features, labels)
estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs=100))
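After training, the converted estimator can be evaluated the same way; a minimal sketch, reusing the make_dataset helper defined above (this mirrors the evaluate calls used with the pre-made estimators below):

eval_result = estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))
print(eval_result)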
4. Using Pre-made Estimators
1) baseline_estimator:
output_dir = 'baseline_model'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

baseline_estimator = tf.estimator.BaselineClassifier(
    model_dir = output_dir,
    n_classes = 2)
baseline_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# BaselineClassifier essentially guesses from the label distribution; evaluate currently hits a bug
baseline_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False, batch_size = 20))
Note: evaluate raises FailedPreconditionError: GetNext() failed. This is a known TensorFlow bug that had not been fixed at the time of writing.
2) linear_estimator:
linear_output_dir = 'linear_model'
if not os.path.exists(linear_output_dir):
    os.mkdir(linear_output_dir)

linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    feature_columns = feature_columns)
linear_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))
linear_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))
By default, TensorBoard event files are written to "./linear_model"; running tensorboard --logdir linear_model lets you inspect the training loss and other metrics in TensorBoard.
{'accuracy': 0.7878788, 'accuracy_baseline': 0.625, 'auc': 0.8367616, 'auc_precision_recall': 0.7849544, 'average_loss': 0.46802205, 'label/mean': 0.375, 'loss': 0.4526842, 'precision': 0.7171717, 'prediction/mean': 0.3789331, 'recall': 0.7171717, 'global_step': 1960}
3) dnn_estimator:
dnn_output_dir = './dnn_model'
if not os.path.exists(dnn_output_dir):
    os.mkdir(dnn_output_dir)

dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns = feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu,
    optimizer = 'Adam')
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))
dnn_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))
{'accuracy': 0.8068182, 'accuracy_baseline': 0.625, 'auc': 0.85723907, 'auc_precision_recall': 0.8250245, 'average_loss': 0.46943292, 'label/mean': 0.375, 'loss': 0.44829544, 'precision': 0.7222222, 'prediction/mean': 0.42193753, 'recall': 0.7878788, 'global_step': 1960}
5. Cross Features in Practice
cross feature: the Cartesian product of two categorical features, e.g. age: [1, 2, 3, 4, 5], gender: [male, female]
age_x_gender: [(1, male), (2, male), ..., (5, male), ..., (5, female)]
hash_bucket_size: if the cross can take, say, 100000 distinct values but hash_bucket_size = 100, each crossed value is mapped to one of 100 buckets via hash(value) % 100 (see the sketch below)
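A plain-Python sketch of the bucketing idea (illustrative only; TensorFlow uses its own fingerprint hash internally, so the actual bucket ids will differ):

# Illustrative only: cross two small feature vocabularies and map every crossed
# value into hash_bucket_size buckets, mimicking crossed_column + hash_bucket_size.
ages = [1, 2, 3, 4, 5]
genders = ['male', 'female']
hash_bucket_size = 100

crossed = [(a, g) for a in ages for g in genders]   # Cartesian product
buckets = {pair: hash(str(pair)) % hash_bucket_size for pair in crossed}
print(buckets)  # each (age, gender) pair is assigned a bucket id in [0, hash_bucket_size)

The crossed feature is then added to feature_columns as an indicator column: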
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(
            ['age', 'sex'], hash_bucket_size = 100)))
dnn_estimator:
dnn_output_dir = './dnn_model_new_features'
if not os.path.exists(dnn_output_dir):
    os.mkdir(dnn_output_dir)

dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns = feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu,
    optimizer = 'Adam')
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))
dnn_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))