What-If Tool 从零开始 - 从 CSV 到训练好的二分类模型,再到 What-If Tool 的使用
-
本笔记本演示了以下流程:从 CSV 加载数据集,训练一个非常简单的分类器来预测其中一列,然后使用 What-If Tool(WIT)分析该数据集和训练好的模型。
-
本笔记本使用 UCI Census(人口普查收入)数据集及其学习问题,详情请参阅
https://archive.ics.uci.edu/ml/datasets/census+income :根据一个人的人口普查信息,
预测其年收入是否超过 5 万美元。 -
要自定义此笔记本以处理您自己的数据集,您只需编辑标有“USER:”的部分
环境设置(在 virtualenv 中安装 Jupyter、TensorFlow 和 TensorFlow Serving)。
-
注意:使用 virtualenv 和 pip 安装 TensorFlow、使用 docker 运行 TensorFlow Serving 并不是完成这些设置的唯一方法,只是我发现这种方式最简单、最安全。
-
步骤1:使用pip / virtualenv安装Tensorflow - 有关说明,请参阅https://www.tensorflow.org/install/pip
-
步骤2:使用 docker 安装 TensorFlow Serving - 有关说明,请参阅https://www.tensorflow.org/serving/docker
-
-
接下来的步骤必须在已激活步骤1所创建的 virtualenv 的终端中完成。
-
步骤3:安装Jupyter以查看和运行此笔记本
#> pip install jupyter -
步骤4:运行此笔记本
#> jupyter notebook
#从浏览器中打开的文件选择器中,选择此笔记本文件。
#运行单元格。
-
建模
## Load helper functions
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow import data
def write_df_as_tfrecord(df, filename, columns=None):
    """Write a pandas dataframe to disk as a TFRecord file of tf.Example protos.

    Integer columns are stored as int64 features, floating-point columns as
    float features, and every other column as UTF-8 encoded bytes features.
    Missing values in integer and non-numeric columns are left out of the
    example; missing floats are written as NaN.

    Args:
        df: The pandas DataFrame to serialize; one tf.Example is written per row.
        filename: Path of the TFRecord file to create. Missing parent
            directories are created first.
        columns: Names of the dataframe columns to write. Defaults to all
            columns of `df`.
    """
    # A bare file name has no directory component, and os.makedirs('') raises.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    if columns is None:
        columns = df.columns.values.tolist()
    columns = list(columns)

    # Classify every column once, up front, instead of once per row. The
    # pandas type predicates also recognise int32/float32 columns, which an
    # identity check against np.int64/np.float64 would treat as strings.
    kinds = []
    for col in columns:
        dtype = df[col].dtype
        if pd.api.types.is_integer_dtype(dtype):
            kinds.append('int')
        elif pd.api.types.is_float_dtype(dtype):
            kinds.append('float')
        else:
            kinds.append('bytes')

    # The context manager closes the file even when a row fails to serialize.
    with tf.python_io.TFRecordWriter(filename) as writer:
        # itertuples(name=None) yields plain tuples and keeps each column's own
        # dtype; iterrows() would upcast integers to float in numeric frames.
        for values in df[columns].itertuples(index=False, name=None):
            example = tf.train.Example()
            for col, kind, value in zip(columns, kinds, values):
                missing = pd.isna(value)
                if kind == 'float':
                    number = float('nan') if missing else float(value)
                    example.features.feature[col].float_list.value.append(number)
                elif missing:
                    # Do not touch example.features.feature[col] here: merely
                    # accessing the map entry would create an empty feature.
                    continue
                elif kind == 'int':
                    example.features.feature[col].int64_list.value.append(int(value))
                else:
                    raw = value if isinstance(value, bytes) else str(value).encode('utf-8')
                    example.features.feature[col].bytes_list.value.append(raw)
            writer.write(example.SerializeToString())
def create_feature_spec(df, columns):
    """Create a tf feature spec from the dataframe and columns specified.

    Args:
        df: The pandas DataFrame whose column dtypes determine the spec.
        columns: Names of the dataframe columns to include in the spec.

    Returns:
        A dict mapping each column name to a scalar `tf.FixedLenFeature`:
        `tf.int64` for integer columns, `tf.float32` for floating-point
        columns and `tf.string` for everything else.
    """
    feature_spec = {}
    for col in columns:
        dtype = df[col].dtype
        # Use pandas' dtype predicates rather than `dtype is np.dtype(...)`:
        # identity checks only match int64/float64 exactly, so e.g. int32
        # columns would wrongly be declared as string features.
        if pd.api.types.is_integer_dtype(dtype):
            tf_dtype = tf.int64
        elif pd.api.types.is_float_dtype(dtype):
            tf_dtype = tf.float32
        else:
            tf_dtype = tf.string
        feature_spec[col] = tf.FixedLenFeature(shape=(), dtype=tf_dtype)
    return feature_spec
def parse_tf_example(example_proto, label, feature_spec):
    """Parse serialized tf.Example data into input features and a target.

    Args:
        example_proto: Serialized tf.Example data handed to `tf.parse_example`.
        label: Name of the feature to split off as the target.
        feature_spec: Feature spec describing how to parse every feature,
            including the label.

    Returns:
        A `(features, target)` tuple: the parsed features without the label
        entry, and the parsed label feature.
    """
    parsed = tf.parse_example(features=feature_spec, serialized=example_proto)
    inputs = {name: tensor for name, tensor in parsed.items() if name != label}
    return inputs, parsed[label]
def tfrecords_input_fn(files_name_pattern, feature_spec, label, mode=tf.estimator.ModeKeys.EVAL,
                       num_epochs=None,
                       batch_size=64):
    """Input function that feeds a model with tf.Examples read from TFRecord files.

    Args:
        files_name_pattern: File pattern passed to `tf.matching_files` to
            locate the TFRecord files.
        feature_spec: Feature spec used to parse the serialized examples.
        label: Name of the feature to return as the target.
        mode: A `tf.estimator.ModeKeys` value; records are shuffled only when
            it equals `TRAIN`.
        num_epochs: Value passed to `dataset.repeat()`.
        batch_size: Number of records per batch; also sizes the shuffle buffer.

    Returns:
        A `(features, target)` tuple taken from a one-shot iterator over the
        batched, parsed dataset.
    """
    matched_files = tf.matching_files(files_name_pattern)
    records = data.TFRecordDataset(filenames=matched_files)

    if mode == tf.estimator.ModeKeys.TRAIN:
        records = records.shuffle(buffer_size=2 * batch_size + 1)

    def _parse_batch(serialized_batch):
        # Parsing runs after batching, so each call handles a whole batch.
        return parse_tf_example(serialized_batch, label, feature_spec)

    batches = records.batch(batch_size).map(_parse_batch).repeat(num_epochs)
    features, target = batches.make_one_shot_iterator().get_next()
    return features, target
def create_feature_columns(columns, feature_spec, dataframe=None):
    """Create simple numeric and categorical feature columns from a feature spec.

    NOTE: Models might perform better with some feature engineering such as
    bucketed numeric columns and hash-bucket/embedding columns for categorical
    features.

    Args:
        columns: Names of the features in `feature_spec` to create columns for.
        feature_spec: Dict mapping feature names to parsing specs; the spec's
            dtype decides whether a feature is numeric or categorical.
        dataframe: DataFrame used to collect the vocabulary of categorical
            features. Defaults to the module-level `df`, which is what this
            function previously read implicitly.

    Returns:
        A list of feature columns: numeric columns for int64/float32 features
        and indicator columns over a vocabulary list for all others.
    """
    if dataframe is None:
        # Backward-compatible fallback to the notebook's global dataframe.
        dataframe = df

    numeric_dtypes = (tf.int64, tf.float32)
    ret = []
    for col in columns:
        if feature_spec[col].dtype in numeric_dtypes:
            ret.append(tf.feature_column.numeric_column(col))
        else:
            # Missing values are not real categories; keep NaN out of the vocabulary.
            vocabulary = dataframe[col].dropna().unique().tolist()
            ret.append(tf.feature_column.indicator_column(
                tf.feature_column.categorical_column_with_vocabulary_list(col, vocabulary)))
    return ret
## Load the CSV dataset into a dataframe, then show its column names and a preview of the rows

# USER: Path to the CSV holding the training dataset (a web address or a local path both work).
csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# USER: Names of the CSV's columns. Set this to None when the first line of the CSV
# is a header line that already contains the column names.
csv_columns = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital-Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital-Gain",
    "Capital-Loss",
    "Hours-per-week",
    "Country",
    "Target",
]

# Load the CSV and print the resulting column names.
df = pd.read_csv(csv_path, names=csv_columns, skipinitialspace=True)
print(list(df.columns))
# Bare expression: a notebook renders the dataframe preview as the cell output.
df
# USER: Set the name you want to give the directory the model will be saved to
model_name = 'trained_model'

# USER: Set the name you want to give the tfrecord dataset file
tfrecord_name = 'data.tfrecord'

# USER: Set the column in the dataset you wish for the model to predict
label_column = 'Target'

# Both artifacts are stored in the current working directory.
model_path = os.path.join(os.getcwd(), model_name)
tfrecord_path = os.path.join(os.getcwd(), tfrecord_name)

# USER: Make the label column numeric (0 and 1), for use in our model.
# In this case, examples with a target value of '<=50K' are considered to be in the '0' (negative) class
# and all other examples are considered to be in the '1' (positive) class.
# The conversion is skipped when the column is already numeric, so re-running this
# cell does not turn every label into 1. The dtype is pinned to int64 because the
# integer type np.where() would pick is platform dependent.
if not pd.api.types.is_numeric_dtype(df[label_column]):
    df[label_column] = (df[label_column] != '<=50K').astype(np.int64)

# USER: If the CSV needs any clean-up (such as removing problematic rows or creating new columns), do it here.

# USER: Set list of all columns from the dataset we will use for model input.
input_features = ['Age', 'Workclass', 'Education', 'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex',
                  'Capital-Gain', 'Capital-Loss', 'Hours-per-week', 'Country']

# Ensure the label column is not accidentally set as an input feature
# (filters every occurrence, not just the first one).
input_features = [name for name in input_features if name != label_column]

# Create a list containing all input features and the label column
features_and_labels = input_features + [label_column]

# Write the records to disk as tf.Example protos in tf record file, for use in model training
# and later for use by WIT.
write_df_as_tfrecord(df, tfrecord_path, features_and_labels)
## Create and train the classifier
import functools

# Create a feature spec for the classifier
feature_spec = create_feature_spec(df, features_and_labels)

# Define and train the classifier.
# The input function is bound with mode=TRAIN explicitly: tfrecords_input_fn only
# shuffles records in TRAIN mode and its default mode is EVAL, so without this
# argument the training data would never be shuffled.
train_inpf = functools.partial(
    tfrecords_input_fn,
    tfrecord_path,
    feature_spec,
    label_column,
    mode=tf.estimator.ModeKeys.TRAIN)
classifier = tf.estimator.LinearClassifier(
    feature_columns=create_feature_columns(input_features, feature_spec))
classifier.train(train_inpf, steps=10000)

## Save the classifier to disk for serving
# Uses a parsing serving input receiver function so that it can classify from serialized tf.Examples
# using the TensorFlow Serving Classify API.
serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
classifier.export_savedmodel(model_path, serving_input_fn)
## Print out the What-If Tool usage instructions (serve model, launch TensorBoard, configure What-If Tool)
import shlex
import urllib.parse

# Shell-quote the whole --mount argument so that a model path containing spaces or
# other shell metacharacters still yields a command that can be pasted into a
# terminal. Paths made only of safe characters are printed unchanged.
mount_argument = shlex.quote('type=bind,source=%s,target=/models/my_model/' % model_path)
docker_command = ('sudo docker run -p 8500:8500 --mount %s -e MODEL_NAME=my_model -t tensorflow/serving' %
                  mount_argument)

# The inference address and the examples path are URL-quoted because they are
# embedded as parameter values in the What-If Tool URL.
what_if_tool_path = ('http://localhost:6006/#whatif&inferenceAddress1=%s&modelName1=my_model&examplesPath=%s' %
                     (urllib.parse.quote('localhost:8500'), urllib.parse.quote(tfrecord_path)))

print('Command to serve model:')
print(docker_command)
print('\n')
print('Command to launch tensorboard:')
print('tensorboard --logdir .')
print('\n')
print('URL to view What-If Tool for your model and dataset:')
print(what_if_tool_path)

# To kill the served model, find the docker container ID through 'sudo docker container ls',
# then run 'sudo docker kill [containerId]'