Preprocessing with tensorflow_transform.beam in TFT (tensorflow_transform)

import math
import os
import pprint
import tempfile
import pathlib

import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import apache_beam as beam
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam

from tfx_bsl.public import tfxio
from tfx_bsl.coders.example_coder import RecordBatchToExamplesEncoder

Download the dataset

import urllib.request
train_url_path = 'https://storage.googleapis.com/artifacts.tfx-oss-public.appspot.com/datasets/census/adult.data'
test_url_path = 'https://storage.googleapis.com/artifacts.tfx-oss-public.appspot.com/datasets/census/adult.test'
os.makedirs('./test3', exist_ok=True)  # make sure the target directory exists
train_path = os.path.join('./test3', 'adult.data')
test_path  = os.path.join('./test3', 'adult.test')
urllib.request.urlretrieve(train_url_path,train_path)
urllib.request.urlretrieve(test_url_path,test_path)
('./test3/adult.test', <http.client.HTTPMessage at 0x7efe540eeb50>)

Define dataset constants

CATEGORICAL_FEATURE_KEYS = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

NUMERIC_FEATURE_KEYS = [
    'age',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'education-num'
]

ORDERED_CSV_COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'label'
]

LABEL_KEY = 'label'
pandas_train = pd.read_csv(train_path,header=None,names=ORDERED_CSV_COLUMNS)
pandas_train.head(5)
   age  workclass          fnlwgt  education  education-num  marital-status      occupation         relationship   race   sex     capital-gain  capital-loss  hours-per-week  native-country  label
0   39  State-gov           77516  Bachelors             13  Never-married       Adm-clerical       Not-in-family  White  Male            2174             0              40  United-States   <=50K
1   50  Self-emp-not-inc    83311  Bachelors             13  Married-civ-spouse  Exec-managerial    Husband        White  Male               0             0              13  United-States   <=50K
2   38  Private            215646  HS-grad                9  Divorced            Handlers-cleaners  Not-in-family  White  Male               0             0              40  United-States   <=50K
3   53  Private            234721  11th                   7  Married-civ-spouse  Handlers-cleaners  Husband        Black  Male               0             0              40  United-States   <=50K
4   28  Private            338409  Bachelors             13  Married-civ-spouse  Prof-specialty     Wife           Black  Female             0             0              40  Cuba            <=50K
# Derive CSV default values from one sample row's dtypes: '' for string columns,
# 0.0 for numeric ones (used later by make_csv_dataset).
COLUMN_DEFAULTS = [
  '' if isinstance(v, str) else 0.0
  for v in dict(pandas_train.loc[1]).values()]
COLUMN_DEFAULTS
[0.0, '', 0.0, '', 0.0, '', '', '', '', '', 0.0, 0.0, 0.0, '', '']
pandas_test = pd.read_csv(test_path, header=1, names=ORDERED_CSV_COLUMNS)  # header=1 skips adult.test's non-data first line

pandas_test.head(5)
   age  workclass   fnlwgt  education     education-num  marital-status      occupation         relationship   race   sex     capital-gain  capital-loss  hours-per-week  native-country  label
0   38  Private      89814  HS-grad                   9  Married-civ-spouse  Farming-fishing    Husband        White  Male               0             0              50  United-States   <=50K.
1   28  Local-gov   336951  Assoc-acdm               12  Married-civ-spouse  Protective-serv    Husband        White  Male               0             0              40  United-States   >50K.
2   44  Private     160323  Some-college             10  Married-civ-spouse  Machine-op-inspct  Husband        Black  Male            7688             0              40  United-States   >50K.
3   18  ?           103497  Some-college             10  Never-married       ?                  Own-child      White  Female             0             0              30  United-States   <=50K.
4   34  Private     198693  10th                      6  Never-married       Other-service      Not-in-family  White  Male               0             0              30  United-States   <=50K.
RAW_DATA_FEATURE_SPEC = dict(
    [(name, tf.io.FixedLenFeature([], tf.string))
     for name in CATEGORICAL_FEATURE_KEYS] +
    [(name, tf.io.FixedLenFeature([], tf.float32))
     for name in NUMERIC_FEATURE_KEYS] + 
    [(LABEL_KEY, tf.io.FixedLenFeature([], tf.string))]
)
# Build the Schema from a hand-written feature_spec, as below; alternatively
# it can be inferred automatically with tfdv.infer_schema
SCHEMA = tft.DatasetMetadata.from_feature_spec(RAW_DATA_FEATURE_SPEC).schema
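
As a reference for the tfdv.infer_schema route mentioned in the comment above, a minimal sketch (assuming tensorflow_data_validation is available, as it is imported later in this post):

import tensorflow_data_validation as tfdv
# Compute dataset statistics from the raw dataframe, then infer a Schema proto.
train_stats = tfdv.generate_statistics_from_dataframe(pandas_train)
inferred_schema = tfdv.infer_schema(statistics=train_stats)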

Functions for converting rows to Examples

from typing import List,Union,Optional
from tensorflow_metadata.proto.v0 import schema_pb2
import pandas as pd
import tensorflow_data_validation as tfdv
import numpy as np
def create_example_by_schema_from_dataframe(row:pd.Series,column_names:List[str],schema_or_schemapath:Union[str,schema_pb2.Schema]):
    """
    根据数据原来的Schema信息将输入的一行数据转换为序列化后的example
    input:
        row:类型为pd.Series的一行数据
        column_names:类型为列表,包含需要转换的列名
        schema_or_schemapath:数据的Schema实例或者Schema的路径(需要具体到schema.pbtxt)
    output:
        example:example数据
    """
    features = {}
    if isinstance(schema_or_schemapath,str):
        schema_or_schemapath=tfdv.load_schema_text(schema_or_schemapath)
    for columnName in column_names:
        typeCode = tfdv.get_feature(schema_or_schemapath,columnName).type
        tempvalue = None
        if typeCode == 1: #string
            if pd.isna(row[columnName]):
                tempvalue = b''
            else:
                tempvalue = row[columnName].encode()
            features[columnName] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[tempvalue]))
        elif typeCode == 2: #int
            if pd.isna(row[columnName]):
                tempvalue = 0
            else:
                tempvalue = int(row[columnName])
            features[columnName] = tf.train.Feature(int64_list=tf.train.Int64List(value=[tempvalue]))
        elif typeCode == 3: #float
            if pd.isna(row[columnName]):
                tempvalue = 0.0
            else:
                tempvalue = float(row[columnName])
            features[columnName] = tf.train.Feature(float_list=tf.train.FloatList(value=[tempvalue]))
    example_proto = tf.train.Example(features=tf.train.Features(feature=features))
    return example_proto

def create_feature_spec_by_schema_from_columnnames(column_names:List[str],schema_or_schemapath:Union[str,schema_pb2.Schema],all_num_to_float:bool=False):
    """
    根据Schema从需要的列中获得它的Feature_spec
    input:
        column_names:类型为列表,包含需要的列名
        schema_or_schemapath:数据的Schema实例或者Schema的路径(需要具体到schema.pbtxt)
        all_num_to_float:是否将所有的数值类型设为tf.float64
    output:
        output_dict:获得的feature_spec
    """
    output_dict={}
    if isinstance(schema_or_schemapath,str):
        schema_or_schemapath=tfdv.load_schema_text(schema_or_schemapath)
    for columnName in column_names:
        typeCode = tfdv.get_feature(schema_or_schemapath,columnName).type
        if typeCode == 1: #string
            output_dict[columnName] = tf.io.FixedLenFeature([],tf.string)
        elif typeCode == 2: #int
            if all_num_to_float:
                output_dict[columnName] = tf.io.FixedLenFeature([], tf.float32)
            else:
                output_dict[columnName] = tf.io.FixedLenFeature([], tf.int64)
        elif typeCode == 3: #float
            output_dict[columnName] = tf.io.FixedLenFeature([], tf.float32)
    return output_dict
tf_example = create_example_by_schema_from_dataframe(pandas_train.loc[0],NUMERIC_FEATURE_KEYS+CATEGORICAL_FEATURE_KEYS,SCHEMA)
tf_example.features.feature['age']
float_list {
  value: 39.0
}
created_feature_spec=create_feature_spec_by_schema_from_columnnames(NUMERIC_FEATURE_KEYS+CATEGORICAL_FEATURE_KEYS,SCHEMA)
created_feature_spec
{'age': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'capital-gain': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'capital-loss': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'hours-per-week': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'education-num': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'workclass': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'education': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'marital-status': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'occupation': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'relationship': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'race': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'sex': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'native-country': FixedLenFeature(shape=[], dtype=tf.string, default_value=None)}
decoded_tensor = tf.io.parse_single_example(
    tf_example.SerializeToString(),
    features=created_feature_spec
)
decoded_tensor
{'age': <tf.Tensor: shape=(), dtype=float32, numpy=39.0>,
 'capital-gain': <tf.Tensor: shape=(), dtype=float32, numpy=2174.0>,
 'capital-loss': <tf.Tensor: shape=(), dtype=float32, numpy=0.0>,
 'education': <tf.Tensor: shape=(), dtype=string, numpy=b' Bachelors'>,
 'education-num': <tf.Tensor: shape=(), dtype=float32, numpy=13.0>,
 'hours-per-week': <tf.Tensor: shape=(), dtype=float32, numpy=40.0>,
 'marital-status': <tf.Tensor: shape=(), dtype=string, numpy=b' Never-married'>,
 'native-country': <tf.Tensor: shape=(), dtype=string, numpy=b' United-States'>,
 'occupation': <tf.Tensor: shape=(), dtype=string, numpy=b' Adm-clerical'>,
 'race': <tf.Tensor: shape=(), dtype=string, numpy=b' White'>,
 'relationship': <tf.Tensor: shape=(), dtype=string, numpy=b' Not-in-family'>,
 'sex': <tf.Tensor: shape=(), dtype=string, numpy=b' Male'>,
 'workclass': <tf.Tensor: shape=(), dtype=string, numpy=b' State-gov'>}
'marital-status' in tf_example.features.feature.keys()
True

Preprocessing constants

NUM_OOV_BUCKETS = 1

EPOCH_SPLITS = 10  # split each full pass over the data into 10 shorter Keras "epochs"
TRAIN_NUM_EPOCHS = 2*EPOCH_SPLITS  # i.e. two full passes over the training set
NUM_TRAIN_INSTANCES = len(pandas_train)
NUM_TEST_INSTANCES = len(pandas_test)

BATCH_SIZE = 128

STEPS_PER_TRAIN_EPOCH = tf.math.ceil(NUM_TRAIN_INSTANCES/BATCH_SIZE/EPOCH_SPLITS)
EVALUATION_STEPS = tf.math.ceil(NUM_TEST_INSTANCES/BATCH_SIZE)

# Names of temp files
TRANSFORMED_TRAIN_DATA_FILEBASE = 'train_transformed'
TRANSFORMED_TEST_DATA_FILEBASE = 'test_transformed'
EXPORTED_MODEL_DIR = 'exported_model_dir'
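
As a quick sanity check on these constants (assuming the standard census split of 32,561 training rows and roughly 16,280 usable test rows):

# ceil(32561 / 128 / 10) = 26 steps per logged "epoch" and
# ceil(~16280 / 128) = 128 evaluation steps, which match the 26/26 and
# 128/128 progress bars in the training log further below.
print(int(STEPS_PER_TRAIN_EPOCH), int(EVALUATION_STEPS))  # 26 128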

Preprocessing

def preprocessing_fn(inputs):
    outputs = inputs.copy()
    
    for key in NUMERIC_FEATURE_KEYS:
        outputs[key] = tft.scale_to_0_1(inputs[key])
    
    for key in CATEGORICAL_FEATURE_KEYS:
        outputs[key] = tft.compute_and_apply_vocabulary(
            tf.strings.strip(inputs[key]),
            num_oov_buckets=NUM_OOV_BUCKETS,
            vocab_filename=key
        )
        
    # The test set's labels carry a trailing '.' (e.g. '<=50K.'); strip it
    # below, then map '>50K' -> 0 and '<=50K' -> 1 via a static lookup table.
    table_keys = ['>50K','<=50K']
    with tf.init_scope():
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=table_keys,
            values=tf.cast(tf.range(len(table_keys)),tf.int64),
            key_dtype=tf.string,
            value_dtype=tf.int64
        )
        table = tf.lookup.StaticHashTable(initializer,default_value=-1)
    label_str = inputs[LABEL_KEY]
    label_str = tf.strings.regex_replace(label_str,r'\.$','')
    label_str = tf.strings.strip(label_str)
    data_labels = table.lookup(label_str)
    transformed_label = tf.one_hot(
        indices=data_labels,depth=len(table_keys),on_value=1.0,off_value=0.0
    )
    outputs[LABEL_KEY] = tf.reshape(transformed_label, [-1, len(table_keys)])
    
    return outputs
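
Before wiring preprocessing_fn into the full Beam pipeline below, here is a minimal, self-contained sketch (toy data, not from this dataset) of how a preprocessing function runs through tft_beam.AnalyzeAndTransformDataset on in-memory dicts:

from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

toy_data = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
toy_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {'x': tf.io.FixedLenFeature([], tf.float32)}))

def toy_preprocessing_fn(inputs):
    # scale_to_0_1 is an analyzer-backed mapper: a full pass computes min/max,
    # then the mapping is applied to every instance.
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    (toy_transformed, _), _ = (
        (toy_data, toy_metadata)
        | tft_beam.AnalyzeAndTransformDataset(toy_preprocessing_fn))
print(toy_transformed)  # expected: [{'x_scaled': 0.0}, {'x_scaled': 0.5}, {'x_scaled': 1.0}]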
    
def transform_data(train_data_file,test_data_file,working_dir):
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            
            # Read the training CSV
            train_csv_tfxio = tfxio.CsvTFXIO(
                file_pattern=train_data_file,
                telemetry_descriptors=[],
                column_names=ORDERED_CSV_COLUMNS,
                schema=SCHEMA
            )
            raw_data = (
                pipeline |
                'ReadTrainCsv' >> train_csv_tfxio.BeamSource()
            )
            
            # Wrap the data with its tensor-adapter metadata
            cfg = train_csv_tfxio.TensorAdapterConfig()
            raw_dataset = (raw_data,cfg)
            
            # Analyze the data and transform it in one pass
            transformed_dataset, transform_fn = (
                raw_dataset | tft_beam.AnalyzeAndTransformDataset(
                    preprocessing_fn,output_record_batches=True)
            )
            
            # Encode the transformed RecordBatches as Examples and write them out
            transformed_data, _ = transformed_dataset
            coder = RecordBatchToExamplesEncoder()
            _ = (
                transformed_data
                | 'EncodeTrainData' >>
                beam.FlatMapTuple(lambda batch, _: coder.encode(batch))  # encode transformed RecordBatches as serialized Examples
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(           # write the serialized Examples to TFRecord
                    os.path.join(working_dir,TRANSFORMED_TRAIN_DATA_FILEBASE))
            )
            
            # The same steps, applied to the test set
            test_csv_tfxio = tfxio.CsvTFXIO(
                file_pattern=test_data_file,
                skip_header_lines=1,
                telemetry_descriptors=[],
                column_names=ORDERED_CSV_COLUMNS,
                schema=SCHEMA
            )
            raw_test_data = (
                pipeline | 'ReadTestCsv' >> test_csv_tfxio.BeamSource()
            )
            raw_test_dataset = (raw_test_data,test_csv_tfxio.TensorAdapterConfig())
            
            # Apply the transform_fn learned from the training data
            transformed_test_dataset = (
                (raw_test_dataset, transform_fn)
                | tft_beam.TransformDataset(output_record_batches=True)
            )
            transformed_test_data, _ = transformed_test_dataset
            
            _ = (
                transformed_test_data
                | 'EncodeTestData' >>
                beam.FlatMapTuple(lambda batch, _ :coder.encode(batch))
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir,TRANSFORMED_TEST_DATA_FILEBASE))
            )
            
            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)
            )

Inspect the transformed data

output_dir = os.path.join(tempfile.mkdtemp(), 'keras')
transform_data(train_path, test_path, output_dir)
!ls {output_dir}
test_transformed-00000-of-00001   transform_fn
train_transformed-00000-of-00001  transformed_metadata


# Note: after the transform the label's feature spec has shape [2] (one-hot)
tf_transform_output = tft.TFTransformOutput(output_dir)
tf_transform_output.transformed_feature_spec()
{'age': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'capital-gain': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'capital-loss': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'education': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'education-num': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'hours-per-week': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'label': FixedLenFeature(shape=[2], dtype=tf.float32, default_value=None),
 'marital-status': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'native-country': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'occupation': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'race': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'relationship': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'sex': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'workclass': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None)}
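
Because each categorical column was written with vocab_filename=key, the learned vocabularies are saved as assets of the transform graph and can be inspected directly (a small sketch using TFTransformOutput.vocabulary_file_by_name):

# Entries are ordered by descending frequency, so line i holds the string that
# compute_and_apply_vocabulary mapped to integer index i.
vocab_path = tf_transform_output.vocabulary_file_by_name('workclass')
with open(vocab_path) as f:
    print(f.read().splitlines()[:5])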
def _make_training_input_fn(tf_transform_output, train_file_pattern,
                            batch_size):
  """An input function reading from transformed data, converting to model input.

  Args:
    tf_transform_output: Wrapper around output of tf.Transform.
    transformed_examples: Base filename of examples.
    batch_size: Batch size.

  Returns:
    The input data for training or eval, in the form of k.
  """
  def input_fn():
    return tf.data.experimental.make_batched_features_dataset( # yields dicts of features, with the label split out
        file_pattern=train_file_pattern,
        batch_size=batch_size,
        features=tf_transform_output.transformed_feature_spec(),
        reader=tf.data.TFRecordDataset,
        label_key=LABEL_KEY,
        shuffle=True)

  return input_fn
train_file_pattern = pathlib.Path(output_dir)/f'{TRANSFORMED_TRAIN_DATA_FILEBASE}*'

input_fn = _make_training_input_fn(
    tf_transform_output=tf_transform_output,
    train_file_pattern = str(train_file_pattern),
    batch_size = 10
)
for example, label in input_fn().take(5):
  break

pd.DataFrame(example)
        age  capital-gain  capital-loss  education  education-num  hours-per-week  marital-status  native-country  occupation  race  relationship  sex  workclass
0  0.150685      0.000000      0.000000          2       0.800000        0.500000               0               0           4     0             0    0          0
1  0.000000      0.010550      0.000000         10       0.266667        0.234694               1               0           5     0             2    0          0
2  0.397260      0.000000      0.000000          0       0.533333        0.377551               2               0           6     0             1    1          0
3  0.068493      0.000000      0.000000          1       0.600000        0.397959               1               0           8     0             1    0          0
4  0.287671      0.000000      0.453857          9       0.933333        0.500000               0               0           0     0             0    0          6
5  0.027397      0.000000      0.000000          0       0.533333        0.397959               1               0           6     1             2    1          0
6  0.191781      0.000000      0.000000          2       0.800000        0.397959               0               0           0     0             4    1          0
7  0.657534      0.200512      0.000000          3       0.866667        0.397959               0               0           4     0             0    0          0
8  0.534247      0.000000      0.000000          2       0.800000        0.397959               4               0           5     0             1    1          0
9  0.315068      0.000000      0.000000          2       0.800000        0.367347               1               0           5     0             1    0          0
label
<tf.Tensor: shape=(10, 2), dtype=float32, numpy=
array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.]], dtype=float32)>
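
Each label row is exactly the one-hot vector built in preprocessing_fn: with table_keys = ['>50K', '<=50K'], [1., 0.] means '>50K' and [0., 1.] means '<=50K'.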

Build, train, and evaluate the model

def build_keras_inputs(working_dir):
    tf_transform_output = tft.TFTransformOutput(working_dir)
    feature_spec = tf_transform_output.transformed_feature_spec().copy()
    feature_spec.pop(LABEL_KEY)
    
    inputs={}
    for key, spec in feature_spec.items():
        if isinstance(spec,tf.io.VarLenFeature):
            inputs[key] = tf.keras.layers.Input(
                shape=[None],name=key,dtype=spec.dtype,sparse=True)
        elif isinstance(spec,tf.io.FixedLenFeature):
            inputs[key] = tf.keras.layers.Input(
                shape=spec.shape,name=key,dtype=spec.dtype)
        else:
            raise ValueError('Spec type is not supported:',key,spec)
    return inputs

def encode_inputs(inputs):
    # Multi-hot/one-hot encode categorical vocabulary indices via CategoryEncoding;
    # numeric features pass through unchanged. Relies on the module-level
    # tf_transform_output to look up each vocabulary's size (incl. the OOV bucket).
    encoded_inputs = {}
    for key in inputs:
        feature = tf.expand_dims(inputs[key],-1)
        if key in CATEGORICAL_FEATURE_KEYS:
            num_buckets = tf_transform_output.num_buckets_for_transformed_feature(key)
            encoding_layer = (
                tf.keras.layers.CategoryEncoding(
                    num_tokens=num_buckets,output_mode='binary',sparse=False))
            encoded_inputs[key] = encoding_layer(feature)
        else:
            encoded_inputs[key] = feature
    return encoded_inputs

def build_keras_model(working_dir):
    inputs = build_keras_inputs(working_dir)
    encoded_inputs = encode_inputs(inputs)
    
    stacked_inputs = tf.concat(tf.nest.flatten(encoded_inputs),axis=1)
    output = tf.keras.layers.Dense(100,activation='relu')(stacked_inputs)
    output = tf.keras.layers.Dense(50,activation='relu')(output)
    output = tf.keras.layers.Dense(2)(output)
    model = tf.keras.Model(inputs=inputs,outputs=output)
    return model
model = build_keras_model(output_dir)

tf.keras.utils.plot_model(model,rankdir='LR', show_shapes=True)

[Figure: model architecture rendered by tf.keras.utils.plot_model]

def get_dataset(working_dir, filebase):
  tf_transform_output = tft.TFTransformOutput(working_dir)

  data_path_pattern = os.path.join(
      working_dir,
      filebase + '*')

  input_fn = _make_training_input_fn(
      tf_transform_output,
      data_path_pattern,
      batch_size=BATCH_SIZE)

  dataset = input_fn()

  return dataset
def train_model(model, train_dataset, validation_dataset):
  model.compile(optimizer='adam',
                loss=tf.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])

  history = model.fit(train_dataset, validation_data=validation_dataset,
      epochs=TRAIN_NUM_EPOCHS,
      steps_per_epoch=STEPS_PER_TRAIN_EPOCH,
      validation_steps=EVALUATION_STEPS)
  return history

def train_and_evaluate(
    model,
    working_dir):
  """Train the model on training data and evaluate on test data.

  Args:
    working_dir: The location of the Transform output.
    num_train_instances: Number of instances in train set
    num_test_instances: Number of instances in test set

  Returns:
    The results from the estimator's 'evaluate' method
  """
  train_dataset = get_dataset(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)
  validation_dataset = get_dataset(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)

  model = build_keras_model(working_dir)  # note: rebuilds the model, shadowing the one passed in

  history = train_model(model, train_dataset, validation_dataset)

  metric_values = model.evaluate(validation_dataset,
                                 steps=EVALUATION_STEPS,
                                 return_dict=True)
  return model, history, metric_values
model, history, metric_values = train_and_evaluate(model, output_dir)
Epoch 1/20
26/26 [==============================] - 2s 30ms/step - loss: 0.5085 - accuracy: 0.7623 - val_loss: 0.4191 - val_accuracy: 0.7864
Epoch 2/20
26/26 [==============================] - 0s 18ms/step - loss: 0.4029 - accuracy: 0.8092 - val_loss: 0.3704 - val_accuracy: 0.8270
Epoch 3/20
26/26 [==============================] - 1s 22ms/step - loss: 0.3721 - accuracy: 0.8302 - val_loss: 0.3596 - val_accuracy: 0.8329
Epoch 4/20
26/26 [==============================] - 1s 21ms/step - loss: 0.3517 - accuracy: 0.8332 - val_loss: 0.3527 - val_accuracy: 0.8349
Epoch 5/20
26/26 [==============================] - 0s 19ms/step - loss: 0.3494 - accuracy: 0.8389 - val_loss: 0.3467 - val_accuracy: 0.8388
Epoch 6/20
26/26 [==============================] - 0s 19ms/step - loss: 0.3401 - accuracy: 0.8425 - val_loss: 0.3477 - val_accuracy: 0.8396
Epoch 7/20
26/26 [==============================] - 0s 19ms/step - loss: 0.3461 - accuracy: 0.8419 - val_loss: 0.3444 - val_accuracy: 0.8395
Epoch 8/20
26/26 [==============================] - 0s 20ms/step - loss: 0.3570 - accuracy: 0.8305 - val_loss: 0.3427 - val_accuracy: 0.8417
Epoch 9/20
26/26 [==============================] - 1s 24ms/step - loss: 0.3387 - accuracy: 0.8447 - val_loss: 0.3406 - val_accuracy: 0.8420
Epoch 10/20
26/26 [==============================] - 1s 20ms/step - loss: 0.3419 - accuracy: 0.8431 - val_loss: 0.3386 - val_accuracy: 0.8433
Epoch 11/20
26/26 [==============================] - 1s 23ms/step - loss: 0.3423 - accuracy: 0.8401 - val_loss: 0.3391 - val_accuracy: 0.8422
Epoch 12/20
26/26 [==============================] - 1s 25ms/step - loss: 0.3364 - accuracy: 0.8459 - val_loss: 0.3360 - val_accuracy: 0.8442
Epoch 13/20
26/26 [==============================] - 1s 20ms/step - loss: 0.3325 - accuracy: 0.8543 - val_loss: 0.3350 - val_accuracy: 0.8436
Epoch 14/20
26/26 [==============================] - 1s 23ms/step - loss: 0.3287 - accuracy: 0.8474 - val_loss: 0.3334 - val_accuracy: 0.8441
Epoch 15/20
26/26 [==============================] - 0s 20ms/step - loss: 0.3395 - accuracy: 0.8404 - val_loss: 0.3348 - val_accuracy: 0.8434
Epoch 16/20
26/26 [==============================] - 0s 19ms/step - loss: 0.3307 - accuracy: 0.8537 - val_loss: 0.3352 - val_accuracy: 0.8435
Epoch 17/20
26/26 [==============================] - 0s 19ms/step - loss: 0.3197 - accuracy: 0.8483 - val_loss: 0.3336 - val_accuracy: 0.8443
Epoch 18/20
26/26 [==============================] - 1s 22ms/step - loss: 0.3479 - accuracy: 0.8332 - val_loss: 0.3303 - val_accuracy: 0.8455
Epoch 19/20
26/26 [==============================] - 1s 22ms/step - loss: 0.3300 - accuracy: 0.8459 - val_loss: 0.3343 - val_accuracy: 0.8411
Epoch 20/20
26/26 [==============================] - 1s 20ms/step - loss: 0.3393 - accuracy: 0.8374 - val_loss: 0.3311 - val_accuracy: 0.8438
128/128 [==============================] - 1s 2ms/step - loss: 0.3304 - accuracy: 0.8441
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Eval')
plt.ylim(0,max(plt.ylim()))
plt.legend()
plt.title('Loss');

[Figure: training vs. evaluation loss curves]

Testing the preprocessing function

def read_csv(file_name, batch_size):
  return tf.data.experimental.make_csv_dataset(  # yields dicts of feature tensors
        file_pattern=file_name,
        batch_size=batch_size,
        column_names=ORDERED_CSV_COLUMNS,
        column_defaults=COLUMN_DEFAULTS,
        prefetch_buffer_size=0,
        ignore_errors=True)
for ex in read_csv(test_path, batch_size=5):
  break

ex2 = ex.copy()
ex2.pop('fnlwgt')

tft_layer = tf_transform_output.transform_features_layer()
t_ex = tft_layer(ex2)

label = t_ex.pop(LABEL_KEY)
pd.DataFrame(t_ex)
   sex  marital-status  occupation  native-country  relationship  education  workclass  capital-gain  education-num  hours-per-week  capital-loss  race       age
0    1               0           3               0             4          2          0           0.0       0.800000        0.397959           0.0     0  0.109589
1    1               1           5               0             3          4          0           0.0       0.666667        0.091837           0.0     0  0.219178
2    0               1           7               1             1         13          3           0.0       0.133333        0.244898           0.0     0  0.041096
3    1               1           0               0             1          3          0           0.0       0.866667        0.397959           0.0     0  0.520548
4    0               1           7               0             1          2          3           0.0       0.800000        0.346939           0.0     0  0.082192
pdex2=pd.DataFrame(ex2)
pdex2
    age  workclass    education      education-num  marital-status          occupation          relationship       race       sex         capital-gain  capital-loss  hours-per-week  native-country     label
0  25.0  b' Private'  b' Bachelors'           13.0  b' Married-civ-spouse'  b' Adm-clerical'    b' Wife'           b' White'  b' Female'           0.0           0.0            40.0  b' United-States'  b' <=50K.'
1  33.0  b' Private'  b' Assoc-voc'           11.0  b' Never-married'       b' Other-service'   b' Unmarried'      b' White'  b' Female'           0.0           0.0            10.0  b' United-States'  b' <=50K.'
2  20.0  b' ?'        b' 5th-6th'              3.0  b' Never-married'       b' ?'               b' Not-in-family'  b' White'  b' Male'             0.0           0.0            25.0  b' Mexico'         b' <=50K.'
3  55.0  b' Private'  b' Masters'             14.0  b' Never-married'       b' Prof-specialty'  b' Not-in-family'  b' White'  b' Female'           0.0           0.0            40.0  b' United-States'  b' <=50K.'
4  23.0  b' ?'        b' Bachelors'           13.0  b' Never-married'       b' ?'               b' Not-in-family'  b' White'  b' Male'             0.0           0.0            35.0  b' United-States'  b' <=50K.'
pdex2['age']
0    25.0
1    33.0
2    20.0
3    55.0
4    23.0
Name: age, dtype: float32
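
The EXPORTED_MODEL_DIR constant defined earlier is never used above. As a closing sketch (the wrapper class below is an illustrative assumption, following the pattern of the official census tutorial, and assumes raw batches shaped like ex2, i.e. with the label present and fnlwgt dropped), the transform layer and trained model can be saved together so that serving accepts raw features:

class ExportModel(tf.Module):
    def __init__(self, trained_model, transform_layer):
        super().__init__()
        self.trained_model = trained_model
        self.transform_layer = transform_layer

    @tf.function
    def __call__(self, raw_features):
        # Apply the tf.Transform graph, drop the label, then run the Keras model.
        transformed = self.transform_layer(raw_features)
        transformed.pop(LABEL_KEY, None)
        return self.trained_model(transformed)

export_model = ExportModel(model, tft_layer)
_ = export_model(ex2)  # trace once so a concrete function is saved
export_dir = os.path.join(output_dir, EXPORTED_MODEL_DIR)
tf.saved_model.save(export_model, export_dir)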

Testing tft functions (ones that can be used directly)

tft.apply_buckets(pdex2['age'],[[10,20,30,40,50,60]])
<tf.Tensor: shape=(5,), dtype=int64, numpy=array([2, 3, 2, 5, 2])>
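
The bucket index is the number of boundaries less than or equal to the value, so age 20 lands in bucket 2 (boundaries 10 and 20) and age 55 in bucket 5.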
tft.apply_buckets_with_interpolation(tf.constant(pdex2['age'],dtype=tf.float32),[[10,20,30,40,50,60]])
<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([0.3       , 0.45999998, 0.2       , 0.9       , 0.26      ],
      dtype=float32)>
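
The interpolated variant is piecewise-linear inside each bucket and spans [0, 1] across the boundary range, so age 25 maps to (25 - 10) / (60 - 10) = 0.3, matching the first entry above.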
sparse = tf.SparseTensor(indices=[[0, 0], [0, 1], [2, 2]],
                         values=['a', 'b', 'c'], dense_shape=(4, 4))
sparse
SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [2 2]], shape=(3, 2), dtype=int64), values=tf.Tensor([b'a' b'b' b'c'], shape=(3,), dtype=string), dense_shape=tf.Tensor([4 4], shape=(2,), dtype=int64))
tft.bag_of_words(sparse,(1,2),' ')
SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [0 2]
 [2 0]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a' b'a b' b'b' b'c'], shape=(4,), dtype=string), dense_shape=tf.Tensor([4 3], shape=(2,), dtype=int64))
tft.ngrams(sparse,(1,2),' ')
SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [0 2]
 [2 0]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a' b'a b' b'b' b'c'], shape=(4,), dtype=string), dense_shape=tf.Tensor([4 3], shape=(2,), dtype=int64))
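
tft.bag_of_words is tft.ngrams followed by per-row deduplication; the two outputs coincide here because no n-gram repeats within a row.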
tft.deduplicate_tensor_per_row(tf.constant([[1,1,2],[2,3,2]])) #tf.sparse.to_dense(tft.deduplicate_tensor_per_row(tf.constant([[1,1,2],[2,3,2]])))
SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [1 0]
 [1 1]], shape=(4, 2), dtype=int64), values=tf.Tensor([1 2 2 3], shape=(4,), dtype=int32), dense_shape=tf.Tensor([2 2], shape=(2,), dtype=int64))
tft.hash_strings(tf.constant(pdex2['sex'],dtype=tf.string),3)
<tf.Tensor: shape=(5,), dtype=int64, numpy=array([2, 2, 0, 2, 0])>
tft.word_count(sparse)
<tf.Tensor: shape=(4,), dtype=int64, numpy=array([2, 0, 1, 0])>
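
tft.word_count returns the token count of each row of the original input: row 0 holds 'a' and 'b', rows 1 and 3 are empty, and row 2 holds 'c', giving [2, 0, 1, 0].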