A Complete TFX Pipeline Example with Supplementary Notes (Part 1)

A Complete TFX Pipeline Example with Supplementary Notes (Part 2)

# Uninstall shapely to work around a TFX dependency conflict
# (the warning below just means it was not installed to begin with).
!pip uninstall shapely -y
WARNING: Skipping shapely as it is not installed
import os
import pprint
import tempfile
import shutil 
import urllib.request

import absl
import tensorflow as tf
import tensorflow_model_analysis as tfma
tf.get_logger().propagate = False
pp = pprint.PrettyPrinter()

from tfx import v1 as tfx
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip
2023-06-27 11:00:21.593290: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-27 11:00:22.444429: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
WARNING:absl:Failed to import tensorflow serving protos. It can fail if the TF version doesn't match with the TF Serving version. We will try importing again with a workaround:module 'tensorflow.core.protobuf.error_codes_pb2' has no attribute '_CODE'
print('TensorFlow version: {}'.format(tf.__version__))
print('TFX version: {}'.format(tfx.__version__))
TensorFlow version: 2.12.0
TFX version: 1.13.0
# This is the root directory for your TFX pip package installation.
_tfx_root = tfx.__path__[0]

# This is the directory containing the TFX Chicago Taxi Pipeline example.
_taxi_root = os.path.join(_tfx_root, 'examples/chicago_taxi_pipeline')

# This is the path where your model will be pushed for serving.
_serving_model_dir = os.path.join(
    '.', 'serving_model/taxi_simple')

PIPELINE_NAME = "chicago_pipeline"
# Output directory to store artifacts generated from the pipeline.
PIPELINE_ROOT = os.path.join('./pipelines', PIPELINE_NAME)
# Path to a SQLite DB file to use as an MLMD storage.
METADATA_PATH = os.path.join('./metadata', PIPELINE_NAME, 'metadata.db')

# Set up logging.
absl.logging.set_verbosity(absl.logging.INFO)
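
PIPELINE_NAME, PIPELINE_ROOT, and METADATA_PATH are not used by the interactive run below; they come into play when the same components are assembled into a pipeline and handed to an orchestrator. A minimal sketch with the local orchestrator (the components it references are defined later in this post):

# Sketch only: assemble the components into a pipeline and run it locally.
metadata_config = tfx.orchestration.metadata.sqlite_metadata_connection_config(
    METADATA_PATH)
pipeline = tfx.dsl.Pipeline(
    pipeline_name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
    metadata_connection_config=metadata_config,
    components=[example_gen, statistics_gen, schema_gen,
                example_validator, transform],
)
tfx.orchestration.LocalDagRunner().run(pipeline)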

Download example data

_data_root = './tfx-data'
DATA_PATH = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/chicago_taxi_pipeline/data/simple/data.csv'
_data_filepath = os.path.join(_data_root, "data.csv")
# Download the CSV once; skip the download if it is already present locally.
os.makedirs(_data_root, exist_ok=True)
if not os.path.exists(_data_filepath):
    urllib.request.urlretrieve(DATA_PATH, _data_filepath)

pipeline_output_root = './pipeline_output_root'
# Start from a clean output root so reruns don't mix artifacts.
if os.path.exists(pipeline_output_root):
    shutil.rmtree(pipeline_output_root)
os.mkdir(pipeline_output_root)
context = InteractiveContext(pipeline_root=pipeline_output_root)
WARNING:absl:InteractiveContext metadata_connection_config not provided: using SQLite ML Metadata database at ./pipeline_output_root/metadata.sqlite.

ExampleGen

example_gen = tfx.components.CsvExampleGen(input_base=_data_root)
context.run(example_gen)
INFO:absl:Running driver for CsvExampleGen
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:select span and version = (0, None)
INFO:absl:latest span and version = (0, None)
INFO:absl:Running executor for CsvExampleGen
INFO:absl:Generating examples.
WARNING:apache_beam.runners.interactive.interactive_environment:Dependencies required for Interactive Beam PCollection visualization are not available, please use: `pip install apache-beam[interactive]` to install necessary dependencies to enable all data visualization features.




INFO:absl:Processing input csv data ./tfx-data/* to TFExample.
WARNING:apache_beam.io.tfrecordio:Couldn't find python-snappy so the implementation of _TFRecordUtil._masked_crc32c is not as fast as it could be.
INFO:absl:Examples generated.
INFO:absl:Running publisher for CsvExampleGen
INFO:absl:MetadataStore with DB connection initialized
ExecutionResult at 0x7f030868e310
  .execution_id: 1
  .component: CsvExampleGen
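
By default, CsvExampleGen hash-partitions the input 2:1 into train and eval splits. A different ratio can be requested through an output config; a minimal sketch (not used in this run):

output_config = tfx.proto.Output(
    split_config=tfx.proto.SplitConfig(splits=[
        # Three hash buckets for train, one for eval: a 3:1 split.
        tfx.proto.SplitConfig.Split(name='train', hash_buckets=3),
        tfx.proto.SplitConfig.Split(name='eval', hash_buckets=1),
    ]))
example_gen_custom = tfx.components.CsvExampleGen(
    input_base=_data_root, output_config=output_config)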

We can also take a look at the first three training examples:

# Get the URI of the output artifact representing the training examples, which is a directory
train_uri = os.path.join(example_gen.outputs['examples'].get()[0].uri, 'Split-train')

# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Iterate over the first 3 records and decode them.
for tfrecord in dataset.take(3):
  serialized_example = tfrecord.numpy()
  example = tf.train.Example()
  example.ParseFromString(serialized_example)
  pp.pprint(example)
features {
  feature {
    key: "company"
    value {
      bytes_list {
        value: "Chicago Elite Cab Corp. (Chicago Carriag"
      }
    }
  }
  feature {
    key: "dropoff_census_tract"
    value {
      int64_list {
      }
    }
  }
  feature {
    key: "dropoff_community_area"
    value {
      int64_list {
      }
    }
  }
  feature {
    key: "dropoff_latitude"
    value {
      float_list {
      }
    }
  }
  feature {
    key: "dropoff_longitude"
    value {
      float_list {
      }
    }
  }
  feature {
    key: "fare"
    value {
      float_list {
        value: 12.449999809265137
      }
    }
  }
  feature {
    key: "payment_type"
    value {
      bytes_list {
        value: "Credit Card"
      }
    }
  }
  feature {
    key: "pickup_census_tract"
    value {
      int64_list {
      }
    }
  }
  feature {
    key: "pickup_community_area"
    value {
      int64_list {
      }
    }
  }
  feature {
    key: "pickup_latitude"
    value {
      float_list {
      }
    }
  }
  feature {
    key: "pickup_longitude"
    value {
      float_list {
      }
    }
  }
  feature {
    key: "tips"
    value {
      float_list {
        value: 0.0
      }
    }
  }
  feature {
    key: "trip_miles"
    value {
      float_list {
        value: 0.0
      }
    }
  }
  feature {
    key: "trip_seconds"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "trip_start_day"
    value {
      int64_list {
        value: 6
      }
    }
  }
  feature {
    key: "trip_start_hour"
    value {
      int64_list {
        value: 19
      }
    }
  }
  feature {
    key: "trip_start_month"
    value {
      int64_list {
        value: 5
      }
    }
  }
  feature {
    key: "trip_start_timestamp"
    value {
      int64_list {
        value: 1400269500
      }
    }
  }
}

features {
  feature {
    key: "company"
    value {
      bytes_list {
        value: "Taxi Affiliation Services"
      }
    }
  }
  feature {
    key: "dropoff_census_tract"
    value {
      int64_list {
      }
    }
  }
  feature {
    key: "dropoff_community_area"
    value {
      int64_list {
      }
    }
  }
  feature {
    key: "dropoff_latitude"
    value {
      float_list {
      }
    }
  }
  feature {
    key: "dropoff_longitude"
    value {
      float_list {
      }
    }
  }
  feature {
    key: "fare"
    value {
      float_list {
        value: 27.049999237060547
      }
    }
  }
  feature {
    key: "payment_type"
    value {
      bytes_list {
        value: "Cash"
      }
    }
  }
  feature {
    key: "pickup_census_tract"
    value {
      int64_list {
      }
    }
  }
  feature {
    key: "pickup_community_area"
    value {
      int64_list {
        value: 60
      }
    }
  }
  feature {
    key: "pickup_latitude"
    value {
      float_list {
        value: 41.836151123046875
      }
    }
  }
  feature {
    key: "pickup_longitude"
    value {
      float_list {
        value: -87.64878845214844
      }
    }
  }
  feature {
    key: "tips"
    value {
      float_list {
        value: 0.0
      }
    }
  }
  feature {
    key: "trip_miles"
    value {
      float_list {
        value: 12.600000381469727
      }
    }
  }
  feature {
    key: "trip_seconds"
    value {
      int64_list {
        value: 1380
      }
    }
  }
  feature {
    key: "trip_start_day"
    value {
      int64_list {
        value: 3
      }
    }
  }
  feature {
    key: "trip_start_hour"
    value {
      int64_list {
        value: 2
      }
    }
  }
  feature {
    key: "trip_start_month"
    value {
      int64_list {
        value: 10
      }
    }
  }
  feature {
    key: "trip_start_timestamp"
    value {
      int64_list {
        value: 1380593700
      }
    }
  }
}

features {
  feature {
    key: "company"
    value {
      bytes_list {
      }
    }
  }
  feature {
    key: "dropoff_census_tract"
    value {
      int64_list {
      }
    }
  }
  feature {
    key: "dropoff_community_area"
    value {
      int64_list {
      }
    }
  }
  feature {
    key: "dropoff_latitude"
    value {
      float_list {
      }
    }
  }
  feature {
    key: "dropoff_longitude"
    value {
      float_list {
      }
    }
  }
  feature {
    key: "fare"
    value {
      float_list {
        value: 16.450000762939453
      }
    }
  }
  feature {
    key: "payment_type"
    value {
      bytes_list {
        value: "Cash"
      }
    }
  }
  feature {
    key: "pickup_census_tract"
    value {
      int64_list {
      }
    }
  }
  feature {
    key: "pickup_community_area"
    value {
      int64_list {
        value: 13
      }
    }
  }
  feature {
    key: "pickup_latitude"
    value {
      float_list {
        value: 41.98363494873047
      }
    }
  }
  feature {
    key: "pickup_longitude"
    value {
      float_list {
        value: -87.72357940673828
      }
    }
  }
  feature {
    key: "tips"
    value {
      float_list {
        value: 0.0
      }
    }
  }
  feature {
    key: "trip_miles"
    value {
      float_list {
        value: 6.900000095367432
      }
    }
  }
  feature {
    key: "trip_seconds"
    value {
      int64_list {
        value: 780
      }
    }
  }
  feature {
    key: "trip_start_day"
    value {
      int64_list {
        value: 3
      }
    }
  }
  feature {
    key: "trip_start_hour"
    value {
      int64_list {
        value: 12
      }
    }
  }
  feature {
    key: "trip_start_month"
    value {
      int64_list {
        value: 11
      }
    }
  }
  feature {
    key: "trip_start_timestamp"
    value {
      int64_list {
        value: 1446554700
      }
    }
  }
}



2023-06-27 11:00:33.035908: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-27 11:00:33.068969: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-06-27 11:00:33.108659: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
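
The protobuf text format above is verbose. A small helper (hypothetical, not part of the TFX API) can flatten each tf.train.Example into a plain Python dict, which is easier to scan:

# Flatten a tf.train.Example into {feature name: list of values}.
def example_to_dict(example):
  result = {}
  for name, feature in example.features.feature.items():
    # The populated oneof is 'bytes_list', 'float_list' or 'int64_list'.
    kind = feature.WhichOneof('kind')
    result[name] = list(getattr(feature, kind).value) if kind else []
  return result

for tfrecord in dataset.take(1):
  example = tf.train.Example()
  example.ParseFromString(tfrecord.numpy())
  pp.pprint(example_to_dict(example))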

StatisticsGen

statistics_gen = tfx.components.StatisticsGen(
    examples=example_gen.outputs['examples'])
context.run(statistics_gen)
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Running driver for StatisticsGen
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:Running executor for StatisticsGen
INFO:absl:Generating statistics for split train.
INFO:absl:Statistics for split train written to ./pipeline_output_root/StatisticsGen/statistics/2/Split-train.
INFO:absl:Generating statistics for split eval.
INFO:absl:Statistics for split eval written to ./pipeline_output_root/StatisticsGen/statistics/2/Split-eval.
INFO:absl:Running publisher for StatisticsGen
INFO:absl:MetadataStore with DB connection initialized
ExecutionResult at 0x7f0224424fa0
  .execution_id: 2
  .component: StatisticsGen
context.show(statistics_gen.outputs['statistics'])

Artifact at ./pipeline_output_root/StatisticsGen/statistics/2

(Figure: TFDV statistics visualization for the train and eval splits, as rendered by context.show.)
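
The same statistics can also be loaded programmatically with TensorFlow Data Validation. The per-split file name FeatureStats.pb is an assumption based on what recent TFX releases write; check the artifact directory if yours differs:

import tensorflow_data_validation as tfdv

stats_uri = statistics_gen.outputs['statistics'].get()[0].uri
# Load the binary DatasetFeatureStatisticsList proto for the train split.
train_stats = tfdv.load_stats_binary(
    os.path.join(stats_uri, 'Split-train', 'FeatureStats.pb'))
print('Training examples:', train_stats.datasets[0].num_examples)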

SchemaGen

schema_gen = tfx.components.SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=False)
context.run(schema_gen)
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Running driver for SchemaGen
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:Running executor for SchemaGen
INFO:absl:Processing schema from statistics for split train.
INFO:absl:Processing schema from statistics for split eval.
INFO:absl:Schema written to ./pipeline_output_root/SchemaGen/schema/3/schema.pbtxt.
INFO:absl:Running publisher for SchemaGen
INFO:absl:MetadataStore with DB connection initialized
ExecutionResult at 0x7f025352d0d0
  .execution_id: 3
  .component: SchemaGen
context.show(schema_gen.outputs['schema'])

Artifact at ./pipeline_output_root/SchemaGen/schema/3

Feature name              Type    Presence  Valency  Domain
'company'                 STRING  required           'company'
'dropoff_census_tract'    INT     required           -
'dropoff_community_area'  INT     required           -
'dropoff_latitude'        FLOAT   required           -
'dropoff_longitude'       FLOAT   required           -
'fare'                    FLOAT   required  single   -
'payment_type'            STRING  required  single   'payment_type'
'pickup_census_tract'     INT     required           -
'pickup_community_area'   INT     required           -
'pickup_latitude'         FLOAT   required           -
'pickup_longitude'        FLOAT   required           -
'tips'                    FLOAT   required  single   -
'trip_miles'              FLOAT   required  single   -
'trip_seconds'            INT     required           -
'trip_start_day'          INT     required  single   -
'trip_start_hour'         INT     required  single   -
'trip_start_month'        INT     required  single   -
'trip_start_timestamp'    INT     required  single   -

Domain          Values
'company'       '0118 - 42111 Godfrey S.Awir', '1085 - 72312 N and W Cab Co', '2192 - 73487 Zeymane Corp', '2733 - 74600 Benny Jona', '3011 - 66308 JBL Cab Inc.', '3152 - 97284 Crystal Abernathy', '3201 - C&D Cab Co Inc', '3201 - CID Cab Co Inc', '3253 - 91138 Gaither Cab Co.', '3319 - CD Cab Co', '3385 - 23210 Eman Cab', '3385 - Eman Cab', '3623 - 72222 Arrington Enterprises', '3897 - 57856 Ilie Malec', '4053 - 40193 Adwar H. Nikola', '4197 - 41842 Royal Star', '4197 - Royal Star', '4615 - 83503 Tyrone Henderson', '4615 - Tyrone Henderson', '4623 - Jay Kim', '5006 - 39261 Salifu Bawa', '5074 - 54002 Ahzmi Inc', '5074 - Ahzmi Inc', '5129 - 87128', '5129 - 98755 Mengisti Taxi', '585 - 88805 Valley Cab Co', '5864 - Thomas Owusu', '5874 - 73628 Sergey Cab Corp.', '5874 - Sergey Cab Corp.', '5997 - 65283 AW Services Inc.', '6488 - 83287 Zuha Taxi', '6574 - Babylon Express Inc.', '6742 - 83735 Tasha ride inc', 'Blue Ribbon Taxi Association Inc.', 'C & D Cab Co Inc', 'Chicago Elite Cab Corp.', 'Chicago Elite Cab Corp. (Chicago Carriag', 'Chicago Medallion Leasing INC', 'Chicago Medallion Management', 'Choice Taxi Association', 'Dispatch Taxi Affiliation', 'KOAM Taxi Association', 'Northwest Management LLC', 'Taxi Affiliation Services', 'Top Cab Affiliation', '0694 - 59280 Chinesco Trans Inc', '2092 - 61288 Sbeih company', '2192 - Zeymane Corp', '2809 - 95474 C & D Cab Co Inc.', '2823 - 73307 Seung Lee', '3094 - 24059 G.L.B. Cab Co', '3897 - Ilie Malec', '4053 - Adwar H. Nikola', '5006 - Salifu Bawa', '5129 - Mengisti Taxi', '5724 - KYVI Cab Inc', '585 - Valley Cab Co', '5864 - 73614 Thomas Owusu', '5997 - AW Services Inc.', '6057 - 24657 Richard Addo', '6743 - Luhak Corp'
'payment_type'  'Cash', 'Credit Card', 'Dispute', 'No Charge', 'Pcard', 'Unknown', 'Prcard'
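
Since the schema is written as schema.pbtxt (see the log above), it can also be loaded and queried with TFDV, for example to inspect or extend a string domain:

import tensorflow_data_validation as tfdv

schema_uri = schema_gen.outputs['schema'].get()[0].uri
schema = tfdv.load_schema_text(os.path.join(schema_uri, 'schema.pbtxt'))
# Print the allowed values for 'payment_type'; the domain could be extended
# here with tfdv.get_domain(schema, 'payment_type').value.append(...).
print(tfdv.get_domain(schema, 'payment_type'))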

ExampleValidator

example_validator = tfx.components.ExampleValidator(
    statistics=statistics_gen.outputs['statistics'],
    schema=schema_gen.outputs['schema'])
context.run(example_validator)
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Running driver for ExampleValidator
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:Running executor for ExampleValidator
INFO:absl:Validating schema against the computed statistics for split train.
INFO:absl:Validation complete for split train. Anomalies written to ./pipeline_output_root/ExampleValidator/anomalies/4/Split-train.
INFO:absl:Validating schema against the computed statistics for split eval.
INFO:absl:Validation complete for split eval. Anomalies written to ./pipeline_output_root/ExampleValidator/anomalies/4/Split-eval.
INFO:absl:Running publisher for ExampleValidator
INFO:absl:MetadataStore with DB connection initialized
ExecutionResult at 0x7f02534bf2b0
  .execution_id: 4
  .component: ExampleValidator
context.show(example_validator.outputs['anomalies'])

Artifact at ./pipeline_output_root/ExampleValidator/anomalies/4

'train' split:
No anomalies found.

'eval' split:
No anomalies found.
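
The anomalies can also be read back as an Anomalies proto. The per-split file name below is a guess; check the artifact directory for the actual name in your TFX version:

from tensorflow_metadata.proto.v0 import anomalies_pb2

anomalies_uri = example_validator.outputs['anomalies'].get()[0].uri
anomalies = anomalies_pb2.Anomalies()
# 'SchemaDiff.pb' is an assumed file name; adjust to whatever file the
# ExampleValidator wrote under Split-eval.
with open(os.path.join(anomalies_uri, 'Split-eval', 'SchemaDiff.pb'), 'rb') as f:
  anomalies.ParseFromString(f.read())
print(anomalies.anomaly_info or 'No anomalies found.')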

Transform

_taxi_constants_module_file = 'taxi_constants.py'
%%writefile {_taxi_constants_module_file}

NUMERICAL_FEATURES = ['trip_miles', 'fare', 'trip_seconds']

BUCKET_FEATURES = [
    'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
    'dropoff_longitude'
]
# Number of buckets used by tf.transform for encoding each feature.
FEATURE_BUCKET_COUNT = 10

CATEGORICAL_NUMERICAL_FEATURES = [
    'trip_start_hour', 'trip_start_day', 'trip_start_month',
    'pickup_census_tract', 'dropoff_census_tract', 'pickup_community_area',
    'dropoff_community_area'
]

CATEGORICAL_STRING_FEATURES = [
    'payment_type',
    'company',
]

# Number of vocabulary terms used for encoding categorical features.
VOCAB_SIZE = 1000

# Count of out-of-vocab buckets in which unrecognized categorical are hashed.
OOV_SIZE = 10

# Keys
LABEL_KEY = 'tips'
FARE_KEY = 'fare'

def t_name(key):
  """
  Rename the feature keys so that they don't clash with the raw keys when
  running the Evaluator component.
  Args:
    key: The original feature key
  Returns:
    key with '_xf' appended
  """
  return key + '_xf'
Overwriting taxi_constants.py
_taxi_transform_module_file = 'taxi_transform.py'
%%writefile {_taxi_transform_module_file}

import tensorflow as tf
import tensorflow_transform as tft

# Imported files such as taxi_constants are normally cached, so changes are
# not honored after the first import.  Normally this is good for efficiency, but
# during development when we may be iterating code it can be a problem. To
# avoid this problem during development, reload the file.
import taxi_constants
import sys
if 'google.colab' in sys.modules:  # Testing to see if we're doing development
  import importlib
  importlib.reload(taxi_constants)

_NUMERICAL_FEATURES = taxi_constants.NUMERICAL_FEATURES
_BUCKET_FEATURES = taxi_constants.BUCKET_FEATURES
_FEATURE_BUCKET_COUNT = taxi_constants.FEATURE_BUCKET_COUNT
_CATEGORICAL_NUMERICAL_FEATURES = taxi_constants.CATEGORICAL_NUMERICAL_FEATURES
_CATEGORICAL_STRING_FEATURES = taxi_constants.CATEGORICAL_STRING_FEATURES
_VOCAB_SIZE = taxi_constants.VOCAB_SIZE
_OOV_SIZE = taxi_constants.OOV_SIZE
_FARE_KEY = taxi_constants.FARE_KEY
_LABEL_KEY = taxi_constants.LABEL_KEY


def _make_one_hot(x, key):
  """Make a one-hot tensor to encode categorical features.
  Args:
    x: A dense tensor
    key: A string key for the feature in the input
  Returns:
    A dense one-hot tensor as a float list
  """
  integerized = tft.compute_and_apply_vocabulary(x,
          top_k=_VOCAB_SIZE,
          num_oov_buckets=_OOV_SIZE,
          vocab_filename=key, name=key)
  depth = (
      tft.experimental.get_vocabulary_size_by_name(key) + _OOV_SIZE)
  one_hot_encoded = tf.one_hot(
      integerized,
      depth=tf.cast(depth, tf.int32),
      on_value=1.0,
      off_value=0.0)
  return tf.reshape(one_hot_encoded, [-1, depth])


def _fill_in_missing(x):
  """Replace missing values in a SparseTensor.
  Fills in missing values of `x` with '' or 0, and converts to a dense tensor.
  Args:
    x: A `SparseTensor` of rank 2.  Its dense shape should have size at most 1
      in the second dimension.
  Returns:
    A rank 1 tensor where missing values of `x` have been filled in.
  """
  if not isinstance(x, tf.sparse.SparseTensor):
    return x

  default_value = '' if x.dtype == tf.string else 0
  return tf.squeeze(
      tf.sparse.to_dense(
          tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
          default_value),
      axis=1)


def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.
  Args:
    inputs: map from feature keys to raw not-yet-transformed features.
  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _NUMERICAL_FEATURES:
    # If sparse make it dense, setting nan's to 0 or '', and apply zscore.
    outputs[taxi_constants.t_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]), name=key)

  for key in _BUCKET_FEATURES:
    outputs[taxi_constants.t_name(key)] = tf.cast(tft.bucketize(
            _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT, name=key),
            dtype=tf.float32)

  for key in _CATEGORICAL_STRING_FEATURES:
    outputs[taxi_constants.t_name(key)] = _make_one_hot(_fill_in_missing(inputs[key]), key)

  for key in _CATEGORICAL_NUMERICAL_FEATURES:
    outputs[taxi_constants.t_name(key)] = _make_one_hot(tf.strings.strip(
        tf.strings.as_string(_fill_in_missing(inputs[key]))), key)

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_LABEL_KEY] = tf.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs
Overwriting taxi_transform.py
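
Before wiring the module into the Transform component, a quick eager-mode sanity check of _fill_in_missing (a notebook-only sketch): rows with no value are filled with the string default '' and the result is squeezed to rank 1.

from taxi_transform import _fill_in_missing

# Rows 0 and 2 have values; row 1 is missing.
sparse = tf.sparse.SparseTensor(
    indices=[[0, 0], [2, 0]], values=['a', 'b'], dense_shape=[3, 1])
print(_fill_in_missing(sparse).numpy())  # [b'a' b'' b'b']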
transform = tfx.components.Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    module_file=os.path.abspath(_taxi_transform_module_file))
context.run(transform)
INFO:absl:Generating ephemeral wheel package for '/mnt/c/Users/DELL/jupyter_notebook_code/chicago_taxi_pipeline/taxi_transform.py' (including modules: ['taxi_constants', 'taxi_trainer', 'taxi_transform']).
INFO:absl:User module package has hash fingerprint version 79f9cdb8dcb0633411b76b3906a3770b749e6c7c16484cb1f26a1a8c7cbf516a.
INFO:absl:Executing: ['/home/xzy/anaconda3/envs/tfx/bin/python', '/tmp/tmpnf6nwm4u/_tfx_generated_setup.py', 'bdist_wheel', '--bdist-dir', '/tmp/tmpsb53mzqc', '--dist-dir', '/tmp/tmpoct7sroz']
/home/xzy/anaconda3/envs/tfx/lib/python3.9/site-packages/setuptools/_distutils/cmd.py:66: SetuptoolsDeprecationWarning: setup.py install is deprecated.
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer, pypa/build or
        other standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
INFO:absl:Successfully built user code wheel distribution at './pipeline_output_root/_wheels/tfx_user_code_Transform-0.0+79f9cdb8dcb0633411b76b3906a3770b749e6c7c16484cb1f26a1a8c7cbf516a-py3-none-any.whl'; target user module is 'taxi_transform'.
INFO:absl:Full user module path is 'taxi_transform@./pipeline_output_root/_wheels/tfx_user_code_Transform-0.0+79f9cdb8dcb0633411b76b3906a3770b749e6c7c16484cb1f26a1a8c7cbf516a-py3-none-any.whl'
INFO:absl:Running driver for Transform
INFO:absl:MetadataStore with DB connection initialized


running bdist_wheel
running build
running build_py
creating build
creating build/lib
copying taxi_constants.py -> build/lib
copying taxi_trainer.py -> build/lib
copying taxi_transform.py -> build/lib
installing to /tmp/tmpsb53mzqc
running install
running install_lib
copying build/lib/taxi_transform.py -> /tmp/tmpsb53mzqc
copying build/lib/taxi_trainer.py -> /tmp/tmpsb53mzqc
copying build/lib/taxi_constants.py -> /tmp/tmpsb53mzqc
running install_egg_info
running egg_info
creating tfx_user_code_Transform.egg-info
writing tfx_user_code_Transform.egg-info/PKG-INFO
writing dependency_links to tfx_user_code_Transform.egg-info/dependency_links.txt
writing top-level names to tfx_user_code_Transform.egg-info/top_level.txt
writing manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
reading manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
writing manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
... (remaining build output omitted)
ExecutionResult at 0x7f0234132430
  .execution_id: 5
  .component: Transform

We can also take a look at the first three transformed examples:

# Get the URI of the output artifact representing the transformed examples, which is a directory
train_uri = os.path.join(transform.outputs['transformed_examples'].get()[0].uri, 'Split-train')

# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Iterate over the first 3 records and decode them.
# The full output is long, so it is truncated below.
for tfrecord in dataset.take(3):
  serialized_example = tfrecord.numpy()
  example = tf.train.Example()
  example.ParseFromString(serialized_example)
  pp.pprint(example)
features {
  feature {
    key: "company_xf"
    value {
      float_list {
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        ... (remaining output omitted)



2023-06-27 11:01:39.933543: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
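
Finally, the Transform graph artifact can be inspected directly. tft.TFTransformOutput wraps the artifact directory and exposes the transformed feature spec that a downstream Trainer would consume:

import tensorflow_transform as tft

transform_graph_uri = transform.outputs['transform_graph'].get()[0].uri
tft_output = tft.TFTransformOutput(transform_graph_uri)
# One-hot features appear as fixed-length float vectors, scaled numerics as
# scalars, and the label 'tips' as an int64 scalar.
pp.pprint(tft_output.transformed_feature_spec())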