import pathlib
import pprint
import tempfile
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
2023-06-26 23:15:53.877588: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-26 23:15:55.005099: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/TensorRT/lib:/usr/local/cuda-11.7/lib64
2023-06-26 23:15:55.005222: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/TensorRT/lib:/usr/local/cuda-11.7/lib64
2023-06-26 23:15:55.005232: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
准备数据和元数据
# Three toy records, each with two float features ('x', 'y') and one
# string feature ('s').
raw_data = [
    {'x': 1.0, 'y': 1.0, 's': 'hello'},
    {'x': 2.0, 'y': 2.0, 's': 'world'},
    {'x': 3.0, 'y': 3.0, 's': 'hello'},
]

# Approach 1: infer the schema with TFDV.
# NOTE: this import had been fused into the comment above, which left `tfdv`
# undefined for every statement that uses it below.
import tensorflow_data_validation as tfdv
import pandas as pd

# The TFDV statistics call used below takes a DataFrame, so wrap the records.
data = pd.DataFrame(raw_data)
# Dataset statistics that schema inference is driven from.
stat = tfdv.generate_statistics_from_dataframe(data)


def clearDim(schema, stat):
    """Clear the `dim` field of the shape of every column's feature.

    Written to be passed to `tfdv.infer_schema` through its
    `schema_transformations` argument, which is why it accepts a statistics
    argument even though it never reads it.

    Args:
        schema: The schema to modify; it is changed in place.
        stat: Dataset statistics. Unused by this function.

    Returns:
        The same `schema` object, after the `dim` fields have been cleared.
    """
    # Iterates the columns of the module-level DataFrame `data`, so the
    # schema is expected to contain one feature per DataFrame column.
    for field in data.columns:
        tfdv.get_feature(schema, field).shape.ClearField('dim')
    return schema
# The inferred schema must have the `dim` field of each feature's shape
# cleared before it can be used with
# tft_beam.AnalyzeAndTransformDataset(preprocessing_fn); otherwise an error
# is raised.
schema = tfdv.infer_schema(
    stat,
    max_string_domain_size=0,
    schema_transformations=[clearDim],
)
raw_data_metadata = dataset_metadata.DatasetMetadata(schema)

# Approach 2: build the metadata by hand.
# raw_data_metadata = dataset_metadata.DatasetMetadata(
#     schema_utils.schema_from_feature_spec({
#         'y': tf.io.FixedLenFeature([], tf.float32),
#         'x': tf.io.FixedLenFeature([], tf.float32),
#         's': tf.io.FixedLenFeature([], tf.string),
#     }))
准备预处理函数
def preprocessing_fn(inputs):
    """Preprocess the raw input columns into transformed columns.

    Args:
        inputs: Mapping of feature name to tensor; the keys 'x', 'y' and 's'
            are read.

    Returns:
        A dict with four entries:
          'x_centered': `x` minus `tft.mean(x)`.
          'y_normalized': `y` passed through `tft.scale_to_0_1`.
          's_integerized': `s` passed through
              `tft.compute_and_apply_vocabulary`.
          'x_centered_times_y_normalized': the product of the first two.
    """
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = x_centered * y_normalized
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        's_integerized': s_integerized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
    }
执行转换并将预处理操作写入文件
def main(output_dir):
    """Analyze and transform `raw_data`, then write the transform function.

    Runs `tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)` over the
    module-level `(raw_data, raw_data_metadata)` pair and writes the
    resulting transform function to `output_dir` with
    `tft_beam.WriteTransformFn`.

    Args:
        output_dir: Directory passed to `tft_beam.WriteTransformFn`.

    Returns:
        A `(transformed_data, transformed_metadata)` tuple unpacked from the
        transformed dataset.
    """
    # Ignore the warnings
    # The directory made by tempfile.mkdtemp() is not removed by this
    # function.
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
            (raw_data, raw_data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

    # Save the transform_fn to the output_dir
    _ = (
        transform_fn
        | 'WriteTransformFn' >> tft_beam.WriteTransformFn(output_dir))

    return transformed_data, transformed_metadata
WARNING:apache_beam.runners.interactive.interactive_environment:Dependencies required for Interactive Beam PCollection visualization are not available, please use: `pip install apache-beam[interactive]` to install necessary dependencies to enable all data visualization features.
WARNING:tensorflow:You are passing instance dicts and DatasetMetadata to TFT which will not provide optimal performance. Consider following the TFT guide to upgrade to the TFXIO format (Apache Arrow RecordBatch).
WARNING:tensorflow:You are passing instance dicts and DatasetMetadata to TFT which will not provide optimal performance. Consider following the TFT guide to upgrade to the TFXIO format (Apache Arrow RecordBatch).
WARNING:tensorflow:From /home/xzy/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow_transform/tf_utils.py:324: Tensor.experimental_ref (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use ref() instead.
2023-06-26 23:15:57.191927: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-26 23:15:57.288381: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/TensorRT/lib:/usr/local/cuda-11.7/lib64
2023-06-26 23:15:57.288444: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-06-26 23:15:57.289481: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
WARNING:tensorflow:From /home/xzy/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow_transform/tf_utils.py:324: Tensor.experimental_ref (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use ref() instead.
WARNING:tensorflow:You are passing instance dicts and DatasetMetadata to TFT which will not provide optimal performance. Consider following the TFT guide to upgrade to the TFXIO format (Apache Arrow RecordBatch).
WARNING:tensorflow:You are passing instance dicts and DatasetMetadata to TFT which will not provide optimal performance. Consider following the TFT guide to upgrade to the TFXIO format (Apache Arrow RecordBatch).
WARNING:apache_beam.options.pipeline_options:Discarding unparseable args: ['/home/xzy/anaconda3/envs/tf/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/home/xzy/.local/share/jupyter/runtime/kernel-f0c3eac2-332f-48d3-be51-f348ad18cc09.json']
INFO:tensorflow:Assets written to: /tmp/tmpt45kl1nk/tftransform_tmp/2480ce1b114f48579e5a9a3f65cc7f25/assets
INFO:tensorflow:Assets written to: /tmp/tmpt45kl1nk/tftransform_tmp/2480ce1b114f48579e5a9a3f65cc7f25/assets
INFO:tensorflow:struct2tensor is not available.
INFO:tensorflow:struct2tensor is not available.
INFO:tensorflow:tensorflow_decision_forests is not available.
INFO:tensorflow:tensorflow_decision_forests is not available.
INFO:tensorflow:tensorflow_text is not available.
INFO:tensorflow:tensorflow_text is not available.
INFO:tensorflow:Assets written to: /tmp/tmpt45kl1nk/tftransform_tmp/ec61c498ec57446d94add565b8b2c0ce/assets
INFO:tensorflow:Assets written to: /tmp/tmpt45kl1nk/tftransform_tmp/ec61c498ec57446d94add565b8b2c0ce/assets
INFO:tensorflow:struct2tensor is not available.
INFO:tensorflow:struct2tensor is not available.
INFO:tensorflow:tensorflow_decision_forests is not available.
INFO:tensorflow:tensorflow_decision_forests is not available.
INFO:tensorflow:tensorflow_text is not available.
INFO:tensorflow:tensorflow_text is not available.
WARNING:apache_beam.options.pipeline_options:Discarding unparseable args: ['/home/xzy/anaconda3/envs/tf/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/home/xzy/.local/share/jupyter/runtime/kernel-f0c3eac2-332f-48d3-be51-f348ad18cc09.json']
INFO:tensorflow:struct2tensor is not available.
INFO:tensorflow:struct2tensor is not available.
INFO:tensorflow:tensorflow_decision_forests is not available.
INFO:tensorflow:tensorflow_decision_forests is not available.
INFO:tensorflow:tensorflow_text is not available.
INFO:tensorflow:tensorflow_text is not available.
<tensorflow_transform.output_wrapper.TransformFeaturesLayer at 0x7f0525a251c0>
# Column-wise batch built from the row-wise `raw_data` records: one constant
# tensor per feature, with the numeric features forced to float32.
raw_data_batch = {
    's': tf.constant([record['s'] for record in raw_data]),
    'x': tf.constant(
        [record['x'] for record in raw_data], dtype=tf.float32),
    'y': tf.constant(
        [record['y'] for record in raw_data], dtype=tf.float32),
}