涉及的概念:
- Example
- Tensor
- SequenceExample
- Feature
涉及的写入方式
- python
- spark scala
- spark dataframe
写入的数据类型
写入的特征类型
- VarLenFeature
- SparseFeature
- FixedLenFeature
- FixedLenSequenceFeature
# Schema mapping each feature key to the tf.io spec describing how it is parsed.
feature_schema = {
    # featureA: fixed-length string feature holding a single element.
    "featureA": tf.io.FixedLenFeature(
        shape=(1,),
        dtype=tf.string,
        default_value="null",
    ),
    # featureB: fixed-length float feature holding a single element.
    "featureB": tf.io.FixedLenFeature(
        shape=(1,),
        dtype=tf.float32,
        default_value=0.0,
    ),
    # featureC: fixed-length string feature holding three elements.
    "featureC": tf.io.FixedLenFeature(
        shape=(3,),
        dtype=tf.string,
        default_value=["null", "null", "null"],
    ),
    # featureD: fixed-length int64 feature holding two elements.
    "featureD": tf.io.FixedLenFeature(
        shape=(2,),
        dtype=tf.int64,
        default_value=[0, 0],
    ),
    # featureE: string feature with no fixed length.
    "featureE": tf.io.VarLenFeature(dtype=tf.string),
    # featureF: float feature with no fixed length.
    "featureF": tf.io.VarLenFeature(dtype=tf.float32),
    # NOTE(review): presumably per-element weights paired with featureE;
    # confirm. The key spelling is kept exactly as in the original.
    "featureEwhight": tf.io.VarLenFeature(dtype=tf.float32),
    # featureG: string sequence feature, two elements per step.
    "featureG": tf.io.FixedLenSequenceFeature(
        shape=(2,),
        dtype=tf.string,
        allow_missing=True,
        default_value=None,
    ),
    # featureH: int64 sequence feature, three elements per step.
    "featureH": tf.io.FixedLenSequenceFeature(
        shape=(3,),
        dtype=tf.int64,
        allow_missing=True,
        default_value=None,
    ),
    # featureI: sparse string feature of size 21 x 4 x 10, with one index
    # key per dimension and a single value key.
    "featureI": tf.io.SparseFeature(
        index_key=["featureI_Index0", "featureI_Index1", "featureI_Index2"],
        value_key="featureI_value",
        dtype=tf.string,
        size=[21, 4, 10],
        already_sorted=False,
    ),
}
一、python方式写tfrecord
# TensorFlow 2.x
# Create a TFRecord writer for the path "./tfrecord" (relative to the working directory).
writer = tf.io.TFRecordWriter("./tfrecord")
example_1 = tf.train.Example(features=tf.train.Features(feature={
# 数据维度必须为 1
"featureA": tf.train.Feature(bytes_list=tf.train.BytesList(v