import pandas as pd
from google. protobuf import text_format
import tensorflow_model_analysis as tfma
2023-06-26 21:43:20.786840: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-26 21:43:23.042179: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/TensorRT/lib:/usr/local/cuda-11.7/lib64
2023-06-26 21:43:23.042551: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/TensorRT/lib:/usr/local/cuda-11.7/lib64
2023-06-26 21:43:23.042563: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
使用dataframe进行评估
这里假设每条样本的预测值由多个数值组成(即一个列表,例如各类别的 logits)。如果预测只有单个数值,则更简单,像 label 列一样直接存放标量即可。
# Toy evaluation set: one row per example.
#   label      - integer class id
#   prediction - per-example list of scores (one value per class)
#   iden       - categorical feature used later as the slicing key
df = pd.DataFrame(
    dict(
        label=[0, 1, 2],
        prediction=[
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
        ],
        iden=['male', 'female', 'male'],
    )
)
df
label prediction iden 0 0 [1, 2, 3] male 1 1 [4, 5, 6] female 2 2 [7, 8, 9] male
# Build the TFMA EvalConfig from its text-proto form.
#   * model_specs   - names the DataFrame columns holding labels / predictions
#   * metrics_specs - the metrics and plots to compute
#   * slicing_specs - the empty spec is the overall slice; the second one
#                     slices by the 'iden' feature
# The notes embedded in the proto text say: booleans in text-proto fields may
# be written capitalised or lowercase, whereas booleans inside the JSON
# `config` strings must be lowercase or parsing fails.
eval_config = text_format.Parse(
    text="""
model_specs{
label_key: 'label',
prediction_key: 'prediction'
}
metrics_specs{
#aggregate{micro_average: True} #这里的bool可以大写开头或小写
metrics{class_name: "ExampleCount"}
metrics{
class_name: "FairnessIndicators"
config: '{"thresholds": [0.5]}'
}
metrics{class_name: "MultiClassConfusionMatrixPlot"}
metrics{
class_name: "SparseCategoricalCrossentropy"
config:'{"axis":-1 ,"from_logits":true}' #这里的bool需要小写,否则报错
}
metrics{class_name: "SparseCategoricalAccuracy"}
}
slicing_specs{}
slicing_specs{
feature_keys: 'iden'
}
""",
    message=tfma.EvalConfig(),
)
eval_config
model_specs {
label_key: "label"
prediction_key: "prediction"
}
slicing_specs {
}
slicing_specs {
feature_keys: "iden"
}
metrics_specs {
metrics {
class_name: "ExampleCount"
}
metrics {
class_name: "FairnessIndicators"
config: "{\"thresholds\": [0.5]}"
}
metrics {
class_name: "MultiClassConfusionMatrixPlot"
}
metrics {
class_name: "SparseCategoricalCrossentropy"
config: "{\"axis\":-1 ,\"from_logits\":true}"
}
metrics {
class_name: "SparseCategoricalAccuracy"
}
}
# Directory where TFMA writes its metrics / plots / validation files.
output_path = './test4'

# Evaluate the in-memory DataFrame directly (no saved model involved) using
# the metrics and slices declared in `eval_config`.
eval_result = tfma.analyze_raw_data(
    df,
    eval_config=eval_config,
    output_path=output_path,
)
WARNING:apache_beam.io.filebasedsink:Deleting 1 existing files in target path matching:
WARNING:apache_beam.io.filebasedsink:Deleting 1 existing files in target path matching: -*-of-%(num_shards)05d
WARNING:apache_beam.io.filebasedsink:Deleting 1 existing files in target path matching: -*-of-%(num_shards)05d
WARNING:apache_beam.io.filebasedsink:Deleting 1 existing files in target path matching: -*-of-%(num_shards)05d
WARNING:apache_beam.io.filebasedsink:Deleting 1 existing files in target path matching:
可视化切片——指标
# Interactive metrics view, broken down by the values of the 'iden' column.
tfma.view.render_slicing_metrics(
    eval_result,
    slicing_column='iden',
)
可视化Plot
# Render the plot-type metrics for the single slice iden == 'male'.
tfma.view.render_plot(
    eval_result,
    slicing_spec=tfma.SlicingSpec(feature_values=dict(iden='male')),
)
可视化Fairness
# Fairness Indicators widget for the same evaluation result.
tfma.addons.fairness.view.widget_view.render_fairness_indicator(eval_result)
同时可视化多个评估结果——比较
评估结果获取
# List the slice keys present in the result; the empty tuple is the overall slice.
eval_result.get_slice_names()
[(('iden', 'male'),), (), (('iden', 'female'),)]
# List the names of every metric that was computed.
eval_result.get_metric_names()
['fairness_indicators_metrics/true_negative_rate@0.5',
'fairness_indicators_metrics/false_positive_rate@0.5',
'fairness_indicators_metrics/recall@0.5',
'fairness_indicators_metrics/false_omission_rate@0.5',
'fairness_indicators_metrics/false_negative_rate@0.5',
'fairness_indicators_metrics/true_positive_rate@0.5',
'example_count',
'fairness_indicators_metrics/precision@0.5',
'fairness_indicators_metrics/false_discovery_rate@0.5',
'sparse_categorical_accuracy',
'sparse_categorical_crossentropy',
'fairness_indicators_metrics/positive_rate@0.5',
'fairness_indicators_metrics/negative_rate@0.5']
# Mapping of slice key -> {metric name -> value} for every slice.
eval_result.get_metrics_for_all_slices()
{(('iden',
'male'),): {'sparse_categorical_crossentropy': {'doubleValue': 1.4076058864593506},
'sparse_categorical_accuracy': {'doubleValue': 0.5},
'example_count': {'doubleValue': 2.0},
'fairness_indicators_metrics/false_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/false_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/true_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/true_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/false_discovery_rate@0.5': {'doubleValue': 0.6666666666666666},
'fairness_indicators_metrics/false_omission_rate@0.5': {'doubleValue': 'NaN'},
'fairness_indicators_metrics/precision@0.5': {'doubleValue': 0.3333333333333333},
'fairness_indicators_metrics/recall@0.5': {'doubleValue': 1.0}},
(): {'sparse_categorical_crossentropy': {'doubleValue': 1.4076060056686401},
'sparse_categorical_accuracy': {'doubleValue': 0.3333333432674408},
'example_count': {'doubleValue': 3.0},
'fairness_indicators_metrics/false_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/false_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/true_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/true_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/false_discovery_rate@0.5': {'doubleValue': 0.6666666666666666},
'fairness_indicators_metrics/false_omission_rate@0.5': {'doubleValue': 'NaN'},
'fairness_indicators_metrics/precision@0.5': {'doubleValue': 0.3333333333333333},
'fairness_indicators_metrics/recall@0.5': {'doubleValue': 1.0}},
(('iden',
'female'),): {'sparse_categorical_crossentropy': {'doubleValue': 1.4076058864593506},
'sparse_categorical_accuracy': {'doubleValue': 0.0},
'example_count': {'doubleValue': 1.0},
'fairness_indicators_metrics/false_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/false_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/true_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/true_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/false_discovery_rate@0.5': {'doubleValue': 0.6666666666666666},
'fairness_indicators_metrics/false_omission_rate@0.5': {'doubleValue': 'NaN'},
'fairness_indicators_metrics/precision@0.5': {'doubleValue': 0.3333333333333333},
'fairness_indicators_metrics/recall@0.5': {'doubleValue': 1.0}}}
# A slice key is a tuple of (feature, value) pairs, in the same format as the
# entries returned by get_slice_names().
male_slice = (('iden', 'male'),)

# Metrics for that one slice only.
eval_result.get_metrics_for_slice(male_slice)
{'sparse_categorical_crossentropy': {'doubleValue': 1.4076058864593506},
'sparse_categorical_accuracy': {'doubleValue': 0.5},
'example_count': {'doubleValue': 2.0},
'fairness_indicators_metrics/false_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/false_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/true_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/true_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/false_discovery_rate@0.5': {'doubleValue': 0.6666666666666666},
'fairness_indicators_metrics/false_omission_rate@0.5': {'doubleValue': 'NaN'},
'fairness_indicators_metrics/precision@0.5': {'doubleValue': 0.3333333333333333},
'fairness_indicators_metrics/recall@0.5': {'doubleValue': 1.0}}
评估结果——dataframe
# Reload the metrics that were written under `output_path` and convert them
# into DataFrames.
dfs = tfma.experimental.dataframe.metrics_as_dataframes(
    tfma.load_metrics(output_path)
)

# The table holding the double-valued metrics.
dfs.double_value
slices metric_keys metric_values iden Overall name model_name output_name example_weighted is_diff double_value 0 b'male' NaN sparse_categorical_crossentropy False False 1.407606 1 b'male' NaN sparse_categorical_accuracy False False 0.500000 2 b'male' NaN example_count False False 2.000000 3 b'male' NaN fairness_indicators_metrics/false_positive_rat... False False 1.000000 4 b'male' NaN fairness_indicators_metrics/false_negative_rat... False False 0.000000 5 b'male' NaN fairness_indicators_metrics/true_positive_rate... False False 1.000000 6 b'male' NaN fairness_indicators_metrics/true_negative_rate... False False 0.000000 7 b'male' NaN fairness_indicators_metrics/positive_rate@0.5 False False 1.000000 8 b'male' NaN fairness_indicators_metrics/negative_rate@0.5 False False 0.000000 9 b'male' NaN fairness_indicators_metrics/false_discovery_ra... False False 0.666667 10 b'male' NaN fairness_indicators_metrics/false_omission_rat... False False NaN 11 b'male' NaN fairness_indicators_metrics/precision@0.5 False False 0.333333 12 b'male' NaN fairness_indicators_metrics/recall@0.5 False False 1.000000 13 NaN sparse_categorical_crossentropy False False 1.407606 14 NaN sparse_categorical_accuracy False False 0.333333 15 NaN example_count False False 3.000000 16 NaN fairness_indicators_metrics/false_positive_rat... False False 1.000000 17 NaN fairness_indicators_metrics/false_negative_rat... False False 0.000000 18 NaN fairness_indicators_metrics/true_positive_rate... False False 1.000000 19 NaN fairness_indicators_metrics/true_negative_rate... False False 0.000000 20 NaN fairness_indicators_metrics/positive_rate@0.5 False False 1.000000 21 NaN fairness_indicators_metrics/negative_rate@0.5 False False 0.000000 22 NaN fairness_indicators_metrics/false_discovery_ra... False False 0.666667 23 NaN fairness_indicators_metrics/false_omission_rat... 
False False NaN 24 NaN fairness_indicators_metrics/precision@0.5 False False 0.333333 25 NaN fairness_indicators_metrics/recall@0.5 False False 1.000000 26 b'female' NaN sparse_categorical_crossentropy False False 1.407606 27 b'female' NaN sparse_categorical_accuracy False False 0.000000 28 b'female' NaN example_count False False 1.000000 29 b'female' NaN fairness_indicators_metrics/false_positive_rat... False False 1.000000 30 b'female' NaN fairness_indicators_metrics/false_negative_rat... False False 0.000000 31 b'female' NaN fairness_indicators_metrics/true_positive_rate... False False 1.000000 32 b'female' NaN fairness_indicators_metrics/true_negative_rate... False False 0.000000 33 b'female' NaN fairness_indicators_metrics/positive_rate@0.5 False False 1.000000 34 b'female' NaN fairness_indicators_metrics/negative_rate@0.5 False False 0.000000 35 b'female' NaN fairness_indicators_metrics/false_discovery_ra... False False 0.666667 36 b'female' NaN fairness_indicators_metrics/false_omission_rat... False False NaN 37 b'female' NaN fairness_indicators_metrics/precision@0.5 False False 0.333333 38 b'female' NaN fairness_indicators_metrics/recall@0.5 False False 1.000000
# Pivot the long-format table: one row per slice, one column per metric.
tfma.experimental.dataframe.auto_pivot(dfs.double_value)
(metric_keys, name) example_count fairness_indicators_metrics/false_discovery_rate@0.5 fairness_indicators_metrics/false_negative_rate@0.5 fairness_indicators_metrics/false_omission_rate@0.5 fairness_indicators_metrics/false_positive_rate@0.5 fairness_indicators_metrics/negative_rate@0.5 fairness_indicators_metrics/positive_rate@0.5 fairness_indicators_metrics/precision@0.5 fairness_indicators_metrics/recall@0.5 fairness_indicators_metrics/true_negative_rate@0.5 fairness_indicators_metrics/true_positive_rate@0.5 sparse_categorical_accuracy sparse_categorical_crossentropy slices Overall: 3.0 0.666667 0.0 NaN 1.0 0.0 1.0 0.333333 1.0 0.0 1.0 0.333333 1.407606 iden:b'female' 1.0 0.666667 0.0 NaN 1.0 0.0 1.0 0.333333 1.0 0.0 1.0 0.000000 1.407606 iden:b'male' 2.0 0.666667 0.0 NaN 1.0 0.0 1.0 0.333333 1.0 0.0 1.0 0.500000 1.407606
# Keep only the rows belonging to the iden == 'male' slice, then pivot them.
# Slice values appear as bytes in this table, hence the b'male' literal.
df_double = dfs.double_value
df_filtered = df_double.loc[
    df_double.slices.iden == b'male'
]
tfma.experimental.dataframe.auto_pivot(df_filtered)
(metric_keys, name) example_count fairness_indicators_metrics/false_discovery_rate@0.5 fairness_indicators_metrics/false_negative_rate@0.5 fairness_indicators_metrics/false_omission_rate@0.5 fairness_indicators_metrics/false_positive_rate@0.5 fairness_indicators_metrics/negative_rate@0.5 fairness_indicators_metrics/positive_rate@0.5 fairness_indicators_metrics/precision@0.5 fairness_indicators_metrics/recall@0.5 fairness_indicators_metrics/true_negative_rate@0.5 fairness_indicators_metrics/true_positive_rate@0.5 sparse_categorical_accuracy sparse_categorical_crossentropy slices iden:b'male' 2.0 0.666667 0.0 NaN 1.0 0.0 1.0 0.333333 1.0 0.0 1.0 0.5 1.407606
# Load the validation result stored under `output_path`.
# NOTE(review): the eval_config used above declares no thresholds, which is
# presumably why the result reports missing thresholds.
tfma.load_validation_result(output_path)
missing_thresholds: true
Model Validation(多模型)
由于没有相应的数据和模型,本节代码无法在此运行(未经测试),以下内容摘录自 https://tensorflow.google.cn/tfx/tutorials/model_analysis/tfma_basic 。运行前需按该教程准备好代码中用到的 MODELS_DIR、OUTPUT_DIR 和 tfrecord_file。
# EvalConfig for validating a candidate model against a baseline.
#   * two model_specs: "candidate" and "baseline" (the latter flagged with
#     is_baseline), both reading the label from "big_tipper"
#   * AUC carries both a value threshold (lower bound 0.9) and a change
#     threshold relative to the baseline
#   * slices: overall, three single-feature slices, and one feature cross
eval_config_with_thresholds = text_format.Parse(
    text="""
## Model information
model_specs {
name: "candidate"
# For keras we need to add a `label_key`.
label_key: "big_tipper"
}
model_specs {
name: "baseline"
# For keras we need to add a `label_key`.
label_key: "big_tipper"
is_baseline: true
}
## Post training metric information
metrics_specs {
metrics { class_name: "ExampleCount" }
metrics { class_name: "BinaryAccuracy" }
metrics { class_name: "BinaryCrossentropy" }
metrics {
class_name: "AUC"
threshold {
# Ensure that AUC is always > 0.9
value_threshold {
lower_bound { value: 0.9 }
}
# Ensure that AUC does not drop by more than a small epsilon
# e.g. (candidate - baseline) > -1e-10 or candidate > baseline - 1e-10
change_threshold {
direction: HIGHER_IS_BETTER
absolute { value: -1e-10 }
}
}
}
metrics { class_name: "AUCPrecisionRecall" }
metrics { class_name: "Precision" }
metrics { class_name: "Recall" }
metrics { class_name: "MeanLabel" }
metrics { class_name: "MeanPrediction" }
metrics { class_name: "Calibration" }
metrics { class_name: "CalibrationPlot" }
metrics { class_name: "ConfusionMatrixPlot" }
# ... add additional metrics and plots ...
}
## Slicing information
slicing_specs {} # overall slice
slicing_specs {
feature_keys: ["trip_start_hour"]
}
slicing_specs {
feature_keys: ["trip_start_day"]
}
slicing_specs {
feature_keys: ["trip_start_month"]
}
slicing_specs {
feature_keys: ["trip_start_hour", "trip_start_day"]
}
""",
    message=tfma.EvalConfig(),
)
# `os` is not imported anywhere else in this notebook, so import it here;
# without it the os.path.join calls below raise NameError.
import os

# NOTE(review): MODELS_DIR, OUTPUT_DIR and tfrecord_file are not defined in
# this notebook. They presumably come from the setup cells of the tutorial
# this section is excerpted from and must be defined before running this cell.

# Locations of the two exported Keras models being compared.
candidate_model_path = os.path.join(MODELS_DIR, 'keras', '2')
baseline_model_path = os.path.join(MODELS_DIR, 'keras', '1')

# One shared-model wrapper per model, keyed by the candidate / baseline names
# that TFMA exposes as constants.
eval_shared_models = [
    tfma.default_eval_shared_model(
        model_name=tfma.CANDIDATE_KEY,
        eval_saved_model_path=candidate_model_path,
        eval_config=eval_config_with_thresholds,
    ),
    tfma.default_eval_shared_model(
        model_name=tfma.BASELINE_KEY,
        eval_saved_model_path=baseline_model_path,
        eval_config=eval_config_with_thresholds,
    ),
]

# Evaluate both models on the same data; results, including the threshold
# validation outcome, are written under `validation_output_path`.
validation_output_path = os.path.join(OUTPUT_DIR, 'validation')
eval_result_with_validation = tfma.run_model_analysis(
    eval_shared_models,
    eval_config=eval_config_with_thresholds,
    data_location=tfrecord_file,
    output_path=validation_output_path,
)
其他评估模型的方法(使用Beam或非Tensorflow模型)
链接