在做深度学习实验时,经常需要先进行超参数扫描,得到一批实验结果,然后把验证的结果取出来做进一步处理:或是与人工标定的结果进行比较,手动计算一些关键指标(如分类的 Precision 和 Recall),或是绘制一些曲线。下面是从 Azure 上的实验中取出验证(机器标定)结果的关键代码:
import tempfile
from pathlib import Path

from azureml.core import Dataset
from azureml.core.experiment import Experiment
from azureml.core.script_run_config import ScriptRunConfig
from azureml.core.workspace import Workspace
from azureml.train.automl.run import AutoMLRun
# --- Script configuration -------------------------------------------------
# Placeholder identifiers; replace them with the values of your own workspace.

# Id of the parent AutoML run whose best child run is used for scoring.
automl_run_id = "automl_run_id"
# Name of the registered dataset that will be scored.
dataset_name = "dataset_validation"
# Name of the experiment that owns the AutoML run and receives the scoring job.
experiment_name = "automl_experiment_name"
# Local CSV path; not referenced in the visible part of this script.
csv_out = "data/valid.csv"
# Name of the compute target the scoring job is submitted to.
computer_target = "gpu_cluster"

# Values forwarded as-is to the scoring script through its argument list.
validate_score = True
batch_size = 16
resize_size = 640
crop_size = 480
# Connect to the workspace and open the experiment named in the configuration.
workspace = Workspace.from_config()
experiment = Experiment(workspace, experiment_name)

# Resolve the compute target by name. Only an existing target of type
# "AmlCompute" is accepted; anything else aborts the script, because this
# script never creates a cluster itself.
computes = workspace.compute_targets
if computer_target in computes and computes[computer_target].type == "AmlCompute":
    print(f"Found existing compute target {computer_target}.")
    compute_target = computes[computer_target]
else:
    raise ValueError(
        f"AML cluster:{computer_target} doesn't exist. Create cluster first!"
    )
# Use the best child of the configured AutoML parent run for inference.
parent_run = AutoMLRun(experiment, automl_run_id)
target_run = parent_run.get_best_child()
print(f"AutoML Run {target_run.id} will be used for inference")

# Registered dataset that the scoring job will run over.
dataset = Dataset.get_by_name(workspace, name=dataset_name)

# Values handed to the scoring script, in this order: run id, experiment
# name, dataset id, then the scoring options from the configuration.
# NOTE(review): these values are passed without option names, while the
# output file below is passed as "--output_file <path>". Confirm against the
# downloaded score_script.py whether it expects named flags for them too.
arguments = [
    target_run.id,
    experiment_name,
    dataset.id,
    validate_score,
    batch_size,
    resize_size,
    crop_size,
]

# Path of the prediction file the scoring script is asked to write.
output_pred_file = "outputs/pred.jsonl"
scoring_args = arguments + ["--output_file", output_pred_file]
# Stage the scoring entry script in a throw-away directory, configure the
# remote job around it and submit it to the experiment.
with tempfile.TemporaryDirectory() as tmpdir:
    entry_script_name = "score_script.py"
    # Fetch the scoring script stored with the selected run's artifacts.
    target_run.download_file(
        "train_artifacts/" + entry_script_name,
        Path(tmpdir, entry_script_name),
    )
    script_run_config = ScriptRunConfig(
        source_directory=tmpdir,
        script=entry_script_name,
        compute_target=compute_target,
        # Reuse the environment of the selected run for the scoring job.
        environment=target_run.get_environment(),
        arguments=scoring_args,
    )
    # Submit inside the `with` block: the source directory must still exist
    # when the run is created, and it is deleted as soon as the block exits.
    scoring_run = experiment.submit(script_run_config)

print("Job is submitted. This script will wait until the remote job is finished.")
# Block until the remote job (including its post-processing) has finished.
scoring_run.wait_for_completion(show_output=True, wait_post_processing=True)
# Look up the dataset named "output_" + <scoring run id> in the workspace and
# load it into a pandas DataFrame for further processing.
# NOTE(review): presumably the scoring job registers its predictions under
# this name; confirm against the scoring script.
output_dataset_prefix = "output_"
output_dataset_name = output_dataset_prefix + scoring_run.id
output_dataset = Dataset.get_by_name(
    workspace,
    name=output_dataset_name,
)
output_dataframe = output_dataset.to_pandas_dataframe()