AutoKeras Code Examples

Original tutorial: Overview - AutoKeras

AutoKeras can handle the following tasks:

Image classification (see the sketch after this list)

Image regression

Text classification

Text regression

Structured data classification

Structured data regression
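
The image tasks are not demonstrated in the rest of this post; here is a minimal sketch following the same API pattern as the official AutoKeras image-classification example, using the Keras MNIST dataset as a stand-in:

# Minimal image-classification sketch (quick demo: 1 trial, 1 epoch).
import autokeras as ak
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
clf = ak.ImageClassifier(overwrite=True, max_trials=1)
clf.fit(x_train, y_train, epochs=1)
print(clf.evaluate(x_test, y_test))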

Text Classification

import os

import numpy as np
import tensorflow as tf
from sklearn.datasets import load_files

import autokeras as ak

Getting-started demo: IMDB sentiment classification

# Load the dataset
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

# set path to dataset
IMDB_DATADIR = os.path.join(os.path.dirname(dataset), "aclImdb")

classes = ["pos", "neg"]
train_data = load_files(
    os.path.join(IMDB_DATADIR, "train"), shuffle=True, categories=classes
)
test_data = load_files(
    os.path.join(IMDB_DATADIR, "test"), shuffle=False, categories=classes
)

x_train = np.array(train_data.data)
y_train = np.array(train_data.target)
x_test = np.array(test_data.data)
y_test = np.array(test_data.target)

print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000,)
print(x_train[0][:50])  # this film was just brilliant casting

# Train
# Initialize the text classifier.
clf = ak.TextClassifier(
    overwrite=True, max_trials=1
)  # It only tries 1 model as a quick demo.
# Feed the text classifier with training data.
clf.fit(x_train, y_train, epochs=2)

# Predict
# Predict with the best model.
predicted_y = clf.predict(x_test)

# Evaluate
# Evaluate the best model with testing data.
print(clf.evaluate(x_test, y_test))

# Adjust the validation split
clf.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
)

# Use a custom validation set
split = 5000
x_val = x_train[split:]
y_val = y_train[split:]
x_train = x_train[:split]
y_train = y_train[:split]
clf.fit(
    x_train,
    y_train,
    epochs=2,
    # Use your own validation set.
    validation_data=(x_val, y_val),
)

# Sentence encoding options (block_type: 'sequence' or 'ngram')
input_node = ak.TextInput()
output_node = ak.TextBlock(block_type="ngram")(input_node)
output_node = ak.ClassificationHead()(output_node)
clf = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=1
)
clf.fit(x_train, y_train, epochs=2)
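
With block_type="sequence" the text is tokenized into integer sequences and passed through an embedding, while "ngram" vectorizes the text as bags of n-grams; if block_type is left unspecified, the tuner picks one automatically.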

# Customization
input_node = ak.TextInput()
output_node = ak.TextToIntSequence()(input_node)
output_node = ak.Embedding()(output_node)
# Use separable Conv layers in Keras.
output_node = ak.ConvBlock(separable=True)(output_node)
output_node = ak.ClassificationHead()(output_node)
clf = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=1
)
clf.fit(x_train, y_train, epochs=2)

Structured Data Classification

# Titanic example
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

# Build the model
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    overwrite=True, max_trials=3
)  # It tries 3 different models.

# Train
# Feed the structured data classifier with training data.
clf.fit(
    # The path to the train.csv file.
    train_file_path,
    # The name of the label column.
    "survived",
    epochs=10,
)

# Predict
# Predict with the best model.
predicted_y = clf.predict(test_file_path)

# Evaluate
# Evaluate the best model with testing data.
print(clf.evaluate(test_file_path, "survived"))

# Convert numpy.ndarray / pandas data to tf.data.Dataset
# (assumes x_train/y_train and x_test/y_test were loaded from the CSVs,
# e.g. with pandas; np.unicode was removed from NumPy, so use str instead)
train_set = tf.data.Dataset.from_tensor_slices((x_train.astype(str), y_train))
test_set = tf.data.Dataset.from_tensor_slices(
    (x_test.to_numpy().astype(str), y_test)
)
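
The resulting datasets can then be passed to fit/predict/evaluate just like the arrays above:

clf.fit(train_set, epochs=10)
predicted_y = clf.predict(test_set)
print(clf.evaluate(test_set))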

# Specify column names and types
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    column_names=[
        "sex",
        "age",
        "n_siblings_spouses",
        "parch",
        "fare",
        "class",
        "deck",
        "embark_town",
        "alone",
    ],
    column_types={"sex": "categorical", "fare": "numerical"},
    max_trials=10,  # It tries 10 different models.
    overwrite=True,
)

# Validation split
clf.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
    epochs=10,
)

# Custom validation set
split = 500
x_val = x_train[split:]
y_val = y_train[split:]
x_train = x_train[:split]
y_train = y_train[:split]
clf.fit(
    x_train,
    y_train,
    # Use your own validation set.
    validation_data=(x_val, y_val),
    epochs=10,
)

# Customization
input_node = ak.StructuredDataInput()
output_node = ak.CategoricalToNumerical()(input_node)
output_node = ak.DenseBlock()(output_node)
output_node = ak.ClassificationHead()(output_node)
clf = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=1
)
clf.fit(x_train, y_train, epochs=1)
clf.predict(x_train)

# Inspect the exported model
model = clf.export_model()
model.summary()
print(x_train.dtype)
# numpy arrays with object (mixed-type) dtype are not supported;
# convert to unicode strings first (np.unicode was removed from NumPy).
model.predict(x_train.astype(str))
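
Since export_model() returns a plain Keras model, it can also be saved to disk and reloaded, with the AutoKeras custom objects registered:

from tensorflow.keras.models import load_model

model.save("model_autokeras", save_format="tf")
loaded_model = load_model("model_autokeras", custom_objects=ak.CUSTOM_OBJECTS)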


Structured Data Regression

# California housing dataset
#https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset

# Load the dataset (these imports are missing from the header block above)
from sklearn.datasets import fetch_california_housing
import pandas as pd

house_dataset = fetch_california_housing()
df = pd.DataFrame(
    np.concatenate(
        (house_dataset.data, house_dataset.target.reshape(-1, 1)), axis=1
    ),
    columns=house_dataset.feature_names + ["Price"],
)
train_size = int(df.shape[0] * 0.9)
df[:train_size].to_csv("train.csv", index=False)
df[train_size:].to_csv("eval.csv", index=False)
train_file_path = "train.csv"
test_file_path = "eval.csv"

# Build the model
# Initialize the structured data regressor.
reg = ak.StructuredDataRegressor(
    overwrite=True, max_trials=3
)  # It tries 3 different models.

# Train
# Feed the structured data regressor with training data.
reg.fit(
    # The path to the train.csv file.
    train_file_path,
    # The name of the label column.
    "Price",
    epochs=10,
)

# Predict
# Predict with the best model.
predicted_y = reg.predict(test_file_path)

# Evaluate
# Evaluate the best model with testing data.
print(reg.evaluate(test_file_path, "Price"))

# The remaining features (validation split, custom validation data, tf.data input, customization) work the same way as in the structured data classification section above; one example follows.
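
As one example, the customization pattern carries over directly; a minimal sketch, assuming x_train and y_train hold the housing features and prices, with RegressionHead in place of ClassificationHead:

input_node = ak.StructuredDataInput()
output_node = ak.CategoricalToNumerical()(input_node)
output_node = ak.DenseBlock()(output_node)
output_node = ak.RegressionHead()(output_node)
reg = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=1
)
reg.fit(x_train, y_train, epochs=1)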

Time Series Forecasting

# UCI Air Quality dataset
dataset = tf.keras.utils.get_file(
    fname="AirQualityUCI.csv",
    origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00360/"
    "AirQualityUCI.zip",
    extract=True,
)


# Preprocessing
dataset = pd.read_csv(dataset, sep=";")
dataset = dataset[dataset.columns[:-2]]
dataset = dataset.dropna()
dataset = dataset.replace(",", ".", regex=True)
val_split = int(len(dataset) * 0.7)
data_train = dataset[:val_split]
validation_data = dataset[val_split:]
feature_columns = [
    "CO(GT)",
    "PT08.S1(CO)",
    "NMHC(GT)",
    "C6H6(GT)",
    "PT08.S2(NMHC)",
    "NOx(GT)",
    "PT08.S3(NOx)",
    "NO2(GT)",
    "PT08.S4(NO2)",
    "PT08.S5(O3)",
    "T",
    "RH",
]
data_x = data_train[feature_columns].astype("float64")
data_x_val = validation_data[feature_columns].astype("float64")
# Test data: the train data plus the unseen data from subsequent time steps.
data_x_test = dataset[feature_columns].astype("float64")
data_y = data_train["AH"].astype("float64")
data_y_val = validation_data["AH"].astype("float64")
print(data_x.shape)  # (6549, 12)
print(data_y.shape)  # (6549,)

# Forecasting parameters
predict_from = 1
predict_until = 10
lookback = 3
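
Here lookback is the size of the history window used for each prediction, while predict_from and predict_until set the forecast horizon relative to the end of the training data: with the values above, the model forecasts steps 1 through 10 after the last training sample.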

# Build the model
clf = ak.TimeseriesForecaster(
    lookback=lookback,
    predict_from=predict_from,
    predict_until=predict_until,
    max_trials=1,
    objective="val_loss",
)

# Train
# Train the TimeSeriesForecaster with train data
clf.fit(
    x=data_x,
    y=data_y,
    validation_data=(data_x_val, data_y_val),
    batch_size=32,
    epochs=10,
)

# Predict
# Predict with the best model (data_x_test includes the original training data).
predictions = clf.predict(data_x_test)
print(predictions.shape)

# Evaluate
# Evaluate the best model with testing data.
print(clf.evaluate(data_x_val, data_y_val))

Visualization

TRAINS automatically logs comprehensive information about AutoKeras tasks: source code and version control, execution environment, hyperparameters, and more.

from tensorflow import keras

tensorboard_callback_train = keras.callbacks.TensorBoard(log_dir='log')
tensorboard_callback_test = keras.callbacks.TensorBoard(log_dir='log')
clf.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback_train])
clf.fit(x_test, y_test, epochs=2, callbacks=[tensorboard_callback_test])
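
The recorded logs can then be viewed by launching TensorBoard, e.g. tensorboard --logdir log.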

Project Tracking

Setup references:

TRAINS - AutoKeras

First Steps | ClearML

from trains import Task
from tensorflow import keras
import os
import numpy as np
import tensorflow as tf
from sklearn.datasets import load_files
import autokeras as ak


task = Task.init(project_name="autokeras", task_name="autokeras titanic test")

# Titanic example
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

# Build the model
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    overwrite=True, max_trials=3
)  # It tries 3 different models.
tensorboard_callback_train = keras.callbacks.TensorBoard(log_dir='log')
# Train
# Feed the structured data classifier with training data.
clf.fit(
    # The path to the train.csv file.
    train_file_path,
    # The name of the label column.
    "survived",
    epochs=10,
    callbacks=[tensorboard_callback_train],
)

# Predict
# Predict with the best model.
predicted_y = clf.predict(test_file_path)

# Evaluate
# Evaluate the best model with testing data.
print(clf.evaluate(test_file_path, "survived"))
