keras 简单demo

1 印第安人糖尿病


from keras.models import Sequential
from keras.layers import Dense
import numpy

numpy.random.seed(7)

# Pima Indians diabetes data: 8 numeric features, binary outcome in column 8.
dataset = numpy.loadtxt("data/diabetes.csv", delimiter=",", skiprows=1)

print(dataset.shape)
# BUG FIX: ndarray.take(5) flattens the array and returns the single element
# at flat index 5 -- to preview the first 5 rows, slice instead.
print(dataset[:5])
X = dataset[:,0:8]
Y = dataset[:,8]
# create model: fully-connected 8 -> 12 -> 8 -> 1 net with sigmoid output
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile model for binary classification
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X, Y, epochs=150, batch_size=10)
# evaluate the model (on the training data -- no held-out split in this demo)
scores = model.evaluate(X, Y)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

2 keras.utils.to_categorical

keras.utils.to_categorical(y, num_classes=None, dtype='float32')

将类向量(整数)转换为二进制类矩阵,也即独热编码。
参数

  • y:要转换成矩阵的类向量(从0到num_classes的整数)。
  • num_classes:总类别数。
  • dtype:字符串,输入所期望的数据类型(float32,float64,int32…)
# 考虑一组 3 个类 {0,1,2} 中的 5 个标签数组:
> labels
array([0, 2, 1, 2, 0])
# `to_categorical` 将其转换为具有尽可能多表示类别数的列的矩阵。
# 行数保持不变。
> to_categorical(labels)
array([[ 1.,  0.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.],
       [ 1.,  0.,  0.]], dtype=float32)

自己完成的类似功能函数convert_to_one_hot

def convert_one_hot(labels,num_classes):
    """Return a one-hot encoded matrix for the integer class vector.

    Hand-rolled equivalent of ``keras.utils.to_categorical`` for the
    1-D case: row i gets a 1 in column ``labels[i]`` and 0 elsewhere.
    """
    # One row per label, one column per class, initially all zeros.
    n_samples = len(labels)
    one_hot = np.zeros((n_samples, num_classes))
    # Fancy indexing: set the hot position of every row in one assignment.
    one_hot[np.arange(n_samples), labels] = 1
    return one_hot

to_categorical源码如下:

def to_categorical(y, num_classes=None, dtype='float32'):
    """Converts a class vector (integers) to binary class matrix.
    E.g. for use with categorical_crossentropy.
    # Arguments
        y: class vector to be converted into a matrix
            (integers from 0 to num_classes).
        num_classes: total number of classes.
        dtype: The data type expected by the input, as a string
            (`float32`, `float64`, `int32`...)
    # Returns
        A binary matrix representation of the input. The classes axis
        is placed last.
    # Example
    ```python
    # Consider an array of 5 labels out of a set of 3 classes {0, 1, 2}:
    > labels
    array([0, 2, 1, 2, 0])
    # `to_categorical` converts this into a matrix with as many
    # columns as there are classes. The number of rows
    # stays the same.
    > to_categorical(labels)
    array([[ 1.,  0.,  0.],
           [ 0.,  0.,  1.],
           [ 0.,  1.,  0.],
           [ 0.,  0.,  1.],
           [ 1.,  0.,  0.]], dtype=float32)
    ```
    """
    y = np.array(y, dtype='int')
    # Drop a trailing length-1 axis, so shape (n, 1) behaves like (n,).
    input_shape = y.shape
    if input_shape and len(input_shape) > 1 and input_shape[-1] == 1:
        input_shape = tuple(input_shape[:-1])
    # Work on the flattened label vector.
    flat = y.ravel()
    # Infer the class count from the data when the caller omitted it.
    if not num_classes:
        num_classes = np.max(flat) + 1
    n = flat.shape[0]
    # Zero matrix, then one fancy-indexed assignment lights up the hot
    # column of every row (rows from arange(n), columns from the labels).
    categorical = np.zeros((n, num_classes), dtype=dtype)
    categorical[np.arange(n), flat] = 1
    # Restore the original leading shape with the classes axis appended last.
    return np.reshape(categorical, input_shape + (num_classes,))

3 乳腺癌分类

数据总共33列,最后一列未命名且全为空值 ,diagnosis是标签,id对分类无影响。

  • dataset.columns.values:在这里插入图片描述
  • dataset.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-null float64
concave points_se          569 non-null float64
symmetry_se                569 non-null float64
fractal_dimension_se       569 non-null float64
radius_worst               569 non-null float64
texture_worst              569 non-null float64
perimeter_worst            569 non-null float64
area_worst                 569 non-null float64
smoothness_worst           569 non-null float64
compactness_worst          569 non-null float64
concavity_worst            569 non-null float64
concave points_worst       569 non-null float64
symmetry_worst             569 non-null float64
fractal_dimension_worst    569 non-null float64
Unnamed: 32                0 non-null float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB

python代码

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential, load_model
from keras.layers import Dense
from sklearn.metrics import confusion_matrix

dataset = pd.read_csv('./data.csv')

# get dataset details
print(dataset.head(5))
print(dataset.columns.values)
print(dataset.info())
print(dataset.describe())

# Split features and label: columns 2..31 are the 30 numeric features
# (column 0 is id, column 32 is the all-null "Unnamed: 32").
X = dataset.iloc[:, 2:32]
print(X.info())
print(type(X))
# diagnosis -- M: malignant, B: benign
y = dataset.iloc[:, 1]
print(y)

# Encode the diagnosis label to 0/1.
print(y[100:110])
encoder = LabelEncoder()
y = encoder.fit_transform(y)
# BUG FIX: print the slice itself, not a one-element list wrapping it
# (mirrors the pre-encoding print above).
print(y[100:110])

# Hold out 20% of the rows for testing.
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature scaling: fit on the training split only to avoid data leakage.
scaler = StandardScaler()
XTrain = scaler.fit_transform(XTrain)
XTest = scaler.transform(XTest)

# 搜索最优参数值
def classifier(optimizer):
    """Build a 4-layer binary classifier over the 30 scaled features.

    The optimizer is passed through to ``compile`` so GridSearchCV can
    tune it alongside batch size and epochs.
    """
    net = Sequential()
    # units: layer width; kernel_initializer: scheme for the initial weights
    net.add(Dense(units=16, kernel_initializer='uniform', activation='relu', input_dim=30))
    net.add(Dense(units=8, kernel_initializer='uniform', activation='relu'))
    net.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
    # Single sigmoid unit for the benign/malignant decision.
    net.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    net.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return net

# Wrap the Keras builder so scikit-learn's grid search can drive it.
model = KerasClassifier(build_fn=classifier)
params = {'batch_size': [1, 5], 'epochs': [100, 120], 'optimizer': ['adam', 'rmsprop']}
# Grid search -- estimator: the classifier, param_grid: candidate values,
# scoring: selection metric, cv: number of cross-validation folds.
gridSearch = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=10)
gridSearch = gridSearch.fit(XTrain, yTrain)
score = gridSearch.best_score_
bestParams = gridSearch.best_params_
print(score)
print(bestParams)

# Rebuild and train the final network.
model = Sequential()
model.add(Dense(units=16, kernel_initializer='uniform', activation='relu', input_dim=30))
model.add(Dense(units=8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(XTrain, yTrain, batch_size=1, epochs=120)
# Persist the trained model to disk.
model.save('./cancer_model.h5')
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
yPred = model.predict(XTest)
yPred = [1 if p > 0.5 else 0 for p in yPred]
# Confusion matrix of true vs predicted labels.
matrix = confusion_matrix(yTest, yPred)
print(matrix)
# Accuracy = correctly classified / total classified.
correct = matrix[0][0] + matrix[1][1]
total = matrix[0][0] + matrix[0][1] + matrix[1][0] + matrix[1][1]
accuracy = correct / total
print("Accuracy: " + str(accuracy * 100) + "%")

4 垃圾邮件分类

from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.metrics import confusion_matrix
import pandas as pd

# get dataset
data = pd.read_csv('./spam_dataset.csv')
texts = []
classes = []
# Binary label: 'ham' -> 0, anything else (spam) -> 1.
for i, label in enumerate(data['Class']):
    texts.append(data['Text'][i])
    classes.append(0 if label == 'ham' else 1)

texts = np.asarray(texts)
classes = np.asarray(classes)

print("number of texts :", len(texts))
print("number of labels: ", len(classes))

# number of words used as features
maxFeatures = 10000
# max document length
maxLen = 500

# we will use 80% of data as training and 20% as validation data
trainingData = int(len(texts) * .8)

# BUG FIX: cap the vocabulary at maxFeatures; the original Tokenizer() kept
# every word, so indices could exceed the Embedding layer's input_dim and
# crash (or index garbage) during training.
tokenizer = Tokenizer(num_words=maxFeatures)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print("Found {0} unique words: ".format(len(word_index)))
# Pad/truncate every document to exactly maxLen tokens.
data = pad_sequences(sequences, maxlen=maxLen)
print("data shape: ", data.shape)

np.random.seed(42)
# Shuffle so the train/test split is not ordered by class or time.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = classes[indices]

X_train = data[:trainingData]
y_train = labels[:trainingData]
X_test = data[trainingData:]
y_test = labels[trainingData:]


# modeling: embedding -> LSTM -> single sigmoid unit for spam/ham
model = Sequential()
model.add(Embedding(maxFeatures, 32))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
rnn = model.fit(X_train, y_train, epochs=10, batch_size=60, validation_split=0.2)

# predictions
pred = model.predict_classes(X_test)
acc = model.evaluate(X_test, y_test)
proba_rnn = model.predict_proba(X_test)
print("Test loss is {0:.2f} accuracy is {1:.2f}  ".format(acc[0],acc[1]))
# BUG FIX: sklearn's signature is confusion_matrix(y_true, y_pred);
# the original passed (pred, y_test), transposing the matrix.
print(confusion_matrix(y_test, pred))

参考资料:
Keras Dense层的参数设置

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值