1 印第安人糖尿病
from keras.models import Sequential
from keras.layers import Dense
import numpy
# Fix the NumPy RNG seed for reproducible weight init / shuffling.
numpy.random.seed(7)
# Pima Indians diabetes dataset: 8 numeric features, binary outcome in column 8.
dataset = numpy.loadtxt("data/diabetes.csv", delimiter=",", skiprows=1)
print(dataset.shape)
# Preview the first 5 rows. (The original `dataset.take(5)` flattened the
# array and printed the first 5 scalar values, not the first 5 samples.)
print(dataset[:5])
# Split into features X (columns 0..7) and label Y (column 8).
X = dataset[:, 0:8]
Y = dataset[:, 8]
# create model: two ReLU hidden layers, sigmoid output for binary classification
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X, Y, epochs=150, batch_size=10)
# evaluate the model (on the training data — no held-out split in this demo)
scores = model.evaluate(X, Y)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
2 keras.utils.to_categorical
keras.utils.to_categorical(y, num_classes=None, dtype='float32')
将类向量(整数)转换为二进制类矩阵,也即独热编码。
参数
- y:要转换成矩阵的类向量(从0到num_classes的整数)。
- num_classes:总类别数。
- dtype:字符串,输入所期望的数据类型(float32,float64,int32…)
# 考虑一组 3 个类 {0,1,2} 中的 5 个标签数组:
> labels
array([0, 2, 1, 2, 0])
# `to_categorical` 将其转换为具有尽可能多表示类别数的列的矩阵。
# 行数保持不变。
> to_categorical(labels)
array([[ 1., 0., 0.],
[ 0., 0., 1.],
[ 0., 1., 0.],
[ 0., 0., 1.],
[ 1., 0., 0.]], dtype=float32)
自己完成的类似功能函数convert_one_hot
def convert_one_hot(labels, num_classes):
    """One-hot encode a 1-D vector of integer class labels.

    Equivalent in spirit to ``keras.utils.to_categorical`` for a flat
    label vector: returns an ``(len(labels), num_classes)`` float matrix
    with a single 1 per row at the label's column.
    """
    # Number of samples (rows of the output matrix).
    n_samples = len(labels)
    # Start from an all-zero matrix and light up one cell per row:
    # row i, column labels[i].  This fancy-index form is equivalent to the
    # flat-offset trick (arange(n) * num_classes + labels) on .flat.
    one_hot = np.zeros((n_samples, num_classes))
    one_hot[np.arange(n_samples), labels] = 1
    return one_hot
to_categorical源码如下:
def to_categorical(y, num_classes=None, dtype='float32'):
    """Converts a class vector (integers) to binary class matrix.

    E.g. for use with categorical_crossentropy.

    # Arguments
        y: class vector to be converted into a matrix
            (integers from 0 to num_classes).
        num_classes: total number of classes.
        dtype: The data type expected by the input, as a string
            (`float32`, `float64`, `int32`...)

    # Returns
        A binary matrix representation of the input. The classes axis
        is placed last.

    # Example
    ```python
    # Consider an array of 5 labels out of a set of 3 classes {0, 1, 2}:
    > labels
    array([0, 2, 1, 2, 0])
    # `to_categorical` converts this into a matrix with as many
    # columns as there are classes. The number of rows
    # stays the same.
    > to_categorical(labels)
    array([[ 1.,  0.,  0.],
           [ 0.,  0.,  1.],
           [ 0.,  1.,  0.],
           [ 0.,  0.,  1.],
           [ 1.,  0.,  0.]], dtype=float32)
    ```
    """
    # Coerce the input to an integer array and remember its shape.
    y = np.array(y, dtype='int')
    out_shape = y.shape
    # A trailing singleton axis (e.g. a column vector of labels) is squeezed
    # so the output gains exactly one new classes axis at the end.
    if out_shape and out_shape[-1] == 1 and len(out_shape) > 1:
        out_shape = tuple(out_shape[:-1])
    # Work on a flat view of the labels.
    y = y.ravel()
    # Infer the class count from the data when the caller did not supply one.
    if not num_classes:
        num_classes = np.max(y) + 1
    n_rows = y.shape[0]
    # All-zero (n_rows, num_classes) matrix, then set one 1 per row:
    # arange(n_rows) picks the row, y picks the column.
    categorical = np.zeros((n_rows, num_classes), dtype=dtype)
    categorical[np.arange(n_rows), y] = 1
    # Restore the original leading shape, with classes as the last axis.
    return np.reshape(categorical, out_shape + (num_classes,))
3 乳腺癌分类
数据总共33列,最后一列未命名且全为空值,diagnosis是标签,id对分类无影响。
- dataset.columns.values:
- dataset.describe():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
id 569 non-null int64
diagnosis 569 non-null object
radius_mean 569 non-null float64
texture_mean 569 non-null float64
perimeter_mean 569 non-null float64
area_mean 569 non-null float64
smoothness_mean 569 non-null float64
compactness_mean 569 non-null float64
concavity_mean 569 non-null float64
concave points_mean 569 non-null float64
symmetry_mean 569 non-null float64
fractal_dimension_mean 569 non-null float64
radius_se 569 non-null float64
texture_se 569 non-null float64
perimeter_se 569 non-null float64
area_se 569 non-null float64
smoothness_se 569 non-null float64
compactness_se 569 non-null float64
concavity_se 569 non-null float64
concave points_se 569 non-null float64
symmetry_se 569 non-null float64
fractal_dimension_se 569 non-null float64
radius_worst 569 non-null float64
texture_worst 569 non-null float64
perimeter_worst 569 non-null float64
area_worst 569 non-null float64
smoothness_worst 569 non-null float64
compactness_worst 569 non-null float64
concavity_worst 569 non-null float64
concave points_worst 569 non-null float64
symmetry_worst 569 non-null float64
fractal_dimension_worst 569 non-null float64
Unnamed: 32 0 non-null float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
python代码
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential, load_model
from keras.layers import Dense
from sklearn.metrics import confusion_matrix
# Load the breast-cancer CSV (569 rows, 33 columns per the info() dump above).
dataset = pd.read_csv('./data.csv')
# get dataset details
print(dataset.head(5))
print(dataset.columns.values)
print(dataset.info())
print(dataset.describe())
# Split features and label: columns 2..31 are the 30 numeric features
# (skips `id` at 0 and `diagnosis` at 1, and excludes the all-NaN
# `Unnamed: 32` column at index 32).
X = dataset.iloc[:, 2:32]
print(X.info())
print(type(X))
# diagnosis label — M: malignant, B: benign
y = dataset.iloc[:, 1]
print(y)
# Encode the diagnosis strings as 0/1 (LabelEncoder assigns codes in sorted
# order, so B -> 0, M -> 1).
print(y[100:110])
encoder = LabelEncoder()
y = encoder.fit_transform(y)
print([y[100:110]])
# Train/test split: 80/20 with a fixed seed for reproducibility.
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.2, random_state=0)
# Feature scaling: fit the scaler on the training set only, then transform both.
scalar = StandardScaler()
XTrain = scalar.fit_transform(XTrain)
XTest = scalar.transform(XTest)
# Search for the best hyper-parameter values (grid search below).
def classifier(optimizer):
    """Build and compile the binary-classification network for KerasClassifier.

    Architecture: 30 inputs -> Dense(16) -> Dense(8) -> Dense(6) -> Dense(1,
    sigmoid), all hidden layers ReLU with uniform weight initialization.

    # Arguments
        optimizer: name (or instance) of the Keras optimizer to compile with.

    # Returns
        A compiled `Sequential` model.
    """
    net = Sequential()
    # First hidden layer also declares the input dimensionality (30 features).
    net.add(Dense(units=16, kernel_initializer='uniform', activation='relu', input_dim=30))
    # Remaining hidden layers share initializer and activation.
    for n_units in (8, 6):
        net.add(Dense(units=n_units, kernel_initializer='uniform', activation='relu'))
    # Single sigmoid output for the binary diagnosis label.
    net.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    net.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return net
# Wrap the model builder so it can be used with scikit-learn's grid search.
model = KerasClassifier(build_fn=classifier)
params = {'batch_size': [1, 5], 'epochs': [100, 120], 'optimizer': ['adam', 'rmsprop']}
# Grid search — estimator: the classifier; param_grid: parameter values to try;
# scoring: model scoring metric; cv: number of cross-validation folds.
gridSearch = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=10)
gridSearch = gridSearch.fit(XTrain, yTrain)
score = gridSearch.best_score_
bestParams = gridSearch.best_params_
print(score)
print(bestParams)
# modeling
# NOTE(review): the final model hard-codes adam / batch_size=1 / epochs=120
# rather than using `bestParams` found above — confirm this is intentional.
model = Sequential()
model.add(Dense(units=16, kernel_initializer='uniform', activation='relu', input_dim=30))
model.add(Dense(units=8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(XTrain, yTrain, batch_size=1, epochs=120)
# Save the trained model to disk (HDF5).
model.save('./cancer_model.h5')
yPred = model.predict(XTest)
# predict() yields sigmoid probabilities; threshold at 0.5 for class labels.
yPred = [1 if y > 0.5 else 0 for y in yPred]
# Confusion matrix (rows: true class, columns: predicted class).
matrix = confusion_matrix(yTest, yPred)
print(matrix)
# Accuracy = (true negatives + true positives) / all samples.
accuracy = (matrix[0][0] + matrix[1][1]) / (matrix[0][0] + matrix[0][1] + matrix[1][0] + matrix[1][1])
print("Accuracy: " + str(accuracy * 100) + "%")
4 垃圾邮件分类
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.metrics import confusion_matrix
import pandas as pd
# get dataset
data = pd.read_csv('./spam_dataset.csv')
texts = []
classes = []
# Map string labels to integers: 'ham' -> 0, anything else (spam) -> 1.
for i, label in enumerate(data['Class']):
    texts.append(data['Text'][i])
    if label == 'ham':
        classes.append(0)
    else:
        classes.append(1)
texts = np.asarray(texts)
classes = np.asarray(classes)
print("number of texts :", len(texts))
print("number of labels: ", len(classes))
# number of words used as features
maxFeatures = 10000
# max document length; longer sequences are truncated, shorter ones padded
maxLen = 500
# we will use 80% of data as training and 20% as validation data
trainingData = int(len(texts) * .8)
# BUG FIX: cap the vocabulary at maxFeatures. The original `Tokenizer()` was
# unbounded, so texts_to_sequences could emit word indices >= maxFeatures,
# which overflows the Embedding layer (input_dim=maxFeatures) at train time.
tokenizer = Tokenizer(num_words=maxFeatures)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print("Found {0} unique words: ".format(len(word_index)))
data = pad_sequences(sequences, maxlen=maxLen)
print("data shape: ", data.shape)
np.random.seed(42)
# shuffle data (same permutation applied to inputs and labels)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = classes[indices]
X_train = data[:trainingData]
y_train = labels[:trainingData]
X_test = data[trainingData:]
y_test = labels[trainingData:]
# modeling: Embedding -> LSTM -> single sigmoid unit for binary spam/ham
model = Sequential()
model.add(Embedding(maxFeatures, 32))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
rnn = model.fit(X_train, y_train, epochs=10, batch_size=60, validation_split=0.2)
# predictions on the held-out 20%
pred = model.predict_classes(X_test)
acc = model.evaluate(X_test, y_test)
proba_rnn = model.predict_proba(X_test)
print("Test loss is {0:.2f} accuracy is {1:.2f} ".format(acc[0], acc[1]))
print(confusion_matrix(pred, y_test))
参考资料:
KerasDense层的参数设置