在开源代码的基础上增加了 L1 正则化和终止条件,测试效果并不好,暂时供大家参考指正。
训练数据可从这里获取:http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/
#!/usr/bin/python
# Logistic Regression on Diabetes Dataset
from random import seed
from random import randrange
from csv import reader
from math import exp
import math
import sys
# Load a CSV file
def load_csv(filename):
    """Read *filename* as CSV and return its non-empty rows.

    Every row is the list of strings produced by ``csv.reader``; rows for
    which the reader yields an empty list (blank lines) are skipped.
    """
    with open(filename, 'r') as handle:
        return [record for record in reader(handle) if record]
# Convert string column to float
def str_column_to_float(dataset, column):
    """Convert entry *column* of every row in *dataset* to ``float``.

    Surrounding whitespace is stripped before conversion. Rows are modified
    in place and nothing is returned.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())
# Find the min and max values for each column
def dataset_minmax(dataset):
    """Return a ``[min, max]`` pair for every column of *dataset*.

    The number of columns is taken from the first row; each column is read
    by index from every row.
    """
    width = len(dataset[0])
    columns = ([record[index] for record in dataset] for index in range(width))
    return [[min(values), max(values)] for values in columns]
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Rescale every value of *dataset* in place to the range 0-1.

    ``minmax[i]`` holds the ``[min, max]`` pair of column ``i``; each value is
    replaced by ``(value - min) / (max - min)``. Nothing is returned.

    A column whose minimum equals its maximum has no spread, so its values
    are set to 0.0 instead of dividing by zero.
    """
    for row in dataset:
        for i, value in enumerate(row):
            low = minmax[i][0]
            span = minmax[i][1] - low
            row[i] = (value - low) / span if span else 0.0
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition *dataset* into *n_folds* folds of equal size.

    Rows are drawn without replacement using ``randrange``; the input list is
    not modified. Each fold holds ``len(dataset) // n_folds`` rows, so up to
    ``n_folds - 1`` rows are left out when the size is not evenly divisible.

    Returns a list of *n_folds* lists of rows.
    """
    remaining = list(dataset)
    # Floor division keeps the fold size an integer on Python 3 as well; a
    # fractional size made every fold over-draw by one row and eventually
    # call randrange(0) on the exhausted pool.
    fold_size = len(dataset) // n_folds
    folds = []
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            fold.append(remaining.pop(randrange(len(remaining))))
        folds.append(fold)
    return folds
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where *predicted* equals *actual*.

    Positions are compared by index over the length of *actual*. An empty
    *actual* yields 0.0 rather than a division by zero.
    """
    total = len(actual)
    if total == 0:
        return 0.0
    correct = sum(1 for i, label in enumerate(actual) if label == predicted[i])
    return correct / float(total) * 100.0
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score *algorithm* on *dataset* with *n_folds*-fold cross validation.

    For every fold, the rows of the other folds are concatenated into the
    training set (with a single fold, that fold is used for training too),
    and the fold's rows are copied with their last element replaced by
    ``None`` to form the test set. ``algorithm(train_set, test_set, *args)``
    must return one prediction per test row.

    Returns the list of per-fold accuracy percentages.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        training_folds = list(folds)
        if len(folds) != 1:
            training_folds.remove(held_out)
        train_set = [row for part in training_folds for row in part]

        # Hide the label (last element) from the algorithm.
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)

        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores
# Make a prediction with coefficients
def predict(row, coefficients):
    """Return the logistic (sigmoid) output for *row*.

    ``coefficients[0]`` is the intercept and ``coefficients[i + 1]`` the
    weight of ``row[i]``. The last element of *row* is not used as an input
    (callers store the label there).

    The sigmoid is evaluated in a form that never exponentiates a large
    positive number, so extreme activations return 0.0 or 1.0 instead of
    raising ``OverflowError``.
    """
    yhat = coefficients[0]
    for i, value in enumerate(row[:-1]):
        yhat += coefficients[i + 1] * value
    if yhat >= 0:
        return 1.0 / (1.0 + exp(-yhat))
    # For negative activations exp(yhat) underflows harmlessly towards 0.
    scaled = exp(yhat)
    return scaled / (1.0 + scaled)
def _l1_step(weight, lbd):
    """Return the L1 shrinkage for *weight*: ``lbd`` times the sign of *weight*."""
    if weight > 0:
        return lbd
    if weight < 0:
        return -lbd
    return 0


# Estimate logistic regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch, eta, lbd):
    """Fit logistic regression coefficients with L1-penalised SGD.

    Args:
        train: rows of feature values with the label as the last element.
        l_rate: learning rate applied to the data term of each update.
        n_epoch: maximum number of passes over *train*.
        eta: early-stop threshold; training ends as soon as the absolute
            prediction error of a processed row falls below it.
        lbd: L1 penalty strength.

    Returns:
        The coefficient list: intercept first, then one weight per feature.
    """
    coef = [0.0] * len(train[0])
    for _ in range(n_epoch):
        for row in train:
            error = row[-1] - predict(row, coef)
            # The log-likelihood gradient is error * x, so the data term is
            # added; the L1 term pulls each coefficient towards zero.
            # NOTE(review): as in the original, the penalty is not scaled by
            # l_rate and is also applied to the intercept - confirm intent.
            coef[0] = coef[0] + l_rate * error - _l1_step(coef[0], lbd)
            for i, value in enumerate(row[:-1]):
                coef[i + 1] = (coef[i + 1] + l_rate * error * value
                               - _l1_step(coef[i + 1], lbd))
            if math.fabs(error) < eta:
                print("end")
                return coef
    return coef
# Logistic Regression Algorithm With Stochastic Gradient Descent
def logistic_regression(train, test, l_rate, n_epoch, eta, lda):
    """Train on *train* and return a 0.0/1.0 class prediction per *test* row.

    The coefficients are fitted with ``coefficients_sgd`` and printed. Each
    test row's predicted probability is thresholded at 0.5 (0.5 itself maps
    to 1.0), which gives the same classes on Python 2 and Python 3 instead
    of depending on the version-specific behaviour of ``round``.
    """
    coef = coefficients_sgd(train, l_rate, n_epoch, eta, lda)
    print(coef)
    return [1.0 if predict(row, coef) >= 0.5 else 0.0 for row in test]
# Test the logistic regression algorithm on the diabetes dataset
seed(1)

# load and prepare data: every CSV field is converted from string to float
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
for column_index in range(len(dataset[0])):
    str_column_to_float(dataset, column_index)

# normalize all columns to the range 0-1
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

# evaluate algorithm with 3-fold cross validation
n_folds = 3
l_rate = 0.1
n_epoch = 10000
eta = 0.0000001
lda = 0.1
scores = evaluate_algorithm(
    dataset, logistic_regression, n_folds, l_rate, n_epoch, eta, lda)
print('Scores: %s' % scores)
mean_accuracy = sum(scores) / float(len(scores))
print('Mean Accuracy: %.3f%%' % mean_accuracy)