损失函数:http://www.csie.ntu.edu.tw/~cjlin/liblinear/ http://www.ics.uci.edu/~dramanan/teaching/ics273a_winter08/lectures/lecture14.pdf
线性model 是 平方损失,绝对值损失
分类问题有hinge loss (svm),logistic loss(logistic regression)
logistic regression :
预测函数(sigmoid): h(x) = 1 / (1 + exp(-wTx))
损失函数(logistic loss): log(1 + exp(-y*wTx)),其中 y 属于 {1, -1}
svm分类: max(0,1-y((w*x)+b)) +a||w||**2, 间隔最大
贝叶斯:概率来的,分为feature是离散值还是连续值(转化为高斯分布来计算概率)
要点是首先,计算每种分类(yj)下的每个feature的mean及stdev值,然后P(yj|x)∝p(yj)*连乘p(xi|yj),其中p(yj)是该类样本数目占总数目的比例。如下是python程序,
# Example of Naive Bayes implemented from Scratch in Python
import
csv
import
random
import
math
def loadCsv(filename):
    """Load a numeric CSV file into a list of rows of floats.

    Args:
        filename: Path of the CSV file to read. Every field of every
            non-blank row must be parseable by ``float``.

    Returns:
        A list with one list of floats per non-blank CSV row.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    # csv.reader on Python 3 needs a text-mode file opened with newline="";
    # the former binary mode ("rb") fails there. The ``with`` block also
    # closes the handle, which the previous version never did.
    with open(filename, newline="") as csvFile:
        # Blank lines come back from csv.reader as empty rows; skip them so
        # later code can safely index row[-1].
        return [[float(x) for x in row] for row in csv.reader(csvFile) if row]
def splitDataset(dataset, splitRatio):
    """Randomly split ``dataset`` into a training part and a remainder.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        splitRatio: Fraction of the rows to move into the training part;
            the training size is ``int(len(dataset) * splitRatio)``.

    Returns:
        A two-element list ``[trainSet, remainingRows]``.
    """
    targetSize = int(len(dataset) * splitRatio)
    remaining = list(dataset)
    picked = []
    # Draw rows one at a time without replacement.
    while len(picked) < targetSize:
        picked.append(remaining.pop(random.randrange(len(remaining))))
    return [picked, remaining]
def separateByClass(dataset):
    """Group rows by the value stored in their last element.

    Args:
        dataset: Sequence of rows; the last element of each row is used
            as the grouping key.

    Returns:
        A dict mapping each distinct last-element value to the list of
        rows carrying it, in their original order.
    """
    byLabel = {}
    for row in dataset:
        byLabel.setdefault(row[-1], []).append(row)
    return byLabel
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    return total / float(len(numbers))
def stdev(numbers):
    """Return the sample standard deviation (N - 1 denominator).

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two elements.
    """
    count = len(numbers)
    avg = sum(numbers) / float(count)
    squaredDeviations = [pow(x - avg, 2) for x in numbers]
    # Dividing by (count - 1) gives the unbiased sample variance.
    return math.sqrt(sum(squaredDeviations) / float(count - 1))
def summarize(dataset):
    """Compute (mean, stdev) for every column except the last one.

    Args:
        dataset: Sequence of equally long rows.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per column, with the entry
        for the last column removed.
    """
    summaries = []
    for attribute in zip(*dataset):
        summaries.append((mean(attribute), stdev(attribute)))
    # Drop the statistics of the final column (the one separateByClass
    # uses as the class key).
    summaries.pop()
    return summaries
def summarizeByClass(dataset):
    """Build per-class attribute statistics for ``dataset``.

    Args:
        dataset: Sequence of rows whose last element is the class value.

    Returns:
        A dict mapping each class value to the list of ``(mean, stdev)``
        tuples returned by ``summarize`` for the rows of that class.
    """
    separated = separateByClass(dataset)
    # dict.items() works on both Python 2 and 3; iteritems() is Python 2 only.
    return {
        classValue: summarize(instances)
        for classValue, instances in separated.items()
    }
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the Gaussian.
        stdev: Standard deviation of the Gaussian.

    Returns:
        The density value as a float.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    squaredDistance = math.pow(x - mean, 2)
    doubledVariance = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squaredDistance / doubledVariance))
    normalizer = math.sqrt(2 * math.pi) * stdev
    return (1 / normalizer) * exponent
def calculateClassProbabilities(summaries, inputVector, priors=None):
    """Score ``inputVector`` against every class.

    For each class the score is the product of the Gaussian densities of
    the input's attributes, optionally multiplied by the class prior.

    Args:
        summaries: Dict mapping class value to a list of ``(mean, stdev)``
            tuples, one per attribute.
        inputVector: Sequence holding at least as many attribute values as
            there are tuples per class.
        priors: Optional dict mapping class value to its prior probability
            P(y). When omitted, every class starts from 1, i.e. the prior
            is left out of the product as in the original implementation.

    Returns:
        A dict mapping class value to its (unnormalized) score.

    Raises:
        KeyError: If ``priors`` is given but lacks a class in ``summaries``.
    """
    probabilities = {}
    # dict.items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for classValue, classSummaries in summaries.items():
        score = 1 if priors is None else priors[classValue]
        for i, (mean, stdev) in enumerate(classSummaries):
            score *= calculateProbability(inputVector[i], mean, stdev)
        probabilities[classValue] = score
    return probabilities
def predict(summaries, inputVector):
    """Return the class value with the highest score for ``inputVector``.

    Args:
        summaries: Per-class attribute statistics as accepted by
            ``calculateClassProbabilities``.
        inputVector: Attribute values of the instance to classify.

    Returns:
        The best-scoring class value, or ``None`` when ``summaries`` is
        empty. On ties the class encountered first wins.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    # dict.items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestLabel, bestProb = classValue, probability
    return bestLabel
def getPredictions(summaries, testSet):
    """Predict a class value for every row of ``testSet``.

    Returns:
        A list with the result of ``predict`` for each row, in order.
    """
    return [predict(summaries, row) for row in testSet]
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    Args:
        testSet: Sequence of rows; the last element is the true class.
        predictions: Sequence of predicted classes, aligned by index.

    Returns:
        Accuracy as a float between 0.0 and 100.0.

    Raises:
        ZeroDivisionError: If ``testSet`` is empty.
    """
    hits = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predictions[i]
    )
    return (hits / float(len(testSet))) * 100.0
def main():
    """Train and evaluate Gaussian Naive Bayes on the Pima diabetes CSV.

    Loads the data set, splits it 67/33 into training and test rows,
    summarizes the training rows per class, predicts the test rows and
    prints the split sizes and the resulting accuracy.
    """
    filename = 'pima-indians-diabetes.data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    # Format the string before printing: print(...).format(...) only worked
    # as a Python 2 print statement and fails on Python 3, where print()
    # returns None.
    print('Split {0} rows into train={1} and test={2} rows'.format(
        len(dataset), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
这个程序缺失了P(yi),
即完整的公式应为 p(yi|x) ∝ p(x1|yi)*p(x2|yi)*...*p(xn|yi)*p(yi)(省略了分母的p(x))
决策树 :1)信息增益(熵增益)型:$H(D) = -\sum_k p_k \log p_k$,k 为类别;$H(D|A) = -\sum_i p(D_i) \sum_k p(D_{ik}) \log p(D_{ik})$,其中 $p(D_i)=|D_i|/|D|$,$p(D_{ik})=|D_{ik}|/|D_i|$,$|D_i|$ 表示 feature A 取第 i 种情况的样本数目,$|D_{ik}|$ 表示其中属于类别 k 的样本数目;增益 $gain = H(D)-H(D|A)$。每次比较剩余 feature 的 gain,gain 大的优先选为节点(node)。这个树可能是多叉树,分叉数取决于 Di 的数目(i 的取值个数)
2)熵增益比:与第一种的差别是由 $gain_R = gain / H_A(D)$ 决定哪个 feature 优先选为节点,其中 $H_A(D) = -\sum_i p(D_i) \log p(D_i)$
3)cart(classification and regression tree):基尼系数最小的优先选为节点,$gini(D) = \sum_k p_k(1-p_k)$,
$gini(D|A) = \sum_i p(D_i) \sum_k p(D_{ik})(1-p(D_{ik}))$(二叉树,i 只有两种情况)
比较每个 feature A 在不同切分情况下的 gini,选择 gini 最小的 feature 及其对应的那种情况作为一个二叉树节点
# CART on the Bank Note dataset
from random import seed
from random import randrange
from csv import reader
# Load a CSV file
def load_csv(filename):
    """Read a CSV file and return its rows as lists of strings.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one list of string fields per non-blank CSV row.
    """
    # csv.reader on Python 3 needs a text-mode file opened with newline="";
    # the former binary mode ("rb") fails there. The ``with`` block also
    # closes the handle, which the previous version never did.
    with open(filename, newline="") as csv_file:
        # Blank lines come back as empty rows; drop them so callers can
        # index columns safely.
        return [row for row in reader(csv_file) if row]
# Convert string column to float
def str_column_to_float(dataset, column):
    """Convert one column of every row from string to float, in place.

    Args:
        dataset: Sequence of mutable rows.
        column: Index of the column to convert; surrounding whitespace
            is stripped before conversion.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

    Each fold holds ``int(len(dataset) / n_folds)`` rows drawn without
    replacement; rows left over after filling all folds are not used.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        n_folds: Number of folds to create.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        current = []
        while len(current) < rows_per_fold:
            current.append(remaining.pop(randrange(len(remaining))))
        folds.append(current)
    return folds
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where the two sequences agree.

    Args:
        actual: Sequence of true values.
        predicted: Sequence of predicted values, aligned by index.

    Returns:
        Accuracy as a float between 0.0 and 100.0.

    Raises:
        ZeroDivisionError: If ``actual`` is empty.
    """
    matches = sum(1 for i in range(len(actual)) if actual[i] == predicted[i])
    return matches / float(len(actual)) * 100.0
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross validation.

    Each fold is used once as the test set while the remaining folds,
    flattened into one list, form the training set.

    Args:
        dataset: Sequence of rows whose last element is the class value.
        algorithm: Callable invoked as ``algorithm(train, test, *args)``
            that returns one prediction per test row.
        n_folds: Number of folds.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for fold in folds:
        other_folds = list(folds)
        other_folds.remove(fold)
        train_set = [row for other in other_folds for row in other]
        # Test rows are copies with the class value blanked out so the
        # algorithm cannot read the answer.
        test_set = []
        for row in fold:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
left, right = list(), list()
for row in dataset:
if row[index] < value:
left.append(row)
else:
right.append(row)
return left, right
# Calculate the Gini index for a split dataset
def gini_index(groups, class_values):
    """Return the size-weighted Gini impurity of a candidate split.

    For every non-empty group the impurity ``sum(p * (1 - p))`` over
    ``class_values`` is computed, where ``p`` is the share of the group's
    rows carrying that class in their last element. Each group's impurity
    is then weighted by ``len(group) / total_rows``. The previous version
    omitted this weight, so a tiny impure group counted as much as a
    large one.

    Args:
        groups: Iterable of row lists (the sides of the split).
        class_values: Class values to account for.

    Returns:
        The weighted Gini impurity as a float; 0.0 for a perfect split or
        when every group is empty.
    """
    total = float(sum(len(group) for group in groups))
    if total == 0:
        return 0.0
    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            # An empty side contributes nothing and would divide by zero.
            continue
        labels = [row[-1] for row in group]
        impurity = 0.0
        for class_value in class_values:
            proportion = labels.count(class_value) / float(size)
            impurity += proportion * (1.0 - proportion)
        gini += (size / total) * impurity
    return gini
# Select the best split point for a dataset
def get_split(dataset):
    """Find the column/threshold pair with the lowest Gini score.

    Every value of every column except the last is tried as a threshold;
    the first candidate with the strictly lowest score wins.

    Args:
        dataset: Non-empty sequence of rows whose last element is the
            class value.

    Returns:
        A dict with keys ``'index'`` (column), ``'value'`` (threshold) and
        ``'groups'`` (the ``(left, right)`` split). When no candidate is
        evaluated the placeholders ``999``, ``999`` and ``None`` are
        returned.
    """
    class_values = list(set(row[-1] for row in dataset))
    best_index, best_value, best_score, best_groups = 999, 999, 999, None
    feature_count = len(dataset[0]) - 1
    for index in range(feature_count):
        for row in dataset:
            threshold = row[index]
            candidate = test_split(index, threshold, dataset)
            score = gini_index(candidate, class_values)
            if score < best_score:
                best_index, best_value = index, threshold
                best_score, best_groups = score, candidate
    return {'index': best_index, 'value': best_value, 'groups': best_groups}
# Create a terminal node value
def to_terminal(group):
    """Return the most frequent class value among the rows of ``group``.

    Args:
        group: Non-empty sequence of rows whose last element is the class.

    Raises:
        ValueError: If ``group`` is empty.
    """
    labels = [row[-1] for row in group]
    candidates = set(labels)
    return max(candidates, key=labels.count)
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    """Recursively grow the children of ``node`` in place.

    The ``'groups'`` entry of ``node`` is consumed and replaced by
    ``'left'`` and ``'right'`` entries, each either a terminal class value
    or a nested node dict.

    Args:
        node: Node dict as returned by ``get_split``.
        max_depth: Depth at which both children become terminals.
        min_size: A side with at most this many rows becomes a terminal.
        depth: Depth of ``node``.
    """
    left, right = node['groups']
    del node['groups']
    # One side is empty: no real split happened, so both children share
    # the majority class of all rows.
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # Depth limit reached: stop growing.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # Left child first, then right, each either terminal or split further.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
            continue
        node[side] = get_split(rows)
        split(node[side], max_depth, min_size, depth + 1)
# Build a decision tree
def build_tree(train, max_depth, min_size):
    """Build a decision tree from the training rows.

    Args:
        train: Training rows whose last element is the class value.
        max_depth: Maximum tree depth, forwarded to ``split``.
        min_size: Minimum rows per side before it becomes a terminal,
            forwarded to ``split``.

    Returns:
        The root node dict of the grown tree.
    """
    # Split on the ``train`` argument. The previous version read the
    # module-level ``dataset`` here, so the root split was chosen on all
    # rows, including the fold held out for testing.
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root
# Make a prediction with a decision tree
def predict(node, row):
    """Walk the tree from ``node`` and return the class for ``row``.

    Args:
        node: Node dict with keys ``'index'``, ``'value'``, ``'left'`` and
            ``'right'``; children are nested node dicts or terminal values.
        row: Sequence of attribute values.

    Returns:
        The terminal value reached by following ``row`` down the tree.
    """
    while True:
        side = 'left' if row[node['index']] < node['value'] else 'right'
        child = node[side]
        if not isinstance(child, dict):
            return child
        node = child
# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
    """Fit a tree on ``train`` and predict every row of ``test``.

    Args:
        train: Training rows passed to ``build_tree``.
        test: Rows to classify.
        max_depth: Forwarded to ``build_tree``.
        min_size: Forwarded to ``build_tree``.

    Returns:
        A list with one prediction per test row, in order.
    """
    tree = build_tree(train, max_depth, min_size)
    return [predict(tree, row) for row in test]
# Test CART on Bank Note dataset
# Fix the random seed so the fold assignment is repeatable.
seed(1)
# load and prepare data
filename = 'data_banknote_authentication.csv'
dataset = load_csv(filename)
# convert every column from string to float, in place
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
# evaluate algorithm
n_folds, max_depth, min_size = 5, 5, 10
scores = evaluate_algorithm(dataset, decision_tree, n_folds, max_depth, min_size)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))
这个程序的gini计算少了权重p(Di)=|Di|/|D|,导致accuracy下降了不少
https://docs.python.org/2/library/re.html#regular-expression-objects //python 库查网站
spark with ml //pyspark
1)initialize sc = SparkContext('local[4]', 'app name')
2)form RDD RDD= sc.textFile(filename) or RDD=sc.parallelize(list/array)
3) RDD 's map() ,groupByKey(),reduceByKey(),sortByKey(),sortBy(function wt by ourself), filter, combineByKey
4) merge two RDDs: join(), leftOuterJoin(), rightOuterJoin(), fullOuterJoin(). Note that a join only works on two-column (key, value) records, so before joining first pack the other columns into a single value, i.e. (key, (v1, v2, ...)), or data will be lost after the merge. union() simply appends one RDD to the other, like push_back in C++
5) in Spark, to make the result identical on every run we can't just sort the data with sortByKey; instead first merge key+value into a string, then use sortBy, so that each run gives the same result
6) action function, take()//every time the result maybe not the same, takeOrdered(number, key=sortfunction)//if not set key ,then return the smallest ones
7) collaborative Filtering
1)split the input movie data to trainingRDD, validationRDD,testRDD = ratingsRDD.randomSplit([6,2,2], seed =0)
2)from pyspark.mllib.recommendation import ALS
1)choose the model with the least errors
seed =5, iterations =0.1,rank from 4 ,8,12, computeError is the RMSE = $\sqrt{\frac{1}{n}\sum_i (h(x_i)-y_i)^2}$
# Train one ALS model per candidate rank and keep the rank whose
# validation error is lowest.
for rank in ranks:
    model = ALS.train(trainingRDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularizationParameter)
    predictedRatingsRDD = model.predictAll(validationForPredictRDD)
    error = computeError(predictedRatingsRDD, validationRDD)
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < minError:
        minError = error
        bestRank = rank
2)get that with the best result , then use testRDD, to get the predictResults