Sure, I can help with that. Let's start by importing the necessary libraries and loading the Adult dataset, dropping rows with missing values and encoding the income label as 1 (>50K) or 0 (<=50K).
```python
import csv
import random
import math

# Load the dataset, dropping rows with missing values ('?')
# and encoding the income label as 1 (>50K) or 0 (<=50K).
def load_dataset(filename):
    dataset = []
    with open(filename) as file:
        reader = csv.reader(file)
        for row in reader:
            row = [field.strip() for field in row]  # adult.data pads fields with spaces
            if row and '?' not in row:
                dataset.append(row[:-1] + [1 if row[-1] == '>50K' else 0])
    return dataset

# Randomly split the dataset into training and evaluation sets.
def split_dataset(dataset, split_ratio):
    train_size = int(len(dataset) * split_ratio)
    train_set = []
    eval_set = list(dataset)
    while len(train_set) < train_size:
        index = random.randrange(len(eval_set))
        train_set.append(eval_set.pop(index))
    return train_set, eval_set

dataset = load_dataset('data1/adult.data')
train_set, eval_set = split_dataset(dataset, 0.8)
```
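As a quick sanity check (assuming `adult.data` is at the path above), you can print the split sizes and the class balance before training anything:
```python
# Sanity check: split sizes and class balance (assumes the load above succeeded).
print('Total records:', len(dataset))
print('Train/eval sizes:', len(train_set), len(eval_set))
print('Fraction >50K:', sum(record[-1] for record in dataset) / len(dataset))
```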
Now, let's implement the decision tree algorithm. We will use the ID3 algorithm with entropy as the splitting criterion.
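For reference, the entropy of a set $S$ with class proportions $p_c$ is

$$H(S) = -\sum_{c} p_c \log_2 p_c,$$

and ID3 splits on the attribute $A$ with the highest information gain

$$IG(S, A) = H(S) - \sum_{v \in \mathrm{values}(A)} \frac{|S_v|}{|S|}\, H(S_v),$$

where $S_v$ is the subset of $S$ whose value for $A$ is $v$. The functions below implement these definitions directly.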
```python
# Calculate the entropy of a dataset's labels.
def entropy(dataset):
    num_records = len(dataset)
    label_counts = {}
    for record in dataset:
        label = record[-1]
        if label not in label_counts:
            label_counts[label] = 0
        label_counts[label] += 1
    result = 0.0
    for label in label_counts:
        prob = float(label_counts[label]) / num_records
        result -= prob * math.log2(prob)
    return result

# Split the dataset into groups keyed by the value of a given attribute.
def split_dataset_by_attribute(dataset, attribute_index):
    splits = {}
    for record in dataset:
        attribute_value = record[attribute_index]
        if attribute_value not in splits:
            splits[attribute_value] = []
        splits[attribute_value].append(record)
    return splits

# Information gain of the dataset after splitting on a given attribute.
def information_gain(dataset, attribute_index):
    attribute_values = set(record[attribute_index] for record in dataset)
    split_entropies = 0.0
    for attribute_value in attribute_values:
        split = [record for record in dataset if record[attribute_index] == attribute_value]
        prob = float(len(split)) / len(dataset)
        split_entropies += prob * entropy(split)
    return entropy(dataset) - split_entropies

# Find the attribute (by index) with the highest information gain.
def find_best_split_attribute(dataset, attribute_indices):
    best_attribute_index = -1
    best_information_gain = -1.0
    for attribute_index in attribute_indices:
        information_gain_val = information_gain(dataset, attribute_index)
        if information_gain_val > best_information_gain:
            best_information_gain = information_gain_val
            best_attribute_index = attribute_index
    return best_attribute_index

# Create a decision tree recursively.
def create_decision_tree(dataset, attribute_indices):
    labels = [record[-1] for record in dataset]
    # If all records share one label, return a leaf node with that label.
    if len(set(labels)) == 1:
        return labels[0]
    # If no attributes are left, return a leaf node with the majority label.
    if len(attribute_indices) == 0:
        return max(set(labels), key=labels.count)
    # Otherwise, split on the best attribute and build a subtree per value.
    # Note: find_best_split_attribute returns the attribute index itself,
    # so it is used directly rather than as a position into attribute_indices.
    best_attribute = find_best_split_attribute(dataset, attribute_indices)
    remaining_indices = [i for i in attribute_indices if i != best_attribute]
    tree = {best_attribute: {}}
    splits = split_dataset_by_attribute(dataset, best_attribute)
    for attribute_value in splits:
        tree[best_attribute][attribute_value] = create_decision_tree(splits[attribute_value], remaining_indices)
    return tree

# Predict the label of a record using a decision tree.
def predict(tree, record):
    # Leaves are label values (ints); internal nodes are dicts.
    if not isinstance(tree, dict):
        return tree
    attribute_index = next(iter(tree))
    attribute_value = record[attribute_index]
    if attribute_value not in tree[attribute_index]:
        return None  # attribute value never seen during training
    return predict(tree[attribute_index][attribute_value], record)

# Calculate accuracy of a decision tree on a dataset.
def calculate_accuracy(tree, dataset):
    num_correct = 0
    for record in dataset:
        predicted_label = predict(tree, record)
        if predicted_label is not None and predicted_label == record[-1]:
            num_correct += 1
    return float(num_correct) / len(dataset)

# Test the decision tree algorithm. Note that every column (including
# high-cardinality numeric ones like fnlwgt) is treated as categorical,
# which can overfit; unseen values at prediction time yield None.
attribute_indices = list(range(len(dataset[0]) - 1))
decision_tree = create_decision_tree(train_set, attribute_indices)
accuracy = calculate_accuracy(decision_tree, eval_set)
print('Accuracy:', accuracy)
```
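To put the tree's accuracy in context, here is a minimal sketch of the majority-class baseline (the Adult dataset is imbalanced, so this baseline is well above 50%):
```python
# Majority-class baseline: always predict the most common training label.
train_labels = [record[-1] for record in train_set]
majority_label = max(set(train_labels), key=train_labels.count)
baseline_accuracy = sum(1 for record in eval_set if record[-1] == majority_label) / len(eval_set)
print('Majority-class baseline accuracy:', baseline_accuracy)
```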
Finally, let's implement the support vector machine using the simplified sequential minimal optimization (SMO) algorithm, which selects the second Lagrange multiplier at random. Since an SVM operates on numeric vectors, we'll restrict it to the numeric columns of the dataset and map the 0/1 labels to -1/+1 internally.
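For reference, the trained classifier predicts with the decision function

$$f(x) = \sum_{i=1}^{n} \alpha_i\, y_i\, K(x_i, x) + b,$$

where $y_i \in \{-1, +1\}$ are the training labels, $K$ is the kernel, and only records with $\alpha_i > 0$ (the support vectors) contribute. SMO repeatedly picks a pair of multipliers that violates the KKT conditions and optimizes them jointly, subject to the constraint $\sum_i \alpha_i y_i = 0$.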
```python
# Dot product of two vectors.
def dot_product(x, y):
    return sum(x[i] * y[i] for i in range(len(x)))

# Magnitude (Euclidean norm) of a vector.
def magnitude(x):
    return math.sqrt(dot_product(x, x))

# Euclidean distance between two vectors.
def distance(x, y):
    return magnitude([x[i] - y[i] for i in range(len(x))])

# Kernel function for two vectors. The Gaussian (RBF) kernel uses the
# squared distance: exp(-gamma * ||x - y||^2).
def kernel(x, y, kernel_type='linear', gamma=0.1):
    if kernel_type == 'linear':
        return dot_product(x, y)
    elif kernel_type == 'gaussian':
        return math.exp(-gamma * distance(x, y) ** 2)

# Train a support vector machine with the simplified SMO algorithm.
# Features must be numeric; the dataset's 0/1 labels are mapped to
# -1/+1 internally, as SMO requires.
def train_svm(dataset, kernel_type='linear', C=1.0, max_iterations=100, tolerance=0.01, gamma=0.1):
    num_records = len(dataset)
    labels = [1.0 if record[-1] == 1 else -1.0 for record in dataset]
    # Initialize the alpha vector and bias term.
    alpha = [0.0] * num_records
    bias = 0.0
    # Precompute the kernel matrix (quadratic in memory -- keep the training set small).
    kernel_matrix = [[kernel(record1[:-1], record2[:-1], kernel_type, gamma) for record2 in dataset] for record1 in dataset]
    # Loop until no alphas change for max_iterations consecutive passes.
    num_iterations = 0
    while num_iterations < max_iterations:
        num_changed_alphas = 0
        for i in range(num_records):
            # Error for record i: f(x_i) - y_i.
            error_i = sum(alpha[j] * labels[j] * kernel_matrix[j][i] for j in range(num_records)) + bias - labels[i]
            # Check whether alpha i violates the KKT conditions.
            if (labels[i] * error_i < -tolerance and alpha[i] < C) or (labels[i] * error_i > tolerance and alpha[i] > 0):
                # Select a second alpha j at random.
                j = i
                while j == i:
                    j = random.randrange(num_records)
                # Error for record j.
                error_j = sum(alpha[k] * labels[k] * kernel_matrix[k][j] for k in range(num_records)) + bias - labels[j]
                # Save the old alpha values.
                alpha_i_old = alpha[i]
                alpha_j_old = alpha[j]
                # Compute the L and H bounds for alpha j.
                if labels[i] != labels[j]:
                    L = max(0, alpha[j] - alpha[i])
                    H = min(C, C + alpha[j] - alpha[i])
                else:
                    L = max(0, alpha[i] + alpha[j] - C)
                    H = min(C, alpha[i] + alpha[j])
                # If L == H, skip this pair of alphas.
                if L == H:
                    continue
                # eta is the second derivative of the objective along the constraint.
                eta = 2 * kernel_matrix[i][j] - kernel_matrix[i][i] - kernel_matrix[j][j]
                if eta >= 0:
                    continue
                # Update alpha j and clip it to [L, H].
                alpha[j] -= (labels[j] * (error_i - error_j)) / eta
                alpha[j] = max(L, min(H, alpha[j]))
                # If alpha j barely moved, skip this pair.
                if abs(alpha[j] - alpha_j_old) < tolerance:
                    continue
                # Update alpha i in the opposite direction.
                alpha[i] += labels[i] * labels[j] * (alpha_j_old - alpha[j])
                # Update the bias term.
                b1 = bias - error_i - labels[i] * (alpha[i] - alpha_i_old) * kernel_matrix[i][i] - labels[j] * (alpha[j] - alpha_j_old) * kernel_matrix[i][j]
                b2 = bias - error_j - labels[i] * (alpha[i] - alpha_i_old) * kernel_matrix[i][j] - labels[j] * (alpha[j] - alpha_j_old) * kernel_matrix[j][j]
                if 0 < alpha[i] < C:
                    bias = b1
                elif 0 < alpha[j] < C:
                    bias = b2
                else:
                    bias = (b1 + b2) / 2
                num_changed_alphas += 1
        # Count consecutive passes with no alpha changes; reset otherwise.
        if num_changed_alphas == 0:
            num_iterations += 1
        else:
            num_iterations = 0
    # Keep the support vectors (records with non-zero alpha) along with
    # their labels and alphas -- the alphas are needed for prediction.
    support_vectors = []
    support_vector_labels = []
    support_vector_alphas = []
    for i in range(num_records):
        if alpha[i] > 0:
            support_vectors.append(dataset[i][:-1])
            support_vector_labels.append(labels[i])
            support_vector_alphas.append(alpha[i])
    return support_vectors, support_vector_labels, support_vector_alphas, bias

# Predict the label of a record: sign of sum_i alpha_i * y_i * K(x_i, x) + b.
def predict_svm(support_vectors, support_vector_labels, support_vector_alphas, bias, record, kernel_type='linear', gamma=0.1):
    decision = 0.0
    for i in range(len(support_vectors)):
        decision += support_vector_alphas[i] * support_vector_labels[i] * kernel(support_vectors[i], record, kernel_type, gamma)
    decision += bias
    return 1 if decision > 0 else 0

# Calculate accuracy of a support vector machine on a dataset.
def calculate_accuracy_svm(support_vectors, support_vector_labels, support_vector_alphas, bias, dataset, kernel_type='linear', gamma=0.1):
    num_correct = 0
    for record in dataset:
        predicted_label = predict_svm(support_vectors, support_vector_labels, support_vector_alphas, bias, record[:-1], kernel_type, gamma)
        if predicted_label == record[-1]:
            num_correct += 1
    return float(num_correct) / len(dataset)

# Test the support vector machine algorithm. The SVM needs numeric
# features, so we keep only the numeric columns of the Adult dataset
# (age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week).
numeric_indices = [0, 2, 4, 10, 11, 12]
def to_numeric(record):
    return [float(record[i]) for i in numeric_indices] + [record[-1]]

# The kernel matrix is quadratic in the training-set size, so we train
# on a small random subsample to keep this tractable.
svm_train = [to_numeric(record) for record in random.sample(train_set, 500)]
svm_eval = [to_numeric(record) for record in eval_set]
support_vectors, support_vector_labels, support_vector_alphas, bias = train_svm(svm_train, 'linear', 1.0, 100, 0.01, 0.1)
accuracy = calculate_accuracy_svm(support_vectors, support_vector_labels, support_vector_alphas, bias, svm_eval, 'linear', 0.1)
print('Accuracy:', accuracy)
```
You can now compare the performance of the two classifiers on the Adult dataset. Keep in mind that they see different views of the data: the decision tree treats every column as categorical, while the SVM uses only the numeric columns.
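If you also want to try the Gaussian kernel, here is a minimal sketch (note that `gamma=0.1` is an untuned guess, and the unscaled numeric features may need normalization for it to work well):
```python
# Optional: retrain with the Gaussian (RBF) kernel. gamma is untuned here.
sv, sv_labels, sv_alphas, b = train_svm(svm_train, 'gaussian', 1.0, 100, 0.01, 0.1)
rbf_accuracy = calculate_accuracy_svm(sv, sv_labels, sv_alphas, b, svm_eval, 'gaussian', 0.1)
print('Gaussian-kernel SVM accuracy:', rbf_accuracy)
```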