import math
def cos_distance(x, y):
    """Return (and print) the cosine similarity between vectors x and y."""
    # Inner product of the two vectors.
    inner = sum(a * b for a, b in zip(x, y))
    # Euclidean norm of each vector.
    len_x = math.sqrt(sum(a * a for a in x))
    len_y = math.sqrt(sum(b * b for b in y))
    result = inner / (len_x * len_y)
    print(f"余弦距离是:{result}")
    return result
def correlation_distance(x, y):
    """Return (and print) the correlation distance 1 - Pearson r of x and y.

    Bug fix: the original computed BOTH means as ``sum(x) / len(y)``, so the
    mean of ``y`` was never used and ``x``'s mean was divided by the wrong
    length when the vectors differ in size.
    """
    # Per-vector means (each vector uses its own sum and its own length).
    agv_x = sum(x) / len(x)
    agv_y = sum(y) / len(y)
    # Covariance-style numerator of the Pearson correlation.
    numerator = sum([(i - agv_x) * (j - agv_y) for i, j in zip(x, y)])
    # Product of the two centered norms.
    denominator = math.sqrt(sum([(i - agv_x) ** 2 for i in x])) * math.sqrt(sum([(j - agv_y) ** 2 for j in y]))
    # Distance = 1 - correlation; 0 for perfect positive correlation, 2 for perfect negative.
    correlation_distance = 1 - numerator / denominator
    print(f"相关度距离是:{correlation_distance}")
    return correlation_distance
def euclid_distance(x, y):
    """Return (and print) the Euclidean distance between vectors x and y."""
    # Sum of squared component-wise differences, then its square root.
    squared_diffs = ((a - b) ** 2 for a, b in zip(x, y))
    result = math.sqrt(sum(squared_diffs))
    print(f"欧几里得距离:{result}")
    return result
def jaccard_similarity(x, y):
    """Return (and print) the Jaccard coefficient of the element sets of x and y."""
    set_x, set_y = set(x), set(y)
    # |intersection| / |union| of the two element sets.
    ratio = len(set_x & set_y) / len(set_x | set_y)
    print(f"Jaccard系数是:{ratio}")
    return ratio
if __name__ == '__main__':
    # Two pairs of sample vectors: the first pair contains negative
    # components, the second pair is binary.
    x1 = [0, -1, 0, 1]
    y1 = [1, 0, -1, 0]
    x2 = [0, 1, 0, 1]
    y2 = [1, 0, 1, 0]
    list_x = [x1, x2]
    list_y = [y1, y2]
    for x, y in zip(list_x, list_y):
        print(f"x:{x}\ny:{y}")
        cos_distance(x, y)
        correlation_distance(x, y)
        euclid_distance(x, y)
        # The Jaccard coefficient is only computed when the vectors are not
        # both carrying negative components (it treats values as set members).
        if not (any(i < 0 for i in x) and any(j < 0 for j in y)):
            jaccard_similarity(x, y)
import pandas as pd
import matplotlib.pyplot as plt
# Load the iris data set and attach column names.
iris_df = pd.read_csv('./2/iris.csv', header=None,
                      names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'])
# Draw one per-species box plot for each feature on a 2x2 grid of axes.
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
axs = axs.flatten()
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
for i, col in enumerate(feature_cols):
    # To export each feature separately instead, plot on a fresh figure
    # (boxplot by 'species') and save it to f"./2/{col}.jpg".
    iris_df.boxplot(column=[col], by='species', ax=axs[i])
    axs[i].set_title(col)
    axs[i].set_xlabel('')
plt.suptitle('Iris Box Plots', y=1.05, fontsize=16)
plt.tight_layout()
plt.savefig("./2/all.jpg")
plt.show()
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
def data_trans(dataframe):
    """Replace the "Name" column with positional row indices.

    Mutates *dataframe* in place and returns it together with the list of
    the original names in row order (used later for display output).
    """
    row_count = dataframe.shape[0]
    # Capture the original names first, then overwrite each with its index.
    names = [dataframe.loc[idx, "Name"] for idx in range(row_count)]
    for idx in range(row_count):
        dataframe.loc[idx, "Name"] = idx
    return dataframe, names
def display(pred):
    """Print each test sample's predicted label next to its true label.

    Relies on module-level globals set in ``__main__``:
    ``y_test`` (true labels, a pandas Series) and ``test_name``
    (the original sample names returned by ``data_trans``).
    """
    for i in range(len(pred)):
        # True label for the i-th test sample.
        label = y_test.iloc[i]
        if pred[i] == label:
            # NOTE: the backslash continues the string literal onto the next
            # line; any leading whitespace there becomes part of the output.
            print("{:}-{: <15} label:{: <15} pre:\
{: <15} Correct prediction".format(i, test_name[i], label, pred[i]))
        else:
            print(
                "{:}-{: <15} label:{: <15} pre:\
{: <15} Incorrect predictions".format(i, test_name[i], label, pred[i]))
# Decision-tree classifier
def Tree(X_train, y_train, X_test, y_test):
    """Fit a decision tree, print per-sample predictions and accuracy."""
    print("Decision Tree Classifier")
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    display(predictions)
    accuracy = accuracy_score(y_test, predictions)
    print('Decision Tree Classifier Accuracy: {:.2f}%'.format(accuracy * 100))
    print("\n")
# K-nearest-neighbours classifier
def KNN(X_train, y_train, X_test, y_test):
    """Fit a KNN classifier, print per-sample predictions and accuracy."""
    print("KNN Classifier")
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    display(predictions)
    accuracy = accuracy_score(y_test, predictions)
    print('KNN Classifier Accuracy: {:.2f}%'.format(accuracy * 100))
    print("\n")
# Support-vector-machine classifier
def SVM(X_train, y_train, X_test, y_test):
    """Fit an SVM classifier, print per-sample predictions and accuracy."""
    print("SVM Classifier")
    model = SVC()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    display(predictions)
    accuracy = accuracy_score(y_test, predictions)
    print('SVM Accuracy: {:.2f}%'.format(accuracy * 100))
    print("\n")
# AdaBoost ensemble learner built on decision-tree base estimators
def Ada(X_train, y_train, X_test, y_test):
    """Fit an AdaBoost ensemble of decision trees, print predictions and accuracy."""
    # Header print added for consistency with Tree/KNN/SVM, which all
    # announce themselves before showing predictions.
    print("AdaBoost Classifier")
    # NOTE(review): `base_estimator` was renamed to `estimator` in
    # scikit-learn 1.2 and removed in 1.4 — change the keyword if the
    # installed sklearn is upgraded.
    ada_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    ada_clf.fit(X_train, y_train)
    ada_pred = ada_clf.predict(X_test)
    display(ada_pred)
    ada_acc = accuracy_score(y_test, ada_pred)
    print('Ada Accuracy: {:.2f}%'.format(ada_acc * 100))
    print("\n")
if __name__ == '__main__':
    # Load the training and test splits of the vertebrate data set.
    train_df = pd.read_csv('./3/vertebrate1.csv')
    test_df = pd.read_csv('./3/vertebrate1_test.csv')
    # Separate features from labels; data_trans swaps the textual "Name"
    # column for row indices and keeps the original names for display().
    X_train = train_df.iloc[:, :-1]
    X_train, train_name = data_trans(X_train)
    y_train = train_df.iloc[:, -1]
    X_test = test_df.iloc[:, :-1]
    X_test, test_name = data_trans(X_test)
    # y_test and test_name are read as globals by display().
    y_test = test_df.iloc[:, -1]
    # Run every classifier on the same split.
    for classify in (Tree, KNN, SVM, Ada):
        classify(X_train, y_train, X_test, y_test)
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
# AGNES hierarchical clustering of the vertebrate data set under three
# linkage criteria; each run shows a dendrogram and prints the merge table.
data = pd.read_csv('./4/Vertebrate2.csv')
# Build "index-name" labels for the dendrogram leaves.
list_name = data.iloc[:, 0].values
list_name = [f"{i}-{list_name[i]}" for i in range(len(list_name))]
# Keep only the feature columns (drop the name column and the class label).
data = data.iloc[:, 1:-1]
# Loop variable renamed from `type`, which shadowed the builtin.
for method in ["single", "complete", "average"]:
    print("\n")
    print(f"AGNES_{method}:")
    # Linkage matrix: one row per merge (cluster a, cluster b, distance, size).
    dist = linkage(data, method=method)
    fig, ax = plt.subplots(figsize=(15, 8))
    dendrogram(dist, ax=ax, leaf_rotation=0, labels=list_name, orientation="right")
    plt.title(f'{method} Linkage', fontsize=12)
    plt.ylabel('Vertebrate Species', fontsize=12)
    plt.xlabel('Distance', fontsize=12)
    fig.savefig(f"./4/{method}.png", dpi=1200)
    plt.show()
    # Print the merge table in a fixed-width layout.
    dist = dist.tolist()
    print("{: ^10}{: ^15}{: ^15}{: ^15}{: ^15}".format(
        "index", "groups", "groups", "distance", "group_numbers"))
    for i in range(len(dist)):
        print("{: ^10}{: ^15}{: ^15}{: ^15}{: ^15}".format(
            i, int(dist[i][0]), int(dist[i][1]), round(dist[i][2], 1), int(dist[i][3])))
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
def Ccm():
    """Select a KMeans cluster count for the global ``X`` via the silhouette score.

    Fits KMeans for each candidate count in [2, 10), plots the silhouette
    curve with the best count annotated, and returns that count.
    """
    # Candidate numbers of clusters.
    cluster_range = range(2, 10)
    # Silhouette score per candidate count.
    silhouette_scores = []
    # Silhouette scores lie in [-1, 1]; start below that range so a best
    # value is always recorded (the previous init of 0 silently failed when
    # every score was negative).
    score_max = float("-inf")
    score_max_n = 0
    for n_clusters in cluster_range:
        # Fit KMeans with this cluster count on the global data matrix X.
        kmeans = KMeans(n_clusters=n_clusters)
        kmeans.fit(X)
        labels = kmeans.labels_
        # Mean silhouette coefficient over all samples.
        score = silhouette_score(X, labels)
        silhouette_scores.append(score)
        if score > score_max:
            score_max = score
            score_max_n = n_clusters
    # Silhouette curve with the winning cluster count annotated.
    plt.plot(cluster_range, silhouette_scores, marker='o')
    plt.annotate(text=f'n_cluster:{score_max_n}', xy=(score_max_n, score_max),
                 xytext=(score_max_n + 1, score_max),
                 arrowprops={'arrowstyle': '->'}, c="r")
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Coefficient')
    # plt.savefig(f"./5/Ccm_{score_max_n}.jpg")
    plt.show()
    return score_max_n
def k_distance(X, k):
    """Return the sorted distance from each point in X to its k-th nearest neighbour."""
    # k + 1 neighbours because each query point is its own nearest neighbour.
    model = NearestNeighbors(n_neighbors=k + 1)
    model.fit(X)
    distances, _ = model.kneighbors(X)
    # Last column holds the k-th neighbour distance; sort ascending in place.
    kth_distances = distances[:, -1]
    kth_distances.sort()
    return kth_distances
def plot_k_distance(k_distances):
    """Plot the sorted k-distances (the elbow suggests a DBSCAN eps)."""
    point_index = list(range(len(k_distances)))
    plt.plot(point_index, k_distances)
    plt.xlabel('Points sorted by distance')
    plt.ylabel('distance')
    # plt.savefig(f"./5/k_distance.jpg")
    plt.show()
# Define and train the KMeans model
def Kmeasn(X):
    """Cluster X with KMeans (count chosen by Ccm) and plot the result."""
    # Best cluster count according to the silhouette analysis.
    n_clu = Ccm()
    model = KMeans(n_clusters=n_clu)
    model.fit(X)
    # Cluster centre coordinates and per-point cluster assignments.
    centers = model.cluster_centers_
    assignments = model.labels_
    # Points coloured by cluster; centres drawn as large red stars.
    plt.scatter(X[:, 0], X[:, 1], c=assignments)
    plt.scatter(centers[:, 0], centers[:, 1], marker='*', s=200, linewidths=3, color='r')
    plt.title(f'KMeans Clustering-{n_clu}')
    # plt.savefig(f"./5/KMeans Clustering-{n_clu}.jpg")
    plt.show()
# Define and train the DBSCAN model
def Dbscan(X, eps):
    """Cluster X with DBSCAN(eps, min_samples=5); also shows the 3-distance plot."""
    # The 3-distance curve helps justify the chosen eps.
    k = 3
    plot_k_distance(k_distance(X, k))
    # Fit and predict in one step; noise points get label -1.
    assignments = DBSCAN(eps=eps, min_samples=5).fit_predict(X)
    plt.scatter(X[:, 0], X[:, 1], c=assignments)
    plt.title(f'DBSCAN Clustering, eps={eps}, min_samples=5')
    # plt.savefig(f'./5/DBSCAN Clustering-{eps}-5.jpg')
    plt.show()
if __name__ == '__main__':
    # Load the chameleon point set; every column is a feature.
    data = pd.read_csv('./5/chameleon.csv', header=None)
    # X is also read as a global by Ccm().
    X = data.values[:, 0:]
    Kmeasn(X)
    Dbscan(X, 24)