Train/predict/cross-validation demo including almost all classifiers from scikit-learn

12 篇文章 0 订阅
2 篇文章 0 订阅
import numpy as np
from sklearn.model_selection import train_test_split
#from sklearn import cross_validation
from sklearn import model_selection
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn import svm
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier,RandomTreesEmbedding
import datetime
from sklearn import datasets as ds

# Registry of classifier instances to benchmark, keyed by a short display label.
# Every entry is fit, scored on the hold-out split, and cross-validated by the
# evaluation loop at the bottom of the script.
estimators = {}
estimators['GaussianNB'] = GaussianNB()
# MultinomialNB requires non-negative features; the StandardScaler-transformed
# inputs used below contain negative values, so it stays disabled.
#estimators['MultinomialNB'] = MultinomialNB()
estimators['BernoulliNB'] = BernoulliNB()
estimators['DecisionTreeClassifier'] = tree.DecisionTreeClassifier()
estimators['ExtraTreeClassifier'] = tree.ExtraTreeClassifier()
# Same forest at three ensemble sizes, to show the accuracy/time trade-off.
estimators['forest_100'] = RandomForestClassifier(n_estimators = 100)
estimators['forest_50'] = RandomForestClassifier(n_estimators = 50)
estimators['forest_10'] = RandomForestClassifier(n_estimators = 10)
estimators['svm_c_rbf'] = svm.SVC()
estimators['svm_c_linear'] = svm.SVC(kernel='linear')
estimators['svm_linear'] = svm.LinearSVC()
estimators['QuadraticDiscriminantAnalysis'] = QuadraticDiscriminantAnalysis()
estimators['LinearDiscriminantAnalysis'] = LinearDiscriminantAnalysis(solver='svd')
estimators['gaussian_process'] = GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
estimators['neural_network'] = MLPClassifier(hidden_layer_sizes=(256,128,64))
estimators['AdaBoost'] = AdaBoostClassifier()
estimators['BaggingClassifier'] = BaggingClassifier()
# VotingClassifier needs an explicit list of (name, estimator) pairs,
# e.g. VotingClassifier(estimators=[...]); left disabled in this demo.
#estimators['VotingClassifier'] = VotingClassifier(estimators=[...])
estimators['ExtraTreesClassifier'] = ExtraTreesClassifier()
estimators['KNeighborsClassifier'] = KNeighborsClassifier(7)
# dual=True is only supported by the 'liblinear' solver; with the modern
# scikit-learn default solver ('lbfgs') it raises a ValueError, so the
# solver is pinned explicitly.
estimators['LogisticRegression'] = linear_model.LogisticRegression(solver='liblinear', dual=True)
estimators['PassiveAggressiveClassifier'] = linear_model.PassiveAggressiveClassifier()
estimators['RidgeClassifier'] = linear_model.RidgeClassifier()
estimators['SGDClassifier'] = linear_model.SGDClassifier()
estimators['GradientBoostingClassifier'] = GradientBoostingClassifier(n_estimators=100, max_depth=8)

# Load feature matrix and label vector from local CSV files.
# NOTE(review): 'trainf.csv'/'trainl.csv' must exist in the working
# directory; rows of trainf.csv are assumed to align with trainl.csv.
x = np.loadtxt('trainf.csv',delimiter=",")
y = np.loadtxt('trainl.csv',delimiter=",")

# Alternative input path: load a libsvm-format file instead of CSVs.
# x, y = ds.load_svmlight_file("color.txt")
# x = x.toarray()

# Earlier preprocessing experiments kept for reference (all disabled);
# only StandardScaler below is active.
#x = RandomTreesEmbedding().fit_transform(x)
#z = StandardScaler().fit_transform(x)
#z = [y ,x]#
#ds.dump_svmlight_file(z,y,"UVN.txt",zero_based=False)
#x = preprocessing.Normalizer(x)
#x = preprocessing.minmax_scale(x)
#x = preprocessing.normalize(x)
# Standardize features to zero mean / unit variance (required for good
# behavior of SVMs, MLP, and the linear models registered above).
x = StandardScaler().fit_transform(x)
# 80/20 train/test split; fixed random_state makes the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(x, y, test_size=0.2, random_state=3)


# Fit and evaluate every registered classifier, reporting the hold-out
# accuracy, the 5-fold cross-validation mean +/- 2*std, and wall-clock time.
for k in estimators.keys():
    start_time = datetime.datetime.now()
    # Was a Python 2 print statement (SyntaxError under Python 3); converted
    # to the print() function already used everywhere else in this script.
    print('----%s----' % k)
    estimators[k] = estimators[k].fit(data_train, target_train)
    # .score() computes predictions internally, so a separate predict()
    # call (previously stored in an unused variable) is unnecessary.
    print("%s Score: %0.4f" % (k, estimators[k].score(data_test, target_test)))
    # Cross-validate on the full standardized dataset (note: this includes
    # the hold-out rows, so it is an independent estimate, not a re-test).
    scores = model_selection.cross_val_score(estimators[k], x, y, cv=5)
    print("%s Cross Avg. Score: %0.4f (+/- %0.4f)" % (k, scores.mean(), scores.std() * 2))
    end_time = datetime.datetime.now()
    time_spend = end_time - start_time
    print("%s Time: %0.4f" % (k, time_spend.total_seconds()))


Data download:

http://download.csdn.net/detail/u014333051/9709819


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
K-means是常用的聚类算法之一,它的主要思想是将数据点分为K个簇,使得同一簇内的点相似度较高,不同簇之间的点相似度较低。在scikit-learn中,KMeans聚类算法已经实现,可以方便地进行聚类操作。 本文将介绍使用scikit-learn中的KMeans聚类算法进行聚类的步骤和实现方法,并介绍MiniBatchKMeans的使用。 ## 1. 数据准备 我们先生成一个随机数据集,用于演示KMeans聚类: ```python import numpy as np # 生成随机数据 np.random.seed(0) X = np.random.randn(1000, 2) # 生成1000个二维数据点 ``` ## 2. 模型训练 接下来,我们使用KMeans模型对数据进行聚类: ```python from sklearn.cluster import KMeans # 构建模型 kmeans = KMeans(n_clusters=3, random_state=0) # 训练模型 kmeans.fit(X) ``` 这里选择将数据分为3个簇,可以根据实际情况进行调整。训练完成后,我们可以查看簇中心点的位置: ```python print(kmeans.cluster_centers_) ``` 输出: ``` [[ 0.05161133 -0.96525049] [ 1.06359705 -0.02646225] [-0.9680658 0.04252211]] ``` ## 3. 预测和评估 训练完成后,我们可以使用训练好的模型对新数据进行预测: ```python # 预测新数据 y_pred = kmeans.predict(X) ``` 对于聚类算法,我们可以使用轮廓系数(Silhouette Coefficient)评估聚类效果。轮廓系数是一种衡量聚类质量的指标,取值范围在[-1, 1]之间,越接近1表示聚类效果越好。在scikit-learn中,可以使用metrics.silhouette_score来计算轮廓系数: ```python from sklearn import metrics # 计算轮廓系数 score = metrics.silhouette_score(X, y_pred) print(score) ``` 输出: ``` 0.6011942331016043 ``` ## 4. MiniBatchKMeans KMeans聚类算法的一个问题是它对于大规模数据的聚类会比较慢。因此,scikit-learn中还提供了MiniBatchKMeans算法,它可以加快聚类速度。 MiniBatchKMeans的使用方法与KMeans类似: ```python from sklearn.cluster import MiniBatchKMeans # 构建模型 mbkmeans = MiniBatchKMeans(n_clusters=3, random_state=0) # 训练模型 mbkmeans.fit(X) # 预测新数据 y_pred = mbkmeans.predict(X) # 计算轮廓系数 score = metrics.silhouette_score(X, y_pred) print(score) ``` 需要注意的是,MiniBatchKMeans算法在聚类效果上可能会稍微劣于KMeans算法,但是速度更加快捷。在处理大规模数据时,可以优先考虑使用MiniBatchKMeans算法。 本文介绍了使用scikit-learn中的KMeans聚类算法进行聚类的步骤和实现方法,并介绍了MiniBatchKMeans的使用。在实际应用中,可以根据实际情况选择不同的聚类算法和参数。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值