sklearn库--ML算法全部代码整理

此代码整理了 sklearn 中常用的机器学习算法（以分类算法为主），非常适合初学者。如有任何问题，欢迎提问；代码会持续更新。

# Load packages
import numpy as np
import pandas as pd
import os

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing 


# Process-wide setup (runs at import time).
# Sets TF_CPP_MIN_LOG_LEVEL to '3' -- presumably to silence TensorFlow's native
# logging; nothing in the visible part of this file imports TensorFlow.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
# Switch the working directory to the data folder so that the relative CSV
# names used by the loaders below ('adware.csv', 'GM.csv', 'begin.csv')
# resolve against it. NOTE(review): the path is machine-specific.
os.chdir("/home/ftt/DL/transfer_learning/classification_2/data_9_feature")

# Data preprocessing
def data_preprocess(filename):
    """Read the CSV file at *filename* and return it as a DataFrame.

    No transformation is applied. Two optional clean-up recipes are kept
    below as comments for reference only; neither of them is executed.
    """
    frame = pd.read_csv(filename)

    # Disabled recipe 1: clip negatives, fill NaN, log-scale every column
    # except the last one, then cast to int.
    #     data[data<0]=0
    #     data=data.fillna(0)
    #     print(data.max().max())
    #     column=data.columns
    #     for i in column[:-1]:
    #         data[i] = data[i].map(lambda x: np.log(x+2)*10000)
    #     data=data.astype('int')
    #
    # Disabled recipe 2: MaxAbs scaling, then dropping duplicate rows.
    #     min_max_scaler = preprocessing.MaxAbsScaler()
    #     data=min_max_scaler.fit_transform(data)*10000
    #
    #     data=data.drop_duplicates(keep='first')
    return frame

# Load data (approach 1): NumPy, fixed-size split of two classes
def load_data_np(n_adware_train=125000, n_begin_train=380000):
    """Build a train/test split from 'adware.csv' and 'begin.csv' with NumPy.

    Each file is read (header row skipped), shuffled in place, and cut at the
    given row count: the leading rows go to the training set, the remainder
    to the test set. Both sets are shuffled again after stacking so the two
    sources are interleaved.

    Args:
        n_adware_train: number of shuffled 'adware.csv' rows used for training.
        n_begin_train: number of shuffled 'begin.csv' rows used for training.

    Returns:
        (x_train, y_train, x_test, y_test) where the x arrays hold columns
        0-8 and the y arrays hold column 9 of the stacked rows.
    """
    # ndmin=2 keeps a single-row file two-dimensional so the slicing below works.
    adware = np.loadtxt('adware.csv', skiprows=1, delimiter=',', ndmin=2)
    begin = np.loadtxt('begin.csv', skiprows=1, delimiter=',', ndmin=2)
    # 'GM.csv' used to be loaded and shuffled here as well, but it never
    # contributed to the returned arrays, so that dead I/O was dropped.
    np.random.shuffle(adware)
    np.random.shuffle(begin)

    # np.vstack replaces np.row_stack, which is a deprecated alias of it.
    train = np.vstack((adware[:n_adware_train], begin[:n_begin_train]))
    test = np.vstack((adware[n_adware_train:], begin[n_begin_train:]))
    np.random.shuffle(train)
    np.random.shuffle(test)

    x_train, y_train = train[:, :9], train[:, 9]
    x_test, y_test = test[:, :9], test[:, 9]
    print(train.shape, test.shape)
    return x_train, y_train, x_test, y_test

# Load data (approach 2): pandas, 80/20 split of each class
def load_data_pd():
    """Build a train/test split from the three CSV files with pandas.

    Each of 'adware.csv', 'GM.csv' and 'begin.csv' is loaded through
    data_preprocess, shuffled, and cut 80/20. The 80% parts are stacked into
    the training set and the 20% parts into the test set.

    Returns:
        (x_train, y_train, x_test, y_test) where the x arrays hold columns
        0-8 and the y arrays hold column 9 of the stacked rows.
    """
    adware = data_preprocess('adware.csv').sample(frac=1.0)
    gm = data_preprocess('GM.csv').sample(frac=1.0)
    begin = data_preprocess('begin.csv').sample(frac=1.0)

    print(len(adware), len(gm), len(begin))

    train_parts, test_parts = [], []
    for frame in (gm, adware, begin):
        cut = int(0.8 * len(frame))
        train_parts.append(frame[:cut])
        test_parts.append(frame[cut:])

    # Stack rows with concat. The previous pd.merge(..., how='outer') joined
    # on every shared column, so rows present in both inputs were collapsed
    # (or multiplied when repeated) instead of simply being appended.
    # The extra sample(frac=1.0) interleaves the sources, as load_data_np does.
    train = pd.concat(train_parts, ignore_index=True).sample(frac=1.0).values
    test = pd.concat(test_parts, ignore_index=True).sample(frac=1.0).values

    x_train, y_train = train[:, :9], train[:, 9]
    x_test, y_test = test[:, :9], test[:, 9]

    # Optional feature scaling (disabled):
    # min_max_scaler = preprocessing.MaxAbsScaler()
    # x_train=min_max_scaler.fit_transform(x_train)*10000
    # x_test=min_max_scaler.transform(x_test)*10000

    return x_train, y_train, x_test, y_test

# Load data (approach 3): pool, de-duplicate, shuffle, then split 80/20
def load_data_split():
    """Build a train/test split from the pooled 'adware.csv' and 'GM.csv'.

    The two files are stacked, exact duplicate rows are dropped, the rows
    are shuffled, and the first 80% become the training set.

    Returns:
        (x_train, y_train, x_test, y_test) where the x arrays hold columns
        0-8 and the y arrays hold column 9 of the pooled rows.
    """
    adware = pd.read_csv('adware.csv')
    gm = pd.read_csv('GM.csv')
    # 'begin.csv' used to be read here too but was never used, so the read
    # was dropped. concat replaces an all-column outer merge; after
    # drop_duplicates both give the same set of unique rows.
    samples = pd.concat([adware, gm], ignore_index=True)
    samples = samples.drop_duplicates(keep='first')
    samples = samples.sample(frac=1.0).values

    # Sum of the label column and its complement (the class counts when the
    # labels are 0/1 -- assumption, verify against the data).
    label_total = samples[:, 9].sum()
    print(label_total, len(samples) - label_total)

    # This is the row count; it was previously named and printed as a
    # "features" count.
    num_samples = samples.shape[0]
    print("Number of all samples: \t", num_samples)

    split = int(num_samples * 0.8)
    train = samples[:split]
    test = samples[split:]
    x_train, y_train = train[:, :9], train[:, 9]
    x_test, y_test = test[:, :9], test[:, 9]
    print(train.shape, test.shape)
    return x_train, y_train, x_test, y_test

# Model selection
def model():
    """Return the (unfitted) estimator used by train(), Gridsearch() and CV().

    The callers in this file evaluate with classification tools
    (predict_proba, classification_report, confusion_matrix,
    scoring='accuracy'), so a classifier is returned. Swap the active line
    for one of the commented alternatives to try another algorithm.
    """
    # clf = RandomForestClassifier(n_estimators=200)
    # clf = DecisionTreeClassifier()
    # clf = KNeighborsClassifier()
    # clf = GaussianNB()
    # clf = SVC()
    # clf = LogisticRegression()
    # clf = ExtraTreesClassifier(n_estimators=200, max_depth=None,min_samples_split=2, random_state=0)
    # clf = AdaBoostClassifier(n_estimators=1000)
    # clf = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, subsample=0.8, max_features='sqrt', min_samples_split=1200, max_depth=7, min_samples_leaf=60, random_state=10)

    # `loss` and `min_impurity_split` are deliberately left at their defaults:
    # the loss names were renamed and `min_impurity_split` was removed in
    # newer scikit-learn releases, so passing them explicitly makes the call
    # fail there, while omitting them works on both old and new versions.
    clf = GradientBoostingClassifier(
        init=None,
        n_estimators=1000,
        learning_rate=0.1,
        subsample=0.8,
        max_features='sqrt',
        criterion='friedman_mse',
        min_samples_split=1200,
        min_impurity_decrease=0.0,
        max_depth=7,
        max_leaf_nodes=None,
        min_samples_leaf=60,
        warm_start=False,
        random_state=10,
    )
    return clf


# Training and evaluation
def train():
    """Fit the estimator from model() on load_data_pd() data and print metrics.

    Classification metrics are printed when the fitted estimator exposes
    `classes_`; otherwise the estimator is treated as a regressor and its
    score plus the mean squared error are printed. Feature importances are
    printed only for estimators that provide them.
    """
    x_train, y_train, x_test, y_test = load_data_pd()
    clf = model()
    # Report the estimator actually in use rather than a hard-coded name.
    print(type(clf).__name__)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Fitted scikit-learn classifiers expose `classes_`; used here as a
    # duck-typed classifier check so no extra import is needed.
    if hasattr(clf, "classes_"):
        print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
        print("precision_recall_f1-score_accuracy:\n", metrics.classification_report(y_test, y_pred))
        print("confusion_matrix:\n", metrics.confusion_matrix(y_test, y_pred))
        # For a custom decision threshold or an AUC score, use
        # clf.predict_proba(x_test)[:, 1] (when the estimator offers it), e.g.
        # metrics.roc_auc_score(y_test, proba) or (proba > 0.5) together with
        # metrics.precision_score / metrics.recall_score.
    else:
        print("Score : %.4g" % clf.score(x_test, y_test))
        print("MSE : %.4g" % metrics.mean_squared_error(y_test, y_pred))

    # Not every estimator offered by model() has feature_importances_.
    if hasattr(clf, "feature_importances_"):
        print("Feature importances:", clf.feature_importances_)

# Grid search over hyper-parameters
def Gridsearch():
    """Grid-search `n_estimators` for the estimator from model() and print results.

    Runs a 5-fold GridSearchCV scored by accuracy on the training split from
    load_data_pd(), then prints the mean test score per candidate, the best
    parameters and the best score.
    """
    x_train, y_train, _x_test, _y_test = load_data_pd()
    clf = model()
    # The `iid` argument and the `grid_scores_` attribute no longer exist in
    # current scikit-learn; `cv_results_` carries the per-candidate scores.
    gsearch = GridSearchCV(
        estimator=clf,
        param_grid={'n_estimators': [100, 200, 300, 500, 800, 1000]},
        scoring='accuracy',
        cv=5,
    )
    gsearch.fit(x_train, y_train)
    candidate_scores = list(zip(gsearch.cv_results_['params'],
                                gsearch.cv_results_['mean_test_score']))
    print(candidate_scores, gsearch.best_params_, gsearch.best_score_)

# Cross-validation
def CV():
    """Cross-validate the estimator from model() and print the fold scores.

    Uses the training split returned by load_data_pd() and prints the
    per-fold scores followed by their mean.
    """
    # load_data_pd() returns four arrays; unpacking it into two names, as
    # this function did before, raised ValueError before anything ran.
    x_train, y_train, _x_test, _y_test = load_data_pd()
    clf = model()
    scores = cross_val_score(clf, x_train, y_train)
    print(scores)
    print(scores.mean())

# Script entry point: run the full train/evaluate cycle ten times in a row,
# printing a separator line before each run.
for _run in range(10):
    print('train()-----gr')
    train()

本代码为本人总结并经过验证。如有任何问题，欢迎提问，也欢迎大家一起完善这个模板，方便之后使用。

评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值