Machine Learning Algorithm Framework Workflow

K-Nearest Neighbors (KNN)

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier  # classifier
from sklearn.preprocessing import StandardScaler  # standardization
from sklearn.model_selection import train_test_split  # train/test split

#read data
data=pd.read_csv('data/---.csv')
data.head()

# features and target
x=data[['--','---','---']]  # x should be two-dimensional (a DataFrame)
y=data['target--']   # target labels (1-D)

# split data
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=24)  # test_size: fraction held out for testing; random_state: any fixed seed makes the split reproducible

# standardization: fit the scaler on the training set, reuse it on the test set
sc=StandardScaler()
new_x_train=sc.fit_transform(x_train)
new_x_test=sc.transform(x_test)

# machine learning
knn=KNeighborsClassifier()
knn.fit(new_x_train,y_train)  #model training
knn.predict(new_x_test)   #model prediction

knn.score(new_x_test,y_test)  # model assessment (mean accuracy on the test set)
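To score a brand-new observation, pass it through the same fitted scaler first. A minimal sketch (the feature values here are hypothetical):

# hypothetical new sample; reuse the scaler fitted on the training data
new_sample=pd.DataFrame([[1.2,3.4,5.6]],columns=x.columns)
knn.predict(sc.transform(new_sample))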

GridSearchCV

Grid search with cross-validation

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier  # classifier
from sklearn.preprocessing import StandardScaler  # standardization
from sklearn.model_selection import train_test_split,GridSearchCV  # train/test split and grid search
# read data
data=pd.read_csv('data/---.csv')
data.head()
# features and target
x=data[['--','---','---']]  # x should be two-dimensional (a DataFrame)
y=data['target--']   # target labels (1-D)
# split data
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=24)  # test_size: fraction held out for testing; random_state: any fixed seed makes the split reproducible

# standard
sc=StandardScaler()
new_x_train=sc.fit_transform(x_train)
new_x_test=sc.transform(x_test)
# machine learning
knn=KNeighborsClassifier()
# GridSearchCV
param_dict={'n_neighbors':[1,3,5]}  # candidate hyperparameter values
es=GridSearchCV(knn,param_grid=param_dict,cv=3)  # grid search with 3-fold cross-validation
es.fit(new_x_train,y_train)   # model training

# model assessment
y_predict=es.predict(new_x_test)  # predict on the scaled test set (not the raw x_test)
score=es.score(new_x_test,y_test)  # accuracy of the best estimator
es.best_score_  # best mean cross-validation score
es.best_params_  # best params
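Beyond the best score, GridSearchCV keeps the full search history; a short sketch for inspecting it:

es.best_estimator_  # the refitted model using the best n_neighbors
pd.DataFrame(es.cv_results_)[['params','mean_test_score']]  # mean CV score of every candidate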

Linear regression

LinearRegression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
.........

#model training
lr=LinearRegression()
lr.fit(new_x_train,y_train)
#prediction
y_pred=lr.predict(new_x_test)
print("y_prediction:",y_pred)

lr.coef_  # w
# coef_: array of shape (n_features,) or (n_targets, n_features) ---> the learned coefficients
lr.intercept_  # b
# intercept_: float or array of shape (n_targets,) ---> the intercept

#model assessment
err=mean_squared_error(y_test,y_pred)
err
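A quick sanity check ties coef_ and intercept_ to the predictions: a linear model's output is just X·w + b (assuming a single 1-D target, as above):

import numpy as np
manual=new_x_test @ lr.coef_ + lr.intercept_  # X·w + b
np.allclose(manual,lr.predict(new_x_test))  # True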

SGDRegressor

  • Stochastic gradient descent fits the model by updating the weights one sample (or mini-batch) at a time; the update rule is sketched below.
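Each SGD step moves the weights against the gradient of the loss evaluated on a single randomly drawn sample (or mini-batch):

w \leftarrow w - \eta \, \nabla_w L(w; x_i, y_i)

Here \eta is the learning rate, exposed below as the eta0 parameter of SGDRegressor.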
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
.........

# model training
lr2=SGDRegressor(max_iter=5000,learning_rate='constant',eta0=0.001)
lr2.fit(new_x_train,y_train)

#prediction
y_pred=lr2.predict(new_x_test)
print("y_prediction:",y_pred)


# model assessment
score=lr2.score(new_x_test,y_test)  # R^2 score; the closer to 1, the better
print(score)
error=y_pred-y_test  # residuals
rmse=(error**2).mean()**.5  # root mean squared error
mae=abs(error).mean()  # mean absolute error
print(rmse)
print(mae)

Regularized linear model

Ridge Regression

sklearn.linear_model.Ridge(alpha=1.0, fit_intercept=True, solver="auto", normalize=False)

from sklearn.linear_model import Ridge,RidgeCV
# ridge=Ridge(alpha=0.1)
ridge=RidgeCV(alphas=(0.001,0.005,0.1,0.5,1,10,20,50,100))  # candidate alphas evaluated by cross-validation
ridge.fit(new_x_train,y_train)
#predict
y_pred=ridge.predict(new_x_test)
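After fitting, RidgeCV exposes which penalty strength it selected, and the usual error metric applies:

from sklearn.metrics import mean_squared_error
ridge.alpha_  # the alpha chosen by cross-validation
mean_squared_error(y_test,y_pred)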

Lasso Regression

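Lasso uses an L1 penalty, which can drive some coefficients exactly to zero. A minimal sketch paralleling the Ridge example above (the candidate alphas are hypothetical):

from sklearn.linear_model import Lasso,LassoCV
# lasso=Lasso(alpha=0.1)
lasso=LassoCV(alphas=(0.001,0.005,0.1,0.5,1,10))  # hypothetical candidates
lasso.fit(new_x_train,y_train)
y_pred=lasso.predict(new_x_test)
lasso.alpha_  # the alpha chosen by cross-validation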

Logistic Regression

sklearn.linear_model.LogisticRegression(solver='liblinear', penalty='l2', C=1.0)

from sklearn.linear_model import LogisticRegression
# model training
lr=LogisticRegression()
lr.fit(new_x_train,y_train)
# model prediction
pred=lr.predict(new_x_test)
pred
# output accuracy
lr.score(new_x_test,y_test)
# the distribution of test data
y_test.value_counts()

# assessment
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)  
tn,fp,fn,tp=confusion_matrix(y_test,pred).ravel()
tn,fp,fn,tp
pr=tp/(tp+fp) # precision
recall=tp/(tp+fn) # recall
# APIs for computing precision and recall
from sklearn.metrics import precision_score,recall_score
precision_score(y_test,pred)  # labels must be binary (0/1)
recall_score(y_test,pred)
#calculate f1
f1=2*tp/(2*tp+fn+fp)
f1
f12=2*pr*recall/(pr+recall)
f12
# f1_score
from sklearn.metrics import f1_score
f1_score(y_test,pred)

# classification_report
from sklearn.metrics import classification_report
print(classification_report(y_test,pred,labels=(1,0),target_names=("good","bad")))

DecisionTree

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer  # transform non-numeric data into numeric one-hot features
from sklearn.model_selection import train_test_split
from sklearn.tree import  DecisionTreeClassifier,export_graphviz

data=pd.read_csv('data/----.csv')
x=data[['--','--','--','--']]
y=data['--']

# handle missing values
x['--']=x['--'].fillna(x['--'].mean())  # fill NaNs with the column mean (avoids chained-assignment inplace)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=12) # split data

# feature extraction
# DictVectorizer expects a list of dicts, so convert the DataFrame with to_dict first
transfer=DictVectorizer(sparse=False)
x_train=transfer.fit_transform(x_train.to_dict(orient="records"))
x_test=transfer.transform(x_test.to_dict(orient='records'))
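# (optional, assuming sklearn >= 1.0) inspect the generated one-hot feature names
transfer.get_feature_names_out()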

# machine learning
es=DecisionTreeClassifier(criterion="entropy", max_depth=5)
es.fit(x_train,y_train)

es.score(x_test,y_test)
es.predict(x_test)

export_graphviz(es,out_file='tree.dot',feature_names=['',''])
# export the decision tree to Graphviz .dot format for visualization
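The .dot file can then be rendered to an image, e.g. with the third-party graphviz Python package (assuming it and the Graphviz dot binary are installed):

import graphviz
with open('tree.dot') as f:
    graph=graphviz.Source(f.read())
graph.render('tree',format='png')  # writes tree.png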

Boosting & Bagging

RandomForest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# init the object
rf=RandomForestClassifier()
# set the hyperparameter grid to search
param= {"n_estimators": [120,200,300,500,800,1200], "max_depth": [5, 8, 15, 25, 30]}
# define gridsearch
gs=GridSearchCV(rf,param_grid=param,cv=3)
gs.fit(new_x_train,y_train)

gs.score(new_x_test,y_test)
gs.best_params_
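The tuned forest itself is available for further inspection; a short sketch:

best_rf=gs.best_estimator_  # the refitted RandomForestClassifier
best_rf.feature_importances_  # impurity-based feature importances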

AdaBoost

from sklearn.ensemble  import AdaBoostClassifier
ada=AdaBoostClassifier(n_estimators=1000,learning_rate=0.01)
ada.fit(new_x_train,y_train)

ada.score(new_x_test,y_test)

KMeans


from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# model training
km=KMeans(n_clusters=8,random_state=24)
km.fit(data)

y_predict=km.predict(data)

km.cluster_centers_  # coordinates of the learned cluster centers

silhouette_score(data, y_predict)  # silhouette coefficient in [-1, 1]; higher is better
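n_clusters=8 is just one choice; a minimal sketch for comparing several k values by silhouette score:

for k in range(2,9):
    km_k=KMeans(n_clusters=k,random_state=24).fit(data)
    print(k,silhouette_score(data,km_k.labels_))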

Others

feature extraction

CountVectorizer

import jieba  # Chinese word segmentation, used as the tokenizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# get data
data=["人生苦短,我喜欢Python","生活太长久,我不喜欢Python"]
stop_words=['。',',']  # stop words must be given as a list
# init the vectorizer with jieba as the tokenizer
vector=CountVectorizer(tokenizer=jieba.lcut,stop_words=stop_words)
new_data=vector.fit_transform(data)  # handle the data
print(new_data.toarray())  # dense word-count matrix: one row per document
# into a DataFrame
pd.DataFrame(new_data.toarray(),columns=vector.get_feature_names_out()).T

Feature dimensionality reduction

Feature selector that removes all low-variance features.

from sklearn.feature_selection import VarianceThreshold  # removes all low-variance features
vt=VarianceThreshold(threshold=1)  # remove features whose variance is below 1

new_x=vt.fit_transform(x)
new_x.shape
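To see which columns survived the threshold:

vt.get_support()  # boolean mask over the original columns
vt.variances_  # per-feature variances compared against the threshold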

PCA

Principal component analysis (PCA).

from sklearn.decomposition import PCA
pca=PCA(n_components=2)
pca_x=pca.fit_transform(new_x)
pca_x.shape
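How much information the two components keep can be checked directly:

pca.explained_variance_ratio_  # variance fraction captured by each component
pca.explained_variance_ratio_.sum()  # total variance retained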

Correlated Features

# pearsonr
from scipy.stats import pearsonr

x1 = [12.5, 15.3, 23.2, 26.4, 33.5, 34.4, 39.4, 45.2, 55.4, 60.9]
x2 = [21.2, 23.9, 32.9, 34.1, 42.5, 43.2, 49.0, 52.8, 59.4, 63.5]
pearsonr(x1,x2)

# spearmanr
from scipy.stats import spearmanr
spearmanr(x1,x2)
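Both functions return the correlation coefficient together with a p-value; unpacking makes that explicit:

r,p=pearsonr(x1,x2)  # r in [-1,1]; a small p-value suggests the correlation is significant
r,p=spearmanr(x1,x2)  # rank-based analogue, robust to monotonic nonlinear relations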

Guidance on choosing an algorithm

The scikit-learn documentation offers a "Choosing the right estimator" flowchart that maps data size and task type (classification, regression, clustering, dimensionality reduction) to a suitable model.
