Python Implementations of Common Machine Learning Algorithms

Python source-code implementations of common machine learning algorithms, based mostly on sklearn.

I. Mind Map

II. Python Source Code

## Binary Classification

* Use sklearn's built-in logistic regression, support vector machine, and decision tree APIs for a binary classification task
* Use sklearn's iris dataset, reduced to a binary dataset by dropping all samples of class 2
* Evaluate the models with accuracy

### Preparing the data

```python
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn import tree
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import metrics

iris = datasets.load_iris()
feature_columns = iris.feature_names
target_column = ['res']
np.where(iris.target == 2)  # shows that rows 100-149 belong to class 2 and need to be dropped

features = pd.DataFrame(iris.data, columns=feature_columns)
labels = pd.DataFrame(iris.target, columns=target_column)
features = features[0:100]
labels = labels[0:100]
train_test_data = pd.concat([features, labels], axis=1)

# Split the data into a training set and a test set
train, test = model_selection.train_test_split(train_test_data, test_size=0.1)
train_X = train[feature_columns].values
train_y = train[target_column].values.reshape(-1)  # flatten to a 1-D label array
test_X = test[feature_columns].values
test_y = test[target_column].values.reshape(-1)
```

### Training the models

```python
# Train a logistic regression model
linear = linear_model.LogisticRegression(penalty='l2', C=1.0)
linear.fit(train_X, train_y)
preL = linear.predict(test_X)
metrics.accuracy_score(test_y, preL)

# Train a decision tree model
DT = tree.DecisionTreeClassifier(max_depth=3)
DT = DT.fit(train_X, train_y)
preT = DT.predict(test_X)
metrics.accuracy_score(test_y, preT)

# Train a support vector machine
SVM = LinearSVC()
SVM = SVM.fit(train_X, train_y)
preS = SVM.predict(test_X)
metrics.accuracy_score(test_y, preS)
```
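To put the three scores side by side, a short loop over the predictions computed above works (just a convenience sketch):

```python
# Report each classifier's test-set accuracy next to its name
for name, pred in [('logistic regression', preL), ('decision tree', preT), ('linear SVM', preS)]:
    print(name, metrics.accuracy_score(test_y, pred))
```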

## Multi-Class Classification

* One-vs-one and one-vs-rest multi-class classification with SVMs
* Multi-class classification with a decision tree
* Multi-class classification with a random forest

```python
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn import tree
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import metrics
```

### Preparing the data

```python
iris = datasets.load_iris()
feature_columns = iris.feature_names
target_column = ['res']
features = pd.DataFrame(iris.data, columns=feature_columns)
labels = pd.DataFrame(iris.target, columns=target_column)
train_test_data = pd.concat([features, labels], axis=1)

train, test = model_selection.train_test_split(train_test_data, test_size=0.1)
train_X = train[feature_columns].values
train_y = train[target_column].values
train_y = train_y.reshape(train_y.size)
test_X = test[feature_columns].values
test_y = test[target_column].values
test_y = test_y.reshape(test_y.size)
```

### Training the models

```python
# LinearSVC uses a one-vs-the-rest multi-class scheme
Linear_SVM = LinearSVC()
Linear_SVM = Linear_SVM.fit(train_X, train_y)
preLS = Linear_SVM.predict(test_X)
metrics.accuracy_score(test_y, preLS)

# SVC uses a one-vs-one multi-class scheme (C-Support Vector Classification)
C_SVM = SVC()
C_SVM = C_SVM.fit(train_X, train_y)
preCS = C_SVM.predict(test_X)
metrics.accuracy_score(test_y, preCS)

# Multi-class classification with a decision tree
DT = tree.DecisionTreeClassifier(max_depth=3)
DT = DT.fit(train_X, train_y)
preT = DT.predict(test_X)
metrics.accuracy_score(test_y, preT)

# Multi-class classification with a random forest
RF = RandomForestClassifier()
RF = RF.fit(train_X, train_y)
preRF = RF.predict(test_X)
metrics.accuracy_score(test_y, preRF)
```

## Regression

* Use the boston housing dataset
* Linear regression
* Decision tree regression
* Support vector regression

```python
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import tree
from sklearn import linear_model
from sklearn import svm
from sklearn import model_selection
from sklearn import metrics

boston = datasets.load_boston()
feature_columns = boston.feature_names
target_column = ['target']
features = pd.DataFrame(boston.data, columns=feature_columns)
labels = pd.DataFrame(boston.target, columns=target_column)
train_test_data = pd.concat([features, labels], axis=1)

train, test = model_selection.train_test_split(train_test_data, test_size=0.1)
train_X = train[feature_columns].values
train_y = train[target_column].values
train_y = train_y.reshape(train_y.size)
test_X = test[feature_columns].values
test_y = test[target_column].values
test_y = test_y.reshape(test_y.size)
```

### Training the models

```python
# Train a linear regression model
linear = linear_model.LinearRegression()
linear.fit(train_X, train_y)
preL = linear.predict(test_X)
metrics.mean_squared_error(test_y, preL) ** 0.5  # RMSE

# Train a decision tree regressor
DT = tree.DecisionTreeRegressor()
DT = DT.fit(train_X, train_y)
preT = DT.predict(test_X)
metrics.mean_squared_error(test_y, preT) ** 0.5

# Train a support vector regressor
SVM = svm.LinearSVR()
SVM = SVM.fit(train_X, train_y)
preS = SVM.predict(test_X)
metrics.mean_squared_error(test_y, preS) ** 0.5
```
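LinearSVR is sensitive to feature scale, so on the raw boston features its RMSE can look much worse than the other two models. A minimal sketch of the usual remedy, standardizing inside a Pipeline (not part of the original code):

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features before fitting the linear SVR
scaled_svr = make_pipeline(StandardScaler(), svm.LinearSVR())
scaled_svr.fit(train_X, train_y)
preS_scaled = scaled_svr.predict(test_X)
metrics.mean_squared_error(test_y, preS_scaled) ** 0.5
```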

## Feature Cleaning

* Get to know the data with pandas
* Fill in missing values with pandas

```python
data = pd.read_csv(path)  # read the csv file at path; data is a pd.DataFrame

data.head()      # first 5 rows
data.shape       # dataset dimensions
data.info()      # per-feature info, including the number of missing values
data.describe()  # summary statistics per feature: mean, standard deviation, etc.

# Fill the missing values in feature1 with the column mean
# (fillna returns a new Series, so assign the result back)
data['feature1'] = data['feature1'].fillna(value=data['feature1'].mean())
```
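Mean imputation is only one strategy; a sketch of other common pandas options on the same hypothetical feature1 column:

```python
# Median: more robust to outliers than the mean
data['feature1'] = data['feature1'].fillna(data['feature1'].median())
# Mode: the usual choice for categorical columns
data['feature1'] = data['feature1'].fillna(data['feature1'].mode()[0])
# Constant sentinel: keeps "was missing" recoverable as its own signal
data['feature1'] = data['feature1'].fillna(-999)
```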

## Feature Engineering

### Numerical features

* Magnitude transformations
* Summary statistics
* Arithmetic and logical operations between features to create new features
* Polynomial and interaction (cross) features
* Discretization (binning)
* One-hot encoding

```python
# 1. Log transform: compresses the scale and can make data look more normally distributed
import numpy as np

# note: assumes feature1 > 0; np.log1p is safer for values near zero
log_feature1 = data['feature1'].apply(lambda x: np.log(x))
data.loc[:, 'log_feature1'] = log_feature1  # add a log-transformed feature column

# sklearn's built-in transformers can also be used for magnitude scaling

# Magnitude scaling: min-max scaling
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler()
feature1_mms = mm_scaler.fit_transform(data[['feature1']])

# Magnitude scaling: standardization
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
feature1_ss = std_scaler.fit_transform(data[['feature1']])
# ... and so on

# 2. Summary statistics
data['feature1'].max()
data['feature1'].min()
data['feature1'].quantile(0.25)  # quartiles and other quantiles

# 3. Combine features arithmetically or logically to create new features
data.loc[:, 'new_feature1'] = data['feature1'] + 4 * data['feature2'] + 1
data.loc[:, 'new_feature2'] = (data['feature1'] == 0) & (data['feature2'] == 0)

# 4. Polynomial and interaction features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
poly_fea = poly.fit_transform(data[['feature1', 'feature2']])

# 5. Discretization with pandas cut (equal-width bins) and qcut (equal-frequency bins)
data.loc[:, 'feature1_cut'] = pd.cut(data['feature1'], 5)
data.loc[:, 'feature1_qcut'] = pd.qcut(data['feature1'], 5)

# 6. One-hot encoding
feature1_oht = pd.get_dummies(data[['feature1']])
```
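One caveat: pd.get_dummies only expands string or categorical dtype columns, and it cannot replay the same encoding on new data at prediction time. sklearn's OneHotEncoder covers both cases; a sketch with the same hypothetical column:

```python
from sklearn.preprocessing import OneHotEncoder

# Fit once on training data; handle_unknown='ignore' tolerates unseen categories later
oht = OneHotEncoder(handle_unknown='ignore')
feature1_oht_sk = oht.fit_transform(data[['feature1']])  # sparse one-hot matrix
```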

### Date handling

```python
# Convert the date column to pandas datetime type
data.loc[:, 'date'] = pd.to_datetime(data['date_t'])  # pass format='...' if the string layout is known

# Extract the month
data.loc[:, 'month'] = data['date'].dt.month
# Extract the day of the month
data.loc[:, 'dom'] = data['date'].dt.day
# Extract the day of the year
data.loc[:, 'doy'] = data['date'].dt.dayofyear
# Extract the day of the week
data.loc[:, 'dow'] = data['date'].dt.dayofweek
```
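The same .dt accessor exposes other components; two more that are often useful as features (a sketch on the same frame):

```python
# Hour of day (meaningful only if date_t carries a time component)
data.loc[:, 'hour'] = data['date'].dt.hour
# Weekend flag: dayofweek is 5 for Saturday, 6 for Sunday
data.loc[:, 'is_weekend'] = (data['date'].dt.dayofweek >= 5).astype(int)
```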

### Text processing

* Bag-of-words model
* TF-IDF

```python
# 1. Bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?'
]
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names()  # the learned vocabulary (feature names)
X.toarray()                     # convert the sparse matrix X to a NumPy array

# 2. TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_X = tfidf_vectorizer.fit_transform(corpus)
tfidf_vectorizer.get_feature_names()
tfidf_X.toarray()
```
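On this corpus both vectorizers learn the same 9-term vocabulary, so each produces a 4×9 matrix: raw counts for the bag-of-words model, L2-normalized weights for TF-IDF. A quick way to inspect that:

```python
print(X.shape)                 # (4, 9): 4 documents, 9 vocabulary terms
print(vectorizer.vocabulary_)  # term -> column index mapping
print(tfidf_X.toarray()[1])    # TF-IDF weights of the second document
```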

## Feature Selection

* Filter
* Wrapper
* Embedded

```python
# Filter
from sklearn.feature_selection import SelectKBest

# X is a feature matrix and y the labels, as in the earlier sections;
# the default score function is f_classif (ANOVA F-test)
X_new = SelectKBest(k=2).fit_transform(X, y)
```
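To score features with the chi-square test instead, pass chi2 explicitly (it requires non-negative feature values):

```python
from sklearn.feature_selection import chi2

# Chi-square scoring between each non-negative feature and the class labels
X_chi2 = SelectKBest(chi2, k=2).fit_transform(X, y)
```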

```python
# Wrapper
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination: fit the estimator, drop the weakest features, repeat
rf = RandomForestClassifier()
rfe = RFE(estimator=rf, n_features_to_select=2)
X_rfe = rfe.fit_transform(X, y)
```

```python
# Embedded
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

# The L1 penalty drives uninformative coefficients to exactly zero;
# SelectFromModel keeps only the features with non-zero weights
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_embed = model.transform(X)
```

## Model Ensembling

* Voting classifier
* Bagging
* AdaBoost

```python
# Voting classifier
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# array is assumed to be a NumPy array loaded earlier:
# columns 0-7 are features, column 8 is the label
X = array[:, 0:8]
Y = array[:, 8]
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)

# Create the voting ensemble's base models
estimators = []
model_1 = LogisticRegression()
estimators.append(('logistic', model_1))
model_2 = DecisionTreeClassifier()
estimators.append(('dt', model_2))
model_3 = SVC()
estimators.append(('svm', model_3))

# Build the voting ensemble (hard majority voting by default)
ensemble = VotingClassifier(estimators)
result = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(result.mean())
```
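VotingClassifier defaults to hard voting (majority class). A soft-voting variant averages predicted class probabilities instead, which often helps when the base models are reasonably calibrated; note that SVC then needs probability=True. A sketch with the same base estimators:

```python
# Soft voting averages class probabilities, so every base model must expose predict_proba
soft_estimators = [('logistic', LogisticRegression()),
                   ('dt', DecisionTreeClassifier()),
                   ('svm', SVC(probability=True))]
soft_ensemble = VotingClassifier(soft_estimators, voting='soft')
print(model_selection.cross_val_score(soft_ensemble, X, Y, cv=kfold).mean())
```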

```python
# Bagging
from sklearn.ensemble import BaggingClassifier

dt = DecisionTreeClassifier()
num = 100
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)
model = BaggingClassifier(base_estimator=dt, n_estimators=num, random_state=2018)
result = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
```

```python
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

num_trees = 25
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=2018)
result = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
```

## xgboost

```python
import pickle
import xgboost as xgb
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston

# Regression example
# Grid search with cross-validation. Illustrative only: there is no held-out test set
# (the model is evaluated on the training data), which of course breaks the usual protocol.
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(n_splits=5, shuffle=True)

xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2, 4, 6],
                    'n_estimators': [50, 100, 200]}, verbose=0, cv=kf)
clf.fit(X, y)  # the search must be fit before best_score_ / best_params_ exist
print(clf.best_score_)  # uses the estimator's default scorer (R^2 for regressors) unless scoring= is set
print(clf.best_params_)

xgb1_model = xgb.XGBRegressor(max_depth=4, n_estimators=200).fit(X, y)
predictions = xgb1_model.predict(X)
actuals = y
print("MSE:", mean_squared_error(actuals, predictions))
```

## lightGBM

```python
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
gbm = GridSearchCV(estimator, param_grid, cv=kf)
gbm.fit(X, y)  # fit the search before reading best_params_
print('Best hyperparameters found by grid search:')
print(gbm.best_params_)

gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=31,
                        learning_rate=0.1,
                        n_estimators=40)
# Fit with an evaluation set and early stopping
gbm.fit(X, y,
        eval_set=[(X, y)],
        eval_metric='l1',
        early_stopping_rounds=5)

# Predict
print('Starting prediction...')
y_pred = gbm.predict(X, num_iteration=gbm.best_iteration_)

# Evaluate the predictions
print('RMSE of the predictions:')
print(mean_squared_error(y, y_pred) ** 0.5)
```

III. See also: Model Hyperparameter Tuning

