Implementing Machine Learning with Python

I. Verify that executing external scripts (Python) is enabled in SQL Server, check the Python version, and list the installed Python packages
EXECUTE sp_configure  'external scripts enabled'
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import sys
print(sys.version)
'
GO
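The heading also calls for listing the installed Python packages, which the batch above does not do. A minimal sketch of one way to get that list (using pkg_resources from the server's Python distribution; the column names are illustrative):

EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import pkg_resources
import pandas as pd
pkgs = [(d.project_name, d.version) for d in pkg_resources.working_set]
OutputDataSet = pd.DataFrame(sorted(pkgs), columns=["Package", "Version"])
'
WITH RESULT SETS(([Package] NVARCHAR(128) NOT NULL, [Version] NVARCHAR(50) NOT NULL));
GO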

II. If Launchpad is running, run simple Python scripts to verify that the external script runtime can communicate with SQL Server
EXEC sp_execute_external_script @language = N'Python', 
@script = N'
OutputDataSet=InputDataSet;
',
@input_data_1=N'SELECT 1 AS Col1';
EXEC sp_execute_external_script  @language =N'Python',
@script=N'
OutputDataSet = InputDataSet;
',
@input_data_1 =N'SELECT 1 AS hello'
WITH RESULT SETS (([hello] int not null));
GO
EXEC sp_execute_external_script @language = N'Python', 
@script = N'
MyOutput=MyInput;
',
@input_data_1_name=N'MyInput',
@input_data_1=N'SELECT 1 AS foo',
@output_data_1_name=N'MyOutput';
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import pandas as pd
c=1/2
d=1*2
s= pd.Series([c,d])
df=pd.DataFrame(s)
OutputDataSet=df
'

EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import pandas as pd
s={"col1":[1,2], "col2":[3,4]}
df=pd.DataFrame(s)
OutputDataSet=df
'

III. Add simple Python scripts and execute them
EXEC sp_execute_external_script @language = N'Python', 
@script = N'print(3+4)'
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
a = 1
b = 2
c = a/b
d = a*b
print(c, d)
'
EXECUTE sp_execute_external_script 
@language = N'Python', 
@script = N'
import sys
import os
print("*********************************")
print(sys.version)
print("!!Hello World!!")
print(os.getcwd())
print("********************************")
'
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'OutputDataSet = InputDataSet'
    , @input_data_1 = N'SELECT 1 AS hello'
WITH RESULT SETS(([Hello World] INT));
GO

IV. Implementing Machine Learning Services
1. Run a simple Python script through SQL Server Machine Learning Services
CREATE TABLE PythonTestData (col1 INT NOT NULL)

INSERT INTO PythonTestData
VALUES (1);

INSERT INTO PythonTestData
VALUES (10);

INSERT INTO PythonTestData
VALUES (100);
GO
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'OutputDataSet = InputDataSet;'
    , @input_data_1 = N'SELECT * FROM PythonTestData;'
WITH RESULT SETS(([NewColName] INT NOT NULL));
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'SQL_out = SQL_in;'
    , @input_data_1 = N'SELECT 12 as Col;'
    , @input_data_1_name  = N'SQL_in'
    , @output_data_1_name = N'SQL_out'
WITH RESULT SETS(([NewColName] INT NOT NULL));
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import pandas as pd
mytextvariable = pd.Series(["hello", " ", "world"])
OutputDataSet = pd.DataFrame(mytextvariable);
'
    , @input_data_1 = N''
WITH RESULT SETS(([Col1] CHAR(20) NOT NULL));

2. Handle data structures and objects using Python in SQL Server Machine Learning Services

EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import pandas
a = 1
b = 2
c = a/b
print(c)
s = pandas.Series(c, index =["simple math example 1"])
print(s)
'
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import pandas
a = 1
b = 2
c = a/b
d = a*b
s = pandas.Series([c,d])
print(s)
'
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import pandas
a = 1
b = 2
c = a/b
s = pandas.Series(c, index =["simple math example 1", "simple math example 2"])
print(s)
'
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import pandas as pd
a = 1
b = 2
c = a/b
d = a*b
s = pd.Series([c,d])
print(s)
df = pd.DataFrame(s)
OutputDataSet = df
'
WITH RESULT SETS((ResultValue FLOAT))
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import pandas as pd
a = 1
b = 2
c = a/b
d = a*b
s = pd.Series([c,d])
print(s)
df = pd.DataFrame(s, index=[1])
OutputDataSet = df
'
WITH RESULT SETS((ResultValue FLOAT))
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import pandas as pd
a = 1
b = 2
c = a/b
s = pd.Series(c, index =["simple math example 1", "simple math example 2"])
print(s)
df = pd.DataFrame(s, index=["simple math example 1"])
OutputDataSet = df
'
WITH RESULT SETS((ResultValue FLOAT))

3. Use Python functions with SQL Server Machine Learning Services

CREATE PROCEDURE MyPyNorm (
      @param1 INT
    , @param2 INT
    , @param3 INT
    )
AS
EXECUTE sp_execute_external_script @language = N'Python'
    , @script = N'
import numpy
import pandas
OutputDataSet = pandas.DataFrame(numpy.random.normal(size=mynumbers, loc=mymean, scale=mysd));
'
    , @input_data_1 = N'   ;'
    , @params = N' @mynumbers int, @mymean int, @mysd int'
    , @mynumbers = @param1
    , @mymean = @param2
    , @mysd = @param3
WITH RESULT SETS(([Density] FLOAT NOT NULL));
GO
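A usage sketch (the argument values are illustrative): draw 100 samples from a normal distribution with mean 50 and standard deviation 3:

EXECUTE MyPyNorm @param1 = 100, @param2 = 50, @param3 = 3;
GO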
EXECUTE sp_execute_external_script
      @language = N'Python'
    , @script = N'
import time
start_time = time.time()

# Run Python processes

elapsed_time = time.time() - start_time
'
    , @input_data_1 = N' ;';
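The timing scaffold above measures elapsed_time but never surfaces it. A hedged variant that returns the measurement as a one-row result set (the column name is illustrative):

EXECUTE sp_execute_external_script
      @language = N'Python'
    , @script = N'
import time
import pandas as pd
start_time = time.time()

# Run Python processes

elapsed_time = time.time() - start_time
OutputDataSet = pd.DataFrame([elapsed_time], columns=["elapsed_seconds"])
'
    , @input_data_1 = N' ;'
WITH RESULT SETS(([elapsed_seconds] FLOAT NOT NULL));
GO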

4. Data analysis and regression prediction

execute sp_execute_external_script 
@language = N'Python', 
@script = N'
import math
a = 1
b = 2
c = a*b
print(a,b,c)
d = math.pi/6
print(math.sin(d))
'

execute sp_execute_external_script 
@language = N'Python', 
@script = N'
from sklearn import linear_model
import numpy as np
# X: pizza diameters (inches); Y: pizza prices
X = [[6], [8], [10], [14], [18]]
Y = [[7], [9], [13], [17.5], [18]]
print(X)
print(Y)
# Fit the linear regression
clf = linear_model.LinearRegression()
clf.fit(X, Y)
res = clf.predict(np.array([12]).reshape(-1, 1))[0][0]
print("Predicted price of a 12-inch pizza: $%.2f" % res)
# Predict prices for new diameters
X2 = [[0], [10], [14], [25]]
Y2 = clf.predict(X2)
print(Y2)
'
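The script above only prints Y2 to the messages stream. A hedged variant that returns the predicted prices to the caller as a result set (the column name is illustrative):

execute sp_execute_external_script 
@language = N'Python', 
@script = N'
from sklearn import linear_model
import pandas as pd
X = [[6], [8], [10], [14], [18]]
Y = [[7], [9], [13], [17.5], [18]]
clf = linear_model.LinearRegression()
clf.fit(X, Y)
X2 = [[0], [10], [14], [25]]
OutputDataSet = pd.DataFrame(clf.predict(X2), columns=["predicted_price"])
'
WITH RESULT SETS(([predicted_price] FLOAT NOT NULL));
GO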

CREATE PROCEDURE [dbo].[py_generate_customer_scores]
AS
BEGIN

-- Input query to generate the customer data
DECLARE @input_query NVARCHAR(MAX) = N'SELECT customer, orders, items, cost FROM dbo.Sales.Orders'

EXEC sp_execute_external_script @language = N'Python', @script = N'
import pandas as pd
from sklearn.cluster import KMeans

# Get data from input query
customer_data = my_input_data

# Define the model
n_clusters = 4
est = KMeans(n_clusters=n_clusters, random_state=111).fit(customer_data[["orders","items","cost"]])
clusters = est.labels_
customer_data["cluster"] = clusters

OutputDataSet = customer_data
'
, @input_data_1 = @input_query
, @input_data_1_name = N'my_input_data'
WITH RESULT SETS (("CustomerID" int, "Orders" float,"Items" float,"Cost" float,"ClusterResult" float));
END;
GO
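A hedged usage sketch: persist the procedure's cluster assignments by inserting its result set into a table (the table name py_customer_clusters is illustrative; the columns follow the RESULT SETS definition above):

DROP TABLE IF EXISTS py_customer_clusters;
CREATE TABLE py_customer_clusters (
  CustomerID INT, Orders FLOAT, Items FLOAT, Cost FLOAT, ClusterResult FLOAT
);
INSERT INTO py_customer_clusters
EXEC [dbo].[py_generate_customer_scores];

SELECT * FROM py_customer_clusters;
GO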

5. Train machine learning models with the Python packages scikit-learn and revoscalepy

DROP PROCEDURE IF EXISTS PyTrainTestSplit;
GO

CREATE PROCEDURE [dbo].[PyTrainTestSplit] (@pct int)
AS

DROP TABLE IF EXISTS dbo.nyctaxi_sample_training
SELECT * into nyctaxi_sample_training FROM nyctaxi_sample WHERE (ABS(CAST(BINARY_CHECKSUM(medallion,hack_license)  as int)) % 100) < @pct

DROP TABLE IF EXISTS dbo.nyctaxi_sample_testing
SELECT * into nyctaxi_sample_testing FROM nyctaxi_sample
WHERE (ABS(CAST(BINARY_CHECKSUM(medallion,hack_license)  as int)) % 100) > @pct
GO
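A usage sketch, assuming a 60/40 train/test split is wanted:

EXEC [dbo].[PyTrainTestSplit] 60;
GO
SELECT COUNT(*) AS training_rows FROM nyctaxi_sample_training;
SELECT COUNT(*) AS testing_rows FROM nyctaxi_sample_testing;
GO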
DROP PROCEDURE IF EXISTS PyTrainScikit;
GO

CREATE PROCEDURE [dbo].[PyTrainScikit] (@trained_model varbinary(max) OUTPUT)
AS
BEGIN
EXEC sp_execute_external_script
  @language = N'Python',
  @script = N'
import numpy
import pickle
from sklearn.linear_model import LogisticRegression

##Create SciKit-Learn logistic regression model
X = InputDataSet[["passenger_count", "trip_distance", "trip_time_in_secs", "direct_distance"]]
y = numpy.ravel(InputDataSet[["tipped"]])

SKLalgo = LogisticRegression()
logitObj = SKLalgo.fit(X, y)

##Serialize model
trained_model = pickle.dumps(logitObj)
',
@input_data_1 = N'
select tipped, fare_amount, passenger_count, trip_time_in_secs, trip_distance, 
dbo.fnCalculateDistance(pickup_latitude, pickup_longitude,  dropoff_latitude, dropoff_longitude) as direct_distance
from nyctaxi_sample_training
',
@input_data_1_name = N'InputDataSet',
@params = N'@trained_model varbinary(max) OUTPUT',
@trained_model = @trained_model OUTPUT;
;
END;
GO
DROP PROCEDURE IF EXISTS TrainTipPredictionModelRxPy;
GO

CREATE PROCEDURE [dbo].[TrainTipPredictionModelRxPy] (@trained_model varbinary(max) OUTPUT)
AS
BEGIN
EXEC sp_execute_external_script 
  @language = N'Python',
  @script = N'
import numpy
import pickle
from revoscalepy.functions.RxLogit import rx_logit

## Create a logistic regression model using rx_logit function from revoscalepy package
logitObj = rx_logit("tipped ~ passenger_count + trip_distance + trip_time_in_secs + direct_distance", data = InputDataSet);

## Serialize model
trained_model = pickle.dumps(logitObj)
',
@input_data_1 = N'
select tipped, fare_amount, passenger_count, trip_time_in_secs, trip_distance, 
dbo.fnCalculateDistance(pickup_latitude, pickup_longitude,  dropoff_latitude, dropoff_longitude) as direct_distance
from nyctaxi_sample_training
',
@input_data_1_name = N'InputDataSet',
@params = N'@trained_model varbinary(max) OUTPUT',
@trained_model = @trained_model OUTPUT;
;
END;
GO
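Both training procedures hand back the pickled model through an OUTPUT parameter. A hedged sketch of running them and saving the results, assuming the nyc_taxi_models(name, model) table queried by the scoring procedures below already exists (the model names are illustrative):

DECLARE @model VARBINARY(MAX);
EXEC PyTrainScikit @model OUTPUT;
INSERT INTO nyc_taxi_models (name, model) VALUES ('SciKit_model', @model);

DECLARE @model_rx VARBINARY(MAX);
EXEC TrainTipPredictionModelRxPy @model_rx OUTPUT;
INSERT INTO nyc_taxi_models (name, model) VALUES ('revoscalepy_model', @model_rx);
GO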
DROP PROCEDURE IF EXISTS PredictTipSciKitPy;
GO

CREATE PROCEDURE [dbo].[PredictTipSciKitPy] (@model varchar(50), @inquery nvarchar(max))
AS
BEGIN
DECLARE @lmodel2 varbinary(max) = (select model from nyc_taxi_models where name = @model);
EXEC sp_execute_external_script
  @language = N'Python',
  @script = N'
import pickle
import numpy
import pandas
from sklearn import metrics

mod = pickle.loads(lmodel2)
X = InputDataSet[["passenger_count", "trip_distance", "trip_time_in_secs", "direct_distance"]]
y = numpy.ravel(InputDataSet[["tipped"]])

probArray = mod.predict_proba(X)
probList = []
for i in range(len(probArray)):
  probList.append((probArray[i])[1])

probArray = numpy.asarray(probList)
fpr, tpr, thresholds = metrics.roc_curve(y, probArray)
aucResult = metrics.auc(fpr, tpr)
print ("AUC on testing data is: " + str(aucResult))

OutputDataSet = pandas.DataFrame(data = probList, columns = ["predictions"])
',  
  @input_data_1 = @inquery,
  @input_data_1_name = N'InputDataSet',
  @params = N'@lmodel2 varbinary(max)',
  @lmodel2 = @lmodel2
WITH RESULT SETS ((Score float));
END
GO
DROP PROCEDURE IF EXISTS PredictTipRxPy;
GO

CREATE PROCEDURE [dbo].[PredictTipRxPy] (@model varchar(50), @inquery nvarchar(max))
AS
BEGIN
DECLARE @lmodel2 varbinary(max) = (select model from nyc_taxi_models where name = @model);
EXEC sp_execute_external_script 
  @language = N'Python',
  @script = N'
import pickle
import numpy
import pandas
from sklearn import metrics
from revoscalepy.functions.RxPredict import rx_predict

mod = pickle.loads(lmodel2)
X = InputDataSet[["passenger_count", "trip_distance", "trip_time_in_secs", "direct_distance"]]
y = numpy.ravel(InputDataSet[["tipped"]])

probArray = rx_predict(mod, X)
probList = probArray["tipped_Pred"].values 

probArray = numpy.asarray(probList)
fpr, tpr, thresholds = metrics.roc_curve(y, probArray)
aucResult = metrics.auc(fpr, tpr)
print ("AUC on testing data is: " + str(aucResult))

OutputDataSet = pandas.DataFrame(data = probList, columns = ["predictions"])
',  
  @input_data_1 = @inquery,
  @input_data_1_name = N'InputDataSet',
  @params = N'@lmodel2 varbinary(max)',
  @lmodel2 = @lmodel2
WITH RESULT SETS ((Score float));
END
GO
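A hedged sketch of scoring both stored models against the held-out testing table, reusing the same feature query as training (the model names must match whatever was saved in nyc_taxi_models):

DECLARE @query_string NVARCHAR(MAX) = N'
select tipped, fare_amount, passenger_count, trip_time_in_secs, trip_distance, 
dbo.fnCalculateDistance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude) as direct_distance
from nyctaxi_sample_testing';

EXEC PredictTipSciKitPy 'SciKit_model', @query_string;
EXEC PredictTipRxPy 'revoscalepy_model', @query_string;
GO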
DROP PROCEDURE IF EXISTS PredictTipSingleModeSciKitPy;
GO

CREATE PROCEDURE [dbo].[PredictTipSingleModeSciKitPy] (@model varchar(50), @passenger_count int = 0,
  @trip_distance float = 0,
  @trip_time_in_secs int = 0,
  @pickup_latitude float = 0,
  @pickup_longitude float = 0,
  @dropoff_latitude float = 0,
  @dropoff_longitude float = 0)
AS
BEGIN
  DECLARE @inquery nvarchar(max) = N'
  SELECT * FROM [dbo].[fnEngineerFeatures]( 
    @passenger_count,
    @trip_distance,
    @trip_time_in_secs,
    @pickup_latitude,
    @pickup_longitude,
    @dropoff_latitude,
    @dropoff_longitude)
    '
DECLARE @lmodel2 varbinary(max) = (select model from nyc_taxi_models where name = @model);
EXEC sp_execute_external_script 
  @language = N'Python',
  @script = N'
import pickle
import numpy
import pandas

# Load model and unserialize
mod = pickle.loads(model)

# Get features for scoring from input data
X = InputDataSet[["passenger_count", "trip_distance", "trip_time_in_secs", "direct_distance"]]

# Score data to get tip prediction probability as a list (of float)
probList = []
probList.append((mod.predict_proba(X)[0])[1])

# Create output data frame
OutputDataSet = pandas.DataFrame(data = probList, columns = ["predictions"])
',
  @input_data_1 = @inquery,
  @params = N'@model varbinary(max),@passenger_count int,@trip_distance float,
    @trip_time_in_secs int ,
    @pickup_latitude float ,
    @pickup_longitude float ,
    @dropoff_latitude float ,
    @dropoff_longitude float',
    @model = @lmodel2,
    @passenger_count =@passenger_count ,
    @trip_distance=@trip_distance,
    @trip_time_in_secs=@trip_time_in_secs,
    @pickup_latitude=@pickup_latitude,
    @pickup_longitude=@pickup_longitude,
    @dropoff_latitude=@dropoff_latitude,
    @dropoff_longitude=@dropoff_longitude
WITH RESULT SETS ((Score float));
END
GO
DROP PROCEDURE IF EXISTS PredictTipSingleModeRxPy;
GO

CREATE PROCEDURE [dbo].[PredictTipSingleModeRxPy] (@model varchar(50), @passenger_count int = 0,
  @trip_distance float = 0,
  @trip_time_in_secs int = 0,
  @pickup_latitude float = 0,
  @pickup_longitude float = 0,
  @dropoff_latitude float = 0,
  @dropoff_longitude float = 0)
AS
BEGIN
DECLARE @inquery nvarchar(max) = N'
  SELECT * FROM [dbo].[fnEngineerFeatures]( 
    @passenger_count,
    @trip_distance,
    @trip_time_in_secs,
    @pickup_latitude,
    @pickup_longitude,
    @dropoff_latitude,
    @dropoff_longitude)
  '
DECLARE @lmodel2 varbinary(max) = (select model from nyc_taxi_models where name = @model);
EXEC sp_execute_external_script 
  @language = N'Python',
  @script = N'
import pickle
import numpy
import pandas
from revoscalepy.functions.RxPredict import rx_predict

# Load model and unserialize
mod = pickle.loads(model)

# Get features for scoring from input data
X = InputDataSet[["passenger_count", "trip_distance", "trip_time_in_secs", "direct_distance"]]

# Score data to get tip prediction probability as a list (of float)

probArray = rx_predict(mod, X)

probList = []
probList = probArray["tipped_Pred"].values

# Create output data frame
OutputDataSet = pandas.DataFrame(data = probList, columns = ["predictions"])
',
  @input_data_1 = @inquery,
  @params = N'@model varbinary(max),@passenger_count int,@trip_distance float,
    @trip_time_in_secs int ,
    @pickup_latitude float ,
    @pickup_longitude float ,
    @dropoff_latitude float ,
    @dropoff_longitude float',
    @model = @lmodel2,
    @passenger_count =@passenger_count ,
    @trip_distance=@trip_distance,
    @trip_time_in_secs=@trip_time_in_secs,
    @pickup_latitude=@pickup_latitude,
    @pickup_longitude=@pickup_longitude,
    @dropoff_latitude=@dropoff_latitude,
    @dropoff_longitude=@dropoff_longitude
WITH RESULT SETS ((Score float));
END
GO
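A usage sketch for single-row (on-demand) scoring; the trip values and model names are illustrative:

EXEC PredictTipSingleModeSciKitPy 'SciKit_model', 1, 2.5, 631, 40.763958, -73.973373, 40.782139, -73.977303;
EXEC PredictTipSingleModeRxPy 'revoscalepy_model', 1, 2.5, 631, 40.763958, -73.973373, 40.782139, -73.977303;
GO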

6. Train a predictive model with Machine Learning Services, call the model to generate predictions (regression or classification) for new data, and save the results to a new table

DROP PROCEDURE IF EXISTS generate_model_rfc;
go
CREATE PROCEDURE generate_model_rfc (@trained_model varbinary(max) OUTPUT)
AS
BEGIN
    EXECUTE sp_execute_external_script
      @language = N'Python'
    , @script = N'
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

dt = train_data
dt["Level"] = dt["Level"].astype(''category'')

X = dt[["Fixed Acidity","Volatile Acidity","Citric acid","Residual Sugar","Chlorides","Free Sulfur Dioxide","Total Sulfur Dioxide","Density","PH","Sulphates","Alcohol"]]
Y = dt[["Level"]]

X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.3, random_state=42)
RFC=RandomForestClassifier(n_estimators=100,criterion=''gini'',min_samples_split=2,max_depth=2)
RFC.fit(X_Train,Y_Train.values.ravel())
trained_model = pickle.dumps(RFC)
'
, @input_data_1 = N'select "Fixed Acidity", "Volatile Acidity", "Citric acid", "Residual Sugar", "Chlorides", "Free Sulfur Dioxide", "Total Sulfur Dioxide", "Density", "PH", "Sulphates", "Alcohol", "Level" from dbo.RedWineQuality'
, @input_data_1_name = N'train_data'
, @params = N'@trained_model varbinary(max) OUTPUT'
, @trained_model = @trained_model OUTPUT;
END;
GO
CREATE TABLE dbo.my_py_models (
    model_name VARCHAR(50) NOT NULL DEFAULT('default model') PRIMARY KEY,
    model_object VARBINARY(MAX) NOT NULL
);
GO
DECLARE @model_object VARBINARY(MAX);
EXEC generate_model_rfc @model_object OUTPUT;

INSERT INTO my_py_models (model_name, model_object) VALUES('RandomForestClassification(RFC)', @model_object);
DROP PROCEDURE IF EXISTS py_predict_rfc;
GO
CREATE PROCEDURE py_predict_rfc (@model varchar(100))
AS
BEGIN
	DECLARE @py_model varbinary(max) = (select model_object from dbo.my_py_models where model_name = @model);

	EXEC sp_execute_external_script
				@language = N'Python',
				@script = N'

import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

py_model_rfc = pickle.loads(py_model)

dt = data
dt["Level"] = dt["Level"].astype(''category'')

X = dt[["Fixed Acidity","Volatile Acidity","Citric acid","Residual Sugar","Chlorides","Free Sulfur Dioxide","Total Sulfur Dioxide","Density","PH","Sulphates","Alcohol"]]
Y = dt[["Level"]]
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.3, random_state=42)

Predictions = py_model_rfc.predict(X_Test)
# print("Accuracy:",metrics.accuracy_score(Y_Test, Predictions))

OutputDataSet=pd.DataFrame(data=Predictions,columns=[''Prediction''])
OutputDataSet[''Actual''] = pd.Series(Y_Test[''Level''].values,index=np.arange(0,len(Y_Test)))
'
, @input_data_1 = N'select "Fixed Acidity", "Volatile Acidity", "Citric acid", "Residual Sugar", "Chlorides", "Free Sulfur Dioxide", "Total Sulfur Dioxide", "Density", "PH", "Sulphates", "Alcohol", "Level" from dbo.RedWineQuality'
, @input_data_1_name = N'data'
, @params = N'@py_model varbinary(max)'
, @py_model = @py_model
with result sets (("Prediction" int, "Actual" int));

END;
GO
CREATE TABLE [dbo].[py_rfc_predictions](
 [Prediction] int,
 [Actual] int
)
INSERT INTO py_rfc_predictions
EXEC py_predict_rfc 'RandomForestClassification(RFC)';

-- Select contents of the table
SELECT * FROM py_rfc_predictions;
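With predictions and actuals stored side by side, a quick accuracy check can be run directly in T-SQL (a minimal sketch over the table defined above):

SELECT AVG(CASE WHEN Prediction = Actual THEN 1.0 ELSE 0.0 END) AS accuracy
FROM py_rfc_predictions;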

7. Create and score a predictive model in Python with SQL Server Machine Learning Services

CREATE DATABASE irissql
GO
USE irissql
GO
DROP TABLE IF EXISTS iris_data;
GO
CREATE TABLE iris_data (
  id INT NOT NULL IDENTITY PRIMARY KEY
  , "Sepal.Length" FLOAT NOT NULL, "Sepal.Width" FLOAT NOT NULL
  , "Petal.Length" FLOAT NOT NULL, "Petal.Width" FLOAT NOT NULL
  , "Species" VARCHAR(100) NOT NULL, "SpeciesId" INT NOT NULL
);
DROP TABLE IF EXISTS iris_models;
GO

CREATE TABLE iris_models (
  model_name VARCHAR(50) NOT NULL DEFAULT('default model') PRIMARY KEY,
  model VARBINARY(MAX) NOT NULL
);
GO
CREATE PROCEDURE get_iris_dataset
AS
BEGIN
EXEC sp_execute_external_script @language = N'Python', 
@script = N'
import pandas
from sklearn import datasets
iris = datasets.load_iris()
iris_data = pandas.DataFrame(iris.data)
iris_data["Species"] = pandas.Categorical.from_codes(iris.target, iris.target_names)
iris_data["SpeciesId"] = iris.target
', 
@input_data_1 = N'', 
@output_data_1_name = N'iris_data'
WITH RESULT SETS (("Sepal.Length" float not null, "Sepal.Width" float not null, "Petal.Length" float not null, "Petal.Width" float not null, "Species" varchar(100) not null, "SpeciesId" int not null));
END;
GO
INSERT INTO iris_data ("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species", "SpeciesId")
EXEC dbo.get_iris_dataset;
SELECT TOP(10) * FROM iris_data;
SELECT COUNT(*) FROM iris_data;
USE irissql
GO
EXEC sp_configure 'external scripts enabled', 1;
RECONFIGURE WITH OVERRIDE;
GO
CREATE PROCEDURE generate_iris_model (@trained_model VARBINARY(max) OUTPUT)
AS
BEGIN
    EXECUTE sp_execute_external_script @language = N'Python'
        , @script = N'
import pickle
from sklearn.naive_bayes import GaussianNB
GNB = GaussianNB()
trained_model = pickle.dumps(GNB.fit(iris_data[["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]], iris_data[["SpeciesId"]].values.ravel()))
'
        , @input_data_1 = N'select "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "SpeciesId" from iris_data'
        , @input_data_1_name = N'iris_data'
        , @params = N'@trained_model varbinary(max) OUTPUT'
        , @trained_model = @trained_model OUTPUT;
END;
GO
DECLARE @model varbinary(max);
DECLARE @new_model_name varchar(50)
SET @new_model_name = 'Naive Bayes'
EXECUTE generate_iris_model @model OUTPUT;
DELETE iris_models WHERE model_name = @new_model_name;
INSERT INTO iris_models (model_name, model) values(@new_model_name, @model);
GO
SELECT * FROM dbo.iris_models;
GO
CREATE PROCEDURE predict_species (@model VARCHAR(100))
AS
BEGIN
    DECLARE @nb_model VARBINARY(max) = (
            SELECT model
            FROM iris_models
            WHERE model_name = @model
            );

    EXECUTE sp_execute_external_script @language = N'Python'
        , @script = N'
import pickle
irismodel = pickle.loads(nb_model)
species_pred = irismodel.predict(iris_data[["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]])
iris_data["PredictedSpecies"] = species_pred
OutputDataSet = iris_data[["id","SpeciesId","PredictedSpecies"]] 
print(OutputDataSet)
'
        , @input_data_1 = N'select id, "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "SpeciesId" from iris_data'
        , @input_data_1_name = N'iris_data'
        , @params = N'@nb_model varbinary(max)'
        , @nb_model = @nb_model
    WITH RESULT SETS((
                "id" INT
              , "SpeciesId" INT
              , "SpeciesId.Predicted" INT
                ));
END;
GO
EXECUTE predict_species 'Naive Bayes';
GO

8. Modeling limitations for large feature sets with SQL machine learning

from numpy.random import randn
matrix = randn(2000, 2001)

import pandas
data = pandas.DataFrame(data=matrix, columns=["Label"] + ["f%s" % i for i in range(1, matrix.shape[1])])
data["Label"] = (data["Label"] > 0.5).apply(lambda x: 1.0 if x else 0.0)

print("problem dimension:", data.shape)
print(data[["Label", "f1", "f2", data.columns[-1]]].head())

###################################################
# Let's train a logistic regression.

formula = "Label ~ {0}".format(" + ".join(data.columns[1:]))
print(formula[:50] + " + ...")

from numpy.random import randn
matrix = randn(2000, 3)

import random
import pandas
data = pandas.DataFrame(data=matrix, columns=["Label"] + ["f%s" % i for i in range(1, matrix.shape[1])])
data["Label"] = (data["Label"] > 0.5).apply(lambda x: 1.0 if x else 0.0)
data["cat"] = [["0", "1"][random.randint(0,1)] for i in range(0, data.shape[0])]

################################
# We define this column as a category.
data["cat"] = data["cat"].astype("category")

print("problem dimension:", data.shape)
print(data.head())

###################################################
# Let's train a logistic regression.

formula = "Label ~ {0}".format(" + ".join(data.columns[1:]))
print(formula)
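The snippets above only build the wide design matrix and the formula string, whose length grows with the number of features. A hedged sketch of actually fitting a model on this data with scikit-learn, which takes arrays directly and so sidesteps formula-length limits (this is an illustration, not part of the original walkthrough):

from sklearn.linear_model import LogisticRegression

# Features are every column except Label; get_dummies encodes the
# categorical "cat" column (if present) and leaves numeric columns as-is.
features = pandas.get_dummies(data.drop(columns=["Label"]))
clf = LogisticRegression(max_iter=1000)
clf.fit(features, data["Label"])
print("training accuracy:", clf.score(features, data["Label"]))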