安全库存-机器学习

Rone-X
于 2022-09-22 09:36:51 发布
阅读量463
点赞数
分类专栏： Python 文章标签：机器学习 python sklearn
本文链接：https://blog.csdn.net/RONE321/article/details/126985547
版权
Python 专栏收录该内容
14 篇文章 0 订阅
订阅专栏
备份代码——仅供参考

在这里插入图片描述
import pymssql
import pandas as pd
import numpy as np
import seaborn as sb

from sklearn import metrics 
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

#测试环境的连接
conn_test = pymssql.connect(host='12.34.2.12',
                       user='sa',
                       ss='swd',
                       database='CS',
                       charset='utf8')


#查看连接是否成功
#cursor = conn.cursor()

engine = create_engine('mssql+pymssql://sd:xsw!2017@@12.13.14.15:1433/CA',pool_size=2, max_overflow=2)
#data.to_sql(name='ml_sku_month_total', con=engine, if_exists = 'append', index=False)

def getClassDF():
    sql_str = "select * from ml_score"
    df = pd.read_sql(sql_str,con = conn_test)
    
    return df

Final_DF = getClassDF()

copy_df = Final_DF
copy_df


#处理时间
#pd.get_dummies(all_data['range_time'])
copy_df['year'] = pd.to_datetime(copy_df['range_time'],infer_datetime_format=True)
copy_df['month'] = pd.to_datetime(copy_df['range_time'],infer_datetime_format=True)

copy_df['year'] = copy_df['year'].apply(lambda x:x.year)
copy_df['month'] = copy_df['month'].apply(lambda x:x.month)

copy_df = copy_df.drop( 'range_time',axis = 1)
copy_df

copy_df = copy_df.drop(['id','sku','store_id','create_time','count_sku_in_num','count_sku_out_num','in_num_count','out_num_count','current_month_inventory_num'],axis =1 )
copy_df

for column in copy_df.select_dtypes('object').columns:
    dummies = pd.get_dummies(copy_df[column],prefix = column)
    copy_df = pd.concat([copy_df, dummies], axis = 1)
    copy_df = copy_df.drop(column, axis = 1)
#ohe = pd.get_dummies(copy_df['classify'],prefix = column)



count_df = copy_df['label'].value_counts().to_frame()
count_df


count_df.plot(kind = 'bar')
copy_df.describe()

#给定Y列 ， X列
X = copy_df.drop(['label'],axis = 1)
y = copy_df['label']

#归一化 Scaler
scaler = StandardScaler()
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X),index = X.index,columns = X.columns)
#X_test = pd.DataFrame(scaler.transform(X_test),index = X_test.index,columns = X_test.columns)

##拆分训练数据 与 测试数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

X_train_res

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report

parameters = {
    'C': np.linspace(1, 10, 10)
             }
lr = LogisticRegression()
clf = GridSearchCV(lr, parameters, cv=5, verbose=1, n_jobs=3)
clf.fit(X_train, y_train.ravel())

clf.best_params_

lr1 = LogisticRegression(C=7.0,penalty='l2', verbose=1)
lr1.fit(X_train, y_train.ravel())

import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        #print("Normalized confusion matrix")
    else:
        1#print('Confusion matrix, without normalization')

    #print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

y_train_pre = lr1.predict(X_train)

cnf_matrix_tra = confusion_matrix(y_train, y_train_pre)

print("Recall metric in the train dataset: {}%".format(100*cnf_matrix_tra[1,1]/(cnf_matrix_tra[1,0]+cnf_matrix_tra[1,1])))


class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix_tra , classes=class_names, title='Confusion Matrix of Tranning')
plt.show()

y_pre = lr1.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_pre)

print("Recall metric in the testing dataset: {}%".format(100*cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1])))
#print("Precision metric in the testing dataset: {}%".format(100*cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[1,0])))
# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix , classes=class_names, title='Confusion Matrix of Test')
plt.show()

tmp = lr1.fit(X_train_res, y_train_res.ravel())

y_pred_sample_score = tmp.decision_function(X_test)


fpr, tpr, thresholds = roc_curve(y_test, y_pred_sample_score)

roc_auc = auc(fpr,tpr)

# Plot ROC
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()