(2)机器学习的实际应用

本节内容如有看不懂的地方,请参考《用pandas玩转数据(1)》、《用pandas玩转数据(2)》以及《(1-4)Matplotlib的实际应用》。
本节内容的部分思想已经分解为用pandas玩转数据(2)

1.企业欺诈识别

(本节内容的数据见电脑 F:/python数据/audit_risk,或腾讯微云文件“python数据\audit_risk”)
在这里插入图片描述
最后一列是预测列,预测是否存在风险;前面的列是特征列。
我们要把特征列和预测列单独分开。

第一步先阅读数据
在这里插入图片描述
第二步:数据预处理
我们要把非数值型数据处理为数值型
第三步:模型划分(把数据划分为特征列和预测列,最后一列是预测列,预测是否存在风险;前面的列是特征列。)

import  pandas as pd

# Load the audit data set; the first CSV row supplies the column names.
frame = pd.read_csv('F:/python数据/audit_risk.csv', header=0)

# The label is addressed by position (last column) instead of by name, so the
# same code keeps working for files that have no header row.
label_name = frame.columns[-1]
y = frame[label_name]

# Remove the label in place; what remains of `frame` is the feature matrix.
frame.drop(columns=label_name, inplace=True)
X = frame
print(X)

'''我们之所以没有用frame['Risk']直接定位最后一列,是因为我们有时候见到的数据是没有列名的'''

在这里插入图片描述
第四步:数据训练

import  pandas as pd
from sklearn.model_selection import train_test_split

frame = pd.read_csv('F:/python数据/audit_risk.csv', header=0)

# Pop the last column out of the frame: the popped column is the label and
# the frame that is left behind holds only the features.
y = frame.pop(frame.columns[-1])
X = frame

# Returned in order: training features, test features, training labels,
# test labels. test_size=0.1 keeps 10% of the rows for testing (9:1 split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

第五步:效果评估

import  pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier #使用最近邻模型
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv', header=0)

# Last column is the label, everything before it is a feature.
risk_column = frame.columns[-1]
y = frame[risk_column]
frame.drop(columns=risk_column, inplace=True)
X = frame

# NOTE(review): no cleaning is done before fitting; presumably this fails if
# the file contains non-numeric or missing cells — confirm against the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# Nearest-neighbour classifier with its default settings.
estimator = KNeighborsClassifier()
estimator.fit(X_train, y_train)
y_predicted = estimator.predict(X_test)

# Accuracy = share of test rows whose prediction equals the true label.
accuracy_percent = np.mean(y_test == y_predicted) * 100
print('准确度为:{:.1f}%'.format(accuracy_percent))

在这里插入图片描述

2.企业欺诈识别的完善

(本节内容的数据见电脑 F:/python数据/audit_risk,或腾讯微云文件“python数据\audit_risk”)
在这里插入图片描述
1.数据预处理
在这里插入图片描述

①非数值的处理

import  pandas as pd
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# applymap calls the given function on every single cell of the DataFrame;
# np.isreal is used here as the "is this a number?" test.
# NOTE(review): the author observed that a column holding any non-numeric
# entry shows False for all of its cells — presumably because the whole
# column is then read as strings; confirm.
is_number = np.isreal
results = frame.applymap(is_number)
print(results)

在这里插入图片描述

import  pandas as pd
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Cell-by-cell numeric check, then collapsed with .all() to one flag per
# column so only the per-column verdict is printed.
cell_is_number = frame.applymap(np.isreal)
results = cell_is_number.all()
print(results)

在这里插入图片描述

import  pandas as pd
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# One boolean per column: True when every cell passed the np.isreal check.
cell_is_number = frame.applymap(np.isreal)
results = cell_is_number.all()

# Show only the columns that failed the check.
problem_columns = results[results.eq(False)]
print(problem_columns)

在这里插入图片描述

import  pandas as pd
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Convert the offending column to numbers; errors='coerce' turns every value
# that cannot be parsed into a missing value (NaN) instead of raising.
location_numeric = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')
frame['LOCATION_ID'] = location_numeric

# Run the per-column numeric check again to see whether anything is left.
cell_is_number = frame.applymap(np.isreal)
results = cell_is_number.all()
print(results)

在这里插入图片描述
②空值的处理

import  pandas as pd
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Unparseable LOCATION_ID values become NaN (errors='coerce').
location_numeric = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')
frame['LOCATION_ID'] = location_numeric

# isnull() yields a frame of the same shape; True marks a missing cell.
results = frame.isnull()
print(results)

在这里插入图片描述

import  pandas as pd
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Unparseable LOCATION_ID values become NaN (errors='coerce').
frame['LOCATION_ID'] = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')

# axis=0 reduces down each column: True when the column holds at least one
# missing value. The axis is passed by keyword because recent pandas
# releases no longer accept it positionally.
results = frame.isnull().any(axis=0)
print(results)

在这里插入图片描述

import  pandas as pd
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Unparseable LOCATION_ID values become NaN (errors='coerce').
frame['LOCATION_ID'] = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')

# axis=1 reduces across each row: True when the row holds at least one
# missing value. The axis is passed by keyword because recent pandas
# releases no longer accept it positionally.
results = frame.isnull().any(axis=1)

# Print only the rows that contain a missing value.
print(results[results == True])

在这里插入图片描述

import  pandas as pd
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Unparseable LOCATION_ID values become NaN (errors='coerce').
frame['LOCATION_ID'] = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')

# Replace every missing value with 0.
frame = frame.fillna(0)

# Per-column check that no missing value is left; axis is passed by keyword
# because recent pandas releases no longer accept it positionally.
results = frame.isnull().any(axis=0)
print(results)

在这里插入图片描述

import  pandas as pd
from sklearn.impute import SimpleImputer

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Unparseable LOCATION_ID values become NaN (errors='coerce').
location_numeric = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')
frame['LOCATION_ID'] = location_numeric

# strategy='mean': each missing value is filled with the mean of its column.
imp = SimpleImputer(strategy='mean')
newframe = imp.fit_transform(frame)
print(newframe)

在这里插入图片描述

import  pandas as pd
from sklearn.impute import SimpleImputer

frame = pd.read_csv('F:/python数据/audit_risk.csv')
print(frame)  # still a DataFrame

# Unparseable LOCATION_ID values become NaN (errors='coerce').
location_numeric = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')
frame['LOCATION_ID'] = location_numeric
print(frame)  # still a DataFrame

# strategy='mean': each missing value is filled with the mean of its column.
imp = SimpleImputer(strategy='mean')
newframe = imp.fit_transform(frame)
print(newframe)  # the imputer returns a NumPy array, not a DataFrame

'''我们有一步是划分特征数据与预测数据
y=frame[frame.columns[len(frame.columns)-1]] 
frame.drop(frame.columns[len(frame.columns)-1],axis=1,inplace=True) 
这一步必须趁着还是DataFrame类型时处理'''

在这里插入图片描述

import  pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier #使用最近邻模型
from sklearn.impute import SimpleImputer
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Turn non-numeric LOCATION_ID entries into missing values.
frame['LOCATION_ID'] = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')

# Separate the label (last column) while the data is still a DataFrame;
# the imputer below hands back a plain array without column labels.
y = frame.pop(frame.columns[-1])

# Fill each missing value with the mean of its column.
imp = SimpleImputer(strategy='mean')
newframe = imp.fit_transform(frame)
X = newframe

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

estimator = KNeighborsClassifier()
estimator.fit(X_train, y_train)
y_predicted = estimator.predict(X_test)

# Accuracy = share of test rows whose prediction equals the true label.
accuracy_percent = np.mean(y_test == y_predicted) * 100
print('准确度为:{:.1f}%'.format(accuracy_percent))

在这里插入图片描述
③数据规范化处理

import  pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.preprocessing import StandardScaler
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Turn non-numeric LOCATION_ID entries into missing values, then fill every
# missing value with 0.
frame['LOCATION_ID'] = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')
frame = frame.fillna(0)

# Last column is the label, everything before it is a feature.
label_name = frame.columns[-1]
y = frame[label_name]
frame.drop(columns=label_name, inplace=True)
X = frame

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# Standardise the features (zero mean, unit variance) so that columns with
# large values do not dominate the distance computation. The scaler is
# fitted on the training rows only and then applied to the test rows, so no
# statistics of the test set leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

estimator = KNeighborsClassifier()
estimator.fit(X_train, y_train)
y_predicted = estimator.predict(X_test)

# Accuracy = share of test rows whose prediction equals the true label.
print('准确度为:{:.1f}%'.format(np.mean(y_test == y_predicted) * 100))

在这里插入图片描述
2.交叉验证
①将训练集和测试集交换一下

import  pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
import numpy as np

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Turn non-numeric LOCATION_ID entries into missing values, then fill every
# missing value with 0.
frame['LOCATION_ID'] = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')
frame = frame.fillna(0)

# Last column is the label, everything before it is a feature.
y = frame.pop(frame.columns[-1])
X = frame

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# The two partitions swap roles here: the model is fitted on the 10% "test"
# part and evaluated on the 90% "train" part.
estimator = KNeighborsClassifier()
estimator.fit(X_test, y_test)
y_predicted = estimator.predict(X_train)

accuracy_percent = np.mean(y_train == y_predicted) * 100
print('准确度为:{:.1f}%'.format(accuracy_percent))

在这里插入图片描述
②用cross_val_score

import  pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
import numpy as np
from sklearn.model_selection import cross_val_score

frame = pd.read_csv('F:/python数据/audit_risk.csv')

# Turn non-numeric LOCATION_ID entries into missing values, then fill every
# missing value with 0.
frame['LOCATION_ID'] = pd.to_numeric(frame['LOCATION_ID'], errors='coerce')
frame = frame.fillna(0)

# Last column is the label, everything before it is a feature.
label_name = frame.columns[-1]
y = frame[label_name]
frame.drop(columns=label_name, inplace=True)
X = frame

# No manual train/test split is needed: cross_val_score partitions the data
# itself. scoring='accuracy' selects accuracy as the metric and cv=10 asks
# for 10 folds.
estimator = KNeighborsClassifier()
scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=10)
print(np.mean(scores))

在这里插入图片描述

3.会员卡预测

(本节内容的数据见电脑 F:/python数据/customer,或腾讯微云文件“python数据\customer”)
在这里插入图片描述
包含27个相关的特征(姓名、地址、教育情况);还有一个会员卡的类型(金卡、银卡、铜卡、普通卡)
1.决策树
特征的选择:特征列太多,我们先选择三个数字型特征的列(年收入,小孩数,家庭汽车拥有量)。年收入是一个范围,我们要替换一下才能用;
在这里插入图片描述

import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')
print(frame['yearly_income'].head(2))

# Strip every character that is not a digit 0-9 from the income text.
# regex=True is stated explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the column unchanged.
frame['yearly_income'] = frame['yearly_income'].str.replace(
    '[^0-9]', '', regex=True
)
print(frame['yearly_income'].head(2))

'''用3050表示30-50'''

在这里插入图片描述
方法二:

import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')
print(frame['yearly_income'].head(2))

# Keep only the text before the first space (the lower bound of the income
# range) and strip everything that is not a digit.
# regex=True is stated explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the column unchanged.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = lower_bound.str.replace('[^0-9]', '', regex=True)
print(frame['yearly_income'].head(2))

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

frame = pd.read_csv('F:/python数据/customer.csv')

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives the
# model a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

y = frame['member_card']  # membership-card type is the prediction target
X = frame[['yearly_income', 'total_children', 'num_cars_owned']]

# Decision tree scored with cross-validated accuracy.
clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))

如果能够引入更多的分类特征,决策树的效果会更好一些,比如受教育程度和职业与会员等级也有很大的联系
在这里插入图片描述

import pandas as pd
from sklearn.preprocessing import LabelEncoder

frame = pd.read_csv('F:/python数据/customer.csv')

# LabelEncoder maps each distinct string to an integer code.
education = frame['education']
encoding = LabelEncoder()
education_new = encoding.fit_transform(education)

# Original strings first, then their integer codes.
print(education.values)
print(education_new)

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import numpy as np


frame = pd.read_csv('F:/python数据/customer.csv')

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives the
# model a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

# Map every education level string to an integer code.
encoding = LabelEncoder()
frame['education_new'] = encoding.fit_transform(frame['education'])

y = frame['member_card']  # membership-card type is the prediction target
# The encoded education column is part of the feature set; without it the
# encoding step above would have no effect on the model.
X = frame[['yearly_income', 'total_children', 'num_cars_owned',
           'education_new']]

# Decision tree scored with cross-validated accuracy.
clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))

在这里插入图片描述
在这里插入图片描述

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np


frame = pd.read_csv('F:/python数据/customer.csv')

education_values = frame['education'].values
print(education_values)

# The encoder is fed a column-shaped (2-D) array, which np.vstack builds
# from the flat value sequence; the sparse result is made dense for printing.
encoding = OneHotEncoder()
education_column = np.vstack(education_values)
newData = encoding.fit_transform(education_column).todense()
print(newData)

在这里插入图片描述

import pandas as pd
import numpy as np


frame = pd.read_csv('F:/python数据/customer.csv')

education_values = frame['education'].values
print(education_values)

# np.vstack stacks the values vertically, i.e. turns the flat sequence into
# a single column — the layout later handed to the one-hot encoder.
education_column = np.vstack(education_values)
print(education_column)

在这里插入图片描述

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np


frame = pd.read_csv('F:/python数据/customer.csv')

# One-hot encode the education column (passed as a single-column 2-D array).
encoding = OneHotEncoder()
education_column = np.vstack(frame['education'].values)
newData = encoding.fit_transform(education_column).todense()
frame_new = pd.DataFrame(newData)

# Put the numeric features and the one-hot columns side by side, matching
# rows through their index.
numeric_features = frame[['yearly_income', 'total_children', 'num_cars_owned']]
frame_full = pd.merge(
    numeric_features,
    frame_new,
    left_index=True,
    right_index=True,
)
print(frame_full)

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership-card type is the prediction target

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives the
# model a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

# One-hot encode the education level (the encoder expects 2-D input).
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['education']]).toarray()
# Give the one-hot columns string labels: recent scikit-learn versions
# reject frames whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['education_' + str(level) for level in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full

# Decision tree scored with cross-validated accuracy.
clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership-card type is the prediction target

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives the
# model a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

# Same pipeline as the education variant, but one-hot encoding marital
# status instead; the author reports a clearly higher accuracy with it.
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['marital_status']]).toarray()
# String labels: recent scikit-learn versions reject frames whose column
# labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(v) for v in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full

# Decision tree scored with cross-validated accuracy.
clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))

在这里插入图片描述
2.随机森林
决策树仅仅是挑选了特征列;如果我们能够根据不同的列组合和行组合建立多棵不同的决策树,分别进行预测,再综合各棵树的结果,这就是随机森林
用起来很简单,我们只需要将DecisionTreeClassifier()替换为RandomForestClassifier()就可以了

import pandas as pd
from sklearn.ensemble import RandomForestClassifier #替换为随机森林
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership-card type is the prediction target

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives the
# model a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

# One-hot encode marital status (the encoder expects 2-D input).
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['marital_status']]).toarray()
# String labels: recent scikit-learn versions reject frames whose column
# labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(v) for v in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full

# Random forest in place of the single decision tree.
clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))

在这里插入图片描述
随机森林的参数很多,我们可以根据算法来获得最优的参数

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.model_selection import GridSearchCV

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership-card type is the prediction target

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives the
# model a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

# One-hot encode marital status (the encoder expects 2-D input).
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['marital_status']]).toarray()
# String labels: recent scikit-learn versions reject frames whose column
# labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(v) for v in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full

# Candidate values tried for each hyper-parameter. 'sqrt' replaces the
# former 'auto' option, which meant the same thing for classifiers and is
# no longer accepted by current scikit-learn.
parameter_space = {
    'max_features': [2, 4, 'sqrt'],
    'n_estimators': [100],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [2, 4, 6],
}

# Exhaustive search over all combinations, scored by cross-validation.
clf = RandomForestClassifier()
grid = GridSearchCV(clf, parameter_space)
grid.fit(X, y)
print(grid.best_estimator_)
print(grid.best_score_)

在这里插入图片描述
把最优参数填进去

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership-card type is the prediction target

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives the
# model a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

# One-hot encode marital status (the encoder expects 2-D input).
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['marital_status']]).toarray()
# String labels: recent scikit-learn versions reject frames whose column
# labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(v) for v in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full

# Best parameters reported by the grid search. Only the tuned settings are
# passed: the pasted estimator repr also listed min_impurity_split and
# max_features='auto', neither of which current scikit-learn accepts
# ('sqrt' is the classifier equivalent of the former 'auto'). Everything
# else in that repr was a default value.
clf = RandomForestClassifier(
    criterion='gini',
    max_features='sqrt',
    min_samples_leaf=6,
    n_estimators=100,
)
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))

在这里插入图片描述

4.会员卡预测改进

(本节内容的数据见电脑 F:/python数据/customer,或腾讯微云文件“python数据\customer”)
在这里插入图片描述
包含27个相关的特征(姓名、地址、教育情况);还有一个会员卡的类型(金卡、银卡、铜卡、普通卡)

1.数据的预处理

import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')
income = frame['yearly_income']

# Summary of the column: number of entries, the most frequent value and how
# often that value occurs.
print(income.describe())

print('------------------------------------------------')

# The distinct income values that occur in the data.
print(income.unique())

在这里插入图片描述

import pandas as pd
from sklearn.ensemble import RandomForestClassifier #替换为随机森林
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn import preprocessing


frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership-card type is the prediction target

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there).
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)
# Standardise the income: mean shifted to 0, standard deviation scaled to 1.
frame['yearly_income_new'] = preprocessing.scale(
    frame['yearly_income'].astype(float)
)

# One-hot encode marital status (the encoder expects 2-D input).
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['marital_status']]).toarray()
# String labels: recent scikit-learn versions reject frames whose column
# labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(v) for v in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income_new', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full

clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))

在这里插入图片描述

import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')

# Lower bound of the income range, digits only. regex=True is required on
# pandas >= 2.0: with literal matching nothing is removed and the integer
# cast below fails.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = lower_bound.str.replace('[^0-9]', '', regex=True)
frame['yearly_income_new'] = frame['yearly_income'].astype(int)

# Numeric summary (count, mean, std, quartiles) of the integer income.
print(frame['yearly_income_new'].describe())

'''std=35.973839,可以看到标准差非常大,数据分布非常分散,我们得处理一下'''

在这里插入图片描述

import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')

# Lower bound of the income range, digits only. regex=True is required on
# pandas >= 2.0: with literal matching nothing is removed and the integer
# cast below fails.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = lower_bound.str.replace('[^0-9]', '', regex=True)
frame['yearly_income_new'] = frame['yearly_income'].astype(int)

# Integer-divide by 30 to shrink the value range of the feature.
frame['yearly_income_new'] = frame['yearly_income_new'] // 30
print(frame['yearly_income_new'].describe())

在这里插入图片描述

import pandas as pd
from sklearn.ensemble import RandomForestClassifier #替换为随机森林
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership-card type is the prediction target

# Lower bound of the income range, digits only. regex=True is required on
# pandas >= 2.0: with literal matching nothing is removed and the integer
# cast below fails.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = lower_bound.str.replace('[^0-9]', '', regex=True)
# Integer-divide by 30 to shrink the value range of the feature.
frame['yearly_income_new'] = frame['yearly_income'].astype(int) // 30

# One-hot encode marital status (the encoder expects 2-D input).
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['marital_status']]).toarray()
# String labels: recent scikit-learn versions reject frames whose column
# labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(v) for v in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income_new', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full

clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))

在这里插入图片描述

import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')

# Age when the account was opened = year of opening minus year of birth.
opened_year = pd.to_datetime(frame['date_accnt_opened']).dt.year
birth_year = pd.to_datetime(frame['birthdate']).dt.year

# Integer-divide by 20 so the age collapses into a few coarse buckets.
frame['age'] = (opened_year - birth_year) // 20
print(frame['age'].describe())

在这里插入图片描述

import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')

# Age when the account was opened = year of opening minus year of birth,
# integer-divided by 20 into coarse buckets.
opened_year = pd.to_datetime(frame['date_accnt_opened']).dt.year
birth_year = pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = (opened_year - birth_year) // 20

# Buckets >= 3 and buckets <= 1 are all relabelled as 1.
# NOTE(review): the original comment says this groups customers under 20 and
# over 60 (assumed to have limited spending power), but `<= 1` also covers
# bucket 1 (ages 20-39) — confirm the intended grouping.
outer_buckets = (frame['age'] >= 3) | (frame['age'] <= 1)
frame.loc[outer_buckets, 'age'] = 1
import pandas as pd
from sklearn.ensemble import RandomForestClassifier #替换为随机森林
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership-card type is the prediction target

# Age when the account was opened = year of opening minus year of birth,
# integer-divided by 20 into coarse buckets.
opened_year = pd.to_datetime(frame['date_accnt_opened']).dt.year
birth_year = pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = (opened_year - birth_year) // 20
# Buckets >= 3 and buckets <= 1 are all relabelled as 1.
# NOTE(review): the original comment says this groups customers under 20 and
# over 60, but `<= 1` also covers bucket 1 (ages 20-39) — confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives the
# model a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

# One-hot encode marital status (the encoder expects 2-D input).
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['marital_status']]).toarray()
# String labels: recent scikit-learn versions reject frames whose column
# labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(v) for v in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full

clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))

在这里插入图片描述
2.数据的选择
①使用SelectKBest

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# SelectKBest needs a score function with the signature f(X, y) that returns
# one score per feature; sklearn's chi2 is that function.
# scipy.stats.chi2_contingency tests a single contingency table and does not
# fit this interface.
from sklearn.feature_selection import SelectKBest, chi2

frame = pd.read_csv('F:/python数据/customer.csv')

# Age when the account was opened = year of opening minus year of birth,
# integer-divided by 20 into coarse buckets.
opened_year = pd.to_datetime(frame['date_accnt_opened']).dt.year
birth_year = pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = (opened_year - birth_year) // 20
# Buckets >= 3 and buckets <= 1 are all relabelled as 1.
# NOTE(review): the original comment says this groups customers under 20 and
# over 60, but `<= 1` also covers bucket 1 (ages 20-39) — confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives the
# scorer a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

# One-hot encode marital status (the encoder expects 2-D input).
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['marital_status']]).toarray()
# String labels: recent scikit-learn versions reject frames whose column
# labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(v) for v in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full
y = frame['member_card']

# Chi-squared scoring of every feature against the label; k='all' keeps all
# features so that each of them gets a score.
transformer = SelectKBest(score_func=chi2, k='all')
Xt_chi2 = transformer.fit_transform(X, y)

print(transformer.scores_)

'''一共有六个特征:'age'、'yearly_income'、'total_children'、'num_cars_owned',以及 marital_status 经独热编码后得到的两列。
我们可以看到,第一个特征('age')的得分特别小,说明它与预测列基本不相关;其他特征的得分都比较高
'''

在这里插入图片描述
②使用PCA

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.decomposition import PCA

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']

# Age when the account was opened = year of opening minus year of birth,
# integer-divided by 20 into coarse buckets.
opened_year = pd.to_datetime(frame['date_accnt_opened']).dt.year
birth_year = pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = (opened_year - birth_year) // 20
# Buckets >= 3 and buckets <= 1 are all relabelled as 1.
# NOTE(review): the original comment says this groups customers under 20 and
# over 60, but `<= 1` also covers bucket 1 (ages 20-39) — confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives PCA
# a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

# One-hot encode marital status (the encoder expects 2-D input).
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['marital_status']]).toarray()
# String labels: recent scikit-learn versions reject frames whose column
# labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(v) for v in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full

# Project the features onto two principal components.
pca = PCA(n_components=2)
Xd = pca.fit_transform(X)

# Print floats with three decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Share of the total variance explained by each of the two components.
print(pca.explained_variance_ratio_)
print(Xd)

在这里插入图片描述

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']

# Age when the account was opened = year of opening minus year of birth,
# integer-divided by 20 into coarse buckets.
opened_year = pd.to_datetime(frame['date_accnt_opened']).dt.year
birth_year = pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = (opened_year - birth_year) // 20
# Buckets >= 3 and buckets <= 1 are all relabelled as 1.
# NOTE(review): the original comment says this groups customers under 20 and
# over 60, but `<= 1` also covers bucket 1 (ages 20-39) — confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1

# Lower bound of the income range as an integer. regex=True is required on
# pandas >= 2.0 (literal matching is the default there); the cast gives PCA
# a numeric column instead of strings.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = (
    lower_bound.str.replace('[^0-9]', '', regex=True).astype(int)
)

# One-hot encode marital status (the encoder expects 2-D input).
encoding = OneHotEncoder()
newData = encoding.fit_transform(frame[['marital_status']]).toarray()
# String labels: recent scikit-learn versions reject frames whose column
# labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(v) for v in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)

X = frame_full

# Project the features onto two principal components.
pca = PCA(n_components=2)
Xd = pca.fit_transform(X)

# Train and score the forest on the two PCA components instead of on X.
clf = RandomForestClassifier()
scores = cross_val_score(clf, Xd, y, scoring='accuracy')
print(np.mean(scores))

在这里插入图片描述

5.每日订单预测

(本节内容的数据见电脑 F:/python数据/Daily_Demand_Forecasting_Orders,或腾讯微云文件“python数据\Daily_Demand_Forecasting_Orders”)
在这里插入图片描述

import pandas as pd

# Show every column when printing, instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# The file uses ';' as its field separator.
frame = pd.read_csv(
    'F:/python数据/Daily_Demand_Forecasting_Orders.csv',
    sep=';',
)
first_row = frame.head(1)
print(first_row)

在这里插入图片描述
由上图可以看到部分列名过长,需要修改

import pandas as pd

# Show every column when printing, instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# The file uses ';' as its field separator.
frame = pd.read_csv(
    'F:/python数据/Daily_Demand_Forecasting_Orders.csv',
    sep=';',
)

# Short aliases for the overly long column headers.
short_names = {
    'Week of the month (first week, second, third, fourth or fifth week': 'week',
    'Day of the week (Monday to Friday)': 'day',
    "Orders from the traffic controller sector": 'sector',
    'Target (Total orders)': 'Target',
}
frame.rename(columns=short_names, inplace=True)
print(frame.head(1))

在这里插入图片描述

import pandas as pd

# Show every column when printing, instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# The file uses ';' as its field separator.
frame = pd.read_csv(
    'F:/python数据/Daily_Demand_Forecasting_Orders.csv',
    sep=';',
)

# Short aliases for the overly long column headers.
short_names = {
    'Week of the month (first week, second, third, fourth or fifth week': 'week',
    'Day of the week (Monday to Friday)': 'day',
    "Orders from the traffic controller sector": 'sector',
    'Target (Total orders)': 'Target',
}
frame.rename(columns=short_names, inplace=True)

# Use the non-urgent order count as the only feature. reshape(-1, 1) turns
# the flat value array into a 2-D array with one column; -1 lets NumPy work
# out the number of rows.
non_urgent = frame['Non-urgent order'].values
X = non_urgent.reshape(-1, 1)
print(X)

在这里插入图片描述

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np

# Show every column when printing, instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# The file uses ';' as its field separator.
frame = pd.read_csv(
    'F:/python数据/Daily_Demand_Forecasting_Orders.csv',
    sep=';',
)

# Short aliases for the overly long column headers.
short_names = {
    'Week of the month (first week, second, third, fourth or fifth week': 'week',
    'Day of the week (Monday to Friday)': 'day',
    "Orders from the traffic controller sector": 'sector',
    'Target (Total orders)': 'Target',
}
frame.rename(columns=short_names, inplace=True)

# Single feature (as a one-column 2-D array) and the total-orders target.
X = frame['Non-urgent order'].values.reshape(-1, 1)
y = frame['Target']

# Plain linear regression, scored by cross-validated R^2.
regressor = LinearRegression()
scores = cross_val_score(regressor, X, y, scoring='r2')
mean_r2 = np.mean(scores)
print(mean_r2)

在这里插入图片描述

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm

# Show every column when printing, instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# The file uses ';' as its field separator.
frame = pd.read_csv(
    'F:/python数据/Daily_Demand_Forecasting_Orders.csv',
    sep=';',
)

# Short aliases for the overly long column headers.
short_names = {
    'Week of the month (first week, second, third, fourth or fifth week': 'week',
    'Day of the week (Monday to Friday)': 'day',
    "Orders from the traffic controller sector": 'sector',
    'Target (Total orders)': 'Target',
}
frame.rename(columns=short_names, inplace=True)

# Single feature (as a one-column 2-D array) and the total-orders target.
X = frame['Non-urgent order'].values.reshape(-1, 1)
y = frame['Target']

# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_test_pred = regressor.predict(X_test)

# Test-set mean squared error and R^2, rounded to two decimals.
mse = sm.mean_squared_error(y_test, y_test_pred)
r2 = sm.r2_score(y_test, y_test_pred)
print(round(mse, 2))
print(round(r2, 2))

print(regressor.coef_)  # fitted coefficient (slope) of the feature
print(regressor.intercept_)  # fitted intercept

在这里插入图片描述

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 

# Show every column when printing, instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# The file uses ';' as its field separator.
frame = pd.read_csv(
    'F:/python数据/Daily_Demand_Forecasting_Orders.csv',
    sep=';',
)

# Short aliases for the overly long column headers.
short_names = {
    'Week of the month (first week, second, third, fourth or fifth week': 'week',
    'Day of the week (Monday to Friday)': 'day',
    "Orders from the traffic controller sector": 'sector',
    'Target (Total orders)': 'Target',
}
frame.rename(columns=short_names, inplace=True)

# Single feature (as a one-column 2-D array) and the total-orders target.
X = frame['Non-urgent order'].values.reshape(-1, 1)
y = frame['Target']

# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_test_pred = regressor.predict(X_test)

# Green dots: actual test targets. Black line: the model's predictions for
# the same test inputs.
plt.figure()
plt.scatter(X_test, y_test, color='green')
plt.plot(X_test, y_test_pred, color='black', linewidth=4)
plt.title('Test data')
plt.show()

在这里插入图片描述
使用岭回归(在线性回归的基础上加入L2正则化)可以限制回归系数的大小,从而减小异常值对模型的影响

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
import sklearn.metrics as sm

# Show every column when printing, instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# The file uses ';' as its field separator.
frame = pd.read_csv(
    'F:/python数据/Daily_Demand_Forecasting_Orders.csv',
    sep=';',
)

# Short aliases for the overly long column headers.
short_names = {
    'Week of the month (first week, second, third, fourth or fifth week': 'week',
    'Day of the week (Monday to Friday)': 'day',
    "Orders from the traffic controller sector": 'sector',
    'Target (Total orders)': 'Target',
}
frame.rename(columns=short_names, inplace=True)

# Single feature (as a one-column 2-D array) and the total-orders target.
X = frame['Non-urgent order'].values.reshape(-1, 1)
y = frame['Target']

# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

# Ridge regression with its default settings in place of plain least squares.
regressor = Ridge()
regressor.fit(X_train, y_train)
y_test_pred = regressor.predict(X_test)

r2 = sm.r2_score(y_test, y_test_pred)
print(round(r2, 2))

在这里插入图片描述
多元回归(考虑更多的特征值,而不是只有一个特征)

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm

# Show every column when printing, instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# The file uses ';' as its field separator.
frame = pd.read_csv(
    'F:/python数据/Daily_Demand_Forecasting_Orders.csv',
    sep=';',
)

# Short aliases for the overly long column headers.
short_names = {
    'Week of the month (first week, second, third, fourth or fifth week': 'week',
    'Day of the week (Monday to Friday)': 'day',
    "Orders from the traffic controller sector": 'sector',
    'Target (Total orders)': 'Target',
}
frame.rename(columns=short_names, inplace=True)

# Every column except the target serves as a feature this time.
X = frame.drop(columns='Target')
y = frame['Target']

# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_test_pred = regressor.predict(X_test)

r2 = sm.r2_score(y_test, y_test_pred)
print(round(r2, 2))

在这里插入图片描述
多项式回归

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm
from sklearn.preprocessing import PolynomialFeatures

frame = pd.read_csv('F:/python数据/Daily_Demand_Forecasting_Orders.csv', sep=';')
pd.set_option('display.max_columns', None)

# Short aliases for the overly long column headers.
frame.rename(columns={'Week of the month (first week, second, third, fourth or fifth week': 'week',
                      'Day of the week (Monday to Friday)': 'day',
                      "Orders from the traffic controller sector": 'sector',
                      'Target (Total orders)': 'Target'}, inplace=True
             )

# Every column except the target serves as a feature.
y = frame['Target']
X = frame.drop(columns='Target')

# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Polynomial regression = linear regression on expanded features, so the raw
# features are transformed before any fitting or predicting.
# interaction_only=True adds products of distinct features but no powers of
# a single feature.
polynomial = PolynomialFeatures(interaction_only=True)
# Fit the transformer once, on the training data only.
X_train_transformed = polynomial.fit_transform(X_train)

regressor = LinearRegression()
regressor.fit(X_train_transformed, y_train)

# Apply the already-fitted transformer to the test data; it is not re-fitted.
X_test_transformed = polynomial.transform(X_test)
y_test_pred = regressor.predict(X_test_transformed)

print(round(sm.r2_score(y_test, y_test_pred), 2))

在这里插入图片描述
使用决策树

import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm


# Show every column when a frame is printed.
pd.set_option('display.max_columns', None)

# Short names for the four verbose column headers of the dataset.
short_names = {
    'Week of the month (first week, second, third, fourth or fifth week': 'week',
    'Day of the week (Monday to Friday)': 'day',
    'Orders from the traffic controller sector': 'sector',
    'Target (Total orders)': 'Target',
}

# Load the semicolon-separated daily demand dataset and shorten its headers.
frame = pd.read_csv('F:/python数据/Daily_Demand_Forecasting_Orders.csv', sep=';')
frame = frame.rename(columns=short_names)

# Split off the prediction column from the feature columns.
X = frame.drop('Target', axis=1)
y = frame['Target']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

# Fit a decision tree limited to depth 6 and score it on the held-out rows.
regressor = DecisionTreeRegressor(max_depth=6)
regressor.fit(X_train, y_train)
y_test_pred = regressor.predict(X_test)

# Print the R^2 score rounded to two decimals.
r2 = sm.r2_score(y_test, y_test_pred)
print(round(r2, 2))

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm
from sklearn.ensemble import AdaBoostRegressor

# Show every column when a frame is printed.
pd.set_option('display.max_columns', None)

# Short names for the four verbose column headers of the dataset.
short_names = {
    'Week of the month (first week, second, third, fourth or fifth week': 'week',
    'Day of the week (Monday to Friday)': 'day',
    'Orders from the traffic controller sector': 'sector',
    'Target (Total orders)': 'Target',
}

# Load the semicolon-separated daily demand dataset and shorten its headers.
frame = pd.read_csv('F:/python数据/Daily_Demand_Forecasting_Orders.csv', sep=';')
frame = frame.rename(columns=short_names)

# Split off the prediction column from the feature columns.
X = frame.drop('Target', axis=1)
y = frame['Target']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

# AdaBoostRegressor only needs the decision tree passed in as the estimator
# it wraps.
base_tree = DecisionTreeRegressor(max_depth=6)
regressor = AdaBoostRegressor(base_tree)
regressor.fit(X_train, y_train)
y_test_pred = regressor.predict(X_test)

# Print the R^2 score rounded to two decimals.
r2 = sm.r2_score(y_test, y_test_pred)
print(round(r2, 2))

# feature_importances_ holds one importance value per feature column of X.
print(regressor.feature_importances_)

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
import numpy as np

# Load the semicolon-separated daily demand dataset from the local path.
frame = pd.read_csv('F:/python数据/Daily_Demand_Forecasting_Orders.csv', sep=';')
# Lift pandas' column display limit so printed frames show every column.
pd.set_option('display.max_columns', None)
# Replace four verbose column headers with short names, in place.
frame.rename(
    columns={
        'Week of the month (first week, second, third, fourth or fifth week': 'week',
        'Day of the week (Monday to Friday)': 'day',
        'Orders from the traffic controller sector': 'sector',
        'Target (Total orders)': 'Target',
    },
    inplace=True,
)

# 'Target' is the column to predict; every remaining column is a feature.
y = frame['Target']
X = frame.drop(columns='Target')

# Hold out 20% of the rows; only the training part is needed here because
# this script inspects the fitted model rather than scoring predictions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6))
regressor.fit(X_train, y_train)

# Express every feature importance as a percentage of the largest one.
importances = regressor.feature_importances_
feature_importances = 100.0 * (importances / importances.max())

# Print the relative importances in ascending order.
print(np.sort(feature_importances))

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
import numpy as np
import matplotlib.pyplot as plt

# Load the semicolon-separated daily demand dataset from the local path.
frame = pd.read_csv('F:/python数据/Daily_Demand_Forecasting_Orders.csv', sep=';')
# Lift pandas' column display limit so printed frames show every column.
pd.set_option('display.max_columns', None)
# Replace four verbose column headers with short names, in place.
frame.rename(
    columns={
        'Week of the month (first week, second, third, fourth or fifth week': 'week',
        'Day of the week (Monday to Friday)': 'day',
        'Orders from the traffic controller sector': 'sector',
        'Target (Total orders)': 'Target',
    },
    inplace=True,
)

# 'Target' is the column to predict; every remaining column is a feature.
y = frame['Target']
X = frame.drop(columns='Target')

# Hold out 20% of the rows; only the training part is needed here because
# this script inspects the fitted model rather than scoring predictions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6))
regressor.fit(X_train, y_train)

# Express every feature importance as a percentage of the largest one,
# then sort ascending for plotting.
importances = regressor.feature_importances_
feature_importances = 100.0 * (importances / importances.max())
values = np.sort(feature_importances)

# One bar per feature; the bar count follows the data instead of being
# hard-coded, so the plot keeps working if the feature set changes.
plt.figure()
plt.bar(np.arange(len(values)), values)
plt.show()

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
import numpy as np
import matplotlib.pyplot as plt

# Load the semicolon-separated daily demand dataset from the local path.
frame = pd.read_csv('F:/python数据/Daily_Demand_Forecasting_Orders.csv', sep=';')
# Lift pandas' column display limit so printed frames show every column.
pd.set_option('display.max_columns', None)
# Replace four verbose column headers with short names, in place.
frame.rename(
    columns={
        'Week of the month (first week, second, third, fourth or fifth week': 'week',
        'Day of the week (Monday to Friday)': 'day',
        'Orders from the traffic controller sector': 'sector',
        'Target (Total orders)': 'Target',
    },
    inplace=True,
)

# 'Target' is the column to predict; every remaining column is a feature.
y = frame['Target']
X = frame.drop(columns='Target')

# Hold out 20% of the rows; only the training part is needed here because
# this script inspects the fitted model rather than scoring predictions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6))
regressor.fit(X_train, y_train)

# Express every feature importance as a percentage of the largest one.
importances = regressor.feature_importances_
feature_importances = 100.0 * (importances / importances.max())

# np.argsort returns the positions that would sort the array ascending.
# Using that single index for both the bar heights and the tick labels
# guarantees each bar is labelled with its own feature name.
index_sorted = np.argsort(feature_importances)
values = feature_importances[index_sorted]
labels = X.columns.values[index_sorted]

# One bar per feature; positions follow the data instead of a hard-coded 12.
positions = np.arange(len(values))

plt.figure()
plt.bar(positions, values)
# Rotate the column names so the long labels do not overlap.
plt.xticks(positions, labels, rotation=45, ha='right')
plt.tight_layout()

plt.show()

在这里插入图片描述

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

你的甲乙丙

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值