import pandas as pd
import string
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn import metrics #Additional scklearn functions
from sklearn.model_selection import GridSearchCV
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,accuracy_score,plot_confusion_matrix,classification_report,roc_auc_score,f1_score,make_scorer
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from matplotlib import pyplot
from collections import Counter
from featurewiz import featurewiz
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Check for missing values: null count per column, largest first.
# NOTE(review): the result is neither assigned nor printed, so it is only
# visible when this expression is the last one in a notebook cell.
data.isnull().sum().sort_values(ascending=False)

# Optional cleaning recipes, kept disabled:
# # Convert a string column to a numeric (integer) column
# data['city_level'] = data['city_level'].astype('int')
# # Convert a string column to a numeric (float) column
# data['monetary_of_food'] = data['monetary_of_food'].astype('float')
# # Rename columns
# # Build a dict mapping old column names to new column names
# colNameDict = {'InvolceDate':'SaleDate','StockCode':'StockNo'}
# # !! The old column name must come before the colon
# # Separate each old/new pair with a comma
# data.rename(columns = colNameDict,inplace=True)

# Move the target column (Y) to the last position; later code splits
# features and target positionally.
cols = data.columns.tolist()
cols.remove('value_level')
cols.append('value_level')
data = data[cols]

# Keep the buyer ID aside before removing it from the feature table.
# It is selected by name (the same name used by the drop below) rather than
# by position, so the retained column is the ID regardless of column order.
data_keepId = data[['md5_buyer_id']]
data = data.drop(columns='md5_buyer_id')

# Shift the labels down by one so that they start from 0
# (assumes the raw levels start at 1 -- TODO confirm).
data['value_level'] = data['value_level'] - 1
# Label encoding
def prepare_targets(X):
    """Encode the values of ``X`` as integer class codes.

    A fresh ``LabelEncoder`` is fitted on ``X`` and used to transform that
    same input.

    Args:
        X: One-dimensional array-like of labels (for example a single
            DataFrame column).

    Returns:
        The encoded values produced by ``LabelEncoder`` for ``X``.

    Note:
        The fitted encoder is not returned, so callers cannot use it to map
        the codes back to the original values.
    """
    return LabelEncoder().fit_transform(X)
# Replace each categorical column with its integer label codes.
for _col in ('life_stage', 'pre_province'):
    data[_col] = prepare_targets(data[_col])

# Inspect the label distribution with a count plot.
labels = data.loc[:, ['value_level']]
sns.countplot(data=labels, x='value_level')

# Split into input (X) and output (y) variables: every column except the
# last one is a feature, the last column is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Standardize the features.
# NOTE(review): the scaler is fitted on all rows of X; if a train/test split
# happens later in the file, confirm this is intended.
scaling = StandardScaler().fit(X)
Scaled_X = scaling.transform(X)
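# Source: CSDN blog post by ANTHONY_RUSH_1.
# Page note: latest recommended article published on 2024-11-06 10:55:54.
# Summary (auto-generated by CSDN): This article walks through the data-cleaning process, including handling missing values and type conversion, and then explores the use of several machine-learning models such as XGBoost, LightGBM and SVM, together with feature selection and standardization. It focuses on label encoding of the life_stage and pre_province columns, analysis of the label distribution, and model performance evaluation.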