import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
from gensim.models import Word2Vec
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')  # suppress all warnings

# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
# Read the raw data from CSV files in the current working directory.
train = pd.read_csv('train.csv')  # training set
test = pd.read_csv('test.csv')  # test set

# Stack both sets into a single frame; ignore_index=True discards the
# per-file row indices and assigns a fresh 0..n-1 index.
data = pd.concat([train, test], axis=0, ignore_index=True)
# Left over from the notebook: shows the combined row count in an interactive
# session; it has no effect when the file is run as a script.
len(data)

# Feature column names: every column except the label column '是否流失'
# ("churned or not") and the identifier column '客户ID' ("customer ID").
features = [f for f in data.columns if f not in ['是否流失', '客户ID']]

# Split the combined frame back apart by label availability: rows with a
# non-null label form the training set, rows with a null label form the test
# set (test.csv is expected to carry no label values).
# reset_index defaults to drop=False, which would keep the old index as a new
# column; drop=True discards it so each part gets a clean 0..n-1 index.
train = data[data['是否流失'].notnull()].reset_index(drop=True)
test = data[data['是否流失'].isnull()].reset_index(drop=True)

# Feature matrices for the training and test sets.
x_train = train[features]
x_test = test[features]

# Training labels (churn flag).
y_train = train['是否流失']