电费敏感数据挖掘一: 数据处理与特征工程
电费敏感数据挖掘二: 文本特征构造
六. 构建XGBoost模型
6.1 读取特征
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import csc_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from scipy.sparse import hstack
# Load the two pickled feature objects (statistical features and text
# features, per the file names). Context managers guarantee the file handles
# are closed as soon as loading finishes; passing a bare open(...) to
# pickle.load leaves the handle open until garbage collection.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project's own pipeline.
with open(r'..\电费\statistical_features_1.pkl', 'rb') as stat_file:
    df = pickle.load(stat_file)
with open(r'..\电费\text_features_1.pkl', 'rb') as text_file:
    text = pickle.load(text_file)

# Left join on the customer key: every row of df is kept, and rows with no
# match in text get NaN in the text columns.
# NOTE(review): assumes CUST_NO is unique in text; duplicates there would
# multiply rows of df -- confirm against the upstream feature script.
df = df.merge(text, on='CUST_NO', how='left')

# label == -1 marks the rows used as the test set; everything else is
# treated as training data.
train = df.loc[df.label != -1]
test = df.loc[df.label == -1]

# Report the training-set size and its positive-class count.
print('训练集:', train.shape[0])
print('正样本:', train.loc[train.label == 1].shape[0])
print('负样本:',train.loc[train.label == 0].shape[0