- 数据整合
- 数据清洗
- 特征工程
- 建模预测
- 评估优化
-
数据准备
- 读取数据
import pandas as pd

# Load the raw dataset
data = pd.read_csv('data.csv')
data.head()
# Label counts for the training set — a first check for class imbalance
# (bug fix: the original used smart quotes `data[‘A’]`, a SyntaxError,
# and the `pd.Series(...)` wrapper around a column is redundant)
data['A'].value_counts()
# Summary statistics of the features
data.describe()
# Basic info: dtypes, non-null counts, memory usage
data.info()
根据标签数量,看数据量是否平衡,当存在数据不平衡问题时,需要特殊处理:如欠采样或者过采样。
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

# Frequency of every tag, most common first
tag_counter = Counter(df.tag)
tag_counter.most_common()

# Bar chart of tag frequencies
labels, counts = zip(*tag_counter.most_common())
plt.figure(figsize=(10, 3))
axes = sns.barplot(x=list(labels), y=list(counts))
axes.set_xticklabels(labels, rotation=0, fontsize=8)
plt.title("Tag distribution", fontsize=14)
plt.ylabel("# of projects", fontsize=12)
plt.show()
- 拆分数据,符合同一分布
# Check that the train/test splits follow the same distribution.
# (fix: `distplot` is deprecated and removed in recent seaborn releases;
# `displot`/`histplot` are its replacements)
seaborn.displot()
-
可视化数据
-
数据分布
- 标签与特征组合的散点可视化
# Shallow copy so later edits don't mutate the original frame
data = data.copy()
# Scatter matrix of every feature pair, colored by the label
# (bug fix: the original used smart quotes ‘hist’ / ‘target’ — SyntaxError)
sns.pairplot(data=data, diag_kind='hist', hue='target')
plt.show()
- 混淆矩阵—热力图
# Confusion matrix: rows = true labels, columns = predicted labels.
# (bug fix: sklearn's signature is confusion_matrix(y_true, y_pred);
# the original passed the predictions first, transposing the matrix and
# making the axis labels below wrong)
confusion_matrix_result = metrics.confusion_matrix(y_test, test_predict)
print('The confusion matrix result:\n', confusion_matrix_result)
# Visualize the matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, cmap=sns.color_palette("RdBu_r", 100))
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()
- 单变量分布:
# 如果取值是离散/类别型且取值较少,可以选用 sns.countplot() # 如果单变量数据连续分布,用 sns.displot()
-
词云
import warnings

warnings.filterwarnings("ignore")
from wordcloud import WordCloud, STOPWORDS

# Most frequent tokens for one tag, shown as a word cloud
tag = "natural-language-processing"
plt.figure(figsize=(10, 3))
subset = df[df.tag == tag]
titles = subset.title.values
cloud = WordCloud(
    stopwords=STOPWORDS,
    background_color="black",
    collocations=False,
    width=500,
    height=300,
).generate(" ".join(titles))
plt.axis("off")
plt.imshow(cloud)
-
-
预处理
-
缺失值(xgboost可处理缺失值,或者用one-hot编码可以将缺失值转成一种类型)
-
查看缺失值
# Inspect value counts — missing values turn out to be encoded as '-'
df.x1.value_counts()
# Replace the '-' placeholder with NaN
# (fix: inplace=True on a selected column is deprecated in pandas and can
# silently operate on a copy; assign the result back instead)
df['x1'] = df['x1'].replace('-', np.nan)
-
仅有小部分缺失
# Only a small fraction of rows have missing values -> drop those rows.
# (bug fix: the original `df[df.isnull().any(axis=1)]` KEPT exactly the
# rows containing NaNs — the opposite of the stated intent)
df = df.dropna()  # equivalent: df[~df.isnull().any(axis=1)]
-
大多样本缺乏
# Column 'A' is missing for most samples -> drop the whole column.
# (bug fix: the original `df.drop[...]` used square brackets instead of a
# call — that subscripts the method object and raises at runtime)
df = df.drop(['A'], axis=1)
-
填充缺失值
# Impute missing entries of column A with the column mean
df['A'] = df['A'].fillna(df['A'].mean())
-
-
重复值
# Remove exact duplicate rows (keeps the first occurrence by default)
df = df.drop_duplicates()
-
异常值(可选)
# Conditional outlier filtering — two options: the IQR (quartile) rule or
# the standard-deviation rule.
import numpy as np

def detect_outliers2(df):
    """Return `df` with values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] removed.

    Works on a Series or column-wise on a DataFrame (Tukey's fences).
    """
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    # Upper and lower fences
    upper_bound = Q3 + 1.5 * IQR
    lower_bound = Q1 - 1.5 * IQR
    return df[(df >= lower_bound) & (df <= upper_bound)]

def filter_outliers_std(df, threshold=3):
    """Return `df` keeping only values within `threshold` std-devs of the mean.

    (bug fix: the original flat version ended with
    `df = df[... , inplace = True]` — keyword arguments are not allowed in a
    subscript, so that line was a syntax error; it also left dead unused
    variables `outlier_indices` / `outliers`.)
    """
    mean = df.mean()
    std = df.std()
    return df[abs(df - mean) <= threshold * std]
-
-
转化标准化
从 sklearn.preprocessing 导入—实例化并拟合数据—使用 transform() 方法转换数据
from sklearn.preprocessing import StandardScaler

# Fit a scaler on the numeric columns and rescale them to zero mean / unit variance
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_merged[['Age', 'Income']])
df_scaled = pd.DataFrame(scaled_values, columns=['Age', 'Income'])
- 标准化(Standardization,零均值、单位方差)
# Standardization: shift/scale to zero mean and unit variance
import numpy as np

x = np.random.random(4)  # values between 0 and 1
print ("x:\n", x)
print (f"mean: {np.mean(x):.2f}, std: {np.std(x):.2f}")

mu, sigma = np.mean(x), np.std(x)
x_standardized = (x - mu) / sigma
print ("x_standardized:\n", x_standardized)
print (f"mean: {np.mean(x_standardized):.2f}, std: {np.std(x_standardized):.2f}")
- 最大-最小化
# Min-max scaling: map the values linearly onto [0, 1]
import numpy as np

x = np.random.random(4)  # values between 0 and 1
print ("x:", x)
print (f"min: {x.min():.2f}, max: {x.max():.2f}")

lo, hi = x.min(), x.max()
x_scaled = (x - lo) / (hi - lo)
print ("x_scaled:", x_scaled)
print (f"min: {x_scaled.min():.2f}, max: {x_scaled.max():.2f}")
-
编码
- map(特征有大小意义的采用映射编码)
## 把所有的相同类别的特征编码为同一个值 # Label to index tags = train_df.tag.unique().tolist() num_classes = len(tags) class_to_index = {tag: i for i, tag in enumerate(tags)} class_to_index df["tag"] = df["tag"].map(class_to_index) df.head() # decode def decode(indices, index_to_class): return [index_to_class[index] for index in indices] index_to_class = {v:k for k, v in class_to_index.items()} decode(df.head()["tag"].values, index_to_class=index_to_class)
- one-hot(特征不具备大小意义的直接独热编码)
# One-hot encode the unordered categorical columns A and B
df_dummies = pd.get_dummies(df, columns=['A', 'B'])
二、模型
- 划分训练和测试
from sklearn.model_selection import train_test_split

# Split into train/test sets: 10% held out, fixed seed for reproducibility
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.1, random_state=1)
# Sanity-check the split sizes
# (bug fix: the original referenced undefined names train_set/test_set)
features_train.shape, features_test.shape
- 加载回归模型
# A menu of regression models to try
from sklearn.linear_model import LinearRegression        # linear regression
from sklearn.neighbors import KNeighborsRegressor        # KNN regression
from sklearn.svm import SVR                              # SVM regression
from sklearn.linear_model import Ridge                   # ridge regression
from sklearn.linear_model import Lasso                   # lasso regression
from sklearn.tree import DecisionTreeRegressor           # decision-tree regression
# (bug fix: the original imported RandomForestClassifier here, but the
# comment and every surrounding import are regressors)
from sklearn.ensemble import RandomForestRegressor       # random-forest regression
from sklearn.ensemble import BaggingRegressor            # bagging regression
from sklearn.ensemble import AdaBoostRegressor           # AdaBoost regression
from sklearn.ensemble import GradientBoostingRegressor   # gradient-boosting regression
## Import the XGBoost model
from xgboost.sklearn import XGBClassifier
## Define the XGBoost model
clf = XGBClassifier()
# Fit XGBoost on the training set
clf.fit(x_train, y_train)
- 预测
# Predict labels for the held-out test set with the fitted model
test_predict = clf.predict(x_test)
- 计算均方误差
# Mean squared error on the test set
# (bug fix: the original passed an undefined name `predictions`; the
# model's predictions are stored in `test_predict`)
metrics.mean_squared_error(y_test, test_predict)
- 特征选择
# Bar chart of the model's feature importances against the feature names
sns.barplot(y=data_features_part.columns, x=clf.feature_importances_)
from sklearn.metrics import accuracy_score
from xgboost import plot_importance
def estimate(model,data):
    """Plot the model's feature importances by gain, weight and cover."""
    for importance_type in ("gain", "weight", "cover"):
        ax = plot_importance(model, importance_type=importance_type)
        ax.set_title(importance_type)
    plt.show()
def classes(data,label,test):
    """Fit an XGBClassifier on (data, label), plot its importances, and
    return its predictions for `test`."""
    fitted = XGBClassifier()
    fitted.fit(data, label)
    predictions = fitted.predict(test)
    estimate(fitted, data)
    return predictions
# Train, predict, and report test-set accuracy
ans = classes(x_train, y_train, x_test)
pre = accuracy_score(y_test, ans)
print('acc=', pre)
三、评估和优化:(参考:评估指标)
- 交叉验证评估模型(模型选择),也可采取混淆矩阵
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# K-fold splitter: n_splits is the number of folds the data is divided into
kf = KFold(n_splits=5, shuffle=True, random_state=0)
# Evaluate the model, scored by accuracy on each fold
result = cross_val_score(model, X, y, cv=kf, scoring='accuracy') #scoring='neg_mean_squared_error'/'roc_auc'
# Print the per-fold scores and their mean
print(result)
print('Accuracy: %.3f' % result.mean())
- 生成评估指标报告
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 / support report
# NOTE(review): assumes target_predicted and class_names are defined
# earlier in the session — confirm before running
print(classification_report(target_test,
target_predicted,
target_names=class_names))
- 各指标
from sklearn.metrics import precision_recall_fscore_support
import json
import numpy as np

# Overall (weighted-average) precision / recall / F1 on the test set.
# (bug fix: the original assigned into metrics["overall"] without ever
# creating the dict — a KeyError — and the name `metrics` shadows the
# sklearn.metrics module used elsewhere in these notes)
perf = {"overall": {}}
overall_metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
perf["overall"]["precision"] = overall_metrics[0]
perf["overall"]["recall"] = overall_metrics[1]
perf["overall"]["f1"] = overall_metrics[2]
perf["overall"]["num_samples"] = np.float64(len(y_test))
print(json.dumps(perf["overall"], indent=4))
- 网格搜索调优模型
## Import the grid-search tuner from sklearn
from sklearn.model_selection import GridSearchCV
## Candidate values for each hyperparameter
learning_rate = [0.1, 0.3, 0.6]
subsample = [0.8, 0.9]
colsample_bytree = [0.6, 0.8]
max_depth = [3,5,8]
parameters = { 'learning_rate': learning_rate,
'subsample': subsample,
'colsample_bytree':colsample_bytree,
'max_depth': max_depth}
model = XGBClassifier(n_estimators = 50)
## Exhaustive grid search: 3-fold CV, scored by accuracy, all cores
clf = GridSearchCV(model, parameters, cv=3, scoring='accuracy',verbose=1,n_jobs=-1)
clf = clf.fit(x_train, y_train)
- 数据量对模型性能的影响
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
digits = load_digits()
features, target = digits.data, digits.target
# Cross-validated train/test scores at 50 training-set sizes
# (from 1% to 100% of the data), to see how accuracy scales with data volume
train_sizes, train_scores, test_scores = learning_curve(RandomForestClassifier(),
features,
target,
cv=10,
scoring='accuracy',
n_jobs=-1,
train_sizes=np.linspace(0.01,1,50))
# Mean and standard deviation of the scores across the CV folds
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
# Plot the learning curve with a +/-1 std band around each curve
plt.plot(train_sizes, train_mean, '--', color='black', label='Training score')
plt.plot(train_sizes, test_mean, color='black', label='Cross-validation score')
plt.fill_between(train_sizes, train_mean-train_std,
train_mean + train_std, color='#DDDDDD')
plt.fill_between(train_sizes, test_mean-test_std,
test_mean + test_std, color='#DDDDDD')
plt.title('learning_curve')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy Score')
plt.legend(loc='best')
plt.tight_layout()
plt.show()