Data Processing and Prediction

  • Data integration
  • Data cleaning
  • Feature engineering
  • Modeling and prediction
  • Evaluation and optimization

I. Data (reference: data processing and feature engineering)

  1. Data preparation

    1. Read the data
    import pandas as pd

    data = pd.read_csv('data.csv')
    data.head()
    # Use value_counts to check the label counts in the training set
    pd.Series(data['A']).value_counts()
    # Summary statistics of the features
    data.describe()
    # Basic information about the data (dtypes, non-null counts)
    data.info()
    

    Check the label counts to see whether the dataset is balanced. When there is a class-imbalance problem, special handling such as undersampling or oversampling is needed (see the resampling sketch after the plot below).

    from collections import Counter
    import matplotlib.pyplot as plt
    import seaborn as sns; sns.set_theme()
    # Most common tags
    all_tags = Counter(df.tag)
    all_tags.most_common()
    
    # Plot tag frequencies
    tags, tag_counts = zip(*all_tags.most_common())
    plt.figure(figsize=(10, 3))
    ax = sns.barplot(x=list(tags), y=list(tag_counts))
    ax.set_xticklabels(tags, rotation=0, fontsize=8)
    plt.title("Tag distribution", fontsize=14)
    plt.ylabel("# of projects", fontsize=12)
    plt.show()
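    A minimal oversampling sketch with sklearn.utils.resample, assuming a DataFrame df with a binary label column named 'label' (the column name and the 0/1 class values are assumptions for illustration); undersampling is the mirror image, resampling the majority class down with replace=False:

    from sklearn.utils import resample
    import pandas as pd

    # Separate the two classes (assumes a binary label column named 'label')
    majority = df[df.label == 0]
    minority = df[df.label == 1]

    # Oversample the minority class with replacement up to the majority size
    minority_upsampled = resample(minority,
                                  replace=True,
                                  n_samples=len(majority),
                                  random_state=42)
    df_balanced = pd.concat([majority, minority_upsampled])
    df_balanced.label.value_counts()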
    
    2. Split the data and check that the splits follow the same distribution
    seaborn.distplot()  # deprecated in newer seaborn; use sns.histplot()/sns.displot() instead
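    A minimal sketch of checking that two splits share a distribution, assuming train_df and test_df each contain a numeric column 'x' (both names are assumptions):

    import seaborn as sns
    import matplotlib.pyplot as plt

    # Overlay the two splits; similar shapes suggest a consistent distribution
    sns.histplot(train_df['x'], color='steelblue', label='train', stat='density')
    sns.histplot(test_df['x'], color='orange', label='test', stat='density')
    plt.legend()
    plt.show()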
    
  2. Visualize the data

    1. Data distributions

      1. Scatter plots of feature/label combinations
       data = data.copy()  # shallow copy, to avoid modifying the original data
       # Scatter-plot matrix of features, colored by label
       sns.pairplot(data=data, diag_kind='hist', hue='target')
       plt.show()
      
      2. Confusion matrix as a heat map
      from sklearn import metrics
      # Note: confusion_matrix expects (y_true, y_pred)
      confusion_matrix_result = metrics.confusion_matrix(y_test, test_predict)
      print('The confusion matrix result:\n', confusion_matrix_result)

      # Visualize the result with a heat map
      plt.figure(figsize=(8, 6))
      sns.heatmap(confusion_matrix_result, annot=True, cmap=sns.color_palette("RdBu_r", 100))
      plt.xlabel('Predicted labels')
      plt.ylabel('True labels')
      plt.show()
      
      3. Univariate distributions:
      # For a discrete variable with only a few distinct values, use sns.countplot()
      # For a continuous variable, use sns.displot() (or sns.histplot()); see the sketch below
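      A minimal sketch of both cases, assuming df has a categorical column 'cat' and a numeric column 'x' (both column names are assumptions):

      import seaborn as sns
      import matplotlib.pyplot as plt

      sns.countplot(x='cat', data=df)    # one bar per distinct value
      plt.show()
      sns.displot(df['x'], kind='hist')  # histogram of a continuous variable
      plt.show()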
      
    2. Word cloud

      import warnings; warnings.filterwarnings("ignore")
      from wordcloud import WordCloud, STOPWORDS
      # Most frequent tokens for each tag
      tag="natural-language-processing"
      plt.figure(figsize=(10, 3))
      subset = df[df.tag==tag]
      text = subset.title.values
      cloud = WordCloud(
          stopwords=STOPWORDS, background_color="black", collocations=False,
          width=500, height=300).generate(" ".join(text))
      plt.axis("off")
      plt.imshow(cloud)
      
  3. Preprocessing

    1. Missing values (XGBoost can handle missing values natively; alternatively, one-hot encoding can turn missingness into its own category — see the dummy_na sketch after item 4 below)

      1. Inspect missing values

        import numpy as np
        df.x1.value_counts()  # reveals that missing values are encoded as '-'
        df['x1'].replace('-', np.nan, inplace=True)
        df.isnull().sum()     # count missing values per column
        
      2. Only a small fraction of rows have missing values

        df = df[~df.isnull().any(axis=1)]  # keep complete rows; equivalent to df.dropna()
        
      3. Most of the column's values are missing

        df = df.drop(['A'], axis=1)  # drop the mostly-empty column
        
      4. Fill in missing values

        df.A = df.A.fillna(df.A.mean())  # impute with the column mean
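      A minimal sketch of the one-hot-for-missing idea mentioned at the top of this subsection, using pandas' dummy_na flag (the column name 'A' and its values are assumptions):

        import pandas as pd
        import numpy as np

        df = pd.DataFrame({'A': ['x', 'y', np.nan, 'x']})
        # dummy_na=True adds an indicator column for NaN,
        # so missingness becomes an explicit category
        pd.get_dummies(df, columns=['A'], dummy_na=True)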
        
    2. Duplicates

      df = df.drop_duplicates()  # drop exact duplicate rows
      
    3. Outliers (optional)

      # Conditional filtering, e.g., based on the standard deviation or the interquartile range
      # Interquartile-range (IQR) method
      import numpy as np

      def detect_outliers2(df):
          # assumes df is a numeric pandas Series (a single column)
          Q1 = df.quantile(0.25)
          Q3 = df.quantile(0.75)
          IQR = Q3 - Q1

          # Define the lower and upper bounds
          lower_bound = Q1 - 1.5 * IQR
          upper_bound = Q3 + 1.5 * IQR

          outliers = df[(df < lower_bound) | (df > upper_bound)]
          df = df[(df >= lower_bound) & (df <= upper_bound)]
          return df

      # Standard-deviation method
      mean = df.mean()
      std = df.std()

      threshold = 3
      outliers = df[abs(df - mean) > threshold * std]

      df = df[abs(df - mean) <= threshold * std]
      
  4. Transformation and standardization

    Pattern: import from sklearn.preprocessing, instantiate the scaler and fit it to the data, then convert the data with the transform() method (fit_transform() combines the last two steps).

    from sklearn.preprocessing import StandardScaler 
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df_merged[['Age', 'Income']]), columns=['Age', 'Income'])
    
    1. Standardization (z-score)
    # Standardization
    import numpy as np
    x = np.random.random(4) # values between 0 and 1
    print ("x:\n", x)
    print (f"mean: {np.mean(x):.2f}, std: {np.std(x):.2f}")
    x_standardized = (x - np.mean(x)) / np.std(x)
    print ("x_standardized:\n", x_standardized)
    print (f"mean: {np.mean(x_standardized):.2f}, std: {np.std(x_standardized):.2f}")
    
    2. Min-max scaling
    # Min-max
    import numpy as np
    x = np.random.random(4) # values between 0 and 1
    print ("x:", x)
    print (f"min: {x.min():.2f}, max: {x.max():.2f}")
    x_scaled = (x - x.min()) / (x.max() - x.min())
    print ("x_scaled:", x_scaled)
    print (f"min: {x_scaled.min():.2f}, max: {x_scaled.max():.2f}")
    
  5. Encoding

    1. map (use a mapping for features whose categories have an inherent order)
     ## Encode every occurrence of the same category as the same integer
    # Label to index
    tags = train_df.tag.unique().tolist()
    num_classes = len(tags)
    class_to_index = {tag: i for i, tag in enumerate(tags)}
    class_to_index
    
    df["tag"] = df["tag"].map(class_to_index)
    df.head()
    
    # decode
    def decode(indices, index_to_class):
        return [index_to_class[index] for index in indices]
    index_to_class = {v:k for k, v in class_to_index.items()}
    decode(df.head()["tag"].values, index_to_class=index_to_class)
    
    2. one-hot (apply one-hot encoding directly to features whose categories carry no order)
    df_dummies = pd.get_dummies(df, columns=['A', 'B'])
    

II. Models

  1. Split into training and test sets
from sklearn.model_selection import train_test_split
# Split the dataset
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.1, random_state=1)
features_train.shape, features_test.shape
  2. Load regression models
# A variety of models
from sklearn.linear_model import LinearRegression  # linear regression
from sklearn.neighbors import KNeighborsRegressor  # KNN regression
from sklearn.svm import SVR  # SVM regression
from sklearn.linear_model import Ridge  # ridge regression
from sklearn.linear_model import Lasso  # Lasso regression
from sklearn.tree import DecisionTreeRegressor  # decision-tree regression
from sklearn.ensemble import RandomForestRegressor  # random-forest regression
from sklearn.ensemble import BaggingRegressor  # bagging regression
from sklearn.ensemble import AdaBoostRegressor  # AdaBoost regression
from sklearn.ensemble import GradientBoostingRegressor  # gradient-boosting regression

## Import the XGBoost model (a classifier here, used in the classification example below)
from xgboost.sklearn import XGBClassifier
## Define the XGBoost model
clf = XGBClassifier()
## Train the XGBoost model on the training set
clf.fit(x_train, y_train)
  3. Predict
test_predict = clf.predict(x_test)
  4. Compute the mean squared error
from sklearn import metrics
metrics.mean_squared_error(y_test, test_predict)
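A minimal end-to-end regression sketch using one of the regressors imported above, assuming the same x_train/x_test/y_train/y_test split from the earlier steps (for a regression task, the MSE pairs with a regressor rather than a classifier):

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

reg = LinearRegression()
reg.fit(x_train, y_train)      # fit on the training split
y_pred = reg.predict(x_test)   # predict on the held-out split
print('MSE:', mean_squared_error(y_test, y_pred))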
  5. Feature selection
sns.barplot(y=data_features_part.columns, x=clf.feature_importances_)

from sklearn.metrics import accuracy_score
from xgboost import plot_importance

def estimate(model, data):
    # sns.barplot(data.columns, model.feature_importances_)
    ax1 = plot_importance(model, importance_type="gain")
    ax1.set_title('gain')
    ax2 = plot_importance(model, importance_type="weight")
    ax2.set_title('weight')
    ax3 = plot_importance(model, importance_type="cover")
    ax3.set_title('cover')
    plt.show()

def classes(data, label, test):
    model = XGBClassifier()
    model.fit(data, label)
    ans = model.predict(test)
    estimate(model, data)
    return ans

ans = classes(x_train, y_train, x_test)
print('acc =', accuracy_score(y_test, ans))

III. Evaluation and Optimization (reference: evaluation metrics)

  1. Evaluate the model with cross-validation (model selection); a confusion matrix can also be used
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Define the evaluation method (n_splits is the number of folds the training set is divided into)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
# Evaluate the model (scored by accuracy)
result = cross_val_score(model, X, y, cv=kf, scoring='accuracy')  # scoring='neg_mean_squared_error'/'roc_auc'
# Print model performance
print(result)
print('Accuracy: %.3f' % result.mean())
  2. Generate a metrics report
from sklearn.metrics import classification_report
# Generate a performance report for the classifier
print(classification_report(target_test,
                            target_predicted,
                            target_names=class_names))
  3. Individual metrics
import json
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Overall metrics (weighted average across classes)
metrics = {"overall": {}}  # note: this dict shadows the sklearn `metrics` module imported earlier
overall_metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
metrics["overall"]["precision"] = overall_metrics[0]
metrics["overall"]["recall"] = overall_metrics[1]
metrics["overall"]["f1"] = overall_metrics[2]
metrics["overall"]["num_samples"] = np.float64(len(y_test))
print(json.dumps(metrics["overall"], indent=4))
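Per-class values of the same metrics come from average=None; a minimal sketch, assuming the class_to_index mapping built in the encoding section is available:

# Per-class metrics: average=None returns one array entry per class
per_class = precision_recall_fscore_support(y_test, y_pred, average=None)
for tag, index in class_to_index.items():
    print(f"{tag}: precision={per_class[0][index]:.3f}, "
          f"recall={per_class[1][index]:.3f}, f1={per_class[2][index]:.3f}")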
  4. Tune the model with grid search
## Import the grid-search helper from sklearn
from sklearn.model_selection import GridSearchCV
## Define the ranges of the parameter values
learning_rate = [0.1, 0.3, 0.6]
subsample = [0.8, 0.9]
colsample_bytree = [0.6, 0.8]
max_depth = [3, 5, 8]

parameters = {'learning_rate': learning_rate,
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'max_depth': max_depth}
model = XGBClassifier(n_estimators=50)

## Run the grid search
clf = GridSearchCV(model, parameters, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
clf = clf.fit(x_train, y_train)
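Once fitted, the search object exposes the winning configuration; a short usage note:

print(clf.best_params_)           # best hyper-parameter combination found
print(clf.best_score_)            # mean cross-validated accuracy for that combination
best_model = clf.best_estimator_  # the model refit on the full training set (refit=True by default)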
  5. Effect of training-set size on model performance (learning curve)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve

digits = load_digits()
features, target = digits.data, digits.target

# Use cross-validation to compute training and test scores for training sets of different sizes
train_sizes, train_scores, test_scores = learning_curve(RandomForestClassifier(),
                                                        features,
                                                        target,
                                                        cv=10,
                                                        scoring='accuracy',
                                                        n_jobs=-1,
                                                        train_sizes=np.linspace(0.01, 1, 50))
# Mean and standard deviation of the training-set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean, '--', color='black', label='Training score')
plt.plot(train_sizes, test_mean, color='black', label='Cross-validation score')
plt.fill_between(train_sizes, train_mean-train_std,
                train_mean + train_std, color='#DDDDDD')
plt.fill_between(train_sizes, test_mean-test_std,
                test_mean + test_std, color='#DDDDDD')
plt.title('Learning curve')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy Score')
plt.legend(loc='best')
plt.tight_layout()
plt.show()