PySpark Random Forest Training and Prediction Project

PySpark project training

Configure the Spark environment

import os
import json
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
from pyspark.mllib.util import MLUtils

# Point PySpark executors at the Anaconda Python interpreter
os.environ['PYSPARK_PYTHON'] = '/usr/local/anaconda3/bin/python3.6'
spark = (SparkSession
 .builder
 .appName("gps-test-DockerLinuxContainer")
 .enableHiveSupport()
 .config("spark.executor.instances", "300")
 .config("spark.executor.memory","16g")
 .config("spark.executor.cores","4")
 .config("spark.driver.memory","8g")
 .config("spark.driver.maxResultSize", "8g")
 .config("spark.sql.shuffle.partitions","1200")
 .config("spark.yarn.appMasterEnv.yarn.nodemanager.container-executor.class","DockerLinuxContainer")
 .config("spark.executorEnv.yarn.nodemanager.container-executor.class","DockerLinuxContainer")
 .getOrCreate()) 
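
As a quick sanity check before submitting heavier jobs, you can confirm the session started and Hive support is available:

# Optional smoke test for the session
spark.sql("show databases").show(5)
print(spark.version)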

Data preprocessing

####################################################### Helper functions
# Read a dataset from Hive
def getDataFromHive(table):
    sql = "select * from "+table
    print(sql)
    df = spark.sql(sql)
    print('Number of rows read:', df.count())
    #df.printSchema()
    return df 

        
# Inspect the class distribution in the data
def getClassRatio(df, label):
    counts_level = df.groupBy(label).count().rdd.collectAsMap()
    count_total = df.count()
    for level, cnt in counts_level.items():
        print(level, cnt, round(cnt/count_total,2))
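
For example, assuming a hypothetical Hive table name and a label column named level (both placeholders to substitute with your own):

# Hypothetical table and label names
df = getDataFromHive('app.app_gps_train_data')
getClassRatio(df, 'level')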

        
# Resample so the classes are balanced
def resampleClass(df, label, sample_ratio1, sample_ratio2, sample_ratio3):
    # Downsample level = 1 (e.g. keep about 1/3 of it)
    df_level_1 = df.where(label+'=1').sample(True, sample_ratio1, seed = 2018)
    # Keep all level = 2 samples
    df_level_2 = df.where(label+'=2').sample(True, sample_ratio2, seed = 2018)
    # Replicate level = 3 about 5x (sampling is with replacement)
    df_level_3 = df.where(label+'=3').sample(True, sample_ratio3, seed = 2018)
    df = df_level_1.union(df_level_2).union(df_level_3)
    getClassRatio(df, label)
    return df
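
With the ratios described in the comments above (downsample level 1 to about a third, keep level 2, replicate level 3 about five times), a call looks like:

# Ratios mirror the comments: ~1/3 downsample, keep all, ~5x oversample
df = resampleClass(df, 'level', 1/3.0, 1.0, 5.0)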
    
        
# Cast all columns to double except the one-hot columns and pin
def convertToDouble(df, no_double_cols):
    double_cols = list(set(df.columns).difference(set(no_double_cols)))    
    df = df.select(no_double_cols + [col(column).cast("double").alias(column) for column in double_cols])
    return df 
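
A call might look like the following; the one-hot column names here are placeholders:

# 'pin' (user id) and the string columns to be one-hot encoded keep their type
onehot_cols = ['city_level', 'os_type']   # placeholder column names
df = convertToDouble(df, no_double_cols=['pin'] + onehot_cols)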


# Fill in missing values per column group
def fillNone(df, cate_cols, num_cols, onehot_cols):
    # Categorical features already cast to double: fill with -1
    df = df.fillna(-1.0, subset=cate_cols)
    # Numeric features: fill with 0
    df = df.fillna(0.0, subset=num_cols)
    # String features that will be one-hot encoded: fill with '-1'
    df = df.fillna('-1', subset=onehot_cols)
    # Report any column that still contains nulls
    # (loop variable must not shadow pyspark.sql.functions.col)
    for column in df.columns:
        filter_condition = column + ' is null'
        cnt = df.filter(filter_condition).count()
        if cnt > 0:
            print(column + " still has null values")
    return df
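
Continuing the example with placeholder column groups:

# Placeholder column groups; adjust to the real schema
cate_cols = ['gender', 'age_group']   # categorical, already cast to double
num_cols = ['order_cnt', 'gmv']       # numeric
df = fillNone(df, cate_cols, num_cols, onehot_cols)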


# Split the dataset into train_data and test_data
def splitData(df, split_train_ratio):
    # Randomly split into train/test and cache() both in memory to speed up later steps
    train_df, test_df = df.randomSplit(weights=[split_train_ratio, 1-split_train_ratio], seed=2018)
    train_df.cache()
    test_df.cache()
    return train_df, test_df
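
The original post breaks off here. Based on the imports above (StringIndexer, OneHotEncoder, VectorAssembler, RandomForestClassifier, Pipeline, MulticlassClassificationEvaluator), a minimal sketch of the modeling step that would follow might look like this; every column name and hyperparameter below is an assumption, not the author's actual setting:

# --- Sketch only; column names and hyperparameters are placeholders ---
train_df, test_df = splitData(df, 0.8)

# Map the label to 0-based indices, as Spark ML classifiers expect
label_indexer = StringIndexer(inputCol='level', outputCol='label')

# Index then one-hot encode each string column
indexers = [StringIndexer(inputCol=c, outputCol=c+'_idx', handleInvalid='keep')
            for c in onehot_cols]
encoders = [OneHotEncoder(inputCol=c+'_idx', outputCol=c+'_vec')
            for c in onehot_cols]

# Assemble all features into a single vector column
assembler = VectorAssembler(
    inputCols=cate_cols + num_cols + [c+'_vec' for c in onehot_cols],
    outputCol='features')

rf = RandomForestClassifier(labelCol='label', featuresCol='features',
                            numTrees=100, maxDepth=8, seed=2018)
pipeline = Pipeline(stages=[label_indexer] + indexers + encoders + [assembler, rf])

model = pipeline.fit(train_df)
predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              predictionCol='prediction',
                                              metricName='accuracy')
print('test accuracy:', evaluator.evaluate(predictions))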