Scala basics | Submitting a Scala job with spark-submit

Install Scala

https://www.scala-lang.org/download/


Via sbt: https://www.scala-sbt.org/1.x/docs/sbt-by-example.html

  • build.sbt

(The project's Scala version must match the Scala build of the Spark dependencies, e.g. both 2.11 or both 2.12. Two build.sbt variants follow.)

// Variant 1: Scala 2.11 (requires JDK 1.8)
name := "lrDemo"
version := "0.1"
scalaVersion := "2.11.12"
libraryDependencies += "org.apache.spark" % "spark-core_2.11" % "2.4.3"
libraryDependencies += "org.apache.spark" % "spark-mllib_2.11" % "2.4.3"
libraryDependencies += "org.apache.spark" % "spark-sql_2.11" % "2.4.3"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.11" % "2.4.3"

// Variant 2: Scala 2.12 (works with JDK 1.8, 11, or 13)
name := "lrDemo"
version := "0.1"
scalaVersion := "2.12.10"
libraryDependencies += "org.apache.spark" % "spark-core_2.12" % "2.4.3"
libraryDependencies += "org.apache.spark" % "spark-mllib_2.12" % "2.4.3"
libraryDependencies += "org.apache.spark" % "spark-sql_2.12" % "2.4.3"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.12" % "2.4.3"
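If you're unsure which Scala line a given Spark distribution was built against, you can print both versions from any Spark app or spark-shell session; a minimal sketch (the example output is what the 2.11 build above would report):

// sketch: confirm the Scala and Spark versions seen at runtime
println(scala.util.Properties.versionString)  // e.g. "version 2.11.12"
println(org.apache.spark.SPARK_VERSION)       // e.g. "2.4.3"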
  • Changing the JDK version in IntelliJ IDEA

  • lrDemo.scala (using LogisticRegression as the example)
package hxj.program.sparkDemo

import java.text.SimpleDateFormat
import java.util.Date

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

object lrDemo {

  def NowDate(): String = {
    val df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    df.format(new Date())
  }

  def main(args: Array[String]): Unit = {
    // local test setup (uncomment to run locally)
//    val spark = SparkSession.builder().master("local[2]").getOrCreate()
//    val sampleHDFS_train = "file:///Users/admin/IdeaProjects/lrDemo/data/input_demo"
//    val sampleHDFS_test = "file:///Users/admin/IdeaProjects/lrDemo/data/input_demo"
//    val outputHDFS = "file:///Users/admin/IdeaProjects/lrDemo/output"

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext // grab the underlying SparkContext
    println("The url to track the job: http://bx-namenode-02:8088/proxy/" + sc.applicationId)

    val sampleHDFS_train = args(0)
    val sampleHDFS_test = args(1)
    val outputHDFS = args(2)


    val featureLst = Array("feature1", "feature2", "feature3")
    val colLst = Array("uid", "label", "feature1", "feature2", "feature3")

    // read the data from HDFS and convert the RDD into a DataFrame
    println("step 1 ", NowDate())
    val schemaSample = StructType(colLst.map(column => StructField(column, StringType, true)))
    // training data
    val sampleRDD_train = sc.textFile(sampleHDFS_train)
    val rowSample_train: RDD[Row] = sampleRDD_train.map(_.split("\t"))
      .map(line => Row(line(0), line(1), line(2), line(3), line(4)))
    var sampleDataFrame_train = spark.createDataFrame(rowSample_train, schemaSample)
    // test data
    val sampleRDD_test = sc.textFile(sampleHDFS_test)
    val rowSample_test: RDD[Row] = sampleRDD_test.map(_.split("\t"))
      .map(line => Row(line(0), line(1), line(2), line(3), line(4)))
    var sampleDataFrame_test = spark.createDataFrame(rowSample_test, schemaSample)
    // cast the label and feature columns from String to Double
    for (colName <- colLst) {
      if (colName != "uid") {
        sampleDataFrame_test = sampleDataFrame_test.withColumn(colName, col(colName).cast(DoubleType))
        sampleDataFrame_train = sampleDataFrame_train.withColumn(colName, col(colName).cast(DoubleType))
      }
    }
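
    // (Sketch, not part of the original flow: the same tab-separated file could be
    //  loaded directly with the DataFrame reader; columns still arrive as StringType
    //  under schemaSample, so the casts above would still apply.)
    // val direct = spark.read.option("sep", "\t").schema(schemaSample).csv(sampleHDFS_train)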

    // train: assemble the feature columns into a vector, then fit LogisticRegression via a Pipeline
    println("step 2 ", NowDate())
    val vectorAssembler: VectorAssembler = new VectorAssembler().setInputCols(featureLst).setOutputCol("features")
    val lrModel = new LogisticRegression().setLabelCol("label").setFeaturesCol("features").setRegParam(0.01).setMaxIter(100)
    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lrModel))
    val model = pipeline.fit(sampleDataFrame_train)
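
    // (Sketch: to inspect the fitted weights, pull the LR stage back out of the
    //  PipelineModel; stage 0 is the VectorAssembler, stage 1 the LogisticRegressionModel.)
    // val fittedLR = model.stages(1).asInstanceOf[org.apache.spark.ml.classification.LogisticRegressionModel]
    // println(fittedLR.coefficients, fittedLR.intercept)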

    // predict on the test set and save the results
    println("step 3 ", NowDate())
    val labelsAndPreds = model.transform(sampleDataFrame_test)
      .select("uid", "label", "prediction")
    labelsAndPreds.show()
    labelsAndPreds.write.mode("overwrite").csv(outputHDFS + "/target/output")
  }
}
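
The listing only prints and saves raw predictions. If you also want a summary metric, an AUC check can be appended after the transform step; a minimal sketch reusing model and sampleDataFrame_test from the listing above:

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// sketch: area under ROC on the held-out test set
val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")
val auc = evaluator.evaluate(model.transform(sampleDataFrame_test))
println(s"AUC = $auc")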


Build the jar


Note: if the built jar keeps failing with "main class not found", create a fresh project that only prints a hello world, package that, and once it succeeds swap in the real build.sbt and code. (Cause unknown; it feels like an IDEA bug.)
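A command-line build is also a useful cross-check when the IDE jar misbehaves: running sbt package from the project root writes the jar under target/scala-2.11/ (or target/scala-2.12/, matching scalaVersion in build.sbt).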

  • spark-submit
SPARK_PATH=/user/spark/spark
YARN_QUEUE=

# pick one deploy mode (as written, the later assignment wins)
DEPLOY_MODE=cluster
DEPLOY_MODE=client

input_path_train=hdfs:///user/huangxiaojuan/program/sparkDemo/input/train
input_path_test=hdfs:///user/huangxiaojuan/program/sparkDemo/input/test
output_path=hdfs:///user/huangxiaojuan/program/sparkDemo/scala_lrDemo

hadoop fs -rmr $output_path  # remove any previous output (-rmr is deprecated; hadoop fs -rm -r also works)


${SPARK_PATH}/bin/spark-submit \
  --master yarn \
  --name "scala_lrDemo" \
  --queue ${YARN_QUEUE} \
  --deploy-mode ${DEPLOY_MODE} \
  --driver-memory 4g \
  --driver-cores 2 \
  --executor-memory 12g \
  --executor-cores 4 \
  --num-executors 6 \
  --jars lrDemo.jar \
  --conf spark.default.parallelism=20 \
  --conf spark.executor.memoryOverhead=4g \
  --conf spark.driver.memoryOverhead=2g \
  --conf spark.yarn.maxAppAttempts=3 \
  --conf spark.yarn.submit.waitAppCompletion=true \
  --class hxj.program.sparkDemo.lrDemo ./lrDemo.jar $input_path_train $input_path_test $output_path
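Note that --jars lrDemo.jar is redundant here, since the same jar is already passed as the application jar after --class. For sizing: each executor requests roughly executor-memory + spark.executor.memoryOverhead = 12g + 4g = 16g from YARN, and in cluster mode the driver similarly requests 4g + 2g = 6g, so this submission asks for about 6 × 16g + 6g = 102g of queue memory in total.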

View logs: yarn logs -applicationId application_xxxxxxxx_xxxxx

Kill a job: yarn application -kill application_xxxxxxxx_xxxxx

For the pyspark equivalent, see:

叶一一: pyspark入门 | spark-submit 提交pyspark任务 (zhuanlan.zhihu.com)