Spark 2.3.2 机器学习工作流（Pipeline）构建

scala> import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SparkSession

scala> val spark = SparkSession.builder().
     |             master("local").
     |             appName("my App Name").
     |             getOrCreate()
2018-12-07 02:14:10 WARN  SparkSession$Builder:66 - Using an existing SparkSession; some configuration may not take effect.
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1a139347

scala> import org.apache.spark.ml.feature._
import org.apache.spark.ml.feature._

scala> import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.LogisticRegression

scala> import org.apache.spark.ml.{Pipeline,PipelineModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}

scala> import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.linalg.Vector

scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row

scala> val training = spark.createDataFrame(Seq((0L, "a b c d e spark", 1.0),(1L, "b d", 0.0),(2L, "spark f g h", 1.0),(3L, "hadoop mapreduce", 0.0))).toDF("id", "text", "label")
2018-12-07 02:15:29 WARN  ObjectStore:568 - Failed to get database global_temp, returning NoSuchObjectException
training: org.apache.spark.sql.DataFrame = [id: bigint, text: string ... 1 more field]

scala> 

scala> val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_b90cb26b1f51

scala> val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol(tokenizer.getOutputCol).setOutputCol("features")
hashingTF: org.apache.spark.ml.feature.HashingTF = hashingTF_e810c12ed27c

scala> val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_fdee17135e3d

scala> val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
pipeline: org.apache.spark.ml.Pipeline = pipeline_a9b6a2d92374

scala> val model = pipeline.fit(training)
2018-12-07 02:16:55 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2018-12-07 02:16:55 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
model: org.apache.spark.ml.PipelineModel = pipeline_a9b6a2d92374

scala> val test = spark.createDataFrame(Seq((4L, "spark i j k"),(5L, "l m n"),(6L, "spark a"),(7L, "apache hadoop"))).toDF("id", "text")
test: org.apache.spark.sql.DataFrame = [id: bigint, text: string]

scala> model.transform(test).select("id", "text", "probability", "prediction").collect().foreach {case Row(id: Long, text: String, prob: Vector, prediction: Double) => println(s"($id, $text) --> prob=$prob, prediction=$prediction")}
(4, spark i j k) --> prob=[0.540643354485232,0.45935664551476796], prediction=0.0
(5, l m n) --> prob=[0.9334382627383527,0.06656173726164716], prediction=0.0
(6, spark a) --> prob=[0.1504143004807332,0.8495856995192668], prediction=1.0
(7, apache hadoop) --> prob=[0.9768636139518375,0.02313638604816238], prediction=0.0

  

转载于:https://www.cnblogs.com/RHadoop-Hive/p/10078387.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值