[Spark][spark_ml]#5_projects

文本情感分析(Sentiment Analysis)

object Main {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("SA")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    spark.sparkContext.setLogLevel("WARN") // reduce log noise

    import spark.implicits._
    val rand = new Random()

    // Read one file of space-separated sentences, attach the class label and a
    // random key used later to shuffle the merged dataset.
    // (This answers the original note about extracting the inner function for reuse.)
    def readLabeled(path: String, label: Int) =
      spark.read.textFile(path).map { line =>
        // BUGFIX: split(" ") can never produce a " " token (the space is the
        // delimiter), so the original filter(!_.equals(" ")) filtered nothing.
        // Consecutive spaces produce "" tokens — drop those instead.
        (line.split(" ").filter(_.nonEmpty), label, rand.nextDouble())
      }.toDF("words", "value", "random")

    val neg = readLabeled("neg.txt", 0) // negative samples, label 0
    val pos = readLabeled("pos.txt", 1) // positive samples, label 1

    // union stacks the rows of both datasets; a join would pair rows by key,
    // which is not what we want here. Sorting by the random column shuffles.
    val data = neg.union(pos).sort("random")
    //data.show(false)
    //println(neg.count(),data.count())

    // Text feature extraction: hashed term frequencies followed by IDF weighting.
    val hashingTf = new HashingTF()
      .setInputCol("words")
      .setOutputCol("hashing")
      .transform(data)
    val idfModel = new IDF()
      .setInputCol("hashing")
      .setOutputCol("tfidf")
      .fit(hashingTf)
    val transformedData = idfModel
      .transform(hashingTf)
    val Array(training, test) = transformedData
      .randomSplit(Array(0.7, 0.3))

    // Binary classification on the extracted TF-IDF features.
    // The classifier is replaceable by any other Spark ML classifier.
    val bayes = new NaiveBayes()
      .setFeaturesCol("tfidf") // X
      .setLabelCol("value")    // y
      .fit(training)
    // NOTE: this is a single hold-out evaluation, not cross-validation
    // (the original comment said cross-validation, which was inaccurate).
    val result = bayes.transform(test)
    result.show(false)

    // Evaluate model accuracy on the held-out split.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("value")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(result)
    println(s"""accuracy is $accuracy""")

    // TODO: refactor this flow with a Pipeline.
  }
}

推荐系统

object Main {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RS")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    import spark.implicits._

    // Parse one tab-separated MovieLens u.data line into a Rating record.
    def parseRating(line: String): Rating = {
      val fields = line.split("\t")
      Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat)
    }

    val data = spark.read.textFile("u.data")
      .map(parseRating)
      .toDF("userID", "itemID", "rating")
    //data.show(false)
    val Array(training, test) = data.randomSplit(Array(0.8, 0.2))

    // Alternating Least Squares collaborative filtering.
    val als = new ALS()
      .setMaxIter(20)
      .setUserCol("userID")
      .setItemCol("itemID")
      .setRatingCol("rating")
      .setRegParam(0.01) // regularization strength

    val model = als.fit(training)
    // Drop rows whose user/item was unseen at training time, so predictions
    // contain no NaN — the cold-start strategy, a key topic in recommenders.
    model.setColdStartStrategy("drop")

    val predictions = model.transform(test)
    //predictions.show(false) // predicts rating for each (userID, itemID)

    // MovieLens: top-10 recommendations for a single user subset.
    val users = spark.createDataset(Array(196)).toDF("userID")
    //users.show(false)
    model.recommendForUserSubset(users, 10).show(false) // how would production code use this?

    // Model evaluation: root-mean-square error of predicted ratings.
    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    println(s"Root-mean-square error is $rmse \n")

    // Persisting Spark ML models:
    // save:  model.save("./xxx")
    // load:  val model = ALS.load("xxxx")
  }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值