Spark中文文本分析建模

实用的朴素贝叶斯模型建模
建模过程主要是把文本转化成向量然后再作分析
数据格式:

0,善良 美丽
1,丑陋 阴险 卑鄙
0,温和
.......
注:前面是给文章贴的标签,后面是文章的分词,分词可以找关于分词的文章去查看,后面我也会写关于分词的文章
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

class CreatModel {

}
object CreatModel{
  case class RawDataRecord(category: String, text: String)

  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setAppName("createModel").setMaster("local[4]");
    val sc =new  SparkContext(config);
    val spark = SparkSession.builder().config(config).config("spark.sql.warehouse.dir", "warehouse/dir").getOrCreate();
    import spark.implicits._
    //分数据
    val Array(srcDF,testDF) = sc.textFile("D:\\decstop\\testFiles\\sougou").map {
      x =>
        val data = x.split(",")
        RawDataRecord(data(0),data(1))
    }.toDF().randomSplit(Array(0.7,0.3))

    //分词
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val wordsData = tokenizer.transform(srcDF)
    wordsData.show(false)
    val testtokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val testwordsData = testtokenizer.transform(testDF)

    //文档词频
    val hashingTF =
      new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
    val featurizedData = hashingTF.transform(wordsData)

    val testhashingTF =
      new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
    val testfeaturizedData = testhashingTF.transform(testwordsData)

    //逆文档词频
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)

    val testidf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val testidfModel = testidf.fit(testfeaturizedData)
    val testrescaledData = testidfModel.transform(testfeaturizedData)
    rescaledData.show(false) 
    //转换成贝叶斯的输入格式
    val trainDataRdd = rescaledData.select($"category",$"features").map {
      case Row(label: String, features:Vector) =>
        LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
    }

    val testtrainDataRdd = testrescaledData.select($"category",$"features").map {
      case Row(label: String, features:Vector) =>
        LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
    }

    val model =new NaiveBayes().fit(trainDataRdd)

    val predictions = model.transform(testtrainDataRdd)
    println("predictln out:");
    predictions.show();
    model.write.overwrite().save("resoult")

    //模型评估
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println("accuracy out :")
    println("Accuracy:"+accuracy)

  }
}

 

 

转载于:https://www.cnblogs.com/itboys/p/6860633.html

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值