"Machine Learning with Spark" study notes: text mining


We will introduce more advanced text processing techniques available in MLlib to work with large-scale text datasets.

In this article, we will:

  • Work through detailed examples that illustrate data processing, feature
    extraction, and the modeling pipeline, as they relate to text data
  • Evaluate the similarity between two documents based on the words in
    the documents (see the short cosine-similarity sketch after this list)
  • Use the extracted text features as inputs for a classification model
  • Cover a recent development in natural language processing to model words themselves as vectors and illustrate the use of Spark’s Word2Vec model to evaluate the similarity between two words, based on their meaning
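
Before the full program, here is a minimal, self-contained sketch of the document-similarity measure used throughout: cosine similarity between two sparse TF-IDF vectors, computed with Breeze (the same linear-algebra library the full code below uses). The object name and the hard-coded vectors are illustrative only and not part of the original example.

import breeze.linalg.{ SparseVector, norm }

object CosineSketch {
  def main(args: Array[String]): Unit = {
    // Two toy TF-IDF vectors in a 10-dimensional space (made-up values).
    val docA = new SparseVector(Array(0, 3, 7), Array(1.5, 0.8, 2.1), 10)
    val docB = new SparseVector(Array(0, 2, 7), Array(0.9, 1.2, 1.7), 10)
    // Cosine similarity = dot product / (product of L2 norms);
    // 1.0 means identical direction, 0.0 means no terms in common.
    val cosine = docA.dot(docB) / (norm(docA) * norm(docB))
    println(cosine)
  }
}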

Data Source

20news-bydate.tar.gz

Source Code



import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.util.matching.Regex
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{ SparseVector => SV }
import org.apache.spark.mllib.feature.IDF
import breeze.linalg.SparseVector
import breeze.linalg.norm
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.Word2Vec
object AdvanceText {
  // Tokenize a document: split on non-word characters, lowercase everything,
  // keep only non-numeric tokens, and drop stop words, rare tokens, and
  // tokens shorter than two characters.
  def tokenize(line: String, regex: Regex, stopwords: Set[String], rareTokens: Set[String]): Seq[String] = {
    line.split("""\W+""")
      .map(_.toLowerCase)
      .filter(token => regex.pattern.matcher(token).matches)
      .filterNot(token => stopwords.contains(token))
      .filterNot(token => rareTokens.contains(token))
      .filter(token => token.size >= 2)
      .toSeq
  }
  def main(args: Array[String]): Unit = {
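    // Load the training split; wholeTextFiles yields (file path, file content) pairs.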
    val path = "hdfs://master:9000/user/root/input/20news-bydate-train/*"
    val conf = new SparkConf().setAppName("text")
    val sc = new SparkContext(conf)
    val rdd = sc.wholeTextFiles(path)
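    // The topic label of each document is its parent directory name
    // (the second-to-last path component), e.g. "rec.sport.hockey".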
    val newgroups = rdd.map {
      case (file, text) =>
        file.split("/").takeRight(2).head
    }

    val countByGroup = newgroups.map(n => (n, 1)).reduceByKey(_ + _).collect.sortBy(-_._2).mkString("\n")
    //println(countByGroup)
    val text = rdd.map {
      case (file, text) => text
    }
    val whiteSpaceSplit = text.flatMap(t => t.split(" ").
      map(_.toLowerCase))
    //println(whiteSpaceSplit.distinct.count)
    // println(whiteSpaceSplit.sample(true, 0.3, 42).take(100).mkString(","))
    val nonWordSplit = text.flatMap(t => t.split("""\W+""").map(_.toLowerCase))
    // println(nonWordSplit.distinct.count)
    //println(nonWordSplit.distinct.sample(true, 0.3, 42).take(100).mkString(","))
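    // Keep only tokens that contain no digit characters.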
    val regex = """[^0-9]*""".r
    val filterNumbers = nonWordSplit.filter(token =>
      regex.pattern.matcher(token).matches)
    // println(filterNumbers.distinct.count)
    //println(filterNumbers.distinct.sample(true, 0.3, 42).take(100).mkString(","))
    val tokenCounts = filterNumbers.map(t => (t, 1)).reduceByKey(_ + _)
    val orderingDesc = Ordering.by[(String, Int), Int](_._2)
    //println(tokenCounts.top(20)(orderingDesc).mkString("\n"))
    val stopwords = Set(
      "the", "a", "an", "of", "or", "in", "for", "by", "on", "but", "is", "not",
      "with", "as", "was", "if",
      "they", "are", "this", "and", "it", "have", "from", "at", "my",
      "be", "that", "to")
    val tokenCountsFilteredStopwords = tokenCounts.filter {
      case (k, v) =>
        !stopwords.contains(k)
    }
    //println(tokenCountsFilteredStopwords.top(20)(orderingDesc).mkString("\n"))
    val tokenCountsFilteredSize = tokenCountsFilteredStopwords.filter {
      case (k, v) => k.size >= 2
    }
    //println(tokenCountsFilteredSize.top(20)(orderingDesc).mkString("\n"))
    val rareTokens = tokenCounts.filter { case (k, v) => v < 2 }.map {
      case (k, v) => k
    }.collect.toSet
    val tokenCountsFilteredAll = tokenCountsFilteredSize.filter {
      case (k, v) => !rareTokens.contains(k)
    }
    //println(tokenCountsFilteredAll.top(20)(orderingDesc).mkString("\n"))
    val tokens = text.map(doc => tokenize(doc, regex, stopwords, rareTokens))
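    // The hashing trick: hash each token into one of 2^18 buckets and build a
    // sparse term-frequency vector per document, so no explicit vocabulary
    // needs to be stored.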
    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)
    val tf = hashingTF.transform(tokens)
    tf.cache
    /* calculate TF */
    val v = tf.first.asInstanceOf[SV]
    //println(v.size)
    //println(v.values.size)
    //println(v.values.take(10).toSeq)
    //println(v.indices.take(10).toSeq)
    /* calculate IDF and transform the TF vectors into TF-IDF */
    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    val v2 = tfidf.first.asInstanceOf[SV]
    //println(v2.size)
    //println(v2.values.size)
    //println(v2.values.take(10).toSeq)
    //println(v2.indices.take(10).toSeq)
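    // Sanity check: the range of non-zero TF-IDF weights across the corpus.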
    val minMaxVals = tfidf.map { v =>
      val sv = v.asInstanceOf[SV]
      (sv.values.min, sv.values.max)
    }
    val globalMinMax = minMaxVals.reduce {
      case ((min1, max1), (min2, max2)) =>
        (math.min(min1, min2), math.max(max1, max2))
    }
    //println(globalMinMax)
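    // Common words ("you", "do", "we") should receive low TF-IDF weights,
    // while rarer, more topical words ("telescope", "legislation",
    // "investment") should receive noticeably higher weights.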
    val common = sc.parallelize(Seq(Seq("you", "do", "we")))
    val tfCommon = hashingTF.transform(common)
    val tfidfCommon = idf.transform(tfCommon)
    val commonVector = tfidfCommon.first.asInstanceOf[SV]
    //println(commonVector.values.toSeq)
    val uncommon = sc.parallelize(Seq(Seq("telescope", "legislation",
      "investment")))
    val tfUncommon = hashingTF.transform(uncommon)
    val tfidfUncommon = idf.transform(tfUncommon)
    val uncommonVector = tfidfUncommon.first.asInstanceOf[SV]
    //println(uncommonVector.values.toSeq)
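    // Compare documents with cosine similarity: two documents from the hockey
    // newsgroup should be more similar to each other than a hockey document
    // is to one from comp.graphics or rec.sport.baseball.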
    val hockeyText = rdd.filter {
      case (file, text) => file.contains("hockey")
    }
    val hockeyTF = hockeyText.mapValues(doc =>
      hashingTF.transform(tokenize(doc, regex, stopwords, rareTokens)))
    val hockeyTFIdf = idf.transform(hockeyTF.map(_._2))
    val hockey1 = hockeyTFIdf.sample(true, 0.1, 42).first.asInstanceOf[SV]
    val breeze1 = new SparseVector(hockey1.indices, hockey1.values, hockey1.size)
    val hockey2 = hockeyTFIdf.sample(true, 0.1, 43).first.asInstanceOf[SV]
    val breeze2 = new SparseVector(hockey2.indices, hockey2.values, hockey2.size)
    val cosineSim = breeze1.dot(breeze2) / (norm(breeze1) * norm(breeze2))
    //println(cosineSim)
    val graphicsText = rdd.filter {
      case (file, text) =>
        file.contains("comp.graphics")
    }
    val graphicsTF = graphicsText.mapValues(doc =>
      hashingTF.transform(tokenize(doc, regex, stopwords, rareTokens)))
    val graphicsTFIdf = idf.transform(graphicsTF.map(_._2))
    val graphics = graphicsTFIdf.sample(true, 0.1, 42).first.asInstanceOf[SV]
    val breezeGraphics = new SparseVector(graphics.indices, graphics.values, graphics.size)
    val cosineSim2 = breeze1.dot(breezeGraphics) / (norm(breeze1) * norm(breezeGraphics))
    println(cosineSim2)
    val baseballText = rdd.filter {
      case (file, text) =>
        file.contains("baseball")
    }
    val baseballTF = baseballText.mapValues(doc =>
      hashingTF.transform(tokenize(doc, regex, stopwords, rareTokens)))
    val baseballTfIdf = idf.transform(baseballTF.map(_._2))
    val baseball = baseballTfIdf.sample(true, 0.1, 42).first.asInstanceOf[SV]
    val breezeBaseball = new SparseVector(baseball.indices,
      baseball.values, baseball.size)
    val cosineSim3 = breeze1.dot(breezeBaseball) / (norm(breeze1) *
      norm(breezeBaseball))
    println(cosineSim3)
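    // Train a multiclass naive Bayes classifier on the TF-IDF vectors, using
    // each document's newsgroup topic (mapped to a numeric index) as the label.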
    val newsgroupsMap = newgroups.distinct.collect().zipWithIndex.toMap
    val zipped = newgroups.zip(tfidf)
    val train = zipped.map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
    train.cache
    val model = NaiveBayes.train(train, lambda = 0.1)
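    // Evaluate on the held-out test split, transforming the test documents
    // with the same hashing TF and the IDF model fitted on the training data.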
    val testPath = "hdfs://master:9000/user/root/input/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = testRDD.map {
      case (file, text) =>
        val topic = file.split("/").takeRight(2).head
        newsgroupsMap(topic)
    }
    val testTf = testRDD.map {
      case (file, text) =>
        hashingTF.transform(tokenize(text, regex, stopwords, rareTokens))
    }
    val testTfIdf = idf.transform(testTf)
    val zippedTest = testLabels.zip(testTfIdf)
    val test = zippedTest.map {
      case (topic, vector) =>
        LabeledPoint(topic, vector)
    }
    val predictionAndLabel = test.map(p => (model.predict(p.features),
      p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    val metrics = new MulticlassMetrics(predictionAndLabel)
    println(accuracy)
    println(metrics.weightedFMeasure)
    // Baseline for comparison: train on raw term frequencies obtained from a
    // simple whitespace split (no lowercasing, stop-word or rare-token
    // filtering), then evaluate on the test set as we did for the TF-IDF model.
    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
    val rawTrain = newgroups.zip(rawTF).map {
      case (topic, vector) =>
        LabeledPoint(newsgroupsMap(topic), vector)
    }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)
    val rawTestTF = testRDD.map {
      case (file, text) =>
        hashingTF.transform(text.split(" "))
    }
    val rawZippedTest = testLabels.zip(rawTestTF)
    val rawTest = rawZippedTest.map {
      case (topic, vector) =>
        LabeledPoint(topic, vector)
    }
    val rawPredictionAndLabel = rawTest.map(p =>
      (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 ==
      x._2).count() / rawTest.count()
    println(rawAccuracy)
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    println(rawMetrics.weightedFMeasure)
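    // Train a Word2Vec model on the tokenized documents and query it for the
    // words closest to a given term in the learned vector space.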
    val word2vec = new Word2Vec()
    word2vec.setSeed(42)
    val word2vecModel = word2vec.fit(tokens)
    word2vecModel.findSynonyms("hockey", 20).foreach(println)
    word2vecModel.findSynonyms("legislation", 20).foreach(println)
  }

}