package clustering.garbage
import java.io.PrintWriter
import org.apache.spark.SparkContext
/**
* Created by fhqplzj on 2017-01-12 at 8:40 PM.
*/
object Lines {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", s"${getClass.getSimpleName.stripSuffix("$")}")
    val path = "/tmp/fuck"
    try {
      removeRareWords(sc, path)
    } finally {
      // Always release Spark resources, even if the job throws.
      sc.stop()
    }
  }

  /**
   * Removes every word whose corpus-wide frequency is exactly 1, rewriting each
   * document in place. Words are separated by spaces; lines by newline characters.
   *
   * @param sc   SparkContext used to read and process the documents
   * @param path directory containing the text documents to clean
   */
  def removeRareWords(sc: SparkContext, path: String): Unit = {
    // Cache: this RDD is consumed twice (the word-count pass and the rewrite
    // pass below); without caching, wholeTextFiles re-reads every file from disk.
    val docs = sc.wholeTextFiles(path).cache()
    /* Set of all words occurring exactly once across the whole corpus. */
    val rareWords = docs.
      values.
      flatMap(_.split("\\s+")).
      map((_, 1)).
      reduceByKey(_ + _).
      filter(_._2 == 1).
      keys.
      collect().
      toSet
    docs.foreach {
      case (name, doc) =>
        // NOTE(review): `name` is a URI such as "file:/tmp/fuck/doc.txt"; taking the
        // substring from the first '/' recovers a local path and overwrites the
        // original file. This assumes a local filesystem ("local[*]") — it would
        // produce a wrong path for "hdfs://host/..." URIs; verify before reuse.
        val printer = new PrintWriter(name.substring(name.indexOf("/")))
        try {
          val content = doc.split("\n").map(_.split(" ").filterNot(rareWords.contains).mkString(" ")).mkString("\n")
          printer.print(content)
        } finally {
          // Close even when writing fails so the file handle is not leaked.
          printer.close()
        }
    }
  }
}
// Text preprocessing: remove words whose frequency in the corpus is 1.
// (Blog residue, translated: latest recommended article published 2022-02-14 23:12:31.)