import java.util.Date

import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
/**
* @author Administrator
*
*/
object WholeTextFileTest {

  /**
   * Demo driver: reads every file under `Configuration.baseDir` with
   * `wholeTextFiles` (which yields (path, fullContent) pairs, one per file),
   * then for each file prints its path, its lines, and its line count.
   *
   * @param args unused command-line arguments
   */
  def main(args: Array[String]): Unit = {
    println(s"start the app ${new Date()}")

    // local[*] runs Spark in-process using all available cores.
    val conf = new SparkConf().setAppName("Spark Data processing").setMaster("local[*]")
    val spark = new SparkContext(conf)
    try {
      val dataset = spark.wholeTextFiles(Configuration.baseDir)
      // NOTE(review): collect() pulls every file's full content onto the
      // driver — fine for a small demo, will not scale to large directories.
      for ((filename, content) <- dataset.collect()) {
        println(filename)
        // Re-distribute the file's lines as an RDD, as the original did.
        val lineRDD = spark.parallelize(content.split("\n"))
        lineRDD.collect().foreach(println)
        println(lineRDD.count)
      }
    } finally {
      // Always release the SparkContext, even if processing throws.
      spark.stop()
    }
  }
}
// baseDir: Spark supports the wildcard `*`, so the path can be written as /*/*/*/* to match multi-level directories.