package moke

import org.apache.spark.sql.SparkSession

/*
 * Step 1: clean the raw access log and extract the columns we need
 */
object sparkStatFormatJob {

  def main(args: Array[String]): Unit = {
    // System.setProperty("hadoop.home.dir", "G:/winutils/")
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("sparkStatFormatJob")
      .getOrCreate()

    val access = spark.sparkContext.textFile("file:///G:/data/access.20161111.log")
    // access.take(40).foreach(println)

    access.map(line => {
      val splits = line.split(" ")
      val ip = splits(0)
      // concatenate the two date/time fields back together
      val time = splits(3) + " " + splits(4)
      val url = splits(11).replace("\"", "")
      val traffic = splits(9)
      // (ip, DateUtil.parse(time), url, traffic)
      DateUtil.parse(time) + "\t" + url + "\t" + traffic + "\t" + ip
    }).saveAsTextFile("file:///G:/output/")
    // .take(10).foreach(println)

    spark.stop()
  }
}
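For reference, DateUtil above is a small project helper whose implementation is not shown in this post; it converts the raw access-log timestamp into a standard format. A minimal sketch of what such a helper might look like, assuming the time field has the common access-log shape "[10/Nov/2016:00:01:02 +0800]" (the exact pattern is an assumption, not the project's actual code):

import java.text.SimpleDateFormat
import java.util.Locale

// Hypothetical sketch of the DateUtil helper referenced above.
object DateUtil {
  // Target format: "2016-11-10 00:01:02"
  val TARGET_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
  // Source format assumed here: "[10/Nov/2016:00:01:02 +0800]"
  val SOURCE_FORMAT = new SimpleDateFormat("[dd/MMM/yyyy:HH:mm:ss Z]", Locale.ENGLISH)

  def parse(time: String): String = {
    try {
      TARGET_FORMAT.format(SOURCE_FORMAT.parse(time))
    } catch {
      // Return an empty string for malformed rows instead of failing the task
      case _: Exception => ""
    }
  }
}

Note that SimpleDateFormat is not thread-safe; for a local[2] demo this is tolerable, but in a real job you would create the formatter per partition.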
Printing the output worked fine at first, but when I later saved it to the local filesystem (Windows 7), the job failed with:
Driver stacktrace:
18/07/19 10:42:26 INFO HadoopRDD: Input split: file:/G:/data/access.20161111.log:67108864+33554432
18/07/19 10:42:26 INFO DAGScheduler: Job 0 failed: saveAsTextFile at sparkStatFormatJob.scala:29, took 0.821748 s
18/07/19 10:42:26 INFO Executor: Executor is trying to kill task 2.0 in stage 0.0 (TID 2)
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.NullPointerException
at java.lang.ProcessBuilder.start(ProcessBuilder.java:1012)
at org.apache.hadoop.util.Shell.runCommand(Shell.java:482)
at org.apache.hadoop.util.Shell.run(Shell.java:455)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:715)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:808)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:791)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:656)
at org.apache.hadoop.fs.FilterFileSystem.setPermission(FilterFileSystem.java:490)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:462)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:428)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:908)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:801)
at org.apache.hadoop.mapred.TextOutputFormat.getRecordWriter(TextOutputFormat.java:123)
at org.apache.spark.SparkHadoopWriter.open(SparkHadoopWriter.scala:90)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1206)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1197)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
After searching online, I found the cause: a file that Hadoop needs is missing locally (there is no local Hadoop environment). If Hadoop is already installed on the machine, this problem usually does not occur. If you don't want to install it, you can fix it as follows:
1) Download the required file, winutils.exe:
http://public-repo-1.hortonworks.com/hdp-win-alpha/winutils.exe
2) Place the file in some directory, for example G:\winutils\bin\.
3) Declare at the very start of the program: System.setProperty("hadoop.home.dir", "G:/winutils/")
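With the file in place, step 3 amounts to uncommenting the line that was already at the top of main (the property must point at the directory that contains bin\winutils.exe, and must be set before the SparkSession is created). Using the path from step 2:

def main(args: Array[String]): Unit = {
  // Point Hadoop at the directory whose bin\ subfolder holds winutils.exe,
  // i.e. G:\winutils\bin\winutils.exe, BEFORE building the SparkSession.
  System.setProperty("hadoop.home.dir", "G:/winutils/")

  val spark = SparkSession.builder()
    .master("local[2]")
    .appName("sparkStatFormatJob")
    .getOrCreate()

  // ... rest of the job unchanged
}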