// Intersection of the MMSI whitelist with the MMSIs that actually have data
// files; cached because it is reused several times below (saveAsTextFile,
// count, collect).
val ISRDD = MMSIRDD.intersection(filesRDD).cache()
// Driver-side progress counter, seeded with 1 so `sum` reads as a 1-based
// "currently processing ship #N" index in the log line further down.
// NOTE(review): the name "mmsi-account" looks like a typo for "mmsi-count";
// kept as-is since the accumulator name shows up in the Spark UI.
val longAccumulator = sc.longAccumulator("mmsi-account")
longAccumulator.add(1)

/***** Save the MMSI intersection *****/
// val savafiledir = "D:\Hadoop\ship\record"
val savafiledir = args(2) // output directory, taken from the CLI arguments
// valid MMSIs, forced into a single part file
ISRDD.coalesce(1).saveAsTextFile(savafiledir)

val ISRDDCount = ISRDD.count()
// number of valid MMSI entries, written as a one-line text file under <dir>/count
sc.parallelize(List(ISRDDCount)).coalesce(1).saveAsTextFile(savafiledir + "/count")

println("------------一共N条有效的MMSI数据:" + ISRDDCount)
println("partition:" + ISRDD.getNumPartitions)
// filesRDD.foreach(x => println(x))
// Collect the (assumed small) MMSI set to the driver and broadcast it for the
// per-file processing loop that follows.
val fileBroadcast = sc.broadcast(ISRDD.collect())
println("-------------------------------开始执行-------------------------------------")
// Driver-side loop over every collected MMSI file path.
// NOTE(review): iterating fileBroadcast.value runs on the driver, so the
// broadcast buys nothing here — the plain ISRDD.collect() array would behave
// identically; broadcast only helps when executors read the value.
fileBroadcast.value.foreach({
// filesRDD.foreach({
// files.foreach({
// filePath = null; fileName = null; fileRDD = null;
filestr => {
// Per-iteration state; all assigned inside the try below.
var file:File = null
var fileName:String = null
var dir:String = null
var mmsi:String = null
var currentFileRDD:RDD[String] = null;
try {
file = new File(filestr)
fileName = file.getName
// Local-filesystem URI of this ship's CSV; `pathdir` is defined earlier in the file.
dir = “file:///” + pathdir + “/” + fileName
// Strip the 4-character extension (presumably ".csv") to recover the MMSI — TODO confirm.
mmsi = fileName.substring(0, fileName.length - 4)
currentFileRDD = sc.textFile(dir)
// Count lines in the file
val count = currentFileRDD.count()
// Only process files with at least one data row beyond the header.
if (count > 1) {
// Read the first CSV row and check the expected header columns are present.
val verifyCSV = spark.read.csv(dir).rdd.first().toString()
val f1 = verifyCSV.contains(“Longitude”)
val f2 = verifyCSV.contains(“Latitude”)
// println(“verifyCSV”, f1, f2)
if (f1 && f2) {
// Append this MMSI to a driver-local MMSIFile.
// NOTE(review): substring(8, ...) assumes savafiledir starts with the
// 8-character "file:///" prefix — confirm against how args(2) is passed.
// `fileWriter`/`bufferedWriter` are vars declared earlier in the file.
fileWriter = new FileWriter(savafiledir.substring(8,savafiledir.length) + “/MMSIFile”, true)
bufferedWriter = new BufferedWriter(fileWriter)
bufferedWriter.write(mmsi+“\n”)
bufferedWriter.close(); bufferedWriter = null; fileWriter = null
// longAccumulator.add(1) ISRDDCount
// Progress line: "processing ship #sum, (total - sum) ships remaining".
println(“============================== 正在执行第 " + longAccumulator.sum +” 条船 (MMSI) 剩余 " + (ISRDDCount - longAccumulator.sum ) +" 条船 (MMSI) =================================”)
/* -------------------------------------------- divider -------------------------------------------- */
// println(“/遍历目录*****/”)
// `data` (defined earlier) appears to be a yyyyMM month key — see the filter below.
println(“date:” + data + " mmsi:" + mmsi + " fileName:" + fileName + " file numbere of rows:" + count)
// Load the CSV (with header, inferred schema) into a DataFrame
val df = spark.read.option(“header”, “true”).option(“inferSchema”, “true”).csv(dir)
// record count for this single file (`total` is a var declared earlier)
total = df.count()
println(“mmsi:” + mmsi + “,一共有:” + total + " 条记录!”)
/* Build the set of distinct dates for the month */
// Register a temp view named "route" over this file's rows
df.createOrReplaceTempView(“route”)
// Distinct yyyyMMdd dates, time-ordered, null-free
val dicData = spark.sql(“SELECT LEFT(Time,8) AS Date FROM route GROUP BY Date ORDER BY Date”).na.drop()
// Row.toString renders as "[20190101]", so substring(1, 9) strips the brackets;
// keep only dates belonging to the target month `data`.
dateList = dicData.collect().map(x => x.toString().substring(1, 9)).filter(_.substring(0, 6) == data)
dateList.foreach(x => print(x + " "))
// RDD of the month's dates (used later, presumably to strip the month — TODO confirm)
dateListRDD = sc.parallelize(dateList)
// Number of days in the month // first day // last day.
// NOTE(review): dateList(0) throws on an empty array — possible if no rows
// match the month filter above despite count > 1; consider guarding.
allDays = dateList.length;
startDate = dateList(0);
endDate = dateList(allDays - 1)
println(“\n当月首日日期:” + startDate +