1. Reading an HDFS file with Scala

The loader below opens an IP-rules file on HDFS when the object is first referenced, splits each line on "|", and caches (startIp, endIp, province, city) tuples in memory.
import java.io.{BufferedReader, InputStreamReader}
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import scala.collection.mutable.ArrayBuffer
object IpRulesLoader {

  // In-memory cache of IP rules: (startIp, endIp, province, city)
  val ipRel: ArrayBuffer[(Long, Long, String, String)] = new ArrayBuffer[(Long, Long, String, String)]()

  // The object body runs once, on first reference: open the rule file on HDFS
  private val fileSystem: FileSystem = FileSystem.get(new URI("hdfs://xxx.xx.x.xxx:9000"), new Configuration())
  private val stream: FSDataInputStream = fileSystem.open(new Path("/ip/ip.txt"))
  private val br = new BufferedReader(new InputStreamReader(stream))

  // Read line by line; fields are "|"-separated, with the range bounds at
  // indices 2-3 and province/city at indices 6-7
  var line: String = br.readLine()
  while (line != null) {
    val sp: Array[String] = line.split("[|]")
    val start: Long = sp(2).toLong
    val end: Long = sp(3).toLong
    val province: String = sp(6)
    val city: String = sp(7)
    ipRel += ((start, end, province, city))
    line = br.readLine()
  }
  br.close()

  def getAllRules(): ArrayBuffer[(Long, Long, String, String)] = ipRel
}
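A quick way to exercise the loader: convert a dotted IPv4 address to the same Long form the rules use and scan for the covering range. The ip2Long helper and the sample address below are illustrative assumptions, not part of the original loader.

object IpRulesLoaderDemo {

  // Hypothetical helper: dotted IPv4 string -> unsigned 32-bit value as Long
  def ip2Long(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((acc, part) => acc * 256L + part.toLong)

  def main(args: Array[String]): Unit = {
    val rules = IpRulesLoader.getAllRules()
    val ipNum = ip2Long("1.2.3.4") // sample address, assumed to fall inside some rule
    rules.find { case (start, end, _, _) => ipNum >= start && ipNum <= end } match {
      case Some((_, _, province, city)) => println(s"$province $city")
      case None                         => println("no matching rule")
    }
  }
}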
2. Reading HDFS files with Spark and recovering the file name

Downcasting the RDD returned by hadoopFile to a HadoopRDD exposes each partition's InputSplit, and the FileSplit inside it carries the path of the source file, so every line can be tagged with the name of the file it came from.
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileSplit, InputSplit, TextInputFormat}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.{HadoopRDD, RDD}
import org.apache.spark.sql.SparkSession
// An enclosing object (the name here is arbitrary) so that main can run
object ReadFileWithName {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName.filter(!_.equals('$')))
      .master("local[*]")
      .config("spark.debug.maxToStringFields", 1000)
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext

    val input = "hdfs://xxxxx.x.xxx:8020/user/groundtruth/"

    // Unlike textFile, hadoopFile keeps the underlying InputSplit available
    val fileRDD = sc.hadoopFile[LongWritable, Text, TextInputFormat](input)
    val hadoopRDD = fileRDD.asInstanceOf[HadoopRDD[LongWritable, Text]]

    // Each partition maps to one FileSplit, so the source file name is in scope here
    val fileNameAndLine: RDD[String] = hadoopRDD.mapPartitionsWithInputSplit(
      (inputSplit: InputSplit, iterator: Iterator[(LongWritable, Text)]) => {
        val file = inputSplit.asInstanceOf[FileSplit]
        // Prefix each line with the 4th "_"-separated token of the file name
        iterator.map(x => file.getPath.getName.split("_")(3) + "," + x._2.toString)
      })

    fileNameAndLine.take(10).foreach(println)
    spark.stop()
  }
}
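If the full source path is enough (rather than a token parsed out of the file name), Spark's built-in input_file_name() column function reaches the same result through the DataFrame API. A fragment that could sit in the same main, before spark.stop(); the directory is the placeholder path from above.

import org.apache.spark.sql.functions.input_file_name

// Tag each line with the full path of the file it was read from
val withName = spark.read
  .textFile("hdfs://xxxxx.x.xxx:8020/user/groundtruth/")
  .withColumn("file", input_file_name())
withName.show(10, truncate = false)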
3. Getting a file's latest update time from HDFS with Java

The method below returns the newest timestamp recorded inside a timetable CSV: it opens the file directly (or the last .csv found when the path is a directory), strips quotes, and takes the first field of the final line.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public static String getNewestTime(String pathName) throws Exception {
    FileSystem fs = FileSystem.get(new URI("hdfs://192.168.xxx.xxx:xx20/"),
            new Configuration(), "root");
    Path path = new Path("/user/timetable/" + pathName);
    FSDataInputStream fis = null;
    if (fs.isDirectory(path)) {
        // Scan the directory (non-recursively); the last .csv found wins
        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(path, false);
        while (listFiles.hasNext()) {
            LocatedFileStatus fileStatus = listFiles.next();
            String name = fileStatus.getPath().getName();
            if (name.contains(".csv")) {
                if (fis != null) {
                    fis.close(); // don't leak the previously opened stream
                }
                fis = fs.open(fileStatus.getPath());
            }
        }
        if (fis == null) {
            throw new IllegalStateException("no .csv file found under " + path);
        }
    } else {
        fis = fs.open(path);
    }
    // Read the whole file; the timestamp is the first field of the last line
    BufferedReader br = new BufferedReader(new InputStreamReader(fis));
    List<String> list = new ArrayList<>();
    String line;
    while ((line = br.readLine()) != null) {
        list.add(line.replace("\"", "")); // drop surrounding quotes
    }
    String newTime = list.get(list.size() - 1).split(",")[0];
    br.close();
    fis.close();
    fs.close();
    return newTime;
}
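If what is wanted is HDFS's own idea of when the file last changed, rather than a timestamp stored in its contents, the NameNode already tracks it in the file's FileStatus. A minimal Scala sketch under the same placeholder URI and path prefix as above:

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Ask HDFS for the modification time directly (epoch milliseconds),
// with no need to read the file at all
def getModificationTime(pathName: String): Long = {
  val fs = FileSystem.get(new URI("hdfs://192.168.xxx.xxx:xx20/"), new Configuration(), "root")
  try fs.getFileStatus(new Path("/user/timetable/" + pathName)).getModificationTime
  finally fs.close()
}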