1. Reading an HDFS file with Scala

The loader below opens an IP-rules file on HDFS when the object is first referenced, splits each line on "|", and caches (startIp, endIp, province, city) tuples in memory.
import java.io.{BufferedReader, InputStreamReader}
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import scala.collection.mutable.ArrayBuffer
object IpRulesLoader {

  // In-memory cache of IP rules: (startIp, endIp, province, city)
  val ipRel: ArrayBuffer[(Long, Long, String, String)] = new ArrayBuffer[(Long, Long, String, String)]()

  // The object body runs once, on first reference: open the rule file on HDFS
  private val fileSystem: FileSystem = FileSystem.get(new URI("hdfs://xxx.xx.x.xxx:9000"), new Configuration())
  private val stream: FSDataInputStream = fileSystem.open(new Path("/ip/ip.txt"))
  private val br = new BufferedReader(new InputStreamReader(stream))

  // Read line by line; fields are "|"-separated, with the range bounds at
  // indices 2-3 and province/city at indices 6-7
  var line: String = br.readLine()
  while (line != null) {
    val sp: Array[String] = line.split("[|]")
    val start: Long = sp(2).toLong
    val end: Long = sp(3).toLong
    val province: String = sp(6)
    val city: String = sp(7)
    ipRel += ((start, end, province, city))
    line = br.readLine()
  }
  br.close()

  def getAllRules(): ArrayBuffer[(Long, Long, String, String)] = ipRel
}
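A quick way to exercise the loader: convert a dotted IPv4 address to the same Long form the rules use and scan for the covering range. The ip2Long helper and the sample address below are illustrative assumptions, not part of the original loader.

object IpRulesLoaderDemo {

  // Hypothetical helper: dotted IPv4 string -> unsigned 32-bit value as Long
  def ip2Long(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((acc, part) => acc * 256L + part.toLong)

  def main(args: Array[String]): Unit = {
    val rules = IpRulesLoader.getAllRules()
    val ipNum = ip2Long("1.2.3.4") // sample address, assumed to fall inside some rule
    rules.find { case (start, end, _, _) => ipNum >= start && ipNum <= end } match {
      case Some((_, _, province, city)) => println(s"$province $city")
      case None                         => println("no matching rule")
    }
  }
}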
2. Reading HDFS files with Spark and recovering the file name

Downcasting the RDD returned by hadoopFile to a HadoopRDD exposes each partition's InputSplit, and the FileSplit inside it carries the path of the source file, so every line can be tagged with the name of the file it came from.
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileSplit, InputSplit, TextInputFormat}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.{HadoopRDD, RDD}
import org.apache.spark.sql.SparkSession
// An enclosing object (the name here is arbitrary) so that main can run
object ReadFileWithName {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName.filter(!_.equals('$')))
      .master("local[*]")
      .config("spark.debug.maxToStringFields", 1000)
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext

    val input = "hdfs://xxxxx.x.xxx:8020/user/groundtruth/"

    // Unlike textFile, hadoopFile keeps the underlying InputSplit available
    val fileRDD = sc.hadoopFile[LongWritable, Text, TextInputFormat](input)
    val hadoopRDD = fileRDD.asInstanceOf[HadoopRDD[LongWritable, Text]]

    // Each partition maps to one FileSplit, so the source file name is in scope here
    val fileNameAndLine: RDD[String] = hadoopRDD.mapPartitionsWithInputSplit(
      (inputSplit: InputSplit, iterator: Iterator[(LongWritable, Text)]) => {
        val file = inputSplit.asInstanceOf[FileSplit]
        // Prefix each line with the 4th "_"-separated token of the file name
        iterator.map(x => file.getPath.getName.split("_")(3) + "," + x._2.toString)
      })

    fileNameAndLine.take(10).foreach(println)
    spark.stop()
  }
}
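If the full source path is enough (rather than a token parsed out of the file name), Spark's built-in input_file_name() column function reaches the same result through the DataFrame API. A fragment that could sit in the same main, before spark.stop(); the directory is the placeholder path from above.

import org.apache.spark.sql.functions.input_file_name

// Tag each line with the full path of the file it was read from
val withName = spark.read
  .textFile("hdfs://xxxxx.x.xxx:8020/user/groundtruth/")
  .withColumn("file", input_file_name())
withName.show(10, truncate = false)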
3. Getting a file's latest update time from HDFS with Java

The method below returns the newest timestamp recorded inside a timetable CSV: it opens the file directly (or the last .csv found when the path is a directory), strips quotes, and takes the first field of the final line.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public static String getNewestTime(String pathName) throws Exception {
    FileSystem fs = FileSystem.get(new URI("hdfs://192.168.xxx.xxx:xx20/"),
            new Configuration(), "root");
    Path path = new Path("/user/timetable/" + pathName);
    FSDataInputStream fis = null;
    if (fs.isDirectory(path)) {
        // Scan the directory (non-recursively); the last .csv found wins
        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(path, false);
        while (listFiles.hasNext()) {
            LocatedFileStatus fileStatus = listFiles.next();
            String name = fileStatus.getPath().getName();
            if (name.contains(".csv")) {
                if (fis != null) {
                    fis.close(); // don't leak the previously opened stream
                }
                fis = fs.open(fileStatus.getPath());
            }
        }
        if (fis == null) {
            throw new IllegalStateException("no .csv file found under " + path);
        }
    } else {
        fis = fs.open(path);
    }
    // Read the whole file; the timestamp is the first field of the last line
    BufferedReader br = new BufferedReader(new InputStreamReader(fis));
    List<String> list = new ArrayList<>();
    String line;
    while ((line = br.readLine()) != null) {
        list.add(line.replace("\"", "")); // drop surrounding quotes
    }
    String newTime = list.get(list.size() - 1).split(",")[0];
    br.close();
    fis.close();
    fs.close();
    return newTime;
}
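If what is wanted is HDFS's own idea of when the file last changed, rather than a timestamp stored in its contents, the NameNode already tracks it in the file's FileStatus. A minimal Scala sketch under the same placeholder URI and path prefix as above:

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Ask HDFS for the modification time directly (epoch milliseconds),
// with no need to read the file at all
def getModificationTime(pathName: String): Long = {
  val fs = FileSystem.get(new URI("hdfs://192.168.xxx.xxx:xx20/"), new Configuration(), "root")
  try fs.getFileStatus(new Path("/user/timetable/" + pathName)).getModificationTime
  finally fs.close()
}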