String-to-Int Conversion Fails for an .rtf File on macOS

Problem: a Spark job on macOS that prints the maximum and minimum numbers in a file.

The input file contains one integer per line.

The code:

import org.apache.spark.{SparkConf, SparkContext}

object MaxAndMin {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local").setAppName("MaxAndMin")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("ERROR")

    val lines = sc.textFile("hdfs://localhost:9000/b.txt") // fails when the path points to b.rtf
    lines.filter(_.trim.length > 0).
      map(line => ("key", line.trim.toInt)). // the NumberFormatException is thrown here
      groupByKey().                          // gathers all values under the single "key"
      map(x => {
        var max = Int.MinValue
        var min = Int.MaxValue
        for (num <- x._2) {
          if (num > max)
            max = num
          if (num < min)
            min = num
        }
        (max, min)
      }).collect.foreach(x => {
        println("max:\t" + x._1)
        println("min:\t" + x._2)
      })

    sc.stop()
  }
}

The exception:

ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NumberFormatException: For input string: "{\rtf1\ansi\ansicpg936\cocoartf1561\cocoasubrtf400"
	at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
	at java.lang.Integer.parseInt(Integer.java:580)
	at java.lang.Integer.parseInt(Integer.java:615)
	at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:272)
	at scala.collection.immutable.StringOps.toInt(StringOps.scala:29)
	at MaxAndMin$$anonfun$2.apply(MaxAndMin.scala:11)
	at MaxAndMin$$anonfun$2.apply(MaxAndMin.scala:11)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:150)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
18/06/23 22:27:48 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.NumberFormatException: For input string: "{\rtf1\ansi\ansicpg936\cocoartf1561\cocoasubrtf400"
	[... stack trace identical to the executor trace above ...]

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:934)
	at MaxAndMin$.main(MaxAndMin.scala:23)
	at MaxAndMin.main(MaxAndMin.scala)
Caused by: java.lang.NumberFormatException: For input string: "{\rtf1\ansi\ansicpg936\cocoartf1561\cocoasubrtf400"
	[... stack trace identical to the executor trace above ...]
As the trace shows, the failure comes from MaxAndMin.scala:11, i.e. the call

map(line => ("key", line.trim.toInt))

So when exactly does toInt throw this exception?

toInt throws a NumberFormatException whenever the string is not a valid integer, as in "abc".toInt. An .rtf file is not plain text: it begins with markup such as {\rtf1\ansi\ansicpg936... and carries formatting control words throughout, and .trim only strips surrounding whitespace, not that markup, so toInt fails on the very first line.
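This is easy to reproduce outside Spark. A minimal sketch for the Scala REPL, with the header string copied verbatim from the trace above (rtfHeader is just an illustrative name):

import scala.util.Try

val rtfHeader = """{\rtf1\ansi\ansicpg936\cocoartf1561\cocoasubrtf400"""

// trim only removes leading/trailing whitespace; the RTF control words survive
println(rtfHeader.trim)            // {\rtf1\ansi\ansicpg936\cocoartf1561\cocoasubrtf400
println(Try(rtfHeader.trim.toInt)) // Failure(java.lang.NumberFormatException: For input string: ...)
println(Try("  42  ".trim.toInt))  // Success(42)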

Solution: use a plain-text .txt file as the source instead.

To create one on macOS, open the file in TextEdit and choose Format → Make Plain Text (⇧⌘T) before saving; an existing .rtf can also be converted from Terminal with the built-in textutil tool (textutil -convert txt b.rtf) and the resulting b.txt re-uploaded to HDFS.
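If the job should also survive the occasional unparseable line regardless of where the data comes from, one option is to wrap the parse in scala.util.Try and drop the failures. The sketch below is an alternative, not the code from this post (the object name MaxAndMinSafe is illustrative); it also replaces groupByKey with the RDD's built-in max() and min(), which avoids shuffling every value to a single key:

import scala.util.Try
import org.apache.spark.{SparkConf, SparkContext}

object MaxAndMinSafe {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local").setAppName("MaxAndMinSafe"))
    sc.setLogLevel("ERROR")

    // flatMap + Try silently discards anything that does not parse as an Int,
    // e.g. RTF markup such as "{\rtf1\ansi..." or blank lines
    val nums = sc.textFile("hdfs://localhost:9000/b.txt")
      .flatMap(line => Try(line.trim.toInt).toOption)

    if (!nums.isEmpty()) {
      println("max:\t" + nums.max())
      println("min:\t" + nums.min())
    }
    sc.stop()
  }
}

The trade-off: bad input is dropped silently instead of failing fast, so converting the file to plain text is still the cleaner fix.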
