1. Writing back to the same Hive table
Code example
import os
import time
import logging
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("mytest") \
    .enableHiveSupport() \
    .getOrCreate()
df = spark.sql("select * from xxx.xxx")
df.show()
# Write back to the very table the DataFrame is being read from
df.write.mode("overwrite").saveAsTable("xxx.xxx")
The exception thrown:
---------------------------------------------------------------------------
AnalysisException                         Traceback (most recent call last)
/tmp/ipykernel_36930/1113601968.py in <module>
     12 df = spark.sql("select * from dev_sztoc_crm.leon_test")
     13 print(df.show())
---> 14 df.write.mode("overwrite").saveAsTable("dev_sztoc_crm.leon_test")

/usr/share/spark3/python/pyspark/sql/readwriter.py in saveAsTable(self, name, format, mode, partitionBy, **options)
   1158         if format is not None:
   1159             self.format(format)
-> 1160         self._jwrite.saveAsTable(name)
   1161
   1162     def json(self, path, mode=None, compression=None, dateFormat=None, timestampFormat=None,

/usr/share/spark3/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1302
   1303         answer = self.gateway_client.send_command(command)
-> 1304         return_value = get_return_value(
   1305             answer, self.gateway_client, self.target_id, self.name)
   1306

/usr/share/spark3/python/pyspark/sql/utils.py in deco(*a, **kw)
    115             # Hide where the exception came from that shows a non-Pythonic
    116             # JVM exception message.
--> 117             raise converted from None
    118         else:
    119             raise

AnalysisException: Cannot overwrite table xxx.xxx that is also being read from
Checking the Hive table afterwards: the data is still there! Spark's analyzer detects that the overwrite target is also a source of the query and aborts before touching any data.
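As a quick sanity check (a minimal snippet, reusing the table name from the traceback above), a count against the table still returns all of the original rows:

# The failed overwrite left the source table untouched
spark.sql("select count(*) from dev_sztoc_crm.leon_test").show()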
Correct code example
import os
import time
import logging
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("pyspark-test") \
    .enableHiveSupport() \
    .getOrCreate()
df = spark.sql("select * from dev_sztoc_crm.leon_test")
# 1. Save the result to a temporary table first
df.write.mode("overwrite").saveAsTable("dev_sztoc_crm.leon_test_tmp")
# 2. Overwrite the target table from the temporary table
spark.table("dev_sztoc_crm.leon_test_tmp").write.mode("overwrite").saveAsTable("dev_sztoc_crm.leon_test")
# 3. Drop the temporary table
spark.sql("DROP TABLE IF EXISTS dev_sztoc_crm.leon_test_tmp")
2. Spark writing back to the same HDFS path
import os
import time
import logging
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("mytest") \
    .getOrCreate()
df = spark.read.csv("xxx")
# Write back to the very path the DataFrame is being read from
df.write.mode("overwrite").save("xxx")
The exception thrown:
Caused by: java.io.FileNotFoundException: File does not exist: /user/leon/overwrite/xxx.csv ...
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
Checking the original HDFS path: the files have already been wiped out! Unlike the Hive case, save in overwrite mode deletes the target directory before the job actually reads its (lazily evaluated) input, so by the time Spark scans the source files they are already gone.
Correct code example
import os
import time
import logging
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("mytest") \
    .getOrCreate()
# Get a handle to the Hadoop FileSystem through py4j
jvm = spark._jvm
jsc = spark._jsc
fs = jvm.org.apache.hadoop.fs.FileSystem.get(jsc.hadoopConfiguration())
ori_path = "/user/leon/overwrite/"
tmp_path = "/user/leon/overwrite_tmp/"
# 1. Write the result to a temporary directory first
df = spark.read.csv(ori_path)
df.write.mode("overwrite").csv(tmp_path)
# 2. Re-read from the temporary directory and overwrite the original path
df = spark.read.csv(tmp_path)
df.write.mode("overwrite").csv(ori_path)
# 3. Delete the temporary directory (recursive=True)
fs.delete(jvm.org.apache.hadoop.fs.Path(tmp_path), True)
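Since the temporary-directory dance is easy to get wrong, it can be wrapped in a small helper. A minimal sketch (the function name overwrite_csv_inplace and the "_tmp" suffix convention are made up for illustration, not part of any API):

def overwrite_csv_inplace(spark, path):
    """Safely rewrite a CSV path by round-tripping through a temp directory."""
    jvm = spark._jvm
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    tmp_path = path.rstrip("/") + "_tmp/"  # hypothetical naming convention
    # Materialize to a side directory, copy back, then clean up
    spark.read.csv(path).write.mode("overwrite").csv(tmp_path)
    spark.read.csv(tmp_path).write.mode("overwrite").csv(path)
    fs.delete(jvm.org.apache.hadoop.fs.Path(tmp_path), True)

# Usage:
overwrite_csv_inplace(spark, "/user/leon/overwrite/")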
3. Conclusion
Reading a Hive table and writing back to the same table, and reading an HDFS path and writing back to the same path, both throw an exception. In the Hive case the original data survives (the write is rejected at analysis time); in the HDFS case the original data is lost (the target directory is deleted before the lazy read runs). To write back to a Hive table, go through a temporary table; to write back to an HDFS path, go through a temporary directory.