PySpark environment setup
1. Put the following jars into D:\Python\python37\Lib\site-packages\pyspark\jars:
   iceberg-spark3-runtime-0.13.1.jar
   alluxio-2.6.2-client.jar
2. Create a conf folder under D:\Python\python37\Lib\site-packages\pyspark and put hdfs-site.xml and hive-site.xml into it.
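If the install path differs, the two directories used in steps 1 and 2 can be derived from the pyspark package itself; a small optional sanity-check snippet that just prints them:

import os
import pyspark

pyspark_home = os.path.dirname(pyspark.__file__)
print(os.path.join(pyspark_home, 'jars'))  # drop the Iceberg / Alluxio jars here
print(os.path.join(pyspark_home, 'conf'))  # create this folder for hdfs-site.xml / hive-site.xml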
Code
import os
import warnings
import argparse
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructField, StructType, DecimalType, IntegerType, TimestampType, StringType
import pypinyin
warnings.filterwarnings("ignore")
def get_spark():
    os.environ.setdefault('HADOOP_USER_NAME', 'root')
    spark = SparkSession.builder \
        .config('spark.sql.debug.maxToStringFields', 2000) \
        .config('spark.debug.maxToStringFields', 2000) \
        .getOrCreate()
    # Register an Iceberg catalog named "iceberg" backed by the Hive metastore
    spark.conf.set("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    spark.conf.set("spark.sql.catalog.iceberg.type", "hive")
    spark.conf.set("spark.sql.catalog.iceberg.uri", "thrift://192.168.x.xx:9083")
    spark.conf.set("spark.sql.iceberg.handle-timestamp-without-timezone", True)
    # Without this setting: Cannot handle timestamp without timezone fields in Spark. Spark does
    # not natively support this type, but if you would like to handle all timestamps as timestamp
    # with timezone, set 'spark.sql.iceberg.handle-timestamp-without-timezone' to true.
    spark.conf.set("spark.sql.sources.partitionOverwriteMode", "DYNAMIC")
    # spark.conf.set("spark.sql.storeAssignmentPolicy", "LEGACY")
    # https://www.cnblogs.com/songchaolin/p/12098618.html
    # pyspark.sql.utils.AnalysisException: LEGACY store assignment policy is disallowed in Spark
    # data source V2. Please set the configuration spark.sql.storeAssignmentPolicy to other values.
    return spark
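# Note: the same catalog settings can also be supplied when the session is built, which is
# how the Iceberg docs typically list them (as --conf options). A minimal sketch, reusing the
# placeholder metastore URI from above:
#
#   spark = SparkSession.builder \
#       .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog") \
#       .config("spark.sql.catalog.iceberg.type", "hive") \
#       .config("spark.sql.catalog.iceberg.uri", "thrift://192.168.x.xx:9083") \
#       .config("spark.sql.iceberg.handle-timestamp-without-timezone", "true") \
#       .getOrCreate()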
def Capitalize_hanzipinyin(word):
    # Capitalized pinyin of a Chinese string (minimal implementation via the pypinyin import above)
    pinyin = pypinyin.lazy_pinyin(word) if word else []
    return ''.join(p.capitalize() for p in pinyin)
def main_run(dt):
    table_name = 'iceberg.xxx.xxx'
    target_table_name = 'iceberg.xxx.xxx'
    target_table_name_columns = ['A', 'B', 'A_pinyin']  # column order of the target table
    sql = """
        select
            A, B
        from
            %s
        where
            dt = '%s'
    """ % (table_name, dt)
    spark = get_spark()
    spark_df = spark.sql(sql)
    toPinyinUDF = udf(Capitalize_hanzipinyin, StringType())
    spark_df = spark_df.withColumn('A_pinyin', toPinyinUDF('A'))
    # The blocks below are alternative ways to write the partition into the target table.
    # solution 1: delete the partition first, then append
    delete_sql = "delete from %s where dt = '%s' " % (target_table_name, dt)
    spark.sql(delete_sql)
    spark_df.write.saveAsTable(target_table_name, None, "append", partitionBy='dt')
    # solution 2: insert overwrite via a temporary view
    spark_df.createOrReplaceTempView("test")  # create a temporary view
    spark.sql(
        "insert overwrite table %s partition(dt) select A,B,A_pinyin from test" % target_table_name)
    # Using select * here fails with: Cannot safely cast '': string to int
    # solution 3: insertInto with overwrite=True
    new_spark_df = spark.sql("SELECT A,B,A_pinyin from test")
    new_spark_df.write.insertInto(target_table_name, True)
    # solution 4: this would overwrite ALL data in the table
    # new_spark_df.write.saveAsTable(target_table_name, None, "overwrite", partitionBy='dt')
    # solution 5: convert the Spark DataFrame to a pandas DataFrame; data types may fail to match
    df = spark_df.toPandas()
    # After the conversion to pandas, columns that were integer in Spark can come back as float
    df['A_pinyin'] = df['A'].apply(Capitalize_hanzipinyin)
    df = df[target_table_name_columns]  # reorder the columns to match the target table
    schema = StructType([
        StructField("A", StringType(), True),
        ...
    ])
    # With an explicit schema: field A: IntegerType can not accept object 2.0 in type <class 'float'>
    DF = spark.createDataFrame(df, schema)
    # Without a schema: ValueError: Some of types cannot be determined after inferring
    # (some columns have types Spark cannot infer)
    DF.write.insertInto(target_table_name, True)
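A minimal entry point for running the job, assuming the partition date is passed as a --dt argument (the flag name and invocation are illustrative, not part of the original script):

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dt', required=True, help="partition date, e.g. '2022-01-01'")
    args = parser.parse_args()
    main_run(args.dt)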