PySpark: backfill data by looping over a date range
1. Build the date-window list
import datetime

begin = datetime.date(2020, 7, 13)
end = datetime.date(2020, 7, 30)
d = begin
delta = datetime.timedelta(days=1)
date_list = []
# Walk from begin to end (inclusive), collecting each day as a "YYYY-MM-DD" string
while d <= end:
    print(d.strftime("%Y-%m-%d"))
    date_list.append(d.strftime("%Y-%m-%d"))
    d += delta
print(date_list)
Output:
['2020-07-13', '2020-07-14', '2020-07-15', '2020-07-16', '2020-07-17', '2020-07-18', '2020-07-19', '2020-07-20', '2020-07-21', '2020-07-22', '2020-07-23', '2020-07-24', '2020-07-25', '2020-07-26', '2020-07-27', '2020-07-28', '2020-07-29', '2020-07-30']
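As a side note, the same list can be built in one line with pandas; this is only an optional sketch and assumes pandas is installed on the machine running the script:

import pandas as pd

# pd.date_range includes both endpoints with the default daily frequency
date_list = [d.strftime("%Y-%m-%d") for d in pd.date_range("2020-07-13", "2020-07-30")]
print(date_list)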
2. The PySpark backfill loop
from pyspark.sql import SparkSession
import datetime

if __name__ == "__main__":
    # Enable Hive support and dynamic partitions for the INSERT OVERWRITE ... PARTITION(biz_date) backfill below
    spark = SparkSession.builder \
        .appName("pyspark") \
        .master("yarn") \
        .config("spark.sql.hive.convertMetastoreParquet", "false") \
        .config("spark.sql.parquet.mergeSchema", "false") \
        .config("spark.submit.deployMode", "client") \
        .config("mapred.input.dir.recursive", "true") \
        .config("hive.mapred.supports.subdirectories", "true") \
        .config("spark.kryoserializer.buffer.max", "1024m") \
        .config("spark.dynamicAllocation.enabled", "true") \
        .config("spark.shuffle.service.enabled", "true") \
        .config("spark.dynamicAllocation.maxExecutors", "10") \
        .config("spark.sql.auto.repartition", "true") \
        .config("hive.exec.dynamic.partition.mode", "nonstrict") \
        .config("hive.exec.dynamic.partition", "true") \
        .config("spark.executor.instances", "10") \
        .config("spark.executor.cores", "1") \
        .config("spark.executor.memory", "4g") \
        .config("spark.driver.memory", "4g") \
        .config("spark.driver.cores", "1") \
        .config("spark.executor.memoryOverhead", "1g") \
        .config("spark.default.parallelism", "100") \
        .enableHiveSupport() \
        .getOrCreate()
    # Build the list of dates to backfill
    begin = datetime.date(2020, 7, 13)
    end = datetime.date(2020, 7, 30)
    d = begin
    delta = datetime.timedelta(days=1)
    date_list = []
    while d <= end:
        print(d.strftime("%Y-%m-%d"))
        date_list.append(d.strftime("%Y-%m-%d"))
        d += delta
    print(date_list)
    # Backfill each date; the dynamic partition value biz_date comes from the SELECT
    for fill_date in date_list:
        insert_sql = """insert overwrite table tmp partition(biz_date)
            select userid, itemid, biz_date from tmp_1 where biz_date = '{0}'""".format(fill_date)
        print(insert_sql)
        spark.sql(insert_sql)  # INSERT statements execute eagerly; no collect() is needed
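Since the INSERT uses a dynamic partition (partition(biz_date)) and selects biz_date from the source table, the whole window can also be backfilled in a single statement instead of one Spark job per day; the per-day loop mainly helps when a single day is already heavy or when you want to retry individual days. A minimal single-statement sketch that would replace the loop above, assuming the same tmp / tmp_1 tables and the session configured above:

    # One dynamic-partition insert covering the whole window; only the
    # partitions that actually receive rows from the SELECT are overwritten.
    backfill_sql = """insert overwrite table tmp partition(biz_date)
        select userid, itemid, biz_date from tmp_1
        where biz_date >= '2020-07-13' and biz_date <= '2020-07-30'"""
    spark.sql(backfill_sql)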