Backfilling data in PySpark by looping over dates

1. Build the date-window list
import datetime

begin = datetime.date(2020, 7, 13)
end = datetime.date(2020, 7, 30)
d = begin
delta = datetime.timedelta(days=1)
date_list = []
while d <= end:
    # strftime already returns a str, so no extra str() wrapper is needed
    print(d.strftime("%Y-%m-%d"))
    date_list.append(d.strftime("%Y-%m-%d"))
    d += delta
print(date_list)

Output:
['2020-07-13', '2020-07-14', '2020-07-15', '2020-07-16', '2020-07-17', '2020-07-18', '2020-07-19', '2020-07-20', '2020-07-21', '2020-07-22', '2020-07-23', '2020-07-24', '2020-07-25', '2020-07-26', '2020-07-27', '2020-07-28', '2020-07-29', '2020-07-30']
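
As a side note, if pandas is available on the driver, the same inclusive date window can be built in a couple of lines. This is only an optional sketch; the helper name build_date_list is an illustration and not part of the original code.

import pandas as pd

# optional sketch: pandas.date_range includes both endpoints by default
def build_date_list(start, end):
    return [d.strftime("%Y-%m-%d") for d in pd.date_range(start, end, freq="D")]

print(build_date_list("2020-07-13", "2020-07-30"))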
2. PySpark backfill loop code
from pyspark.sql import SparkSession
import datetime


if __name__ == "__main__":
    spark = SparkSession.builder\
        .appName("pyspark")\
        .master("yarn")\
        .config("spark.sql.hive.convertMetastoreParquet","false")\
        .config("spark.sql.parquet.mergeSchema", "false")\
        .config("spark.submit.deployMode","client")\
        .config("mapred.input.dir.recursive","true")\
        .config("hive.mapred.supports.subdirectories","true")\
        .config("spark.kryoserializer.buffer.max", "1024m")\
        .config("spark.dynamicAllocation.enabled", "true")\
        .config("spark.shuffle.service.enabled", "true")\
        .config("spark.dynamicAllocation.maxExecutors", "10")\
        .config("spark.sql.auto.repartition","true") \
        .config("hive.exec.dynamic.partition.mode", "nonstrict") \
        .config("hive.exec.dynamic.partition", "true") \
        .config("spark.num.executors","10")\
        .config("spark.executor.cores","1")\
        .config("spark.executor.memory","4g")\
        .config("spark.driver.memory","4g")\
        .config("spark.driver.cores","1")\
        .config("spark.executor.memoryOverhead","1") \
        .config("spark.default.parallelism", "100") \
        .enableHiveSupport()\
        .getOrCreate()


    begin = datetime.date(2020, 7, 13)
    end = datetime.date(2020, 7, 30)

    d = begin
    delta = datetime.timedelta(days=1)
    date_list = []
    while d <= end:
        print(str(d.strftime("%Y-%m-%d")))
        date_list.append(str(d.strftime("%Y-%m-%d")))
        d += delta
    print(date_list)
    # backfill each date via a dynamic-partition insert
    for fill_date in date_list:
        insert_sql = """insert overwrite table tmp partition(biz_date)
         select userid, itemid, biz_date from tmp_1 where biz_date = '{0}' """.format(fill_date)
        print(insert_sql)
        # spark.sql() runs the INSERT eagerly and returns an empty DataFrame,
        # so the original coalesce(1).collect() had no effect and is dropped
        spark.sql(insert_sql)

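The same backfill can also be expressed with the DataFrame API instead of formatted SQL strings. The sketch below is an assumption-based alternative, not the original author's code: it reuses the spark session, the date_list, and the tmp / tmp_1 Hive tables from above, and notes that insertInto resolves columns by position, so the select order must match the target table schema.

# alternative sketch: the same backfill via the DataFrame API (assumes spark and date_list from above)
from pyspark.sql import functions as F

for fill_date in date_list:
    (spark.table("tmp_1")
          .where(F.col("biz_date") == fill_date)
          .select("userid", "itemid", "biz_date")   # column order must match the schema of table tmp
          .write
          .insertInto("tmp", overwrite=True))       # relies on the nonstrict dynamic-partition config above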