pyspark参数设置,常用时间函数

48 篇文章 0 订阅
36 篇文章 0 订阅
spark-submit --master yarn --deploy-mode client --num-executors 8 --driver-memory 4g --executor-memory 2g spark_demo.py
# Session "version_1": YARN master with Hive support.
#
# The builder chain is wrapped in parentheses rather than joined with
# backslashes: a `#` comment line inside a backslash-continued statement
# ends the logical line, so the `.config(...)` lines that followed the
# commented-out options were a syntax error. Inside parentheses, comment
# lines are allowed anywhere in the chain.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("version_1")
    # Disabled option: .config("spark.sql.warehouse.dir", "spark.warehouse")
    # NOTE(review): the heartbeat interval is given without a unit suffix;
    # check the Spark configuration docs for this key's default unit --
    # "120s" may have been intended.
    .config("spark.executor.heartbeatInterval", 120)
    .config("spark.debug.maxToStringFields", 1000)
    # Disabled option: .config("spark.sql.execution.arrow.enabled", "true")
    # Serialization
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "256m")
    # RPC / ports / scheduler waiting
    .config("spark.rpc.message.maxSize", 256)
    .config("spark.port.maxRetries", 32)
    .config("spark.rpc.numRetries", 9)
    .config("spark.rpc.retry.wait", "9s")
    .config("spark.scheduler.maxRegisteredResourcesWaitingTime", "120s")
    # Dynamic allocation (set together with the external shuffle service)
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.maxExecutors", 108)
    .config("spark.dynamicAllocation.minExecutors", 1)
    .config("spark.dynamicAllocation.schedulerBacklogTimeout", "10s")
    .config("spark.dynamicAllocation.executorIdleTimeout", "120s")
    .config("spark.network.timeout", "240s")
    # Executor resources
    .config("spark.executor.memory", "4g")
    .config("spark.executor.memoryOverhead", "2g")
    # Driver resources
    # NOTE(review): driver settings applied from inside the application may
    # not take effect when the driver JVM is already running (e.g. launched
    # via spark-submit in client mode) -- confirm against the Spark docs.
    .config("spark.driver.cores", 16)
    .config("spark.driver.memory", "64g")
    .config("spark.driver.maxResultSize", "4g")
    # Python workers
    .config("spark.python.worker.memory", "1g")
    .config("spark.python.worker.reuse", "true")
    .enableHiveSupport()
    .getOrCreate()
)
# Session "version_2": YARN master with Hive support and Arrow enabled.
# Written as one parenthesised method chain; options are applied in the
# same order as before and grouped by topic with comments.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("version_2")
    # Serialization / debug output
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "512m")
    .config("spark.debug.maxToStringFields", 1024)
    .config("spark.broadcast.blockSize", "4m")
    # Heartbeat / network / RPC
    # NOTE(review): the heartbeat interval has no unit suffix; confirm the
    # intended unit against the Spark configuration docs.
    .config("spark.executor.heartbeatInterval", 240)
    .config("spark.network.timeout", "360s")
    .config("spark.rpc.message.maxSize", 256)
    .config("spark.port.maxRetries", 32)
    .config("spark.rpc.numRetries", 24)
    .config("spark.rpc.retry.wait", "3s")
    # Shuffle I/O
    .config("spark.shuffle.io.numConnectionsPerPeer", 3)
    .config("spark.shuffle.io.maxRetries", 9)
    # SQL / scheduler
    .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.scheduler.maxRegisteredResourcesWaitingTime", "120s")
    # Driver resources
    .config("spark.driver.cores", 16)
    .config("spark.driver.memory", "64g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.driver.memoryOverhead", "2G")
    # Executor resources
    .config("spark.executor.memory", "4g")
    .config("spark.executor.memoryOverhead", "2g")
    .enableHiveSupport()
    .getOrCreate()
)
# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# On Python 3 `reload` is not a builtin and `sys.setdefaultencoding` does not
# exist, so running these two lines unguarded raises NameError there; the
# version check keeps the snippet runnable on both major versions.
if sys.version_info[0] == 2:
    # Python 2 removes `setdefaultencoding` from `sys` at startup;
    # reloading the module makes it available again.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')

# Session "version_3": YARN master with Hive support and Arrow enabled.
# Same options in the same order, written as a parenthesised chain and
# grouped by topic.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("version_3")
    # Serialization / debug output / SQL
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "512m")
    .config("spark.debug.maxToStringFields", 1024)
    .config("spark.sql.execution.arrow.enabled", "true")
    # Driver resources
    .config("spark.driver.cores", 4)
    .config("spark.driver.memory", "128g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.driver.memoryOverhead", "4g")
    # Executor resources
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "6g")
    .config("spark.executor.memoryOverhead", "4g")
    # Parallelism / housekeeping
    .config("spark.dynamicAllocation.maxExecutors", 1000)
    .config("spark.default.parallelism", 1000)
    .config("spark.cleaner.periodicGC.interval", "10min")
    .enableHiveSupport()
    .getOrCreate()
)
#常用时间函数
from __future__ import division, print_function, absolute_import
import sys, time, datetime
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn

# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# On Python 3 `reload` is not a builtin and `sys.setdefaultencoding` does not
# exist, so running these two lines unguarded raises NameError there; the
# version check keeps the snippet runnable on both major versions.
if sys.version_info[0] == 2:
    # Python 2 removes `setdefaultencoding` from `sys` at startup;
    # reloading the module makes it available again.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')

# Session used by the time-helper examples: YARN master with Hive support.
# Same options in the same order, written as a parenthesised chain and
# grouped by topic.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("xxx")
    # Serialization / debug output
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    .config("spark.debug.maxToStringFields", 1024)
    # Driver resources
    .config("spark.driver.cores", 4)
    .config("spark.driver.memory", "16g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.driver.memoryOverhead", "4g")
    # Executor resources
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "16g")
    .config("spark.executor.memoryOverhead", "4g")
    # Parallelism / housekeeping
    .config("spark.dynamicAllocation.maxExecutors", 1000)
    .config("spark.default.parallelism", 1000)
    .config("spark.cleaner.periodicGC.interval", "10min")
    .enableHiveSupport()
    .getOrCreate()
)


def getDate(fakeToday='today', moveDays=0, isReturnPt=False):
    """Return the day ``moveDays`` days away from a reference day, as text.

    Args:
        fakeToday: ``'today'`` to start from the current local date, or a
            ``'YYYY-MM-DD'`` string to start from that day instead.
        moveDays: Number of days to add to the reference day; negative
            values move backwards.
        isReturnPt: If true the result is formatted ``'YYYYMMDD'``,
            otherwise ``'YYYY-MM-DD'``.

    Returns:
        The shifted day as a string in the selected format.

    Raises:
        ValueError: If ``fakeToday`` is a string other than ``'today'`` that
            does not match ``%Y-%m-%d``.
    """
    if fakeToday == 'today':
        base_day = datetime.date.today()
    else:
        base_day = datetime.datetime.strptime(fakeToday, '%Y-%m-%d')

    shifted = base_day + datetime.timedelta(days=moveDays)

    if isReturnPt:
        return shifted.strftime('%Y%m%d')
    return shifted.strftime('%Y-%m-%d')


def ptToDate(pt, isDetial=False, isMinTime=True):
    """Convert a ``'YYYYMMDD'`` partition string to a dashed date string.

    Args:
        pt: Day in ``'YYYYMMDD'`` form.
        isDetial: If true, return a full ``'YYYY-MM-DD HH:MM:SS'`` string;
            otherwise only the first 10 characters (the date part).
        isMinTime: Selects the time-of-day appended before truncation:
            ``00:00:00`` when true, ``23:59:59`` when false.

    Returns:
        The formatted date (or date-time) string.

    Raises:
        ValueError: If ``pt`` does not match ``%Y%m%d``.
    """
    day_text = datetime.datetime.strptime(pt, '%Y%m%d').strftime('%Y-%m-%d')

    if isMinTime:
        suffix = ' 00:00:00'
    else:
        suffix = ' 23:59:59'
    stamped = day_text + suffix

    if isDetial:
        return stamped
    return stamped[:10]


def toTimestamp(datetime):
    """Convert a date/time string to an integer millisecond timestamp.

    The parse format is picked from the string length: 8 characters is read
    as ``%Y%m%d``, 10 as ``%Y-%m-%d``, and anything else as
    ``%Y-%m-%d %H:%M:%S``. The conversion goes through ``time.mktime``, so
    the value is interpreted as local time.

    Note: the parameter name shadows the ``datetime`` module inside this
    function; it is kept because it is part of the call signature.

    Args:
        datetime: The date or date-time string to convert.

    Returns:
        Milliseconds since the epoch as an ``int``.

    Raises:
        ValueError: If the string does not match the selected format.
    """
    default_layout = '%Y-%m-%d %H:%M:%S'
    layout_by_length = {8: '%Y%m%d', 10: '%Y-%m-%d'}
    layout = layout_by_length.get(len(datetime), default_layout)

    parsed = time.strptime(datetime, layout)
    epoch_seconds = time.mktime(parsed)
    return int(epoch_seconds * 1000)  # millisecond precision


def getLastMonday(fakeToday='today',includeTodayMonday=False,isReturnPt=False):
    """Return the most recent Monday relative to a reference day, as text.

    For a reference day that is not a Monday, this is the Monday of the same
    week. For a reference day that is itself a Monday, the result is the
    Monday one week earlier unless ``includeTodayMonday`` is true, in which
    case the reference day is returned.

    Args:
        fakeToday: ``'today'`` for the current local date, or a
            ``'YYYY-MM-DD'`` string to use as the reference day.
        includeTodayMonday: Whether a Monday reference day counts as the
            answer.
        isReturnPt: If true the result is formatted ``'YYYYMMDD'``,
            otherwise ``'YYYY-MM-DD'``.

    Returns:
        The selected Monday as a string in the chosen format.

    Raises:
        ValueError: If ``fakeToday`` is a string other than ``'today'`` that
            does not match ``%Y-%m-%d``.
    """
    if fakeToday == 'today':
        reference = datetime.date.today()
    else:
        reference = datetime.datetime.strptime(fakeToday, '%Y-%m-%d')

    weekday = reference.weekday()  # Monday is 0
    if weekday != 0:
        days_back = weekday
    elif includeTodayMonday:
        days_back = 0
    else:
        days_back = 7

    monday = reference - datetime.timedelta(days=days_back)

    if isReturnPt:
        return monday.strftime('%Y%m%d')
    return monday.strftime('%Y-%m-%d')

def timeStat(func):
    """Decorator that prints how long each call to ``func`` took, in minutes.

    After every successful call a line of the form
    ``UseTimeMinutes(fn: <name>):  <minutes>`` is printed, with the elapsed
    wall-clock time rounded to two decimals. If ``func`` raises, the
    exception propagates and nothing is printed.

    The wrapper is built with ``functools.wraps`` so the decorated function
    keeps its original ``__name__`` and ``__doc__`` instead of showing up as
    ``wrapper``.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all arguments to ``func`` and returns its
        result unchanged.
    """
    # Imported locally so the decorator does not depend on module-level imports.
    import functools
    import time

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        t1 = time.time()
        r = func(*args, **kwargs)
        t2 = time.time()
        print('UseTimeMinutes(fn: {}): '.format(func.__name__), round((t2 - t1) / 60, 2))
        return r
    return wrapper

import pandas as pd
# Build 30 consecutive daily partition strings ('YYYYMMDD') ending at
# last_1_date: each timestamp's text is cut to its first 10 characters and
# the dashes are removed.
# NOTE(review): last_1_date is not defined in this snippet -- presumably a
# date string or datetime supplied by earlier code; confirm.
pt_list = [
    str(day)[:10].replace('-', '')
    for day in pd.date_range(end=last_1_date, freq='D', periods=30)
]
  • 0
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值