# PySpark: per-row computation targeting specific rows
import json
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType, StructType, StructField, StringType, IntegerType
from derivedParameter._AILERON_L_STD import _AILERON_L
from derivedParameter._AILERON_1_STD import _AILERON_1
from derivedParameter._VRTG_STD import _VRTG
from utils.index import getValueFromXml
import time
import os
os.environ["HADOOP_CONF_DIR"]="/bigdata/server/hadoop/etc/hadoop"
os.environ["YARN_CONF_DIR"]="/bigdata/server/hadoop/etc/hadoop"
if __name__ == '__main__':
    starttime = time.time()
    spark = SparkSession.builder.appName("Test generating derived parameters") \
        .master("local[*]") \
        .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.16.0") \
        .getOrCreate()
    xmlstarttime = time.time()

    def load_key_value(path):
        # Each key/value file is a small XML document whose <KYV> rows carry
        # the constants fed into the derived-parameter functions below.
        return spark.read.format("com.databricks.spark.xml") \
            .options(rootTag='KYV') \
            .options(rowTag='KYV') \
            .option("attributePrefix", "") \
            .load(path)

    df = load_key_value('hdfs://ecs-0002:8020/qar/keyValues/K_AirFASE_B_STD.xml')
    df.printSchema()  # printSchema() prints directly and returns None, so no print() wrapper
    print(df.columns)
    K_AirFASE_B = getValueFromXml(df, "XFK_VALUE", "#K_AirFASE_B")
    df = load_key_value('hdfs://ecs-0002:8020/qar/keyValues/K_AILERON_MIN_STD.xml')
    K_AILERON_MIN = getValueFromXml(df, "XFK_VALUE", "#K_AILERON_MIN")
    df = load_key_value('hdfs://ecs-0002:8020/qar/keyValues/K_AILERON_MAX_STD.xml')
    K_AILERON_MAX = getValueFromXml(df, "XFK_VALUE", "#K_AILERON_MAX")
    print(K_AirFASE_B, K_AILERON_MIN, K_AILERON_MAX)
    sparkstarttime = time.time()
    # Without inferSchema every CSV column loads as a string, which is why
    # rownum is cast to an integer further down. comment="," makes Spark skip
    # any line that begins with a comma (e.g. rows with an empty first field).
    ddf = spark.read \
        .option("sep", ",") \
        .option("header", True) \
        .option("encoding", "utf-8") \
        .option("comment", ",") \
        .csv('hdfs://ecs-0002:8020/qar/B321P_19052023193529_REC02659.qar.csv')
    _AILERON_L_UDF = F.udf(_AILERON_L, FloatType())
    _AILERON_1_UDF = F.udf(_AILERON_1, FloatType())
    ddf = ddf.withColumn("_AILERON_L", _AILERON_L_UDF(ddf['AILERON_LEFT'])) \
        .withColumn("_AILERON_1", _AILERON_1_UDF(ddf['AILERON_LEFT'], F.lit(K_AirFASE_B), F.lit(K_AILERON_MIN), F.lit(K_AILERON_MAX))) \
        .withColumn("rownumid", ddf['rownum'].cast(IntegerType()))
    ddf.printSchema()
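    # Sketch, not in the original: the same scalar transform can be wrapped as
    # a pandas UDF for better throughput (requires pyarrow). This assumes
    # _AILERON_L takes a single scalar value, as the plain F.udf call above
    # implies. Left commented out so the script's behavior is unchanged.
    # import pandas as pd
    # @F.pandas_udf(FloatType())
    # def _AILERON_L_PUDF(s: pd.Series) -> pd.Series:
    #     return s.map(_AILERON_L)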
    # ddf.show(truncate=False)
    ddf.createTempView("temp")
    # Correlated scalar subqueries look up the neighbouring rows by rownumid:
    # the previous and next rows' VRTG, and the previous row's GS.
    # (Renamed from `list` to avoid shadowing the Python builtin.)
    pnDf = spark.sql("select *, (select max(tpprev.VRTG) from temp as tpprev where (tm.rownumid-1)=tpprev.rownumid) as prev, \
        (select max(tpnext.VRTG) from temp as tpnext where (tm.rownumid+1)=tpnext.rownumid) as next, \
        (select max(gstemp.GS) from temp as gstemp where (tm.rownumid-1)=gstemp.rownumid) as GS_PREV \
        from temp as tm")
    pnDf.createTempView("pntemp")
    vrtgDf = spark.sql("select *, concat_ws(',',prev,VRTG,next) as VRTG_PREV_NEXT_GROUP from pntemp")
    # vrtgDf.printSchema()
    # vrtgDf.show(truncate=False)
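    # Sketch, not in the original: the same prev/next columns can be built with
    # window functions, avoiding a full re-scan of `temp` per row. A window
    # with no partitionBy pulls all rows into one partition, which is fine for
    # a local[*] test like this one. Left commented out so it does not replace
    # the subquery version above.
    # from pyspark.sql import Window
    # w = Window.orderBy("rownumid")
    # vrtgDf = ddf.withColumn("prev", F.lag("VRTG", 1).over(w)) \
    #     .withColumn("next", F.lead("VRTG", 1).over(w)) \
    #     .withColumn("GS_PREV", F.lag("GS", 1).over(w)) \
    #     .withColumn("VRTG_PREV_NEXT_GROUP", F.concat_ws(",", "prev", "VRTG", "next"))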
    # Collect the (rownumid, IDG_IN_TEMP1) pairs to the driver as JSON so the
    # whole column can be handed to a UDF through a literal.
    IDG_IN_TEMP1_JSON_STR = json.dumps(vrtgDf.rdd.map(lambda x: (x.rownumid, x.IDG_IN_TEMP1)).collect())
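    # Sketch, not in the original: F.lit() embeds the full JSON string in the
    # plan and the UDF re-parses it for every row. Broadcasting a dict once
    # per executor is a cheaper variant of the same lookup idea:
    # idg_lookup = spark.sparkContext.broadcast(
    #     dict(vrtgDf.rdd.map(lambda x: (x.rownumid, x.IDG_IN_TEMP1)).collect()))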
    def eq(rownumid, data, rowNum, val):
        # rowNum == 0: scan the whole list for the entry whose rownumid matches
        # and whose value equals val; return its list index.
        # rowNum != 0: treat rowNum as a list index and test that entry's value.
        if rowNum == 0:
            index = -1
            for item in data:
                index += 1
                if rownumid == item[0] and str(item[1]) == str(val):
                    return index
        else:
            if data[rowNum][1] == val:
                return rowNum
        return -1

    def get(data, index):
        # Value part of the (rownumid, value) pair at a given list index.
        return data[index][1]

    def getCurValue(rownumid, data):
        # Value belonging to the row with the given rownumid, or None.
        for item in data:
            if rownumid == item[0]:
                return item[1]
        return None
    @F.udf(returnType=StringType())
    def _TEST_UDF(rownumid, IDG_IN_TEMP1_JSON_STR):
        # If this row's IDG_IN_TEMP1 is '40' and the entry just before it in
        # the collected list is '41', join the two values; otherwise return
        # the row's own value unchanged.
        data = json.loads(IDG_IN_TEMP1_JSON_STR)
        index = eq(rownumid, data, 0, '40')
        if index > -1 and eq(rownumid, data, index - 1, '41') > -1:
            return get(data, index) + "_" + get(data, index - 1)
        return getCurValue(rownumid, data)
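    # Worked example (hypothetical data, not in the original): with
    # data = [(1, '39'), (2, '41'), (3, '40')], _TEST_UDF for rownumid 3
    # finds '40' at list index 2, confirms '41' at index 1, and returns
    # "40_41"; every other rownumid falls through to getCurValue and keeps
    # its own value. One caveat carried over from eq(): when the matching
    # '40' sits at list index 1, eq() is called with rowNum=index-1=0 and
    # re-runs the search branch, so a '41' in the very first entry is never
    # matched.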
    _VRTG_UDF = F.udf(_VRTG, FloatType())
    vrtgDf.withColumn("_VRTG", _VRTG_UDF(vrtgDf['VRTG'], vrtgDf['VRTG_PREV_NEXT_GROUP'], vrtgDf['GS'], vrtgDf['GS_PREV'])) \
        .withColumn("_TEST", _TEST_UDF(vrtgDf['rownumid'], F.lit(IDG_IN_TEMP1_JSON_STR))) \
        .show(truncate=False)
    # ddf.createTempView("temp")
    # """where DATETIME between date_sub(DATETIME, 1) and date_add(DATETIME, 1)"""
    # # spark.sql("select DATETIME, LDG_LH, (select len(LDG_LH) from temp) as LIST_STR from temp").show(truncate=False)
    # The export target must be empty (the output path must not already exist) when writing out
    # ddf.coalesce(1).write.option("header","true").csv("hdfs://ecs-qar1-0002:8020/csvout/B321P_19052023193529_REC02659.qar.csv")
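    # Sketch, not in the original: write.mode("overwrite") sidesteps the
    # empty-target requirement noted above (same output path as the
    # commented-out export):
    # ddf.coalesce(1).write.mode("overwrite").option("header", "true") \
    #     .csv("hdfs://ecs-qar1-0002:8020/csvout/B321P_19052023193529_REC02659.qar.csv")

    # Not in the original: report the timing checkpoints taken above and
    # shut the session down cleanly.
    print("xml stage: %.2fs, spark stage: %.2fs, total: %.2fs"
          % (sparkstarttime - xmlstarttime, time.time() - sparkstarttime, time.time() - starttime))
    spark.stop()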