Sparksql Dataframe聚合操作

import numpy as np
import sys
#sys.path.append("/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip")
#sys.path.append("/opt/cloudera/parcels/CDH/lib/spark/python/lib/pyspark.zip")
from sklearn.metrics import mean_squared_error,r2_score
from scipy import optimize
import scipy as sci
from sklearn.metrics import roc_curve
import pandas as pd
import pyspark.sql.functions as f
from pyspark.sql.functions import *
import warnings
import random
import collections
import datetime
from pyspark import SparkContext,SparkConf
from pyspark.sql import *
import datetime
from dateutil.relativedelta import *
from pyspark.sql.types import *
from sqlalchemy import create_engine
#import org.apache

def inline_spc_raw_LastResult(one_month_start_date,end_date):
    inline_spc_raw_LastResultDF = rawDataDF_three\
        .where("createtime >= '{0}' and createtime < '{1}'".format(one_month_start_date, end_date)) \
        .selectExpr("*","row_number() over (partition by PRODUCTNAME,ITEMNAME,LOTNAME,SAMPLEMATERIALNAME,sitename order by createtime desc) as rank").where("rank=1")
    return inline_spc_raw_LastResultDF


def inlineSiteSumProd(inline_spc_raw_DF,n_month,summarytype,prod):
    # .selectExpr("*","row_number() over (partition by PRODUCTNAME,ITEMNAME,LOTNAME,SAMPLEMATERIALNAME,sitename order by createtime desc) as rank").where("rank=1") \ and itemname = 'DMBWDGET-OCDMTS061-0162_THKP1#1'
    inlineSiteSumProdDF = inline_spc_raw_DF\
        .where("lottype in ('PR','PP','PE','PC','PA','PB','PM','PI') and lotname not like 'AS%'") \
        .groupBy("{0}".format(prod),"edcspecid","edcspecrevision","itemname")\
        .agg(
            f.avg("sitecount").alias("sitecount"),
            f.max("samplecount").alias("samplecount"),
            f.avg("target").alias("target"),
            f.max("lowerspeclimit").alias("lowerspeclimit"),
            f.min("upperspeclimit").alias("upperspeclimit"),
            f.max("dcspecdes").alias("dcspecdes"),
            f.max("datatype").alias("datatype"),
            f.max("dcspecitemowner").alias("dcspecitemowner"),
            f.countDistinct("samplematerialname").alias("lotcnt"),
            f.max("dccdefine").alias("dccdefine"),
            f.avg("result").alias("mean"),
            f.stddev("result").alias("std"),
            f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0) , ((f.min("upperspeclimit")- f.max("lowerspeclimit"))/(6 * f.stddev("result")))).otherwise("null").alias("cp"),
            f.when (~f.isnull(f.min("upperspeclimit")- f.max("lowerspeclimit")) & ((f.min("upperspeclimit")- f.max("lowerspeclimit")) != 0),(f.abs(f.avg("result")-f.avg("target"))/((f.min("upperspeclimit")- f.max("lowerspeclimit"))/2))).otherwise("null").alias("k"),
            f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
                    f.least(
                    ((f.min("upperspeclimit") - f.avg("result"))/(3 * f.stddev("result"))),((f.avg("result")-f.min("lowerspeclimit"))/(3 * f.stddev("result"))))
                    ).otherwise("null").alias("cpk"),
            f.min("spcusl").alias("spcusl"),
            f.max("spclsl").alias("spclsl"),
            f.avg("spctarget").alias("spctarget"),
            f.avg("spcgoal").alias("spcgoal"),
            f.avg("kgoal").alias("kgoal"),
            f.avg("downratiogoal").alias("downratiogoal"),
            f.max("spcflag").alias("spcflag"),
            f.max("dcseq").alias("dcseq"),

            f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0) , ((f.min("spcusl")- f.max("spclsl"))/(6 * f.stddev("result")))).otherwise("null").alias("spccp"),
            f.when (~f.isnull(f.min("spcusl")- f.max("spclsl")) & ((f.min("spcusl")- f.max("spclsl")) != 0),(f.abs(f.avg("result")-f.avg("spctarget"))/((f.min("spcusl")- f.max("spclsl"))/2))).otherwise("null").alias("spck"),
            f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
                    f.least(
                    ((f.min("spcusl") - f.avg("result"))/(3 * f.stddev("result"))),((f.avg("result")-f.min("spclsl"))/(3 * f.stddev("result"))))
                    ).otherwise("null").alias("spccpk"),

            f.countDistinct("lotname").alias("lotcount"),
            (f.countDistinct("samplematerialname") * f.avg("sitecount")).alias("tot_site")

             ).selectExpr("*","'{0}' as monthly".format(n_month),"'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
                          #"case when (min('upperspeclimit') is null) or (max('lowerspeclimit') is null) then null  when stddev('result')is not null and stddev('result') <> 0 then ((min('upperspeclimit')- max('lowerspeclimit'))/(6 * stddev('result'))) else null end cp"

    #OneMonths Site Sum end
    return inlineSiteSumProdDF

    #inlineSiteSumProd4_OneMonth.show(20)



def inlineWaferSumProd(inline_spc_raw_DF,n_month,summarytype,prod):
    #OneMonth site sum

    inlineWaferSumProdDF = inline_spc_raw_DF\
        .where("lottype in ('PR','PP','PE','PC','PA','PB','PM','PI') and lotname not like 'AS%'") \
        .groupBy("{0}".format(prod),"edcspecid","edcspecrevision","itemname","lotname","samplematerialname")\
        .agg(
            f.avg("sitecount").alias("sitecount"),
            f.max("samplecount").alias("samplecount"),
            f.avg("target").alias("target"),
            f.max("lowerspeclimit").alias("lowerspeclimit"),
            f.min("upperspeclimit").alias("upperspeclimit"),
            f.max("dcspecdes").alias("dcspecdes"),
            f.max("datatype").alias("datatype"),
            f.max("dcspecitemowner").alias("dcspecitemowner"),
            #f.countDistinct("samplematerialname").alias("lotcnt"),
            f.max("dccdefine").alias("dccdefine"),
            f.avg("result").alias("avg"),
            f.stddev("result").alias("std"),

            #~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),


            # f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0) , ((f.min("upperspeclimit")- f.max("lowerspeclimit"))/(6 * f.stddev("result")))).otherwise("null").alias("cp"),
            # f.when (~f.isnull(f.min("upperspeclimit")- f.max("lowerspeclimit")) & ((f.min("upperspeclimit")- f.max("lowerspeclimit")) != 0),(f.abs(f.avg("result")-f.avg("target"))/((f.min("upperspeclimit")- f.max("lowerspeclimit"))/2))).otherwise("null").alias("k"),
            # f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
            #         f.least(
            #         ((f.min("upperspeclimit") - f.avg("result"))/(3 * f.stddev("result"))),((f.avg("result")-f.min("lowerspeclimit"))/(3 * f.stddev("result"))))
            #         ).otherwise("null").alias("cpk"),
            f.min("spcusl").alias("spcusl"),
            f.max("spclsl").alias("spclsl"),
            f.avg("spctarget").alias("spctarget"),
            f.avg("spcgoal").alias("spcgoal"),
            f.avg("kgoal").alias("kgoal"),
            f.avg("downratiogoal").alias("downratiogoal"),
            f.max("spcflag").alias("spcflag"),
            f.max("dcseq").alias("dcseq"),

            # f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0) , ((f.min("spcusl")- f.max("spclsl"))/(6 * f.stddev("result")))).otherwise("null").alias("spccp"),
            # f.when (~f.isnull(f.min("spcusl")- f.max("spclsl")) & ((f.min("spcusl")- f.max("spclsl")) != 0),(f.abs(f.avg("result")-f.avg("spctarget"))/((f.min("spcusl")- f.max("spclsl"))/2))).otherwise("null").alias("spck"),
            # f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
            #         f.least(
            #         ((f.min("spcusl") - f.avg("result"))/(3 * f.stddev("result"))),((f.avg("result")-f.min("spclsl"))/(3 * f.stddev("result"))))
            #         ).otherwise("null").alias("spccpk"),
            #
            # f.countDistinct("lotname").alias("lotcount"),
            # (f.countDistinct("samplematerialname") * f.avg("sitecount")).alias("tot_site")

        ).selectExpr("*","{0} as monthly".format(n_month),"{0} as summarytype".format(summarytype))\
        .groupBy("{0}".format(prod),"edcspecid","edcspecrevision","itemname")\
        .agg(
            f.avg("sitecount").alias("sitecount"),
            f.max("samplecount").alias("samplecount"),
            f.avg("target").alias("target"),
            f.max("lowerspeclimit").alias("lowerspeclimit"),
            f.min("upperspeclimit").alias("upperspeclimit"),
            f.max("dcspecdes").alias("dcspecdes"),
            f.max("datatype").alias("datatype"),
            f.max("dcspecitemowner").alias("dcspecitemowner"),
            f.countDistinct("samplematerialname").alias("wafercnt"),
            f.max("dccdefine").alias("dccdefine"),
            f.avg("avg").alias("mean"),
            f.avg("std").alias("stdmean"),
            f.stddev("std").alias("std"),
            f.stddev("avg").alias("meanstd"),
            # ~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),

            f.when(~f.isnull(f.stddev("avg")) & (f.stddev("avg") != 0),
                   ((f.min("upperspeclimit") - f.max("lowerspeclimit")) / (6 * f.stddev("avg")))).otherwise(
                "null").alias("cp"),
            f.when(~f.isnull(f.min("upperspeclimit") - f.max("lowerspeclimit")) & (
                        (f.min("upperspeclimit") - f.max("lowerspeclimit")) != 0), (
                               f.abs(f.avg("avg") - f.avg("target")) / (
                                   (f.min("upperspeclimit") - f.max("lowerspeclimit")) / 2))).otherwise("null").alias("k"),
            f.when(~f.isnull(f.stddev("avg")) & (f.stddev("avg") != 0),
                   f.least(
                       ((f.min("upperspeclimit") - f.avg("avg")) / (3 * f.stddev("avg"))),
                       ((f.avg("avg") - f.min("lowerspeclimit")) / (3 * f.stddev("avg"))))
                   ).otherwise("null").alias("cpk"),
            f.min("spcusl").alias("spcusl"),
            f.max("spclsl").alias("spclsl"),
            f.avg("spctarget").alias("spctarget"),
            f.avg("spcgoal").alias("spcgoal"),
            f.avg("kgoal").alias("kgoal"),
            f.avg("downratiogoal").alias("downratiogoal"),
            f.max("spcflag").alias("spcflag"),
            f.max("dcseq").alias("dcseq"),

            f.when(~f.isnull(f.stddev("avg")) & (f.stddev("avg") != 0),
                   ((f.min("spcusl") - f.max("spclsl")) / (6 * f.stddev("avg")))).otherwise("null").alias("spccp"),
            f.when(~f.isnull(f.min("spcusl") - f.max("spclsl")) & ((f.min("spcusl") - f.max("spclsl")) != 0),
                   (f.abs(f.avg("avg") - f.avg("spctarget")) / ((f.min("spcusl") - f.max("spclsl")) / 2))).otherwise(
                "null").alias("spck"),
            f.when(~f.isnull(f.stddev("avg")) & (f.stddev("avg") != 0),
                   f.least(
                       ((f.min("spcusl") - f.avg("avg")) / (3 * f.stddev("avg"))),
                       ((f.avg("avg") - f.min("spclsl")) / (3 * f.stddev("avg"))))
                   ).otherwise("null").alias("spccpk"),

            f.countDistinct("lotname").alias("lotcount"),
            (f.countDistinct("samplematerialname") * f.avg("sitecount")).alias("tot_site")

        ).selectExpr("*", "'{0}' as monthly".format(n_month), "'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    #OneMonths Site Sum end
    return inlineWaferSumProdDF



def inlineSiteSumNoFilter(inline_spc_raw_DF,n_month,summarytype,prod):
    inlineSiteSumNoFilterDF = inline_spc_raw_DF \
        .groupBy("{0}".format(prod), "edcspecid", "edcspecrevision", "itemname") \
        .agg(
        f.avg("sitecount").alias("sitecount"),
        f.max("samplecount").alias("samplecount"),
        f.avg("target").alias("target"),
        f.max("lowerspeclimit").alias("lowerspeclimit"),
        f.min("upperspeclimit").alias("upperspeclimit"),
        f.max("dcspecdes").alias("dcspecdes"),
        f.max("datatype").alias("datatype"),
        f.max("dcspecitemowner").alias("dcspecitemowner"),
        f.countDistinct("samplematerialname").alias("lotcnt"),
        f.max("dccdefine").alias("dccdefine"),
        f.avg("result").alias("avg"),
        f.stddev("result").alias("std"),

        # ~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),

        f.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
               ((f.min("upperspeclimit") - f.max("lowerspeclimit")) / (6 * f.stddev("result")))).otherwise(
            "null").alias("cp"),
        f.when(~f.isnull(f.min("upperspeclimit") - f.max("lowerspeclimit")) & (
                    (f.min("upperspeclimit") - f.max("lowerspeclimit")) != 0), (
                           f.abs(f.avg("result") - f.avg("target")) / (
                               (f.min("upperspeclimit") - f.max("lowerspeclimit")) / 2))).otherwise("null").alias("k"),
        f.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
               f.least(
                   ((f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))),
                   ((f.avg("result") - f.min("lowerspeclimit")) / (3 * f.stddev("result"))))
               ).otherwise("null").alias("cpk"),
        f.min("spcusl").alias("spcusl"),
        f.max("spclsl").alias("spclsl"),
        f.avg("spctarget").alias("spctarget"),
        f.avg("spcgoal").alias("spcgoal"),
        f.avg("kgoal").alias("kgoal"),
        f.avg("downratiogoal").alias("downratiogoal"),
        f.max("spcflag").alias("spcflag"),
        f.max("dcseq").alias("dcseq"),

        f.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
               ((f.min("spcusl") - f.max("spclsl")) / (6 * f.stddev("result")))).otherwise("null").alias("spccp"),
        f.when(~f.isnull(f.min("spcusl") - f.max("spclsl")) & ((f.min("spcusl") - f.max("spclsl")) != 0),
               (f.abs(f.avg("result") - f.avg("spctarget")) / ((f.min("spcusl") - f.max("spclsl")) / 2))).otherwise(
            "null").alias("spck"),
        f.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
               f.least(
                   ((f.min("spcusl") - f.avg("result")) / (3 * f.stddev("result"))),
                   ((f.avg("result") - f.min("spclsl")) / (3 * f.stddev("result"))))
               ).otherwise("null").alias("spccpk"),

        f.countDistinct("lotname").alias("lotcount"),
        (f.countDistinct("samplematerialname") * f.avg("sitecount")).alias("tot_site")

    ).selectExpr("*", "'{0}' as monthly".format(n_month), "'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    return inlineSiteSumNoFilterDF


def inlineToolMatchingSum(inline_spc_raw_DF,n_month,summarytype,prod):
    inlineToolMatchingDF1 = inline_spc_raw_DF.where(
        "lottype in ('PR','PP','PE','PC','PA','PB','PM','PI') and lotname not like 'AS%'") \
        .where("processmachinename != processsubmachinename")\
        .groupBy("{0}".format(prod),"itemname","processmachinename","processsubmachinename") \
        .agg(
            f.max("modulename").alias("modulename"),
            f.countDistinct("lotsysid").alias("waferconut"),


            f.max("target").alias("target"),
            f.max("upperspeclimit").alias("upperspeclimit"),
            f.max("lowerspeclimit").alias("lowerspeclimit"),
            f.avg("result").alias("mean"),
            f.stddev("result").alias("std"),

            f.when((f.isnull(f.min("upperspeclimit"))) | (f.isnull(f.max("lowerspeclimit"))),"null")
            .when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),((f.min("upperspeclimit") - f.max("lowerspeclimit")) / (6 * f.stddev("result"))))
            .otherwise("null").alias("cp"),

            f.when(~f.isnull(f.min("upperspeclimit")) & f.isnull(f.max("lowerspeclimit")) & (f.min("upperspeclimit") - f.avg("target") != 0),
                   (f.avg("result") - f.avg("target")) / (f.min("upperspeclimit") - f.avg("target"))
                   )
            .when(f.isnull(f.min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & ((f.avg("target") - f.max("lowerspeclimit")) != 0),
                  (f.avg("target") - f.avg("result")) / (f.avg("target") - f.max("lowerspeclimit"))
                  )
            .when(~f.isnull(f.min("upperspeclimit") - f.max("lowerspeclimit")) & ((f.min("upperspeclimit") - f.max("lowerspeclimit")) != 0),
                  f.abs(f.avg("result") - f.avg("target")) / ((f.min("upperspeclimit") - f.max("lowerspeclimit")) / 2)
                  )
            .otherwise("null").alias("k"),

            f.when(~f.isnull(min("upperspeclimit"))  & (f.isnull(f.max("lowerspeclimit"))) & (~f.isnull(f.stddev("result"))) & (f.stddev("result") != 0),
                    (f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))
                   )
            .when(f.isnull(f.min("upperspeclimit")) & (~f.isnull(f.max("lowerspeclimit"))) & (~f.isnull(f.stddev("result"))) & (f.stddev("result") != 0),
                  (f.avg("result") - f.max("lowerspeclimit")) / (3 * f.stddev("result"))
                  )
            .when(~f.isnull(f.stddev("result"))  & (f.stddev("result") != 0),
                  f.least(
                      ((f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))),
                      ((f.avg("result") - f.min("lowerspeclimit")) / (3 * f.stddev("result"))))
                  ).otherwise("null").alias("cpk")


    ).selectExpr("{0} as productname".format(prod),"itemname","modulename","processmachinename",
                 "processsubmachinename","waferconut","target","upperspeclimit","lowerspeclimit",
                 "mean","std","cp","k","cpk",
                 "'{0}' as monthly".format(n_month),"'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    inlineToolMatchingDF2 = inline_spc_raw_DF.where(
        "lottype in ('PR','PP','PE','PC','PA','PB','PM','PI') and lotname not like 'AS%'") \
        .groupBy("{0}".format(prod), "itemname", "processmachinename") \
        .agg(
        f.max("modulename").alias("modulename"),
        f.countDistinct("lotsysid").alias("waferconut"),

        f.max("target").alias("target"),
        f.max("upperspeclimit").alias("upperspeclimit"),
        f.max("lowerspeclimit").alias("lowerspeclimit"),
        f.avg("result").alias("mean"),
        f.stddev("result").alias("std"),

        f.when((f.isnull(f.min("upperspeclimit"))) | (f.isnull(f.max("lowerspeclimit"))), "null")
            .when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
                  ((f.min("upperspeclimit") - f.max("lowerspeclimit")) / (6 * f.stddev("result"))))
            .otherwise("null").alias("cp"),

        f.when(~f.isnull(f.min("upperspeclimit")) & f.isnull(f.max("lowerspeclimit")) & (
                    f.min("upperspeclimit") - f.avg("target") != 0),
               (f.avg("result") - f.avg("target")) / (f.min("upperspeclimit") - f.avg("target"))
               )
            .when(f.isnull(f.min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & (
                    (f.avg("target") - f.max("lowerspeclimit")) != 0),
                  (f.avg("target") - f.avg("result")) / (f.avg("target") - f.max("lowerspeclimit"))
                  )
            .when(~f.isnull(f.min("upperspeclimit") - f.max("lowerspeclimit")) & (
                    (f.min("upperspeclimit") - f.max("lowerspeclimit")) != 0),
                  f.abs(f.avg("result") - f.avg("target")) / ((f.min("upperspeclimit") - f.max("lowerspeclimit")) / 2)
                  )
            .otherwise("null").alias("k"),

        f.when(
            ~f.isnull(min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & (~f.isnull(f.stddev("result"))) & (
                        f.stddev("result") != 0),
            (f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))
            )
            .when(f.isnull(f.min("upperspeclimit")) & (~f.isnull(f.max("lowerspeclimit"))) & (
            ~f.isnull(f.stddev("result"))) & (f.stddev("result") != 0),
                  (f.avg("result") - f.max("lowerspeclimit")) / (3 * f.stddev("result"))
                  )
            .when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
                  f.least(
                      ((f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))),
                      ((f.avg("result") - f.min("lowerspeclimit")) / (3 * f.stddev("result"))))
                  ).otherwise("null").alias("cpk")

                ).selectExpr("{0} as productname".format(prod), "itemname", "modulename", "processmachinename",
                 "processmachinename as processsubmachinename", "waferconut", "target", "upperspeclimit", "lowerspeclimit",
                 "mean", "std", "cp", "k", "cpk",
                 "'{0}' as monthly".format(n_month), "'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))


    inlineToolMatchingDF3 = inline_spc_raw_DF.where(
        "lottype in ('PR','PP','PE','PC','PA','PB','PM','PI') and lotname not like 'AS%'") \
        .groupBy("{0}".format(prod), "itemname") \
        .agg(
        f.max("modulename").alias("modulename"),
        f.countDistinct("lotsysid").alias("waferconut"),

        f.max("target").alias("target"),
        f.max("upperspeclimit").alias("upperspeclimit"),
        f.max("lowerspeclimit").alias("lowerspeclimit"),
        f.avg("result").alias("mean"),
        f.stddev("result").alias("std"),

        f.when((f.isnull(f.min("upperspeclimit"))) | (f.isnull(f.max("lowerspeclimit"))), "null")
            .when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
                  ((f.min("upperspeclimit") - f.max("lowerspeclimit")) / (6 * f.stddev("result"))))
            .otherwise("null").alias("cp"),

        f.when(~f.isnull(f.min("upperspeclimit")) & f.isnull(f.max("lowerspeclimit")) & (
                f.min("upperspeclimit") - f.avg("target") != 0),
               (f.avg("result") - f.avg("target")) / (f.min("upperspeclimit") - f.avg("target"))
               )
            .when(f.isnull(f.min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & (
                (f.avg("target") - f.max("lowerspeclimit")) != 0),
                  (f.avg("target") - f.avg("result")) / (f.avg("target") - f.max("lowerspeclimit"))
                  )
            .when(~f.isnull(f.min("upperspeclimit") - f.max("lowerspeclimit")) & (
                (f.min("upperspeclimit") - f.max("lowerspeclimit")) != 0),
                  f.abs(f.avg("result") - f.avg("target")) / ((f.min("upperspeclimit") - f.max("lowerspeclimit")) / 2)
                  )
            .otherwise("null").alias("k"),

        f.when(
            ~f.isnull(min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & (~f.isnull(f.stddev("result"))) & (
                    f.stddev("result") != 0),
            (f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))
        )
            .when(f.isnull(f.min("upperspeclimit")) & (~f.isnull(f.max("lowerspeclimit"))) & (
            ~f.isnull(f.stddev("result"))) & (f.stddev("result") != 0),
                  (f.avg("result") - f.max("lowerspeclimit")) / (3 * f.stddev("result"))
                  )
            .when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
                  f.least(
                      ((f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))),
                      ((f.avg("result") - f.min("lowerspeclimit")) / (3 * f.stddev("result"))))
                  ).otherwise("null").alias("cpk")

    ).selectExpr("{0} as productname".format(prod), "itemname", "modulename", "'NA' as processmachinename",
                 "'NA' as processsubmachinename", "waferconut", "target", "upperspeclimit",
                 "lowerspeclimit",
                 "mean", "std", "cp", "k", "cpk",
                 "'{0}' as monthly".format(n_month), "'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    inlineToolMatchingDF=inlineToolMatchingDF1.unionAll(inlineToolMatchingDF2).unionAll(inlineToolMatchingDF3)

    return inlineToolMatchingDF


if __name__ == '__main__':
    currentTime = datetime.datetime.now()  # datetime.datetime.strptime("2022-01-15 01:00:00",'%Y-%m-%d %H:%M:%S')

    end_date = currentTime.strftime('%Y-%m-%d 00:00:00')
    print(end_date)

    start_date = (currentTime + datetime.timedelta(days=-1)).strftime('%Y-%m-%d') + ' 00:00:00'
    print(start_date)
    month_end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d %H:%M:%S')
    print(month_end_date)
    FAB_DATE = month_end_date.strftime('%Y%m%d')
    print(FAB_DATE)
    N_DAY = int(FAB_DATE[6:8])
    print(N_DAY)
    if N_DAY > 25:

        n_month = (month_end_date + relativedelta(months=+1)).strftime('%Y%m')
        print(n_month)
        n_threemonth = (month_end_date + relativedelta(months=-1)).strftime('%Y%m') + "-" + (
                    month_end_date + relativedelta(months=+1)).strftime('%Y%m')
        print(n_threemonth)
        #   one_month_start_date := TO_DATE(to_char(end_date, 'YYYYMM') | | '25000000', 'YYYYMMDDHH24MISS');
        one_month_start_date = (month_end_date).strftime('%Y-%m') + "-25 00:00:00"
        print(one_month_start_date)
        three_month_start_date = (month_end_date + relativedelta(months=-2)).strftime('%Y-%m') + "-25 00:00:00"
        print(three_month_start_date)
    else:
        n_month = (month_end_date).strftime('%Y%m')
        print(n_month)
        n_threemonth = (month_end_date + relativedelta(months=-2)).strftime('%Y%m') + "-" + (month_end_date).strftime(
            '%Y%m')
        print(n_threemonth)
        #   one_month_start_date := TO_DATE(to_char(end_date, 'YYYYMM') | | '25000000', 'YYYYMMDDHH24MISS');
        one_month_start_date = (month_end_date + relativedelta(months=-1)).strftime('%Y-%m') + "-25 00:00:00"
        print(one_month_start_date)
        three_month_start_date = (month_end_date + relativedelta(months=-3)).strftime('%Y-%m') + "-25 00:00:00"
        print(three_month_start_date)

    conf = SparkConf().setAppName('inline_spc_sum_prd')#.setMaster('local[16]')

    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    sqlContext = HiveContext(sc)
    spark = SparkSession.builder.getOrCreate()

    spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    #spark.conf.set("spark.debug.maxToStringFields", "100")

    aa = datetime.datetime.strptime(three_month_start_date,'%Y-%m-%d %H:%M:%S')

    #aa = datetime.datetime.strptime('2022-01-12 00:00:00', '%Y-%m-%d %H:%M:%S')

    print(aa)

    inline_spc_raw_schema = StructType([
        StructField("SYSID", StringType(), True),
        StructField("PRODUCTNAME", StringType(), True),
        StructField("EDCSPECID", StringType(), True),
        StructField("EDCSPECREVISION", StringType(), True),
        StructField("ACTIVESTATE", StringType(), True),
        StructField("ITEMNAME", StringType(), True),
        StructField("SITECOUNT", StringType(), True),
        StructField("SAMPLECOUNT", StringType(), True),
        StructField("TARGET", DecimalType(), True),
        StructField("LOWERSPECLIMIT", DecimalType(), True),
        StructField("UPPERSPECLIMIT", DecimalType(), True),
        StructField("LOTNAME", StringType(), True),
        StructField("SAMPLEMATERIALNAME", StringType(), True),
        StructField("SITENAME", StringType(), True),
        StructField("DCSPECDES", StringType(), True),
        StructField("DATATYPE", StringType(), True),
        StructField("DCSPECITEMOWNER", StringType(), True),
        StructField("LOTTYPE", StringType(), True),
        StructField("DCCDEFINE", StringType(), True),
        StructField("RESULT", DecimalType(), True),
        StructField("CREATETIME", StringType(), True),
        StructField("UPDATETIME", StringType(), True),
        StructField("SPCUSL", DecimalType(), True),
        StructField("SPCLSL", DecimalType(), True),
        StructField("SPCTARGET", DecimalType(), True),
        StructField("SPCGOAL", DecimalType(), True),
        StructField("SPCFLAG", StringType(), True),
        StructField("KGOAL", DecimalType(), True),
        StructField("DOWNRATIOGOAL", DecimalType(), True),
        StructField("DCSEQ", StringType(), True),
        StructField("STEPSEQUENCE", StringType(), True),
        StructField("PROCESSOPERATIONNAME", StringType(), True),
        StructField("PROCESSMACHINENAME", StringType(), True),
        StructField("PROCESSSUBMACHINENAME", StringType(), True),
        StructField("UPPERSCREENLIMIT", DecimalType(), True),
        StructField("LOWERSCREENLIMIT", DecimalType(), True),
        StructField("MODULENAME", StringType(), True),
        StructField("LOTSYSID", StringType(), True),
        StructField("PROD4", StringType(), True),
        StructField("PROD5", StringType(), True)
    ])

    # rawDataDF = spark.read.format('com.databricks.spark.csv').options(inferschema=True,delimiter='\t').load('')
    rawDataDF_three = spark.createDataFrame(spark.sparkContext.emptyRDD(), inline_spc_raw_schema)

    while aa < (datetime.datetime.now() + datetime.timedelta(days=-1)):
        bb = aa.strftime('%Y%m%d')
        filepath = 'hdfs://10.250.3.151:8020//user//it//inline_spc_raw//inline_spc_raw_{0}.csv'.format(bb)
        rawDataDF_three = rawDataDF_three.unionAll(
            spark.read.format('com.databricks.spark.csv').options(inferschema=True, delimiter='\t').load(filepath))
        #rawDataDF_three = rawDataDF_three
        print(bb)
        print(filepath) 
        aa = aa + datetime.timedelta(days=+1)


    #sql="select * from it .inline_spc_raw where createtime >= '{0}' and createtime < '{1}'".format('start_date','end_date')

    #sql = "select * from it .inline_spc_raw limit 10"
    #rawDataDF_three = rawDataDF_three.unionAll(sqlContext.sql(sql))

    #rawDataDF_three.show(10)
    #inline_spc_raw_ThreeMonthDF = sqlContext.sql(sql)
    inline_spc_raw_OneMonthDF = inline_spc_raw_LastResult(one_month_start_date, end_date)
    inline_spc_raw_ThreeMonthDF = inline_spc_raw_LastResult(three_month_start_date, end_date)
    print('LastResult End' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    inlineSiteSumProd4_OneMonth=inlineSiteSumProd(inline_spc_raw_OneMonthDF,n_month,'1','prod4').withColumnRenamed('prod4','productname')
    #inlineSiteSumProd4_OneMonth.withColumnRenamed('prod4','productname')
    inlineSiteSumProd5_OneMonth = inlineSiteSumProd(inline_spc_raw_OneMonthDF, n_month, '1', 'prod5').withColumnRenamed('prod5','productname')
    inlineSiteSumProd4_ThreeMonth = inlineSiteSumProd(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod4').withColumnRenamed('prod4','productname')
    inlineSiteSumProd5_ThreeMonth = inlineSiteSumProd(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod5').withColumnRenamed('prod5','productname')
    inlineSiteSumProdAll = inlineSiteSumProd4_OneMonth.unionAll(inlineSiteSumProd5_OneMonth).unionAll(inlineSiteSumProd4_ThreeMonth).unionAll(inlineSiteSumProd5_ThreeMonth)
    print('Site insert Start' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #inlineSiteSumProdAll.show(20)
    print('Site insert Start' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #inlineSiteSumProdAll.write.csv('10.250.3.150//opt//datax//script//ITScheduleScript//IT_scripts_LTP//inline_spc_site_sum_prd_ltp.csv')

    print('Site insert Start' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #inlineSiteSumProdAll=inlineSiteSumProdAll.rdd.repartition(1).persist()
    inlineSiteSumProdAll.registerTempTable("SiteTempTable")
    #inlineSiteSumProdAll.write.format("hive").mode("overwrite").saveAsTable('it.inline_spc_site_sum_prd_ltp')
    #sqlContext.sql("create table  if not exists it.inline_spc_site_sum_prd_ltps select * from TempTable")
    sqlContext.sql("insert overwrite table it.inline_spc_site_sum_prd_ltp select * from SiteTempTable")

    print('Site insert End'+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #
    # engine = create_engine("oracle://RPT:RPT@10.66.3.239/p6rptdb1",encode='utf-8',echo=True)
    # data = pd.read_sql('select * from ETL_CONFIG_HISTORY  where UPDATE_TIME > sysdate - 1/24',engine)
    # data.show(20)



    #inlineSiteSumProdAll.write.mode("overwrite").jdbc()

    inlineWaferSumProd4_OneMonth = inlineWaferSumProd(inline_spc_raw_OneMonthDF,n_month,'1','prod4').withColumnRenamed('prod4','productname')
    #inlineWaferSumProd4_OneMonth.withColumnRenamed('prod4','productname')
    inlineWaferSumProd5_OneMonth = inlineWaferSumProd(inline_spc_raw_OneMonthDF, n_month, '1', 'prod5').withColumnRenamed('prod5','productname')
    inlineWaferSumProd4_ThreeMonth = inlineWaferSumProd(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod4').withColumnRenamed('prod4','productname')
    inlineWaferSumProd5_ThreeMonth = inlineWaferSumProd(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod5').withColumnRenamed('prod5','productname')
    inlineWaferSumProdAll=inlineWaferSumProd4_OneMonth.unionAll(inlineWaferSumProd5_OneMonth).unionAll(inlineWaferSumProd4_ThreeMonth).unionAll(inlineWaferSumProd5_ThreeMonth)
    #inlineWaferSumProdAll.show(20)
    print('Wafer insert Start' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #inlineWaferSumProdAll.write.format("hive").mode("overwrite").saveAsTable('it.inline_spc_wafer_sum_prd_ltp')    #very  slow
    inlineWaferSumProdAll.registerTempTable("WaferTempTable")
    sqlContext.sql("insert overwrite table it.inline_spc_wafer_sum_prd_ltp select * from WaferTempTable")
    print('Wafer insert End'+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # inlineSiteSumNoFilterProd4_OneMonth = inlineSiteSumNoFilter(inline_spc_raw_OneMonthDF, n_month, '1', 'prod4')
    # inlineSiteSumNoFilterProd5_OneMonth = inlineSiteSumNoFilter(inline_spc_raw_OneMonthDF, n_month, '1', 'prod5')
    # inlineSiteSumNoFilterProd4_ThreeMonth = inlineSiteSumNoFilter(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod4')
    # inlineSiteSumNoFilterProd5_ThreeMonth = inlineSiteSumNoFilter(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod5')
    # inlineSiteSumNoFilterProd4_OneMonth.show(20)

    inlineToolMatchingProd4_OneMonth = inlineToolMatchingSum(inline_spc_raw_OneMonthDF,n_month,1,'prod4').withColumnRenamed('prod4','productname')
    inlineToolMatchingProd5_OneMonth = inlineToolMatchingSum(inline_spc_raw_OneMonthDF, n_month, 1, 'prod5').withColumnRenamed('prod5', 'productname')
    inlineToolMatchingProd4_ThreeMonth = inlineToolMatchingSum(inline_spc_raw_ThreeMonthDF, n_threemonth, 3,'prod4').withColumnRenamed('prod4', 'productname')
    inlineToolMatchingProd5_ThreeMonth = inlineToolMatchingSum(inline_spc_raw_ThreeMonthDF, n_threemonth, 3,'prod5').withColumnRenamed('prod5', 'productname')
    inlineToolMatchingProdAll = inlineToolMatchingProd4_OneMonth.unionAll(inlineToolMatchingProd5_OneMonth).unionAll(inlineToolMatchingProd4_ThreeMonth).unionAll(inlineToolMatchingProd5_ThreeMonth)

    #inlineToolMatchingProdAll.show(20)
    print('Machine insert Start' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #inlineToolMatchingProdAll.write.format("hive").mode("overwrite").saveAsTable('it.inline_tool_matching_sum_ltp')
    inlineToolMatchingProdAll.registerTempTable("MachineTempTable")
    sqlContext.sql("insert overwrite table it.inline_tool_matching_sum_ltp select * from MachineTempTable")
    print('Machine insert End' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    # rawFilterDataDF_ThreeMonth = rawDataDF_three.selectExpr("*","row_number() over (partition by PRODUCTNAME,ITEMNAME,LOTNAME,SAMPLEMATERIALNAME,sitename order by createtime desc) as rank").where("rank=1")

    # rawFilterDataDF_OneMonth = rawDataDF.where("createtime>={0} and createtime<{1}").selectExpr("*","row_number() over (partition by PRODUCTNAME,ITEMNAME,LOTNAME,SAMPLEMATERIALNAME,sitename order by createtime desc) as rank").where("rank=1")

    # print(rawFilterDataDF_ThreeMonth.schema)
    # rawFilterDataDF_ThreeMonth.show(20)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Tony-甲

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值