import numpy as np
import sys
#sys.path.append("/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip")
#sys.path.append("/opt/cloudera/parcels/CDH/lib/spark/python/lib/pyspark.zip")
from sklearn.metrics import mean_squared_error,r2_score
from scipy import optimize
import scipy as sci
from sklearn.metrics import roc_curve
import pandas as pd
import pyspark.sql.functions as f
from pyspark.sql.functions import *
import warnings
import random
import collections
import datetime
from pyspark import SparkContext,SparkConf
from pyspark.sql import *
import datetime
from dateutil.relativedelta import *
from pyspark.sql.types import *
from sqlalchemy import create_engine
#import org.apache
def inline_spc_raw_LastResult(one_month_start_date,end_date):
inline_spc_raw_LastResultDF = rawDataDF_three\
.where("createtime >= '{0}' and createtime < '{1}'".format(one_month_start_date, end_date)) \
.selectExpr("*","row_number() over (partition by PRODUCTNAME,ITEMNAME,LOTNAME,SAMPLEMATERIALNAME,sitename order by createtime desc) as rank").where("rank=1")
return inline_spc_raw_LastResultDF
def inlineSiteSumProd(inline_spc_raw_DF,n_month,summarytype,prod):
# .selectExpr("*","row_number() over (partition by PRODUCTNAME,ITEMNAME,LOTNAME,SAMPLEMATERIALNAME,sitename order by createtime desc) as rank").where("rank=1") \ and itemname = 'DMBWDGET-OCDMTS061-0162_THKP1#1'
inlineSiteSumProdDF = inline_spc_raw_DF\
.where("lottype in ('PR','PP','PE','PC','PA','PB','PM','PI') and lotname not like 'AS%'") \
.groupBy("{0}".format(prod),"edcspecid","edcspecrevision","itemname")\
.agg(
f.avg("sitecount").alias("sitecount"),
f.max("samplecount").alias("samplecount"),
f.avg("target").alias("target"),
f.max("lowerspeclimit").alias("lowerspeclimit"),
f.min("upperspeclimit").alias("upperspeclimit"),
f.max("dcspecdes").alias("dcspecdes"),
f.max("datatype").alias("datatype"),
f.max("dcspecitemowner").alias("dcspecitemowner"),
f.countDistinct("samplematerialname").alias("lotcnt"),
f.max("dccdefine").alias("dccdefine"),
f.avg("result").alias("mean"),
f.stddev("result").alias("std"),
f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0) , ((f.min("upperspeclimit")- f.max("lowerspeclimit"))/(6 * f.stddev("result")))).otherwise("null").alias("cp"),
f.when (~f.isnull(f.min("upperspeclimit")- f.max("lowerspeclimit")) & ((f.min("upperspeclimit")- f.max("lowerspeclimit")) != 0),(f.abs(f.avg("result")-f.avg("target"))/((f.min("upperspeclimit")- f.max("lowerspeclimit"))/2))).otherwise("null").alias("k"),
f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
f.least(
((f.min("upperspeclimit") - f.avg("result"))/(3 * f.stddev("result"))),((f.avg("result")-f.min("lowerspeclimit"))/(3 * f.stddev("result"))))
).otherwise("null").alias("cpk"),
f.min("spcusl").alias("spcusl"),
f.max("spclsl").alias("spclsl"),
f.avg("spctarget").alias("spctarget"),
f.avg("spcgoal").alias("spcgoal"),
f.avg("kgoal").alias("kgoal"),
f.avg("downratiogoal").alias("downratiogoal"),
f.max("spcflag").alias("spcflag"),
f.max("dcseq").alias("dcseq"),
f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0) , ((f.min("spcusl")- f.max("spclsl"))/(6 * f.stddev("result")))).otherwise("null").alias("spccp"),
f.when (~f.isnull(f.min("spcusl")- f.max("spclsl")) & ((f.min("spcusl")- f.max("spclsl")) != 0),(f.abs(f.avg("result")-f.avg("spctarget"))/((f.min("spcusl")- f.max("spclsl"))/2))).otherwise("null").alias("spck"),
f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
f.least(
((f.min("spcusl") - f.avg("result"))/(3 * f.stddev("result"))),((f.avg("result")-f.min("spclsl"))/(3 * f.stddev("result"))))
).otherwise("null").alias("spccpk"),
f.countDistinct("lotname").alias("lotcount"),
(f.countDistinct("samplematerialname") * f.avg("sitecount")).alias("tot_site")
).selectExpr("*","'{0}' as monthly".format(n_month),"'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
#"case when (min('upperspeclimit') is null) or (max('lowerspeclimit') is null) then null when stddev('result')is not null and stddev('result') <> 0 then ((min('upperspeclimit')- max('lowerspeclimit'))/(6 * stddev('result'))) else null end cp"
#OneMonths Site Sum end
return inlineSiteSumProdDF
#inlineSiteSumProd4_OneMonth.show(20)
def inlineWaferSumProd(inline_spc_raw_DF,n_month,summarytype,prod):
#OneMonth site sum
inlineWaferSumProdDF = inline_spc_raw_DF\
.where("lottype in ('PR','PP','PE','PC','PA','PB','PM','PI') and lotname not like 'AS%'") \
.groupBy("{0}".format(prod),"edcspecid","edcspecrevision","itemname","lotname","samplematerialname")\
.agg(
f.avg("sitecount").alias("sitecount"),
f.max("samplecount").alias("samplecount"),
f.avg("target").alias("target"),
f.max("lowerspeclimit").alias("lowerspeclimit"),
f.min("upperspeclimit").alias("upperspeclimit"),
f.max("dcspecdes").alias("dcspecdes"),
f.max("datatype").alias("datatype"),
f.max("dcspecitemowner").alias("dcspecitemowner"),
#f.countDistinct("samplematerialname").alias("lotcnt"),
f.max("dccdefine").alias("dccdefine"),
f.avg("result").alias("avg"),
f.stddev("result").alias("std"),
#~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
# f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0) , ((f.min("upperspeclimit")- f.max("lowerspeclimit"))/(6 * f.stddev("result")))).otherwise("null").alias("cp"),
# f.when (~f.isnull(f.min("upperspeclimit")- f.max("lowerspeclimit")) & ((f.min("upperspeclimit")- f.max("lowerspeclimit")) != 0),(f.abs(f.avg("result")-f.avg("target"))/((f.min("upperspeclimit")- f.max("lowerspeclimit"))/2))).otherwise("null").alias("k"),
# f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
# f.least(
# ((f.min("upperspeclimit") - f.avg("result"))/(3 * f.stddev("result"))),((f.avg("result")-f.min("lowerspeclimit"))/(3 * f.stddev("result"))))
# ).otherwise("null").alias("cpk"),
f.min("spcusl").alias("spcusl"),
f.max("spclsl").alias("spclsl"),
f.avg("spctarget").alias("spctarget"),
f.avg("spcgoal").alias("spcgoal"),
f.avg("kgoal").alias("kgoal"),
f.avg("downratiogoal").alias("downratiogoal"),
f.max("spcflag").alias("spcflag"),
f.max("dcseq").alias("dcseq"),
# f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0) , ((f.min("spcusl")- f.max("spclsl"))/(6 * f.stddev("result")))).otherwise("null").alias("spccp"),
# f.when (~f.isnull(f.min("spcusl")- f.max("spclsl")) & ((f.min("spcusl")- f.max("spclsl")) != 0),(f.abs(f.avg("result")-f.avg("spctarget"))/((f.min("spcusl")- f.max("spclsl"))/2))).otherwise("null").alias("spck"),
# f.when (~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
# f.least(
# ((f.min("spcusl") - f.avg("result"))/(3 * f.stddev("result"))),((f.avg("result")-f.min("spclsl"))/(3 * f.stddev("result"))))
# ).otherwise("null").alias("spccpk"),
#
# f.countDistinct("lotname").alias("lotcount"),
# (f.countDistinct("samplematerialname") * f.avg("sitecount")).alias("tot_site")
).selectExpr("*","{0} as monthly".format(n_month),"{0} as summarytype".format(summarytype))\
.groupBy("{0}".format(prod),"edcspecid","edcspecrevision","itemname")\
.agg(
f.avg("sitecount").alias("sitecount"),
f.max("samplecount").alias("samplecount"),
f.avg("target").alias("target"),
f.max("lowerspeclimit").alias("lowerspeclimit"),
f.min("upperspeclimit").alias("upperspeclimit"),
f.max("dcspecdes").alias("dcspecdes"),
f.max("datatype").alias("datatype"),
f.max("dcspecitemowner").alias("dcspecitemowner"),
f.countDistinct("samplematerialname").alias("wafercnt"),
f.max("dccdefine").alias("dccdefine"),
f.avg("avg").alias("mean"),
f.avg("std").alias("stdmean"),
f.stddev("std").alias("std"),
f.stddev("avg").alias("meanstd"),
# ~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
f.when(~f.isnull(f.stddev("avg")) & (f.stddev("avg") != 0),
((f.min("upperspeclimit") - f.max("lowerspeclimit")) / (6 * f.stddev("avg")))).otherwise(
"null").alias("cp"),
f.when(~f.isnull(f.min("upperspeclimit") - f.max("lowerspeclimit")) & (
(f.min("upperspeclimit") - f.max("lowerspeclimit")) != 0), (
f.abs(f.avg("avg") - f.avg("target")) / (
(f.min("upperspeclimit") - f.max("lowerspeclimit")) / 2))).otherwise("null").alias("k"),
f.when(~f.isnull(f.stddev("avg")) & (f.stddev("avg") != 0),
f.least(
((f.min("upperspeclimit") - f.avg("avg")) / (3 * f.stddev("avg"))),
((f.avg("avg") - f.min("lowerspeclimit")) / (3 * f.stddev("avg"))))
).otherwise("null").alias("cpk"),
f.min("spcusl").alias("spcusl"),
f.max("spclsl").alias("spclsl"),
f.avg("spctarget").alias("spctarget"),
f.avg("spcgoal").alias("spcgoal"),
f.avg("kgoal").alias("kgoal"),
f.avg("downratiogoal").alias("downratiogoal"),
f.max("spcflag").alias("spcflag"),
f.max("dcseq").alias("dcseq"),
f.when(~f.isnull(f.stddev("avg")) & (f.stddev("avg") != 0),
((f.min("spcusl") - f.max("spclsl")) / (6 * f.stddev("avg")))).otherwise("null").alias("spccp"),
f.when(~f.isnull(f.min("spcusl") - f.max("spclsl")) & ((f.min("spcusl") - f.max("spclsl")) != 0),
(f.abs(f.avg("avg") - f.avg("spctarget")) / ((f.min("spcusl") - f.max("spclsl")) / 2))).otherwise(
"null").alias("spck"),
f.when(~f.isnull(f.stddev("avg")) & (f.stddev("avg") != 0),
f.least(
((f.min("spcusl") - f.avg("avg")) / (3 * f.stddev("avg"))),
((f.avg("avg") - f.min("spclsl")) / (3 * f.stddev("avg"))))
).otherwise("null").alias("spccpk"),
f.countDistinct("lotname").alias("lotcount"),
(f.countDistinct("samplematerialname") * f.avg("sitecount")).alias("tot_site")
).selectExpr("*", "'{0}' as monthly".format(n_month), "'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
#OneMonths Site Sum end
return inlineWaferSumProdDF
def inlineSiteSumNoFilter(inline_spc_raw_DF,n_month,summarytype,prod):
inlineSiteSumNoFilterDF = inline_spc_raw_DF \
.groupBy("{0}".format(prod), "edcspecid", "edcspecrevision", "itemname") \
.agg(
f.avg("sitecount").alias("sitecount"),
f.max("samplecount").alias("samplecount"),
f.avg("target").alias("target"),
f.max("lowerspeclimit").alias("lowerspeclimit"),
f.min("upperspeclimit").alias("upperspeclimit"),
f.max("dcspecdes").alias("dcspecdes"),
f.max("datatype").alias("datatype"),
f.max("dcspecitemowner").alias("dcspecitemowner"),
f.countDistinct("samplematerialname").alias("lotcnt"),
f.max("dccdefine").alias("dccdefine"),
f.avg("result").alias("avg"),
f.stddev("result").alias("std"),
# ~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
f.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
((f.min("upperspeclimit") - f.max("lowerspeclimit")) / (6 * f.stddev("result")))).otherwise(
"null").alias("cp"),
f.when(~f.isnull(f.min("upperspeclimit") - f.max("lowerspeclimit")) & (
(f.min("upperspeclimit") - f.max("lowerspeclimit")) != 0), (
f.abs(f.avg("result") - f.avg("target")) / (
(f.min("upperspeclimit") - f.max("lowerspeclimit")) / 2))).otherwise("null").alias("k"),
f.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
f.least(
((f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))),
((f.avg("result") - f.min("lowerspeclimit")) / (3 * f.stddev("result"))))
).otherwise("null").alias("cpk"),
f.min("spcusl").alias("spcusl"),
f.max("spclsl").alias("spclsl"),
f.avg("spctarget").alias("spctarget"),
f.avg("spcgoal").alias("spcgoal"),
f.avg("kgoal").alias("kgoal"),
f.avg("downratiogoal").alias("downratiogoal"),
f.max("spcflag").alias("spcflag"),
f.max("dcseq").alias("dcseq"),
f.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
((f.min("spcusl") - f.max("spclsl")) / (6 * f.stddev("result")))).otherwise("null").alias("spccp"),
f.when(~f.isnull(f.min("spcusl") - f.max("spclsl")) & ((f.min("spcusl") - f.max("spclsl")) != 0),
(f.abs(f.avg("result") - f.avg("spctarget")) / ((f.min("spcusl") - f.max("spclsl")) / 2))).otherwise(
"null").alias("spck"),
f.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
f.least(
((f.min("spcusl") - f.avg("result")) / (3 * f.stddev("result"))),
((f.avg("result") - f.min("spclsl")) / (3 * f.stddev("result"))))
).otherwise("null").alias("spccpk"),
f.countDistinct("lotname").alias("lotcount"),
(f.countDistinct("samplematerialname") * f.avg("sitecount")).alias("tot_site")
).selectExpr("*", "'{0}' as monthly".format(n_month), "'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
return inlineSiteSumNoFilterDF
def inlineToolMatchingSum(inline_spc_raw_DF,n_month,summarytype,prod):
inlineToolMatchingDF1 = inline_spc_raw_DF.where(
"lottype in ('PR','PP','PE','PC','PA','PB','PM','PI') and lotname not like 'AS%'") \
.where("processmachinename != processsubmachinename")\
.groupBy("{0}".format(prod),"itemname","processmachinename","processsubmachinename") \
.agg(
f.max("modulename").alias("modulename"),
f.countDistinct("lotsysid").alias("waferconut"),
f.max("target").alias("target"),
f.max("upperspeclimit").alias("upperspeclimit"),
f.max("lowerspeclimit").alias("lowerspeclimit"),
f.avg("result").alias("mean"),
f.stddev("result").alias("std"),
f.when((f.isnull(f.min("upperspeclimit"))) | (f.isnull(f.max("lowerspeclimit"))),"null")
.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),((f.min("upperspeclimit") - f.max("lowerspeclimit")) / (6 * f.stddev("result"))))
.otherwise("null").alias("cp"),
f.when(~f.isnull(f.min("upperspeclimit")) & f.isnull(f.max("lowerspeclimit")) & (f.min("upperspeclimit") - f.avg("target") != 0),
(f.avg("result") - f.avg("target")) / (f.min("upperspeclimit") - f.avg("target"))
)
.when(f.isnull(f.min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & ((f.avg("target") - f.max("lowerspeclimit")) != 0),
(f.avg("target") - f.avg("result")) / (f.avg("target") - f.max("lowerspeclimit"))
)
.when(~f.isnull(f.min("upperspeclimit") - f.max("lowerspeclimit")) & ((f.min("upperspeclimit") - f.max("lowerspeclimit")) != 0),
f.abs(f.avg("result") - f.avg("target")) / ((f.min("upperspeclimit") - f.max("lowerspeclimit")) / 2)
)
.otherwise("null").alias("k"),
f.when(~f.isnull(min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & (~f.isnull(f.stddev("result"))) & (f.stddev("result") != 0),
(f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))
)
.when(f.isnull(f.min("upperspeclimit")) & (~f.isnull(f.max("lowerspeclimit"))) & (~f.isnull(f.stddev("result"))) & (f.stddev("result") != 0),
(f.avg("result") - f.max("lowerspeclimit")) / (3 * f.stddev("result"))
)
.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
f.least(
((f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))),
((f.avg("result") - f.min("lowerspeclimit")) / (3 * f.stddev("result"))))
).otherwise("null").alias("cpk")
).selectExpr("{0} as productname".format(prod),"itemname","modulename","processmachinename",
"processsubmachinename","waferconut","target","upperspeclimit","lowerspeclimit",
"mean","std","cp","k","cpk",
"'{0}' as monthly".format(n_month),"'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
inlineToolMatchingDF2 = inline_spc_raw_DF.where(
"lottype in ('PR','PP','PE','PC','PA','PB','PM','PI') and lotname not like 'AS%'") \
.groupBy("{0}".format(prod), "itemname", "processmachinename") \
.agg(
f.max("modulename").alias("modulename"),
f.countDistinct("lotsysid").alias("waferconut"),
f.max("target").alias("target"),
f.max("upperspeclimit").alias("upperspeclimit"),
f.max("lowerspeclimit").alias("lowerspeclimit"),
f.avg("result").alias("mean"),
f.stddev("result").alias("std"),
f.when((f.isnull(f.min("upperspeclimit"))) | (f.isnull(f.max("lowerspeclimit"))), "null")
.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
((f.min("upperspeclimit") - f.max("lowerspeclimit")) / (6 * f.stddev("result"))))
.otherwise("null").alias("cp"),
f.when(~f.isnull(f.min("upperspeclimit")) & f.isnull(f.max("lowerspeclimit")) & (
f.min("upperspeclimit") - f.avg("target") != 0),
(f.avg("result") - f.avg("target")) / (f.min("upperspeclimit") - f.avg("target"))
)
.when(f.isnull(f.min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & (
(f.avg("target") - f.max("lowerspeclimit")) != 0),
(f.avg("target") - f.avg("result")) / (f.avg("target") - f.max("lowerspeclimit"))
)
.when(~f.isnull(f.min("upperspeclimit") - f.max("lowerspeclimit")) & (
(f.min("upperspeclimit") - f.max("lowerspeclimit")) != 0),
f.abs(f.avg("result") - f.avg("target")) / ((f.min("upperspeclimit") - f.max("lowerspeclimit")) / 2)
)
.otherwise("null").alias("k"),
f.when(
~f.isnull(min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & (~f.isnull(f.stddev("result"))) & (
f.stddev("result") != 0),
(f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))
)
.when(f.isnull(f.min("upperspeclimit")) & (~f.isnull(f.max("lowerspeclimit"))) & (
~f.isnull(f.stddev("result"))) & (f.stddev("result") != 0),
(f.avg("result") - f.max("lowerspeclimit")) / (3 * f.stddev("result"))
)
.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
f.least(
((f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))),
((f.avg("result") - f.min("lowerspeclimit")) / (3 * f.stddev("result"))))
).otherwise("null").alias("cpk")
).selectExpr("{0} as productname".format(prod), "itemname", "modulename", "processmachinename",
"processmachinename as processsubmachinename", "waferconut", "target", "upperspeclimit", "lowerspeclimit",
"mean", "std", "cp", "k", "cpk",
"'{0}' as monthly".format(n_month), "'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
inlineToolMatchingDF3 = inline_spc_raw_DF.where(
"lottype in ('PR','PP','PE','PC','PA','PB','PM','PI') and lotname not like 'AS%'") \
.groupBy("{0}".format(prod), "itemname") \
.agg(
f.max("modulename").alias("modulename"),
f.countDistinct("lotsysid").alias("waferconut"),
f.max("target").alias("target"),
f.max("upperspeclimit").alias("upperspeclimit"),
f.max("lowerspeclimit").alias("lowerspeclimit"),
f.avg("result").alias("mean"),
f.stddev("result").alias("std"),
f.when((f.isnull(f.min("upperspeclimit"))) | (f.isnull(f.max("lowerspeclimit"))), "null")
.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
((f.min("upperspeclimit") - f.max("lowerspeclimit")) / (6 * f.stddev("result"))))
.otherwise("null").alias("cp"),
f.when(~f.isnull(f.min("upperspeclimit")) & f.isnull(f.max("lowerspeclimit")) & (
f.min("upperspeclimit") - f.avg("target") != 0),
(f.avg("result") - f.avg("target")) / (f.min("upperspeclimit") - f.avg("target"))
)
.when(f.isnull(f.min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & (
(f.avg("target") - f.max("lowerspeclimit")) != 0),
(f.avg("target") - f.avg("result")) / (f.avg("target") - f.max("lowerspeclimit"))
)
.when(~f.isnull(f.min("upperspeclimit") - f.max("lowerspeclimit")) & (
(f.min("upperspeclimit") - f.max("lowerspeclimit")) != 0),
f.abs(f.avg("result") - f.avg("target")) / ((f.min("upperspeclimit") - f.max("lowerspeclimit")) / 2)
)
.otherwise("null").alias("k"),
f.when(
~f.isnull(min("upperspeclimit")) & (f.isnull(f.max("lowerspeclimit"))) & (~f.isnull(f.stddev("result"))) & (
f.stddev("result") != 0),
(f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))
)
.when(f.isnull(f.min("upperspeclimit")) & (~f.isnull(f.max("lowerspeclimit"))) & (
~f.isnull(f.stddev("result"))) & (f.stddev("result") != 0),
(f.avg("result") - f.max("lowerspeclimit")) / (3 * f.stddev("result"))
)
.when(~f.isnull(f.stddev("result")) & (f.stddev("result") != 0),
f.least(
((f.min("upperspeclimit") - f.avg("result")) / (3 * f.stddev("result"))),
((f.avg("result") - f.min("lowerspeclimit")) / (3 * f.stddev("result"))))
).otherwise("null").alias("cpk")
).selectExpr("{0} as productname".format(prod), "itemname", "modulename", "'NA' as processmachinename",
"'NA' as processsubmachinename", "waferconut", "target", "upperspeclimit",
"lowerspeclimit",
"mean", "std", "cp", "k", "cpk",
"'{0}' as monthly".format(n_month), "'{0}' as summarytype".format(summarytype),"'{0}' as updatetime".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
inlineToolMatchingDF=inlineToolMatchingDF1.unionAll(inlineToolMatchingDF2).unionAll(inlineToolMatchingDF3)
return inlineToolMatchingDF
if __name__ == '__main__':
currentTime = datetime.datetime.now() # datetime.datetime.strptime("2022-01-15 01:00:00",'%Y-%m-%d %H:%M:%S')
end_date = currentTime.strftime('%Y-%m-%d 00:00:00')
print(end_date)
start_date = (currentTime + datetime.timedelta(days=-1)).strftime('%Y-%m-%d') + ' 00:00:00'
print(start_date)
month_end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d %H:%M:%S')
print(month_end_date)
FAB_DATE = month_end_date.strftime('%Y%m%d')
print(FAB_DATE)
N_DAY = int(FAB_DATE[6:8])
print(N_DAY)
if N_DAY > 25:
n_month = (month_end_date + relativedelta(months=+1)).strftime('%Y%m')
print(n_month)
n_threemonth = (month_end_date + relativedelta(months=-1)).strftime('%Y%m') + "-" + (
month_end_date + relativedelta(months=+1)).strftime('%Y%m')
print(n_threemonth)
# one_month_start_date := TO_DATE(to_char(end_date, 'YYYYMM') | | '25000000', 'YYYYMMDDHH24MISS');
one_month_start_date = (month_end_date).strftime('%Y-%m') + "-25 00:00:00"
print(one_month_start_date)
three_month_start_date = (month_end_date + relativedelta(months=-2)).strftime('%Y-%m') + "-25 00:00:00"
print(three_month_start_date)
else:
n_month = (month_end_date).strftime('%Y%m')
print(n_month)
n_threemonth = (month_end_date + relativedelta(months=-2)).strftime('%Y%m') + "-" + (month_end_date).strftime(
'%Y%m')
print(n_threemonth)
# one_month_start_date := TO_DATE(to_char(end_date, 'YYYYMM') | | '25000000', 'YYYYMMDDHH24MISS');
one_month_start_date = (month_end_date + relativedelta(months=-1)).strftime('%Y-%m') + "-25 00:00:00"
print(one_month_start_date)
three_month_start_date = (month_end_date + relativedelta(months=-3)).strftime('%Y-%m') + "-25 00:00:00"
print(three_month_start_date)
conf = SparkConf().setAppName('inline_spc_sum_prd')#.setMaster('local[16]')
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
sqlContext = HiveContext(sc)
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
#spark.conf.set("spark.debug.maxToStringFields", "100")
aa = datetime.datetime.strptime(three_month_start_date,'%Y-%m-%d %H:%M:%S')
#aa = datetime.datetime.strptime('2022-01-12 00:00:00', '%Y-%m-%d %H:%M:%S')
print(aa)
inline_spc_raw_schema = StructType([
StructField("SYSID", StringType(), True),
StructField("PRODUCTNAME", StringType(), True),
StructField("EDCSPECID", StringType(), True),
StructField("EDCSPECREVISION", StringType(), True),
StructField("ACTIVESTATE", StringType(), True),
StructField("ITEMNAME", StringType(), True),
StructField("SITECOUNT", StringType(), True),
StructField("SAMPLECOUNT", StringType(), True),
StructField("TARGET", DecimalType(), True),
StructField("LOWERSPECLIMIT", DecimalType(), True),
StructField("UPPERSPECLIMIT", DecimalType(), True),
StructField("LOTNAME", StringType(), True),
StructField("SAMPLEMATERIALNAME", StringType(), True),
StructField("SITENAME", StringType(), True),
StructField("DCSPECDES", StringType(), True),
StructField("DATATYPE", StringType(), True),
StructField("DCSPECITEMOWNER", StringType(), True),
StructField("LOTTYPE", StringType(), True),
StructField("DCCDEFINE", StringType(), True),
StructField("RESULT", DecimalType(), True),
StructField("CREATETIME", StringType(), True),
StructField("UPDATETIME", StringType(), True),
StructField("SPCUSL", DecimalType(), True),
StructField("SPCLSL", DecimalType(), True),
StructField("SPCTARGET", DecimalType(), True),
StructField("SPCGOAL", DecimalType(), True),
StructField("SPCFLAG", StringType(), True),
StructField("KGOAL", DecimalType(), True),
StructField("DOWNRATIOGOAL", DecimalType(), True),
StructField("DCSEQ", StringType(), True),
StructField("STEPSEQUENCE", StringType(), True),
StructField("PROCESSOPERATIONNAME", StringType(), True),
StructField("PROCESSMACHINENAME", StringType(), True),
StructField("PROCESSSUBMACHINENAME", StringType(), True),
StructField("UPPERSCREENLIMIT", DecimalType(), True),
StructField("LOWERSCREENLIMIT", DecimalType(), True),
StructField("MODULENAME", StringType(), True),
StructField("LOTSYSID", StringType(), True),
StructField("PROD4", StringType(), True),
StructField("PROD5", StringType(), True)
])
# rawDataDF = spark.read.format('com.databricks.spark.csv').options(inferschema=True,delimiter='\t').load('')
rawDataDF_three = spark.createDataFrame(spark.sparkContext.emptyRDD(), inline_spc_raw_schema)
while aa < (datetime.datetime.now() + datetime.timedelta(days=-1)):
bb = aa.strftime('%Y%m%d')
filepath = 'hdfs://10.250.3.151:8020//user//it//inline_spc_raw//inline_spc_raw_{0}.csv'.format(bb)
rawDataDF_three = rawDataDF_three.unionAll(
spark.read.format('com.databricks.spark.csv').options(inferschema=True, delimiter='\t').load(filepath))
#rawDataDF_three = rawDataDF_three
print(bb)
print(filepath)
aa = aa + datetime.timedelta(days=+1)
#sql="select * from it .inline_spc_raw where createtime >= '{0}' and createtime < '{1}'".format('start_date','end_date')
#sql = "select * from it .inline_spc_raw limit 10"
#rawDataDF_three = rawDataDF_three.unionAll(sqlContext.sql(sql))
#rawDataDF_three.show(10)
#inline_spc_raw_ThreeMonthDF = sqlContext.sql(sql)
inline_spc_raw_OneMonthDF = inline_spc_raw_LastResult(one_month_start_date, end_date)
inline_spc_raw_ThreeMonthDF = inline_spc_raw_LastResult(three_month_start_date, end_date)
print('LastResult End' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
inlineSiteSumProd4_OneMonth=inlineSiteSumProd(inline_spc_raw_OneMonthDF,n_month,'1','prod4').withColumnRenamed('prod4','productname')
#inlineSiteSumProd4_OneMonth.withColumnRenamed('prod4','productname')
inlineSiteSumProd5_OneMonth = inlineSiteSumProd(inline_spc_raw_OneMonthDF, n_month, '1', 'prod5').withColumnRenamed('prod5','productname')
inlineSiteSumProd4_ThreeMonth = inlineSiteSumProd(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod4').withColumnRenamed('prod4','productname')
inlineSiteSumProd5_ThreeMonth = inlineSiteSumProd(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod5').withColumnRenamed('prod5','productname')
inlineSiteSumProdAll = inlineSiteSumProd4_OneMonth.unionAll(inlineSiteSumProd5_OneMonth).unionAll(inlineSiteSumProd4_ThreeMonth).unionAll(inlineSiteSumProd5_ThreeMonth)
print('Site insert Start' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
#inlineSiteSumProdAll.show(20)
print('Site insert Start' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
#inlineSiteSumProdAll.write.csv('10.250.3.150//opt//datax//script//ITScheduleScript//IT_scripts_LTP//inline_spc_site_sum_prd_ltp.csv')
print('Site insert Start' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
#inlineSiteSumProdAll=inlineSiteSumProdAll.rdd.repartition(1).persist()
inlineSiteSumProdAll.registerTempTable("SiteTempTable")
#inlineSiteSumProdAll.write.format("hive").mode("overwrite").saveAsTable('it.inline_spc_site_sum_prd_ltp')
#sqlContext.sql("create table if not exists it.inline_spc_site_sum_prd_ltps select * from TempTable")
sqlContext.sql("insert overwrite table it.inline_spc_site_sum_prd_ltp select * from SiteTempTable")
print('Site insert End'+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
#
# engine = create_engine("oracle://RPT:RPT@10.66.3.239/p6rptdb1",encode='utf-8',echo=True)
# data = pd.read_sql('select * from ETL_CONFIG_HISTORY where UPDATE_TIME > sysdate - 1/24',engine)
# data.show(20)
#inlineSiteSumProdAll.write.mode("overwrite").jdbc()
inlineWaferSumProd4_OneMonth = inlineWaferSumProd(inline_spc_raw_OneMonthDF,n_month,'1','prod4').withColumnRenamed('prod4','productname')
#inlineWaferSumProd4_OneMonth.withColumnRenamed('prod4','productname')
inlineWaferSumProd5_OneMonth = inlineWaferSumProd(inline_spc_raw_OneMonthDF, n_month, '1', 'prod5').withColumnRenamed('prod5','productname')
inlineWaferSumProd4_ThreeMonth = inlineWaferSumProd(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod4').withColumnRenamed('prod4','productname')
inlineWaferSumProd5_ThreeMonth = inlineWaferSumProd(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod5').withColumnRenamed('prod5','productname')
inlineWaferSumProdAll=inlineWaferSumProd4_OneMonth.unionAll(inlineWaferSumProd5_OneMonth).unionAll(inlineWaferSumProd4_ThreeMonth).unionAll(inlineWaferSumProd5_ThreeMonth)
#inlineWaferSumProdAll.show(20)
print('Wafer insert Start' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
#inlineWaferSumProdAll.write.format("hive").mode("overwrite").saveAsTable('it.inline_spc_wafer_sum_prd_ltp') #very slow
inlineWaferSumProdAll.registerTempTable("WaferTempTable")
sqlContext.sql("insert overwrite table it.inline_spc_wafer_sum_prd_ltp select * from WaferTempTable")
print('Wafer insert End'+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# inlineSiteSumNoFilterProd4_OneMonth = inlineSiteSumNoFilter(inline_spc_raw_OneMonthDF, n_month, '1', 'prod4')
# inlineSiteSumNoFilterProd5_OneMonth = inlineSiteSumNoFilter(inline_spc_raw_OneMonthDF, n_month, '1', 'prod5')
# inlineSiteSumNoFilterProd4_ThreeMonth = inlineSiteSumNoFilter(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod4')
# inlineSiteSumNoFilterProd5_ThreeMonth = inlineSiteSumNoFilter(inline_spc_raw_ThreeMonthDF, n_threemonth, '3', 'prod5')
# inlineSiteSumNoFilterProd4_OneMonth.show(20)
inlineToolMatchingProd4_OneMonth = inlineToolMatchingSum(inline_spc_raw_OneMonthDF,n_month,1,'prod4').withColumnRenamed('prod4','productname')
inlineToolMatchingProd5_OneMonth = inlineToolMatchingSum(inline_spc_raw_OneMonthDF, n_month, 1, 'prod5').withColumnRenamed('prod5', 'productname')
inlineToolMatchingProd4_ThreeMonth = inlineToolMatchingSum(inline_spc_raw_ThreeMonthDF, n_threemonth, 3,'prod4').withColumnRenamed('prod4', 'productname')
inlineToolMatchingProd5_ThreeMonth = inlineToolMatchingSum(inline_spc_raw_ThreeMonthDF, n_threemonth, 3,'prod5').withColumnRenamed('prod5', 'productname')
inlineToolMatchingProdAll = inlineToolMatchingProd4_OneMonth.unionAll(inlineToolMatchingProd5_OneMonth).unionAll(inlineToolMatchingProd4_ThreeMonth).unionAll(inlineToolMatchingProd5_ThreeMonth)
#inlineToolMatchingProdAll.show(20)
print('Machine insert Start' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
#inlineToolMatchingProdAll.write.format("hive").mode("overwrite").saveAsTable('it.inline_tool_matching_sum_ltp')
inlineToolMatchingProdAll.registerTempTable("MachineTempTable")
sqlContext.sql("insert overwrite table it.inline_tool_matching_sum_ltp select * from MachineTempTable")
print('Machine insert End' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# rawFilterDataDF_ThreeMonth = rawDataDF_three.selectExpr("*","row_number() over (partition by PRODUCTNAME,ITEMNAME,LOTNAME,SAMPLEMATERIALNAME,sitename order by createtime desc) as rank").where("rank=1")
# rawFilterDataDF_OneMonth = rawDataDF.where("createtime>={0} and createtime<{1}").selectExpr("*","row_number() over (partition by PRODUCTNAME,ITEMNAME,LOTNAME,SAMPLEMATERIALNAME,sitename order by createtime desc) as rank").where("rank=1")
# print(rawFilterDataDF_ThreeMonth.schema)
# rawFilterDataDF_ThreeMonth.show(20)
Sparksql Dataframe聚合操作
于 2022-01-19 09:43:35 首次发布