当天内交易金额快进快出的次数。
业务描述:借钱和贷钱为一组,比如:第一组 2022年12月28日 借100,贷99;第二组 2022年12月28日借100,借100,贷200,就是当天内两次快进快出。下面是详细代码:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as F
import datetime as dt
from pyspark.sql import types as T
import time
from pyspark.sql.types import IntegerType, FloatType
# import scala.util.Try
#添加此代码
import findspark
findspark.init()
sparkconf = SparkConf().setAppName("MYPRO").setMaster("local[*]")#.set("spark.ui.showConsoleProgress", "false")\
#.set("mapred.map.child.java.opts","-Xmx2048m").set("spark.driver.host","test")#.set("spark.driver.bindAddress","127.0.0.1")
sc = SparkContext(conf=sparkconf)
print("master:" + sc.master)
sc.setLogLevel("WARN")
# Setpath(sc)
spark = SparkSession.builder.config(conf=sparkconf).getOrCreate()
biz_df = spark.read.option("header", "true").csv('D:\\workStation\\othreCoding\\Dynamic\\tgn-master\\data\\black_data\\data1.csv')
# biz_df = spark.read.option("header", "true").csv('D:\\data\\black_all_data_feature\\feature.csv')
biz_df = biz_df\
.withColumn("jiaoyije", F.col("jiaoyije").cast('double'))\
.withColumn("zhanghye", F.col("zhanghye").cast('double'))\
.withColumn("kehuzhao", F.col("kehuzhao").cast('String'))\
.withColumnRenamed("jiedaibz", "debit_credit")\
.withColumnRenamed("jiaoyije", "amt")\
.withColumnRenamed("zhanghye", "bal_amt")\
.withColumnRenamed("kehuzhao", "acct")\
.withColumnRenamed("duifkhzh", "opp_acct")\
.withColumnRenamed("jiaoyirq", "tr_dt")\
.withColumnRenamed("jiaoyisj", "tm").cache()
# // 修改时间
# // rawdata2 = changeTime(rawdata2)
def changeDT_(x):
sdfSource = dt.datetime.strptime(x, "%Y%m%d")
sdfTarget = dt.datetime.strftime(sdfSource, "%Y-%m-%d")
return sdfTarget
# dateTarget = sdfTarget.format(sdfSource.parse(x))
# dateTarget
def changeTM_(x):
xx = ""
if(len(x) >= 6):
xx = x[0:6]
else:
dd = 6 - len(x)
xx = x
while(dd >0):
xx += "0"
dd -= 1
# print(x)
x = "20221230" + x
sdfSource = dt.datetime.strptime(x, "%Y%m%d%H%M%S")
sdfTarget = dt.datetime.strptime(sdfSource, "%Y-%m-%d %H:%M:%S")
# sdfTarget = time.strftime('%H:%M:%S', x)
return sdfTarget.split(" ")[1]
# dateTarget = sdfTarget.format(sdfSource.parse(xx))
# dateTarget
def changeDTTM_(x, tm):
str_tm = x + tm
return str_tm
changeDT = F.udf(changeDT_, T.StringType())
# changeTM = F.udf(changeTM_, T.StringType())
changeDTTM = F.udf(changeDTTM_, T.StringType())
biz_df = biz_df.withColumn("tr_dt", changeDT(F.col("tr_dt")))
# .withColumn("tm", changeTM(F.col("tm"))).cache()
# biz_df = biz_df.withColumn("tr_tm", changeDTTM("tr_dt", "tm")).cache()
dfMaxDT = biz_df.groupBy("acct").agg(F.max(F.col("tr_dt")).alias("max_tr_dt")).withColumnRenamed("acct", "acct22")
#.drop("acct")
biz_df = biz_df.join(dfMaxDT, dfMaxDT["acct22"] == biz_df["acct"], "inner").drop("acct22")
biz_df.createOrReplaceTempView("rawdata2")
biz_df = spark.sql("select acct, amt, debit_credit, bal_amt, tr_dt from rawdata2 where date_sub(max_tr_dt, 30) <= date(tr_dt)")
# biz_df.select("acct").show()
biz_df = biz_df.where("acct in ('62312301003400681', '62312301001467500')")
biz_df0 = biz_df.select("acct", "tr_dt").distinct()
biz_df1 = biz_df.select("acct").distinct()
df_pands = biz_df.toPandas()
df_pands = df_pands.sort_values(by=['acct', 'tr_dt'], ascending=True)
print(df_pands.head(50))
df_pands = df_pands[['acct', 'amt', 'bal_amt', 'debit_credit', 'tr_dt']]
# print(df_pands.iloc[:100])
pre_acct = ''
pre_dt = ''
pre_c = 0.0
pre_d = 0.0
str_flag = []
day_count = 0
acct_all_count = 0
# AABCCD
def get_acct_fast_count(acct, tar_acct, fast_count, old_val):
if(acct == tar_acct):
return fast_count
else:
return old_val
get_acct_fast_count = changeDT = F.udf(get_acct_fast_count, T.IntegerType())
def get_day_fast_count(acct, tar_acct, tr_dt, tar_tr_dt, fast_count, old_val):
if(acct == tar_acct and tr_dt == tar_tr_dt):
return fast_count
else:
return old_val
get_day_fast_count = changeDT = F.udf(get_day_fast_count, T.IntegerType())
# def hasColumn(df, path): bool = Try(df(path)).isSuccess
for it_acct, it_amt, it_deb, it_dt in zip(df_pands['acct'], df_pands['amt'], df_pands['debit_credit'], df_pands['tr_dt']):
# for it_acct, it_amt, it_deb, it_dt in zip(df_pands['acct'], df_pands['amt'], df_pands['debit_credit'], df_pands['tr_dt']):
# print(it_acct, it_amt, it_deb, it_dt)
print(it_acct, it_amt, it_deb, it_dt)
if(pre_acct == ''):
pre_acct = it_acct
pre_dt = it_dt
if(it_dt != pre_dt and it_deb == 'D'):
continue
if(it_acct != pre_acct or it_dt != pre_dt): #判断是不是和上一个账号相同
# if(it_dt != pre_dt): #判断是不是同一个日期
# print(it_acct, it_dt, it_deb)
# str_flag.append(it_deb)
# else:
if(pre_d == 0):
pre_d = 1
pre_c = 0
res = pre_c / pre_d
if(res >= 0.9 and res <= 1.1):
day_count += 1
acct_all_count += day_count
if("day_fast_count" in biz_df0.columns):
biz_df0 = biz_df0.withColumn("day_fast_count",
get_day_fast_count(F.col("acct"), F.lit(pre_acct), F.col("tr_dt"), F.lit(pre_dt),
F.lit(day_count), F.col("day_fast_count")))
else:
biz_df0 = biz_df0.withColumn("day_fast_count",
get_day_fast_count(F.col("acct"), F.lit(pre_acct), F.col("tr_dt"), F.lit(pre_dt),
F.lit(day_count), F.lit(0)))
day_count = 0
pre_c = 0.0
pre_d = 0.0
pre_dt = it_dt
str_flag = []
# continue
if(it_acct != pre_acct): #如果不是同一个账号,相关数据清零
# 汇总当前账号的总满足数量
if("acct_fast_count" in biz_df0.columns):
biz_df1 = biz_df1.withColumn("acct_fast_count", get_acct_fast_count(F.col("acct"), F.lit(pre_acct), F.lit(acct_all_count), F.col("acct_fast_count")))
else:
biz_df1 = biz_df1.withColumn("acct_fast_count", get_acct_fast_count(F.col("acct"), F.lit(pre_acct), F.lit(acct_all_count), F.lit(0)))
acct_all_count = 0
day_count = 0
pre_c = 0.0
pre_d = 0.0
pre_dt = it_dt
str_flag = []
pre_acct = it_acct
# str_flag.append(it_deb)
if(it_deb == 'C' and (('C' in str_flag and 'D' in str_flag and str_flag[0] != 'C') or ('D' not in str_flag) or ('D' in str_flag and 'C' not in str_flag))):
pre_c += it_amt
elif(it_deb == 'D' and (('C' in str_flag and 'D' in str_flag and str_flag[0] != 'D') or ('C' not in str_flag) or ('C' in str_flag and 'D' not in str_flag))):
pre_d += it_amt
else:
if(pre_d == 0):
pre_d = 1
pre_c = 0
res = pre_c / pre_d
if(res >= 0.9 and res <= 1.1):
day_count += 1
str_flag = []
pre_c = 0.0
pre_d = 0.0
if(it_deb == 'C'):
pre_c += it_amt
if(it_deb == 'D'):
pre_d += it_amt
str_flag.append(it_deb)
# print("====")
biz_df1 = biz_df1.withColumn("acct_fast_count", get_acct_fast_count(F.col("acct"), F.lit(pre_acct), F.lit(acct_all_count), F.col("acct_fast_count")))
biz_df0.show()
biz_df1.show()