Santander Customer Transaction Prediction (2)

https://www.kaggle.com/c/santander-customer-transaction-prediction/leaderboard

import pandas as pd
import matplotlib.pyplot as plt
# Initialize the Spark session
def spark_init(master = "yarn",appName="test"):  
    from pyspark.sql import SparkSession 
    from pyspark.sql import types  
    spark = SparkSession.builder.master(master).appName(appName).getOrCreate() 
    return spark
# Read a single data source into a DataFrame
def get_input_data(path, header=True, inferSchema=True, schema=None, sep=',',encoding='UTF-8'): 
    df = spark.read.csv(path, header=header, inferSchema=inferSchema, schema=schema,sep=sep,encoding=encoding)
    return df
# Persist the processed data to HDFS
def output_data(df, location, size=1, format="csv", mode="overwrite", header=False, delimiter=','): 
    df.repartition(size).write.save(location, format=format, header=header, delimiter=delimiter,
                                    mode=mode)
    return location
# Data source description
# Data exploration
# Descriptive statistics of the data
def data_describe(df):
    import json
    des = df.describe().toPandas()
    return json.dumps(des.to_dict(), ensure_ascii=False)
# Correlation between two variable factors
def data_single_corr(df, col1, col2):
    return df.corr(col1, col2)
# Correlations among all numeric variable factors
def data_all_corr(df):
    import json
    import pandas as pd
    # keep all numeric column types
    numerical = [t[0] for t in df.dtypes if t[1] in ('int', 'bigint', 'float', 'double')]
    n_numerical = len(numerical)
    corr = []
    for i in range(0, n_numerical):
        temp = [None] * i
        for j in range(i, n_numerical):
            temp.append(df.corr(numerical[i], numerical[j]))
        corr.append(temp)  
    df01 = pd.DataFrame(corr,columns=numerical) 
    return json.dumps(df01.to_dict(), ensure_ascii=False)
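# Illustrative alternative (not part of the original notebook): the pairwise loop above
# launches one Spark job per column pair, i.e. O(n^2) jobs. On Spark 2.2+ the full
# Pearson matrix can be computed in a single pass with pyspark.ml.stat.Correlation;
# numeric_cols below is a caller-supplied list of numeric column names.
def data_all_corr_fast(df, numeric_cols):
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.stat import Correlation
    import pandas as pd
    assembler = VectorAssembler(inputCols=numeric_cols, outputCol="_corr_vec")
    vec_df = assembler.transform(df).select("_corr_vec")
    # Correlation.corr returns a one-row DataFrame whose single cell is the matrix
    matrix = Correlation.corr(vec_df, "_corr_vec").head()[0].toArray()
    return pd.DataFrame(matrix, index=numeric_cols, columns=numeric_cols)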
# Preprocessing: sampling
def sample(df, withReplacement=False, fraction=0.5, seed=None):
    sample1 = df.sample(
        withReplacement=withReplacement,  # with or without replacement, per the flag
        fraction=fraction,
        seed=seed)
    return sample1
# Column selection
def field_select(df, select_fields):
    """
    Select only the given columns from the DataFrame
    :return:
    """
    df = df.select(*select_fields)
    return df
# Missing value imputation (currently only mean imputation is provided)
def fill_mean(df, col):
    from pyspark.sql.functions import mean
    mean_val = df.select(mean(df[col])).collect()
    mean_sales = mean_val[0][0]  # extract the scalar mean value
    return df.na.fill(mean_sales, [col])
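# Example usage (hypothetical column name, illustrative only); Spark's
# pyspark.ml.feature.Imputer offers the same for many columns at once:
# df_train = fill_mean(df_train, 'var_0')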

# One-hot encode string categorical features
def deal_categoricalCol(categoricalColumns):
    from pyspark.ml.feature import StringIndexer, OneHotEncoder
    stages = []
    new_cols = []
    for categoricalCol in categoricalColumns:
        stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index')
        # encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
        outputCol = categoricalCol + "classVec"
        encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol=outputCol)
        stages += [stringIndexer, encoder]
        new_cols.append(outputCol)
    return stages, new_cols
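# Example usage (hypothetical categorical column names, illustrative only):
# ohe_stages, ohe_cols = deal_categoricalCol(['country', 'channel'])
# pipelineModel = fit_pipeline(df_train, ohe_stages)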

# fit pipeline
def fit_pipeline(train, stages):
    from pyspark.ml import Pipeline
    pipeline = Pipeline(stages=stages)
    # fit the pipeline to the training data
    pipelineModel = pipeline.fit(train)
    return pipelineModel

# Save the model
def save_model(pipelineModel,path):
    """
    :type pipelineModel  Pipeline
    :param pipelineModel:
    :return:
    """
    pipelineModel.save(path)
    return path

# Model prediction
def model_pre(test, input_path, output_path, size=1, format="csv", mode="overwrite", header=False, delimiter=','):
    """
    Load a saved PipelineModel from input_path, score the test data and persist the predictions
    :return:
    """
    from pyspark.ml import PipelineModel
    pipeModel = PipelineModel.load(input_path)
    predictions = pipeModel.transform(test)
    predictions.repartition(size).write.save(output_path, format=format, header=header, delimiter=delimiter,
                                    mode=mode)
    predictions.show()
    

In [2]:

spark = spark_init(master='local[*]',appName='sant_demo01')
base_path =  "F:\\001experience\\MatchSummary\\resources\\sant\\" 
spark

Out[2]:

SparkSession - in-memory
SparkContext (Spark UI)
Version: v2.2.0
Master: local[*]
AppName: sant_demo01

Data exploration

  1. Examine the data distribution (a sketch of basic checks follows this list)
  2. Distinguish continuous and discrete variables
  3. Correlation analysis
  4. Hypothesis testing
  5. Visualization
  6. Statistical analysis
  7. Problem understanding
  8. Root-cause analysis of abnormal data
  9. Root-cause analysis of missing data
  10. Analysis of erroneous data
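
A minimal sketch of the first two checks with PySpark (illustrative only, not part of the original notebook; it assumes df_train has already been loaded with get_input_data):

from pyspark.sql import functions as F

# null count per column, gathered in a single job
null_counts = df_train.select(
    [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df_train.columns]
).collect()[0].asDict()

# approximate distinct counts are much cheaper than exact ones on 200 columns
approx_distinct = df_train.select(
    [F.approx_count_distinct(c).alias(c) for c in df_train.columns]
).collect()[0].asDict()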

At Santander, our mission is to help people and businesses prosper. We are always looking for ways to help our customers understand their financial health and identify which products and services might help them meet their monetary goals.

Our data science team is continually challenging our machine learning algorithms, working with the global data science community, to make sure we can more accurately identify new ways to solve our most common challenge, binary classification problems such as: is a customer satisfied? Will a customer buy this product? Can a customer pay this loan?

In this challenge, we invite Kagglers to help us identify which customers will make a specific transaction in the future, irrespective of the amount of money transacted. The data provided for this competition has the same structure as the real data we have available to solve this problem.

In [3]:

df_train = get_input_data(base_path + 'train.csv')  
pd.DataFrame(df_train.take(10), columns=df_train.columns).transpose() 

Out[3]:

Transposed preview of the first 10 training rows: ID_code runs from train_0 through train_9, target is 0 for all ten rows, and var_0 … var_199 hold the raw numeric feature values.

202 rows × 10 columns

In [20]:

df_des = df_train.toPandas().describe(include = 'all')
# df_des.transpose()
df_des

Out[20]:

pandas describe(include='all') over the full training set: ID_code has 200,000 unique values (count 200000, top train_46809, freq 1); target has mean ≈ 0.100490 and std ≈ 0.300653; every var_N column reports count, mean, std, min, 25%, 50%, 75% and max.

11 rows × 202 columns

In [5]:

# df_des = df_des.transpose()
df_des = pd.read_csv("./describe.csv", index_col=False)
df_des['range'] = df_des['max'] - df_des['min']   # range (max - min)
df_des['var'] = df_des['std'] / df_des['mean']    # coefficient of variation
df_des['dis'] = df_des['75%'] - df_des['25%']     # interquartile range
df_des.to_csv("./describe.csv")
df_des

Out[5]:

Per-variable summary: one row per variable (target, var_0 … var_199) with count, mean, std, min, 25%, 50%, 75%, max, plus the derived range, coefficient of variation (var) and interquartile range (dis).

201 rows × 14 columns

In [68]:

df_des['stddev'].min()

Out[68]:

0.007186267883143137

In [69]:

df_des['stddev'].quantile(q=0.5)

Out[69]:

3.944604281951493

In [24]:

# Distribution of the median (50%) value across variables
df_des.plot(kind='bar',x='IDcode',y='50%',color='red',figsize=(14,6))
plt.show()

In [5]:

df_train.select('var_193').distinct().count()

Out[5]:

110557

In [7]:

df_train.select('var_45').distinct().count()

Out[7]:

169968

In [21]:

df_train.select('var_91').distinct().count()

Out[21]:

7962

In [24]:

# Number of distinct values for each column
for col in df_train.columns:
    print(col,df_train.select(col).distinct().count())
ID_code 200000
target 2
var_0 94672
var_1 108932
var_2 86555
var_3 74597
var_4 63515
var_5 141030
var_6 38599
var_7 103063
var_8 98617
var_9 49417
var_10 128764
var_11 130193
var_12 9561
var_13 115181
var_14 79122
var_15 19810
var_16 86918
var_17 137823
var_18 139515
var_19 144180
var_20 127764
var_21 140062
var_22 90661
var_23 24913
var_24 105101
var_25 14853
var_26 127089
var_27 60186
var_28 35859
var_29 88339
var_30 145977
var_31 77388
var_32 85964
var_33 112239
var_34 25164
var_35 122384
var_36 96404
var_37 79040
var_38 115366
var_39 112674
var_40 141878
var_41 131896
var_42 31592
var_43 15188
var_44 127702
var_45 169968
var_46 93450
var_47 154781
var_48 152039
var_49 140641
var_50 32308
var_51 143455
var_52 121313
var_53 33460
var_54 144776
var_55 128077
var_56 103045
var_57 35545
var_58 113908
var_59 37744
var_60 113763
var_61 159369
var_62 74778
var_63 97098
var_64 59379
var_65 108347
var_66 47722
var_67 137253
var_68 451
var_69 110346
var_70 153193
var_71 13527
var_72 110115
var_73 142582
var_74 161058
var_75 129383
var_76 139317
var_77 106809
var_78 72254
var_79 53212
var_80 136432
var_81 79065
var_82 144829
var_83 144281
var_84 133766
var_85 108437
var_86 140594
var_87 125296
var_88 84918
var_89 103522
var_90 157210
var_91 7962
var_92 110743
var_93 26708
var_94 89146
var_95 29388
var_96 148099
var_97 158739
var_98 33266
var_99 69301
var_100 150727
var_101 122295
var_102 146237
var_103 9376
var_104 72627
var_105 39115
var_106 71065
var_107 137827
var_108 8525
var_109 112172
var_110 106121
var_111 46464
var_112 60482
var_113 116496
var_114 43084
var_115 86729
var_116 63467
var_117 164469
var_118 143667
var_119 112403
var_120 158269
var_121 64695
var_122 121768
var_123 129893
var_124 91022
var_125 16059
var_126 32411
var_127 95711
var_128 98200
var_129 113425
var_130 36638
var_131 21465
var_132 57923
var_133 19236
var_134 131620
var_135 140774
var_136 156615
var_137 144397
var_138 117429
var_139 137294
var_140 121384
var_141 134444
var_142 128613
var_143 94372
var_144 40595
var_145 108526
var_146 84314
var_147 137559
var_148 10608
var_149 148504
var_150 83660
var_151 109667
var_152 95823
var_153 73728
var_154 119342
var_155 127457
var_156 40634
var_157 126534
var_158 144556
var_159 112830
var_160 156274
var_161 11071
var_162 57396
var_163 123168
var_164 122744
var_165 119403
var_166 17902
var_167 140955
var_168 97227
var_169 18242
var_170 113721
var_171 125914
var_172 143366
var_173 128120
var_174 134945
var_175 92659
var_176 142521
var_177 85720
var_178 145236
var_179 90091
var_180 123477
var_181 56164
var_182 149196
var_183 117529
var_184 145185
var_185 120747
var_186 98060
var_187 157031
var_188 108813
var_189 41765
var_190 114959
var_191 94266
var_192 59066
var_193 110557
var_194 97069
var_195 57870
var_196 125560
var_197 40537
var_198 94153
var_199 149430

In [6]:

df_train.count()

Out[6]:

200000

In [6]:

dd = float(df_des[df_des['IDcode']=='var_199']['50%'])
dd

Out[6]:

-2.81955

In [ ]:

# Data cleaning
1. Missing value handling
2. Outlier handling (a minimal sketch follows this list)
3. Removal of multicollinear variables
4. Binning
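
A minimal sketch of one possible outlier treatment (illustrative only, not part of the original notebook), assuming df_des holds the per-variable summary built above: clip each feature to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

from pyspark.sql import functions as F

def clip_outliers(df, stats_pdf, cols):
    # stats_pdf is the pandas summary frame (df_des) with '25%' and '75%' per variable
    for c in cols:
        row = stats_pdf[stats_pdf['IDcode'] == c].iloc[0]
        iqr = row['75%'] - row['25%']
        lo, hi = row['25%'] - 1.5 * iqr, row['75%'] + 1.5 * iqr
        df = df.withColumn(c, F.when(F.col(c) < lo, lo)
                               .when(F.col(c) > hi, hi)
                               .otherwise(F.col(c)))
    return df

# hypothetical usage: df_train = clip_outliers(df_train, df_des, ['var_0', 'var_1'])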

In [14]:

df_std01 = df_des[df_des['std']<0.1][['IDcode','std']].sort_values(by=['std'],ascending=True)
select_cols01 = df_std01['IDcode'].tolist()
# select_cols01.remove('target')
df_std02 = df_des[df_des['std']>=0.1][['IDcode','std']].sort_values(by=['std'],ascending=True)
select_cols02 = df_std02['IDcode'].tolist()
select_cols02.remove('target')
# print(select_cols)
stages = []
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler,StandardScaler
import pyspark.ml.feature as ft
def deal_binarizer(categoricalColumns):
    bin_stages = []
    new_cols = []
    for categoricalCol in categoricalColumns:
        outputCol = categoricalCol + "bin"
        # binarize around the variable's median (50% quantile)
        dd = float(df_des[df_des['IDcode'] == categoricalCol]['50%'])
#         print(categoricalCol, dd)
        binarizer = ft.Binarizer(threshold=dd, inputCol=categoricalCol, outputCol=outputCol)
        bin_stages += [binarizer]
        new_cols.append(outputCol)
    return bin_stages, new_cols
bin_stages, new_cols = deal_binarizer(select_cols01)
stages += bin_stages
inputCols = new_cols + select_cols02
print(inputCols)
assembler = VectorAssembler(inputCols=inputCols, outputCol="asfeatures")
stages += [assembler]
label_stringIdx = StringIndexer(inputCol='target', outputCol='label')
stages += [label_stringIdx]
scaler = StandardScaler(inputCol="asfeatures", outputCol="features",
                        withStd=True, withMean=False)
stages += [scaler]

from pyspark.ml import Pipeline
pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(df_train)
df = pipelineModel.transform(df_train)
train, test = df.randomSplit([0.7, 0.3], seed=2019)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))
# Train and evaluate a baseline logistic regression
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features',regParam=0.01, labelCol='label', maxIter=200)
lrModel = lr.fit(train)

import matplotlib.pyplot as plt
import numpy as np
beta = np.sort(lrModel.coefficients)
plt.plot(beta)
plt.ylabel('Beta Coefficients')
plt.show()


trainingSummary = lrModel.summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'], roc['TPR'])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))


pr = trainingSummary.pr.toPandas()
plt.plot(pr['recall'],pr['precision'])
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.show()

predictions = lrModel.transform(test)
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))
print('areaUnderROC', evaluator.evaluate(predictions,
{evaluator.metricName: 'areaUnderROC'}))
print('areaUnderPR', evaluator.evaluate(predictions,{evaluator.metricName: 'areaUnderPR'}))
# predictions.select('features').show()
predictions = predictions.select('features', 'target', 'label', 'rawPrediction', 'prediction', 'probability')
# predictions.show()
pd.DataFrame(predictions.take(10), columns=predictions.columns).transpose() 
['var_68bin', 'var_91', 'var_108', 'var_103', 'var_12', 'var_148', 'var_161', 'var_71', 'var_25', 'var_43', 'var_125', 'var_169', 'var_166', 'var_133', 'var_15', 'var_131', 'var_23', 'var_34', 'var_93', 'var_95', 'var_50', 'var_42', 'var_98', 'var_53', 'var_126', 'var_28', 'var_57', 'var_130', 'var_59', 'var_105', 'var_6', 'var_197', 'var_144', 'var_156', 'var_189', 'var_114', 'var_111', 'var_66', 'var_9', 'var_79', 'var_181', 'var_162', 'var_195', 'var_132', 'var_192', 'var_64', 'var_27', 'var_112', 'var_4', 'var_116', 'var_121', 'var_99', 'var_106', 'var_104', 'var_78', 'var_153', 'var_62', 'var_3', 'var_31', 'var_14', 'var_37', 'var_81', 'var_150', 'var_88', 'var_146', 'var_16', 'var_32', 'var_29', 'var_115', 'var_177', 'var_2', 'var_124', 'var_94', 'var_179', 'var_22', 'var_46', 'var_175', 'var_143', 'var_152', 'var_198', 'var_191', 'var_0', 'var_168', 'var_63', 'var_36', 'var_194', 'var_127', 'var_186', 'var_128', 'var_8', 'var_7', 'var_56', 'var_89', 'var_24', 'var_65', 'var_77', 'var_110', 'var_145', 'var_85', 'var_188', 'var_72', 'var_69', 'var_151', 'var_193', 'var_1', 'var_39', 'var_159', 'var_129', 'var_119', 'var_92', 'var_60', 'var_38', 'var_58', 'var_33', 'var_109', 'var_170', 'var_113', 'var_183', 'var_138', 'var_190', 'var_13', 'var_185', 'var_140', 'var_101', 'var_154', 'var_52', 'var_165', 'var_122', 'var_35', 'var_180', 'var_163', 'var_171', 'var_196', 'var_164', 'var_10', 'var_157', 'var_87', 'var_55', 'var_142', 'var_155', 'var_20', 'var_44', 'var_26', 'var_41', 'var_173', 'var_11', 'var_75', 'var_123', 'var_134', 'var_84', 'var_17', 'var_141', 'var_174', 'var_67', 'var_147', 'var_80', 'var_73', 'var_176', 'var_107', 'var_135', 'var_139', 'var_86', 'var_167', 'var_18', 'var_49', 'var_5', 'var_158', 'var_76', 'var_30', 'var_19', 'var_51', 'var_21', 'var_40', 'var_83', 'var_54', 'var_82', 'var_96', 'var_178', 'var_102', 'var_172', 'var_118', 'var_137', 'var_182', 'var_100', 'var_184', 'var_136', 'var_149', 'var_199', 'var_47', 'var_160', 'var_48', 'var_187', 'var_61', 'var_70', 'var_120', 'var_97', 'var_90', 'var_117', 'var_74', 'var_45']
Training Dataset Count: 140208
Test Dataset Count: 59792

Training set areaUnderROC: 0.8572874986254869

Test Area Under ROC 0.8604638791224748
areaUnderROC 0.8604638791224721
areaUnderPR 0.5145853422620837

Out[14]:

Transposed preview of the first 10 scored rows of the test split: features holds the scaled feature vectors; target and label are 1 for two of the ten rows and 0 otherwise; prediction is 0.0 for all ten; rawPrediction and probability give the corresponding scores, with the class-0 probability between roughly 0.85 and 0.94.
In [20]:

df_test = get_input_data(base_path + 'test.csv')  
pd.DataFrame(df_test.take(10), columns=df_test.columns).transpose() 
df_test = pipelineModel.transform(df_test)
predictions = lrModel.transform(df_test)
# predictions.select('features').show()
predictions = predictions.select('ID_code',  'prediction')
# predictions.show()
pd.DataFrame(predictions.take(10), columns=predictions.columns).transpose() 

Out[20]:

Transposed preview of the first 10 scored competition-test rows: ID_code runs from test_0 through test_9 and prediction is 0.0 for all ten.

In [19]:

predictions.show(5)
+--------------------+--------------------+----------+--------------------+
|            features|       rawPrediction|prediction|         probability|
+--------------------+--------------------+----------+--------------------+
|[0.0,46.405721247...|[1.84608169708193...|       0.0|[0.86366639261870...|
|[2.00002624874797...|[1.86154755297036...|       0.0|[0.86547722599330...|
|[0.0,45.986435853...|[2.29001370850934...|       0.0|[0.90804659470819...|
|[0.0,45.318199756...|[1.90419746231964...|       0.0|[0.87036585836244...|
|[2.00002624874797...|[2.07114740649738...|       0.0|[0.88806706897729...|
+--------------------+--------------------+----------+--------------------+
only showing top 5 rows

In [23]:

predictions = predictions.toPandas()  # needed: the pandas API is used below
predictions['target'] = predictions['prediction'].apply(lambda x: int(x))
predictions[["ID_code","target"]].to_csv("./submit01.csv",index=False)
predictions[["ID_code","target"]]

Out[23]:

Submission preview: (ID_code, target) pairs from test_0 through test_199999, with target 0 for every row shown.

200000 rows × 2 columns

In [ ]:

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 20)
dtModel = dt.fit(train)
predictions = dtModel.transform(test)

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))


from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxIter=100)
gbtModel = gbt.fit(train)
predictions = gbtModel.transform(test)

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

print(gbt.explainParams())


# from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# paramGrid = (ParamGridBuilder()
#              .addGrid(gbt.maxDepth, [12, 8, 20])
#              .addGrid(gbt.maxBins, [200, 600])
#              .addGrid(gbt.maxIter, [100, 1000])
#              .build())
# cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
# # Run cross validations.  This can take about 6 minutes since it is training over 20 trees!
# cvModel = cv.fit(train)
# predictions = cvModel.transform(test)
# print('Test Area Under ROC', evaluator.evaluate(predictions))
# print('areaUnderROC', evaluator.evaluate(predictions,
# {evaluator.metricName: 'areaUnderROC'}))
# print('areaUnderPR', evaluator.evaluate(predictions,{evaluator.metricName: 'areaUnderPR'}))

# predictions.show()
pd.DataFrame(predictions.take(10), columns=predictions.columns).transpose() 