# Imports
from pyspark import SparkConf,SparkContext
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.sql import SparkSession
import numpy as np
import time
# The fit function trains and evaluates a model
def fit(traindata,rf,file):
    """Train ``rf`` on a random 80% split, score the other 20%, and save the predictions.

    Prints the elapsed time for fitting plus the transform call, writes the
    ``label`` and ``prediction`` columns of the scored test split as CSV, and
    prints the evaluator's metric.

    :param traindata: full dataset (a DataFrame with a ``label`` column); it is
        split 80/20 inside this function
    :param rf: the estimator to train; it is the only stage of the pipeline
    :param file: output path handed to ``DataFrame.write.csv``
    :return: None
    """
    # time.clock() was removed in Python 3.8; perf_counter() is its documented
    # replacement for measuring elapsed time.
    start = time.perf_counter()
    # Random 80/20 split into training and held-out test rows.
    train, testdata = traindata.randomSplit([0.8, 0.2])
    # No label/feature transformers are needed, so the model is the sole stage.
    pipe = Pipeline().setStages([rf])
    # Fit on the training split only. Fitting on the whole of `traindata` would
    # let the model see the test rows and inflate the reported score.
    pipemodel = pipe.fit(train)
    # Score the held-out rows.
    rfpre = pipemodel.transform(testdata)
    end = time.perf_counter()
    print('Time:\t', end - start)
    # NOTE(review): BinaryClassificationEvaluator's default metric is
    # areaUnderROC per the Spark docs, and it is fed the hard `prediction`
    # column here, so the value printed below as "Accuracy" is presumably an
    # AUC rather than a plain accuracy -- confirm the intended metric.
    evaluate = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction")
    # Keep only the true label and the predicted label, and write them as CSV.
    elements = ["label", "prediction"]
    df_temp = rfpre[elements]
    df_temp.write.csv(file)
    # Report the evaluator's score on the test split.
    acc = evaluate.evaluate(rfpre)
    print("Accuracy of Model: ", acc)
# RandomForest
# `sc` is never defined in this file; it only exists when a PySpark shell or
# notebook kernel injects it. getOrCreate() reuses the running session if there
# is one and starts one otherwise, so the script also runs standalone. An
# active SparkSession is also what makes RDD.toDF() available.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
# Read the CSV as text, split each line on commas, drop the row whose 4th field
# is 'age' (presumably the header), and build rows with an integer label from
# field 2 and a dense feature vector from fields 3-7.
traindata = (
    sc.textFile('hdfs://localhost:9000/bigwork/traindata.csv')
    .map(lambda x: x.split(','))
    .filter(lambda x: x[3] != 'age')
    .map(lambda x: Row(
        label=int(x[2]),
        feature=Vectors.dense(float(x[3]), float(x[4]), float(x[5]), float(x[6]), float(x[7])),
    ))
    .toDF()
)
rf = RandomForestClassifier(labelCol='label', featuresCol='feature', maxDepth=20)
# Train and evaluate through fit().
fit(traindata, rf, "PreofOverdueGCY1.csv")
# Oversample the imbalanced dataset
# Collect the whole dataset on the driver as lists of string fields, dropping
# the row whose 4th field is 'age' (presumably the header).
raw_rows = (
    sc.textFile('hdfs://localhost:9000/bigwork/traindata.csv')
    .map(lambda x: x.split(','))
    .filter(lambda x: x[3] != 'age')
    .collect()
)
table = np.array(raw_rows)
# Column 2 holds the label, still as a string.
labels = table[:, 2]
is_one = labels == '1'
is_zero = labels == '0'
# Class sizes (label '1' is presumably "overdue", '0' "not overdue").
n_one = int(np.count_nonzero(is_one))
n_zero = int(np.count_nonzero(is_zero))
# Number of extra label-'1' samples to draw.
n_extra = int(n_zero / n_one - 1) * n_one
# Row positions of the label-'1' samples.
one_positions = np.flatnonzero(is_one)
# Draw with replacement from the label-'1' rows.
resampled = np.random.choice(one_positions, n_extra, replace=True)
# The VM is too slow for the full data, so use at most 50000 rows per class.
one_pick = resampled[:50000]
zero_pick = np.flatnonzero(is_zero)[:50000]
# Stack the picked label-'1' rows on top of the label-'0' rows and expose the
# result as a list of rows.
data = list(np.concatenate((table[one_pick], table[zero_pick]), axis=0))
# Convert the list into an RDD, set the label and feature fields, convert it to a DataFrame, and train
# Distribute the sampled rows, then turn each one into a Row with an integer
# label (field 2) and a dense feature vector built from fields 3-8.
sampled_rdd = sc.parallelize(data)
labeled_rdd = sampled_rdd.map(
    lambda x: Row(
        label=int(x[2]),
        feature=Vectors.dense([float(x[i]) for i in range(3, 9)]),
    )
)
data_df = labeled_rdd.toDF()
rf = RandomForestClassifier(maxDepth=20, featuresCol='feature', labelCol='label')
# Train and evaluate; predictions are written to the path '1'.
fit(data_df, rf, '1')
# DecisionTree
# Load the CSV as text and split every line into its comma-separated fields.
csv_fields = sc.textFile('hdfs://localhost:9000/bigwork/traindata.csv').map(lambda x: x.split(','))
# Drop the row whose 4th field is 'age' (presumably the header).
body_fields = csv_fields.filter(lambda x: x[3] != 'age')
# Integer label from field 2, dense feature vector from fields 3-7.
traindata = body_fields.map(
    lambda x: Row(
        label=int(x[2]),
        feature=Vectors.dense([float(x[i]) for i in range(3, 8)]),
    )
).toDF()
rf = DecisionTreeClassifier(maxDepth=30, featuresCol='feature', labelCol='label')
# Train and evaluate through fit().
fit(traindata, rf, "PreofOverdueDeTree.csv")
# Logistic regression
# Same loading recipe as the other models: split lines on commas, drop the row
# whose 4th field is 'age' (presumably the header), then build an integer label
# from field 2 and a dense feature vector from fields 3-7.
traindata = (
    sc.textFile('hdfs://localhost:9000/bigwork/traindata.csv')
    .map(lambda line: line.split(','))
    .filter(lambda cols: cols[3] != 'age')
    .map(lambda cols: Row(
        label=int(cols[2]),
        feature=Vectors.dense(
            float(cols[3]), float(cols[4]), float(cols[5]), float(cols[6]), float(cols[7])
        ),
    ))
    .toDF()
)
rf = LogisticRegression(maxIter=100, labelCol='label', featuresCol='feature')
# Train and evaluate through fit().
fit(traindata, rf, "PreofOverdueLogistic5000.csv")