数据集格式
CrimePredict.py
"""San Francisco crime classification with sklearn.

Builds one-hot features from DayOfWeek / PdDistrict / hour-of-day, then
trains and evaluates BernoulliNB, LogisticRegression and RandomForest,
writing the metrics to result.txt.
"""
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Spark alternatives kept for reference only; importing them unconditionally
# would force a pyspark dependency on this pure-sklearn script:
# from pyspark.ml.classification import NaiveBayes, LogisticRegression
# from pyspark.ml.regression import RandomForestRegressor

res_dic = []  # collected metric strings, written to result.txt at the end

# 1. Load data (Dates parsed so .dt.hour works below).
train = pd.read_csv('processed_train.csv', parse_dates=['Dates'])  # input train_path
test = pd.read_csv('processed_test.csv', parse_dates=['Dates'])    # input test_path

# 2. Encode the Category target.
# BUG FIX: the original re-ran fit_transform on the test set, which re-derives
# the label->integer mapping from the test data; if the test set's categories
# differ in presence/order, train and test integers disagree and the reported
# test accuracy is meaningless. Fit once on train, then only transform test.
label = preprocessing.LabelEncoder()
train_crime = label.fit_transform(train.Category)
test_crime = label.transform(test.Category)


def _one_hot_features(df):
    """Return the one-hot feature table for DayOfWeek, PdDistrict and hour."""
    return pd.concat(
        [
            pd.get_dummies(df.DayOfWeek),
            pd.get_dummies(df.PdDistrict),
            pd.get_dummies(df.Dates.dt.hour),
        ],
        axis=1,
    )


# 3. Binarize the three shared features (each expands into many 0/1 columns).
train_data = _one_hot_features(train)
train_data['crime'] = train_crime

test_data = _one_hot_features(test)
# BUG FIX: get_dummies on the test set can produce a different column set than
# on train (e.g. a district or hour absent from test); align to the training
# columns so test_data[feature_list] is always valid, filling gaps with 0.
test_data = test_data.reindex(columns=train_data.columns.drop('crime'),
                              fill_value=0)
test_data['crime'] = test_crime

# 4. Split the training table into 70% train / 30% validation.
training, validation = train_test_split(train_data, train_size=0.7)

# All columns except the trailing 'crime' label are features.
feature_list = train_data.columns.drop('crime').tolist()


def _evaluate(model, name):
    """Fit *model*, print/collect log-loss and train/test accuracy strings."""
    model.fit(training[feature_list], training['crime'])
    proba = model.predict_proba(validation[feature_list])
    val_pred = model.predict(validation[feature_list])
    test_pred = model.predict(test_data[feature_list])
    for line in (
        f"{name} log loss: {log_loss(validation['crime'], proba)}",
        f"{name} train accuracy: {accuracy_score(validation['crime'], val_pred)}",
        f"{name} test accuracy: {accuracy_score(test_data['crime'], test_pred)}",
    ):
        print(line)
        res_dic.append(line)


# 5./6. Train and evaluate each classifier.
_evaluate(BernoulliNB(), 'BernoulliNB')
_evaluate(LogisticRegression(C=0.1), 'LogisticRegression')
_evaluate(RandomForestClassifier(), 'RandomForest')

# 7. Persist all collected metrics.
with open('result.txt', 'w') as f:  # output_path
    for res in res_dic:
        f.write(res + '\n')
运行结果
CrimePredict_pyspark.py
"""San Francisco crime classification with pyspark.ml.

Indexes / one-hot encodes DayOfWeek and PdDistrict, then trains and
evaluates LogisticRegression and RandomForestClassifier on Category_Num.
"""
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName('log_reg').getOrCreate()

# 1. Load the pre-processed train/test CSVs from S3.  input_path
df = spark.read.csv('s3://my-cluster-zdy/processed_train.csv', inferSchema=True, header=True)
df_t = spark.read.csv('s3://my-cluster-zdy/processed_test.csv', inferSchema=True, header=True)

print('-------------- train data transfer ------------------')
# 2.1 Index the categorical string columns.
# BUG FIX: the original fit a *new* StringIndexer on the test set.  StringIndexer
# assigns indices by descending frequency, so the same string (e.g. a district
# or crime category) can receive a different number in train vs test, making
# the test-set Category_Num incompatible with the trained models.  Fit each
# indexer once on the training data and reuse the fitted model on both sets.
# handleInvalid='keep' maps labels unseen at fit time to an extra bucket
# instead of raising.
indexers = {}
for col in ('DayOfWeek', 'PdDistrict', 'Category'):
    indexers[col] = StringIndexer(
        inputCol=col, outputCol=col + '_Num', handleInvalid='keep').fit(df)
    df = indexers[col].transform(df)

# 2.2 One-hot encode the indexed day and district.
# NOTE(review): this uses the Spark 2.x OneHotEncoder, a plain Transformer; in
# Spark 3.x OneHotEncoder is an Estimator and would need .fit() first — confirm
# the target Spark version.
day_encoder = OneHotEncoder(inputCol='DayOfWeek_Num', outputCol='DayOfWeek_Vector')
district_encoder = OneHotEncoder(inputCol='PdDistrict_Num', outputCol='PdDistrict_Vector')
df = day_encoder.transform(df)
df = district_encoder.transform(df)

print('-------------- test data transfer ------------------')
# Apply the SAME fitted indexers and encoders to the test set.
for col in ('DayOfWeek', 'PdDistrict', 'Category'):
    df_t = indexers[col].transform(df_t)
df_t = day_encoder.transform(df_t)
df_t = district_encoder.transform(df_t)

# 3. Assemble the two one-hot vectors into a single 'features' column.
print('-------------- LogisticRegression Training ------------------')
assembler = VectorAssembler(inputCols=['DayOfWeek_Vector', 'PdDistrict_Vector'],
                            outputCol='features')
df = assembler.transform(df)
df_t = assembler.transform(df_t)

model_df = df.select(['features', 'Category_Num'])
model_df_t = df_t.select(['features', 'Category_Num'])

# 3.2 Logistic regression: 75% train / 25% held-out split of the training table.
training_df, test_df = model_df.randomSplit([0.75, 0.25])
log_reg = LogisticRegression(labelCol='Category_Num').fit(training_df)
print('{}{}'.format('LogisticRegression Train accuracy:', log_reg.evaluate(training_df).accuracy))
print('{}{}'.format('LogisticRegression Test accuracy:', log_reg.evaluate(model_df_t).accuracy))

print('-------------- RandomForest Training ------------------')
training_df, test_df = model_df.randomSplit([0.75, 0.25])
rf_classifier = RandomForestClassifier(labelCol='Category_Num').fit(training_df)
# One evaluator reused for both the held-out split and the external test set.
evaluator = MulticlassClassificationEvaluator(labelCol='Category_Num',
                                              metricName='accuracy')
rf_accuracy = evaluator.evaluate(rf_classifier.transform(test_df))
print('{}{}'.format('RandomForestClassifier Train accuracy:', rf_accuracy))
rf_accuracy = evaluator.evaluate(rf_classifier.transform(model_df_t))
print('{}{}'.format('RandomForestClassifier Test accuracy:', rf_accuracy))
运行结果