使用 PySpark 训练模型后,经常需要将模型的训练结果输出为 Hive 表。本文介绍如何将 DataFrame 数据保存为 Hive 表。
想把 DataFrame 数据存为 Hive 表,就需要启用 Hive 支持:Spark 1.x 中使用 HiveContext,Spark 2.0 及之后可以通过 SparkSession 的 enableHiveSupport() 开启。下面看一下如何使用:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
import numpy as np
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
# Create (or reuse, via getOrCreate) a Spark session with Hive support
# enabled, using "yarn" as the master.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName('create_df_test2')
    .enableHiveSupport()
    .getOrCreate()
)

# Read the data to train on through Spark SQL.
# NOTE(review): `table` looks like a placeholder table name -- confirm the
# real Hive table before running.
trainData = spark.sql("""select * from table""")
# 1.2 构造训练数据集
trainingSet = trainData.rdd.map(lambda x: Row(label=x[-1], features=Vectors.dense(x[:-