2. Task Definition
Initialize the Spark connection and the Hive connection:
from pyspark.sql import SparkSession, HiveContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from py4j.java_gateway import java_import

# Initialize Spark and Hive. Build the SparkSession first and reuse its
# SparkContext; creating a bare SparkContext() beforehand would make the
# builder ignore the master URL and app name configured here.
spark = SparkSession.builder \
    .master("spark://hadoop1:7077") \
    .appName("test_spark") \
    .enableHiveSupport() \
    .getOrCreate()
sc = spark.sparkContext
hiveContext = HiveContext(sc)  # deprecated alias, kept for the hiveContext.sql() calls below

# Import the external jar's class through the Py4J gateway
java_import(sc._gateway.jvm, "cn.kunming.fileutils.Document")
func = sc._gateway.jvm.Document()
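Before building the ETL class, it can be worth smoke-testing the gateway call on the driver. A minimal sketch, using the same read(path) method the class relies on below; the file path is a hypothetical placeholder:

# Hypothetical smoke test: parse a single file through the external jar.
# "/data/docs/sample.docx" is a placeholder path, not part of the original setup.
sample_text = func.read("/data/docs/sample.docx")
print(sample_text[:200])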
Define the ETL class that handles the task:
# transform.py
class Transform(object):
    @staticmethod
    def _parse(path):
        # Parse one file via the external jar; swallow failures and return an
        # empty string so a single bad file does not kill the whole job.
        try:
            return func.read(path)
        except Exception:
            return ""

    @staticmethod
    def extract(sql):
        # Extract data from Hive into a pandas DataFrame
        return hiveContext.sql(sql).toPandas()

    @staticmethod
    def transform(data):
        # Transform the data with the external jar: resolve each path to its text content
        data["content"] = data["path"].apply(Transform._parse)
        return data

    @staticmethod
    def load(data):
        # HBase output configuration
        zookeeper_host = 'hadoop1:2181'
        hbase_table_name = 'hive_hbase_article'
        conf = {
            "hbase.zookeeper.quorum": zookeeper_host,
            "hbase.mapred.outputtable": hbase_table_name,
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"
        }
        keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
        valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
        # Basic shape of each record written to HBase:
        """
        records = [
            ["row_key", "cf1", "field", "value"]
        ]
        """
        data.dropna(inplace=True)
        # Write to HBase with Spark: each DataFrame row fans out into two
        # (row_key, [row_key, column_family, qualifier, value]) records, one
        # for the title and one for the parsed content, both in family "other".
        # Note that the pandas DataFrame is captured in the lambda's closure
        # and serialized out to the executors.
        sc.parallelize(data.index.to_list()).flatMap(
            lambda x: (
                [
                    data.loc[x, 'row_key'],
                    [
                        data.loc[x, 'row_key'],
                        "other",
                        "attachment_title1",
                        data.loc[x, 'oldname']
                    ]
                ],
                [
                    data.loc[x, 'row_key'],
                    [
                        data.loc[x, 'row_key'],
                        "other",
                        "attachment_content1",
                        data.loc[x, 'content']
                    ]
                ]
            )
        ).map(
            lambda x: (x[0], x[1])
        ).saveAsNewAPIHadoopDataset(
            conf=conf,
            keyConverter=keyConv,
            valueConverter=valueConv
        )
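With the three steps in place, the pipeline runs as extract → transform → load. A minimal usage sketch, assuming a Hive table named article with row_key, path, and oldname columns (the Hive table name is hypothetical; the column names are the ones the class reads):

# Hypothetical end-to-end run; the Hive table name is a placeholder.
df = Transform.extract("SELECT row_key, path, oldname FROM article")
df = Transform.transform(df)  # adds the parsed 'content' column
Transform.load(df)            # fans each row out into two HBase Put records

Note that the two converter classes referenced in load() ship with the Spark examples jar, so that jar and the HBase client jars need to be on the driver and executor classpath when the job is submitted.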