Define a single parameterized helper function:
from pyspark.sql import SQLContext

def read_data(sc, schema_file, data_dir):
    """Read the text files under data_dir into a DataFrame using the schema from schema_file."""
    schema = load_schema(schema_file)  # placeholder for your schema-parsing logic; a sketch follows
    rdd = sc.textFile(data_dir)        # NB: parse each line into a tuple/Row matching the schema
    sqlc = SQLContext(sc)
    df = sqlc.createDataFrame(rdd, schema=schema)
    return df
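Here load_schema is a placeholder. A minimal sketch, assuming a plain-text schema file with one name:type pair per line (the helper name, file format, and type map are all assumptions, not something Spark provides):

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# hypothetical mapping from type names in the schema file to Spark types
_TYPE_MAP = {"string": StringType(), "int": IntegerType(), "double": DoubleType()}

def load_schema(schema_file):
    """Build a StructType from lines like 'customer_id:int'."""
    fields = []
    with open(schema_file) as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            name, type_name = line.split(":")
            fields.append(StructField(name, _TYPE_MAP[type_name], nullable=True))
    return StructType(fields)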
The main plan is then just:
df1 = read_data(sc, sf1, dd1)
df2 = read_data(sc, sf2, dd2)
df3 = read_data(sc, sf3, dd3)
df4 = read_data(sc, sf4, dd4)
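If the (schema_file, data_dir) pairs are configuration-driven, the same four calls collapse to a loop (sf1..sf4 and dd1..dd4 are the placeholder names from above):

pairs = [(sf1, dd1), (sf2, dd2), (sf3, dd3), (sf4, dd4)]
df1, df2, df3, df4 = [read_data(sc, sf, dd) for sf, dd in pairs]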
# assumes inner joins, each with different join attrs; note that each ==
# comparison must be parenthesized, because & binds more tightly than ==
net = (df1.join(df2, (df1["key1a"] == df2["key2a"])
                   & (df1["key1b"] == df2["key2b"]))
          .join(df3, (df1["key1c"] == df3["key3c"])
                   & (df1["key1d"] == df3["key3d"]))
          .join(df4, (df1["key1e"] == df4["key4e"])
                   & (df1["key1f"] == df4["key4f"])))
# you may wish to remove some duplicated columns at this point, especially
# since you said there are many input columns
filt = net.select(...)  # keep only the columns you need, or:
filt = net.drop(...)    # drop the ones you don't
filt.write.parquet("out")
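For example, one way to shed the now-redundant right-hand join keys (a sketch using the placeholder key names above; recent Spark versions accept multiple names in drop, and on older ones you can chain .drop() calls):

filt = net.drop("key2a", "key2b", "key3c", "key3d", "key4e", "key4f")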
Personally, I would do the Hive table creation elsewhere rather than bundling it into this script.
Be careful using sc (or spark) as a global, especially if the script can be invoked both interactively and from other code. I usually add boilerplate to the __main__ context at the bottom of the script to create sc conditionally, a la:
from pyspark import SparkConf, SparkContext

try:
    # noinspection PyUnboundLocalVariable
    assert sc
except (AssertionError, NameError):  # NameError also covers UnboundLocalError
    spark_name = "my app name"
    conf = SparkConf()
    conf.setAppName(spark_name)
    sc = SparkContext.getOrCreate(conf)
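Putting it together, the bottom of the script might look like this (a sketch; the __main__ guard follows from the remark above, and "my app name" plus the sf*/dd* placeholders are carried over, not real names):

from pyspark import SparkConf, SparkContext

if __name__ == "__main__":
    try:
        # noinspection PyUnboundLocalVariable
        assert sc  # reuse a context that an interactive session already created
    except (AssertionError, NameError):
        conf = SparkConf()
        conf.setAppName("my app name")
        sc = SparkContext.getOrCreate(conf)

    df1 = read_data(sc, sf1, dd1)
    df2 = read_data(sc, sf2, dd2)
    df3 = read_data(sc, sf3, dd3)
    df4 = read_data(sc, sf4, dd4)
    # ... join and write as shown above ...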