# Raw data format (original note)
# -*- coding: utf-8 -*-
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, HiveContext
from pyspark.sql.types import *
from pyspark.sql import Row
# Custom scoring helper used for testing: scores below the 0.1 threshold
# collapse to 0.01, everything else to 100.
def ceshi(score):
    """Return 0.01 when score < 0.1, otherwise 100."""
    return 0.01 if score < 0.1 else 100
# Before saving, delete the output directory so saveAsTextFile does not
# fail with "output directory already exists".
def save_file(rdd, output_path):
    """Write *rdd* as text files to *output_path* on HDFS.

    The existing output directory is removed first via the
    ``hdfs dfs -rm -r`` shell command. That removal is best-effort:
    any exception is printed and ignored (the path may simply not
    exist yet), and a non-zero exit status from the shell command is
    not treated as an error.

    Args:
        rdd: an RDD (anything exposing ``saveAsTextFile``).
        output_path: HDFS directory to (re)create.
    """
    try:
        # Fix: the original file called os.system without ever importing
        # `os`, which raised NameError at runtime; `import os` is now at
        # the top of the file.
        os.system('hdfs dfs -rm -r ' + output_path)
    except Exception as e:
        # Deliberate best-effort: a failed delete must not abort the save.
        print(e)
    rdd.saveAsTextFile(output_path)
if __name__ == "__main__":
    # Run on YARN; Hive support is enabled so the warehouse ORC data
    # can be read through the session.
    conf = SparkConf().setMaster("yarn").setAppName("My App")
    sc = SparkContext(conf=conf)
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()

    # ORC-format files backing a Hive database table.
    input_path = r"hdfs://ns1/user/hive/warehouse/zz.db/XX"
    output_path = r"hdfs://ns1/user/zz/kk/test_data"

    df = spark.read.orc(input_path)
    # Keep only complete 7-field rows, then project
    # (field 0, field 6, ceshi(field 3)).
    result = (df.rdd
              .filter(lambda row: len(row) == 7)
              .map(lambda row: (row[0], row[6], ceshi(row[3]))))
    save_file(result, output_path)
    spark.stop()
    print("------done")