读hdfs上的文件
from smart_open import open
import pandas as pd

# Read a CSV stored on HDFS; smart_open transparently handles the
# hdfs:// scheme and returns an ordinary file object.
# FIX: the stream was opened but never closed — use a context manager
# so the handle is released as soon as pandas has consumed it.
# NOTE(review): encoding gb2312 — presumably the file came from a
# Chinese Windows export; gb18030 is a strict superset and safer if unsure.
with open("hdfs:///tmp/a.csv", encoding='gb2312') as file_stream:
    file = pd.read_csv(file_stream)
这个csv是这种格式的,需求是:求出每个class1下得分最高的class2并展示
class0 | class1 | class2 | var | score |
---|---|---|---|---|
A | a | a-1 | zhangsan | 80 |
A | a | a-2 | lisi | 90 |
A | b | b-1 | wangwu | 97 |
class myclass:
    """Load a CSV of (class0, class1, class2, var, score) rows and expose
    lookup structures over it.

    Expected columns (per the sample table above):
        class0 | class1 | class2 | var | score
    """

    def __init__(self, filePath=""):
        # Function-scope import: `defaultdict` was never imported at
        # module level in this file, so the original raised NameError.
        from collections import defaultdict
        # Placeholder mapping; replaced by __initScore when a file is given.
        self.definition_dict = defaultdict(defaultdict)
        if filePath != "":
            self.file = pd.read_csv(filePath)
            self.__initScore()

    def __initScore(self):
        # BUG FIX: the CSV column is named 'var' (lowercase, see the schema
        # above and get_rule_dict below); set_index("Var") raised KeyError.
        # Result maps var -> {remaining columns} for O(1) lookup by var.
        self.definition_dict = self.file.set_index("var").to_dict(orient="index")

    def get_rule_dict(self):
        """Return a nested dict: tree[class0][class1][class2] -> var."""
        from collections import defaultdict
        # Self-referential factory yields an arbitrarily deep defaultdict,
        # so missing intermediate levels are created on first access.
        tree = lambda: defaultdict(tree)
        mytree = tree()
        indexed = self.file.set_index(['class0', 'class1', 'class2'])
        for (c0, c1, c2), row in indexed.iterrows():
            mytree[c0][c1][c2] = row.loc['var']
        return mytree
udf
# PySpark UDF that returns a JSON string column.
# BUG FIXES vs. the original snippet:
#   - the module is `pyspark.sql.functions`, not `spark.sql.functions`;
#   - `F` and `StringType` were used but never imported;
#   - the call site used the undefined name `udfx` instead of `udf1`;
#   - `udf1` accepted a single argument but was invoked with two columns
#     — widened to *args, which is backward-compatible.
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import json


@udf(StringType())
def udf1(*args):
    """Serialize a (currently empty) dict built from the input columns."""
    res = {}
    # ensure_ascii=False keeps Chinese characters readable in the output.
    return json.dumps(res, ensure_ascii=False)


arr1 = ['column1', 'column2']
# concat_ws over an array column joins its elements with '-'.
df = df.withColumn('test', F.concat_ws('-', F.array(arr1)))
df = df.withColumn('test2', udf1('test', F.lit('常量')))