#!/usr/bin/python3
import json
import traceback
from pyspark.sql import SparkSession
# Output column order for the flattened per-event rows
# (top-level fields are repeated onto each event row).
Columns = ['uid', 'ts', 'tz', 'event', 'event_network', 'item_id', 'group_id', 'country', 'lang']
# Processing date in yyyyMMdd form — presumably the Hive `dt` partition value; TODO confirm against the writer.
dt = "20200101"
def line_process(line):
"""
Parse one JSON log line into per-event records.

One input line holds user-level fields plus an "events" list; each
event is flattened into its own record carrying the user-level fields.

:param line: raw JSON string for a single log line
:return: not visible in this chunk — the block is truncated before the return
"""
# NOTE(review): the original indentation was stripped when this code was
# scraped; statements below appear at column 0 and are preserved byte-for-byte.
try:
obj = json.loads(line.strip())
# User identifier; local name suggests a Google Advertising ID — TODO confirm.
gaid = obj.get("uid", '')
tz = obj.get("tz", '')
# No default supplied: `country` is None when the key is absent,
# unlike the other fields which default to '' — verify this is intentional.
country = obj.get("country")
lang = obj.get("lang", '')
# List of event dicts attached to this line
events = obj.get("events", [])
records = []
for record in events:
event = record.get('event', '')
ts = record.get('ts', 0)
event_network = record.get('net', 0)
# NOTE(review): SOURCE is truncated here — the rest of the loop body,
# any append to `records`, the function's return, and the matching
# except-clause for the `try` above are not visible in this chunk.
PySpark读取S3上多个路径数据,并保存到Hive中
最新推荐文章于 2022-05-08 22:21:25 发布