map/reduce example
# -*- coding: UTF-8 -*-
from pyspark import SparkConf, SparkContext, StorageLevel
import json
conf = SparkConf().setAppName('SimpleApp').setMaster('yarn')
sc = SparkContext(conf=conf)
# Report-201702271315-201702271329.gz => a gzip-compressed file with one JSON object per line
rdd_file = sc.textFile('hdfs://master:9000/user/ubuntu/work/Report-201702271315-201702271329.gz')
# rdd_file = sc.textFile('s3a://my-bucket/tmp/Report-201702271315-201702271329.gz')
rdd_dict_list = rdd_file.map(json.loads)
rdd_dict_list.persist(StorageLevel.MEMORY_AND_DISK) # MEMORY_ONLY,MEMORY_AND_DISK,DISK_ONLY
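# (Optional) once the RDD is no longer needed, the cache can be released with rdd_dict_list.unpersist()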
def lambda_map_page(d):
    event_type = d['event_type']
    page_id = d.get('resolve_page_id') or d.get('page_id', '')
    if page_id in ['23', '25', '33', '34', '39']:
        if event_type == 3:
            return page_id, 1
        else:
            return page_id, 0
    else:
        return '', 0
rdd_page_kv = rdd_dict_list.map(lambda_map_page).reduceByKey(lambda a, b: a + b)
print rdd_page_kv.collect()
# Print the install count for each page_id
# [('', 0), (u'34', 0), (u'23', 61), (u'39', 3), (u'33', 30)]
# A few commonly used Transformations (a short sketch follows below)
# => map,filter,flatMap,sample,union,distinct,reduceByKey,groupByKey,sortByKey,join
# A few commonly used Actions
# => reduce,collect,count,first,take,countByKey,saveAsTextFile,saveAsSequenceFile
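# A minimal sketch of a few of the operations listed above, reusing rdd_dict_list
# from this script (assumes the same JSON fields as lambda_map_page):
rdd_installs = rdd_dict_list.filter(lambda d: d.get('event_type') == 3)   # filter: keep only install events
page_ids = rdd_installs.map(lambda d: d.get('page_id', '')).distinct()    # map + distinct: unique page_ids
print page_ids.take(5)      # take: fetch a few elements without a full collect()
print rdd_installs.count()  # count: total number of install events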
spark sql
# -*- coding: UTF-8 -*-
from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import Row, SQLContext
from collections import OrderedDict
import json
conf = SparkConf().setAppName('SimpleApp').setMaster('local[*]')
sc = SparkContext(conf=conf)
# Even if it is not used directly below, the SQLContext must be created
sql_context = SQLContext(sc) # Not to mention you need a SQLContext to work with DataFrames anyway.
# rdd_file = sc.textFile('hdfs://master:9000/user/ubuntu/work/Report-2887718663-20031-201702271315-201702271329Ath.gz')
tmp_urls = ['stafmt/Report-2887712797-20010-201610010000-201610010014Ath.gz',
            'stafmt/Report-2887712797-20010-201610010015-201610010029Ath.gz',
            'stafmt/Report-2887712797-20010-201610010030-201610010044Ath.gz',
            'stafmt/Report-2887712797-20010-201610010045-201610010059Ath.gz',
            'stafmt/Report-2887712797-20010-201610010100-201610010114Ath.gz',
            'stafmt/Report-2887712797-20010-201610010115-201610010129Ath.gz',
            'stafmt/Report-2887712797-20010-201610020015-201610020029Ath.gz',
            'stafmt/Report-2887712797-20010-201610010130-201610010144Ath.gz']
s_urls = ','.join(['s3a://aff.parallel-app.com/{url}'.format(url=url) for url in tmp_urls])
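# textFile() accepts a comma-separated list of paths; the second argument is the minimum number of partitions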
rdd_file = sc.textFile(s_urls, 2)
rdd_dict_list = rdd_file.map(json.loads)
# rdd_dict_list.persist(StorageLevel.MEMORY_AND_DISK) # MEMORY_ONLY,MEMORY_AND_DISK,DISK_ONLY
# print rdd_dict_list.take(2)
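# The dict keys are sorted before building each Row so that every Row has the same field order,
# which keeps the schema consistent when toDF() infers it from the Rows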
def convert_dict_to_row_old(d):
    return Row(**OrderedDict(sorted(d.items())))

def convert_dict_to_row(d):
    page_id = d.get('resolve_page_id') or d.get('page_id', '')
    event_type = d.get('event_type', 0)
    ymd = d.get('create_time_ymd', '')
    offer_src = d.get('resolve_offer_src') or d.get('offer_src', '')
    tmp_dict = {'page_id': page_id, 'event_type': event_type, 'ymd': ymd, 'offer_src': offer_src}
    return Row(**OrderedDict(sorted(tmp_dict.items())))
df = rdd_dict_list.map(convert_dict_to_row).toDF()
# Approach 1: DataFrame API
df.show()
print df.where("page_id = '33'").groupBy(['offer_src', 'event_type', 'ymd']).agg({'*': 'count'}).collect()
# Approach 2: raw SQL
df.registerTempTable("offer_report")
sql = """
select ymd,offer_src,event_type,count(*)
from offer_report where page_id='33' and event_type in (31,32,33,34)
group by offer_src,event_type,ymd
"""
sql_context.sql(sql).show()
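For reference, the same raw-SQL aggregation (including the event_type filter) can also be expressed with the DataFrame API; a minimal sketch, assuming the df built above:
df_equiv = (df.where((df.page_id == '33') & (df.event_type.isin(31, 32, 33, 34)))
              .groupBy('ymd', 'offer_src', 'event_type')
              .count())
df_equiv.show()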
a practical spark sql example
# -*- coding: UTF-8 -*-
from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import Row, SQLContext
from collections import OrderedDict
import json
conf = SparkConf().setAppName('SimpleApp').setMaster('local[*]')
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc) # Not to mention you need a SQLContext to work with DataFrames anyway.
values = {}
def query_from_s3(ymd):
    rdd_file1 = sc.textFile('s3a://aff.parallel-app.com/fluentd/track/{ymd}/*'.format(ymd=ymd), 2)
    rdd_file2 = sc.textFile('s3a://aff.parallel-app.com/fluentd/track/20170428/track_2017042800*'.format(ymd=ymd), 2)
    rdd_file3 = sc.textFile('s3a://aff.parallel-app.com/fluentd/track/20170428/track_2017042801*'.format(ymd=ymd), 2)
    rdd_files = [rdd_file1, rdd_file2, rdd_file3]
    rdd_file = sc.union(rdd_files)
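    # sc.union merges the three RDDs (the whole day given by ymd plus two hour-prefixes of 20170428) into one RDD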
    def convert_line_to_row(line):
        line = line.strip()
        idx = line.find('{')
        if idx != -1:
            try:
                d = json.loads(line[idx:])
            except:
                d = {}
        else:
            d = {}
        ymd = d.get('ymd', '')
        hms = d.get('hms', '')
        aid = d.get('aid', '')
        host_package = d.get('host_package', '')
        ver = d.get('ver', 0)
        ad_type = d.get('ad_type', 0)
        ad_packages = d.get('ad_packages', '')
        ad_names = d.get('ad_names', '')
        label_reasons = d.get('label_reasons', '')
        trigger_event = d.get('trigger_event', 0)
        match_return = d.get('match_return', {})
        channel = d.get('channel', '')
        if match_return:
            match_return = 1
        else:
            match_return = 0
        tmp_dict = {'aid': aid, 'host_package': host_package, 'ver': ver, 'ad_type': ad_type, 'ymd': ymd, 'hms': hms,
                    'ad_packages': ad_packages, 'trigger_event': trigger_event, 'match_return': match_return,
                    'channel': channel, 'ad_names': ad_names, 'label_reasons': label_reasons}
        return Row(**OrderedDict(sorted(tmp_dict.items())))
    df = rdd_file.map(convert_line_to_row).toDF()
    df.persist(StorageLevel.DISK_ONLY) # MEMORY_ONLY,MEMORY_AND_DISK,DISK_ONLY
    df.registerTempTable("tbl_track")
    sql_active = """
    SELECT aid, COUNT(*) AS cnt, COUNT(*) OVER() AS active_users
    FROM tbl_track
    GROUP BY aid
    LIMIT 10
    """.format(ymd=ymd)
    sql_context.sql(sql_active).show()
    active_rows = sql_context.sql(sql_active).collect()
    print active_rows
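    # collect() returns a list of pyspark.sql.Row objects; fields can be read by attribute or key (a sketch):
    for row in active_rows:
        print row.aid, row.cnt, row['active_users']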
query_from_s3('20170427')