Spark map/reduce and Spark SQL

map/reduce example

# -*- coding: UTF-8 -*-

from pyspark import SparkConf, SparkContext, StorageLevel
import json

conf = SparkConf().setAppName('SimpleApp').setMaster('yarn')
sc = SparkContext(conf=conf)

# Report-201702271315-201702271329.gz => a gzip-compressed file with one JSON object per line
rdd_file = sc.textFile('hdfs://master:9000/user/ubuntu/work/Report-201702271315-201702271329.gz')
# rdd_file = sc.textFile('s3a://my-bucket/tmp/Report-201702271315-201702271329.gz')

rdd_dict_list = rdd_file.map(json.loads)
rdd_dict_list.persist(StorageLevel.MEMORY_AND_DISK)  # MEMORY_ONLY,MEMORY_AND_DISK,DISK_ONLY


def lambda_map_page(d):
    event_type = d['event_type']
    page_id = d.get('resolve_page_id') or d.get('page_id', '')
    if page_id in ['23', '25', '33', '34', '39']:
        if event_type == 3:
            return page_id, 1
        else:
            return page_id, 0
    else:
        return '', 0


rdd_page_kv = rdd_dict_list.map(lambda_map_page).reduceByKey(lambda a, b: a + b)

print rdd_page_kv.collect()
# Prints the number of installs per page_id
# [('', 0), (u'34', 0), (u'23', 61), (u'39', 3), (u'33', 30)]


# A few commonly used transformations (some are illustrated in the sketch below)
# => map, filter, flatMap, sample, union, distinct, reduceByKey, groupByKey, sortByKey, join
# A few commonly used actions
# => reduce, collect, count, first, take, countByKey, saveAsTextFile, saveAsSequenceFile
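
A minimal sketch of a few of the transformations and actions listed above, reusing the sc defined in the script and a small in-memory list (the sample data is illustrative, not taken from the original report files):

rdd_lines = sc.parallelize(['a b', 'b c', 'a b'])

rdd_words = rdd_lines.flatMap(lambda line: line.split())   # flatMap: one record per word
rdd_pairs = rdd_words.map(lambda w: (w, 1))                # map: word => (word, 1)
rdd_counts = rdd_pairs.reduceByKey(lambda a, b: a + b)     # reduceByKey: sum the 1s per word
rdd_sorted = rdd_counts.sortByKey()                        # sortByKey: order by word

print rdd_words.distinct().count()   # distinct + count => 3
print rdd_sorted.take(3)             # take => [('a', 2), ('b', 3), ('c', 1)]
print rdd_pairs.countByKey()         # countByKey => {'a': 2, 'b': 3, 'c': 1} (as a defaultdict)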

Spark SQL example

# -*- coding: UTF-8 -*-

from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import Row, SQLContext
from collections import OrderedDict
import json

conf = SparkConf().setAppName('SimpleApp').setMaster('local[*]')
sc = SparkContext(conf=conf)
# Even if it is never used directly, this must be created
sql_context = SQLContext(sc)  # Not to mention you need a SQLContext to work with DataFrames anyway.

# rdd_file = sc.textFile('hdfs://master:9000/user/ubuntu/work/Report-2887718663-20031-201702271315-201702271329Ath.gz')
tmp_urls = ['stafmt/Report-2887712797-20010-201610010000-201610010014Ath.gz',
            'stafmt/Report-2887712797-20010-201610010015-201610010029Ath.gz',
            'stafmt/Report-2887712797-20010-201610010030-201610010044Ath.gz',
            'stafmt/Report-2887712797-20010-201610010045-201610010059Ath.gz',
            'stafmt/Report-2887712797-20010-201610010100-201610010114Ath.gz',
            'stafmt/Report-2887712797-20010-201610010115-201610010129Ath.gz',
            'stafmt/Report-2887712797-20010-201610020015-201610020029Ath.gz',
            'stafmt/Report-2887712797-20010-201610010130-201610010144Ath.gz']
s_urls = ','.join(['s3a://aff.parallel-app.com/{url}'.format(url=url) for url in tmp_urls])
rdd_file = sc.textFile(s_urls, 2)

rdd_dict_list = rdd_file.map(json.loads)


# rdd_dict_list.persist(StorageLevel.MEMORY_AND_DISK)  # MEMORY_ONLY,MEMORY_AND_DISK,DISK_ONLY

# print rdd_dict_list.take(2)


def convert_dict_to_row_old(d):
    return Row(**OrderedDict(sorted(d.items())))


def convert_dict_to_row(d):
    page_id = d.get('resolve_page_id') or d.get('page_id', '')
    event_type = d.get('event_type', 0)
    ymd = d.get('create_time_ymd', '')
    offer_src = d.get('resolve_offer_src') or d.get('offer_src', '')
    tmp_dict = {'page_id': page_id, 'event_type': event_type, 'ymd': ymd, 'offer_src': offer_src}
    return Row(**OrderedDict(sorted(tmp_dict.items())))


df = rdd_dict_list.map(convert_dict_to_row).toDF()

# Approach 1: DataFrame API (ORM-style)
df.show()
print df.where("page_id = '33'").groupBy(['offer_src', 'event_type', 'ymd']).agg({'*': 'count'}).collect()

# Approach 2: raw SQL
df.registerTempTable("offer_report")
sql = """
select ymd,offer_src,event_type,count(*)
from offer_report where page_id='33' and event_type in (31,32,33,34)
group by offer_src,event_type,ymd
"""
sql_context.sql(sql).show()

A real-world Spark SQL use case

# -*- coding: UTF-8 -*-

from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import Row, SQLContext
from collections import OrderedDict
import json

conf = SparkConf().setAppName('SimpleApp').setMaster('local[*]')
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)  # Not to mention you need a SQLContext to work with DataFrames anyway.
values = {}


def query_from_s3(ymd):
    rdd_file1 = sc.textFile('s3a://aff.parallel-app.com/fluentd/track/{ymd}/*'.format(ymd=ymd), 2)
    rdd_file2 = sc.textFile('s3a://aff.parallel-app.com/fluentd/track/20170428/track_2017042800*', 2)
    rdd_file3 = sc.textFile('s3a://aff.parallel-app.com/fluentd/track/20170428/track_2017042801*', 2)
    rdd_files = [rdd_file1, rdd_file2, rdd_file3]

    rdd_file = sc.union(rdd_files)

    def convert_line_to_row(line):
        line = line.strip()
        idx = line.find('{')
        if idx != -1:
            try:
                d = json.loads(line[idx:])
            except ValueError:  # the line does not contain valid JSON
                d = {}
        else:
            d = {}
        ymd = d.get('ymd', '')
        hms = d.get('hms', '')
        aid = d.get('aid', '')
        host_package = d.get('host_package', '')
        ver = d.get('ver', 0)
        ad_type = d.get('ad_type', 0)
        ad_packages = d.get('ad_packages', '')
        ad_names = d.get('ad_names', '')
        label_reasons = d.get('label_reasons', '')

        trigger_event = d.get('trigger_event', 0)
        match_return = d.get('match_return', {})
        channel = d.get('channel', '')

        if match_return:
            match_return = 1
        else:
            match_return = 0

        tmp_dict = {'aid': aid, 'host_package': host_package, 'ver': ver, 'ad_type': ad_type, 'ymd': ymd, 'hms': hms,
                    'ad_packages': ad_packages, 'trigger_event': trigger_event, 'match_return': match_return,
                    'channel': channel, 'ad_names': ad_names, 'label_reasons': label_reasons}
        return Row(**OrderedDict(sorted(tmp_dict.items())))

    df = rdd_file.map(convert_line_to_row).toDF()
    df.persist(StorageLevel.DISK_ONLY)  # MEMORY_ONLY,MEMORY_AND_DISK,DISK_ONLY

    df.registerTempTable("tbl_track")

    sql_active = """
    SELECT aid, COUNT(*) AS cnt, COUNT(*) OVER() AS active_users
    FROM tbl_track
    GROUP BY aid
    LIMIT 10
    """.format(ymd=ymd)
    sql_context.sql(sql_active).show()
    active_rows = sql_context.sql(sql_active).collect()
    print active_rows


query_from_s3('20170427')

Reposted from: https://my.oschina.net/1123581321/blog/858722

Structured Streaming is a stream processing engine introduced in Spark 2.0, built on top of the Spark SQL engine. It provides a way to process and analyze data as a stream, treating streaming data as a continuous flow while exposing a programming interface very similar to Spark SQL. Structured Streaming inherits the strengths of Spark SQL, such as powerful query optimization and rich data source support, and adds streaming-specific properties such as low latency, high throughput, and fault tolerance.

Spark SQL is the Spark module for processing structured data. It offers an SQL-like programming interface so that users can work with data in Spark much as they would with a relational database. Spark SQL can read from data sources such as Hive, JSON, Parquet, and JDBC, and supports queries, filtering, aggregation, and other operations on the data.

Spark Streaming is another module in the Spark ecosystem, a stream processing engine dedicated to real-time data. It turns a real-time data stream into a series of discretized RDDs and provides a set of DStream operations, including map, reduce, join, and so on. Spark Streaming can read from sources such as Kafka, Flume, Twitter, and HDFS, and provides fault tolerance, scalability, and related features.

In short, Structured Streaming, Spark SQL, and Spark Streaming are different components of the Spark ecosystem: Structured Streaming is built on the Spark SQL engine, shares a similar programming interface with Spark SQL, and supports streaming workloads, while Spark Streaming is a separate stream processing engine for real-time data built around DStream operations.
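
As a rough illustration of how Structured Streaming reuses the Spark SQL programming model, here is a minimal streaming word-count sketch. It assumes Spark 2.x and a text server on localhost:9999 (for example one started with `nc -lk 9999`); the host, port, and app name are illustrative assumptions, not part of the original post.

# -*- coding: UTF-8 -*-

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

spark = SparkSession.builder.appName('StructuredStreamingSketch').getOrCreate()

# readStream produces a streaming DataFrame with the same API as a batch DataFrame
lines = (spark.readStream
         .format('socket')
         .option('host', 'localhost')   # illustrative source, e.g. `nc -lk 9999`
         .option('port', 9999)
         .load())

# The aggregation is written exactly like a batch Spark SQL query
words = lines.select(explode(split(lines.value, ' ')).alias('word'))
counts = words.groupBy('word').count()

# Continuously print the running counts to the console
query = (counts.writeStream
         .outputMode('complete')
         .format('console')
         .start())
query.awaitTermination()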