3. RDD Programming (2)

Data Reading and Writing

Reading and Writing File Data

>>> textFile = sc.textFile("file:///opt/module/spark/mycode/rdd/word.txt")
>>> textFile.first()
'Hello Hadoop'
# If word.txt does not exist, no error is raised until an action is actually executed (lazy evaluation)

'''
saveAsTextFile() 
1. The argument is a directory name, not a file name, because the output is written per partition
2. The directory must not already exist; if it does, an error is raised
'''
>>> textFile.saveAsTextFile("file:///opt/module/spark/mycode/rdd/writeback")
$ cat writeback/part-00000
Hello Hadoop
Hello Spark
Hadoop is good
$ cat writeback/part-00001
Spark is fast
Spark is better
# pyspark was started with the default parameters, which give two partitions by default
$ ./bin/pyspark --master local[1]
# Delete the writeback directory and save again; now only one part file is produced
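
Alternatively, the partition count can be inspected and reduced without restarting pyspark. A minimal sketch in the same session (the output directory writeback1 is hypothetical):

>>> textFile.getNumPartitions()
2
>>> textFile.coalesce(1).saveAsTextFile("file:///opt/module/spark/mycode/rdd/writeback1")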

# Reading and writing data on the distributed file system HDFS works exactly the same way
textFile = sc.textFile("word.txt")
textFile.saveAsTextFile("writeback")
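
The relative paths above resolve to the current user's home directory on HDFS (e.g. /user/hadoop/). A minimal sketch with explicit HDFS URIs, assuming a typical single-node setup whose NameNode listens on localhost:9000:

textFile = sc.textFile("hdfs://localhost:9000/user/hadoop/word.txt")
textFile.saveAsTextFile("hdfs://localhost:9000/user/hadoop/writeback")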

Reading and Writing HBase Data

  1. Install HBase, following the lab guide on the official website of 《Spark编程基础》
  2. Create a student table
# Drop the table first if it already exists
hbase(main):001:0> disable 'student'
hbase(main):002:0> drop 'student'

hbase(main):003:0> create 'student', 'info'
Created table student
Took 5.0472 seconds                                                                                        
=> Hbase::Table - student
hbase(main):004:0> put 'student', '1', 'info:name', 'Xueqian'
Took 0.3650 seconds
hbase(main):005:0> put 'student', '1', 'info:gender', 'F'
hbase(main):006:0> put 'student', '1', 'info:age', '23'
hbase(main):007:0> put 'student', '2', 'info:name', 'Weiliang'
hbase(main):008:0> put 'student', '2', 'info:gender', 'M'
hbase(main):009:0> put 'student', '2', 'info:age', '24'

hbase(main):010:0> get 'student', '1'
COLUMN                          CELL                                                                       
 info:age                       timestamp=1637453631890, value=23                                           
 info:gender                    timestamp=1637453479150, value=F                                             
 info:name                      timestamp=1637453431396, value=Xueqian                                       
1 row(s)
Took 0.1093 seconds

hbase(main):011:0> scan 'student'
ROW                             COLUMN+CELL                                                                 
 1                              column=info:age, timestamp=1637453631890, value=23                           
 1                              column=info:gender, timestamp=1637453479150, value=F                         
 1                              column=info:name, timestamp=1637453431396, value=Xueqian                     
 2                              column=info:age, timestamp=1637453676666, value=24                           
 2                              column=info:gender, timestamp=1637453690973, value=M                         
 2                              column=info:name, timestamp=1637453650092, value=Weiliang                   
2 row(s)
Took 0.0355 seconds   

  3. Write a program to read HBase data
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("ReadHBase")
sc = SparkContext(conf = conf)
host = 'localhost' # the host name (e.g. hadoop) also works
table = 'student' # the student table created above
conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": table}
keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
hbase_rdd = sc.newAPIHadoopRDD(
    "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
    "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "org.apache.hadoop.hbase.client.Result",
    keyConverter=keyConv, valueConverter=valueConv, conf=conf)
count = hbase_rdd.count()
hbase_rdd.cache()
output = hbase_rdd.collect()
for (k, v) in output:
    print (k, v)

1 {"qualifier" : "age", "timestamp" : "1637453631890", "columnFamily" : "info", "row" : "1", "type" : "Put", "value" : "23"}
{"qualifier" : "gender", "timestamp" : "1637453479150", "columnFamily" : "info", "row" : "1", "type" : "Put", "value" : "F"}
{"qualifier" : "name", "timestamp" : "1637453431396", "columnFamily" : "info", "row" : "1", "type" : "Put", "value" : "Xueqian"}
2 {"qualifier" : "age", "timestamp" : "1637453676666", "columnFamily" : "info", "row" : "2", "type" : "Put", "value" : "24"}
{"qualifier" : "gender", "timestamp" : "1637453690973", "columnFamily" : "info", "row" : "2", "type" : "Put", "value" : "M"}
{"qualifier" : "name", "timestamp" : "1637453650092", "columnFamily" : "info", "row" : "2", "type" : "Put", "value" : "Weiliang"}


  4. Write a program to write data to HBase
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("WriteHBase")
sc = SparkContext(conf = conf)
host = 'localhost'
table = 'student'
keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
conf = {"hbase.zookeeper.quorum": host, "hbase.mapred.outputtable": table, "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat", "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable", "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
rawData = ['3,info,name,Rongcheng', '3,info,gender,M', 
           '3,info,age,26', '4,info,name,Guanhua', 
           '4,info,gender,M', '4,info,age,27']
sc.parallelize(rawData) \
  .map(lambda x : (x[0], x.split(','))) \
  .saveAsNewAPIHadoopDataset(conf = conf, keyConverter = keyConv, valueConverter = valueConv)
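
Note that both HBase programs need the HBase client jars and the spark-examples jar (which contains the pythonconverters classes) on Spark's classpath; see the lab guide mentioned above for the exact setup. A possible way to run them in local mode (the script name SparkWriteHBase.py and the jar directory are hypothetical):

$ /opt/module/spark/bin/spark-submit \
    --driver-class-path "/opt/module/spark/jars/hbase/*" \
    SparkWriteHBase.py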
# Verify the newly written rows in the HBase shell:
hbase> scan 'student'

Comprehensive Examples

  1. Finding Top N values
$ cd /opt/module/spark/mycode/rdd/
$ vim file0.txt
$ cat file0.txt 
1,1768,50,155
2,1218,600,211
3,2239,788,242
4,3101,28,599
5,4899,290,129
6,3110,54,1201
7,4436,259,877
8,2369,7890,27

# The four fields are orderid, userid, payment, and productid; the task is to find the top N payment values
$ vim TopN.py
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("TopN")
sc = SparkContext(conf = conf)
lines = sc.textFile("file:///opt/module/spark/mycode/rdd/file0.txt")
result1 = lines.filter(lambda line : (len(line.strip()) > 0) and (len(line.split(",")) == 4))
result2 = result1.map(lambda line : line.split(",")[2])
result3 = result2.map(lambda x : (int(x), ""))
result4 = result3.repartition(1)
result5 = result4.sortByKey(False)
result6 = result5.map(lambda x : x[0]) # result6 = result5.keys() would also work
result7 = result6.take(5)
for a in result7:
    print(a)
$ python3 TopN.py
7890                                                                            
788
600
290
259
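
A briefer alternative sketch, reusing result1 from TopN.py above: PySpark's top() action returns the N largest elements directly, without the explicit repartition and sortByKey steps.

payments = result1.map(lambda line : int(line.split(",")[2]))  # payment column as int
for p in payments.top(5):  # top() returns the 5 largest values in descending order
    print(p)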

  2. File sorting
$ mkdir FileSort
$ cd FileSort
$ vim file1.txt
$ vim file2.txt
$ vim file3.txt
$ cd ..
$ vim FileSort.py
from pyspark import SparkConf, SparkContext

index = 0

def getindex():
    global index
    index += 1
    return index

def main():
    conf = SparkConf().setMaster("local").setAppName("FileSort")
    sc = SparkContext(conf = conf)
    lines = sc.textFile("file:///opt/module/spark/mycode/rdd/FileSort/file*.txt")
    result1 = lines.filter(lambda line : (len(line.strip()) > 0))
    result2 = result1.map(lambda x : (int(x.strip()), ""))
    result3 = result2.sortByKey(True)
    result4 = result3.map(lambda x : x[0])
    result5 = result4.repartition(1)
    result6 = result5.map(lambda x : (getindex(), x))
    result6.saveAsTextFile("file:///opt/module/spark/mycode/rdd/FileSort/SortResult")

if __name__ == "__main__":
    main()
$ python3 FileSort.py
$ cat FileSort/SortResult/part-00000 
(1, 1)
(2, 4)
(3, 5)
(4, 12)
(5, 16)
(6, 25)
(7, 33)
(8, 37)
(9, 39)
(10, 40)
(11, 45)
(12, 1010)
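
An alternative sketch, reusing result5 from FileSort.py above: zipWithIndex() assigns indices within the single sorted partition, which avoids the mutable global counter (the output directory SortResult2 is hypothetical).

result6 = result5.zipWithIndex().map(lambda x : (x[1] + 1, x[0]))  # (1-based rank, value)
result6.saveAsTextFile("file:///opt/module/spark/mycode/rdd/FileSort/SortResult2")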

  3. Secondary sort
$ mkdir secondarysort
$ cd secondarysort
$ vim file4.txt
$ cat file4.txt
5 3
1 6
4 9
8 3
4 7
5 6
3 2
$ cd ..
$ vim SecondarySortApp.py
from operator import gt
from pyspark import SparkConf, SparkContext

class SecondarySortKey():
    def __init__(self, k):
        self.column1 = k[0]
        self.column2 = k[1]

    def __gt__(self, other):
        if other.column1 == self.column1:
            return gt(self.column2, other.column2)
        else:
            return gt(self.column1, other.column1)

def main():
    conf = SparkConf().setMaster("local[1]").setAppName("spark_sort")
    sc = SparkContext(conf = conf)
    lines = sc.textFile("file:///opt/module/spark/mycode/rdd/secondarysort/file4.txt")
    rdd1 = lines.filter(lambda x : (len(x.strip()) > 0))
    rdd2 = rdd1.map(lambda x : ((int(x.split(" ")[0]), int(x.split(" ")[1])), x))
    rdd3 = rdd2.map(lambda x : (SecondarySortKey(x[0]), x[1]))
    rdd4 = rdd3.sortByKey(False)
    rdd5 = rdd4.map(lambda x : x[1])
    rdd5.foreach(print)

if __name__ == "__main__":
    main()
$ python3 SecondarySortApp.py
5 6
5 3
4 9
4 7
3 2
1 6
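
An alternative sketch, reusing rdd1 from SecondarySortApp.py above: sortBy() with a tuple key gives the same descending order on both columns, because Python tuples compare element by element, so no custom comparison class is needed.

rdd1.sortBy(lambda x : (int(x.split(" ")[0]), int(x.split(" ")[1])),
            ascending=False).foreach(print)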

Exercises

  1. Describe the different ways an RDD can be created.
  2. List the commonly used RDD transformation APIs and explain what each one does.
  3. Explain why calling persist() on an RDD is described as merely "marking it for persistence".
  4. Explain the role of RDD partitioning.
  5. Explain how the default number of RDD partitions is determined under each deployment mode.
  6. Give an example illustrating the difference between reduceByKey and groupByKey.
  7. Describe what preparation is needed for Spark to read and write HBase data successfully.

Study material: 《Spark编程基础》 (Spark Programming Fundamentals)
Study website: dblab.xmu.edu/post/spark-python
